Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig15
-rw-r--r--arch/x86/Makefile13
-rw-r--r--arch/x86/boot/compressed/Makefile1
-rw-r--r--arch/x86/boot/compressed/efi_thunk_64.S2
-rw-r--r--arch/x86/boot/compressed/head_64.S172
-rw-r--r--arch/x86/boot/compressed/idt_64.c14
-rw-r--r--arch/x86/boot/compressed/kaslr.c4
-rw-r--r--arch/x86/boot/compressed/mem_encrypt.S130
-rw-r--r--arch/x86/boot/compressed/misc.c9
-rw-r--r--arch/x86/boot/compressed/misc.h6
-rw-r--r--arch/x86/boot/compressed/sev-es.c23
-rw-r--r--arch/x86/crypto/Makefile2
-rw-r--r--arch/x86/crypto/aesni-intel_avx-x86_64.S28
-rw-r--r--arch/x86/crypto/camellia-aesni-avx2-asm_64.S5
-rw-r--r--arch/x86/crypto/crc32-pclmul_glue.c2
-rw-r--r--arch/x86/crypto/crc32c-pcl-intel-asm_64.S7
-rw-r--r--arch/x86/crypto/curve25519-x86_64.c6
-rw-r--r--arch/x86/crypto/poly1305_glue.c6
-rw-r--r--arch/x86/crypto/sha1_avx2_x86_64_asm.S8
-rw-r--r--arch/x86/crypto/sha1_ni_asm.S8
-rw-r--r--arch/x86/crypto/sha256-avx2-asm.S13
-rw-r--r--arch/x86/crypto/sha512-avx-asm.S41
-rw-r--r--arch/x86/crypto/sha512-avx2-asm.S42
-rw-r--r--arch/x86/crypto/sha512-ssse3-asm.S41
-rw-r--r--arch/x86/crypto/twofish-x86_64-asm_64-3way.S2
-rw-r--r--arch/x86/crypto/twofish_glue_3way.c2
-rw-r--r--arch/x86/entry/common.c3
-rw-r--r--arch/x86/entry/entry_32.S103
-rw-r--r--arch/x86/entry/entry_64.S4
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl1
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl1
-rw-r--r--arch/x86/entry/vdso/vdso2c.c2
-rw-r--r--arch/x86/entry/vdso/vdso2c.h2
-rw-r--r--arch/x86/entry/vdso/vdso32/system_call.S4
-rw-r--r--arch/x86/entry/vdso/vma.c2
-rw-r--r--arch/x86/entry/vdso/vsgx.S2
-rw-r--r--arch/x86/events/amd/core.c2
-rw-r--r--arch/x86/events/amd/iommu.c6
-rw-r--r--arch/x86/events/amd/iommu.h2
-rw-r--r--arch/x86/events/amd/uncore.c6
-rw-r--r--arch/x86/events/core.c334
-rw-r--r--arch/x86/events/intel/Makefile2
-rw-r--r--arch/x86/events/intel/bts.c2
-rw-r--r--arch/x86/events/intel/core.c705
-rw-r--r--arch/x86/events/intel/cstate.c39
-rw-r--r--arch/x86/events/intel/ds.c45
-rw-r--r--arch/x86/events/intel/lbr.c21
-rw-r--r--arch/x86/events/intel/p4.c22
-rw-r--r--arch/x86/events/intel/pt.c2
-rw-r--r--arch/x86/events/intel/uncore.c207
-rw-r--r--arch/x86/events/intel/uncore.h20
-rw-r--r--arch/x86/events/intel/uncore_discovery.c622
-rw-r--r--arch/x86/events/intel/uncore_discovery.h131
-rw-r--r--arch/x86/events/intel/uncore_snb.c131
-rw-r--r--arch/x86/events/intel/uncore_snbep.c133
-rw-r--r--arch/x86/events/msr.c2
-rw-r--r--arch/x86/events/perf_event.h119
-rw-r--r--arch/x86/events/rapl.c2
-rw-r--r--arch/x86/events/zhaoxin/core.c2
-rw-r--r--arch/x86/hyperv/hv_apic.c18
-rw-r--r--arch/x86/hyperv/hv_init.c110
-rw-r--r--arch/x86/hyperv/hv_proc.c26
-rw-r--r--arch/x86/hyperv/hv_spinlock.c8
-rw-r--r--arch/x86/hyperv/irqdomain.c6
-rw-r--r--arch/x86/hyperv/mmu.c18
-rw-r--r--arch/x86/hyperv/nested.c8
-rw-r--r--arch/x86/include/asm/agp.h2
-rw-r--r--arch/x86/include/asm/alternative-asm.h114
-rw-r--r--arch/x86/include/asm/alternative.h149
-rw-r--r--arch/x86/include/asm/asm-prototypes.h13
-rw-r--r--arch/x86/include/asm/cmpxchg.h2
-rw-r--r--arch/x86/include/asm/cpu.h13
-rw-r--r--arch/x86/include/asm/cpufeature.h41
-rw-r--r--arch/x86/include/asm/cpufeatures.h8
-rw-r--r--arch/x86/include/asm/elf.h10
-rw-r--r--arch/x86/include/asm/entry-common.h16
-rw-r--r--arch/x86/include/asm/floppy.h1
-rw-r--r--arch/x86/include/asm/hyperv-tlfs.h131
-rw-r--r--arch/x86/include/asm/idtentry.h2
-rw-r--r--arch/x86/include/asm/inat.h2
-rw-r--r--arch/x86/include/asm/insn-eval.h4
-rw-r--r--arch/x86/include/asm/insn.h46
-rw-r--r--arch/x86/include/asm/intel-family.h52
-rw-r--r--arch/x86/include/asm/intel_pconfig.h2
-rw-r--r--arch/x86/include/asm/intel_pt.h2
-rw-r--r--arch/x86/include/asm/io.h2
-rw-r--r--arch/x86/include/asm/irq_stack.h2
-rw-r--r--arch/x86/include/asm/irqflags.h7
-rw-r--r--arch/x86/include/asm/jump_label.h16
-rw-r--r--arch/x86/include/asm/kexec.h5
-rw-r--r--arch/x86/include/asm/kprobes.h21
-rw-r--r--arch/x86/include/asm/kvm_host.h4
-rw-r--r--arch/x86/include/asm/mshyperv.h100
-rw-r--r--arch/x86/include/asm/msr-index.h6
-rw-r--r--arch/x86/include/asm/nops.h176
-rw-r--r--arch/x86/include/asm/nospec-branch.h9
-rw-r--r--arch/x86/include/asm/paravirt.h173
-rw-r--r--arch/x86/include/asm/paravirt_types.h216
-rw-r--r--arch/x86/include/asm/pgtable.h2
-rw-r--r--arch/x86/include/asm/processor.h27
-rw-r--r--arch/x86/include/asm/proto.h2
-rw-r--r--arch/x86/include/asm/ptrace.h5
-rw-r--r--arch/x86/include/asm/segment.h30
-rw-r--r--arch/x86/include/asm/set_memory.h4
-rw-r--r--arch/x86/include/asm/setup.h5
-rw-r--r--arch/x86/include/asm/sgx.h (renamed from arch/x86/kernel/cpu/sgx/arch.h)52
-rw-r--r--arch/x86/include/asm/smap.h5
-rw-r--r--arch/x86/include/asm/special_insns.h4
-rw-r--r--arch/x86/include/asm/stackprotector.h79
-rw-r--r--arch/x86/include/asm/suspend_32.h6
-rw-r--r--arch/x86/include/asm/switch_to.h7
-rw-r--r--arch/x86/include/asm/syscall_wrapper.h1
-rw-r--r--arch/x86/include/asm/thread_info.h8
-rw-r--r--arch/x86/include/asm/tlbflush.h48
-rw-r--r--arch/x86/include/asm/trace/hyperv.h2
-rw-r--r--arch/x86/include/asm/uv/uv_geo.h2
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h2
-rw-r--r--arch/x86/include/asm/vdso/gettimeofday.h3
-rw-r--r--arch/x86/include/asm/vmalloc.h20
-rw-r--r--arch/x86/include/uapi/asm/bootparam.h4
-rw-r--r--arch/x86/include/uapi/asm/debugreg.h1
-rw-r--r--arch/x86/include/uapi/asm/msgbuf.h2
-rw-r--r--arch/x86/include/uapi/asm/sgx.h2
-rw-r--r--arch/x86/include/uapi/asm/shmbuf.h2
-rw-r--r--arch/x86/include/uapi/asm/sigcontext.h2
-rw-r--r--arch/x86/kernel/Makefile3
-rw-r--r--arch/x86/kernel/acpi/boot.c4
-rw-r--r--arch/x86/kernel/acpi/sleep.c2
-rw-r--r--arch/x86/kernel/acpi/wakeup_64.S2
-rw-r--r--arch/x86/kernel/alternative.c301
-rw-r--r--arch/x86/kernel/amd_nb.c2
-rw-r--r--arch/x86/kernel/apic/apic.c10
-rw-r--r--arch/x86/kernel/apic/io_apic.c8
-rw-r--r--arch/x86/kernel/apic/vector.c17
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c33
-rw-r--r--arch/x86/kernel/apm_32.c6
-rw-r--r--arch/x86/kernel/asm-offsets.c7
-rw-r--r--arch/x86/kernel/asm-offsets_32.c5
-rw-r--r--arch/x86/kernel/cpu/amd.c5
-rw-r--r--arch/x86/kernel/cpu/cacheinfo.c2
-rw-r--r--arch/x86/kernel/cpu/common.c13
-rw-r--r--arch/x86/kernel/cpu/cpuid-deps.c3
-rw-r--r--arch/x86/kernel/cpu/cyrix.c2
-rw-r--r--arch/x86/kernel/cpu/feat_ctl.c71
-rw-r--r--arch/x86/kernel/cpu/hygon.c4
-rw-r--r--arch/x86/kernel/cpu/intel.c129
-rw-r--r--arch/x86/kernel/cpu/mce/core.c2
-rw-r--r--arch/x86/kernel/cpu/mce/inject.c6
-rw-r--r--arch/x86/kernel/cpu/mce/intel.c1
-rw-r--r--arch/x86/kernel/cpu/mce/severity.c14
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c8
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c36
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.c2
-rw-r--r--arch/x86/kernel/cpu/resctrl/core.c2
-rw-r--r--arch/x86/kernel/cpu/resctrl/monitor.c6
-rw-r--r--arch/x86/kernel/cpu/resctrl/pseudo_lock.c6
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c6
-rw-r--r--arch/x86/kernel/cpu/scattered.c2
-rw-r--r--arch/x86/kernel/cpu/sgx/Makefile1
-rw-r--r--arch/x86/kernel/cpu/sgx/driver.c17
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.c33
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.h1
-rw-r--r--arch/x86/kernel/cpu/sgx/encls.h30
-rw-r--r--arch/x86/kernel/cpu/sgx/ioctl.c43
-rw-r--r--arch/x86/kernel/cpu/sgx/main.c268
-rw-r--r--arch/x86/kernel/cpu/sgx/sgx.h40
-rw-r--r--arch/x86/kernel/cpu/sgx/virt.c376
-rw-r--r--arch/x86/kernel/cpu/topology.c4
-rw-r--r--arch/x86/kernel/cpu/vmware.c7
-rw-r--r--arch/x86/kernel/crash.c16
-rw-r--r--arch/x86/kernel/doublefault_32.c4
-rw-r--r--arch/x86/kernel/e820.c6
-rw-r--r--arch/x86/kernel/early-quirks.c1
-rw-r--r--arch/x86/kernel/fpu/xstate.c2
-rw-r--r--arch/x86/kernel/ftrace.c4
-rw-r--r--arch/x86/kernel/head64.c2
-rw-r--r--arch/x86/kernel/head_32.S18
-rw-r--r--arch/x86/kernel/idt.c2
-rw-r--r--arch/x86/kernel/irq.c2
-rw-r--r--arch/x86/kernel/jump_label.c32
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c2
-rw-r--r--arch/x86/kernel/kgdb.c4
-rw-r--r--arch/x86/kernel/kprobes/core.c596
-rw-r--r--arch/x86/kernel/kprobes/ftrace.c2
-rw-r--r--arch/x86/kernel/kprobes/opt.c9
-rw-r--r--arch/x86/kernel/kvm.c13
-rw-r--r--arch/x86/kernel/kvmclock.c2
-rw-r--r--arch/x86/kernel/machine_kexec_64.c6
-rw-r--r--arch/x86/kernel/paravirt-spinlocks.c9
-rw-r--r--arch/x86/kernel/paravirt.c77
-rw-r--r--arch/x86/kernel/paravirt_patch.c99
-rw-r--r--arch/x86/kernel/process.c9
-rw-r--r--arch/x86/kernel/pvclock.c2
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S2
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S2
-rw-r--r--arch/x86/kernel/setup.c121
-rw-r--r--arch/x86/kernel/setup_percpu.c1
-rw-r--r--arch/x86/kernel/sev-es-shared.c16
-rw-r--r--arch/x86/kernel/sev-es.c99
-rw-r--r--arch/x86/kernel/signal.c2
-rw-r--r--arch/x86/kernel/signal_compat.c5
-rw-r--r--arch/x86/kernel/smp.c4
-rw-r--r--arch/x86/kernel/smpboot.c92
-rw-r--r--arch/x86/kernel/stacktrace.c6
-rw-r--r--arch/x86/kernel/static_call.c4
-rw-r--r--arch/x86/kernel/sysfb_efi.c2
-rw-r--r--arch/x86/kernel/tboot.c44
-rw-r--r--arch/x86/kernel/tls.c8
-rw-r--r--arch/x86/kernel/topology.c2
-rw-r--r--arch/x86/kernel/traps.c16
-rw-r--r--arch/x86/kernel/tsc.c9
-rw-r--r--arch/x86/kernel/tsc_sync.c2
-rw-r--r--arch/x86/kernel/umip.c4
-rw-r--r--arch/x86/kernel/uprobes.c8
-rw-r--r--arch/x86/kvm/Kconfig12
-rw-r--r--arch/x86/kvm/cpuid.c2
-rw-r--r--arch/x86/kvm/emulate.c2
-rw-r--r--arch/x86/kvm/irq_comm.c2
-rw-r--r--arch/x86/kvm/mmu/mmu.c2
-rw-r--r--arch/x86/kvm/mmu/mmu_internal.h2
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c6
-rw-r--r--arch/x86/kvm/pmu.h2
-rw-r--r--arch/x86/kvm/svm/avic.c4
-rw-r--r--arch/x86/kvm/svm/sev.c72
-rw-r--r--arch/x86/kvm/svm/svm.c2
-rw-r--r--arch/x86/kvm/svm/svm.h1
-rw-r--r--arch/x86/kvm/vmx/nested.c2
-rw-r--r--arch/x86/kvm/vmx/posted_intr.c2
-rw-r--r--arch/x86/kvm/vmx/vmx.c16
-rw-r--r--arch/x86/kvm/x86.c34
-rw-r--r--arch/x86/lib/atomic64_386_32.S2
-rw-r--r--arch/x86/lib/atomic64_cx8_32.S2
-rw-r--r--arch/x86/lib/copy_page_64.S2
-rw-r--r--arch/x86/lib/copy_user_64.S2
-rw-r--r--arch/x86/lib/inat.c2
-rw-r--r--arch/x86/lib/insn-eval.c50
-rw-r--r--arch/x86/lib/insn.c230
-rw-r--r--arch/x86/lib/memcpy_64.S2
-rw-r--r--arch/x86/lib/memmove_64.S2
-rw-r--r--arch/x86/lib/memset_64.S2
-rw-r--r--arch/x86/lib/mmx_32.c2
-rw-r--r--arch/x86/lib/msr-smp.c4
-rw-r--r--arch/x86/lib/msr.c4
-rw-r--r--arch/x86/lib/retpoline.S67
-rw-r--r--arch/x86/math-emu/fpu_trig.c11
-rw-r--r--arch/x86/math-emu/reg_ld_str.c2
-rw-r--r--arch/x86/math-emu/reg_round.S2
-rw-r--r--arch/x86/mm/fault.c4
-rw-r--r--arch/x86/mm/init.c8
-rw-r--r--arch/x86/mm/init_32.c2
-rw-r--r--arch/x86/mm/init_64.c210
-rw-r--r--arch/x86/mm/ioremap.c19
-rw-r--r--arch/x86/mm/kaslr.c2
-rw-r--r--arch/x86/mm/kmmio.c2
-rw-r--r--arch/x86/mm/mem_encrypt.c6
-rw-r--r--arch/x86/mm/mem_encrypt_boot.S2
-rw-r--r--arch/x86/mm/mem_encrypt_identity.c35
-rw-r--r--arch/x86/mm/pat/memtype.c4
-rw-r--r--arch/x86/mm/pat/set_memory.c2
-rw-r--r--arch/x86/mm/pgtable.c13
-rw-r--r--arch/x86/mm/pkeys.c2
-rw-r--r--arch/x86/mm/pti.c11
-rw-r--r--arch/x86/mm/tlb.c182
-rw-r--r--arch/x86/net/bpf_jit_comp.c19
-rw-r--r--arch/x86/net/bpf_jit_comp32.c198
-rw-r--r--arch/x86/pci/fixup.c2
-rw-r--r--arch/x86/platform/efi/efi_64.c4
-rw-r--r--arch/x86/platform/efi/quirks.c4
-rw-r--r--arch/x86/platform/intel-quark/imr.c4
-rw-r--r--arch/x86/platform/intel-quark/imr_selftest.c2
-rw-r--r--arch/x86/platform/intel/iosf_mbi.c4
-rw-r--r--arch/x86/platform/olpc/olpc-xo15-sci.c2
-rw-r--r--arch/x86/platform/olpc/olpc_dt.c2
-rw-r--r--arch/x86/platform/pvh/head.S20
-rw-r--r--arch/x86/platform/uv/uv_nmi.c48
-rw-r--r--arch/x86/power/cpu.c8
-rw-r--r--arch/x86/power/hibernate.c89
-rw-r--r--arch/x86/realmode/init.c2
-rw-r--r--arch/x86/tools/insn_decoder_test.c10
-rw-r--r--arch/x86/tools/insn_sanity.c8
-rw-r--r--arch/x86/xen/enlighten_pv.c5
-rw-r--r--arch/x86/xen/mmu_pv.c13
-rw-r--r--arch/x86/xen/time.c26
284 files changed, 6192 insertions, 3394 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2792879d398e..dac15f646f79 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -100,6 +100,7 @@ config X86
select ARCH_SUPPORTS_LTO_CLANG if X86_64
select ARCH_SUPPORTS_LTO_CLANG_THIN if X86_64
select ARCH_USE_BUILTIN_BSWAP
+ select ARCH_USE_MEMTEST
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
select ARCH_USE_SYM_ANNOTATIONS
@@ -165,6 +166,7 @@ config X86
select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64
select HAVE_ARCH_USERFAULTFD_WP if X86_64 && USERFAULTFD
select HAVE_ARCH_VMAP_STACK if X86_64
+ select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
select HAVE_ARCH_WITHIN_STACK_FRAMES
select HAVE_ASM_MODVERSIONS
select HAVE_CMPXCHG_DOUBLE
@@ -360,10 +362,6 @@ config X86_64_SMP
def_bool y
depends on X86_64 && SMP
-config X86_32_LAZY_GS
- def_bool y
- depends on X86_32 && !STACKPROTECTOR
-
config ARCH_SUPPORTS_UPROBES
def_bool y
@@ -386,7 +384,8 @@ config CC_HAS_SANE_STACKPROTECTOR
default $(success,$(srctree)/scripts/gcc-x86_32-has-stack-protector.sh $(CC))
help
We have to make sure stack protector is unconditionally disabled if
- the compiler produces broken code.
+ the compiler produces broken code or if it does not let us control
+ the segment on 32-bit kernels.
menu "Processor type and features"
@@ -571,6 +570,7 @@ config X86_UV
depends on X86_EXTENDED_PLATFORM
depends on NUMA
depends on EFI
+ depends on KEXEC_CORE
depends on X86_X2APIC
depends on PCI
help
@@ -777,6 +777,7 @@ if HYPERVISOR_GUEST
config PARAVIRT
bool "Enable paravirtualization code"
+ depends on HAVE_STATIC_CALL
help
This changes the kernel so it can modify itself when it is run
under a hypervisor, potentially improving performance significantly
@@ -1406,7 +1407,7 @@ config HIGHMEM4G
config HIGHMEM64G
bool "64GB"
- depends on !M486 && !M586 && !M586TSC && !M586MMX && !MGEODE_LX && !MGEODEGX1 && !MCYRIXIII && !MELAN && !MWINCHIPC6 && !WINCHIP3D && !MK6
+ depends on !M486SX && !M486 && !M586 && !M586TSC && !M586MMX && !MGEODE_LX && !MGEODEGX1 && !MCYRIXIII && !MELAN && !MWINCHIPC6 && !WINCHIP3D && !MK6
select X86_PAE
help
Select this if you have a 32-bit processor and more than 4
@@ -1518,6 +1519,7 @@ config AMD_MEM_ENCRYPT
select ARCH_USE_MEMREMAP_PROT
select ARCH_HAS_FORCE_DMA_UNENCRYPTED
select INSTRUCTION_DECODER
+ select ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS
help
Say yes to enable support for the encryption of system memory.
This requires an AMD processor that supports Secure Memory
@@ -1931,6 +1933,7 @@ config X86_SGX
depends on CRYPTO_SHA256=y
select SRCU
select MMU_NOTIFIER
+ select NUMA_KEEP_MEMINFO if NUMA
help
Intel(R) Software Guard eXtensions (SGX) is a set of CPU instructions
that can be used by applications to set aside private regions of code
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 9a85eae37b17..c77c5d8a7b3e 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -33,6 +33,7 @@ REALMODE_CFLAGS += -ffreestanding
REALMODE_CFLAGS += -fno-stack-protector
REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -Wno-address-of-packed-member)
REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align4))
+REALMODE_CFLAGS += $(CLANG_FLAGS)
export REALMODE_CFLAGS
# BITS is used as extension for files which are available in a 32 bit
@@ -79,6 +80,14 @@ ifeq ($(CONFIG_X86_32),y)
# temporary until string.h is fixed
KBUILD_CFLAGS += -ffreestanding
+
+ ifeq ($(CONFIG_STACKPROTECTOR),y)
+ ifeq ($(CONFIG_SMP),y)
+ KBUILD_CFLAGS += -mstack-protector-guard-reg=fs -mstack-protector-guard-symbol=__stack_chk_guard
+ else
+ KBUILD_CFLAGS += -mstack-protector-guard=global
+ endif
+ endif
else
BITS := 64
UTS_MACHINE := x86_64
@@ -129,8 +138,8 @@ ifdef CONFIG_X86_X32
x32_ld_ok := $(call try-run,\
/bin/echo -e '1: .quad 1b' | \
$(CC) $(KBUILD_AFLAGS) -c -x assembler -o "$$TMP" - && \
- $(OBJCOPY) -O elf32-x86-64 "$$TMP" "$$TMPO" && \
- $(LD) -m elf32_x86_64 "$$TMPO" -o "$$TMP",y,n)
+ $(OBJCOPY) -O elf32-x86-64 "$$TMP" "$$TMP.o" && \
+ $(LD) -m elf32_x86_64 "$$TMP.o" -o "$$TMP",y,n)
ifeq ($(x32_ld_ok),y)
CONFIG_X86_X32_ABI := y
KBUILD_AFLAGS += -DCONFIG_X86_X32_ABI
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index e0bc3988c3fa..6e5522aebbbd 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -46,6 +46,7 @@ KBUILD_CFLAGS += -D__DISABLE_EXPORTS
# Disable relocation relaxation in case the link is not PIE.
KBUILD_CFLAGS += $(call as-option,-Wa$(comma)-mrelax-relocations=no)
KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h
+KBUILD_CFLAGS += $(CLANG_FLAGS)
# sev-es.c indirectly inludes inat-table.h which is generated during
# compilation and stored in $(objtree). Add the directory to the includes so
diff --git a/arch/x86/boot/compressed/efi_thunk_64.S b/arch/x86/boot/compressed/efi_thunk_64.S
index c4bb0f9363f5..95a223b3e56a 100644
--- a/arch/x86/boot/compressed/efi_thunk_64.S
+++ b/arch/x86/boot/compressed/efi_thunk_64.S
@@ -5,7 +5,7 @@
* Early support for invoking 32-bit EFI services from a 64-bit kernel.
*
* Because this thunking occurs before ExitBootServices() we have to
- * restore the firmware's 32-bit GDT before we make EFI serivce calls,
+ * restore the firmware's 32-bit GDT before we make EFI service calls,
* since the firmware's 32-bit IDT is still currently installed and it
* needs to be able to service interrupts.
*
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index e94874f4bbc1..a2347ded77ea 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -34,6 +34,7 @@
#include <asm/asm-offsets.h>
#include <asm/bootparam.h>
#include <asm/desc_defs.h>
+#include <asm/trapnr.h>
#include "pgtable.h"
/*
@@ -107,9 +108,19 @@ SYM_FUNC_START(startup_32)
movl %eax, %gs
movl %eax, %ss
-/* setup a stack and make sure cpu supports long mode. */
+ /* Setup a stack and load CS from current GDT */
leal rva(boot_stack_end)(%ebp), %esp
+ pushl $__KERNEL32_CS
+ leal rva(1f)(%ebp), %eax
+ pushl %eax
+ lretl
+1:
+
+ /* Setup Exception handling for SEV-ES */
+ call startup32_load_idt
+
+ /* Make sure cpu supports long mode. */
call verify_cpu
testl %eax, %eax
jnz .Lno_longmode
@@ -172,11 +183,21 @@ SYM_FUNC_START(startup_32)
*/
call get_sev_encryption_bit
xorl %edx, %edx
+#ifdef CONFIG_AMD_MEM_ENCRYPT
testl %eax, %eax
jz 1f
subl $32, %eax /* Encryption bit is always above bit 31 */
bts %eax, %edx /* Set encryption mask for page tables */
+ /*
+ * Mark SEV as active in sev_status so that startup32_check_sev_cbit()
+ * will do a check. The sev_status memory will be fully initialized
+ * with the contents of MSR_AMD_SEV_STATUS later in
+ * set_sev_encryption_mask(). For now it is sufficient to know that SEV
+ * is active.
+ */
+ movl $1, rva(sev_status)(%ebp)
1:
+#endif
/* Initialize Page tables to 0 */
leal rva(pgtable)(%ebx), %edi
@@ -231,7 +252,7 @@ SYM_FUNC_START(startup_32)
/*
* Setup for the jump to 64bit mode
*
- * When the jump is performend we will be in long mode but
+ * When the jump is performed we will be in long mode but
* in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1
* (and in turn EFER.LMA = 1). To jump into 64bit mode we use
* the new gdt/idt that has __KERNEL_CS with CS.L = 1.
@@ -261,6 +282,9 @@ SYM_FUNC_START(startup_32)
movl %esi, %edx
1:
#endif
+ /* Check if the C-bit position is correct when SEV is active */
+ call startup32_check_sev_cbit
+
pushl $__KERNEL_CS
pushl %eax
@@ -694,6 +718,19 @@ SYM_DATA_START(boot_idt)
.endr
SYM_DATA_END_LABEL(boot_idt, SYM_L_GLOBAL, boot_idt_end)
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+SYM_DATA_START(boot32_idt_desc)
+ .word boot32_idt_end - boot32_idt - 1
+ .long 0
+SYM_DATA_END(boot32_idt_desc)
+ .balign 8
+SYM_DATA_START(boot32_idt)
+ .rept 32
+ .quad 0
+ .endr
+SYM_DATA_END_LABEL(boot32_idt, SYM_L_GLOBAL, boot32_idt_end)
+#endif
+
#ifdef CONFIG_EFI_STUB
SYM_DATA(image_offset, .long 0)
#endif
@@ -786,6 +823,137 @@ SYM_DATA_START_LOCAL(loaded_image_proto)
SYM_DATA_END(loaded_image_proto)
#endif
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ __HEAD
+ .code32
+/*
+ * Write an IDT entry into boot32_idt
+ *
+ * Parameters:
+ *
+ * %eax: Handler address
+ * %edx: Vector number
+ *
+ * Physical offset is expected in %ebp
+ */
+SYM_FUNC_START(startup32_set_idt_entry)
+ push %ebx
+ push %ecx
+
+ /* IDT entry address to %ebx */
+ leal rva(boot32_idt)(%ebp), %ebx
+ shl $3, %edx
+ addl %edx, %ebx
+
+ /* Build IDT entry, lower 4 bytes */
+ movl %eax, %edx
+ andl $0x0000ffff, %edx # Target code segment offset [15:0]
+ movl $__KERNEL32_CS, %ecx # Target code segment selector
+ shl $16, %ecx
+ orl %ecx, %edx
+
+ /* Store lower 4 bytes to IDT */
+ movl %edx, (%ebx)
+
+ /* Build IDT entry, upper 4 bytes */
+ movl %eax, %edx
+ andl $0xffff0000, %edx # Target code segment offset [31:16]
+ orl $0x00008e00, %edx # Present, Type 32-bit Interrupt Gate
+
+ /* Store upper 4 bytes to IDT */
+ movl %edx, 4(%ebx)
+
+ pop %ecx
+ pop %ebx
+ ret
+SYM_FUNC_END(startup32_set_idt_entry)
+#endif
+
+SYM_FUNC_START(startup32_load_idt)
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ /* #VC handler */
+ leal rva(startup32_vc_handler)(%ebp), %eax
+ movl $X86_TRAP_VC, %edx
+ call startup32_set_idt_entry
+
+ /* Load IDT */
+ leal rva(boot32_idt)(%ebp), %eax
+ movl %eax, rva(boot32_idt_desc+2)(%ebp)
+ lidt rva(boot32_idt_desc)(%ebp)
+#endif
+ ret
+SYM_FUNC_END(startup32_load_idt)
+
+/*
+ * Check for the correct C-bit position when the startup_32 boot-path is used.
+ *
+ * The check makes use of the fact that all memory is encrypted when paging is
+ * disabled. The function creates 64 bits of random data using the RDRAND
+ * instruction. RDRAND is mandatory for SEV guests, so always available. If the
+ * hypervisor violates that the kernel will crash right here.
+ *
+ * The 64 bits of random data are stored to a memory location and at the same
+ * time kept in the %eax and %ebx registers. Since encryption is always active
+ * when paging is off the random data will be stored encrypted in main memory.
+ *
+ * Then paging is enabled. When the C-bit position is correct all memory is
+ * still mapped encrypted and comparing the register values with memory will
+ * succeed. An incorrect C-bit position will map all memory unencrypted, so that
+ * the compare will use the encrypted random data and fail.
+ */
+SYM_FUNC_START(startup32_check_sev_cbit)
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+
+ /* Check for non-zero sev_status */
+ movl rva(sev_status)(%ebp), %eax
+ testl %eax, %eax
+ jz 4f
+
+ /*
+ * Get two 32-bit random values - Don't bail out if RDRAND fails
+ * because it is better to prevent forward progress if no random value
+ * can be gathered.
+ */
+1: rdrand %eax
+ jnc 1b
+2: rdrand %ebx
+ jnc 2b
+
+ /* Store to memory and keep it in the registers */
+ movl %eax, rva(sev_check_data)(%ebp)
+ movl %ebx, rva(sev_check_data+4)(%ebp)
+
+ /* Enable paging to see if encryption is active */
+ movl %cr0, %edx /* Backup %cr0 in %edx */
+ movl $(X86_CR0_PG | X86_CR0_PE), %ecx /* Enable Paging and Protected mode */
+ movl %ecx, %cr0
+
+ cmpl %eax, rva(sev_check_data)(%ebp)
+ jne 3f
+ cmpl %ebx, rva(sev_check_data+4)(%ebp)
+ jne 3f
+
+ movl %edx, %cr0 /* Restore previous %cr0 */
+
+ jmp 4f
+
+3: /* Check failed - hlt the machine */
+ hlt
+ jmp 3b
+
+4:
+ popl %edx
+ popl %ecx
+ popl %ebx
+ popl %eax
+#endif
+ ret
+SYM_FUNC_END(startup32_check_sev_cbit)
+
/*
* Stack and heap for uncompression
*/
diff --git a/arch/x86/boot/compressed/idt_64.c b/arch/x86/boot/compressed/idt_64.c
index 804a502ee0d2..9b93567d663a 100644
--- a/arch/x86/boot/compressed/idt_64.c
+++ b/arch/x86/boot/compressed/idt_64.c
@@ -52,3 +52,17 @@ void load_stage2_idt(void)
load_boot_idt(&boot_idt_desc);
}
+
+void cleanup_exception_handling(void)
+{
+ /*
+ * Flush GHCB from cache and map it encrypted again when running as
+ * SEV-ES guest.
+ */
+ sev_es_shutdown_ghcb();
+
+ /* Set a null-idt, disabling #PF and #VC handling */
+ boot_idt_desc.size = 0;
+ boot_idt_desc.address = 0;
+ load_boot_idt(&boot_idt_desc);
+}
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index b92fffbe761f..e36690778497 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -639,9 +639,9 @@ static bool process_mem_region(struct mem_vector *region,
if (slot_area_index == MAX_SLOT_AREA) {
debug_putstr("Aborted e820/efi memmap scan (slot_areas full)!\n");
- return 1;
+ return true;
}
- return 0;
+ return false;
}
#if defined(CONFIG_MEMORY_HOTREMOVE) && defined(CONFIG_ACPI)
diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S
index aa561795efd1..c1e81a848b2a 100644
--- a/arch/x86/boot/compressed/mem_encrypt.S
+++ b/arch/x86/boot/compressed/mem_encrypt.S
@@ -23,12 +23,6 @@ SYM_FUNC_START(get_sev_encryption_bit)
push %ecx
push %edx
- /* Check if running under a hypervisor */
- movl $1, %eax
- cpuid
- bt $31, %ecx /* Check the hypervisor bit */
- jnc .Lno_sev
-
movl $0x80000000, %eax /* CPUID to check the highest leaf */
cpuid
cmpl $0x8000001f, %eax /* See if 0x8000001f is available */
@@ -67,10 +61,132 @@ SYM_FUNC_START(get_sev_encryption_bit)
ret
SYM_FUNC_END(get_sev_encryption_bit)
+/**
+ * sev_es_req_cpuid - Request a CPUID value from the Hypervisor using
+ * the GHCB MSR protocol
+ *
+ * @%eax: Register to request (0=EAX, 1=EBX, 2=ECX, 3=EDX)
+ * @%edx: CPUID Function
+ *
+ * Returns 0 in %eax on success, non-zero on failure
+ * %edx returns CPUID value on success
+ */
+SYM_CODE_START_LOCAL(sev_es_req_cpuid)
+ shll $30, %eax
+ orl $0x00000004, %eax
+ movl $MSR_AMD64_SEV_ES_GHCB, %ecx
+ wrmsr
+ rep; vmmcall # VMGEXIT
+ rdmsr
+
+ /* Check response */
+ movl %eax, %ecx
+ andl $0x3ffff000, %ecx # Bits [12-29] MBZ
+ jnz 2f
+
+ /* Check return code */
+ andl $0xfff, %eax
+ cmpl $5, %eax
+ jne 2f
+
+ /* All good - return success */
+ xorl %eax, %eax
+1:
+ ret
+2:
+ movl $-1, %eax
+ jmp 1b
+SYM_CODE_END(sev_es_req_cpuid)
+
+SYM_CODE_START(startup32_vc_handler)
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+
+ /* Keep CPUID function in %ebx */
+ movl %eax, %ebx
+
+ /* Check if error-code == SVM_EXIT_CPUID */
+ cmpl $0x72, 16(%esp)
+ jne .Lfail
+
+ movl $0, %eax # Request CPUID[fn].EAX
+ movl %ebx, %edx # CPUID fn
+ call sev_es_req_cpuid # Call helper
+ testl %eax, %eax # Check return code
+ jnz .Lfail
+ movl %edx, 12(%esp) # Store result
+
+ movl $1, %eax # Request CPUID[fn].EBX
+ movl %ebx, %edx # CPUID fn
+ call sev_es_req_cpuid # Call helper
+ testl %eax, %eax # Check return code
+ jnz .Lfail
+ movl %edx, 8(%esp) # Store result
+
+ movl $2, %eax # Request CPUID[fn].ECX
+ movl %ebx, %edx # CPUID fn
+ call sev_es_req_cpuid # Call helper
+ testl %eax, %eax # Check return code
+ jnz .Lfail
+ movl %edx, 4(%esp) # Store result
+
+ movl $3, %eax # Request CPUID[fn].EDX
+ movl %ebx, %edx # CPUID fn
+ call sev_es_req_cpuid # Call helper
+ testl %eax, %eax # Check return code
+ jnz .Lfail
+ movl %edx, 0(%esp) # Store result
+
+ /*
+ * Sanity check CPUID results from the Hypervisor. See comment in
+ * do_vc_no_ghcb() for more details on why this is necessary.
+ */
+
+ /* Fail if SEV leaf not available in CPUID[0x80000000].EAX */
+ cmpl $0x80000000, %ebx
+ jne .Lcheck_sev
+ cmpl $0x8000001f, 12(%esp)
+ jb .Lfail
+ jmp .Ldone
+
+.Lcheck_sev:
+ /* Fail if SEV bit not set in CPUID[0x8000001f].EAX[1] */
+ cmpl $0x8000001f, %ebx
+ jne .Ldone
+ btl $1, 12(%esp)
+ jnc .Lfail
+
+.Ldone:
+ popl %edx
+ popl %ecx
+ popl %ebx
+ popl %eax
+
+ /* Remove error code */
+ addl $4, %esp
+
+ /* Jump over CPUID instruction */
+ addl $2, (%esp)
+
+ iret
+.Lfail:
+ /* Send terminate request to Hypervisor */
+ movl $0x100, %eax
+ xorl %edx, %edx
+ movl $MSR_AMD64_SEV_ES_GHCB, %ecx
+ wrmsr
+ rep; vmmcall
+
+ /* If request fails, go to hlt loop */
+ hlt
+ jmp .Lfail
+SYM_CODE_END(startup32_vc_handler)
+
.code64
#include "../../kernel/sev_verify_cbit.S"
-
SYM_FUNC_START(set_sev_encryption_mask)
#ifdef CONFIG_AMD_MEM_ENCRYPT
push %rbp
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 267e7f93050e..dde042f64cca 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -430,8 +430,6 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
error("Destination address too large");
#endif
#ifndef CONFIG_RELOCATABLE
- if ((unsigned long)output != LOAD_PHYSICAL_ADDR)
- error("Destination address does not match LOAD_PHYSICAL_ADDR");
if (virt_addr != LOAD_PHYSICAL_ADDR)
error("Destination virtual address changed when not relocatable");
#endif
@@ -443,11 +441,8 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
handle_relocations(output, output_len, virt_addr);
debug_putstr("done.\nBooting the kernel.\n");
- /*
- * Flush GHCB from cache and map it encrypted again when running as
- * SEV-ES guest.
- */
- sev_es_shutdown_ghcb();
+ /* Disable exception handling before booting the kernel */
+ cleanup_exception_handling();
return output;
}
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 901ea5ebec22..e5612f035498 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -155,6 +155,12 @@ extern pteval_t __default_kernel_pte_mask;
extern gate_desc boot_idt[BOOT_IDT_ENTRIES];
extern struct desc_ptr boot_idt_desc;
+#ifdef CONFIG_X86_64
+void cleanup_exception_handling(void);
+#else
+static inline void cleanup_exception_handling(void) { }
+#endif
+
/* IDT Entry Points */
void boot_page_fault(void);
void boot_stage1_vc(void);
diff --git a/arch/x86/boot/compressed/sev-es.c b/arch/x86/boot/compressed/sev-es.c
index 27826c265aab..82041bd380e5 100644
--- a/arch/x86/boot/compressed/sev-es.c
+++ b/arch/x86/boot/compressed/sev-es.c
@@ -78,16 +78,15 @@ static inline void sev_es_wr_ghcb_msr(u64 val)
static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
{
char buffer[MAX_INSN_SIZE];
- enum es_result ret;
+ int ret;
memcpy(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
- insn_init(&ctxt->insn, buffer, MAX_INSN_SIZE, 1);
- insn_get_length(&ctxt->insn);
+ ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64);
+ if (ret < 0)
+ return ES_DECODE_FAILED;
- ret = ctxt->insn.immediate.got ? ES_OK : ES_DECODE_FAILED;
-
- return ret;
+ return ES_OK;
}
static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
@@ -200,14 +199,8 @@ void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code)
}
finish:
- if (result == ES_OK) {
+ if (result == ES_OK)
vc_finish_insn(&ctxt);
- } else if (result != ES_RETRY) {
- /*
- * For now, just halt the machine. That makes debugging easier,
- * later we just call sev_es_terminate() here.
- */
- while (true)
- asm volatile("hlt\n");
- }
+ else if (result != ES_RETRY)
+ sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
}
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index b28e36b7c96b..d0959e7b809f 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -2,8 +2,6 @@
#
# x86 crypto algorithms
-OBJECT_FILES_NON_STANDARD := y
-
obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index 2cf8e94d986a..98e3552b6e03 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -212,10 +212,6 @@ HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsu
#define arg4 %rcx
#define arg5 %r8
#define arg6 %r9
-#define arg7 STACK_OFFSET+8*1(%r14)
-#define arg8 STACK_OFFSET+8*2(%r14)
-#define arg9 STACK_OFFSET+8*3(%r14)
-#define arg10 STACK_OFFSET+8*4(%r14)
#define keysize 2*15*16(arg1)
i = 0
@@ -237,9 +233,6 @@ define_reg j %j
.noaltmacro
.endm
-# need to push 4 registers into stack to maintain
-STACK_OFFSET = 8*4
-
TMP1 = 16*0 # Temporary storage for AAD
TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
TMP3 = 16*2 # Temporary storage for AES State 3
@@ -256,25 +249,22 @@ VARIABLE_OFFSET = 16*8
################################
.macro FUNC_SAVE
- #the number of pushes must equal STACK_OFFSET
push %r12
push %r13
- push %r14
push %r15
- mov %rsp, %r14
-
-
+ push %rbp
+ mov %rsp, %rbp
sub $VARIABLE_OFFSET, %rsp
and $~63, %rsp # align rsp to 64 bytes
.endm
.macro FUNC_RESTORE
- mov %r14, %rsp
+ mov %rbp, %rsp
+ pop %rbp
pop %r15
- pop %r14
pop %r13
pop %r12
.endm
@@ -294,7 +284,7 @@ VARIABLE_OFFSET = 16*8
# combined for GCM encrypt and decrypt functions
# clobbering all xmm registers
-# clobbering r10, r11, r12, r13, r14, r15
+# clobbering r10, r11, r12, r13, r15, rax
.macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
vmovdqu AadHash(arg2), %xmm8
vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey
@@ -996,7 +986,7 @@ _partial_block_done_\@:
## num_initial_blocks = b mod 4#
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
-## arg1, arg3, arg4, r14 are used as a pointer only, not modified
+## arg1, arg2, arg3, arg4 are used as pointers only, not modified
.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
i = (8-\num_initial_blocks)
@@ -1231,7 +1221,7 @@ _initial_blocks_done\@:
# encrypt 8 blocks at a time
# ghash the 8 previously encrypted ciphertext blocks
-# arg1, arg3, arg4 are used as pointers only, not modified
+# arg1, arg2, arg3, arg4 are used as pointers only, not modified
# r11 is the data offset value
.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
@@ -1944,7 +1934,7 @@ SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
## num_initial_blocks = b mod 4#
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
-## arg1, arg3, arg4, r14 are used as a pointer only, not modified
+## arg1, arg2, arg3, arg4 are used as pointers only, not modified
.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
i = (8-\num_initial_blocks)
@@ -2186,7 +2176,7 @@ _initial_blocks_done\@:
# encrypt 8 blocks at a time
# ghash the 8 previously encrypted ciphertext blocks
-# arg1, arg3, arg4 are used as pointers only, not modified
+# arg1, arg2, arg3, arg4 are used as pointers only, not modified
# r11 is the data offset value
.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
index 782e9712a1ec..706f70829a07 100644
--- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
@@ -990,6 +990,7 @@ SYM_FUNC_START(camellia_cbc_dec_32way)
* %rdx: src (32 blocks)
*/
FRAME_BEGIN
+ subq $(16 * 32), %rsp;
vzeroupper;
@@ -1002,7 +1003,6 @@ SYM_FUNC_START(camellia_cbc_dec_32way)
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
%ymm15, %rdx, (key_table)(CTX, %r8, 8));
- movq %rsp, %r10;
cmpq %rsi, %rdx;
je .Lcbc_dec_use_stack;
@@ -1015,7 +1015,6 @@ SYM_FUNC_START(camellia_cbc_dec_32way)
* dst still in-use (because dst == src), so use stack for temporary
* storage.
*/
- subq $(16 * 32), %rsp;
movq %rsp, %rax;
.Lcbc_dec_continue:
@@ -1025,7 +1024,6 @@ SYM_FUNC_START(camellia_cbc_dec_32way)
vpxor %ymm7, %ymm7, %ymm7;
vinserti128 $1, (%rdx), %ymm7, %ymm7;
vpxor (%rax), %ymm7, %ymm7;
- movq %r10, %rsp;
vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
@@ -1047,6 +1045,7 @@ SYM_FUNC_START(camellia_cbc_dec_32way)
vzeroupper;
+ addq $(16 * 32), %rsp;
FRAME_END
ret;
SYM_FUNC_END(camellia_cbc_dec_32way)
diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c
index 7c4c7b2fbf05..98cf3b4e4c9f 100644
--- a/arch/x86/crypto/crc32-pclmul_glue.c
+++ b/arch/x86/crypto/crc32-pclmul_glue.c
@@ -24,7 +24,7 @@
/*
* Copyright 2012 Xyratex Technology Limited
*
- * Wrappers for kernel crypto shash api to pclmulqdq crc32 imlementation.
+ * Wrappers for kernel crypto shash api to pclmulqdq crc32 implementation.
*/
#include <linux/init.h>
#include <linux/module.h>
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index 884dc767b051..ac1f303eed0f 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -53,7 +53,7 @@
.endm
.macro JMPTBL_ENTRY i
-.word crc_\i - crc_array
+.quad crc_\i
.endm
.macro JNC_LESS_THAN j
@@ -168,10 +168,7 @@ continue_block:
xor crc2, crc2
## branch into array
- lea jump_table(%rip), %bufp
- movzwq (%bufp, %rax, 2), len
- lea crc_array(%rip), %bufp
- lea (%bufp, len, 1), %bufp
+ mov jump_table(,%rax,8), %bufp
JMP_NOSPEC bufp
################################################################
diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c
index 5af8021b98ce..6706b6cb1d0f 100644
--- a/arch/x86/crypto/curve25519-x86_64.c
+++ b/arch/x86/crypto/curve25519-x86_64.c
@@ -114,11 +114,11 @@ static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
);
}
-/* Computes the field substraction of two field elements */
+/* Computes the field subtraction of two field elements */
static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
{
asm volatile(
- /* Compute the raw substraction of f1-f2 */
+ /* Compute the raw subtraction of f1-f2 */
" movq 0(%1), %%r8;"
" subq 0(%2), %%r8;"
" movq 8(%1), %%r9;"
@@ -135,7 +135,7 @@ static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
" mov $38, %%rcx;"
" cmovc %%rcx, %%rax;"
- /* Step 2: Substract carry*38 from the original difference */
+ /* Step 2: Subtract carry*38 from the original difference */
" sub %%rax, %%r8;"
" sbb $0, %%r9;"
" sbb $0, %%r10;"
diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c
index 646da46e8d10..1dfb8af48a3c 100644
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -16,7 +16,7 @@
#include <asm/simd.h>
asmlinkage void poly1305_init_x86_64(void *ctx,
- const u8 key[POLY1305_KEY_SIZE]);
+ const u8 key[POLY1305_BLOCK_SIZE]);
asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp,
const size_t len, const u32 padbit);
asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
@@ -81,7 +81,7 @@ static void convert_to_base2_64(void *ctx)
state->is_base2_26 = 0;
}
-static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_KEY_SIZE])
+static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_BLOCK_SIZE])
{
poly1305_init_x86_64(ctx, key);
}
@@ -129,7 +129,7 @@ static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
poly1305_emit_avx(ctx, mac, nonce);
}
-void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
+void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE])
{
poly1305_simd_init(&dctx->h, key);
dctx->s[0] = get_unaligned_le32(&key[16]);
diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
index 1e594d60afa5..5eed620f4676 100644
--- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S
+++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
@@ -645,9 +645,9 @@ _loop3:
RESERVE_STACK = (W_SIZE*4 + 8+24)
/* Align stack */
- mov %rsp, %rbx
+ push %rbp
+ mov %rsp, %rbp
and $~(0x20-1), %rsp
- push %rbx
sub $RESERVE_STACK, %rsp
avx2_zeroupper
@@ -665,8 +665,8 @@ _loop3:
avx2_zeroupper
- add $RESERVE_STACK, %rsp
- pop %rsp
+ mov %rbp, %rsp
+ pop %rbp
pop %r15
pop %r14
diff --git a/arch/x86/crypto/sha1_ni_asm.S b/arch/x86/crypto/sha1_ni_asm.S
index 11efe3a45a1f..5d8415f482bd 100644
--- a/arch/x86/crypto/sha1_ni_asm.S
+++ b/arch/x86/crypto/sha1_ni_asm.S
@@ -59,8 +59,6 @@
#define DATA_PTR %rsi /* 2nd arg */
#define NUM_BLKS %rdx /* 3rd arg */
-#define RSPSAVE %rax
-
/* gcc conversion */
#define FRAME_SIZE 32 /* space for 2x16 bytes */
@@ -96,7 +94,8 @@
.text
.align 32
SYM_FUNC_START(sha1_ni_transform)
- mov %rsp, RSPSAVE
+ push %rbp
+ mov %rsp, %rbp
sub $FRAME_SIZE, %rsp
and $~0xF, %rsp
@@ -288,7 +287,8 @@ SYM_FUNC_START(sha1_ni_transform)
pextrd $3, E0, 1*16(DIGEST_PTR)
.Ldone_hash:
- mov RSPSAVE, %rsp
+ mov %rbp, %rsp
+ pop %rbp
ret
SYM_FUNC_END(sha1_ni_transform)
diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S
index 11ff60c29c8b..4087f7432a7e 100644
--- a/arch/x86/crypto/sha256-avx2-asm.S
+++ b/arch/x86/crypto/sha256-avx2-asm.S
@@ -117,15 +117,13 @@ _XMM_SAVE_SIZE = 0
_INP_END_SIZE = 8
_INP_SIZE = 8
_CTX_SIZE = 8
-_RSP_SIZE = 8
_XFER = 0
_XMM_SAVE = _XFER + _XFER_SIZE
_INP_END = _XMM_SAVE + _XMM_SAVE_SIZE
_INP = _INP_END + _INP_END_SIZE
_CTX = _INP + _INP_SIZE
-_RSP = _CTX + _CTX_SIZE
-STACK_SIZE = _RSP + _RSP_SIZE
+STACK_SIZE = _CTX + _CTX_SIZE
# rotate_Xs
# Rotate values of symbols X0...X3
@@ -533,11 +531,11 @@ SYM_FUNC_START(sha256_transform_rorx)
pushq %r14
pushq %r15
- mov %rsp, %rax
+ push %rbp
+ mov %rsp, %rbp
+
subq $STACK_SIZE, %rsp
and $-32, %rsp # align rsp to 32 byte boundary
- mov %rax, _RSP(%rsp)
-
shl $6, NUM_BLKS # convert to bytes
jz done_hash
@@ -704,7 +702,8 @@ only_one_block:
done_hash:
- mov _RSP(%rsp), %rsp
+ mov %rbp, %rsp
+ pop %rbp
popq %r15
popq %r14
diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S
index 684d58c8bc4f..3d8f0fd4eea8 100644
--- a/arch/x86/crypto/sha512-avx-asm.S
+++ b/arch/x86/crypto/sha512-avx-asm.S
@@ -76,14 +76,10 @@ tmp0 = %rax
W_SIZE = 80*8
# W[t] + K[t] | W[t+1] + K[t+1]
WK_SIZE = 2*8
-RSPSAVE_SIZE = 1*8
-GPRSAVE_SIZE = 5*8
frame_W = 0
frame_WK = frame_W + W_SIZE
-frame_RSPSAVE = frame_WK + WK_SIZE
-frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
-frame_size = frame_GPRSAVE + GPRSAVE_SIZE
+frame_size = frame_WK + WK_SIZE
# Useful QWORD "arrays" for simpler memory references
# MSG, DIGEST, K_t, W_t are arrays
@@ -281,18 +277,18 @@ SYM_FUNC_START(sha512_transform_avx)
test msglen, msglen
je nowork
+ # Save GPRs
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
# Allocate Stack Space
- mov %rsp, %rax
+ push %rbp
+ mov %rsp, %rbp
sub $frame_size, %rsp
and $~(0x20 - 1), %rsp
- mov %rax, frame_RSPSAVE(%rsp)
-
- # Save GPRs
- mov %rbx, frame_GPRSAVE(%rsp)
- mov %r12, frame_GPRSAVE +8*1(%rsp)
- mov %r13, frame_GPRSAVE +8*2(%rsp)
- mov %r14, frame_GPRSAVE +8*3(%rsp)
- mov %r15, frame_GPRSAVE +8*4(%rsp)
updateblock:
@@ -353,15 +349,16 @@ updateblock:
dec msglen
jnz updateblock
- # Restore GPRs
- mov frame_GPRSAVE(%rsp), %rbx
- mov frame_GPRSAVE +8*1(%rsp), %r12
- mov frame_GPRSAVE +8*2(%rsp), %r13
- mov frame_GPRSAVE +8*3(%rsp), %r14
- mov frame_GPRSAVE +8*4(%rsp), %r15
-
# Restore Stack Pointer
- mov frame_RSPSAVE(%rsp), %rsp
+ mov %rbp, %rsp
+ pop %rbp
+
+ # Restore GPRs
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
nowork:
ret
diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S
index 3a44bdcfd583..072cb0f0deae 100644
--- a/arch/x86/crypto/sha512-avx2-asm.S
+++ b/arch/x86/crypto/sha512-avx2-asm.S
@@ -102,17 +102,13 @@ SRND_SIZE = 1*8
INP_SIZE = 1*8
INPEND_SIZE = 1*8
CTX_SIZE = 1*8
-RSPSAVE_SIZE = 1*8
-GPRSAVE_SIZE = 5*8
frame_XFER = 0
frame_SRND = frame_XFER + XFER_SIZE
frame_INP = frame_SRND + SRND_SIZE
frame_INPEND = frame_INP + INP_SIZE
frame_CTX = frame_INPEND + INPEND_SIZE
-frame_RSPSAVE = frame_CTX + CTX_SIZE
-frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
-frame_size = frame_GPRSAVE + GPRSAVE_SIZE
+frame_size = frame_CTX + CTX_SIZE
## assume buffers not aligned
#define VMOVDQ vmovdqu
@@ -570,18 +566,18 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE
# "blocks" is the message length in SHA512 blocks
########################################################################
SYM_FUNC_START(sha512_transform_rorx)
+ # Save GPRs
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
# Allocate Stack Space
- mov %rsp, %rax
+ push %rbp
+ mov %rsp, %rbp
sub $frame_size, %rsp
and $~(0x20 - 1), %rsp
- mov %rax, frame_RSPSAVE(%rsp)
-
- # Save GPRs
- mov %rbx, 8*0+frame_GPRSAVE(%rsp)
- mov %r12, 8*1+frame_GPRSAVE(%rsp)
- mov %r13, 8*2+frame_GPRSAVE(%rsp)
- mov %r14, 8*3+frame_GPRSAVE(%rsp)
- mov %r15, 8*4+frame_GPRSAVE(%rsp)
shl $7, NUM_BLKS # convert to bytes
jz done_hash
@@ -672,15 +668,17 @@ loop2:
done_hash:
-# Restore GPRs
- mov 8*0+frame_GPRSAVE(%rsp), %rbx
- mov 8*1+frame_GPRSAVE(%rsp), %r12
- mov 8*2+frame_GPRSAVE(%rsp), %r13
- mov 8*3+frame_GPRSAVE(%rsp), %r14
- mov 8*4+frame_GPRSAVE(%rsp), %r15
-
# Restore Stack Pointer
- mov frame_RSPSAVE(%rsp), %rsp
+ mov %rbp, %rsp
+ pop %rbp
+
+ # Restore GPRs
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+
ret
SYM_FUNC_END(sha512_transform_rorx)
diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S
index 50812af0b083..bd51c9070bed 100644
--- a/arch/x86/crypto/sha512-ssse3-asm.S
+++ b/arch/x86/crypto/sha512-ssse3-asm.S
@@ -74,14 +74,10 @@ tmp0 = %rax
W_SIZE = 80*8
WK_SIZE = 2*8
-RSPSAVE_SIZE = 1*8
-GPRSAVE_SIZE = 5*8
frame_W = 0
frame_WK = frame_W + W_SIZE
-frame_RSPSAVE = frame_WK + WK_SIZE
-frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
-frame_size = frame_GPRSAVE + GPRSAVE_SIZE
+frame_size = frame_WK + WK_SIZE
# Useful QWORD "arrays" for simpler memory references
# MSG, DIGEST, K_t, W_t are arrays
@@ -283,18 +279,18 @@ SYM_FUNC_START(sha512_transform_ssse3)
test msglen, msglen
je nowork
+ # Save GPRs
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
# Allocate Stack Space
- mov %rsp, %rax
+ push %rbp
+ mov %rsp, %rbp
sub $frame_size, %rsp
and $~(0x20 - 1), %rsp
- mov %rax, frame_RSPSAVE(%rsp)
-
- # Save GPRs
- mov %rbx, frame_GPRSAVE(%rsp)
- mov %r12, frame_GPRSAVE +8*1(%rsp)
- mov %r13, frame_GPRSAVE +8*2(%rsp)
- mov %r14, frame_GPRSAVE +8*3(%rsp)
- mov %r15, frame_GPRSAVE +8*4(%rsp)
updateblock:
@@ -355,15 +351,16 @@ updateblock:
dec msglen
jnz updateblock
- # Restore GPRs
- mov frame_GPRSAVE(%rsp), %rbx
- mov frame_GPRSAVE +8*1(%rsp), %r12
- mov frame_GPRSAVE +8*2(%rsp), %r13
- mov frame_GPRSAVE +8*3(%rsp), %r14
- mov frame_GPRSAVE +8*4(%rsp), %r15
-
# Restore Stack Pointer
- mov frame_RSPSAVE(%rsp), %rsp
+ mov %rbp, %rsp
+ pop %rbp
+
+ # Restore GPRs
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
nowork:
ret
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
index fc23552afe37..bca4cea757ce 100644
--- a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
+++ b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S
@@ -88,7 +88,7 @@
/*
* Combined G1 & G2 function. Reordered with help of rotates to have moves
- * at begining.
+ * at beginning.
*/
#define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \
/* G1,1 && G2,1 */ \
diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index 03725696397c..3507cf2064f1 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -117,7 +117,7 @@ static bool is_blacklisted_cpu(void)
* storing blocks in 64bit registers to allow three blocks to
* be processed parallel. Parallel operation then allows gaining
* more performance than was trade off, on out-of-order CPUs.
- * However Atom does not benefit from this parallellism and
+ * However Atom does not benefit from this parallelism and
* should be blacklisted.
*/
return true;
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 4efd39aacb9f..7b2542b13ebd 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -38,6 +38,7 @@
#ifdef CONFIG_X86_64
__visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs)
{
+ add_random_kstack_offset();
nr = syscall_enter_from_user_mode(regs, nr);
instrumentation_begin();
@@ -83,6 +84,7 @@ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
{
unsigned int nr = syscall_32_enter(regs);
+ add_random_kstack_offset();
/*
* Subtlety here: if ptrace pokes something larger than 2^32-1 into
* orig_ax, the unsigned int return value truncates it. This may
@@ -102,6 +104,7 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
unsigned int nr = syscall_32_enter(regs);
int res;
+ add_random_kstack_offset();
/*
* This cannot use syscall_enter_from_user_mode() as it has to
* fetch EBP before invoking any of the syscall entry work
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index df8c017e6161..ccb9d32768f3 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -20,7 +20,7 @@
* 1C(%esp) - %ds
* 20(%esp) - %es
* 24(%esp) - %fs
- * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS
+ * 28(%esp) - unused -- was %gs on old stackprotector kernels
* 2C(%esp) - orig_eax
* 30(%esp) - %eip
* 34(%esp) - %cs
@@ -40,7 +40,7 @@
#include <asm/processor-flags.h>
#include <asm/irq_vectors.h>
#include <asm/cpufeatures.h>
-#include <asm/alternative-asm.h>
+#include <asm/alternative.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/frame.h>
@@ -53,83 +53,6 @@
#define PTI_SWITCH_MASK (1 << PAGE_SHIFT)
-/*
- * User gs save/restore
- *
- * %gs is used for userland TLS and kernel only uses it for stack
- * canary which is required to be at %gs:20 by gcc. Read the comment
- * at the top of stackprotector.h for more info.
- *
- * Local labels 98 and 99 are used.
- */
-#ifdef CONFIG_X86_32_LAZY_GS
-
- /* unfortunately push/pop can't be no-op */
-.macro PUSH_GS
- pushl $0
-.endm
-.macro POP_GS pop=0
- addl $(4 + \pop), %esp
-.endm
-.macro POP_GS_EX
-.endm
-
- /* all the rest are no-op */
-.macro PTGS_TO_GS
-.endm
-.macro PTGS_TO_GS_EX
-.endm
-.macro GS_TO_REG reg
-.endm
-.macro REG_TO_PTGS reg
-.endm
-.macro SET_KERNEL_GS reg
-.endm
-
-#else /* CONFIG_X86_32_LAZY_GS */
-
-.macro PUSH_GS
- pushl %gs
-.endm
-
-.macro POP_GS pop=0
-98: popl %gs
- .if \pop <> 0
- add $\pop, %esp
- .endif
-.endm
-.macro POP_GS_EX
-.pushsection .fixup, "ax"
-99: movl $0, (%esp)
- jmp 98b
-.popsection
- _ASM_EXTABLE(98b, 99b)
-.endm
-
-.macro PTGS_TO_GS
-98: mov PT_GS(%esp), %gs
-.endm
-.macro PTGS_TO_GS_EX
-.pushsection .fixup, "ax"
-99: movl $0, PT_GS(%esp)
- jmp 98b
-.popsection
- _ASM_EXTABLE(98b, 99b)
-.endm
-
-.macro GS_TO_REG reg
- movl %gs, \reg
-.endm
-.macro REG_TO_PTGS reg
- movl \reg, PT_GS(%esp)
-.endm
-.macro SET_KERNEL_GS reg
- movl $(__KERNEL_STACK_CANARY), \reg
- movl \reg, %gs
-.endm
-
-#endif /* CONFIG_X86_32_LAZY_GS */
-
/* Unconditionally switch to user cr3 */
.macro SWITCH_TO_USER_CR3 scratch_reg:req
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
@@ -209,7 +132,7 @@
*
* Lets build a 5 entry IRET frame after that, such that struct pt_regs
* is complete and in particular regs->sp is correct. This gives us
- * the original 6 enties as gap:
+ * the original 6 entries as gap:
*
* 14*4(%esp) - <previous context>
* 13*4(%esp) - gap / flags
@@ -282,7 +205,7 @@
.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0 unwind_espfix=0
cld
.if \skip_gs == 0
- PUSH_GS
+ pushl $0
.endif
pushl %fs
@@ -307,9 +230,6 @@
movl $(__USER_DS), %edx
movl %edx, %ds
movl %edx, %es
-.if \skip_gs == 0
- SET_KERNEL_GS %edx
-.endif
/* Switch to kernel stack if necessary */
.if \switch_stacks > 0
SWITCH_TO_KERNEL_STACK
@@ -348,7 +268,7 @@
1: popl %ds
2: popl %es
3: popl %fs
- POP_GS \pop
+ addl $(4 + \pop), %esp /* pop the unused "gs" slot */
IRET_FRAME
.pushsection .fixup, "ax"
4: movl $0, (%esp)
@@ -361,7 +281,6 @@
_ASM_EXTABLE(1b, 4b)
_ASM_EXTABLE(2b, 5b)
_ASM_EXTABLE(3b, 6b)
- POP_GS_EX
.endm
.macro RESTORE_ALL_NMI cr3_reg:req pop=0
@@ -430,7 +349,7 @@
* will soon execute iret and the tracer was already set to
* the irqstate after the IRET:
*/
- DISABLE_INTERRUPTS(CLBR_ANY)
+ cli
lss (%esp), %esp /* switch to espfix segment */
.Lend_\@:
#endif /* CONFIG_X86_ESPFIX32 */
@@ -779,7 +698,7 @@ SYM_CODE_START(__switch_to_asm)
#ifdef CONFIG_STACKPROTECTOR
movl TASK_stack_canary(%edx), %ebx
- movl %ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset
+ movl %ebx, PER_CPU_VAR(__stack_chk_guard)
#endif
#ifdef CONFIG_RETPOLINE
@@ -976,7 +895,6 @@ SYM_FUNC_START(entry_SYSENTER_32)
movl PT_EIP(%esp), %edx /* pt_regs->ip */
movl PT_OLDESP(%esp), %ecx /* pt_regs->sp */
1: mov PT_FS(%esp), %fs
- PTGS_TO_GS
popl %ebx /* pt_regs->bx */
addl $2*4, %esp /* skip pt_regs->cx and pt_regs->dx */
@@ -1012,7 +930,6 @@ SYM_FUNC_START(entry_SYSENTER_32)
jmp 1b
.popsection
_ASM_EXTABLE(1b, 2b)
- PTGS_TO_GS_EX
.Lsysenter_fix_flags:
pushl $X86_EFLAGS_FIXED
@@ -1077,7 +994,7 @@ restore_all_switch_stack:
* when returning from IPI handler and when returning from
* scheduler to user-space.
*/
- INTERRUPT_RETURN
+ iret
.section .fixup, "ax"
SYM_CODE_START(asm_iret_error)
@@ -1154,11 +1071,7 @@ SYM_CODE_START_LOCAL_NOALIGN(handle_exception)
SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1
ENCODE_FRAME_POINTER
- /* fixup %gs */
- GS_TO_REG %ecx
movl PT_GS(%esp), %edi # get the function address
- REG_TO_PTGS %ecx
- SET_KERNEL_GS %ecx
/* fixup orig %eax */
movl PT_ORIG_EAX(%esp), %edx # get the error code
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 400908dff42e..a16a5294d55f 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -305,7 +305,7 @@ SYM_CODE_END(ret_from_fork)
.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
#ifdef CONFIG_DEBUG_ENTRY
pushq %rax
- SAVE_FLAGS(CLBR_RAX)
+ SAVE_FLAGS
testl $X86_EFLAGS_IF, %eax
jz .Lokay_\@
ud2
@@ -511,7 +511,7 @@ SYM_CODE_START(\asmsym)
/*
* No need to switch back to the IST stack. The current stack is either
* identical to the stack in the IRET frame or the VC fall-back stack,
- * so it is definitly mapped even with PTI enabled.
+ * so it is definitely mapped even with PTI enabled.
*/
jmp paranoid_exit
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index a1c9f496fca6..f52a443eede0 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -447,3 +447,4 @@
440 i386 process_madvise sys_process_madvise
441 i386 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2
442 i386 mount_setattr sys_mount_setattr
+443 i386 quotactl_path sys_quotactl_path
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 7bf01cbe582f..7eb007b8cab5 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -364,6 +364,7 @@
440 common process_madvise sys_process_madvise
441 common epoll_pwait2 sys_epoll_pwait2
442 common mount_setattr sys_mount_setattr
+443 common quotactl_path sys_quotactl_path
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c
index 2d0f3d8bcc25..edfe9780f6d1 100644
--- a/arch/x86/entry/vdso/vdso2c.c
+++ b/arch/x86/entry/vdso/vdso2c.c
@@ -218,7 +218,7 @@ int main(int argc, char **argv)
/*
* Figure out the struct name. If we're writing to a .so file,
- * generate raw output insted.
+ * generate raw output instead.
*/
name = strdup(argv[3]);
namelen = strlen(name);
diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h
index 1c7cfac7e64a..5264daa8859f 100644
--- a/arch/x86/entry/vdso/vdso2c.h
+++ b/arch/x86/entry/vdso/vdso2c.h
@@ -35,7 +35,7 @@ static void BITSFUNC(extract)(const unsigned char *data, size_t data_len,
if (offset + len > data_len)
fail("section to extract overruns input data");
- fprintf(outfile, "static const unsigned char %s[%lu] = {", name, len);
+ fprintf(outfile, "static const unsigned char %s[%zu] = {", name, len);
BITSFUNC(copy)(outfile, data + offset, len);
fprintf(outfile, "\n};\n\n");
}
diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S
index de1fff7188aa..6ddd7a937b3e 100644
--- a/arch/x86/entry/vdso/vdso32/system_call.S
+++ b/arch/x86/entry/vdso/vdso32/system_call.S
@@ -6,7 +6,7 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/cpufeatures.h>
-#include <asm/alternative-asm.h>
+#include <asm/alternative.h>
.text
.globl __kernel_vsyscall
@@ -29,7 +29,7 @@ __kernel_vsyscall:
* anyone with an AMD CPU, for example). Nonetheless, we try to keep
* it working approximately as well as it ever worked.
*
- * This link may eludicate some of the history:
+ * This link may elucidate some of the history:
* https://android-review.googlesource.com/#/q/Iac3295376d61ef83e713ac9b528f3b50aa780cd7
* personally, I find it hard to understand what's going on there.
*
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 825e829ffff1..235a5794296a 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -358,7 +358,7 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr)
mmap_write_lock(mm);
/*
* Check if we have already mapped vdso blob - fail to prevent
- * abusing from userspace install_speciall_mapping, which may
+ * abusing from userspace install_special_mapping, which may
* not do accounting and rlimit right.
* We could search vma near context.vdso, but it's a slowpath,
* so let's explicitly check all VMAs to be completely sure.
diff --git a/arch/x86/entry/vdso/vsgx.S b/arch/x86/entry/vdso/vsgx.S
index 86a0e94f68df..99dafac992e2 100644
--- a/arch/x86/entry/vdso/vsgx.S
+++ b/arch/x86/entry/vdso/vsgx.S
@@ -137,7 +137,7 @@ SYM_FUNC_START(__vdso_sgx_enter_enclave)
/*
* If the return from callback is zero or negative, return immediately,
- * else re-execute ENCLU with the postive return value interpreted as
+ * else re-execute ENCLU with the positive return value interpreted as
* the requested ENCLU function.
*/
cmp $0, %eax
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index 2c1791c4a518..9687a8aef01c 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -623,7 +623,7 @@ static void amd_pmu_disable_all(void)
/*
* Check each counter for overflow and wait for it to be reset by the
* NMI if it has overflowed. This relies on the fact that all active
- * counters are always enabled when this function is caled and
+ * counters are always enabled when this function is called and
* ARCH_PERFMON_EVENTSEL_INT is always set.
*/
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c
index a573fac1ece8..1c1a7e45dc64 100644
--- a/arch/x86/events/amd/iommu.c
+++ b/arch/x86/events/amd/iommu.c
@@ -82,12 +82,12 @@ static struct attribute_group amd_iommu_events_group = {
};
struct amd_iommu_event_desc {
- struct kobj_attribute attr;
+ struct device_attribute attr;
const char *event;
};
-static ssize_t _iommu_event_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+static ssize_t _iommu_event_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct amd_iommu_event_desc *event =
container_of(attr, struct amd_iommu_event_desc, attr);
diff --git a/arch/x86/events/amd/iommu.h b/arch/x86/events/amd/iommu.h
index 095505c6213b..e6310c635c8b 100644
--- a/arch/x86/events/amd/iommu.h
+++ b/arch/x86/events/amd/iommu.h
@@ -17,7 +17,7 @@
#define IOMMU_PC_DEVID_MATCH_REG 0x20
#define IOMMU_PC_COUNTER_REPORT_REG 0x28
-/* maximun specified bank/counters */
+/* maximum specified bank/counters */
#define PC_MAX_SPEC_BNKS 64
#define PC_MAX_SPEC_CNTRS 16
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index 7f014d450bc2..582c0ffb5e98 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -275,14 +275,14 @@ static struct attribute_group amd_uncore_attr_group = {
};
#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format) \
-static ssize_t __uncore_##_var##_show(struct kobject *kobj, \
- struct kobj_attribute *attr, \
+static ssize_t __uncore_##_var##_show(struct device *dev, \
+ struct device_attribute *attr, \
char *page) \
{ \
BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
return sprintf(page, _format "\n"); \
} \
-static struct kobj_attribute format_attr_##_var = \
+static struct device_attribute format_attr_##_var = \
__ATTR(_name, 0444, __uncore_##_var##_show, NULL)
DEFINE_UNCORE_FORMAT_ATTR(event12, event, "config:0-7,32-35");
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 18df17129695..8e509325c2c3 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -45,13 +45,16 @@
#include "perf_event.h"
struct x86_pmu x86_pmu __read_mostly;
+static struct pmu pmu;
DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
.enabled = 1,
+ .pmu = &pmu,
};
DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key);
DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key);
+DEFINE_STATIC_KEY_FALSE(perf_is_hybrid);
/*
* This here uses DEFINE_STATIC_CALL_NULL() to get a static_call defined
@@ -151,15 +154,16 @@ again:
*/
static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
{
+ struct extra_reg *extra_regs = hybrid(event->pmu, extra_regs);
struct hw_perf_event_extra *reg;
struct extra_reg *er;
reg = &event->hw.extra_reg;
- if (!x86_pmu.extra_regs)
+ if (!extra_regs)
return 0;
- for (er = x86_pmu.extra_regs; er->msr; er++) {
+ for (er = extra_regs; er->msr; er++) {
if (er->event != (config & er->config_mask))
continue;
if (event->attr.config1 & ~er->valid_mask)
@@ -182,16 +186,29 @@ static DEFINE_MUTEX(pmc_reserve_mutex);
#ifdef CONFIG_X86_LOCAL_APIC
+static inline int get_possible_num_counters(void)
+{
+ int i, num_counters = x86_pmu.num_counters;
+
+ if (!is_hybrid())
+ return num_counters;
+
+ for (i = 0; i < x86_pmu.num_hybrid_pmus; i++)
+ num_counters = max_t(int, num_counters, x86_pmu.hybrid_pmu[i].num_counters);
+
+ return num_counters;
+}
+
static bool reserve_pmc_hardware(void)
{
- int i;
+ int i, num_counters = get_possible_num_counters();
- for (i = 0; i < x86_pmu.num_counters; i++) {
+ for (i = 0; i < num_counters; i++) {
if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
goto perfctr_fail;
}
- for (i = 0; i < x86_pmu.num_counters; i++) {
+ for (i = 0; i < num_counters; i++) {
if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
goto eventsel_fail;
}
@@ -202,7 +219,7 @@ eventsel_fail:
for (i--; i >= 0; i--)
release_evntsel_nmi(x86_pmu_config_addr(i));
- i = x86_pmu.num_counters;
+ i = num_counters;
perfctr_fail:
for (i--; i >= 0; i--)
@@ -213,9 +230,9 @@ perfctr_fail:
static void release_pmc_hardware(void)
{
- int i;
+ int i, num_counters = get_possible_num_counters();
- for (i = 0; i < x86_pmu.num_counters; i++) {
+ for (i = 0; i < num_counters; i++) {
release_perfctr_nmi(x86_pmu_event_addr(i));
release_evntsel_nmi(x86_pmu_config_addr(i));
}
@@ -228,7 +245,7 @@ static void release_pmc_hardware(void) {}
#endif
-static bool check_hw_exists(void)
+bool check_hw_exists(struct pmu *pmu, int num_counters, int num_counters_fixed)
{
u64 val, val_fail = -1, val_new= ~0;
int i, reg, reg_fail = -1, ret = 0;
@@ -239,7 +256,7 @@ static bool check_hw_exists(void)
* Check to see if the BIOS enabled any of the counters, if so
* complain and bail.
*/
- for (i = 0; i < x86_pmu.num_counters; i++) {
+ for (i = 0; i < num_counters; i++) {
reg = x86_pmu_config_addr(i);
ret = rdmsrl_safe(reg, &val);
if (ret)
@@ -253,15 +270,15 @@ static bool check_hw_exists(void)
}
}
- if (x86_pmu.num_counters_fixed) {
+ if (num_counters_fixed) {
reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
ret = rdmsrl_safe(reg, &val);
if (ret)
goto msr_fail;
- for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
- if (fixed_counter_disabled(i))
+ for (i = 0; i < num_counters_fixed; i++) {
+ if (fixed_counter_disabled(i, pmu))
continue;
- if (val & (0x03 << i*4)) {
+ if (val & (0x03ULL << i*4)) {
bios_fail = 1;
val_fail = val;
reg_fail = reg;
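The bios_fail test above relies on IA32_FIXED_CTR_CTRL packing one 4-bit control field per fixed counter, with the two low bits of each field being the OS/USR enable bits. A minimal standalone sketch of that decode, with a made-up MSR value and an assumed counter count:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* assumed example IA32_FIXED_CTR_CTRL value: counter 1 enabled for OS+USR, PMI set */
	uint64_t val = 0xb0;
	int num_counters_fixed = 4;	/* assumed fixed counter count */

	for (int i = 0; i < num_counters_fixed; i++) {
		/* same test as the kernel: any enable bit set in this counter's 4-bit field? */
		if (val & (0x03ULL << i * 4))
			printf("fixed counter %d already enabled, field %#llx\n",
			       i, (unsigned long long)((val >> i * 4) & 0xf));
	}
	return 0;
}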
@@ -360,8 +377,7 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
return -EINVAL;
cache_result = array_index_nospec(cache_result, PERF_COUNT_HW_CACHE_RESULT_MAX);
- val = hw_cache_event_ids[cache_type][cache_op][cache_result];
-
+ val = hybrid_var(event->pmu, hw_cache_event_ids)[cache_type][cache_op][cache_result];
if (val == 0)
return -ENOENT;
@@ -369,7 +385,7 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
return -EINVAL;
hwc->config |= val;
- attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
+ attr->config1 = hybrid_var(event->pmu, hw_cache_extra_regs)[cache_type][cache_op][cache_result];
return x86_pmu_extra_regs(val, event);
}
@@ -462,7 +478,7 @@ int x86_setup_perfctr(struct perf_event *event)
local64_set(&hwc->period_left, hwc->sample_period);
}
- if (attr->type == PERF_TYPE_RAW)
+ if (attr->type == event->pmu->type)
return x86_pmu_extra_regs(event->attr.config, event);
if (attr->type == PERF_TYPE_HW_CACHE)
@@ -597,7 +613,7 @@ int x86_pmu_hw_config(struct perf_event *event)
if (!event->attr.exclude_kernel)
event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
- if (event->attr.type == PERF_TYPE_RAW)
+ if (event->attr.type == event->pmu->type)
event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
if (event->attr.sample_period && x86_pmu.limit_period) {
@@ -724,16 +740,33 @@ void x86_pmu_enable_all(int added)
}
}
-static struct pmu pmu;
-
static inline int is_x86_event(struct perf_event *event)
{
- return event->pmu == &pmu;
+ int i;
+
+ if (!is_hybrid())
+ return event->pmu == &pmu;
+
+ for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
+ if (event->pmu == &x86_pmu.hybrid_pmu[i].pmu)
+ return true;
+ }
+
+ return false;
}
-struct pmu *x86_get_pmu(void)
+struct pmu *x86_get_pmu(unsigned int cpu)
{
- return &pmu;
+ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+
+ /*
+ * All CPUs of this hybrid type have gone offline;
+ * x86_get_pmu() should not be invoked in that case.
+ */
+ if (WARN_ON_ONCE(!cpuc->pmu))
+ return &pmu;
+
+ return cpuc->pmu;
}
/*
* Event scheduler state:
@@ -765,7 +798,7 @@ struct perf_sched {
};
/*
- * Initialize interator that runs through all events and counters.
+ * Initialize iterator that runs through all events and counters.
*/
static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
int num, int wmin, int wmax, int gpmax)
@@ -936,6 +969,7 @@ EXPORT_SYMBOL_GPL(perf_assign_events);
int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
+ int num_counters = hybrid(cpuc->pmu, num_counters);
struct event_constraint *c;
struct perf_event *e;
int n0, i, wmin, wmax, unsched = 0;
@@ -1011,7 +1045,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
/* slow path */
if (i != n) {
- int gpmax = x86_pmu.num_counters;
+ int gpmax = num_counters;
/*
* Do not allow scheduling of more than half the available
@@ -1032,7 +1066,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
* the extra Merge events needed by large increment events.
*/
if (x86_pmu.flags & PMU_FL_PAIR) {
- gpmax = x86_pmu.num_counters - cpuc->n_pair;
+ gpmax = num_counters - cpuc->n_pair;
WARN_ON(gpmax <= 0);
}
@@ -1096,8 +1130,9 @@ static void del_nr_metric_event(struct cpu_hw_events *cpuc,
static int collect_event(struct cpu_hw_events *cpuc, struct perf_event *event,
int max_count, int n)
{
+ union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap);
- if (x86_pmu.intel_cap.perf_metrics && add_nr_metric_event(cpuc, event))
+ if (intel_cap.perf_metrics && add_nr_metric_event(cpuc, event))
return -EINVAL;
if (n >= max_count + cpuc->n_metric)
@@ -1118,10 +1153,12 @@ static int collect_event(struct cpu_hw_events *cpuc, struct perf_event *event,
*/
static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
{
+ int num_counters = hybrid(cpuc->pmu, num_counters);
+ int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed);
struct perf_event *event;
int n, max_count;
- max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
+ max_count = num_counters + num_counters_fixed;
/* current number of events already accepted */
n = cpuc->n_events;
@@ -1480,7 +1517,6 @@ static void x86_pmu_start(struct perf_event *event, int flags)
cpuc->events[idx] = event;
__set_bit(idx, cpuc->active_mask);
- __set_bit(idx, cpuc->running);
static_call(x86_pmu_enable)(event);
perf_event_update_userpage(event);
}
@@ -1489,18 +1525,19 @@ void perf_event_print_debug(void)
{
u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
u64 pebs, debugctl;
- struct cpu_hw_events *cpuc;
+ int cpu = smp_processor_id();
+ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+ int num_counters = hybrid(cpuc->pmu, num_counters);
+ int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed);
+ struct event_constraint *pebs_constraints = hybrid(cpuc->pmu, pebs_constraints);
unsigned long flags;
- int cpu, idx;
+ int idx;
- if (!x86_pmu.num_counters)
+ if (!num_counters)
return;
local_irq_save(flags);
- cpu = smp_processor_id();
- cpuc = &per_cpu(cpu_hw_events, cpu);
-
if (x86_pmu.version >= 2) {
rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
@@ -1512,7 +1549,7 @@ void perf_event_print_debug(void)
pr_info("CPU#%d: status: %016llx\n", cpu, status);
pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
- if (x86_pmu.pebs_constraints) {
+ if (pebs_constraints) {
rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs);
}
@@ -1523,7 +1560,7 @@ void perf_event_print_debug(void)
}
pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ for (idx = 0; idx < num_counters; idx++) {
rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
rdmsrl(x86_pmu_event_addr(idx), pmc_count);
@@ -1536,8 +1573,8 @@ void perf_event_print_debug(void)
pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
cpu, idx, prev_left);
}
- for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
- if (fixed_counter_disabled(idx))
+ for (idx = 0; idx < num_counters_fixed; idx++) {
+ if (fixed_counter_disabled(idx, cpuc->pmu))
continue;
rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
@@ -1573,6 +1610,7 @@ void x86_pmu_stop(struct perf_event *event, int flags)
static void x86_pmu_del(struct perf_event *event, int flags)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap);
int i;
/*
@@ -1612,7 +1650,7 @@ static void x86_pmu_del(struct perf_event *event, int flags)
}
cpuc->event_constraint[i-1] = NULL;
--cpuc->n_events;
- if (x86_pmu.intel_cap.perf_metrics)
+ if (intel_cap.perf_metrics)
del_nr_metric_event(cpuc, event);
perf_event_update_userpage(event);
@@ -1822,6 +1860,49 @@ ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr,
pmu_attr->event_str_noht);
}
+ssize_t events_hybrid_sysfs_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page)
+{
+ struct perf_pmu_events_hybrid_attr *pmu_attr =
+ container_of(attr, struct perf_pmu_events_hybrid_attr, attr);
+ struct x86_hybrid_pmu *pmu;
+ const char *str, *next_str;
+ int i;
+
+ if (hweight64(pmu_attr->pmu_type) == 1)
+ return sprintf(page, "%s", pmu_attr->event_str);
+
+ /*
+ * Hybrid PMUs may support the same event name, but with different
+ * event encodings, e.g., the mem-loads event on an Atom PMU has
+ * a different encoding than on a Core PMU.
+ *
+ * The event_str includes all event encodings, separated by ";".
+ * The order of the event encodings must follow
+ * the order of the hybrid PMU index.
+ */
+ pmu = container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
+
+ str = pmu_attr->event_str;
+ for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
+ if (!(x86_pmu.hybrid_pmu[i].cpu_type & pmu_attr->pmu_type))
+ continue;
+ if (x86_pmu.hybrid_pmu[i].cpu_type & pmu->cpu_type) {
+ next_str = strchr(str, ';');
+ if (next_str)
+ return snprintf(page, next_str - str + 1, "%s", str);
+ else
+ return sprintf(page, "%s", str);
+ }
+ str = strchr(str, ';');
+ str++;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(events_hybrid_sysfs_show);
+
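The ';'-separated layout described in the comment above can be illustrated with a small standalone sketch. It walks the string by index only, whereas events_hybrid_sysfs_show() also filters on cpu_type; the sample string and index are made up:

#include <stdio.h>
#include <string.h>

/* Print the event encoding for a given hybrid PMU index out of a
 * ";"-separated event_str (simplified: no cpu_type filtering). */
static int show_encoding(const char *event_str, int pmu_idx, char *page, size_t len)
{
	const char *str = event_str;
	const char *next;

	for (int i = 0; i < pmu_idx; i++) {
		str = strchr(str, ';');
		if (!str)
			return 0;	/* fewer encodings than PMUs */
		str++;
	}
	next = strchr(str, ';');
	if (next)
		return snprintf(page, next - str + 1, "%s", str);
	return snprintf(page, len, "%s", str);
}

int main(void)
{
	/* assumed order: big-core encoding first, Atom encoding second */
	const char *s = "event=0xd0,umask=0x5,ldlat=3;event=0xcd,umask=0x1,ldlat=3";
	char buf[128];

	show_encoding(s, 1, buf, sizeof(buf));
	printf("%s\n", buf);	/* prints the Atom encoding */
	return 0;
}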
EVENT_ATTR(cpu-cycles, CPU_CYCLES );
EVENT_ATTR(instructions, INSTRUCTIONS );
EVENT_ATTR(cache-references, CACHE_REFERENCES );
@@ -1948,6 +2029,37 @@ static void _x86_pmu_read(struct perf_event *event)
x86_perf_event_update(event);
}
+void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed,
+ u64 intel_ctrl)
+{
+ pr_info("... version: %d\n", x86_pmu.version);
+ pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
+ pr_info("... generic registers: %d\n", num_counters);
+ pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
+ pr_info("... max period: %016Lx\n", x86_pmu.max_period);
+ pr_info("... fixed-purpose events: %lu\n",
+ hweight64((((1ULL << num_counters_fixed) - 1)
+ << INTEL_PMC_IDX_FIXED) & intel_ctrl));
+ pr_info("... event mask: %016Lx\n", intel_ctrl);
+}
+
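The "fixed-purpose events" line above only counts the fixed counters whose bits survive in intel_ctrl. A quick standalone check of that expression, with the bit position and sample masks redefined locally for illustration:

#include <stdio.h>
#include <stdint.h>

#define INTEL_PMC_IDX_FIXED 32	/* fixed counters live at bits 32+ of the ctrl mask */

int main(void)
{
	int num_counters_fixed = 4;
	/* assumed intel_ctrl: 6 generic counters plus fixed counters 0-2 */
	uint64_t intel_ctrl = 0x3fULL | (0x7ULL << INTEL_PMC_IDX_FIXED);
	uint64_t fixed = (((1ULL << num_counters_fixed) - 1)
			  << INTEL_PMC_IDX_FIXED) & intel_ctrl;

	/* __builtin_popcountll stands in for the kernel's hweight64() */
	printf("fixed-purpose events: %d\n", __builtin_popcountll(fixed));	/* 3 */
	return 0;
}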
+/*
+ * The generic code is not hybrid friendly. The hybrid_pmu->pmu
+ * of the first registered PMU is unconditionally assigned to
+ * each possible cpuctx->ctx.pmu.
+ * Point each cpuctx->ctx.pmu at the correct hybrid PMU instead.
+ */
+void x86_pmu_update_cpu_context(struct pmu *pmu, int cpu)
+{
+ struct perf_cpu_context *cpuctx;
+
+ if (!pmu->pmu_cpu_context)
+ return;
+
+ cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+ cpuctx->ctx.pmu = pmu;
+}
+
static int __init init_hw_perf_events(void)
{
struct x86_pmu_quirk *quirk;
@@ -1981,7 +2093,7 @@ static int __init init_hw_perf_events(void)
pmu_check_apic();
/* sanity check that the hardware exists or is emulated */
- if (!check_hw_exists())
+ if (!check_hw_exists(&pmu, x86_pmu.num_counters, x86_pmu.num_counters_fixed))
return 0;
pr_cont("%s PMU driver.\n", x86_pmu.name);
@@ -2008,15 +2120,11 @@ static int __init init_hw_perf_events(void)
pmu.attr_update = x86_pmu.attr_update;
- pr_info("... version: %d\n", x86_pmu.version);
- pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
- pr_info("... generic registers: %d\n", x86_pmu.num_counters);
- pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
- pr_info("... max period: %016Lx\n", x86_pmu.max_period);
- pr_info("... fixed-purpose events: %lu\n",
- hweight64((((1ULL << x86_pmu.num_counters_fixed) - 1)
- << INTEL_PMC_IDX_FIXED) & x86_pmu.intel_ctrl));
- pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
+ if (!is_hybrid()) {
+ x86_pmu_show_pmu_cap(x86_pmu.num_counters,
+ x86_pmu.num_counters_fixed,
+ x86_pmu.intel_ctrl);
+ }
if (!x86_pmu.read)
x86_pmu.read = _x86_pmu_read;
@@ -2046,9 +2154,46 @@ static int __init init_hw_perf_events(void)
if (err)
goto out1;
- err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
- if (err)
- goto out2;
+ if (!is_hybrid()) {
+ err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
+ if (err)
+ goto out2;
+ } else {
+ u8 cpu_type = get_this_hybrid_cpu_type();
+ struct x86_hybrid_pmu *hybrid_pmu;
+ int i, j;
+
+ if (!cpu_type && x86_pmu.get_hybrid_cpu_type)
+ cpu_type = x86_pmu.get_hybrid_cpu_type();
+
+ for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
+ hybrid_pmu = &x86_pmu.hybrid_pmu[i];
+
+ hybrid_pmu->pmu = pmu;
+ hybrid_pmu->pmu.type = -1;
+ hybrid_pmu->pmu.attr_update = x86_pmu.attr_update;
+ hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_HETEROGENEOUS_CPUS;
+ hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_EXTENDED_HW_TYPE;
+
+ err = perf_pmu_register(&hybrid_pmu->pmu, hybrid_pmu->name,
+ (hybrid_pmu->cpu_type == hybrid_big) ? PERF_TYPE_RAW : -1);
+ if (err)
+ break;
+
+ if (cpu_type == hybrid_pmu->cpu_type)
+ x86_pmu_update_cpu_context(&hybrid_pmu->pmu, raw_smp_processor_id());
+ }
+
+ if (i < x86_pmu.num_hybrid_pmus) {
+ for (j = 0; j < i; j++)
+ perf_pmu_unregister(&x86_pmu.hybrid_pmu[j].pmu);
+ pr_warn("Failed to register hybrid PMUs\n");
+ kfree(x86_pmu.hybrid_pmu);
+ x86_pmu.hybrid_pmu = NULL;
+ x86_pmu.num_hybrid_pmus = 0;
+ goto out2;
+ }
+ }
return 0;
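With hybrid registration, only the big-core PMU keeps PERF_TYPE_RAW; the other PMUs get dynamic type ids, which is what the event->attr.type == event->pmu->type checks earlier in this file match against. A minimal userspace sketch of resolving such a type id from sysfs and opening a raw event on it; the PMU name and event encoding are illustrative:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

/* Read the dynamic PMU type id exported at /sys/devices/<pmu>/type. */
static int read_pmu_type(const char *pmu)
{
	char path[128], buf[16];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/devices/%s/type", pmu);
	f = fopen(path, "r");
	if (!f)
		return -1;
	if (!fgets(buf, sizeof(buf), f)) {
		fclose(f);
		return -1;
	}
	fclose(f);
	return atoi(buf);
}

int main(void)
{
	struct perf_event_attr attr = { 0 };
	int type = read_pmu_type("cpu_atom");	/* assumed hybrid PMU name */
	int fd;

	if (type < 0)
		return 1;
	attr.size = sizeof(attr);
	attr.type = type;	/* a dynamic type, not PERF_TYPE_RAW */
	attr.config = 0x3c;	/* illustrative raw event encoding */

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	close(fd);
	return 0;
}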
@@ -2173,16 +2318,27 @@ static void free_fake_cpuc(struct cpu_hw_events *cpuc)
kfree(cpuc);
}
-static struct cpu_hw_events *allocate_fake_cpuc(void)
+static struct cpu_hw_events *allocate_fake_cpuc(struct pmu *event_pmu)
{
struct cpu_hw_events *cpuc;
- int cpu = raw_smp_processor_id();
+ int cpu;
cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
if (!cpuc)
return ERR_PTR(-ENOMEM);
cpuc->is_fake = 1;
+ if (is_hybrid()) {
+ struct x86_hybrid_pmu *h_pmu;
+
+ h_pmu = hybrid_pmu(event_pmu);
+ if (cpumask_empty(&h_pmu->supported_cpus))
+ goto error;
+ cpu = cpumask_first(&h_pmu->supported_cpus);
+ } else
+ cpu = raw_smp_processor_id();
+ cpuc->pmu = event_pmu;
+
if (intel_cpuc_prepare(cpuc, cpu))
goto error;
@@ -2201,7 +2357,7 @@ static int validate_event(struct perf_event *event)
struct event_constraint *c;
int ret = 0;
- fake_cpuc = allocate_fake_cpuc();
+ fake_cpuc = allocate_fake_cpuc(event->pmu);
if (IS_ERR(fake_cpuc))
return PTR_ERR(fake_cpuc);
@@ -2235,7 +2391,27 @@ static int validate_group(struct perf_event *event)
struct cpu_hw_events *fake_cpuc;
int ret = -EINVAL, n;
- fake_cpuc = allocate_fake_cpuc();
+ /*
+ * Reject events from different hybrid PMUs.
+ */
+ if (is_hybrid()) {
+ struct perf_event *sibling;
+ struct pmu *pmu = NULL;
+
+ if (is_x86_event(leader))
+ pmu = leader->pmu;
+
+ for_each_sibling_event(sibling, leader) {
+ if (!is_x86_event(sibling))
+ continue;
+ if (!pmu)
+ pmu = sibling->pmu;
+ else if (pmu != sibling->pmu)
+ return ret;
+ }
+ }
+
+ fake_cpuc = allocate_fake_cpuc(event->pmu);
if (IS_ERR(fake_cpuc))
return PTR_ERR(fake_cpuc);
/*
@@ -2263,35 +2439,26 @@ out:
static int x86_pmu_event_init(struct perf_event *event)
{
- struct pmu *tmp;
+ struct x86_hybrid_pmu *pmu = NULL;
int err;
- switch (event->attr.type) {
- case PERF_TYPE_RAW:
- case PERF_TYPE_HARDWARE:
- case PERF_TYPE_HW_CACHE:
- break;
-
- default:
+ if ((event->attr.type != event->pmu->type) &&
+ (event->attr.type != PERF_TYPE_HARDWARE) &&
+ (event->attr.type != PERF_TYPE_HW_CACHE))
return -ENOENT;
+
+ if (is_hybrid() && (event->cpu != -1)) {
+ pmu = hybrid_pmu(event->pmu);
+ if (!cpumask_test_cpu(event->cpu, &pmu->supported_cpus))
+ return -ENOENT;
}
err = __x86_pmu_event_init(event);
if (!err) {
- /*
- * we temporarily connect event to its pmu
- * such that validate_group() can classify
- * it as an x86 event using is_x86_event()
- */
- tmp = event->pmu;
- event->pmu = &pmu;
-
if (event->group_leader != event)
err = validate_group(event);
else
err = validate_event(event);
-
- event->pmu = tmp;
}
if (err) {
if (event->destroy)
@@ -2475,6 +2642,14 @@ static int x86_pmu_aux_output_match(struct perf_event *event)
return 0;
}
+static int x86_pmu_filter_match(struct perf_event *event)
+{
+ if (x86_pmu.filter_match)
+ return x86_pmu.filter_match(event);
+
+ return 1;
+}
+
static struct pmu pmu = {
.pmu_enable = x86_pmu_enable,
.pmu_disable = x86_pmu_disable,
@@ -2502,6 +2677,8 @@ static struct pmu pmu = {
.check_period = x86_pmu_check_period,
.aux_output_match = x86_pmu_aux_output_match,
+
+ .filter_match = x86_pmu_filter_match,
};
void arch_perf_update_userpage(struct perf_event *event,
@@ -2770,6 +2947,11 @@ unsigned long perf_misc_flags(struct pt_regs *regs)
void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
{
cap->version = x86_pmu.version;
+ /*
+ * KVM doesn't support the hybrid PMU yet.
+ * Return the common value in global x86_pmu,
+ * which is available for all cores.
+ */
cap->num_counters_gp = x86_pmu.num_counters;
cap->num_counters_fixed = x86_pmu.num_counters_fixed;
cap->bit_width_gp = x86_pmu.cntval_bits;
diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile
index e67a5886336c..10bde6c5abb2 100644
--- a/arch/x86/events/intel/Makefile
+++ b/arch/x86/events/intel/Makefile
@@ -3,6 +3,6 @@ obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o
obj-$(CONFIG_CPU_SUP_INTEL) += ds.o knc.o
obj-$(CONFIG_CPU_SUP_INTEL) += lbr.o p4.o p6.o pt.o
obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel-uncore.o
-intel-uncore-objs := uncore.o uncore_nhmex.o uncore_snb.o uncore_snbep.o
+intel-uncore-objs := uncore.o uncore_nhmex.o uncore_snb.o uncore_snbep.o uncore_discovery.o
obj-$(CONFIG_PERF_EVENTS_INTEL_CSTATE) += intel-cstate.o
intel-cstate-objs := cstate.o
diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index 731dd8d0dbb1..6320d2cfd9d3 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -594,7 +594,7 @@ static __init int bts_init(void)
* we cannot use the user mapping since it will not be available
* if we're not running the owning process.
*
- * With PTI we can't use the kernal map either, because its not
+ * With PTI we can't use the kernel map either, because its not
* there when we run userspace.
*
* For now, disable this driver when using PTI.
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 37ce38403cb8..2521d03de5e0 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -137,7 +137,7 @@ static struct event_constraint intel_ivb_event_constraints[] __read_mostly =
FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), /* L1D_PEND_MISS.PENDING */
- INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMTPY */
+ INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMPTY */
INTEL_UEVENT_CONSTRAINT(0x019c, 0xf), /* IDQ_UOPS_NOT_DELIVERED.CORE */
INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_LDM_PENDING */
INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
@@ -2076,6 +2076,14 @@ static struct extra_reg intel_tnt_extra_regs[] __read_mostly = {
EVENT_EXTRA_END
};
+static struct extra_reg intel_grt_extra_regs[] __read_mostly = {
+ /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+ INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0),
+ INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1),
+ INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x5d0),
+ EVENT_EXTRA_END
+};
+
#define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */
#define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */
#define KNL_MCDRAM_LOCAL BIT_ULL(21)
@@ -2153,10 +2161,11 @@ static void intel_pmu_disable_all(void)
static void __intel_pmu_enable_all(int added, bool pmi)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl);
intel_pmu_lbr_enable_all(pmi);
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL,
- x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
+ intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
struct perf_event *event =
@@ -2186,7 +2195,7 @@ static void intel_pmu_enable_all(int added)
* magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either
* in sequence on the same PMC or on different PMCs.
*
- * In practise it appears some of these events do in fact count, and
+ * In practice it appears some of these events do in fact count, and
* we need to program all 4 events.
*/
static void intel_pmu_nhm_workaround(void)
@@ -2429,13 +2438,23 @@ static int icl_set_topdown_event_period(struct perf_event *event)
return 0;
}
+static int adl_set_topdown_event_period(struct perf_event *event)
+{
+ struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
+
+ if (pmu->cpu_type != hybrid_big)
+ return 0;
+
+ return icl_set_topdown_event_period(event);
+}
+
static inline u64 icl_get_metrics_event_value(u64 metric, u64 slots, int idx)
{
u32 val;
/*
* The metric is reported as an 8bit integer fraction
- * suming up to 0xff.
+ * summing up to 0xff.
* slots-in-metric = (Metric / 0xff) * slots
*/
val = (metric >> ((idx - INTEL_PMC_IDX_METRIC_BASE) * 8)) & 0xff;
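A worked instance of the fraction described above, with made-up SLOTS and PERF_METRICS field values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t slots = 1000000;	/* assumed SLOTS count */
	uint64_t metric_byte = 0x40;	/* one 8-bit PERF_METRICS field, i.e. 64/255 of the slots */

	/* slots-in-metric = (Metric / 0xff) * slots, in integer arithmetic */
	uint64_t slots_in_metric = (metric_byte * slots) / 0xff;

	printf("%llu\n", (unsigned long long)slots_in_metric);	/* 250980 */
	return 0;
}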
@@ -2569,6 +2588,17 @@ static u64 icl_update_topdown_event(struct perf_event *event)
x86_pmu.num_topdown_events - 1);
}
+static u64 adl_update_topdown_event(struct perf_event *event)
+{
+ struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
+
+ if (pmu->cpu_type != hybrid_big)
+ return 0;
+
+ return icl_update_topdown_event(event);
+}
+
+
static void intel_pmu_read_topdown_event(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -2709,22 +2739,25 @@ int intel_pmu_save_and_restart(struct perf_event *event)
static void intel_pmu_reset(void)
{
struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed);
+ int num_counters = hybrid(cpuc->pmu, num_counters);
unsigned long flags;
int idx;
- if (!x86_pmu.num_counters)
+ if (!num_counters)
return;
local_irq_save(flags);
pr_info("clearing PMU state on CPU#%d\n", smp_processor_id());
- for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ for (idx = 0; idx < num_counters; idx++) {
wrmsrl_safe(x86_pmu_config_addr(idx), 0ull);
wrmsrl_safe(x86_pmu_event_addr(idx), 0ull);
}
- for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
- if (fixed_counter_disabled(idx))
+ for (idx = 0; idx < num_counters_fixed; idx++) {
+ if (fixed_counter_disabled(idx, cpuc->pmu))
continue;
wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
}
@@ -2753,6 +2786,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
int bit;
int handled = 0;
+ u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl);
inc_irq_stat(apic_perf_irqs);
@@ -2776,7 +2810,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
* processing loop coming after that the function, otherwise
* phony regular samples may be generated in the sampling buffer
* not marked with the EXACT tag. Another possibility is to have
- * one PEBS event and at least one non-PEBS event whic hoverflows
+ * one PEBS event and at least one non-PEBS event which overflows
* while PEBS has armed. In this case, bit 62 of GLOBAL_STATUS will
* not be set, yet the overflow status bit for the PEBS counter will
* be on Skylake.
@@ -2798,7 +2832,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
handled++;
x86_pmu.drain_pebs(regs, &data);
- status &= x86_pmu.intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI;
+ status &= intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI;
/*
* PMI throttle may be triggered, which stops the PEBS event.
@@ -2824,7 +2858,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
}
/*
- * Intel Perf mertrics
+ * Intel Perf metrics
*/
if (__test_and_clear_bit(GLOBAL_STATUS_PERF_METRICS_OVF_BIT, (unsigned long *)&status)) {
handled++;
@@ -2961,8 +2995,10 @@ intel_vlbr_constraints(struct perf_event *event)
return NULL;
}
-static int intel_alt_er(int idx, u64 config)
+static int intel_alt_er(struct cpu_hw_events *cpuc,
+ int idx, u64 config)
{
+ struct extra_reg *extra_regs = hybrid(cpuc->pmu, extra_regs);
int alt_idx = idx;
if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1))
@@ -2974,7 +3010,7 @@ static int intel_alt_er(int idx, u64 config)
if (idx == EXTRA_REG_RSP_1)
alt_idx = EXTRA_REG_RSP_0;
- if (config & ~x86_pmu.extra_regs[alt_idx].valid_mask)
+ if (config & ~extra_regs[alt_idx].valid_mask)
return idx;
return alt_idx;
@@ -2982,15 +3018,16 @@ static int intel_alt_er(int idx, u64 config)
static void intel_fixup_er(struct perf_event *event, int idx)
{
+ struct extra_reg *extra_regs = hybrid(event->pmu, extra_regs);
event->hw.extra_reg.idx = idx;
if (idx == EXTRA_REG_RSP_0) {
event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
- event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_0].event;
+ event->hw.config |= extra_regs[EXTRA_REG_RSP_0].event;
event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
} else if (idx == EXTRA_REG_RSP_1) {
event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
- event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_1].event;
+ event->hw.config |= extra_regs[EXTRA_REG_RSP_1].event;
event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
}
}
@@ -3066,7 +3103,7 @@ again:
*/
c = NULL;
} else {
- idx = intel_alt_er(idx, reg->config);
+ idx = intel_alt_er(cpuc, idx, reg->config);
if (idx != reg->idx) {
raw_spin_unlock_irqrestore(&era->lock, flags);
goto again;
@@ -3131,10 +3168,11 @@ struct event_constraint *
x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
struct perf_event *event)
{
+ struct event_constraint *event_constraints = hybrid(cpuc->pmu, event_constraints);
struct event_constraint *c;
- if (x86_pmu.event_constraints) {
- for_each_event_constraint(c, x86_pmu.event_constraints) {
+ if (event_constraints) {
+ for_each_event_constraint(c, event_constraints) {
if (constraint_match(c, event->hw.config)) {
event->hw.flags |= c->flags;
return c;
@@ -3142,7 +3180,7 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
}
}
- return &unconstrained;
+ return &hybrid_var(cpuc->pmu, unconstrained);
}
static struct event_constraint *
@@ -3646,6 +3684,23 @@ static inline bool is_mem_loads_aux_event(struct perf_event *event)
return (event->attr.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0x03, .umask=0x82);
}
+static inline bool require_mem_loads_aux_event(struct perf_event *event)
+{
+ if (!(x86_pmu.flags & PMU_FL_MEM_LOADS_AUX))
+ return false;
+
+ if (is_hybrid())
+ return hybrid_pmu(event->pmu)->cpu_type == hybrid_big;
+
+ return true;
+}
+
+static inline bool intel_pmu_has_cap(struct perf_event *event, int idx)
+{
+ union perf_capabilities *intel_cap = &hybrid(event->pmu, intel_cap);
+
+ return test_bit(idx, (unsigned long *)&intel_cap->capabilities);
+}
static int intel_pmu_hw_config(struct perf_event *event)
{
@@ -3702,7 +3757,8 @@ static int intel_pmu_hw_config(struct perf_event *event)
event->hw.flags |= PERF_X86_EVENT_PEBS_VIA_PT;
}
- if (event->attr.type != PERF_TYPE_RAW)
+ if ((event->attr.type == PERF_TYPE_HARDWARE) ||
+ (event->attr.type == PERF_TYPE_HW_CACHE))
return 0;
/*
@@ -3715,7 +3771,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
* with a slots event as group leader. When the slots event
* is used in a metrics group, it too cannot support sampling.
*/
- if (x86_pmu.intel_cap.perf_metrics && is_topdown_event(event)) {
+ if (intel_pmu_has_cap(event, PERF_CAP_METRICS_IDX) && is_topdown_event(event)) {
if (event->attr.config1 || event->attr.config2)
return -EINVAL;
@@ -3766,7 +3822,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
* event. The rule is to simplify the implementation of the check.
* That's because perf cannot have a complete group at the moment.
*/
- if (x86_pmu.flags & PMU_FL_MEM_LOADS_AUX &&
+ if (require_mem_loads_aux_event(event) &&
(event->attr.sample_type & PERF_SAMPLE_DATA_SRC) &&
is_mem_loads_event(event)) {
struct perf_event *leader = event->group_leader;
@@ -3801,10 +3857,11 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
+ u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl);
arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL;
- arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask;
- arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask;
+ arr[0].host = intel_ctrl & ~cpuc->intel_ctrl_guest_mask;
+ arr[0].guest = intel_ctrl & ~cpuc->intel_ctrl_host_mask;
if (x86_pmu.flags & PMU_FL_PEBS_ALL)
arr[0].guest &= ~cpuc->pebs_enabled;
else
@@ -4042,6 +4099,39 @@ tfa_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
return c;
}
+static struct event_constraint *
+adl_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
+
+ if (pmu->cpu_type == hybrid_big)
+ return spr_get_event_constraints(cpuc, idx, event);
+ else if (pmu->cpu_type == hybrid_small)
+ return tnt_get_event_constraints(cpuc, idx, event);
+
+ WARN_ON(1);
+ return &emptyconstraint;
+}
+
+static int adl_hw_config(struct perf_event *event)
+{
+ struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
+
+ if (pmu->cpu_type == hybrid_big)
+ return hsw_hw_config(event);
+ else if (pmu->cpu_type == hybrid_small)
+ return intel_pmu_hw_config(event);
+
+ WARN_ON(1);
+ return -EOPNOTSUPP;
+}
+
+static u8 adl_get_hybrid_cpu_type(void)
+{
+ return hybrid_big;
+}
+
/*
* Broadwell:
*
@@ -4145,7 +4235,7 @@ int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu)
{
cpuc->pebs_record_size = x86_pmu.pebs_record_size;
- if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
+ if (is_hybrid() || x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
cpuc->shared_regs = allocate_shared_regs(cpu);
if (!cpuc->shared_regs)
goto err;
@@ -4199,12 +4289,62 @@ static void flip_smm_bit(void *data)
}
}
+static bool init_hybrid_pmu(int cpu)
+{
+ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+ u8 cpu_type = get_this_hybrid_cpu_type();
+ struct x86_hybrid_pmu *pmu = NULL;
+ int i;
+
+ if (!cpu_type && x86_pmu.get_hybrid_cpu_type)
+ cpu_type = x86_pmu.get_hybrid_cpu_type();
+
+ for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
+ if (x86_pmu.hybrid_pmu[i].cpu_type == cpu_type) {
+ pmu = &x86_pmu.hybrid_pmu[i];
+ break;
+ }
+ }
+ if (WARN_ON_ONCE(!pmu || (pmu->pmu.type == -1))) {
+ cpuc->pmu = NULL;
+ return false;
+ }
+
+ /* Only check and dump the PMU information for the first CPU */
+ if (!cpumask_empty(&pmu->supported_cpus))
+ goto end;
+
+ if (!check_hw_exists(&pmu->pmu, pmu->num_counters, pmu->num_counters_fixed))
+ return false;
+
+ pr_info("%s PMU driver: ", pmu->name);
+
+ if (pmu->intel_cap.pebs_output_pt_available)
+ pr_cont("PEBS-via-PT ");
+
+ pr_cont("\n");
+
+ x86_pmu_show_pmu_cap(pmu->num_counters, pmu->num_counters_fixed,
+ pmu->intel_ctrl);
+
+end:
+ cpumask_set_cpu(cpu, &pmu->supported_cpus);
+ cpuc->pmu = &pmu->pmu;
+
+ x86_pmu_update_cpu_context(&pmu->pmu, cpu);
+
+ return true;
+}
+
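get_this_hybrid_cpu_type() reports the native core type, which the CPU exposes through CPUID leaf 0x1A. A standalone sketch of that lookup; the 0x20/0x40 encoding noted in the comment is an assumption of this sketch:

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax = 0, ebx, ecx, edx;

	/* CPUID.1AH:EAX[31:24] is the core type on hybrid parts, 0 otherwise */
	if (__get_cpuid_count(0x1a, 0, &eax, &ebx, &ecx, &edx))
		printf("core type: %#x\n", eax >> 24);	/* assumed: 0x20 = Atom, 0x40 = Core */
	else
		printf("CPUID leaf 0x1a not available\n");
	return 0;
}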
static void intel_pmu_cpu_starting(int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
int core_id = topology_core_id(cpu);
int i;
+ if (is_hybrid() && !init_hybrid_pmu(cpu))
+ return;
+
init_debug_store_on_cpu(cpu);
/*
* Deal with CPUs that don't clear their LBRs on power-up.
@@ -4222,8 +4362,16 @@ static void intel_pmu_cpu_starting(int cpu)
if (x86_pmu.version > 1)
flip_smm_bit(&x86_pmu.attr_freeze_on_smi);
- /* Disable perf metrics if any added CPU doesn't support it. */
- if (x86_pmu.intel_cap.perf_metrics) {
+ /*
+ * Disable perf metrics if any added CPU doesn't support it.
+ *
+ * Turn off the check for a hybrid architecture, because the
+ * architecture MSR, MSR_IA32_PERF_CAPABILITIES, only indicates
+ * the architectural features. Perf metrics is a model-specific
+ * feature for now. The corresponding bit should always be 0 on
+ * a hybrid platform, e.g., Alder Lake.
+ */
+ if (!is_hybrid() && x86_pmu.intel_cap.perf_metrics) {
union perf_capabilities perf_cap;
rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap.capabilities);
@@ -4310,7 +4458,12 @@ void intel_cpuc_finish(struct cpu_hw_events *cpuc)
static void intel_pmu_cpu_dead(int cpu)
{
- intel_cpuc_finish(&per_cpu(cpu_hw_events, cpu));
+ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+
+ intel_cpuc_finish(cpuc);
+
+ if (is_hybrid() && cpuc->pmu)
+ cpumask_clear_cpu(cpu, &hybrid_pmu(cpuc->pmu)->supported_cpus);
}
static void intel_pmu_sched_task(struct perf_event_context *ctx,
@@ -4339,6 +4492,14 @@ static int intel_pmu_aux_output_match(struct perf_event *event)
return is_intel_pt_event(event);
}
+static int intel_pmu_filter_match(struct perf_event *event)
+{
+ struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
+ unsigned int cpu = smp_processor_id();
+
+ return cpumask_test_cpu(cpu, &pmu->supported_cpus);
+}
+
PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
PMU_FORMAT_ATTR(ldlat, "config1:0-15");
@@ -4516,7 +4677,7 @@ static const struct x86_cpu_desc isolation_ucodes[] = {
INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 3, 0x07000009),
INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 4, 0x0f000009),
INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 5, 0x0e000002),
- INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_X, 2, 0x0b000014),
+ INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_X, 1, 0x0b000014),
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 3, 0x00000021),
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 4, 0x00000000),
INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 5, 0x00000000),
@@ -4594,7 +4755,7 @@ static bool check_msr(unsigned long msr, u64 mask)
/*
* Disable the check for real HW, so we don't
- * mess with potentionaly enabled registers:
+ * mess with potentially enabled registers:
*/
if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
return true;
@@ -4659,7 +4820,7 @@ static __init void intel_arch_events_quirk(void)
{
int bit;
- /* disable event that reported as not presend by cpuid */
+ /* disable event that reported as not present by cpuid */
for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) {
intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0;
pr_warn("CPUID marked event: \'%s\' unavailable\n",
@@ -4879,7 +5040,7 @@ static void update_tfa_sched(void *ignored)
* and if so force schedule out for all event types all contexts
*/
if (test_bit(3, cpuc->active_mask))
- perf_pmu_resched(x86_get_pmu());
+ perf_pmu_resched(x86_get_pmu(smp_processor_id()));
}
static ssize_t show_sysctl_tfa(struct device *cdev,
@@ -5041,8 +5202,299 @@ static const struct attribute_group *attr_update[] = {
NULL,
};
+EVENT_ATTR_STR_HYBRID(slots, slots_adl, "event=0x00,umask=0x4", hybrid_big);
+EVENT_ATTR_STR_HYBRID(topdown-retiring, td_retiring_adl, "event=0xc2,umask=0x0;event=0x00,umask=0x80", hybrid_big_small);
+EVENT_ATTR_STR_HYBRID(topdown-bad-spec, td_bad_spec_adl, "event=0x73,umask=0x0;event=0x00,umask=0x81", hybrid_big_small);
+EVENT_ATTR_STR_HYBRID(topdown-fe-bound, td_fe_bound_adl, "event=0x71,umask=0x0;event=0x00,umask=0x82", hybrid_big_small);
+EVENT_ATTR_STR_HYBRID(topdown-be-bound, td_be_bound_adl, "event=0x74,umask=0x0;event=0x00,umask=0x83", hybrid_big_small);
+EVENT_ATTR_STR_HYBRID(topdown-heavy-ops, td_heavy_ops_adl, "event=0x00,umask=0x84", hybrid_big);
+EVENT_ATTR_STR_HYBRID(topdown-br-mispredict, td_br_mis_adl, "event=0x00,umask=0x85", hybrid_big);
+EVENT_ATTR_STR_HYBRID(topdown-fetch-lat, td_fetch_lat_adl, "event=0x00,umask=0x86", hybrid_big);
+EVENT_ATTR_STR_HYBRID(topdown-mem-bound, td_mem_bound_adl, "event=0x00,umask=0x87", hybrid_big);
+
+static struct attribute *adl_hybrid_events_attrs[] = {
+ EVENT_PTR(slots_adl),
+ EVENT_PTR(td_retiring_adl),
+ EVENT_PTR(td_bad_spec_adl),
+ EVENT_PTR(td_fe_bound_adl),
+ EVENT_PTR(td_be_bound_adl),
+ EVENT_PTR(td_heavy_ops_adl),
+ EVENT_PTR(td_br_mis_adl),
+ EVENT_PTR(td_fetch_lat_adl),
+ EVENT_PTR(td_mem_bound_adl),
+ NULL,
+};
+
+/* Must be in IDX order */
+EVENT_ATTR_STR_HYBRID(mem-loads, mem_ld_adl, "event=0xd0,umask=0x5,ldlat=3;event=0xcd,umask=0x1,ldlat=3", hybrid_big_small);
+EVENT_ATTR_STR_HYBRID(mem-stores, mem_st_adl, "event=0xd0,umask=0x6;event=0xcd,umask=0x2", hybrid_big_small);
+EVENT_ATTR_STR_HYBRID(mem-loads-aux, mem_ld_aux_adl, "event=0x03,umask=0x82", hybrid_big);
+
+static struct attribute *adl_hybrid_mem_attrs[] = {
+ EVENT_PTR(mem_ld_adl),
+ EVENT_PTR(mem_st_adl),
+ EVENT_PTR(mem_ld_aux_adl),
+ NULL,
+};
+
+EVENT_ATTR_STR_HYBRID(tx-start, tx_start_adl, "event=0xc9,umask=0x1", hybrid_big);
+EVENT_ATTR_STR_HYBRID(tx-commit, tx_commit_adl, "event=0xc9,umask=0x2", hybrid_big);
+EVENT_ATTR_STR_HYBRID(tx-abort, tx_abort_adl, "event=0xc9,umask=0x4", hybrid_big);
+EVENT_ATTR_STR_HYBRID(tx-conflict, tx_conflict_adl, "event=0x54,umask=0x1", hybrid_big);
+EVENT_ATTR_STR_HYBRID(cycles-t, cycles_t_adl, "event=0x3c,in_tx=1", hybrid_big);
+EVENT_ATTR_STR_HYBRID(cycles-ct, cycles_ct_adl, "event=0x3c,in_tx=1,in_tx_cp=1", hybrid_big);
+EVENT_ATTR_STR_HYBRID(tx-capacity-read, tx_capacity_read_adl, "event=0x54,umask=0x80", hybrid_big);
+EVENT_ATTR_STR_HYBRID(tx-capacity-write, tx_capacity_write_adl, "event=0x54,umask=0x2", hybrid_big);
+
+static struct attribute *adl_hybrid_tsx_attrs[] = {
+ EVENT_PTR(tx_start_adl),
+ EVENT_PTR(tx_abort_adl),
+ EVENT_PTR(tx_commit_adl),
+ EVENT_PTR(tx_capacity_read_adl),
+ EVENT_PTR(tx_capacity_write_adl),
+ EVENT_PTR(tx_conflict_adl),
+ EVENT_PTR(cycles_t_adl),
+ EVENT_PTR(cycles_ct_adl),
+ NULL,
+};
+
+FORMAT_ATTR_HYBRID(in_tx, hybrid_big);
+FORMAT_ATTR_HYBRID(in_tx_cp, hybrid_big);
+FORMAT_ATTR_HYBRID(offcore_rsp, hybrid_big_small);
+FORMAT_ATTR_HYBRID(ldlat, hybrid_big_small);
+FORMAT_ATTR_HYBRID(frontend, hybrid_big);
+
+static struct attribute *adl_hybrid_extra_attr_rtm[] = {
+ FORMAT_HYBRID_PTR(in_tx),
+ FORMAT_HYBRID_PTR(in_tx_cp),
+ FORMAT_HYBRID_PTR(offcore_rsp),
+ FORMAT_HYBRID_PTR(ldlat),
+ FORMAT_HYBRID_PTR(frontend),
+ NULL,
+};
+
+static struct attribute *adl_hybrid_extra_attr[] = {
+ FORMAT_HYBRID_PTR(offcore_rsp),
+ FORMAT_HYBRID_PTR(ldlat),
+ FORMAT_HYBRID_PTR(frontend),
+ NULL,
+};
+
+static bool is_attr_for_this_pmu(struct kobject *kobj, struct attribute *attr)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct x86_hybrid_pmu *pmu =
+ container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
+ struct perf_pmu_events_hybrid_attr *pmu_attr =
+ container_of(attr, struct perf_pmu_events_hybrid_attr, attr.attr);
+
+ return pmu->cpu_type & pmu_attr->pmu_type;
+}
+
+static umode_t hybrid_events_is_visible(struct kobject *kobj,
+ struct attribute *attr, int i)
+{
+ return is_attr_for_this_pmu(kobj, attr) ? attr->mode : 0;
+}
+
+static inline int hybrid_find_supported_cpu(struct x86_hybrid_pmu *pmu)
+{
+ int cpu = cpumask_first(&pmu->supported_cpus);
+
+ return (cpu >= nr_cpu_ids) ? -1 : cpu;
+}
+
+static umode_t hybrid_tsx_is_visible(struct kobject *kobj,
+ struct attribute *attr, int i)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct x86_hybrid_pmu *pmu =
+ container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
+ int cpu = hybrid_find_supported_cpu(pmu);
+
+ return (cpu >= 0) && is_attr_for_this_pmu(kobj, attr) && cpu_has(&cpu_data(cpu), X86_FEATURE_RTM) ? attr->mode : 0;
+}
+
+static umode_t hybrid_format_is_visible(struct kobject *kobj,
+ struct attribute *attr, int i)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct x86_hybrid_pmu *pmu =
+ container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
+ struct perf_pmu_format_hybrid_attr *pmu_attr =
+ container_of(attr, struct perf_pmu_format_hybrid_attr, attr.attr);
+ int cpu = hybrid_find_supported_cpu(pmu);
+
+ return (cpu >= 0) && (pmu->cpu_type & pmu_attr->pmu_type) ? attr->mode : 0;
+}
+
+static struct attribute_group hybrid_group_events_td = {
+ .name = "events",
+ .is_visible = hybrid_events_is_visible,
+};
+
+static struct attribute_group hybrid_group_events_mem = {
+ .name = "events",
+ .is_visible = hybrid_events_is_visible,
+};
+
+static struct attribute_group hybrid_group_events_tsx = {
+ .name = "events",
+ .is_visible = hybrid_tsx_is_visible,
+};
+
+static struct attribute_group hybrid_group_format_extra = {
+ .name = "format",
+ .is_visible = hybrid_format_is_visible,
+};
+
+static ssize_t intel_hybrid_get_attr_cpus(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct x86_hybrid_pmu *pmu =
+ container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
+
+ return cpumap_print_to_pagebuf(true, buf, &pmu->supported_cpus);
+}
+
+static DEVICE_ATTR(cpus, S_IRUGO, intel_hybrid_get_attr_cpus, NULL);
+static struct attribute *intel_hybrid_cpus_attrs[] = {
+ &dev_attr_cpus.attr,
+ NULL,
+};
+
+static struct attribute_group hybrid_group_cpus = {
+ .attrs = intel_hybrid_cpus_attrs,
+};
+
+static const struct attribute_group *hybrid_attr_update[] = {
+ &hybrid_group_events_td,
+ &hybrid_group_events_mem,
+ &hybrid_group_events_tsx,
+ &group_caps_gen,
+ &group_caps_lbr,
+ &hybrid_group_format_extra,
+ &group_default,
+ &hybrid_group_cpus,
+ NULL,
+};
+
static struct attribute *empty_attrs;
+static void intel_pmu_check_num_counters(int *num_counters,
+ int *num_counters_fixed,
+ u64 *intel_ctrl, u64 fixed_mask)
+{
+ if (*num_counters > INTEL_PMC_MAX_GENERIC) {
+ WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
+ *num_counters, INTEL_PMC_MAX_GENERIC);
+ *num_counters = INTEL_PMC_MAX_GENERIC;
+ }
+ *intel_ctrl = (1ULL << *num_counters) - 1;
+
+ if (*num_counters_fixed > INTEL_PMC_MAX_FIXED) {
+ WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
+ *num_counters_fixed, INTEL_PMC_MAX_FIXED);
+ *num_counters_fixed = INTEL_PMC_MAX_FIXED;
+ }
+
+ *intel_ctrl |= fixed_mask << INTEL_PMC_IDX_FIXED;
+}
+
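To make the mask construction above concrete, a standalone sketch with the limits and CPUID-derived values redefined locally for illustration:

#include <stdio.h>
#include <stdint.h>

#define INTEL_PMC_MAX_GENERIC 32	/* local stand-in for the kernel limit */
#define INTEL_PMC_IDX_FIXED   32

int main(void)
{
	int num_counters = 6;		/* assumed generic counter count from CPUID */
	uint64_t fixed_mask = 0x7;	/* assumed fixed-counter bitmap from CPUID */
	uint64_t intel_ctrl;

	if (num_counters > INTEL_PMC_MAX_GENERIC)
		num_counters = INTEL_PMC_MAX_GENERIC;	/* same clipping as the helper above */

	intel_ctrl = (1ULL << num_counters) - 1;	 /* bits 0-5: generic counters */
	intel_ctrl |= fixed_mask << INTEL_PMC_IDX_FIXED; /* bits 32-34: fixed counters */

	printf("intel_ctrl = %#llx\n", (unsigned long long)intel_ctrl);	/* 0x70000003f */
	return 0;
}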
+static void intel_pmu_check_event_constraints(struct event_constraint *event_constraints,
+ int num_counters,
+ int num_counters_fixed,
+ u64 intel_ctrl)
+{
+ struct event_constraint *c;
+
+ if (!event_constraints)
+ return;
+
+ /*
+ * event on fixed counter2 (REF_CYCLES) only works on this
+ * counter, so do not extend mask to generic counters
+ */
+ for_each_event_constraint(c, event_constraints) {
+ /*
+ * Don't extend the topdown slots and metrics
+ * events to the generic counters.
+ */
+ if (c->idxmsk64 & INTEL_PMC_MSK_TOPDOWN) {
+ /*
+ * Disable topdown slots and metrics events,
+ * if slots event is not in CPUID.
+ */
+ if (!(INTEL_PMC_MSK_FIXED_SLOTS & intel_ctrl))
+ c->idxmsk64 = 0;
+ c->weight = hweight64(c->idxmsk64);
+ continue;
+ }
+
+ if (c->cmask == FIXED_EVENT_FLAGS) {
+ /* Disable fixed counters which are not in CPUID */
+ c->idxmsk64 &= intel_ctrl;
+
+ if (c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES)
+ c->idxmsk64 |= (1ULL << num_counters) - 1;
+ }
+ c->idxmsk64 &=
+ ~(~0ULL << (INTEL_PMC_IDX_FIXED + num_counters_fixed));
+ c->weight = hweight64(c->idxmsk64);
+ }
+}
+
+static void intel_pmu_check_extra_regs(struct extra_reg *extra_regs)
+{
+ struct extra_reg *er;
+
+ /*
+ * Accessing an extra MSR may cause #GP under certain circumstances,
+ * e.g. KVM doesn't support the offcore events.
+ * Check all extra_regs here.
+ */
+ if (!extra_regs)
+ return;
+
+ for (er = extra_regs; er->msr; er++) {
+ er->extra_msr_access = check_msr(er->msr, 0x11UL);
+ /* Disable LBR select mapping */
+ if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
+ x86_pmu.lbr_sel_map = NULL;
+ }
+}
+
+static void intel_pmu_check_hybrid_pmus(u64 fixed_mask)
+{
+ struct x86_hybrid_pmu *pmu;
+ int i;
+
+ for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
+ pmu = &x86_pmu.hybrid_pmu[i];
+
+ intel_pmu_check_num_counters(&pmu->num_counters,
+ &pmu->num_counters_fixed,
+ &pmu->intel_ctrl,
+ fixed_mask);
+
+ if (pmu->intel_cap.perf_metrics) {
+ pmu->intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS;
+ pmu->intel_ctrl |= INTEL_PMC_MSK_FIXED_SLOTS;
+ }
+
+ if (pmu->intel_cap.pebs_output_pt_available)
+ pmu->pmu.capabilities |= PERF_PMU_CAP_AUX_OUTPUT;
+
+ intel_pmu_check_event_constraints(pmu->event_constraints,
+ pmu->num_counters,
+ pmu->num_counters_fixed,
+ pmu->intel_ctrl);
+
+ intel_pmu_check_extra_regs(pmu->extra_regs);
+ }
+}
+
__init int intel_pmu_init(void)
{
struct attribute **extra_skl_attr = &empty_attrs;
@@ -5053,12 +5505,11 @@ __init int intel_pmu_init(void)
union cpuid10_edx edx;
union cpuid10_eax eax;
union cpuid10_ebx ebx;
- struct event_constraint *c;
unsigned int fixed_mask;
- struct extra_reg *er;
bool pmem = false;
int version, i;
char *name;
+ struct x86_hybrid_pmu *pmu;
if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
switch (boot_cpu_data.x86) {
@@ -5653,6 +6104,99 @@ __init int intel_pmu_init(void)
name = "sapphire_rapids";
break;
+ case INTEL_FAM6_ALDERLAKE:
+ case INTEL_FAM6_ALDERLAKE_L:
+ /*
+ * Alder Lake has two CPU types: Core and Atom.
+ *
+ * Initialize the common PerfMon capabilities here.
+ */
+ x86_pmu.hybrid_pmu = kcalloc(X86_HYBRID_NUM_PMUS,
+ sizeof(struct x86_hybrid_pmu),
+ GFP_KERNEL);
+ if (!x86_pmu.hybrid_pmu)
+ return -ENOMEM;
+ static_branch_enable(&perf_is_hybrid);
+ x86_pmu.num_hybrid_pmus = X86_HYBRID_NUM_PMUS;
+
+ x86_pmu.late_ack = true;
+ x86_pmu.pebs_aliases = NULL;
+ x86_pmu.pebs_prec_dist = true;
+ x86_pmu.pebs_block = true;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+ x86_pmu.flags |= PMU_FL_PEBS_ALL;
+ x86_pmu.flags |= PMU_FL_INSTR_LATENCY;
+ x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX;
+ x86_pmu.lbr_pt_coexist = true;
+ intel_pmu_pebs_data_source_skl(false);
+ x86_pmu.num_topdown_events = 8;
+ x86_pmu.update_topdown_event = adl_update_topdown_event;
+ x86_pmu.set_topdown_event_period = adl_set_topdown_event_period;
+
+ x86_pmu.filter_match = intel_pmu_filter_match;
+ x86_pmu.get_event_constraints = adl_get_event_constraints;
+ x86_pmu.hw_config = adl_hw_config;
+ x86_pmu.limit_period = spr_limit_period;
+ x86_pmu.get_hybrid_cpu_type = adl_get_hybrid_cpu_type;
+ /*
+ * The rtm_abort_event is used to check whether to enable GPRs
+ * for the RTM abort event. Atom doesn't have the RTM abort
+ * event. There is no harm in setting it in the common
+ * x86_pmu.rtm_abort_event.
+ */
+ x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04);
+
+ td_attr = adl_hybrid_events_attrs;
+ mem_attr = adl_hybrid_mem_attrs;
+ tsx_attr = adl_hybrid_tsx_attrs;
+ extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
+ adl_hybrid_extra_attr_rtm : adl_hybrid_extra_attr;
+
+ /* Initialize big core specific PerfMon capabilities.*/
+ pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX];
+ pmu->name = "cpu_core";
+ pmu->cpu_type = hybrid_big;
+ pmu->num_counters = x86_pmu.num_counters + 2;
+ pmu->num_counters_fixed = x86_pmu.num_counters_fixed + 1;
+ pmu->max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, pmu->num_counters);
+ pmu->unconstrained = (struct event_constraint)
+ __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1,
+ 0, pmu->num_counters, 0, 0);
+ pmu->intel_cap.capabilities = x86_pmu.intel_cap.capabilities;
+ pmu->intel_cap.perf_metrics = 1;
+ pmu->intel_cap.pebs_output_pt_available = 0;
+
+ memcpy(pmu->hw_cache_event_ids, spr_hw_cache_event_ids, sizeof(pmu->hw_cache_event_ids));
+ memcpy(pmu->hw_cache_extra_regs, spr_hw_cache_extra_regs, sizeof(pmu->hw_cache_extra_regs));
+ pmu->event_constraints = intel_spr_event_constraints;
+ pmu->pebs_constraints = intel_spr_pebs_event_constraints;
+ pmu->extra_regs = intel_spr_extra_regs;
+
+ /* Initialize Atom core specific PerfMon capabilities.*/
+ pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX];
+ pmu->name = "cpu_atom";
+ pmu->cpu_type = hybrid_small;
+ pmu->num_counters = x86_pmu.num_counters;
+ pmu->num_counters_fixed = x86_pmu.num_counters_fixed;
+ pmu->max_pebs_events = x86_pmu.max_pebs_events;
+ pmu->unconstrained = (struct event_constraint)
+ __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1,
+ 0, pmu->num_counters, 0, 0);
+ pmu->intel_cap.capabilities = x86_pmu.intel_cap.capabilities;
+ pmu->intel_cap.perf_metrics = 0;
+ pmu->intel_cap.pebs_output_pt_available = 1;
+
+ memcpy(pmu->hw_cache_event_ids, glp_hw_cache_event_ids, sizeof(pmu->hw_cache_event_ids));
+ memcpy(pmu->hw_cache_extra_regs, tnt_hw_cache_extra_regs, sizeof(pmu->hw_cache_extra_regs));
+ pmu->hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1;
+ pmu->event_constraints = intel_slm_event_constraints;
+ pmu->pebs_constraints = intel_grt_pebs_event_constraints;
+ pmu->extra_regs = intel_grt_extra_regs;
+ pr_cont("Alderlake Hybrid events, ");
+ name = "alderlake_hybrid";
+ break;
+
default:
switch (x86_pmu.version) {
case 1:
@@ -5673,68 +6217,36 @@ __init int intel_pmu_init(void)
snprintf(pmu_name_str, sizeof(pmu_name_str), "%s", name);
+ if (!is_hybrid()) {
+ group_events_td.attrs = td_attr;
+ group_events_mem.attrs = mem_attr;
+ group_events_tsx.attrs = tsx_attr;
+ group_format_extra.attrs = extra_attr;
+ group_format_extra_skl.attrs = extra_skl_attr;
- group_events_td.attrs = td_attr;
- group_events_mem.attrs = mem_attr;
- group_events_tsx.attrs = tsx_attr;
- group_format_extra.attrs = extra_attr;
- group_format_extra_skl.attrs = extra_skl_attr;
-
- x86_pmu.attr_update = attr_update;
-
- if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) {
- WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
- x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC);
- x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC;
- }
- x86_pmu.intel_ctrl = (1ULL << x86_pmu.num_counters) - 1;
+ x86_pmu.attr_update = attr_update;
+ } else {
+ hybrid_group_events_td.attrs = td_attr;
+ hybrid_group_events_mem.attrs = mem_attr;
+ hybrid_group_events_tsx.attrs = tsx_attr;
+ hybrid_group_format_extra.attrs = extra_attr;
- if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) {
- WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
- x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED);
- x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED;
+ x86_pmu.attr_update = hybrid_attr_update;
}
- x86_pmu.intel_ctrl |= (u64)fixed_mask << INTEL_PMC_IDX_FIXED;
+ intel_pmu_check_num_counters(&x86_pmu.num_counters,
+ &x86_pmu.num_counters_fixed,
+ &x86_pmu.intel_ctrl,
+ (u64)fixed_mask);
/* AnyThread may be deprecated on arch perfmon v5 or later */
if (x86_pmu.intel_cap.anythread_deprecated)
x86_pmu.format_attrs = intel_arch_formats_attr;
- if (x86_pmu.event_constraints) {
- /*
- * event on fixed counter2 (REF_CYCLES) only works on this
- * counter, so do not extend mask to generic counters
- */
- for_each_event_constraint(c, x86_pmu.event_constraints) {
- /*
- * Don't extend the topdown slots and metrics
- * events to the generic counters.
- */
- if (c->idxmsk64 & INTEL_PMC_MSK_TOPDOWN) {
- /*
- * Disable topdown slots and metrics events,
- * if slots event is not in CPUID.
- */
- if (!(INTEL_PMC_MSK_FIXED_SLOTS & x86_pmu.intel_ctrl))
- c->idxmsk64 = 0;
- c->weight = hweight64(c->idxmsk64);
- continue;
- }
-
- if (c->cmask == FIXED_EVENT_FLAGS) {
- /* Disabled fixed counters which are not in CPUID */
- c->idxmsk64 &= x86_pmu.intel_ctrl;
-
- if (c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES)
- c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
- }
- c->idxmsk64 &=
- ~(~0ULL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed));
- c->weight = hweight64(c->idxmsk64);
- }
- }
-
+ intel_pmu_check_event_constraints(x86_pmu.event_constraints,
+ x86_pmu.num_counters,
+ x86_pmu.num_counters_fixed,
+ x86_pmu.intel_ctrl);
/*
* Access LBR MSR may cause #GP under certain circumstances.
* E.g. KVM doesn't support LBR MSR
@@ -5752,19 +6264,7 @@ __init int intel_pmu_init(void)
if (x86_pmu.lbr_nr)
pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr);
- /*
- * Access extra MSR may cause #GP under certain circumstances.
- * E.g. KVM doesn't support offcore event
- * Check all extra_regs here.
- */
- if (x86_pmu.extra_regs) {
- for (er = x86_pmu.extra_regs; er->msr; er++) {
- er->extra_msr_access = check_msr(er->msr, 0x11UL);
- /* Disable LBR select mapping */
- if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
- x86_pmu.lbr_sel_map = NULL;
- }
- }
+ intel_pmu_check_extra_regs(x86_pmu.extra_regs);
/* Support full width counters using alternative MSR range */
if (x86_pmu.intel_cap.full_width_write) {
@@ -5773,9 +6273,12 @@ __init int intel_pmu_init(void)
pr_cont("full-width counters, ");
}
- if (x86_pmu.intel_cap.perf_metrics)
+ if (!is_hybrid() && x86_pmu.intel_cap.perf_metrics)
x86_pmu.intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS;
+ if (is_hybrid())
+ intel_pmu_check_hybrid_pmus((u64)fixed_mask);
+
return 0;
}
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index 407eee5f6f95..433399069e27 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -40,7 +40,7 @@
* Model specific counters:
* MSR_CORE_C1_RES: CORE C1 Residency Counter
* perf code: 0x00
- * Available model: SLM,AMT,GLM,CNL,TNT
+ * Available model: SLM,AMT,GLM,CNL,TNT,ADL
* Scope: Core (each processor core has a MSR)
* MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter
* perf code: 0x01
@@ -51,46 +51,49 @@
* perf code: 0x02
* Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
* SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL,
- * TNT,RKL
+ * TNT,RKL,ADL
* Scope: Core
* MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter
* perf code: 0x03
* Available model: SNB,IVB,HSW,BDW,SKL,CNL,KBL,CML,
- * ICL,TGL,RKL
+ * ICL,TGL,RKL,ADL
* Scope: Core
* MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter.
* perf code: 0x00
* Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL,
- * KBL,CML,ICL,TGL,TNT,RKL
+ * KBL,CML,ICL,TGL,TNT,RKL,ADL
* Scope: Package (physical package)
* MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter.
* perf code: 0x01
* Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL,
- * GLM,CNL,KBL,CML,ICL,TGL,TNT,RKL
+ * GLM,CNL,KBL,CML,ICL,TGL,TNT,RKL,
+ * ADL
* Scope: Package (physical package)
* MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter.
* perf code: 0x02
* Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
* SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL,
- * TNT,RKL
+ * TNT,RKL,ADL
* Scope: Package (physical package)
* MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter.
* perf code: 0x03
* Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,CNL,
- * KBL,CML,ICL,TGL,RKL
+ * KBL,CML,ICL,TGL,RKL,ADL
* Scope: Package (physical package)
* MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter.
* perf code: 0x04
- * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL
+ * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL,
+ * ADL
* Scope: Package (physical package)
* MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter.
* perf code: 0x05
- * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL
+ * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL,
+ * ADL
* Scope: Package (physical package)
* MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter.
* perf code: 0x06
* Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL,
- * TNT,RKL
+ * TNT,RKL,ADL
* Scope: Package (physical package)
*
*/
@@ -563,6 +566,20 @@ static const struct cstate_model icl_cstates __initconst = {
BIT(PERF_CSTATE_PKG_C10_RES),
};
+static const struct cstate_model adl_cstates __initconst = {
+ .core_events = BIT(PERF_CSTATE_CORE_C1_RES) |
+ BIT(PERF_CSTATE_CORE_C6_RES) |
+ BIT(PERF_CSTATE_CORE_C7_RES),
+
+ .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) |
+ BIT(PERF_CSTATE_PKG_C3_RES) |
+ BIT(PERF_CSTATE_PKG_C6_RES) |
+ BIT(PERF_CSTATE_PKG_C7_RES) |
+ BIT(PERF_CSTATE_PKG_C8_RES) |
+ BIT(PERF_CSTATE_PKG_C9_RES) |
+ BIT(PERF_CSTATE_PKG_C10_RES),
+};
+
static const struct cstate_model slm_cstates __initconst = {
.core_events = BIT(PERF_CSTATE_CORE_C1_RES) |
BIT(PERF_CSTATE_CORE_C6_RES),
@@ -650,6 +667,8 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &icl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &icl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &icl_cstates),
+ X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &adl_cstates),
+ X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &adl_cstates),
{ },
};
MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match);
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index d32b302719fe..1ec8fd311f38 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -779,6 +779,13 @@ struct event_constraint intel_glm_pebs_event_constraints[] = {
EVENT_CONSTRAINT_END
};
+struct event_constraint intel_grt_pebs_event_constraints[] = {
+ /* Allow all events as PEBS with no flags */
+ INTEL_PLD_CONSTRAINT(0x5d0, 0xf),
+ INTEL_PSD_CONSTRAINT(0x6d0, 0xf),
+ EVENT_CONSTRAINT_END
+};
+
struct event_constraint intel_nehalem_pebs_event_constraints[] = {
INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */
INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */
@@ -959,13 +966,14 @@ struct event_constraint intel_spr_pebs_event_constraints[] = {
struct event_constraint *intel_pebs_constraints(struct perf_event *event)
{
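+	/*
+	 * On hybrid CPUs the constraints come from the event's PMU;
+	 * otherwise this is x86_pmu.pebs_constraints.
+	 */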
+ struct event_constraint *pebs_constraints = hybrid(event->pmu, pebs_constraints);
struct event_constraint *c;
if (!event->attr.precise_ip)
return NULL;
- if (x86_pmu.pebs_constraints) {
- for_each_event_constraint(c, x86_pmu.pebs_constraints) {
+ if (pebs_constraints) {
+ for_each_event_constraint(c, pebs_constraints) {
if (constraint_match(c, event->hw.config)) {
event->hw.flags |= c->flags;
return c;
@@ -1007,6 +1015,8 @@ void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in)
static inline void pebs_update_threshold(struct cpu_hw_events *cpuc)
{
struct debug_store *ds = cpuc->ds;
+ int max_pebs_events = hybrid(cpuc->pmu, max_pebs_events);
+ int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed);
u64 threshold;
int reserved;
@@ -1014,9 +1024,9 @@ static inline void pebs_update_threshold(struct cpu_hw_events *cpuc)
return;
if (x86_pmu.flags & PMU_FL_PEBS_ALL)
- reserved = x86_pmu.max_pebs_events + x86_pmu.num_counters_fixed;
+ reserved = max_pebs_events + num_counters_fixed;
else
- reserved = x86_pmu.max_pebs_events;
+ reserved = max_pebs_events;
if (cpuc->n_pebs == cpuc->n_large_pebs) {
threshold = ds->pebs_absolute_maximum -
@@ -1353,14 +1363,13 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
is_64bit = kernel_ip(to) || any_64bit_mode(regs);
#endif
insn_init(&insn, kaddr, size, is_64bit);
- insn_get_length(&insn);
+
/*
- * Make sure there was not a problem decoding the
- * instruction and getting the length. This is
- * doubly important because we have an infinite
- * loop if insn.length=0.
+ * Make sure there was not a problem decoding the instruction.
+ * This is doubly important because we have an infinite loop if
+ * insn.length=0.
*/
- if (!insn.length)
+ if (insn_get_length(&insn))
break;
to += insn.length;
@@ -1805,7 +1814,7 @@ intel_pmu_save_and_restart_reload(struct perf_event *event, int count)
*
* [-period, 0]
*
- * the difference between two consequtive reads is:
+ * the difference between two consecutive reads is:
*
* A) value2 - value1;
* when no overflows have happened in between,
@@ -2072,6 +2081,8 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d
{
short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {};
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int max_pebs_events = hybrid(cpuc->pmu, max_pebs_events);
+ int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed);
struct debug_store *ds = cpuc->ds;
struct perf_event *event;
void *base, *at, *top;
@@ -2086,9 +2097,9 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_d
ds->pebs_index = ds->pebs_buffer_base;
- mask = ((1ULL << x86_pmu.max_pebs_events) - 1) |
- (((1ULL << x86_pmu.num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED);
- size = INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed;
+ mask = ((1ULL << max_pebs_events) - 1) |
+ (((1ULL << num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED);
+ size = INTEL_PMC_IDX_FIXED + num_counters_fixed;
if (unlikely(base >= top)) {
intel_pmu_pebs_event_update_no_drain(cpuc, size);
@@ -2192,7 +2203,7 @@ void __init intel_ds_init(void)
PERF_SAMPLE_TIME;
x86_pmu.flags |= PMU_FL_PEBS_ALL;
pebs_qual = "-baseline";
- x86_get_pmu()->capabilities |= PERF_PMU_CAP_EXTENDED_REGS;
+ x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_EXTENDED_REGS;
} else {
/* Only basic record supported */
x86_pmu.large_pebs_flags &=
@@ -2205,9 +2216,9 @@ void __init intel_ds_init(void)
}
pr_cont("PEBS fmt4%c%s, ", pebs_type, pebs_qual);
- if (x86_pmu.intel_cap.pebs_output_pt_available) {
+ if (!is_hybrid() && x86_pmu.intel_cap.pebs_output_pt_available) {
pr_cont("PEBS-via-PT, ");
- x86_get_pmu()->capabilities |= PERF_PMU_CAP_AUX_OUTPUT;
+ x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_AUX_OUTPUT;
}
break;
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 21890dacfcfe..76dbab6ac9fb 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -705,7 +705,7 @@ void intel_pmu_lbr_add(struct perf_event *event)
void release_lbr_buffers(void)
{
- struct kmem_cache *kmem_cache = x86_get_pmu()->task_ctx_cache;
+ struct kmem_cache *kmem_cache;
struct cpu_hw_events *cpuc;
int cpu;
@@ -714,6 +714,7 @@ void release_lbr_buffers(void)
for_each_possible_cpu(cpu) {
cpuc = per_cpu_ptr(&cpu_hw_events, cpu);
+ kmem_cache = x86_get_pmu(cpu)->task_ctx_cache;
if (kmem_cache && cpuc->lbr_xsave) {
kmem_cache_free(kmem_cache, cpuc->lbr_xsave);
cpuc->lbr_xsave = NULL;
@@ -1198,7 +1199,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
/*
* The LBR logs any address in the IP, even if the IP just
* faulted. This means userspace can control the from address.
- * Ensure we don't blindy read any address by validating it is
+ * Ensure we don't blindly read any address by validating it is
* a known text address.
*/
if (kernel_text_address(from)) {
@@ -1224,8 +1225,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
is64 = kernel_ip((unsigned long)addr) || any_64bit_mode(current_pt_regs());
#endif
insn_init(&insn, addr, bytes_read, is64);
- insn_get_opcode(&insn);
- if (!insn.opcode.got)
+ if (insn_get_opcode(&insn))
return X86_BR_ABORT;
switch (insn.opcode.bytes[0]) {
@@ -1262,8 +1262,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
ret = X86_BR_INT;
break;
case 0xe8: /* call near rel */
- insn_get_immediate(&insn);
- if (insn.immediate1.value == 0) {
+ if (insn_get_immediate(&insn) || insn.immediate1.value == 0) {
/* zero length call */
ret = X86_BR_ZERO_CALL;
break;
@@ -1279,7 +1278,9 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
ret = X86_BR_JMP;
break;
case 0xff: /* call near absolute, call far absolute ind */
- insn_get_modrm(&insn);
+ if (insn_get_modrm(&insn))
+ return X86_BR_ABORT;
+
ext = (insn.modrm.bytes[0] >> 3) & 0x7;
switch (ext) {
case 2: /* near ind call */
@@ -1609,7 +1610,7 @@ void intel_pmu_lbr_init_hsw(void)
x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
x86_pmu.lbr_sel_map = hsw_lbr_sel_map;
- x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0);
+ x86_get_pmu(smp_processor_id())->task_ctx_cache = create_lbr_kmem_cache(size, 0);
if (lbr_from_signext_quirk_needed())
static_branch_enable(&lbr_from_quirk_key);
@@ -1629,7 +1630,7 @@ __init void intel_pmu_lbr_init_skl(void)
x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
x86_pmu.lbr_sel_map = hsw_lbr_sel_map;
- x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0);
+ x86_get_pmu(smp_processor_id())->task_ctx_cache = create_lbr_kmem_cache(size, 0);
/*
* SW branch filter usage:
@@ -1726,7 +1727,7 @@ static bool is_arch_lbr_xsave_available(void)
void __init intel_pmu_arch_lbr_init(void)
{
- struct pmu *pmu = x86_get_pmu();
+ struct pmu *pmu = x86_get_pmu(smp_processor_id());
union cpuid28_eax eax;
union cpuid28_ebx ebx;
union cpuid28_ecx ecx;
diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c
index a4cc66005ce8..7951a5dc73b6 100644
--- a/arch/x86/events/intel/p4.c
+++ b/arch/x86/events/intel/p4.c
@@ -24,7 +24,7 @@ struct p4_event_bind {
unsigned int escr_msr[2]; /* ESCR MSR for this event */
unsigned int escr_emask; /* valid ESCR EventMask bits */
unsigned int shared; /* event is shared across threads */
- char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */
+ char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on absence */
};
struct p4_pebs_bind {
@@ -45,7 +45,7 @@ struct p4_pebs_bind {
* it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of
* event configuration to find out which values are to be
* written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT
- * resgisters
+ * registers
*/
static struct p4_pebs_bind p4_pebs_bind_map[] = {
P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired, 0x0000001, 0x0000001),
@@ -947,7 +947,7 @@ static void p4_pmu_enable_pebs(u64 config)
(void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert);
}
-static void p4_pmu_enable_event(struct perf_event *event)
+static void __p4_pmu_enable_event(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
int thread = p4_ht_config_thread(hwc->config);
@@ -983,6 +983,16 @@ static void p4_pmu_enable_event(struct perf_event *event)
(cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
}
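+/*
+ * Per-CPU bitmap of counters that have been enabled at least once.  The
+ * NMI handler uses it to acknowledge in-flight IRQs for counters that are
+ * no longer in cpuc->active_mask.
+ */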
+static DEFINE_PER_CPU(unsigned long [BITS_TO_LONGS(X86_PMC_IDX_MAX)], p4_running);
+
+static void p4_pmu_enable_event(struct perf_event *event)
+{
+ int idx = event->hw.idx;
+
+ __set_bit(idx, per_cpu(p4_running, smp_processor_id()));
+ __p4_pmu_enable_event(event);
+}
+
static void p4_pmu_enable_all(int added)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -992,7 +1002,7 @@ static void p4_pmu_enable_all(int added)
struct perf_event *event = cpuc->events[idx];
if (!test_bit(idx, cpuc->active_mask))
continue;
- p4_pmu_enable_event(event);
+ __p4_pmu_enable_event(event);
}
}
@@ -1012,7 +1022,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
if (!test_bit(idx, cpuc->active_mask)) {
/* catch in-flight IRQs */
- if (__test_and_clear_bit(idx, cpuc->running))
+ if (__test_and_clear_bit(idx, per_cpu(p4_running, smp_processor_id())))
handled++;
continue;
}
@@ -1313,7 +1323,7 @@ static __initconst const struct x86_pmu p4_pmu = {
.get_event_constraints = x86_get_event_constraints,
/*
* IF HT disabled we may need to use all
- * ARCH_P4_MAX_CCCR counters simulaneously
+ * ARCH_P4_MAX_CCCR counters simultaneously
* though leave it restricted at moment assuming
* HT is on
*/
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index e94af4a54d0d..915847655c06 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -362,7 +362,7 @@ static bool pt_event_valid(struct perf_event *event)
/*
* Setting bit 0 (TraceEn in RTIT_CTL MSR) in the attr.config
- * clears the assomption that BranchEn must always be enabled,
+ * clears the assumption that BranchEn must always be enabled,
* as was the case with the first implementation of PT.
* If this bit is not set, the legacy behavior is preserved
* for compatibility with the older userspace.
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 33c8180d5a87..df7b07d7fdcb 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -4,8 +4,13 @@
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
#include "uncore.h"
+#include "uncore_discovery.h"
-static struct intel_uncore_type *empty_uncore[] = { NULL, };
+static bool uncore_no_discover;
+module_param(uncore_no_discover, bool, 0);
+MODULE_PARM_DESC(uncore_no_discover, "Don't enable the Intel uncore PerfMon discovery mechanism "
+ "(default: enable the discovery mechanism).");
+struct intel_uncore_type *empty_uncore[] = { NULL, };
struct intel_uncore_type **uncore_msr_uncores = empty_uncore;
struct intel_uncore_type **uncore_pci_uncores = empty_uncore;
struct intel_uncore_type **uncore_mmio_uncores = empty_uncore;
@@ -48,6 +53,18 @@ int uncore_pcibus_to_dieid(struct pci_bus *bus)
return die_id;
}
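+/*
+ * Return the PCI segment (domain) of the given die, or -EINVAL if no
+ * bus maps to it.
+ */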
+int uncore_die_to_segment(int die)
+{
+ struct pci_bus *bus = NULL;
+
+	/* Find the first PCI bus attributed to the specified die. */
+ while ((bus = pci_find_next_bus(bus)) &&
+ (die != uncore_pcibus_to_dieid(bus)))
+ ;
+
+ return bus ? pci_domain_nr(bus) : -EINVAL;
+}
+
static void uncore_free_pcibus_map(void)
{
struct pci2phy_map *map, *tmp;
@@ -829,6 +846,34 @@ static const struct attribute_group uncore_pmu_attr_group = {
.attrs = uncore_pmu_attrs,
};
+static void uncore_get_pmu_name(struct intel_uncore_pmu *pmu)
+{
+ struct intel_uncore_type *type = pmu->type;
+
+	/*
+	 * No uncore block name in the discovery table.
+	 * Use "uncore_type_<type id>_<box id>" as the name.
+	 */
+ if (!type->name) {
+ if (type->num_boxes == 1)
+ sprintf(pmu->name, "uncore_type_%u", type->type_id);
+ else {
+ sprintf(pmu->name, "uncore_type_%u_%d",
+ type->type_id, type->box_ids[pmu->pmu_idx]);
+ }
+ return;
+ }
+
+ if (type->num_boxes == 1) {
+ if (strlen(type->name) > 0)
+ sprintf(pmu->name, "uncore_%s", type->name);
+ else
+ sprintf(pmu->name, "uncore");
+	} else {
+		sprintf(pmu->name, "uncore_%s_%d", type->name, pmu->pmu_idx);
+	}
+}
+
static int uncore_pmu_register(struct intel_uncore_pmu *pmu)
{
int ret;
@@ -855,15 +900,7 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu)
pmu->pmu.attr_update = pmu->type->attr_update;
}
- if (pmu->type->num_boxes == 1) {
- if (strlen(pmu->type->name) > 0)
- sprintf(pmu->name, "uncore_%s", pmu->type->name);
- else
- sprintf(pmu->name, "uncore");
- } else {
- sprintf(pmu->name, "uncore_%s_%d", pmu->type->name,
- pmu->pmu_idx);
- }
+ uncore_get_pmu_name(pmu);
ret = perf_pmu_register(&pmu->pmu, pmu->name, -1);
if (!ret)
@@ -904,6 +941,10 @@ static void uncore_type_exit(struct intel_uncore_type *type)
kfree(type->pmus);
type->pmus = NULL;
}
+ if (type->box_ids) {
+ kfree(type->box_ids);
+ type->box_ids = NULL;
+ }
kfree(type->events_group);
type->events_group = NULL;
}
@@ -1003,10 +1044,37 @@ static int uncore_pci_get_dev_die_info(struct pci_dev *pdev, int *die)
return 0;
}
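+/*
+ * Match a PCI device against the types discovered from the PerfMon
+ * discovery table, using the segment/bus/devfn encoded in each unit
+ * control address.
+ */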
+static struct intel_uncore_pmu *
+uncore_pci_find_dev_pmu_from_types(struct pci_dev *pdev)
+{
+ struct intel_uncore_type **types = uncore_pci_uncores;
+ struct intel_uncore_type *type;
+ u64 box_ctl;
+ int i, die;
+
+ for (; *types; types++) {
+ type = *types;
+ for (die = 0; die < __uncore_max_dies; die++) {
+ for (i = 0; i < type->num_boxes; i++) {
+ if (!type->box_ctls[die])
+ continue;
+ box_ctl = type->box_ctls[die] + type->pci_offsets[i];
+ if (pdev->devfn == UNCORE_DISCOVERY_PCI_DEVFN(box_ctl) &&
+ pdev->bus->number == UNCORE_DISCOVERY_PCI_BUS(box_ctl) &&
+ pci_domain_nr(pdev->bus) == UNCORE_DISCOVERY_PCI_DOMAIN(box_ctl))
+ return &type->pmus[i];
+ }
+ }
+ }
+
+ return NULL;
+}
+
/*
* Find the PMU of a PCI device.
* @pdev: The PCI device.
* @ids: The ID table of the available PCI devices with a PMU.
+ * If NULL, search the whole uncore_pci_uncores.
*/
static struct intel_uncore_pmu *
uncore_pci_find_dev_pmu(struct pci_dev *pdev, const struct pci_device_id *ids)
@@ -1016,6 +1084,9 @@ uncore_pci_find_dev_pmu(struct pci_dev *pdev, const struct pci_device_id *ids)
kernel_ulong_t data;
unsigned int devfn;
+ if (!ids)
+ return uncore_pci_find_dev_pmu_from_types(pdev);
+
while (ids && ids->vendor) {
if ((ids->vendor == pdev->vendor) &&
(ids->device == pdev->device)) {
@@ -1174,7 +1245,8 @@ static void uncore_pci_remove(struct pci_dev *pdev)
}
static int uncore_bus_notify(struct notifier_block *nb,
- unsigned long action, void *data)
+ unsigned long action, void *data,
+ const struct pci_device_id *ids)
{
struct device *dev = data;
struct pci_dev *pdev = to_pci_dev(dev);
@@ -1185,7 +1257,7 @@ static int uncore_bus_notify(struct notifier_block *nb,
if (action != BUS_NOTIFY_DEL_DEVICE)
return NOTIFY_DONE;
- pmu = uncore_pci_find_dev_pmu(pdev, uncore_pci_sub_driver->id_table);
+ pmu = uncore_pci_find_dev_pmu(pdev, ids);
if (!pmu)
return NOTIFY_DONE;
@@ -1197,8 +1269,15 @@ static int uncore_bus_notify(struct notifier_block *nb,
return NOTIFY_OK;
}
-static struct notifier_block uncore_notifier = {
- .notifier_call = uncore_bus_notify,
+static int uncore_pci_sub_bus_notify(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ return uncore_bus_notify(nb, action, data,
+ uncore_pci_sub_driver->id_table);
+}
+
+static struct notifier_block uncore_pci_sub_notifier = {
+ .notifier_call = uncore_pci_sub_bus_notify,
};
static void uncore_pci_sub_driver_init(void)
@@ -1239,13 +1318,55 @@ static void uncore_pci_sub_driver_init(void)
ids++;
}
- if (notify && bus_register_notifier(&pci_bus_type, &uncore_notifier))
+ if (notify && bus_register_notifier(&pci_bus_type, &uncore_pci_sub_notifier))
notify = false;
if (!notify)
uncore_pci_sub_driver = NULL;
}
+static int uncore_pci_bus_notify(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ return uncore_bus_notify(nb, action, data, NULL);
+}
+
+static struct notifier_block uncore_pci_notifier = {
+ .notifier_call = uncore_pci_bus_notify,
+};
+
+
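+/*
+ * Register the PMUs of the PCI uncore units found via the discovery table
+ * directly (no platform-specific PCI driver is available), and watch the
+ * PCI bus for device removal.
+ */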
+static void uncore_pci_pmus_register(void)
+{
+ struct intel_uncore_type **types = uncore_pci_uncores;
+ struct intel_uncore_type *type;
+ struct intel_uncore_pmu *pmu;
+ struct pci_dev *pdev;
+ u64 box_ctl;
+ int i, die;
+
+ for (; *types; types++) {
+ type = *types;
+ for (die = 0; die < __uncore_max_dies; die++) {
+ for (i = 0; i < type->num_boxes; i++) {
+ if (!type->box_ctls[die])
+ continue;
+ box_ctl = type->box_ctls[die] + type->pci_offsets[i];
+ pdev = pci_get_domain_bus_and_slot(UNCORE_DISCOVERY_PCI_DOMAIN(box_ctl),
+ UNCORE_DISCOVERY_PCI_BUS(box_ctl),
+ UNCORE_DISCOVERY_PCI_DEVFN(box_ctl));
+ if (!pdev)
+ continue;
+ pmu = &type->pmus[i];
+
+ uncore_pci_pmu_register(pdev, type, pmu, die);
+ }
+ }
+ }
+
+ bus_register_notifier(&pci_bus_type, &uncore_pci_notifier);
+}
+
static int __init uncore_pci_init(void)
{
size_t size;
@@ -1262,12 +1383,15 @@ static int __init uncore_pci_init(void)
if (ret)
goto errtype;
- uncore_pci_driver->probe = uncore_pci_probe;
- uncore_pci_driver->remove = uncore_pci_remove;
+ if (uncore_pci_driver) {
+ uncore_pci_driver->probe = uncore_pci_probe;
+ uncore_pci_driver->remove = uncore_pci_remove;
- ret = pci_register_driver(uncore_pci_driver);
- if (ret)
- goto errtype;
+ ret = pci_register_driver(uncore_pci_driver);
+ if (ret)
+ goto errtype;
+ } else
+ uncore_pci_pmus_register();
if (uncore_pci_sub_driver)
uncore_pci_sub_driver_init();
@@ -1290,8 +1414,11 @@ static void uncore_pci_exit(void)
if (pcidrv_registered) {
pcidrv_registered = false;
if (uncore_pci_sub_driver)
- bus_unregister_notifier(&pci_bus_type, &uncore_notifier);
- pci_unregister_driver(uncore_pci_driver);
+ bus_unregister_notifier(&pci_bus_type, &uncore_pci_sub_notifier);
+ if (uncore_pci_driver)
+ pci_unregister_driver(uncore_pci_driver);
+ else
+ bus_unregister_notifier(&pci_bus_type, &uncore_pci_notifier);
uncore_types_exit(uncore_pci_uncores);
kfree(uncore_extra_pci_dev);
uncore_free_pcibus_map();
@@ -1625,6 +1752,11 @@ static const struct intel_uncore_init_fun rkl_uncore_init __initconst = {
.pci_init = skl_uncore_pci_init,
};
+static const struct intel_uncore_init_fun adl_uncore_init __initconst = {
+ .cpu_init = adl_uncore_cpu_init,
+ .mmio_init = tgl_uncore_mmio_init,
+};
+
static const struct intel_uncore_init_fun icx_uncore_init __initconst = {
.cpu_init = icx_uncore_cpu_init,
.pci_init = icx_uncore_pci_init,
@@ -1637,6 +1769,12 @@ static const struct intel_uncore_init_fun snr_uncore_init __initconst = {
.mmio_init = snr_uncore_mmio_init,
};
+static const struct intel_uncore_init_fun generic_uncore_init __initconst = {
+ .cpu_init = intel_uncore_generic_uncore_cpu_init,
+ .pci_init = intel_uncore_generic_uncore_pci_init,
+ .mmio_init = intel_uncore_generic_uncore_mmio_init,
+};
+
static const struct x86_cpu_id intel_uncore_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EP, &nhm_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(NEHALEM, &nhm_uncore_init),
@@ -1673,6 +1811,8 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &tgl_l_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &tgl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &rkl_uncore_init),
+ X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &adl_uncore_init),
+ X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &adl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init),
{},
};
@@ -1684,17 +1824,21 @@ static int __init intel_uncore_init(void)
struct intel_uncore_init_fun *uncore_init;
int pret = 0, cret = 0, mret = 0, ret;
- id = x86_match_cpu(intel_uncore_match);
- if (!id)
- return -ENODEV;
-
if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
return -ENODEV;
__uncore_max_dies =
topology_max_packages() * topology_max_die_per_package();
- uncore_init = (struct intel_uncore_init_fun *)id->driver_data;
+ id = x86_match_cpu(intel_uncore_match);
+ if (!id) {
+ if (!uncore_no_discover && intel_uncore_has_discovery_tables())
+ uncore_init = (struct intel_uncore_init_fun *)&generic_uncore_init;
+ else
+ return -ENODEV;
+ } else
+ uncore_init = (struct intel_uncore_init_fun *)id->driver_data;
+
if (uncore_init->pci_init) {
pret = uncore_init->pci_init();
if (!pret)
@@ -1711,8 +1855,10 @@ static int __init intel_uncore_init(void)
mret = uncore_mmio_init();
}
- if (cret && pret && mret)
- return -ENODEV;
+ if (cret && pret && mret) {
+ ret = -ENODEV;
+ goto free_discovery;
+ }
/* Install hotplug callbacks to setup the targets for each package */
ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_UNCORE_ONLINE,
@@ -1727,6 +1873,8 @@ err:
uncore_types_exit(uncore_msr_uncores);
uncore_types_exit(uncore_mmio_uncores);
uncore_pci_exit();
+free_discovery:
+ intel_uncore_clear_discovery_tables();
return ret;
}
module_init(intel_uncore_init);
@@ -1737,5 +1885,6 @@ static void __exit intel_uncore_exit(void)
uncore_types_exit(uncore_msr_uncores);
uncore_types_exit(uncore_mmio_uncores);
uncore_pci_exit();
+ intel_uncore_clear_discovery_tables();
}
module_exit(intel_uncore_exit);
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index a3c6e1643ad2..291791002997 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -42,6 +42,7 @@ struct intel_uncore_pmu;
struct intel_uncore_box;
struct uncore_event_desc;
struct freerunning_counters;
+struct intel_uncore_topology;
struct intel_uncore_type {
const char *name;
@@ -50,6 +51,7 @@ struct intel_uncore_type {
int perf_ctr_bits;
int fixed_ctr_bits;
int num_freerunning_types;
+ int type_id;
unsigned perf_ctr;
unsigned event_ctl;
unsigned event_mask;
@@ -57,6 +59,7 @@ struct intel_uncore_type {
unsigned fixed_ctr;
unsigned fixed_ctl;
unsigned box_ctl;
+ u64 *box_ctls; /* Unit ctrl addr of the first box of each die */
union {
unsigned msr_offset;
unsigned mmio_offset;
@@ -65,7 +68,12 @@ struct intel_uncore_type {
unsigned num_shared_regs:8;
unsigned single_fixed:1;
unsigned pair_ctr_ctl:1;
- unsigned *msr_offsets;
+ union {
+ unsigned *msr_offsets;
+ unsigned *pci_offsets;
+ unsigned *mmio_offsets;
+ };
+ unsigned *box_ids;
struct event_constraint unconstrainted;
struct event_constraint *constraints;
struct intel_uncore_pmu *pmus;
@@ -80,7 +88,7 @@ struct intel_uncore_type {
* to identify which platform component each PMON block of that type is
* supposed to monitor.
*/
- u64 *topology;
+ struct intel_uncore_topology *topology;
/*
* Optional callbacks for managing mapping of Uncore units to PMONs
*/
@@ -169,6 +177,11 @@ struct freerunning_counters {
unsigned *box_offsets;
};
+struct intel_uncore_topology {
+ u64 configuration;
+ int segment;
+};
+
struct pci2phy_map {
struct list_head list;
int segment;
@@ -177,6 +190,7 @@ struct pci2phy_map {
struct pci2phy_map *__find_pci2phy_map(int segment);
int uncore_pcibus_to_dieid(struct pci_bus *bus);
+int uncore_die_to_segment(int die);
ssize_t uncore_event_show(struct device *dev,
struct device_attribute *attr, char *buf);
@@ -547,6 +561,7 @@ uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event);
void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event);
u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx);
+extern struct intel_uncore_type *empty_uncore[];
extern struct intel_uncore_type **uncore_msr_uncores;
extern struct intel_uncore_type **uncore_pci_uncores;
extern struct intel_uncore_type **uncore_mmio_uncores;
@@ -567,6 +582,7 @@ void snb_uncore_cpu_init(void);
void nhm_uncore_cpu_init(void);
void skl_uncore_cpu_init(void);
void icl_uncore_cpu_init(void);
+void adl_uncore_cpu_init(void);
void tgl_uncore_cpu_init(void);
void tgl_uncore_mmio_init(void);
void tgl_l_uncore_mmio_init(void);
diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c
new file mode 100644
index 000000000000..aba9bff95413
--- /dev/null
+++ b/arch/x86/events/intel/uncore_discovery.c
@@ -0,0 +1,622 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Support Intel uncore PerfMon discovery mechanism.
+ * Copyright(c) 2021 Intel Corporation.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "uncore.h"
+#include "uncore_discovery.h"
+
+static struct rb_root discovery_tables = RB_ROOT;
+static int num_discovered_types[UNCORE_ACCESS_MAX];
+
+static bool has_generic_discovery_table(void)
+{
+ struct pci_dev *dev;
+ int dvsec;
+
+ dev = pci_get_device(PCI_VENDOR_ID_INTEL, UNCORE_DISCOVERY_TABLE_DEVICE, NULL);
+ if (!dev)
+ return false;
+
+	/* A discovery table device carries a unique extended capability ID. */
+ dvsec = pci_find_next_ext_capability(dev, 0, UNCORE_EXT_CAP_ID_DISCOVERY);
+ pci_dev_put(dev);
+ if (dvsec)
+ return true;
+
+ return false;
+}
+
+static int logical_die_id;
+
+static int get_device_die_id(struct pci_dev *dev)
+{
+ int cpu, node = pcibus_to_node(dev->bus);
+
+	/*
+	 * If NUMA information is not available, assume that logical die IDs
+	 * are assigned consecutively, in the order in which the discovery
+	 * table devices are detected.
+	 */
+ if (node < 0)
+ return logical_die_id++;
+
+ for_each_cpu(cpu, cpumask_of_node(node)) {
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ if (c->initialized && cpu_to_node(cpu) == node)
+ return c->logical_die_id;
+ }
+
+	/*
+	 * All CPUs of a node may be offlined. In that case, the PCI and
+	 * MMIO uncore blocks enumerated by this device are unavailable.
+	 */
+ return -1;
+}
+
+#define __node_2_type(cur) \
+ rb_entry((cur), struct intel_uncore_discovery_type, node)
+
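+/* rb_find() comparator: discovery types are keyed by their 16-bit type ID. */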
+static inline int __type_cmp(const void *key, const struct rb_node *b)
+{
+ struct intel_uncore_discovery_type *type_b = __node_2_type(b);
+ const u16 *type_id = key;
+
+ if (type_b->type > *type_id)
+ return -1;
+ else if (type_b->type < *type_id)
+ return 1;
+
+ return 0;
+}
+
+static inline struct intel_uncore_discovery_type *
+search_uncore_discovery_type(u16 type_id)
+{
+ struct rb_node *node = rb_find(&type_id, &discovery_tables, __type_cmp);
+
+ return (node) ? __node_2_type(node) : NULL;
+}
+
+static inline bool __type_less(struct rb_node *a, const struct rb_node *b)
+{
+ return (__node_2_type(a)->type < __node_2_type(b)->type);
+}
+
+static struct intel_uncore_discovery_type *
+add_uncore_discovery_type(struct uncore_unit_discovery *unit)
+{
+ struct intel_uncore_discovery_type *type;
+
+ if (unit->access_type >= UNCORE_ACCESS_MAX) {
+ pr_warn("Unsupported access type %d\n", unit->access_type);
+ return NULL;
+ }
+
+ type = kzalloc(sizeof(struct intel_uncore_discovery_type), GFP_KERNEL);
+ if (!type)
+ return NULL;
+
+ type->box_ctrl_die = kcalloc(__uncore_max_dies, sizeof(u64), GFP_KERNEL);
+ if (!type->box_ctrl_die)
+ goto free_type;
+
+ type->access_type = unit->access_type;
+ num_discovered_types[type->access_type]++;
+ type->type = unit->box_type;
+
+ rb_add(&type->node, &discovery_tables, __type_less);
+
+ return type;
+
+free_type:
+ kfree(type);
+
+ return NULL;
+}
+
+static struct intel_uncore_discovery_type *
+get_uncore_discovery_type(struct uncore_unit_discovery *unit)
+{
+ struct intel_uncore_discovery_type *type;
+
+ type = search_uncore_discovery_type(unit->box_type);
+ if (type)
+ return type;
+
+ return add_uncore_discovery_type(unit);
+}
+
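+/*
+ * Record a unit from the discovery table.  The first parsed table builds
+ * the per-type box ID and offset arrays; once a table has been parsed,
+ * further units only contribute the unit control address of the first
+ * box of their die.
+ */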
+static void
+uncore_insert_box_info(struct uncore_unit_discovery *unit,
+ int die, bool parsed)
+{
+ struct intel_uncore_discovery_type *type;
+ unsigned int *box_offset, *ids;
+ int i;
+
+ if (WARN_ON_ONCE(!unit->ctl || !unit->ctl_offset || !unit->ctr_offset))
+ return;
+
+ if (parsed) {
+ type = search_uncore_discovery_type(unit->box_type);
+ if (WARN_ON_ONCE(!type))
+ return;
+ /* Store the first box of each die */
+ if (!type->box_ctrl_die[die])
+ type->box_ctrl_die[die] = unit->ctl;
+ return;
+ }
+
+ type = get_uncore_discovery_type(unit);
+ if (!type)
+ return;
+
+ box_offset = kcalloc(type->num_boxes + 1, sizeof(unsigned int), GFP_KERNEL);
+ if (!box_offset)
+ return;
+
+ ids = kcalloc(type->num_boxes + 1, sizeof(unsigned int), GFP_KERNEL);
+ if (!ids)
+ goto free_box_offset;
+
+ /* Store generic information for the first box */
+ if (!type->num_boxes) {
+ type->box_ctrl = unit->ctl;
+ type->box_ctrl_die[die] = unit->ctl;
+ type->num_counters = unit->num_regs;
+ type->counter_width = unit->bit_width;
+ type->ctl_offset = unit->ctl_offset;
+ type->ctr_offset = unit->ctr_offset;
+ *ids = unit->box_id;
+ goto end;
+ }
+
+ for (i = 0; i < type->num_boxes; i++) {
+ ids[i] = type->ids[i];
+ box_offset[i] = type->box_offset[i];
+
+ if (WARN_ON_ONCE(unit->box_id == ids[i]))
+ goto free_ids;
+ }
+ ids[i] = unit->box_id;
+ box_offset[i] = unit->ctl - type->box_ctrl;
+ kfree(type->ids);
+ kfree(type->box_offset);
+end:
+ type->ids = ids;
+ type->box_offset = box_offset;
+ type->num_boxes++;
+ return;
+
+free_ids:
+ kfree(ids);
+
+free_box_offset:
+ kfree(box_offset);
+}
+
+static int parse_discovery_table(struct pci_dev *dev, int die,
+ u32 bar_offset, bool *parsed)
+{
+ struct uncore_global_discovery global;
+ struct uncore_unit_discovery unit;
+ void __iomem *io_addr;
+ resource_size_t addr;
+ unsigned long size;
+ u32 val;
+ int i;
+
+ pci_read_config_dword(dev, bar_offset, &val);
+
+ if (val & UNCORE_DISCOVERY_MASK)
+ return -EINVAL;
+
+ addr = (resource_size_t)(val & ~UNCORE_DISCOVERY_MASK);
+ size = UNCORE_DISCOVERY_GLOBAL_MAP_SIZE;
+ io_addr = ioremap(addr, size);
+ if (!io_addr)
+ return -ENOMEM;
+
+ /* Read Global Discovery State */
+ memcpy_fromio(&global, io_addr, sizeof(struct uncore_global_discovery));
+ if (uncore_discovery_invalid_unit(global)) {
+ pr_info("Invalid Global Discovery State: 0x%llx 0x%llx 0x%llx\n",
+ global.table1, global.ctl, global.table3);
+ iounmap(io_addr);
+ return -EINVAL;
+ }
+ iounmap(io_addr);
+
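+	/*
+	 * Remap the whole table: the global header plus max_units entries,
+	 * each global.stride 8-byte words long.
+	 */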
+ size = (1 + global.max_units) * global.stride * 8;
+ io_addr = ioremap(addr, size);
+ if (!io_addr)
+ return -ENOMEM;
+
+ /* Parsing Unit Discovery State */
+ for (i = 0; i < global.max_units; i++) {
+ memcpy_fromio(&unit, io_addr + (i + 1) * (global.stride * 8),
+ sizeof(struct uncore_unit_discovery));
+
+ if (uncore_discovery_invalid_unit(unit))
+ continue;
+
+ if (unit.access_type >= UNCORE_ACCESS_MAX)
+ continue;
+
+ uncore_insert_box_info(&unit, die, *parsed);
+ }
+
+ *parsed = true;
+ iounmap(io_addr);
+ return 0;
+}
+
+bool intel_uncore_has_discovery_tables(void)
+{
+ u32 device, val, entry_id, bar_offset;
+ int die, dvsec = 0, ret = true;
+ struct pci_dev *dev = NULL;
+ bool parsed = false;
+
+ if (has_generic_discovery_table())
+ device = UNCORE_DISCOVERY_TABLE_DEVICE;
+ else
+ device = PCI_ANY_ID;
+
+	/*
+	 * Start a new search and iterate through the list of
+	 * discovery table devices.
+	 */
+ while ((dev = pci_get_device(PCI_VENDOR_ID_INTEL, device, dev)) != NULL) {
+ while ((dvsec = pci_find_next_ext_capability(dev, dvsec, UNCORE_EXT_CAP_ID_DISCOVERY))) {
+ pci_read_config_dword(dev, dvsec + UNCORE_DISCOVERY_DVSEC_OFFSET, &val);
+ entry_id = val & UNCORE_DISCOVERY_DVSEC_ID_MASK;
+ if (entry_id != UNCORE_DISCOVERY_DVSEC_ID_PMON)
+ continue;
+
+ pci_read_config_dword(dev, dvsec + UNCORE_DISCOVERY_DVSEC2_OFFSET, &val);
+
+ if (val & ~UNCORE_DISCOVERY_DVSEC2_BIR_MASK) {
+ ret = false;
+ goto err;
+ }
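+			/*
+			 * The BIR field selects which BAR holds the discovery
+			 * table; convert it to that BAR's config-space offset.
+			 */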
+ bar_offset = UNCORE_DISCOVERY_BIR_BASE +
+ (val & UNCORE_DISCOVERY_DVSEC2_BIR_MASK) * UNCORE_DISCOVERY_BIR_STEP;
+
+ die = get_device_die_id(dev);
+ if (die < 0)
+ continue;
+
+ parse_discovery_table(dev, die, bar_offset, &parsed);
+ }
+ }
+
+ /* None of the discovery tables are available */
+ if (!parsed)
+ ret = false;
+err:
+ pci_dev_put(dev);
+
+ return ret;
+}
+
+void intel_uncore_clear_discovery_tables(void)
+{
+ struct intel_uncore_discovery_type *type, *next;
+
+ rbtree_postorder_for_each_entry_safe(type, next, &discovery_tables, node) {
+ kfree(type->box_ctrl_die);
+ kfree(type);
+ }
+}
+
+DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
+DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
+DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
+DEFINE_UNCORE_FORMAT_ATTR(thresh, thresh, "config:24-31");
+
+static struct attribute *generic_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_thresh.attr,
+ NULL,
+};
+
+static const struct attribute_group generic_uncore_format_group = {
+ .name = "format",
+ .attrs = generic_uncore_formats_attr,
+};
+
+static void intel_generic_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+ wrmsrl(uncore_msr_box_ctl(box), GENERIC_PMON_BOX_CTL_INT);
+}
+
+static void intel_generic_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+ wrmsrl(uncore_msr_box_ctl(box), GENERIC_PMON_BOX_CTL_FRZ);
+}
+
+static void intel_generic_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+ wrmsrl(uncore_msr_box_ctl(box), 0);
+}
+
+static void intel_generic_uncore_msr_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ wrmsrl(hwc->config_base, hwc->config);
+}
+
+static void intel_generic_uncore_msr_disable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ wrmsrl(hwc->config_base, 0);
+}
+
+static struct intel_uncore_ops generic_uncore_msr_ops = {
+ .init_box = intel_generic_uncore_msr_init_box,
+ .disable_box = intel_generic_uncore_msr_disable_box,
+ .enable_box = intel_generic_uncore_msr_enable_box,
+ .disable_event = intel_generic_uncore_msr_disable_event,
+ .enable_event = intel_generic_uncore_msr_enable_event,
+ .read_counter = uncore_msr_read_counter,
+};
+
+static void intel_generic_uncore_pci_init_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ int box_ctl = uncore_pci_box_ctl(box);
+
+ __set_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags);
+ pci_write_config_dword(pdev, box_ctl, GENERIC_PMON_BOX_CTL_INT);
+}
+
+static void intel_generic_uncore_pci_disable_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ int box_ctl = uncore_pci_box_ctl(box);
+
+ pci_write_config_dword(pdev, box_ctl, GENERIC_PMON_BOX_CTL_FRZ);
+}
+
+static void intel_generic_uncore_pci_enable_box(struct intel_uncore_box *box)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ int box_ctl = uncore_pci_box_ctl(box);
+
+ pci_write_config_dword(pdev, box_ctl, 0);
+}
+
+static void intel_generic_uncore_pci_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+
+ pci_write_config_dword(pdev, hwc->config_base, hwc->config);
+}
+
+static void intel_generic_uncore_pci_disable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+
+ pci_write_config_dword(pdev, hwc->config_base, 0);
+}
+
+static u64 intel_generic_uncore_pci_read_counter(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct pci_dev *pdev = box->pci_dev;
+ struct hw_perf_event *hwc = &event->hw;
+ u64 count = 0;
+
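+	/* Read the 64-bit counter as two 32-bit config reads, low dword first. */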
+ pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count);
+ pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1);
+
+ return count;
+}
+
+static struct intel_uncore_ops generic_uncore_pci_ops = {
+ .init_box = intel_generic_uncore_pci_init_box,
+ .disable_box = intel_generic_uncore_pci_disable_box,
+ .enable_box = intel_generic_uncore_pci_enable_box,
+ .disable_event = intel_generic_uncore_pci_disable_event,
+ .enable_event = intel_generic_uncore_pci_enable_event,
+ .read_counter = intel_generic_uncore_pci_read_counter,
+};
+
+#define UNCORE_GENERIC_MMIO_SIZE 0x4000
+
+static unsigned int generic_uncore_mmio_box_ctl(struct intel_uncore_box *box)
+{
+ struct intel_uncore_type *type = box->pmu->type;
+
+ if (!type->box_ctls || !type->box_ctls[box->dieid] || !type->mmio_offsets)
+ return 0;
+
+ return type->box_ctls[box->dieid] + type->mmio_offsets[box->pmu->pmu_idx];
+}
+
+static void intel_generic_uncore_mmio_init_box(struct intel_uncore_box *box)
+{
+ unsigned int box_ctl = generic_uncore_mmio_box_ctl(box);
+ struct intel_uncore_type *type = box->pmu->type;
+ resource_size_t addr;
+
+ if (!box_ctl) {
+ pr_warn("Uncore type %d box %d: Invalid box control address.\n",
+ type->type_id, type->box_ids[box->pmu->pmu_idx]);
+ return;
+ }
+
+ addr = box_ctl;
+ box->io_addr = ioremap(addr, UNCORE_GENERIC_MMIO_SIZE);
+ if (!box->io_addr) {
+ pr_warn("Uncore type %d box %d: ioremap error for 0x%llx.\n",
+ type->type_id, type->box_ids[box->pmu->pmu_idx],
+ (unsigned long long)addr);
+ return;
+ }
+
+ writel(GENERIC_PMON_BOX_CTL_INT, box->io_addr);
+}
+
+static void intel_generic_uncore_mmio_disable_box(struct intel_uncore_box *box)
+{
+ if (!box->io_addr)
+ return;
+
+ writel(GENERIC_PMON_BOX_CTL_FRZ, box->io_addr);
+}
+
+static void intel_generic_uncore_mmio_enable_box(struct intel_uncore_box *box)
+{
+ if (!box->io_addr)
+ return;
+
+ writel(0, box->io_addr);
+}
+
+static void intel_generic_uncore_mmio_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (!box->io_addr)
+ return;
+
+ writel(hwc->config, box->io_addr + hwc->config_base);
+}
+
+static void intel_generic_uncore_mmio_disable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (!box->io_addr)
+ return;
+
+ writel(0, box->io_addr + hwc->config_base);
+}
+
+static struct intel_uncore_ops generic_uncore_mmio_ops = {
+ .init_box = intel_generic_uncore_mmio_init_box,
+ .exit_box = uncore_mmio_exit_box,
+ .disable_box = intel_generic_uncore_mmio_disable_box,
+ .enable_box = intel_generic_uncore_mmio_enable_box,
+ .disable_event = intel_generic_uncore_mmio_disable_event,
+ .enable_event = intel_generic_uncore_mmio_enable_event,
+ .read_counter = uncore_mmio_read_counter,
+};
+
+static bool uncore_update_uncore_type(enum uncore_access_type type_id,
+ struct intel_uncore_type *uncore,
+ struct intel_uncore_discovery_type *type)
+{
+ uncore->type_id = type->type;
+ uncore->num_boxes = type->num_boxes;
+ uncore->num_counters = type->num_counters;
+ uncore->perf_ctr_bits = type->counter_width;
+ uncore->box_ids = type->ids;
+
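+	/*
+	 * The same box_offset array serves as msr/pci/mmio offsets,
+	 * depending on the access type.
+	 */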
+ switch (type_id) {
+ case UNCORE_ACCESS_MSR:
+ uncore->ops = &generic_uncore_msr_ops;
+ uncore->perf_ctr = (unsigned int)type->box_ctrl + type->ctr_offset;
+ uncore->event_ctl = (unsigned int)type->box_ctrl + type->ctl_offset;
+ uncore->box_ctl = (unsigned int)type->box_ctrl;
+ uncore->msr_offsets = type->box_offset;
+ break;
+ case UNCORE_ACCESS_PCI:
+ uncore->ops = &generic_uncore_pci_ops;
+ uncore->perf_ctr = (unsigned int)UNCORE_DISCOVERY_PCI_BOX_CTRL(type->box_ctrl) + type->ctr_offset;
+ uncore->event_ctl = (unsigned int)UNCORE_DISCOVERY_PCI_BOX_CTRL(type->box_ctrl) + type->ctl_offset;
+ uncore->box_ctl = (unsigned int)UNCORE_DISCOVERY_PCI_BOX_CTRL(type->box_ctrl);
+ uncore->box_ctls = type->box_ctrl_die;
+ uncore->pci_offsets = type->box_offset;
+ break;
+ case UNCORE_ACCESS_MMIO:
+ uncore->ops = &generic_uncore_mmio_ops;
+ uncore->perf_ctr = (unsigned int)type->ctr_offset;
+ uncore->event_ctl = (unsigned int)type->ctl_offset;
+ uncore->box_ctl = (unsigned int)type->box_ctrl;
+ uncore->box_ctls = type->box_ctrl_die;
+ uncore->mmio_offsets = type->box_offset;
+ uncore->mmio_map_size = UNCORE_GENERIC_MMIO_SIZE;
+ break;
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+static struct intel_uncore_type **
+intel_uncore_generic_init_uncores(enum uncore_access_type type_id)
+{
+ struct intel_uncore_discovery_type *type;
+ struct intel_uncore_type **uncores;
+ struct intel_uncore_type *uncore;
+ struct rb_node *node;
+ int i = 0;
+
+ uncores = kcalloc(num_discovered_types[type_id] + 1,
+ sizeof(struct intel_uncore_type *), GFP_KERNEL);
+ if (!uncores)
+ return empty_uncore;
+
+ for (node = rb_first(&discovery_tables); node; node = rb_next(node)) {
+ type = rb_entry(node, struct intel_uncore_discovery_type, node);
+ if (type->access_type != type_id)
+ continue;
+
+ uncore = kzalloc(sizeof(struct intel_uncore_type), GFP_KERNEL);
+ if (!uncore)
+ break;
+
+ uncore->event_mask = GENERIC_PMON_RAW_EVENT_MASK;
+ uncore->format_group = &generic_uncore_format_group;
+
+ if (!uncore_update_uncore_type(type_id, uncore, type)) {
+ kfree(uncore);
+ continue;
+ }
+ uncores[i++] = uncore;
+ }
+
+ return uncores;
+}
+
+void intel_uncore_generic_uncore_cpu_init(void)
+{
+ uncore_msr_uncores = intel_uncore_generic_init_uncores(UNCORE_ACCESS_MSR);
+}
+
+int intel_uncore_generic_uncore_pci_init(void)
+{
+ uncore_pci_uncores = intel_uncore_generic_init_uncores(UNCORE_ACCESS_PCI);
+
+ return 0;
+}
+
+void intel_uncore_generic_uncore_mmio_init(void)
+{
+ uncore_mmio_uncores = intel_uncore_generic_init_uncores(UNCORE_ACCESS_MMIO);
+}
diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h
new file mode 100644
index 000000000000..1d652939a01c
--- /dev/null
+++ b/arch/x86/events/intel/uncore_discovery.h
@@ -0,0 +1,131 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+/* Generic device ID of a discovery table device */
+#define UNCORE_DISCOVERY_TABLE_DEVICE 0x09a7
+/* Capability ID for a discovery table device */
+#define UNCORE_EXT_CAP_ID_DISCOVERY 0x23
+/* First DVSEC offset */
+#define UNCORE_DISCOVERY_DVSEC_OFFSET 0x8
+/* Mask of the supported discovery entry type */
+#define UNCORE_DISCOVERY_DVSEC_ID_MASK 0xffff
+/* PMON discovery entry type ID */
+#define UNCORE_DISCOVERY_DVSEC_ID_PMON 0x1
+/* Second DVSEC offset */
+#define UNCORE_DISCOVERY_DVSEC2_OFFSET 0xc
+/* Mask of the discovery table BAR offset */
+#define UNCORE_DISCOVERY_DVSEC2_BIR_MASK 0x7
+/* Discovery table BAR base offset */
+#define UNCORE_DISCOVERY_BIR_BASE 0x10
+/* Discovery table BAR step */
+#define UNCORE_DISCOVERY_BIR_STEP 0x4
+/* Mask of the discovery table offset */
+#define UNCORE_DISCOVERY_MASK 0xf
+/* Global discovery table size */
+#define UNCORE_DISCOVERY_GLOBAL_MAP_SIZE 0x20
+
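+/*
+ * A PCI unit control address from the discovery table encodes the
+ * segment, bus, devfn and the register offset within the device.
+ */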
+#define UNCORE_DISCOVERY_PCI_DOMAIN(data) ((data >> 28) & 0x7)
+#define UNCORE_DISCOVERY_PCI_BUS(data) ((data >> 20) & 0xff)
+#define UNCORE_DISCOVERY_PCI_DEVFN(data) ((data >> 12) & 0xff)
+#define UNCORE_DISCOVERY_PCI_BOX_CTRL(data) (data & 0xfff)
+
+#define uncore_discovery_invalid_unit(unit) \
+ (!unit.table1 || !unit.ctl || !unit.table3 || \
+ unit.table1 == -1ULL || unit.ctl == -1ULL || \
+ unit.table3 == -1ULL)
+
+#define GENERIC_PMON_CTL_EV_SEL_MASK 0x000000ff
+#define GENERIC_PMON_CTL_UMASK_MASK 0x0000ff00
+#define GENERIC_PMON_CTL_EDGE_DET (1 << 18)
+#define GENERIC_PMON_CTL_INVERT (1 << 23)
+#define GENERIC_PMON_CTL_THRESH_MASK	0xff000000
+#define GENERIC_PMON_RAW_EVENT_MASK	(GENERIC_PMON_CTL_EV_SEL_MASK | \
+					 GENERIC_PMON_CTL_UMASK_MASK | \
+					 GENERIC_PMON_CTL_EDGE_DET | \
+					 GENERIC_PMON_CTL_INVERT | \
+					 GENERIC_PMON_CTL_THRESH_MASK)
+
+#define GENERIC_PMON_BOX_CTL_FRZ (1 << 0)
+#define GENERIC_PMON_BOX_CTL_RST_CTRL (1 << 8)
+#define GENERIC_PMON_BOX_CTL_RST_CTRS (1 << 9)
+#define GENERIC_PMON_BOX_CTL_INT (GENERIC_PMON_BOX_CTL_RST_CTRL | \
+ GENERIC_PMON_BOX_CTL_RST_CTRS)
+
+enum uncore_access_type {
+ UNCORE_ACCESS_MSR = 0,
+ UNCORE_ACCESS_MMIO,
+ UNCORE_ACCESS_PCI,
+
+ UNCORE_ACCESS_MAX,
+};
+
+struct uncore_global_discovery {
+ union {
+ u64 table1;
+ struct {
+ u64 type : 8,
+ stride : 8,
+ max_units : 10,
+ __reserved_1 : 36,
+ access_type : 2;
+ };
+ };
+
+ u64 ctl; /* Global Control Address */
+
+ union {
+ u64 table3;
+ struct {
+ u64 status_offset : 8,
+ num_status : 16,
+ __reserved_2 : 40;
+ };
+ };
+};
+
+struct uncore_unit_discovery {
+ union {
+ u64 table1;
+ struct {
+ u64 num_regs : 8,
+ ctl_offset : 8,
+ bit_width : 8,
+ ctr_offset : 8,
+ status_offset : 8,
+ __reserved_1 : 22,
+ access_type : 2;
+ };
+ };
+
+ u64 ctl; /* Unit Control Address */
+
+ union {
+ u64 table3;
+ struct {
+ u64 box_type : 16,
+ box_id : 16,
+ __reserved_2 : 32;
+ };
+ };
+};
+
+struct intel_uncore_discovery_type {
+ struct rb_node node;
+ enum uncore_access_type access_type;
+ u64 box_ctrl; /* Unit ctrl addr of the first box */
+ u64 *box_ctrl_die; /* Unit ctrl addr of the first box of each die */
+ u16 type; /* Type ID of the uncore block */
+ u8 num_counters;
+ u8 counter_width;
+ u8 ctl_offset; /* Counter Control 0 offset */
+ u8 ctr_offset; /* Counter 0 offset */
+ u16 num_boxes; /* number of boxes for the uncore block */
+ unsigned int *ids; /* Box IDs */
+ unsigned int *box_offset; /* Box offset */
+};
+
+bool intel_uncore_has_discovery_tables(void);
+void intel_uncore_clear_discovery_tables(void);
+void intel_uncore_generic_uncore_cpu_init(void);
+int intel_uncore_generic_uncore_pci_init(void);
+void intel_uncore_generic_uncore_mmio_init(void);
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index 51271288499e..0f63706cdadf 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -62,6 +62,8 @@
#define PCI_DEVICE_ID_INTEL_TGL_H_IMC 0x9a36
#define PCI_DEVICE_ID_INTEL_RKL_1_IMC 0x4c43
#define PCI_DEVICE_ID_INTEL_RKL_2_IMC 0x4c53
+#define PCI_DEVICE_ID_INTEL_ADL_1_IMC 0x4660
+#define PCI_DEVICE_ID_INTEL_ADL_2_IMC 0x4641
/* SNB event control */
#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff
@@ -131,12 +133,33 @@
#define ICL_UNC_ARB_PER_CTR 0x3b1
#define ICL_UNC_ARB_PERFEVTSEL 0x3b3
+/* ADL uncore global control */
+#define ADL_UNC_PERF_GLOBAL_CTL 0x2ff0
+#define ADL_UNC_FIXED_CTR_CTRL 0x2fde
+#define ADL_UNC_FIXED_CTR 0x2fdf
+
+/* ADL Cbo register */
+#define ADL_UNC_CBO_0_PER_CTR0 0x2002
+#define ADL_UNC_CBO_0_PERFEVTSEL0 0x2000
+#define ADL_UNC_CTL_THRESHOLD 0x3f000000
+#define ADL_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
+ SNB_UNC_CTL_UMASK_MASK | \
+ SNB_UNC_CTL_EDGE_DET | \
+ SNB_UNC_CTL_INVERT | \
+ ADL_UNC_CTL_THRESHOLD)
+
+/* ADL ARB register */
+#define ADL_UNC_ARB_PER_CTR0			0x2fd2
+#define ADL_UNC_ARB_PERFEVTSEL0			0x2fd0
+#define ADL_UNC_ARB_MSR_OFFSET 0x8
+
DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28");
DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(threshold, threshold, "config:24-29");
/* Sandy Bridge uncore support */
static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
@@ -422,6 +445,106 @@ void tgl_uncore_cpu_init(void)
skl_uncore_msr_ops.init_box = rkl_uncore_msr_init_box;
}
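+/*
+ * ADL_UNC_PERF_GLOBAL_CTL is shared by all uncore boxes, so only the
+ * first PMU (pmu_idx 0) initializes and clears it.
+ */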
+static void adl_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+ if (box->pmu->pmu_idx == 0)
+ wrmsrl(ADL_UNC_PERF_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN);
+}
+
+static void adl_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+ wrmsrl(ADL_UNC_PERF_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN);
+}
+
+static void adl_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+ if (box->pmu->pmu_idx == 0)
+ wrmsrl(ADL_UNC_PERF_GLOBAL_CTL, 0);
+}
+
+static void adl_uncore_msr_exit_box(struct intel_uncore_box *box)
+{
+ if (box->pmu->pmu_idx == 0)
+ wrmsrl(ADL_UNC_PERF_GLOBAL_CTL, 0);
+}
+
+static struct intel_uncore_ops adl_uncore_msr_ops = {
+ .init_box = adl_uncore_msr_init_box,
+ .enable_box = adl_uncore_msr_enable_box,
+ .disable_box = adl_uncore_msr_disable_box,
+ .exit_box = adl_uncore_msr_exit_box,
+ .disable_event = snb_uncore_msr_disable_event,
+ .enable_event = snb_uncore_msr_enable_event,
+ .read_counter = uncore_msr_read_counter,
+};
+
+static struct attribute *adl_uncore_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_umask.attr,
+ &format_attr_edge.attr,
+ &format_attr_inv.attr,
+ &format_attr_threshold.attr,
+ NULL,
+};
+
+static const struct attribute_group adl_uncore_format_group = {
+ .name = "format",
+ .attrs = adl_uncore_formats_attr,
+};
+
+static struct intel_uncore_type adl_uncore_cbox = {
+ .name = "cbox",
+ .num_counters = 2,
+ .perf_ctr_bits = 44,
+ .perf_ctr = ADL_UNC_CBO_0_PER_CTR0,
+ .event_ctl = ADL_UNC_CBO_0_PERFEVTSEL0,
+ .event_mask = ADL_UNC_RAW_EVENT_MASK,
+ .msr_offset = ICL_UNC_CBO_MSR_OFFSET,
+ .ops = &adl_uncore_msr_ops,
+ .format_group = &adl_uncore_format_group,
+};
+
+static struct intel_uncore_type adl_uncore_arb = {
+ .name = "arb",
+ .num_counters = 2,
+ .num_boxes = 2,
+ .perf_ctr_bits = 44,
+ .perf_ctr = ADL_UNC_ARB_PER_CTR0,
+ .event_ctl = ADL_UNC_ARB_PERFEVTSEL0,
+ .event_mask = SNB_UNC_RAW_EVENT_MASK,
+ .msr_offset = ADL_UNC_ARB_MSR_OFFSET,
+ .constraints = snb_uncore_arb_constraints,
+ .ops = &adl_uncore_msr_ops,
+ .format_group = &snb_uncore_format_group,
+};
+
+static struct intel_uncore_type adl_uncore_clockbox = {
+ .name = "clock",
+ .num_counters = 1,
+ .num_boxes = 1,
+ .fixed_ctr_bits = 48,
+ .fixed_ctr = ADL_UNC_FIXED_CTR,
+ .fixed_ctl = ADL_UNC_FIXED_CTR_CTRL,
+ .single_fixed = 1,
+ .event_mask = SNB_UNC_CTL_EV_SEL_MASK,
+ .format_group = &icl_uncore_clock_format_group,
+ .ops = &adl_uncore_msr_ops,
+ .event_descs = icl_uncore_events,
+};
+
+static struct intel_uncore_type *adl_msr_uncores[] = {
+ &adl_uncore_cbox,
+ &adl_uncore_arb,
+ &adl_uncore_clockbox,
+ NULL,
+};
+
+void adl_uncore_cpu_init(void)
+{
+ adl_uncore_cbox.num_boxes = icl_get_cbox_num();
+ uncore_msr_uncores = adl_msr_uncores;
+}
+
enum {
SNB_PCI_UNCORE_IMC,
};
@@ -1203,6 +1326,14 @@ static const struct pci_device_id tgl_uncore_pci_ids[] = {
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_TGL_H_IMC),
.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
},
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_1_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_2_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
{ /* end: all zeroes */ }
};
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index b79951d0707c..63f097289a84 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -280,17 +280,17 @@
* | [63] | 00h | VALID - When set, indicates the CPU bus
* numbers have been initialized. (RO)
* |[62:48]| --- | Reserved
- * |[47:40]| 00h | BUS_NUM_5 — Return the bus number BIOS assigned
+ * |[47:40]| 00h | BUS_NUM_5 - Return the bus number BIOS assigned
* CPUBUSNO(5). (RO)
- * |[39:32]| 00h | BUS_NUM_4 — Return the bus number BIOS assigned
+ * |[39:32]| 00h | BUS_NUM_4 - Return the bus number BIOS assigned
* CPUBUSNO(4). (RO)
- * |[31:24]| 00h | BUS_NUM_3 — Return the bus number BIOS assigned
+ * |[31:24]| 00h | BUS_NUM_3 - Return the bus number BIOS assigned
* CPUBUSNO(3). (RO)
- * |[23:16]| 00h | BUS_NUM_2 — Return the bus number BIOS assigned
+ * |[23:16]| 00h | BUS_NUM_2 - Return the bus number BIOS assigned
* CPUBUSNO(2). (RO)
- * |[15:8] | 00h | BUS_NUM_1 — Return the bus number BIOS assigned
+ * |[15:8] | 00h | BUS_NUM_1 - Return the bus number BIOS assigned
* CPUBUSNO(1). (RO)
- * | [7:0] | 00h | BUS_NUM_0 — Return the bus number BIOS assigned
+ * | [7:0] | 00h | BUS_NUM_0 - Return the bus number BIOS assigned
* CPUBUSNO(0). (RO)
*/
#define SKX_MSR_CPU_BUS_NUMBER 0x300
@@ -1159,7 +1159,6 @@ enum {
SNBEP_PCI_QPI_PORT0_FILTER,
SNBEP_PCI_QPI_PORT1_FILTER,
BDX_PCI_QPI_PORT2_FILTER,
- HSWEP_PCI_PCU_3,
};
static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event)
@@ -2857,22 +2856,33 @@ static struct intel_uncore_type *hswep_msr_uncores[] = {
NULL,
};
-void hswep_uncore_cpu_init(void)
+#define HSWEP_PCU_DID 0x2fc0
+#define HSWEP_PCU_CAPID4_OFFSET		0x94
+#define hswep_get_chop(_cap) (((_cap) >> 6) & 0x3)
+
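+/*
+ * Read CAPID4 from the PCU device; a zero "chop" field (bits 7:6) indicates
+ * a part with a reduced number of SBOXes.
+ */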
+static bool hswep_has_limit_sbox(unsigned int device)
{
- int pkg = boot_cpu_data.logical_proc_id;
+ struct pci_dev *dev = pci_get_device(PCI_VENDOR_ID_INTEL, device, NULL);
+ u32 capid4;
+
+ if (!dev)
+ return false;
+
+	pci_read_config_dword(dev, HSWEP_PCU_CAPID4_OFFSET, &capid4);
+	pci_dev_put(dev);
+
+	return !hswep_get_chop(capid4);
+}
+
+void hswep_uncore_cpu_init(void)
+{
if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
/* Detect 6-8 core systems with only two SBOXes */
- if (uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3]) {
- u32 capid4;
-
- pci_read_config_dword(uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3],
- 0x94, &capid4);
- if (((capid4 >> 6) & 0x3) == 0)
- hswep_uncore_sbox.num_boxes = 2;
- }
+ if (hswep_has_limit_sbox(HSWEP_PCU_DID))
+ hswep_uncore_sbox.num_boxes = 2;
uncore_msr_uncores = hswep_msr_uncores;
}
@@ -3135,11 +3145,6 @@ static const struct pci_device_id hswep_uncore_pci_ids[] = {
.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
SNBEP_PCI_QPI_PORT1_FILTER),
},
- { /* PCU.3 (for Capability registers) */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fc0),
- .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
- HSWEP_PCI_PCU_3),
- },
{ /* end: all zeroes */ }
};
@@ -3231,27 +3236,18 @@ static struct event_constraint bdx_uncore_pcu_constraints[] = {
EVENT_CONSTRAINT_END
};
+#define BDX_PCU_DID 0x6fc0
+
void bdx_uncore_cpu_init(void)
{
- int pkg = topology_phys_to_logical_pkg(boot_cpu_data.phys_proc_id);
-
if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
uncore_msr_uncores = bdx_msr_uncores;
- /* BDX-DE doesn't have SBOX */
- if (boot_cpu_data.x86_model == 86) {
- uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL;
/* Detect systems with no SBOXes */
- } else if (uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3]) {
- struct pci_dev *pdev;
- u32 capid4;
-
- pdev = uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3];
- pci_read_config_dword(pdev, 0x94, &capid4);
- if (((capid4 >> 6) & 0x3) == 0)
- bdx_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL;
- }
+ if ((boot_cpu_data.x86_model == 86) || hswep_has_limit_sbox(BDX_PCU_DID))
+ uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL;
+
hswep_uncore_pcu.constraints = bdx_uncore_pcu_constraints;
}
@@ -3472,11 +3468,6 @@ static const struct pci_device_id bdx_uncore_pci_ids[] = {
.driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
BDX_PCI_QPI_PORT2_FILTER),
},
- { /* PCU.3 (for Capability registers) */
- PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fc0),
- .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV,
- HSWEP_PCI_PCU_3),
- },
{ /* end: all zeroes */ }
};
@@ -3684,7 +3675,8 @@ static struct intel_uncore_ops skx_uncore_iio_ops = {
static inline u8 skx_iio_stack(struct intel_uncore_pmu *pmu, int die)
{
- return pmu->type->topology[die] >> (pmu->pmu_idx * BUS_NUM_STRIDE);
+ return pmu->type->topology[die].configuration >>
+ (pmu->pmu_idx * BUS_NUM_STRIDE);
}
static umode_t
@@ -3697,19 +3689,14 @@ skx_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, int die)
}
static ssize_t skx_iio_mapping_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+ struct device_attribute *attr, char *buf)
{
- struct pci_bus *bus = pci_find_next_bus(NULL);
- struct intel_uncore_pmu *uncore_pmu = dev_to_uncore_pmu(dev);
+ struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(dev);
struct dev_ext_attribute *ea = to_dev_ext_attribute(attr);
long die = (long)ea->var;
- /*
- * Current implementation is for single segment configuration hence it's
- * safe to take the segment value from the first available root bus.
- */
- return sprintf(buf, "%04x:%02x\n", pci_domain_nr(bus),
- skx_iio_stack(uncore_pmu, die));
+ return sprintf(buf, "%04x:%02x\n", pmu->type->topology[die].segment,
+ skx_iio_stack(pmu, die));
}
static int skx_msr_cpu_bus_read(int cpu, u64 *topology)
@@ -3746,34 +3733,32 @@ static int die_to_cpu(int die)
static int skx_iio_get_topology(struct intel_uncore_type *type)
{
- int i, ret;
- struct pci_bus *bus = NULL;
+ int die, ret = -EPERM;
- /*
- * Verified single-segment environments only; disabled for multiple
- * segment topologies for now except VMD domains.
- * VMD domains start at 0x10000 to not clash with ACPI _SEG domains.
- */
- while ((bus = pci_find_next_bus(bus))
- && (!pci_domain_nr(bus) || pci_domain_nr(bus) > 0xffff))
- ;
- if (bus)
- return -EPERM;
-
- type->topology = kcalloc(uncore_max_dies(), sizeof(u64), GFP_KERNEL);
+ type->topology = kcalloc(uncore_max_dies(), sizeof(*type->topology),
+ GFP_KERNEL);
if (!type->topology)
return -ENOMEM;
- for (i = 0; i < uncore_max_dies(); i++) {
- ret = skx_msr_cpu_bus_read(die_to_cpu(i), &type->topology[i]);
- if (ret) {
- kfree(type->topology);
- type->topology = NULL;
- return ret;
- }
+ for (die = 0; die < uncore_max_dies(); die++) {
+ ret = skx_msr_cpu_bus_read(die_to_cpu(die),
+ &type->topology[die].configuration);
+ if (ret)
+ break;
+
+ ret = uncore_die_to_segment(die);
+ if (ret < 0)
+ break;
+
+ type->topology[die].segment = ret;
}
- return 0;
+ if (ret < 0) {
+ kfree(type->topology);
+ type->topology = NULL;
+ }
+
+ return ret;
}
static struct attribute_group skx_iio_mapping_group = {
@@ -3794,7 +3779,7 @@ static int skx_iio_set_mapping(struct intel_uncore_type *type)
struct dev_ext_attribute *eas = NULL;
ret = skx_iio_get_topology(type);
- if (ret)
+ if (ret < 0)
goto clear_attr_update;
ret = -ENOMEM;
diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
index 680404c58cb1..c853b28efa33 100644
--- a/arch/x86/events/msr.c
+++ b/arch/x86/events/msr.c
@@ -100,6 +100,8 @@ static bool test_intel(int idx, void *data)
case INTEL_FAM6_TIGERLAKE_L:
case INTEL_FAM6_TIGERLAKE:
case INTEL_FAM6_ROCKETLAKE:
+ case INTEL_FAM6_ALDERLAKE:
+ case INTEL_FAM6_ALDERLAKE_L:
if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF)
return true;
break;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 53b2b5fc23bc..27fa85e7d4fd 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -15,6 +15,7 @@
#include <linux/perf_event.h>
#include <asm/intel_ds.h>
+#include <asm/cpu.h>
/* To enable MSR tracing please use the generic trace points. */
@@ -228,7 +229,6 @@ struct cpu_hw_events {
*/
struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
- unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
int enabled;
int n_events; /* the # of events in the below arrays */
@@ -327,6 +327,8 @@ struct cpu_hw_events {
int n_pair; /* Large increment events */
void *kfree_on_online[X86_PERF_KFREE_MAX];
+
+ struct pmu *pmu;
};
#define __EVENT_CONSTRAINT_RANGE(c, e, n, m, w, o, f) { \
@@ -630,6 +632,71 @@ enum {
x86_lbr_exclusive_max,
};
+struct x86_hybrid_pmu {
+ struct pmu pmu;
+ const char *name;
+ u8 cpu_type;
+ cpumask_t supported_cpus;
+ union perf_capabilities intel_cap;
+ u64 intel_ctrl;
+ int max_pebs_events;
+ int num_counters;
+ int num_counters_fixed;
+ struct event_constraint unconstrained;
+
+ u64 hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX];
+ u64 hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX];
+ struct event_constraint *event_constraints;
+ struct event_constraint *pebs_constraints;
+ struct extra_reg *extra_regs;
+};
+
+static __always_inline struct x86_hybrid_pmu *hybrid_pmu(struct pmu *pmu)
+{
+ return container_of(pmu, struct x86_hybrid_pmu, pmu);
+}
+
+extern struct static_key_false perf_is_hybrid;
+#define is_hybrid() static_branch_unlikely(&perf_is_hybrid)
+
+#define hybrid(_pmu, _field) \
+(*({ \
+ typeof(&x86_pmu._field) __Fp = &x86_pmu._field; \
+ \
+ if (is_hybrid() && (_pmu)) \
+ __Fp = &hybrid_pmu(_pmu)->_field; \
+ \
+ __Fp; \
+}))
+
+#define hybrid_var(_pmu, _var) \
+(*({ \
+ typeof(&_var) __Fp = &_var; \
+ \
+ if (is_hybrid() && (_pmu)) \
+ __Fp = &hybrid_pmu(_pmu)->_var; \
+ \
+ __Fp; \
+}))
+
+enum hybrid_pmu_type {
+ hybrid_big = 0x40,
+ hybrid_small = 0x20,
+
+ hybrid_big_small = hybrid_big | hybrid_small,
+};
+
+#define X86_HYBRID_PMU_ATOM_IDX 0
+#define X86_HYBRID_PMU_CORE_IDX 1
+
+#define X86_HYBRID_NUM_PMUS 2
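As a reading aid (not part of the patch): a hedged sketch of how the hybrid() accessor above is meant to be used; example_num_counters() is an invented helper.

/*
 * Resolve a possibly per-PMU capability: with a NULL pmu, or on a
 * non-hybrid system, this reads the global x86_pmu.num_counters; on a
 * hybrid system it reads the field of the struct x86_hybrid_pmu that
 * 'pmu' is embedded in. hybrid_var() works the same way for plain
 * global variables.
 */
static inline int example_num_counters(struct pmu *pmu)
{
	return hybrid(pmu, num_counters);
}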
+
/*
* struct x86_pmu - generic x86 pmu
*/
@@ -816,6 +883,19 @@ struct x86_pmu {
int (*check_period) (struct perf_event *event, u64 period);
int (*aux_output_match) (struct perf_event *event);
+
+ int (*filter_match)(struct perf_event *event);
+ /*
+ * Hybrid support
+ *
+ * Most PMU capabilities are the same among different hybrid PMUs.
+ * The global x86_pmu saves the architecture capabilities, which
+ * are available for all PMUs. The hybrid_pmu only includes the
+ * unique capabilities.
+ */
+ int num_hybrid_pmus;
+ struct x86_hybrid_pmu *hybrid_pmu;
+ u8 (*get_hybrid_cpu_type) (void);
};
struct x86_perf_task_context_opt {
@@ -905,7 +985,23 @@ static struct perf_pmu_events_ht_attr event_attr_##v = { \
.event_str_ht = ht, \
}
-struct pmu *x86_get_pmu(void);
+#define EVENT_ATTR_STR_HYBRID(_name, v, str, _pmu) \
+static struct perf_pmu_events_hybrid_attr event_attr_##v = { \
+ .attr = __ATTR(_name, 0444, events_hybrid_sysfs_show, NULL),\
+ .id = 0, \
+ .event_str = str, \
+ .pmu_type = _pmu, \
+}
+
+#define FORMAT_HYBRID_PTR(_id) (&format_attr_hybrid_##_id.attr.attr)
+
+#define FORMAT_ATTR_HYBRID(_name, _pmu) \
+static struct perf_pmu_format_hybrid_attr format_attr_hybrid_##_name = {\
+ .attr = __ATTR_RO(_name), \
+ .pmu_type = _pmu, \
+}
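A hedged usage sketch for the two hybrid attribute macros above (the attribute names and the event string are assumptions for illustration, not taken from this patch):

/* A "slots" event exposed only by the big-core hybrid PMU. */
EVENT_ATTR_STR_HYBRID(slots, slots_adl, "event=0x00,umask=0x4", hybrid_big);

/* An "in_tx" format attribute restricted to the big-core hybrid PMU. */
FORMAT_ATTR_HYBRID(in_tx, hybrid_big);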
+
+struct pmu *x86_get_pmu(unsigned int cpu);
extern struct x86_pmu x86_pmu __read_mostly;
static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx)
@@ -964,6 +1060,9 @@ static inline int x86_pmu_rdpmc_index(int index)
return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index;
}
+bool check_hw_exists(struct pmu *pmu, int num_counters,
+ int num_counters_fixed);
+
int x86_add_exclusive(unsigned int what);
void x86_del_exclusive(unsigned int what);
@@ -1027,6 +1126,11 @@ void x86_pmu_enable_event(struct perf_event *event);
int x86_pmu_handle_irq(struct pt_regs *regs);
+void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed,
+ u64 intel_ctrl);
+
+void x86_pmu_update_cpu_context(struct pmu *pmu, int cpu);
+
extern struct event_constraint emptyconstraint;
extern struct event_constraint unconstrained;
@@ -1067,10 +1171,15 @@ ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
char *page);
ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr,
char *page);
+ssize_t events_hybrid_sysfs_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page);
-static inline bool fixed_counter_disabled(int i)
+static inline bool fixed_counter_disabled(int i, struct pmu *pmu)
{
- return !(x86_pmu.intel_ctrl >> (i + INTEL_PMC_IDX_FIXED));
+ u64 intel_ctrl = hybrid(pmu, intel_ctrl);
+
+ return !(intel_ctrl >> (i + INTEL_PMC_IDX_FIXED));
}
#ifdef CONFIG_CPU_SUP_AMD
@@ -1154,6 +1263,8 @@ extern struct event_constraint intel_glm_pebs_event_constraints[];
extern struct event_constraint intel_glp_pebs_event_constraints[];
+extern struct event_constraint intel_grt_pebs_event_constraints[];
+
extern struct event_constraint intel_nehalem_pebs_event_constraints[];
extern struct event_constraint intel_westmere_pebs_event_constraints[];
diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index f42a70496a24..84a1042c3b01 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -800,6 +800,8 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &model_hsx),
X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &model_skl),
X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &model_skl),
+ X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &model_skl),
+ X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &model_skl),
X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &model_spr),
X86_MATCH_VENDOR_FAM(AMD, 0x17, &model_amd_fam17h),
X86_MATCH_VENDOR_FAM(HYGON, 0x18, &model_amd_fam17h),
diff --git a/arch/x86/events/zhaoxin/core.c b/arch/x86/events/zhaoxin/core.c
index e68827e604ad..949d845c922b 100644
--- a/arch/x86/events/zhaoxin/core.c
+++ b/arch/x86/events/zhaoxin/core.c
@@ -494,7 +494,7 @@ static __init void zhaoxin_arch_events_quirk(void)
{
int bit;
- /* disable event that reported as not presend by cpuid */
+ /* disable events that are reported as not present by cpuid */
for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(zx_arch_events_map)) {
zx_pmon_event_map[zx_arch_events_map[bit].id] = 0;
pr_warn("CPUID marked event: \'%s\' unavailable\n",
diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
index 284e73661a18..90e682a92820 100644
--- a/arch/x86/hyperv/hv_apic.c
+++ b/arch/x86/hyperv/hv_apic.c
@@ -60,9 +60,11 @@ static u32 hv_apic_read(u32 reg)
switch (reg) {
case APIC_EOI:
rdmsr(HV_X64_MSR_EOI, reg_val, hi);
+ (void)hi;
return reg_val;
case APIC_TASKPRI:
rdmsr(HV_X64_MSR_TPR, reg_val, hi);
+ (void)hi;
return reg_val;
default:
@@ -103,7 +105,7 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector)
struct hv_send_ipi_ex *ipi_arg;
unsigned long flags;
int nr_bank = 0;
- int ret = 1;
+ u64 status = HV_STATUS_INVALID_PARAMETER;
if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
return false;
@@ -128,19 +130,19 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector)
if (!nr_bank)
ipi_arg->vp_set.format = HV_GENERIC_SET_ALL;
- ret = hv_do_rep_hypercall(HVCALL_SEND_IPI_EX, 0, nr_bank,
+ status = hv_do_rep_hypercall(HVCALL_SEND_IPI_EX, 0, nr_bank,
ipi_arg, NULL);
ipi_mask_ex_done:
local_irq_restore(flags);
- return ((ret == 0) ? true : false);
+ return hv_result_success(status);
}
static bool __send_ipi_mask(const struct cpumask *mask, int vector)
{
int cur_cpu, vcpu;
struct hv_send_ipi ipi_arg;
- int ret = 1;
+ u64 status;
trace_hyperv_send_ipi_mask(mask, vector);
@@ -184,9 +186,9 @@ static bool __send_ipi_mask(const struct cpumask *mask, int vector)
__set_bit(vcpu, (unsigned long *)&ipi_arg.cpu_mask);
}
- ret = hv_do_fast_hypercall16(HVCALL_SEND_IPI, ipi_arg.vector,
+ status = hv_do_fast_hypercall16(HVCALL_SEND_IPI, ipi_arg.vector,
ipi_arg.cpu_mask);
- return ((ret == 0) ? true : false);
+ return hv_result_success(status);
do_ex_hypercall:
return __send_ipi_mask_ex(mask, vector);
@@ -195,6 +197,7 @@ do_ex_hypercall:
static bool __send_ipi_one(int cpu, int vector)
{
int vp = hv_cpu_number_to_vp_number(cpu);
+ u64 status;
trace_hyperv_send_ipi_one(cpu, vector);
@@ -207,7 +210,8 @@ static bool __send_ipi_one(int cpu, int vector)
if (vp >= 64)
return __send_ipi_mask_ex(cpumask_of(cpu), vector);
- return !hv_do_fast_hypercall16(HVCALL_SEND_IPI, vector, BIT_ULL(vp));
+ status = hv_do_fast_hypercall16(HVCALL_SEND_IPI, vector, BIT_ULL(vp));
+ return hv_result_success(status);
}
static void hv_send_ipi(int cpu, int vector)
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index b81047dec1da..bb0ae4b5c00f 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -54,28 +54,6 @@ EXPORT_SYMBOL_GPL(hyperv_pcpu_output_arg);
u32 hv_max_vp_index;
EXPORT_SYMBOL_GPL(hv_max_vp_index);
-void *hv_alloc_hyperv_page(void)
-{
- BUILD_BUG_ON(PAGE_SIZE != HV_HYP_PAGE_SIZE);
-
- return (void *)__get_free_page(GFP_KERNEL);
-}
-EXPORT_SYMBOL_GPL(hv_alloc_hyperv_page);
-
-void *hv_alloc_hyperv_zeroed_page(void)
-{
- BUILD_BUG_ON(PAGE_SIZE != HV_HYP_PAGE_SIZE);
-
- return (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
-}
-EXPORT_SYMBOL_GPL(hv_alloc_hyperv_zeroed_page);
-
-void hv_free_hyperv_page(unsigned long addr)
-{
- free_page(addr);
-}
-EXPORT_SYMBOL_GPL(hv_free_hyperv_page);
-
static int hv_cpu_init(unsigned int cpu)
{
u64 msr_vp_index;
@@ -97,7 +75,7 @@ static int hv_cpu_init(unsigned int cpu)
*output_arg = page_address(pg + 1);
}
- hv_get_vp_index(msr_vp_index);
+ msr_vp_index = hv_get_register(HV_REGISTER_VP_INDEX);
hv_vp_index[smp_processor_id()] = msr_vp_index;
@@ -162,7 +140,7 @@ EXPORT_SYMBOL_GPL(hyperv_stop_tsc_emulation);
static inline bool hv_reenlightenment_available(void)
{
/*
- * Check for required features and priviliges to make TSC frequency
+ * Check for required features and privileges to make TSC frequency
* change notifications work.
*/
return ms_hyperv.features & HV_ACCESS_FREQUENCY_MSRS &&
@@ -292,7 +270,7 @@ static int hv_suspend(void)
/*
* Reset the hypercall page as it is going to be invalidated
- * accross hibernation. Setting hv_hypercall_pg to NULL ensures
+ * across hibernation. Setting hv_hypercall_pg to NULL ensures
* that any subsequent hypercall operation fails safely instead of
* crashing due to an access of an invalid page. The hypercall page
* pointer is restored on resume.
@@ -349,7 +327,7 @@ static void __init hv_stimer_setup_percpu_clockev(void)
* Ignore any errors in setting up stimer clockevents
* as we can run with the LAPIC timer as a fallback.
*/
- (void)hv_stimer_alloc();
+ (void)hv_stimer_alloc(false);
/*
* Still register the LAPIC timer, because the direct-mode STIMER is
@@ -369,7 +347,7 @@ static void __init hv_get_partition_id(void)
local_irq_save(flags);
output_page = *this_cpu_ptr(hyperv_pcpu_output_arg);
status = hv_do_hypercall(HVCALL_GET_PARTITION_ID, NULL, output_page);
- if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) {
+ if (!hv_result_success(status)) {
/* No point in proceeding if this failed */
pr_err("Failed to get partition ID: %lld\n", status);
BUG();
@@ -520,6 +498,8 @@ void __init hyperv_init(void)
x86_init.irqs.create_pci_msi_domain = hv_create_pci_msi_domain;
#endif
+ /* Query the VM's extended capabilities once so that they can be cached. */
+ hv_query_ext_cap(0);
return;
remove_cpuhp_state:
@@ -593,33 +573,6 @@ void hyperv_report_panic(struct pt_regs *regs, long err, bool in_die)
}
EXPORT_SYMBOL_GPL(hyperv_report_panic);
-/**
- * hyperv_report_panic_msg - report panic message to Hyper-V
- * @pa: physical address of the panic page containing the message
- * @size: size of the message in the page
- */
-void hyperv_report_panic_msg(phys_addr_t pa, size_t size)
-{
- /*
- * P3 to contain the physical address of the panic page & P4 to
- * contain the size of the panic data in that page. Rest of the
- * registers are no-op when the NOTIFY_MSG flag is set.
- */
- wrmsrl(HV_X64_MSR_CRASH_P0, 0);
- wrmsrl(HV_X64_MSR_CRASH_P1, 0);
- wrmsrl(HV_X64_MSR_CRASH_P2, 0);
- wrmsrl(HV_X64_MSR_CRASH_P3, pa);
- wrmsrl(HV_X64_MSR_CRASH_P4, size);
-
- /*
- * Let Hyper-V know there is crash data available along with
- * the panic message.
- */
- wrmsrl(HV_X64_MSR_CRASH_CTL,
- (HV_CRASH_CTL_CRASH_NOTIFY | HV_CRASH_CTL_CRASH_NOTIFY_MSG));
-}
-EXPORT_SYMBOL_GPL(hyperv_report_panic_msg);
-
bool hv_is_hyperv_initialized(void)
{
union hv_x64_msr_hypercall_contents hypercall_msr;
@@ -650,7 +603,7 @@ EXPORT_SYMBOL_GPL(hv_is_hibernation_supported);
enum hv_isolation_type hv_get_isolation_type(void)
{
- if (!(ms_hyperv.features_b & HV_ISOLATION))
+ if (!(ms_hyperv.priv_high & HV_ISOLATION))
return HV_ISOLATION_TYPE_NONE;
return FIELD_GET(HV_ISOLATION_TYPE, ms_hyperv.isolation_config_b);
}
@@ -661,3 +614,50 @@ bool hv_is_isolation_supported(void)
return hv_get_isolation_type() != HV_ISOLATION_TYPE_NONE;
}
EXPORT_SYMBOL_GPL(hv_is_isolation_supported);
+
+/* Bit mask of the extended capability to query: see HV_EXT_CAPABILITY_xxx */
+bool hv_query_ext_cap(u64 cap_query)
+{
+ /*
+ * The address of the 'hv_extended_cap' variable will be used as an
+ * output parameter to the hypercall below and so it should be
+ * compatible with 'virt_to_phys'. This means its address must be
+ * directly mapped. Use 'static' to keep it compatible; stack variables
+ * can be virtually mapped, making them incompatible with
+ * 'virt_to_phys'.
+ * Hypercall input/output addresses should also be 8-byte aligned.
+ */
+ static u64 hv_extended_cap __aligned(8);
+ static bool hv_extended_cap_queried;
+ u64 status;
+
+ /*
+ * Querying extended capabilities is an extended hypercall. Check first
+ * whether the partition supports extended hypercalls.
+ */
+ if (!(ms_hyperv.priv_high & HV_ENABLE_EXTENDED_HYPERCALLS))
+ return false;
+
+ /* Extended capabilities do not change at runtime. */
+ if (hv_extended_cap_queried)
+ return hv_extended_cap & cap_query;
+
+ status = hv_do_hypercall(HV_EXT_CALL_QUERY_CAPABILITIES, NULL,
+ &hv_extended_cap);
+
+ /*
+ * The query extended capabilities hypercall should not fail under
+ * any normal circumstances. Avoid repeatedly making the hypercall
+ * on error.
+ */
+ hv_extended_cap_queried = true;
+ status &= HV_HYPERCALL_RESULT_MASK;
+ if (status != HV_STATUS_SUCCESS) {
+ pr_err("Hyper-V: Extended query capabilities hypercall failed 0x%llx\n",
+ status);
+ return false;
+ }
+
+ return hv_extended_cap & cap_query;
+}
+EXPORT_SYMBOL_GPL(hv_query_ext_cap);
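A hedged caller sketch for the new helper (illustration only; HV_EXT_CAPABILITY_EXAMPLE and example_probe() are invented names):

static int example_probe(void)
{
	/* HV_EXT_CAPABILITY_EXAMPLE stands in for a real capability bit. */
	if (!hv_query_ext_cap(HV_EXT_CAPABILITY_EXAMPLE))
		return -ENODEV;

	/* Capability present and cached; the feature can be used. */
	return 0;
}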
diff --git a/arch/x86/hyperv/hv_proc.c b/arch/x86/hyperv/hv_proc.c
index 60461e598239..68a0843d4750 100644
--- a/arch/x86/hyperv/hv_proc.c
+++ b/arch/x86/hyperv/hv_proc.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/types.h>
-#include <linux/version.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/clockchips.h>
@@ -93,10 +92,9 @@ int hv_call_deposit_pages(int node, u64 partition_id, u32 num_pages)
status = hv_do_rep_hypercall(HVCALL_DEPOSIT_MEMORY,
page_count, 0, input_page, NULL);
local_irq_restore(flags);
-
- if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS) {
+ if (!hv_result_success(status)) {
pr_err("Failed to deposit pages: %lld\n", status);
- ret = status;
+ ret = hv_result(status);
goto err_free_allocations;
}
@@ -122,7 +120,7 @@ int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id)
struct hv_add_logical_processor_out *output;
u64 status;
unsigned long flags;
- int ret = 0;
+ int ret = HV_STATUS_SUCCESS;
int pxm = node_to_pxm(node);
/*
@@ -148,13 +146,11 @@ int hv_call_add_logical_proc(int node, u32 lp_index, u32 apic_id)
input, output);
local_irq_restore(flags);
- status &= HV_HYPERCALL_RESULT_MASK;
-
- if (status != HV_STATUS_INSUFFICIENT_MEMORY) {
- if (status != HV_STATUS_SUCCESS) {
+ if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
+ if (!hv_result_success(status)) {
pr_err("%s: cpu %u apic ID %u, %lld\n", __func__,
lp_index, apic_id, status);
- ret = status;
+ ret = hv_result(status);
}
break;
}
@@ -169,7 +165,7 @@ int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags)
struct hv_create_vp *input;
u64 status;
unsigned long irq_flags;
- int ret = 0;
+ int ret = HV_STATUS_SUCCESS;
int pxm = node_to_pxm(node);
/* Root VPs don't seem to need pages deposited */
@@ -200,13 +196,11 @@ int hv_call_create_vp(int node, u64 partition_id, u32 vp_index, u32 flags)
status = hv_do_hypercall(HVCALL_CREATE_VP, input, NULL);
local_irq_restore(irq_flags);
- status &= HV_HYPERCALL_RESULT_MASK;
-
- if (status != HV_STATUS_INSUFFICIENT_MEMORY) {
- if (status != HV_STATUS_SUCCESS) {
+ if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY) {
+ if (!hv_result_success(status)) {
pr_err("%s: vcpu %u, lp %u, %lld\n", __func__,
vp_index, flags, status);
- ret = status;
+ ret = hv_result(status);
}
break;
}
diff --git a/arch/x86/hyperv/hv_spinlock.c b/arch/x86/hyperv/hv_spinlock.c
index f3270c1fc48c..91cfe698bde0 100644
--- a/arch/x86/hyperv/hv_spinlock.c
+++ b/arch/x86/hyperv/hv_spinlock.c
@@ -25,7 +25,6 @@ static void hv_qlock_kick(int cpu)
static void hv_qlock_wait(u8 *byte, u8 val)
{
- unsigned long msr_val;
unsigned long flags;
if (in_nmi())
@@ -48,8 +47,13 @@ static void hv_qlock_wait(u8 *byte, u8 val)
/*
* Only issue the rdmsrl() when the lock state has not changed.
*/
- if (READ_ONCE(*byte) == val)
+ if (READ_ONCE(*byte) == val) {
+ unsigned long msr_val;
+
rdmsrl(HV_X64_MSR_GUEST_IDLE, msr_val);
+
+ (void)msr_val;
+ }
local_irq_restore(flags);
}
diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c
index 4421a8d92e23..514fc64e23d5 100644
--- a/arch/x86/hyperv/irqdomain.c
+++ b/arch/x86/hyperv/irqdomain.c
@@ -63,10 +63,10 @@ static int hv_map_interrupt(union hv_device_id device_id, bool level,
local_irq_restore(flags);
- if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS)
+ if (!hv_result_success(status))
pr_err("%s: hypercall failed, status %lld\n", __func__, status);
- return status & HV_HYPERCALL_RESULT_MASK;
+ return hv_result(status);
}
static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *old_entry)
@@ -88,7 +88,7 @@ static int hv_unmap_interrupt(u64 id, struct hv_interrupt_entry *old_entry)
status = hv_do_hypercall(HVCALL_UNMAP_DEVICE_INTERRUPT, input, NULL);
local_irq_restore(flags);
- return status & HV_HYPERCALL_RESULT_MASK;
+ return hv_result(status);
}
#ifdef CONFIG_PCI_MSI
diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
index 2c87350c1fb0..bd13736d0c05 100644
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -52,16 +52,16 @@ static inline int fill_gva_list(u64 gva_list[], int offset,
return gva_n - offset;
}
-static void hyperv_flush_tlb_others(const struct cpumask *cpus,
- const struct flush_tlb_info *info)
+static void hyperv_flush_tlb_multi(const struct cpumask *cpus,
+ const struct flush_tlb_info *info)
{
int cpu, vcpu, gva_n, max_gvas;
struct hv_tlb_flush **flush_pcpu;
struct hv_tlb_flush *flush;
- u64 status = U64_MAX;
+ u64 status;
unsigned long flags;
- trace_hyperv_mmu_flush_tlb_others(cpus, info);
+ trace_hyperv_mmu_flush_tlb_multi(cpus, info);
if (!hv_hypercall_pg)
goto do_native;
@@ -161,10 +161,10 @@ do_ex_hypercall:
check_status:
local_irq_restore(flags);
- if (!(status & HV_HYPERCALL_RESULT_MASK))
+ if (hv_result_success(status))
return;
do_native:
- native_flush_tlb_others(cpus, info);
+ native_flush_tlb_multi(cpus, info);
}
static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
@@ -176,7 +176,7 @@ static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
u64 status;
if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED))
- return U64_MAX;
+ return HV_STATUS_INVALID_PARAMETER;
flush_pcpu = (struct hv_tlb_flush_ex **)
this_cpu_ptr(hyperv_pcpu_input_arg);
@@ -201,7 +201,7 @@ static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
flush->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K;
nr_bank = cpumask_to_vpset(&(flush->hv_vp_set), cpus);
if (nr_bank < 0)
- return U64_MAX;
+ return HV_STATUS_INVALID_PARAMETER;
/*
* We can flush not more than max_gvas with one hypercall. Flush the
@@ -239,6 +239,6 @@ void hyperv_setup_mmu_ops(void)
return;
pr_info("Using hypercall for remote TLB flush\n");
- pv_ops.mmu.flush_tlb_others = hyperv_flush_tlb_others;
+ pv_ops.mmu.flush_tlb_multi = hyperv_flush_tlb_multi;
pv_ops.mmu.tlb_remove_table = tlb_remove_table;
}
diff --git a/arch/x86/hyperv/nested.c b/arch/x86/hyperv/nested.c
index dd0a843f766d..5d70968c8538 100644
--- a/arch/x86/hyperv/nested.c
+++ b/arch/x86/hyperv/nested.c
@@ -47,7 +47,7 @@ int hyperv_flush_guest_mapping(u64 as)
flush, NULL);
local_irq_restore(flags);
- if (!(status & HV_HYPERCALL_RESULT_MASK))
+ if (hv_result_success(status))
ret = 0;
fault:
@@ -92,7 +92,7 @@ int hyperv_flush_guest_mapping_range(u64 as,
{
struct hv_guest_mapping_flush_list **flush_pcpu;
struct hv_guest_mapping_flush_list *flush;
- u64 status = 0;
+ u64 status;
unsigned long flags;
int ret = -ENOTSUPP;
int gpa_n = 0;
@@ -125,10 +125,10 @@ int hyperv_flush_guest_mapping_range(u64 as,
local_irq_restore(flags);
- if (!(status & HV_HYPERCALL_RESULT_MASK))
+ if (hv_result_success(status))
ret = 0;
else
- ret = status;
+ ret = hv_result(status);
fault:
trace_hyperv_nested_flush_guest_mapping_range(as, ret);
return ret;
diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h
index 62da760d6d5a..cd7b14322035 100644
--- a/arch/x86/include/asm/agp.h
+++ b/arch/x86/include/asm/agp.h
@@ -9,7 +9,7 @@
* Functions to keep the agpgart mappings coherent with the MMU. The
* GART gives the CPU a physical alias of pages in memory. The alias
* region is mapped uncacheable. Make sure there are no conflicting
- * mappings with different cachability attributes for the same
+ * mappings with different cacheability attributes for the same
* page. This avoids data corruption on some CPUs.
*/
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
deleted file mode 100644
index 464034db299f..000000000000
--- a/arch/x86/include/asm/alternative-asm.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_ALTERNATIVE_ASM_H
-#define _ASM_X86_ALTERNATIVE_ASM_H
-
-#ifdef __ASSEMBLY__
-
-#include <asm/asm.h>
-
-#ifdef CONFIG_SMP
- .macro LOCK_PREFIX
-672: lock
- .pushsection .smp_locks,"a"
- .balign 4
- .long 672b - .
- .popsection
- .endm
-#else
- .macro LOCK_PREFIX
- .endm
-#endif
-
-/*
- * objtool annotation to ignore the alternatives and only consider the original
- * instruction(s).
- */
-.macro ANNOTATE_IGNORE_ALTERNATIVE
- .Lannotate_\@:
- .pushsection .discard.ignore_alts
- .long .Lannotate_\@ - .
- .popsection
-.endm
-
-/*
- * Issue one struct alt_instr descriptor entry (need to put it into
- * the section .altinstructions, see below). This entry contains
- * enough information for the alternatives patching code to patch an
- * instruction. See apply_alternatives().
- */
-.macro altinstruction_entry orig alt feature orig_len alt_len pad_len
- .long \orig - .
- .long \alt - .
- .word \feature
- .byte \orig_len
- .byte \alt_len
- .byte \pad_len
-.endm
-
-/*
- * Define an alternative between two instructions. If @feature is
- * present, early code in apply_alternatives() replaces @oldinstr with
- * @newinstr. ".skip" directive takes care of proper instruction padding
- * in case @newinstr is longer than @oldinstr.
- */
-.macro ALTERNATIVE oldinstr, newinstr, feature
-140:
- \oldinstr
-141:
- .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90
-142:
-
- .pushsection .altinstructions,"a"
- altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b
- .popsection
-
- .pushsection .altinstr_replacement,"ax"
-143:
- \newinstr
-144:
- .popsection
-.endm
-
-#define old_len 141b-140b
-#define new_len1 144f-143f
-#define new_len2 145f-144f
-
-/*
- * gas compatible max based on the idea from:
- * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
- *
- * The additional "-" is needed because gas uses a "true" value of -1.
- */
-#define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b)))))
-
-
-/*
- * Same as ALTERNATIVE macro above but for two alternatives. If CPU
- * has @feature1, it replaces @oldinstr with @newinstr1. If CPU has
- * @feature2, it replaces @oldinstr with @feature2.
- */
-.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2
-140:
- \oldinstr
-141:
- .skip -((alt_max_short(new_len1, new_len2) - (old_len)) > 0) * \
- (alt_max_short(new_len1, new_len2) - (old_len)),0x90
-142:
-
- .pushsection .altinstructions,"a"
- altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f,142b-141b
- altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f,142b-141b
- .popsection
-
- .pushsection .altinstr_replacement,"ax"
-143:
- \newinstr1
-144:
- \newinstr2
-145:
- .popsection
-.endm
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* _ASM_X86_ALTERNATIVE_ASM_H */
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 13adca37c99a..a3c2315aca12 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -2,13 +2,17 @@
#ifndef _ASM_X86_ALTERNATIVE_H
#define _ASM_X86_ALTERNATIVE_H
-#ifndef __ASSEMBLY__
-
#include <linux/types.h>
-#include <linux/stddef.h>
#include <linux/stringify.h>
#include <asm/asm.h>
+#define ALTINSTR_FLAG_INV (1 << 15)
+#define ALT_NOT(feat) ((feat) | ALTINSTR_FLAG_INV)
+
+#ifndef __ASSEMBLY__
+
+#include <linux/stddef.h>
+
/*
* Alternative inline assembly for SMP.
*
@@ -61,7 +65,6 @@ struct alt_instr {
u16 cpuid; /* cpuid bit set for replacement */
u8 instrlen; /* length of original instruction */
u8 replacementlen; /* length of new instruction */
- u8 padlen; /* length of build-time padding */
} __packed;
/*
@@ -100,7 +103,6 @@ static inline int alternatives_text_reserved(void *start, void *end)
#define alt_end_marker "663"
#define alt_slen "662b-661b"
-#define alt_pad_len alt_end_marker"b-662b"
#define alt_total_slen alt_end_marker"b-661b"
#define alt_rlen(num) e_replacement(num)"f-"b_replacement(num)"f"
@@ -147,10 +149,9 @@ static inline int alternatives_text_reserved(void *start, void *end)
" .long " b_replacement(num)"f - .\n" /* new instruction */ \
" .word " __stringify(feature) "\n" /* feature bit */ \
" .byte " alt_total_slen "\n" /* source len */ \
- " .byte " alt_rlen(num) "\n" /* replacement len */ \
- " .byte " alt_pad_len "\n" /* pad len */
+ " .byte " alt_rlen(num) "\n" /* replacement len */
-#define ALTINSTR_REPLACEMENT(newinstr, feature, num) /* replacement */ \
+#define ALTINSTR_REPLACEMENT(newinstr, num) /* replacement */ \
"# ALT: replacement " #num "\n" \
b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n"
@@ -161,7 +162,7 @@ static inline int alternatives_text_reserved(void *start, void *end)
ALTINSTR_ENTRY(feature, 1) \
".popsection\n" \
".pushsection .altinstr_replacement, \"ax\"\n" \
- ALTINSTR_REPLACEMENT(newinstr, feature, 1) \
+ ALTINSTR_REPLACEMENT(newinstr, 1) \
".popsection\n"
#define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\
@@ -171,10 +172,15 @@ static inline int alternatives_text_reserved(void *start, void *end)
ALTINSTR_ENTRY(feature2, 2) \
".popsection\n" \
".pushsection .altinstr_replacement, \"ax\"\n" \
- ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \
- ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \
+ ALTINSTR_REPLACEMENT(newinstr1, 1) \
+ ALTINSTR_REPLACEMENT(newinstr2, 2) \
".popsection\n"
+/* If @feature is set, patch in @newinstr_yes, otherwise @newinstr_no. */
+#define ALTERNATIVE_TERNARY(oldinstr, feature, newinstr_yes, newinstr_no) \
+ ALTERNATIVE_2(oldinstr, newinstr_no, X86_FEATURE_ALWAYS, \
+ newinstr_yes, feature)
+
#define ALTERNATIVE_3(oldinsn, newinsn1, feat1, newinsn2, feat2, newinsn3, feat3) \
OLDINSTR_3(oldinsn, 1, 2, 3) \
".pushsection .altinstructions,\"a\"\n" \
@@ -183,9 +189,9 @@ static inline int alternatives_text_reserved(void *start, void *end)
ALTINSTR_ENTRY(feat3, 3) \
".popsection\n" \
".pushsection .altinstr_replacement, \"ax\"\n" \
- ALTINSTR_REPLACEMENT(newinsn1, feat1, 1) \
- ALTINSTR_REPLACEMENT(newinsn2, feat2, 2) \
- ALTINSTR_REPLACEMENT(newinsn3, feat3, 3) \
+ ALTINSTR_REPLACEMENT(newinsn1, 1) \
+ ALTINSTR_REPLACEMENT(newinsn2, 2) \
+ ALTINSTR_REPLACEMENT(newinsn3, 3) \
".popsection\n"
/*
@@ -206,15 +212,15 @@ static inline int alternatives_text_reserved(void *start, void *end)
#define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \
asm_inline volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory")
+#define alternative_ternary(oldinstr, feature, newinstr_yes, newinstr_no) \
+ asm_inline volatile(ALTERNATIVE_TERNARY(oldinstr, feature, newinstr_yes, newinstr_no) ::: "memory")
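To make the new ternary form concrete, a hedged usage sketch (invented example, not from this patch): patch in LFENCE on CPUs with X86_FEATURE_LFENCE_RDTSC and a plain NOP otherwise. Because the "no" branch is keyed to X86_FEATURE_ALWAYS, one of the two replacements is always applied.

static __always_inline void example_speculation_barrier(void)
{
	/* oldinstr is only a patch site; it is always replaced. */
	alternative_ternary("nop", X86_FEATURE_LFENCE_RDTSC,
			    "lfence",	/* CPU has the feature */
			    "nop");	/* CPU lacks the feature */
}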
+
/*
* Alternative inline assembly with input.
*
* Peculiarities:
* No memory clobber here.
* Argument numbers start with 1.
- * Best is to use constraints that are fixed size (like (%1) ... "r")
- * If you use variable sized constraints like "m" or "g" in the
- * replacement make sure to pad to the worst case length.
* Leaving an unused argument 0 to keep API compatibility.
*/
#define alternative_input(oldinstr, newinstr, feature, input...) \
@@ -271,6 +277,115 @@ static inline int alternatives_text_reserved(void *start, void *end)
*/
#define ASM_NO_INPUT_CLOBBER(clbr...) "i" (0) : clbr
+#else /* __ASSEMBLY__ */
+
+#ifdef CONFIG_SMP
+ .macro LOCK_PREFIX
+672: lock
+ .pushsection .smp_locks,"a"
+ .balign 4
+ .long 672b - .
+ .popsection
+ .endm
+#else
+ .macro LOCK_PREFIX
+ .endm
+#endif
+
+/*
+ * objtool annotation to ignore the alternatives and only consider the original
+ * instruction(s).
+ */
+.macro ANNOTATE_IGNORE_ALTERNATIVE
+ .Lannotate_\@:
+ .pushsection .discard.ignore_alts
+ .long .Lannotate_\@ - .
+ .popsection
+.endm
+
+/*
+ * Issue one struct alt_instr descriptor entry (need to put it into
+ * the section .altinstructions, see below). This entry contains
+ * enough information for the alternatives patching code to patch an
+ * instruction. See apply_alternatives().
+ */
+.macro altinstruction_entry orig alt feature orig_len alt_len
+ .long \orig - .
+ .long \alt - .
+ .word \feature
+ .byte \orig_len
+ .byte \alt_len
+.endm
+
+/*
+ * Define an alternative between two instructions. If @feature is
+ * present, early code in apply_alternatives() replaces @oldinstr with
+ * @newinstr. ".skip" directive takes care of proper instruction padding
+ * in case @newinstr is longer than @oldinstr.
+ */
+.macro ALTERNATIVE oldinstr, newinstr, feature
+140:
+ \oldinstr
+141:
+ .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90
+142:
+
+ .pushsection .altinstructions,"a"
+ altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f
+ .popsection
+
+ .pushsection .altinstr_replacement,"ax"
+143:
+ \newinstr
+144:
+ .popsection
+.endm
+
+#define old_len 141b-140b
+#define new_len1 144f-143f
+#define new_len2 145f-144f
+
+/*
+ * gas compatible max based on the idea from:
+ * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
+ *
+ * The additional "-" is needed because gas uses a "true" value of -1.
+ */
+#define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b)))))
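A short worked example of the expression above, added for this review, using the gas semantics the comment describes (a true comparison evaluates to -1):

/*
 * a = 3, b = 5: (a < b) = -1, so -(-((a) < (b))) = -1 (all bits set) and
 *               a ^ ((a ^ b) & -1) = a ^ (a ^ b) = b = 5, the maximum.
 * a = 5, b = 3: (a < b) = 0, the mask is 0, and the result is a ^ 0 = 5.
 */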
+
+
+/*
+ * Same as ALTERNATIVE macro above but for two alternatives. If CPU
+ * has @feature1, it replaces @oldinstr with @newinstr1. If CPU has
+ * @feature2, it replaces @oldinstr with @newinstr2.
+ */
+.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2
+140:
+ \oldinstr
+141:
+ .skip -((alt_max_short(new_len1, new_len2) - (old_len)) > 0) * \
+ (alt_max_short(new_len1, new_len2) - (old_len)),0x90
+142:
+
+ .pushsection .altinstructions,"a"
+ altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f
+ altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f
+ .popsection
+
+ .pushsection .altinstr_replacement,"ax"
+143:
+ \newinstr1
+144:
+ \newinstr2
+145:
+ .popsection
+.endm
+
+/* If @feature is set, patch in @newinstr_yes, otherwise @newinstr_no. */
+#define ALTERNATIVE_TERNARY(oldinstr, feature, newinstr_yes, newinstr_no) \
+ ALTERNATIVE_2 oldinstr, newinstr_no, X86_FEATURE_ALWAYS, \
+ newinstr_yes, feature
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_ALTERNATIVE_H */
diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h
index 51e2bf27cc9b..4cb726c71ed8 100644
--- a/arch/x86/include/asm/asm-prototypes.h
+++ b/arch/x86/include/asm/asm-prototypes.h
@@ -19,18 +19,19 @@ extern void cmpxchg8b_emu(void);
#ifdef CONFIG_RETPOLINE
-#define DECL_INDIRECT_THUNK(reg) \
+#undef GEN
+#define GEN(reg) \
extern asmlinkage void __x86_indirect_thunk_ ## reg (void);
-
-#define DECL_RETPOLINE(reg) \
- extern asmlinkage void __x86_retpoline_ ## reg (void);
+#include <asm/GEN-for-each-reg.h>
#undef GEN
-#define GEN(reg) DECL_INDIRECT_THUNK(reg)
+#define GEN(reg) \
+ extern asmlinkage void __x86_indirect_alt_call_ ## reg (void);
#include <asm/GEN-for-each-reg.h>
#undef GEN
-#define GEN(reg) DECL_RETPOLINE(reg)
+#define GEN(reg) \
+ extern asmlinkage void __x86_indirect_alt_jmp_ ## reg (void);
#include <asm/GEN-for-each-reg.h>
#endif /* CONFIG_RETPOLINE */
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index 4d4ec5cbdc51..94fbe6ae7431 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -22,7 +22,7 @@ extern void __add_wrong_size(void)
/*
 * Constants for operation sizes. On 32-bit, the 64-bit size is set to
* -1 because sizeof will never return -1, thereby making those switch
- * case statements guaranteeed dead code which the compiler will
+ * case statements guaranteed dead code which the compiler will
* eliminate, and allowing the "missing symbol in the default case" to
* indicate a usage error.
*/
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index da78ccbd493b..33d41e350c79 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -41,12 +41,14 @@ unsigned int x86_family(unsigned int sig);
unsigned int x86_model(unsigned int sig);
unsigned int x86_stepping(unsigned int sig);
#ifdef CONFIG_CPU_SUP_INTEL
-extern void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c);
+extern void __init sld_setup(struct cpuinfo_x86 *c);
extern void switch_to_sld(unsigned long tifn);
extern bool handle_user_split_lock(struct pt_regs *regs, long error_code);
extern bool handle_guest_split_lock(unsigned long ip);
+extern void handle_bus_lock(struct pt_regs *regs);
+u8 get_this_hybrid_cpu_type(void);
#else
-static inline void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c) {}
+static inline void __init sld_setup(struct cpuinfo_x86 *c) {}
static inline void switch_to_sld(unsigned long tifn) {}
static inline bool handle_user_split_lock(struct pt_regs *regs, long error_code)
{
@@ -57,6 +59,13 @@ static inline bool handle_guest_split_lock(unsigned long ip)
{
return false;
}
+
+static inline void handle_bus_lock(struct pt_regs *regs) {}
+
+static inline u8 get_this_hybrid_cpu_type(void)
+{
+ return 0;
+}
#endif
#ifdef CONFIG_IA32_FEAT_CTL
void init_ia32_feat_ctl(struct cpuinfo_x86 *c);
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 1728d4ce5730..16a51e7288d5 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -8,6 +8,7 @@
#include <asm/asm.h>
#include <linux/bitops.h>
+#include <asm/alternative.h>
enum cpuid_leafs
{
@@ -175,39 +176,15 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
*/
static __always_inline bool _static_cpu_has(u16 bit)
{
- asm_volatile_goto("1: jmp 6f\n"
- "2:\n"
- ".skip -(((5f-4f) - (2b-1b)) > 0) * "
- "((5f-4f) - (2b-1b)),0x90\n"
- "3:\n"
- ".section .altinstructions,\"a\"\n"
- " .long 1b - .\n" /* src offset */
- " .long 4f - .\n" /* repl offset */
- " .word %P[always]\n" /* always replace */
- " .byte 3b - 1b\n" /* src len */
- " .byte 5f - 4f\n" /* repl len */
- " .byte 3b - 2b\n" /* pad len */
- ".previous\n"
- ".section .altinstr_replacement,\"ax\"\n"
- "4: jmp %l[t_no]\n"
- "5:\n"
- ".previous\n"
- ".section .altinstructions,\"a\"\n"
- " .long 1b - .\n" /* src offset */
- " .long 0\n" /* no replacement */
- " .word %P[feature]\n" /* feature bit */
- " .byte 3b - 1b\n" /* src len */
- " .byte 0\n" /* repl len */
- " .byte 0\n" /* pad len */
- ".previous\n"
- ".section .altinstr_aux,\"ax\"\n"
- "6:\n"
- " testb %[bitnum],%[cap_byte]\n"
- " jnz %l[t_yes]\n"
- " jmp %l[t_no]\n"
- ".previous\n"
+ asm_volatile_goto(
+ ALTERNATIVE_TERNARY("jmp 6f", %P[feature], "", "jmp %l[t_no]")
+ ".section .altinstr_aux,\"ax\"\n"
+ "6:\n"
+ " testb %[bitnum],%[cap_byte]\n"
+ " jnz %l[t_yes]\n"
+ " jmp %l[t_no]\n"
+ ".previous\n"
: : [feature] "i" (bit),
- [always] "i" (X86_FEATURE_ALWAYS),
[bitnum] "i" (1 << (bit & 7)),
[cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3])
: : t_yes, t_no);
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index cc96e26d69f7..3c94316169a3 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -84,7 +84,7 @@
/* CPU types for specific tunings: */
#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */
-#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */
+/* FREE, was #define X86_FEATURE_K7 ( 3*32+ 5) "" Athlon */
#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */
#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */
#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */
@@ -236,6 +236,8 @@
#define X86_FEATURE_EPT_AD ( 8*32+17) /* Intel Extended Page Table access-dirty bit */
#define X86_FEATURE_VMCALL ( 8*32+18) /* "" Hypervisor supports the VMCALL instruction */
#define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */
+#define X86_FEATURE_PVUNLOCK ( 8*32+20) /* "" PV unlock function */
+#define X86_FEATURE_VCPUPREEMPT ( 8*32+21) /* "" PV vcpu_is_preempted function */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */
#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/
@@ -290,6 +292,8 @@
#define X86_FEATURE_FENCE_SWAPGS_KERNEL (11*32+ 5) /* "" LFENCE in kernel entry SWAPGS path */
#define X86_FEATURE_SPLIT_LOCK_DETECT (11*32+ 6) /* #AC for split lock */
#define X86_FEATURE_PER_THREAD_MBA (11*32+ 7) /* "" Per-thread Memory Bandwidth Allocation */
+#define X86_FEATURE_SGX1 (11*32+ 8) /* "" Basic SGX */
+#define X86_FEATURE_SGX2 (11*32+ 9) /* "" SGX Enclave Dynamic Memory Management (EDMM) */
/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */
@@ -354,6 +358,7 @@
#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */
#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */
#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */
+#define X86_FEATURE_BUS_LOCK_DETECT (16*32+24) /* Bus Lock detect */
#define X86_FEATURE_CLDEMOTE (16*32+25) /* CLDEMOTE instruction */
#define X86_FEATURE_MOVDIRI (16*32+27) /* MOVDIRI instruction */
#define X86_FEATURE_MOVDIR64B (16*32+28) /* MOVDIR64B instruction */
@@ -374,6 +379,7 @@
#define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */
#define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */
#define X86_FEATURE_SERIALIZE (18*32+14) /* SERIALIZE instruction */
+#define X86_FEATURE_HYBRID_CPU (18*32+15) /* "" This part has CPUs of more than one type */
#define X86_FEATURE_TSXLDTRK (18*32+16) /* TSX Suspend Load Address Tracking */
#define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */
#define X86_FEATURE_ARCH_LBR (18*32+19) /* Intel ARCH LBR */
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 9224d40cdefe..7d7500806af8 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -283,12 +283,12 @@ extern u32 elf_hwcap2;
*
* The decision process for determining the results are:
*
- *              CPU: | lacks NX*  | has NX, ia32     | has NX, x86_64 |
- * ELF:              |            |                  |                |
+ *              CPU: | lacks NX*  | has NX, ia32     | has NX, x86_64 |
+ * ELF:              |            |                  |                |
* ---------------------|------------|------------------|----------------|
- * missing PT_GNU_STACK | exec-all   | exec-all         | exec-none      |
- * PT_GNU_STACK == RWX  | exec-stack | exec-stack       | exec-stack     |
- * PT_GNU_STACK == RW   | exec-none  | exec-none        | exec-none      |
+ * missing PT_GNU_STACK | exec-all   | exec-all         | exec-none      |
+ * PT_GNU_STACK == RWX  | exec-stack | exec-stack       | exec-stack     |
+ * PT_GNU_STACK == RW   | exec-none  | exec-none        | exec-none      |
*
* exec-all : all PROT_READ user mappings are executable, except when
* backed by files on a noexec-filesystem.
diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h
index 2b87b191b3b8..14ebd2196569 100644
--- a/arch/x86/include/asm/entry-common.h
+++ b/arch/x86/include/asm/entry-common.h
@@ -2,6 +2,7 @@
#ifndef _ASM_X86_ENTRY_COMMON_H
#define _ASM_X86_ENTRY_COMMON_H
+#include <linux/randomize_kstack.h>
#include <linux/user-return-notifier.h>
#include <asm/nospec-branch.h>
@@ -70,6 +71,21 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
*/
current_thread_info()->status &= ~(TS_COMPAT | TS_I386_REGS_POKED);
#endif
+
+ /*
+ * Ultimately, this value will get limited by KSTACK_OFFSET_MAX(),
+ * but not enough for x86 stack utilization comfort. To keep
+ * reasonable stack head room, reduce the maximum offset to 8 bits.
+ *
+ * The actual entropy will be further reduced by the compiler when
+ * applying stack alignment constraints (see cc_stack_align4/8 in
+ * arch/x86/Makefile), which will remove the 3 (x86_64) or 2 (ia32)
+ * low bits from any entropy chosen here.
+ *
+ * Therefore, final stack offset entropy will be 5 (x86_64) or
+ * 6 (ia32) bits.
+ */
+ choose_random_kstack_offset(rdtsc() & 0xFF);
}
#define arch_exit_to_user_mode_prepare arch_exit_to_user_mode_prepare
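Restating the arithmetic in the comment above as a worked example (added for this review, not part of the patch):

/*
 * 8 bits from rdtsc() give 256 raw values (0..255). On x86_64 the
 * compiler-enforced 8-byte stack alignment discards the low 3 bits,
 * leaving 2^5 = 32 usable offsets (0, 8, ..., 248). On ia32 the 4-byte
 * alignment discards 2 bits, leaving 2^6 = 64 offsets (0, 4, ..., 252).
 */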
diff --git a/arch/x86/include/asm/floppy.h b/arch/x86/include/asm/floppy.h
index d43717b423cb..6ec3fc969ad5 100644
--- a/arch/x86/include/asm/floppy.h
+++ b/arch/x86/include/asm/floppy.h
@@ -74,7 +74,6 @@ static irqreturn_t floppy_hardint(int irq, void *dev_id)
int lcount;
char *lptr;
- st = 1;
for (lcount = virtual_dma_count, lptr = virtual_dma_addr;
lcount; lcount--, lptr++) {
st = inb(virtual_dma_port + FD_STATUS);
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index e6cd3fee562b..606f5cc579b2 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -156,7 +156,7 @@ enum hv_isolation_type {
#define HV_X64_MSR_HYPERCALL 0x40000001
/* MSR used to provide vcpu index */
-#define HV_X64_MSR_VP_INDEX 0x40000002
+#define HV_REGISTER_VP_INDEX 0x40000002
/* MSR used to reset the guest OS. */
#define HV_X64_MSR_RESET 0x40000003
@@ -165,10 +165,10 @@ enum hv_isolation_type {
#define HV_X64_MSR_VP_RUNTIME 0x40000010
/* MSR used to read the per-partition time reference counter */
-#define HV_X64_MSR_TIME_REF_COUNT 0x40000020
+#define HV_REGISTER_TIME_REF_COUNT 0x40000020
/* A partition's reference time stamp counter (TSC) page */
-#define HV_X64_MSR_REFERENCE_TSC 0x40000021
+#define HV_REGISTER_REFERENCE_TSC 0x40000021
/* MSR used to retrieve the TSC frequency */
#define HV_X64_MSR_TSC_FREQUENCY 0x40000022
@@ -183,50 +183,50 @@ enum hv_isolation_type {
#define HV_X64_MSR_VP_ASSIST_PAGE 0x40000073
/* Define synthetic interrupt controller model specific registers. */
-#define HV_X64_MSR_SCONTROL 0x40000080
-#define HV_X64_MSR_SVERSION 0x40000081
-#define HV_X64_MSR_SIEFP 0x40000082
-#define HV_X64_MSR_SIMP 0x40000083
-#define HV_X64_MSR_EOM 0x40000084
-#define HV_X64_MSR_SINT0 0x40000090
-#define HV_X64_MSR_SINT1 0x40000091
-#define HV_X64_MSR_SINT2 0x40000092
-#define HV_X64_MSR_SINT3 0x40000093
-#define HV_X64_MSR_SINT4 0x40000094
-#define HV_X64_MSR_SINT5 0x40000095
-#define HV_X64_MSR_SINT6 0x40000096
-#define HV_X64_MSR_SINT7 0x40000097
-#define HV_X64_MSR_SINT8 0x40000098
-#define HV_X64_MSR_SINT9 0x40000099
-#define HV_X64_MSR_SINT10 0x4000009A
-#define HV_X64_MSR_SINT11 0x4000009B
-#define HV_X64_MSR_SINT12 0x4000009C
-#define HV_X64_MSR_SINT13 0x4000009D
-#define HV_X64_MSR_SINT14 0x4000009E
-#define HV_X64_MSR_SINT15 0x4000009F
+#define HV_REGISTER_SCONTROL 0x40000080
+#define HV_REGISTER_SVERSION 0x40000081
+#define HV_REGISTER_SIEFP 0x40000082
+#define HV_REGISTER_SIMP 0x40000083
+#define HV_REGISTER_EOM 0x40000084
+#define HV_REGISTER_SINT0 0x40000090
+#define HV_REGISTER_SINT1 0x40000091
+#define HV_REGISTER_SINT2 0x40000092
+#define HV_REGISTER_SINT3 0x40000093
+#define HV_REGISTER_SINT4 0x40000094
+#define HV_REGISTER_SINT5 0x40000095
+#define HV_REGISTER_SINT6 0x40000096
+#define HV_REGISTER_SINT7 0x40000097
+#define HV_REGISTER_SINT8 0x40000098
+#define HV_REGISTER_SINT9 0x40000099
+#define HV_REGISTER_SINT10 0x4000009A
+#define HV_REGISTER_SINT11 0x4000009B
+#define HV_REGISTER_SINT12 0x4000009C
+#define HV_REGISTER_SINT13 0x4000009D
+#define HV_REGISTER_SINT14 0x4000009E
+#define HV_REGISTER_SINT15 0x4000009F
/*
* Synthetic Timer MSRs. Four timers per vcpu.
*/
-#define HV_X64_MSR_STIMER0_CONFIG 0x400000B0
-#define HV_X64_MSR_STIMER0_COUNT 0x400000B1
-#define HV_X64_MSR_STIMER1_CONFIG 0x400000B2
-#define HV_X64_MSR_STIMER1_COUNT 0x400000B3
-#define HV_X64_MSR_STIMER2_CONFIG 0x400000B4
-#define HV_X64_MSR_STIMER2_COUNT 0x400000B5
-#define HV_X64_MSR_STIMER3_CONFIG 0x400000B6
-#define HV_X64_MSR_STIMER3_COUNT 0x400000B7
+#define HV_REGISTER_STIMER0_CONFIG 0x400000B0
+#define HV_REGISTER_STIMER0_COUNT 0x400000B1
+#define HV_REGISTER_STIMER1_CONFIG 0x400000B2
+#define HV_REGISTER_STIMER1_COUNT 0x400000B3
+#define HV_REGISTER_STIMER2_CONFIG 0x400000B4
+#define HV_REGISTER_STIMER2_COUNT 0x400000B5
+#define HV_REGISTER_STIMER3_CONFIG 0x400000B6
+#define HV_REGISTER_STIMER3_COUNT 0x400000B7
/* Hyper-V guest idle MSR */
#define HV_X64_MSR_GUEST_IDLE 0x400000F0
/* Hyper-V guest crash notification MSR's */
-#define HV_X64_MSR_CRASH_P0 0x40000100
-#define HV_X64_MSR_CRASH_P1 0x40000101
-#define HV_X64_MSR_CRASH_P2 0x40000102
-#define HV_X64_MSR_CRASH_P3 0x40000103
-#define HV_X64_MSR_CRASH_P4 0x40000104
-#define HV_X64_MSR_CRASH_CTL 0x40000105
+#define HV_REGISTER_CRASH_P0 0x40000100
+#define HV_REGISTER_CRASH_P1 0x40000101
+#define HV_REGISTER_CRASH_P2 0x40000102
+#define HV_REGISTER_CRASH_P3 0x40000103
+#define HV_REGISTER_CRASH_P4 0x40000104
+#define HV_REGISTER_CRASH_CTL 0x40000105
/* TSC emulation after migration */
#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106
@@ -236,6 +236,32 @@ enum hv_isolation_type {
/* TSC invariant control */
#define HV_X64_MSR_TSC_INVARIANT_CONTROL 0x40000118
+/* Register name aliases for temporary compatibility */
+#define HV_X64_MSR_STIMER0_COUNT HV_REGISTER_STIMER0_COUNT
+#define HV_X64_MSR_STIMER0_CONFIG HV_REGISTER_STIMER0_CONFIG
+#define HV_X64_MSR_STIMER1_COUNT HV_REGISTER_STIMER1_COUNT
+#define HV_X64_MSR_STIMER1_CONFIG HV_REGISTER_STIMER1_CONFIG
+#define HV_X64_MSR_STIMER2_COUNT HV_REGISTER_STIMER2_COUNT
+#define HV_X64_MSR_STIMER2_CONFIG HV_REGISTER_STIMER2_CONFIG
+#define HV_X64_MSR_STIMER3_COUNT HV_REGISTER_STIMER3_COUNT
+#define HV_X64_MSR_STIMER3_CONFIG HV_REGISTER_STIMER3_CONFIG
+#define HV_X64_MSR_SCONTROL HV_REGISTER_SCONTROL
+#define HV_X64_MSR_SVERSION HV_REGISTER_SVERSION
+#define HV_X64_MSR_SIMP HV_REGISTER_SIMP
+#define HV_X64_MSR_SIEFP HV_REGISTER_SIEFP
+#define HV_X64_MSR_VP_INDEX HV_REGISTER_VP_INDEX
+#define HV_X64_MSR_EOM HV_REGISTER_EOM
+#define HV_X64_MSR_SINT0 HV_REGISTER_SINT0
+#define HV_X64_MSR_SINT15 HV_REGISTER_SINT15
+#define HV_X64_MSR_CRASH_P0 HV_REGISTER_CRASH_P0
+#define HV_X64_MSR_CRASH_P1 HV_REGISTER_CRASH_P1
+#define HV_X64_MSR_CRASH_P2 HV_REGISTER_CRASH_P2
+#define HV_X64_MSR_CRASH_P3 HV_REGISTER_CRASH_P3
+#define HV_X64_MSR_CRASH_P4 HV_REGISTER_CRASH_P4
+#define HV_X64_MSR_CRASH_CTL HV_REGISTER_CRASH_CTL
+#define HV_X64_MSR_TIME_REF_COUNT HV_REGISTER_TIME_REF_COUNT
+#define HV_X64_MSR_REFERENCE_TSC HV_REGISTER_REFERENCE_TSC
+
/*
* Declare the MSR used to setup pages used to communicate with the hypervisor.
*/
@@ -288,35 +314,6 @@ struct hv_tsc_emulation_status {
#define HV_X64_MSR_TSC_REFERENCE_ENABLE 0x00000001
#define HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT 12
-
-/* Define hypervisor message types. */
-enum hv_message_type {
- HVMSG_NONE = 0x00000000,
-
- /* Memory access messages. */
- HVMSG_UNMAPPED_GPA = 0x80000000,
- HVMSG_GPA_INTERCEPT = 0x80000001,
-
- /* Timer notification messages. */
- HVMSG_TIMER_EXPIRED = 0x80000010,
-
- /* Error messages. */
- HVMSG_INVALID_VP_REGISTER_VALUE = 0x80000020,
- HVMSG_UNRECOVERABLE_EXCEPTION = 0x80000021,
- HVMSG_UNSUPPORTED_FEATURE = 0x80000022,
-
- /* Trace buffer complete messages. */
- HVMSG_EVENTLOG_BUFFERCOMPLETE = 0x80000040,
-
- /* Platform-specific processor intercept messages. */
- HVMSG_X64_IOPORT_INTERCEPT = 0x80010000,
- HVMSG_X64_MSR_INTERCEPT = 0x80010001,
- HVMSG_X64_CPUID_INTERCEPT = 0x80010002,
- HVMSG_X64_EXCEPTION_INTERCEPT = 0x80010003,
- HVMSG_X64_APIC_EOI = 0x80010004,
- HVMSG_X64_LEGACY_FP_ERROR = 0x80010005
-};
-
struct hv_nested_enlightenments_control {
struct {
__u32 directhypercall:1;
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 5eb3bdf36a41..e35e342673c7 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -547,7 +547,7 @@ SYM_CODE_END(spurious_entries_start)
/*
* Dummy trap number so the low level ASM macro vector number checks do not
* match which results in emitting plain IDTENTRY stubs without bells and
- * whistels.
+ * whistles.
*/
#define X86_TRAP_OTHER 0xFFFF
diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h
index 4cf2ad521f65..b56c5741581a 100644
--- a/arch/x86/include/asm/inat.h
+++ b/arch/x86/include/asm/inat.h
@@ -6,7 +6,7 @@
*
* Written by Masami Hiramatsu <mhiramat@redhat.com>
*/
-#include <asm/inat_types.h>
+#include <asm/inat_types.h> /* __ignore_sync_check__ */
/*
* Internal bits. Don't use bitmasks directly, because these bits are
diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h
index 98b4dae5e8bc..91d7182ad2d6 100644
--- a/arch/x86/include/asm/insn-eval.h
+++ b/arch/x86/include/asm/insn-eval.h
@@ -25,7 +25,7 @@ int insn_fetch_from_user(struct pt_regs *regs,
unsigned char buf[MAX_INSN_SIZE]);
int insn_fetch_from_user_inatomic(struct pt_regs *regs,
unsigned char buf[MAX_INSN_SIZE]);
-bool insn_decode(struct insn *insn, struct pt_regs *regs,
- unsigned char buf[MAX_INSN_SIZE], int buf_size);
+bool insn_decode_from_regs(struct insn *insn, struct pt_regs *regs,
+ unsigned char buf[MAX_INSN_SIZE], int buf_size);
#endif /* _ASM_X86_INSN_EVAL_H */
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
index 95a448fbb44c..05a6ab940f45 100644
--- a/arch/x86/include/asm/insn.h
+++ b/arch/x86/include/asm/insn.h
@@ -9,7 +9,7 @@
#include <asm/byteorder.h>
/* insn_attr_t is defined in inat.h */
-#include <asm/inat.h>
+#include <asm/inat.h> /* __ignore_sync_check__ */
#if defined(__BYTE_ORDER) ? __BYTE_ORDER == __LITTLE_ENDIAN : defined(__LITTLE_ENDIAN)
@@ -132,13 +132,25 @@ struct insn {
#define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */
extern void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64);
-extern void insn_get_prefixes(struct insn *insn);
-extern void insn_get_opcode(struct insn *insn);
-extern void insn_get_modrm(struct insn *insn);
-extern void insn_get_sib(struct insn *insn);
-extern void insn_get_displacement(struct insn *insn);
-extern void insn_get_immediate(struct insn *insn);
-extern void insn_get_length(struct insn *insn);
+extern int insn_get_prefixes(struct insn *insn);
+extern int insn_get_opcode(struct insn *insn);
+extern int insn_get_modrm(struct insn *insn);
+extern int insn_get_sib(struct insn *insn);
+extern int insn_get_displacement(struct insn *insn);
+extern int insn_get_immediate(struct insn *insn);
+extern int insn_get_length(struct insn *insn);
+
+enum insn_mode {
+ INSN_MODE_32,
+ INSN_MODE_64,
+ /* Mode is determined by the current kernel build. */
+ INSN_MODE_KERN,
+ INSN_NUM_MODES,
+};
+
+extern int insn_decode(struct insn *insn, const void *kaddr, int buf_len, enum insn_mode m);
+
+#define insn_decode_kernel(_insn, _ptr) insn_decode((_insn), (_ptr), MAX_INSN_SIZE, INSN_MODE_KERN)
/* Attribute will be determined after getting ModRM (for opcode groups) */
static inline void insn_get_attribute(struct insn *insn)
@@ -149,17 +161,6 @@ static inline void insn_get_attribute(struct insn *insn)
/* Instruction uses RIP-relative addressing */
extern int insn_rip_relative(struct insn *insn);
-/* Init insn for kernel text */
-static inline void kernel_insn_init(struct insn *insn,
- const void *kaddr, int buf_len)
-{
-#ifdef CONFIG_X86_64
- insn_init(insn, kaddr, buf_len, 1);
-#else /* CONFIG_X86_32 */
- insn_init(insn, kaddr, buf_len, 0);
-#endif
-}
-
static inline int insn_is_avx(struct insn *insn)
{
if (!insn->prefixes.got)
@@ -179,13 +180,6 @@ static inline int insn_has_emulate_prefix(struct insn *insn)
return !!insn->emulate_prefix_size;
}
-/* Ensure this instruction is decoded completely */
-static inline int insn_complete(struct insn *insn)
-{
- return insn->opcode.got && insn->modrm.got && insn->sib.got &&
- insn->displacement.got && insn->immediate.got;
-}
-
static inline insn_byte_t insn_vex_m_bits(struct insn *insn)
{
if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */
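The hunk above replaces the void insn_get_*() helpers, kernel_insn_init() and insn_complete() with a single int-returning insn_decode() entry point. A minimal sketch of a caller, assuming only what the header declares (struct insn, MAX_INSN_SIZE, INSN_MODE_KERN); the wrapper and its error handling are illustrative:

#include <asm/insn.h>

/* Decode one instruction from kernel text; return its length or an error. */
static int example_decode_kernel_insn(const void *kaddr)
{
	struct insn insn;
	int ret;

	/* Replaces kernel_insn_init() + insn_get_length() + insn_complete(). */
	ret = insn_decode(&insn, kaddr, MAX_INSN_SIZE, INSN_MODE_KERN);
	if (ret < 0)
		return ret;		/* could not decode a full instruction */

	return insn.length;		/* fully decoded, length is valid */
}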
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index 9abe842dbd84..955b06d6325a 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -32,7 +32,9 @@
* _EP - 2 socket server parts
* _EX - 4+ socket server parts
*
- * The #define line may optionally include a comment including platform names.
+ * The #define line may optionally include a comment including platform or core
+ * names. An exception is made for skylake/kabylake where steppings seem to have gotten
+ * their own names :-(
*/
/* Wildcard match for FAM6 so X86_MATCH_INTEL_FAM6_MODEL(ANY) works */
@@ -69,35 +71,41 @@
#define INTEL_FAM6_BROADWELL_X 0x4F
#define INTEL_FAM6_BROADWELL_D 0x56
-#define INTEL_FAM6_SKYLAKE_L 0x4E
-#define INTEL_FAM6_SKYLAKE 0x5E
-#define INTEL_FAM6_SKYLAKE_X 0x55
-#define INTEL_FAM6_KABYLAKE_L 0x8E
-#define INTEL_FAM6_KABYLAKE 0x9E
+#define INTEL_FAM6_SKYLAKE_L 0x4E /* Sky Lake */
+#define INTEL_FAM6_SKYLAKE 0x5E /* Sky Lake */
+#define INTEL_FAM6_SKYLAKE_X 0x55 /* Sky Lake */
+/* CASCADELAKE_X 0x55 Sky Lake -- s: 7 */
+/* COOPERLAKE_X 0x55 Sky Lake -- s: 11 */
-#define INTEL_FAM6_CANNONLAKE_L 0x66
+#define INTEL_FAM6_KABYLAKE_L 0x8E /* Sky Lake */
+/* AMBERLAKE_L 0x8E Sky Lake -- s: 9 */
+/* COFFEELAKE_L 0x8E Sky Lake -- s: 10 */
+/* WHISKEYLAKE_L 0x8E Sky Lake -- s: 11,12 */
-#define INTEL_FAM6_ICELAKE_X 0x6A
-#define INTEL_FAM6_ICELAKE_D 0x6C
-#define INTEL_FAM6_ICELAKE 0x7D
-#define INTEL_FAM6_ICELAKE_L 0x7E
-#define INTEL_FAM6_ICELAKE_NNPI 0x9D
+#define INTEL_FAM6_KABYLAKE 0x9E /* Sky Lake */
+/* COFFEELAKE 0x9E Sky Lake -- s: 10-13 */
-#define INTEL_FAM6_TIGERLAKE_L 0x8C
-#define INTEL_FAM6_TIGERLAKE 0x8D
+#define INTEL_FAM6_COMETLAKE 0xA5 /* Sky Lake */
+#define INTEL_FAM6_COMETLAKE_L 0xA6 /* Sky Lake */
-#define INTEL_FAM6_COMETLAKE 0xA5
-#define INTEL_FAM6_COMETLAKE_L 0xA6
+#define INTEL_FAM6_CANNONLAKE_L 0x66 /* Palm Cove */
-#define INTEL_FAM6_ROCKETLAKE 0xA7
+#define INTEL_FAM6_ICELAKE_X 0x6A /* Sunny Cove */
+#define INTEL_FAM6_ICELAKE_D 0x6C /* Sunny Cove */
+#define INTEL_FAM6_ICELAKE 0x7D /* Sunny Cove */
+#define INTEL_FAM6_ICELAKE_L 0x7E /* Sunny Cove */
+#define INTEL_FAM6_ICELAKE_NNPI 0x9D /* Sunny Cove */
-#define INTEL_FAM6_SAPPHIRERAPIDS_X 0x8F
+#define INTEL_FAM6_LAKEFIELD 0x8A /* Sunny Cove / Tremont */
-/* Hybrid Core/Atom Processors */
+#define INTEL_FAM6_ROCKETLAKE 0xA7 /* Cypress Cove */
-#define INTEL_FAM6_LAKEFIELD 0x8A
-#define INTEL_FAM6_ALDERLAKE 0x97
-#define INTEL_FAM6_ALDERLAKE_L 0x9A
+#define INTEL_FAM6_TIGERLAKE_L 0x8C /* Willow Cove */
+#define INTEL_FAM6_TIGERLAKE 0x8D /* Willow Cove */
+#define INTEL_FAM6_SAPPHIRERAPIDS_X 0x8F /* Willow Cove */
+
+#define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / Gracemont */
+#define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */
/* "Small Core" Processors (Atom) */
diff --git a/arch/x86/include/asm/intel_pconfig.h b/arch/x86/include/asm/intel_pconfig.h
index 3cb002b1d0f9..994638ef171b 100644
--- a/arch/x86/include/asm/intel_pconfig.h
+++ b/arch/x86/include/asm/intel_pconfig.h
@@ -38,7 +38,7 @@ enum pconfig_leaf {
#define MKTME_INVALID_ENC_ALG 4
#define MKTME_DEVICE_BUSY 5
-/* Hardware requires the structure to be 256 byte alinged. Otherwise #GP(0). */
+/* Hardware requires the structure to be 256 byte aligned. Otherwise #GP(0). */
struct mktme_key_program {
u16 keyid;
u32 keyid_ctrl;
diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h
index 423b788f495e..ebe8d2ea44fe 100644
--- a/arch/x86/include/asm/intel_pt.h
+++ b/arch/x86/include/asm/intel_pt.h
@@ -3,7 +3,7 @@
#define _ASM_X86_INTEL_PT_H
#define PT_CPUID_LEAVES 2
-#define PT_CPUID_REGS_NUM 4 /* number of regsters (eax, ebx, ecx, edx) */
+#define PT_CPUID_REGS_NUM 4 /* number of registers (eax, ebx, ecx, edx) */
enum pt_capabilities {
PT_CAP_max_subleaf = 0,
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index d726459d08e5..841a5d104afa 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -159,7 +159,7 @@ static inline void *phys_to_virt(phys_addr_t address)
/*
* ISA I/O bus memory addresses are 1:1 with the physical address.
* However, we truncate the address to unsigned int to avoid undesirable
- * promitions in legacy drivers.
+ * promotions in legacy drivers.
*/
static inline unsigned int isa_virt_to_bus(volatile void *address)
{
diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h
index 9b2a0ff76c73..562854c60808 100644
--- a/arch/x86/include/asm/irq_stack.h
+++ b/arch/x86/include/asm/irq_stack.h
@@ -190,7 +190,7 @@
/*
* Macro to invoke __do_softirq on the irq stack. This is only called from
- * task context when bottom halfs are about to be reenabled and soft
+ * task context when bottom halves are about to be reenabled and soft
* interrupts are pending to be processed. The interrupt stack cannot be in
* use here.
*/
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 144d70ea4393..c5ce9845c999 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -109,18 +109,13 @@ static __always_inline unsigned long arch_local_irq_save(void)
}
#else
-#define ENABLE_INTERRUPTS(x) sti
-#define DISABLE_INTERRUPTS(x) cli
-
#ifdef CONFIG_X86_64
#ifdef CONFIG_DEBUG_ENTRY
-#define SAVE_FLAGS(x) pushfq; popq %rax
+#define SAVE_FLAGS pushfq; popq %rax
#endif
#define INTERRUPT_RETURN jmp native_iret
-#else
-#define INTERRUPT_RETURN iret
#endif
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index 06c3cc22a058..610a05374c02 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -6,12 +6,6 @@
#define JUMP_LABEL_NOP_SIZE 5
-#ifdef CONFIG_X86_64
-# define STATIC_KEY_INIT_NOP P6_NOP5_ATOMIC
-#else
-# define STATIC_KEY_INIT_NOP GENERIC_NOP5_ATOMIC
-#endif
-
#include <asm/asm.h>
#include <asm/nops.h>
@@ -20,10 +14,10 @@
#include <linux/stringify.h>
#include <linux/types.h>
-static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
+static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch)
{
asm_volatile_goto("1:"
- ".byte " __stringify(STATIC_KEY_INIT_NOP) "\n\t"
+ ".byte " __stringify(BYTES_NOP5) "\n\t"
".pushsection __jump_table, \"aw\" \n\t"
_ASM_ALIGN "\n\t"
".long 1b - ., %l[l_yes] - . \n\t"
@@ -36,7 +30,7 @@ l_yes:
return true;
}
-static __always_inline bool arch_static_branch_jump(struct static_key *key, bool branch)
+static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch)
{
asm_volatile_goto("1:"
".byte 0xe9\n\t .long %l[l_yes] - 2f\n\t"
@@ -63,7 +57,7 @@ l_yes:
.long \target - .Lstatic_jump_after_\@
.Lstatic_jump_after_\@:
.else
- .byte STATIC_KEY_INIT_NOP
+ .byte BYTES_NOP5
.endif
.pushsection __jump_table, "aw"
_ASM_ALIGN
@@ -75,7 +69,7 @@ l_yes:
.macro STATIC_JUMP_IF_FALSE target, key, def
.Lstatic_jump_\@:
.if \def
- .byte STATIC_KEY_INIT_NOP
+ .byte BYTES_NOP5
.else
/* Equivalent to "jmp.d32 \target" */
.byte 0xe9
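With STATIC_KEY_INIT_NOP gone, every disabled static-branch site is emitted as the single 5-byte BYTES_NOP5 regardless of CPU family. C-side usage is unchanged; a minimal sketch with illustrative names:

#include <linux/jump_label.h>

static DEFINE_STATIC_KEY_FALSE(example_key);

static int example_fast_path(void)
{
	/* Compiles to the 5-byte NOP while the key is disabled ... */
	if (static_branch_unlikely(&example_key))
		return 1;	/* ... and is patched to a jmp once the key is enabled. */
	return 0;
}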
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 6802c59e8252..0a6e34b07017 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -150,11 +150,6 @@ struct kimage_arch {
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
-
- /* Core ELF header buffer */
- void *elf_headers;
- unsigned long elf_headers_sz;
- unsigned long elf_load_addr;
};
#endif /* CONFIG_X86_32 */
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index d20a3d6be36e..bd7f5886a789 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -65,10 +65,22 @@ struct arch_specific_insn {
* a post_handler).
*/
unsigned boostable:1;
- unsigned if_modifier:1;
- unsigned is_call:1;
- unsigned is_pushf:1;
- unsigned is_abs_ip:1;
+ unsigned char size; /* The size of insn */
+ union {
+ unsigned char opcode;
+ struct {
+ unsigned char type;
+ } jcc;
+ struct {
+ unsigned char type;
+ unsigned char asize;
+ } loop;
+ struct {
+ unsigned char reg;
+ } indirect;
+ };
+ s32 rel32; /* relative offset must be s32, s16, or s8 */
+ void (*emulate_op)(struct kprobe *p, struct pt_regs *regs);
/* Number of bytes of text poked */
int tp_len;
};
@@ -107,7 +119,6 @@ extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
extern int kprobe_exceptions_notify(struct notifier_block *self,
unsigned long val, void *data);
extern int kprobe_int3_handler(struct pt_regs *regs);
-extern int kprobe_debug_handler(struct pt_regs *regs);
#else
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3768819693e5..10eca9e8f7f6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1488,7 +1488,7 @@ extern u64 kvm_mce_cap_supported;
/*
* EMULTYPE_NO_DECODE - Set when re-emulating an instruction (after completing
* userspace I/O) to indicate that the emulation context
- * should be resued as is, i.e. skip initialization of
+ * should be reused as is, i.e. skip initialization of
* emulation context, instruction fetch and decode.
*
* EMULTYPE_TRAP_UD - Set when emulating an intercepted #UD from hardware.
@@ -1513,7 +1513,7 @@ extern u64 kvm_mce_cap_supported;
*
* EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware
* backdoor emulation, which is opt in via module param.
- * VMware backoor emulation handles select instructions
+ * VMware backdoor emulation handles select instructions
* and reinjects the #GP for all other cases.
*
* EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in which
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index ccf60a809a17..67ff0d637e55 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -9,70 +9,29 @@
#include <asm/hyperv-tlfs.h>
#include <asm/nospec-branch.h>
#include <asm/paravirt.h>
+#include <asm/mshyperv.h>
typedef int (*hyperv_fill_flush_list_func)(
struct hv_guest_mapping_flush_list *flush,
void *data);
-#define hv_init_timer(timer, tick) \
- wrmsrl(HV_X64_MSR_STIMER0_COUNT + (2*timer), tick)
-#define hv_init_timer_config(timer, val) \
- wrmsrl(HV_X64_MSR_STIMER0_CONFIG + (2*timer), val)
-
-#define hv_get_simp(val) rdmsrl(HV_X64_MSR_SIMP, val)
-#define hv_set_simp(val) wrmsrl(HV_X64_MSR_SIMP, val)
-
-#define hv_get_siefp(val) rdmsrl(HV_X64_MSR_SIEFP, val)
-#define hv_set_siefp(val) wrmsrl(HV_X64_MSR_SIEFP, val)
-
-#define hv_get_synic_state(val) rdmsrl(HV_X64_MSR_SCONTROL, val)
-#define hv_set_synic_state(val) wrmsrl(HV_X64_MSR_SCONTROL, val)
-
-#define hv_get_vp_index(index) rdmsrl(HV_X64_MSR_VP_INDEX, index)
-
-#define hv_signal_eom() wrmsrl(HV_X64_MSR_EOM, 0)
-
-#define hv_get_synint_state(int_num, val) \
- rdmsrl(HV_X64_MSR_SINT0 + int_num, val)
-#define hv_set_synint_state(int_num, val) \
- wrmsrl(HV_X64_MSR_SINT0 + int_num, val)
-#define hv_recommend_using_aeoi() \
- (!(ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED))
+static inline void hv_set_register(unsigned int reg, u64 value)
+{
+ wrmsrl(reg, value);
+}
-#define hv_get_crash_ctl(val) \
- rdmsrl(HV_X64_MSR_CRASH_CTL, val)
+static inline u64 hv_get_register(unsigned int reg)
+{
+ u64 value;
-#define hv_get_time_ref_count(val) \
- rdmsrl(HV_X64_MSR_TIME_REF_COUNT, val)
+ rdmsrl(reg, value);
+ return value;
+}
-#define hv_get_reference_tsc(val) \
- rdmsrl(HV_X64_MSR_REFERENCE_TSC, val)
-#define hv_set_reference_tsc(val) \
- wrmsrl(HV_X64_MSR_REFERENCE_TSC, val)
-#define hv_set_clocksource_vdso(val) \
- ((val).vdso_clock_mode = VDSO_CLOCKMODE_HVCLOCK)
-#define hv_enable_vdso_clocksource() \
- vclocks_set_used(VDSO_CLOCKMODE_HVCLOCK);
#define hv_get_raw_timer() rdtsc_ordered()
-#define hv_get_vector() HYPERVISOR_CALLBACK_VECTOR
-
-/*
- * Reference to pv_ops must be inline so objtool
- * detection of noinstr violations can work correctly.
- */
-static __always_inline void hv_setup_sched_clock(void *sched_clock)
-{
-#ifdef CONFIG_PARAVIRT
- pv_ops.time.sched_clock = sched_clock;
-#endif
-}
void hyperv_vector_handler(struct pt_regs *regs);
-static inline void hv_enable_stimer0_percpu_irq(int irq) {}
-static inline void hv_disable_stimer0_percpu_irq(int irq) {}
-
-
#if IS_ENABLED(CONFIG_HYPERV)
extern int hyperv_init_cpuhp;
@@ -189,38 +148,6 @@ static inline u64 hv_do_fast_hypercall16(u16 code, u64 input1, u64 input2)
return hv_status;
}
-/*
- * Rep hypercalls. Callers of this functions are supposed to ensure that
- * rep_count and varhead_size comply with Hyper-V hypercall definition.
- */
-static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
- void *input, void *output)
-{
- u64 control = code;
- u64 status;
- u16 rep_comp;
-
- control |= (u64)varhead_size << HV_HYPERCALL_VARHEAD_OFFSET;
- control |= (u64)rep_count << HV_HYPERCALL_REP_COMP_OFFSET;
-
- do {
- status = hv_do_hypercall(control, input, output);
- if ((status & HV_HYPERCALL_RESULT_MASK) != HV_STATUS_SUCCESS)
- return status;
-
- /* Bits 32-43 of status have 'Reps completed' data. */
- rep_comp = (status & HV_HYPERCALL_REP_COMP_MASK) >>
- HV_HYPERCALL_REP_COMP_OFFSET;
-
- control &= ~HV_HYPERCALL_REP_START_MASK;
- control |= (u64)rep_comp << HV_HYPERCALL_REP_START_OFFSET;
-
- touch_nmi_watchdog();
- } while (rep_comp < rep_count);
-
- return status;
-}
-
extern struct hv_vp_assist_page **hv_vp_assist_page;
static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu)
@@ -233,9 +160,6 @@ static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu)
void __init hyperv_init(void);
void hyperv_setup_mmu_ops(void);
-void *hv_alloc_hyperv_page(void);
-void *hv_alloc_hyperv_zeroed_page(void);
-void hv_free_hyperv_page(unsigned long addr);
void set_hv_tscchange_cb(void (*cb)(void));
void clear_hv_tscchange_cb(void);
void hyperv_stop_tsc_emulation(void);
@@ -272,8 +196,6 @@ int hv_unmap_ioapic_interrupt(int ioapic_id, struct hv_interrupt_entry *entry);
#else /* CONFIG_HYPERV */
static inline void hyperv_init(void) {}
static inline void hyperv_setup_mmu_ops(void) {}
-static inline void *hv_alloc_hyperv_page(void) { return NULL; }
-static inline void hv_free_hyperv_page(unsigned long addr) {}
static inline void set_hv_tscchange_cb(void (*cb)(void)) {}
static inline void clear_hv_tscchange_cb(void) {}
static inline void hyperv_stop_tsc_emulation(void) {};
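The per-register hv_get_*()/hv_set_*() macros removed above are superseded by the generic hv_get_register()/hv_set_register() accessors, which on x86 are thin rdmsrl()/wrmsrl() wrappers. A sketch of equivalent call sites, using the HV_REGISTER_* names introduced in the hyperv-tlfs.h hunk earlier in this diff; the wrapper functions themselves are illustrative:

/* Formerly hv_signal_eom(), i.e. wrmsrl(HV_X64_MSR_EOM, 0). */
static inline void example_signal_eom(void)
{
	hv_set_register(HV_REGISTER_EOM, 0);
}

/* Formerly hv_get_time_ref_count(val), i.e. rdmsrl(HV_X64_MSR_TIME_REF_COUNT, val). */
static inline u64 example_read_time_ref_count(void)
{
	return hv_get_register(HV_REGISTER_TIME_REF_COUNT);
}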
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 546d6ecf0a35..742d89a00721 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -185,6 +185,9 @@
#define MSR_PEBS_DATA_CFG 0x000003f2
#define MSR_IA32_DS_AREA 0x00000600
#define MSR_IA32_PERF_CAPABILITIES 0x00000345
+#define PERF_CAP_METRICS_IDX 15
+#define PERF_CAP_PT_IDX 16
+
#define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6
#define MSR_IA32_RTIT_CTL 0x00000570
@@ -265,6 +268,7 @@
#define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */
#define DEBUGCTLMSR_BTF_SHIFT 1
#define DEBUGCTLMSR_BTF (1UL << 1) /* single-step on branches */
+#define DEBUGCTLMSR_BUS_LOCK_DETECT (1UL << 2)
#define DEBUGCTLMSR_TR (1UL << 6)
#define DEBUGCTLMSR_BTS (1UL << 7)
#define DEBUGCTLMSR_BTINT (1UL << 8)
@@ -628,8 +632,6 @@
#define MSR_IA32_APICBASE_ENABLE (1<<11)
#define MSR_IA32_APICBASE_BASE (0xfffff<<12)
-#define MSR_IA32_TSCDEADLINE 0x000006e0
-
#define MSR_IA32_UCODE_WRITE 0x00000079
#define MSR_IA32_UCODE_REV 0x0000008b
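DEBUGCTLMSR_BUS_LOCK_DETECT is the architectural enable bit for bus-lock detection via #DB. A hedged sketch of how a CPU could be opted in, using only the existing rdmsrl()/wrmsrl() helpers and MSR_IA32_DEBUGCTLMSR; the real enable path lives elsewhere in the series:

static void example_enable_bus_lock_detect(void)
{
	u64 debugctl;

	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
	debugctl |= DEBUGCTLMSR_BUS_LOCK_DETECT;	/* bit 2, defined above */
	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
}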
diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h
index 12f12b5cf2ca..c1e5e818ba16 100644
--- a/arch/x86/include/asm/nops.h
+++ b/arch/x86/include/asm/nops.h
@@ -4,89 +4,58 @@
/*
* Define nops for use with alternative() and for tracing.
- *
- * *_NOP5_ATOMIC must be a single instruction.
*/
-#define NOP_DS_PREFIX 0x3e
+#ifndef CONFIG_64BIT
-/* generic versions from gas
- 1: nop
- the following instructions are NOT nops in 64-bit mode,
- for 64-bit mode use K8 or P6 nops instead
- 2: movl %esi,%esi
- 3: leal 0x00(%esi),%esi
- 4: leal 0x00(,%esi,1),%esi
- 6: leal 0x00000000(%esi),%esi
- 7: leal 0x00000000(,%esi,1),%esi
-*/
-#define GENERIC_NOP1 0x90
-#define GENERIC_NOP2 0x89,0xf6
-#define GENERIC_NOP3 0x8d,0x76,0x00
-#define GENERIC_NOP4 0x8d,0x74,0x26,0x00
-#define GENERIC_NOP5 GENERIC_NOP1,GENERIC_NOP4
-#define GENERIC_NOP6 0x8d,0xb6,0x00,0x00,0x00,0x00
-#define GENERIC_NOP7 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00
-#define GENERIC_NOP8 GENERIC_NOP1,GENERIC_NOP7
-#define GENERIC_NOP5_ATOMIC NOP_DS_PREFIX,GENERIC_NOP4
+/*
+ * Generic 32bit nops from GAS:
+ *
+ * 1: nop
+ * 2: movl %esi,%esi
+ * 3: leal 0x0(%esi),%esi
+ * 4: leal 0x0(%esi,%eiz,1),%esi
+ * 5: leal %ds:0x0(%esi,%eiz,1),%esi
+ * 6: leal 0x0(%esi),%esi
+ * 7: leal 0x0(%esi,%eiz,1),%esi
+ * 8: leal %ds:0x0(%esi,%eiz,1),%esi
+ *
+ * Except 5 and 8, which are DS prefixed 4 and 7 resp, where GAS would emit 2
+ * nop instructions.
+ */
+#define BYTES_NOP1 0x90
+#define BYTES_NOP2 0x89,0xf6
+#define BYTES_NOP3 0x8d,0x76,0x00
+#define BYTES_NOP4 0x8d,0x74,0x26,0x00
+#define BYTES_NOP5 0x3e,BYTES_NOP4
+#define BYTES_NOP6 0x8d,0xb6,0x00,0x00,0x00,0x00
+#define BYTES_NOP7 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00
+#define BYTES_NOP8 0x3e,BYTES_NOP7
-/* Opteron 64bit nops
- 1: nop
- 2: osp nop
- 3: osp osp nop
- 4: osp osp osp nop
-*/
-#define K8_NOP1 GENERIC_NOP1
-#define K8_NOP2 0x66,K8_NOP1
-#define K8_NOP3 0x66,K8_NOP2
-#define K8_NOP4 0x66,K8_NOP3
-#define K8_NOP5 K8_NOP3,K8_NOP2
-#define K8_NOP6 K8_NOP3,K8_NOP3
-#define K8_NOP7 K8_NOP4,K8_NOP3
-#define K8_NOP8 K8_NOP4,K8_NOP4
-#define K8_NOP5_ATOMIC 0x66,K8_NOP4
+#else
-/* K7 nops
- uses eax dependencies (arbitrary choice)
- 1: nop
- 2: movl %eax,%eax
- 3: leal (,%eax,1),%eax
- 4: leal 0x00(,%eax,1),%eax
- 6: leal 0x00000000(%eax),%eax
- 7: leal 0x00000000(,%eax,1),%eax
-*/
-#define K7_NOP1 GENERIC_NOP1
-#define K7_NOP2 0x8b,0xc0
-#define K7_NOP3 0x8d,0x04,0x20
-#define K7_NOP4 0x8d,0x44,0x20,0x00
-#define K7_NOP5 K7_NOP4,K7_NOP1
-#define K7_NOP6 0x8d,0x80,0,0,0,0
-#define K7_NOP7 0x8D,0x04,0x05,0,0,0,0
-#define K7_NOP8 K7_NOP7,K7_NOP1
-#define K7_NOP5_ATOMIC NOP_DS_PREFIX,K7_NOP4
+/*
+ * Generic 64bit nops from GAS:
+ *
+ * 1: nop
+ * 2: osp nop
+ * 3: nopl (%eax)
+ * 4: nopl 0x00(%eax)
+ * 5: nopl 0x00(%eax,%eax,1)
+ * 6: osp nopl 0x00(%eax,%eax,1)
+ * 7: nopl 0x00000000(%eax)
+ * 8: nopl 0x00000000(%eax,%eax,1)
+ */
+#define BYTES_NOP1 0x90
+#define BYTES_NOP2 0x66,BYTES_NOP1
+#define BYTES_NOP3 0x0f,0x1f,0x00
+#define BYTES_NOP4 0x0f,0x1f,0x40,0x00
+#define BYTES_NOP5 0x0f,0x1f,0x44,0x00,0x00
+#define BYTES_NOP6 0x66,BYTES_NOP5
+#define BYTES_NOP7 0x0f,0x1f,0x80,0x00,0x00,0x00,0x00
+#define BYTES_NOP8 0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
-/* P6 nops
- uses eax dependencies (Intel-recommended choice)
- 1: nop
- 2: osp nop
- 3: nopl (%eax)
- 4: nopl 0x00(%eax)
- 5: nopl 0x00(%eax,%eax,1)
- 6: osp nopl 0x00(%eax,%eax,1)
- 7: nopl 0x00000000(%eax)
- 8: nopl 0x00000000(%eax,%eax,1)
- Note: All the above are assumed to be a single instruction.
- There is kernel code that depends on this.
-*/
-#define P6_NOP1 GENERIC_NOP1
-#define P6_NOP2 0x66,0x90
-#define P6_NOP3 0x0f,0x1f,0x00
-#define P6_NOP4 0x0f,0x1f,0x40,0
-#define P6_NOP5 0x0f,0x1f,0x44,0x00,0
-#define P6_NOP6 0x66,0x0f,0x1f,0x44,0x00,0
-#define P6_NOP7 0x0f,0x1f,0x80,0,0,0,0
-#define P6_NOP8 0x0f,0x1f,0x84,0x00,0,0,0,0
-#define P6_NOP5_ATOMIC P6_NOP5
+#endif /* CONFIG_64BIT */
#ifdef __ASSEMBLY__
#define _ASM_MK_NOP(x) .byte x
@@ -94,54 +63,19 @@
#define _ASM_MK_NOP(x) ".byte " __stringify(x) "\n"
#endif
-#if defined(CONFIG_MK7)
-#define ASM_NOP1 _ASM_MK_NOP(K7_NOP1)
-#define ASM_NOP2 _ASM_MK_NOP(K7_NOP2)
-#define ASM_NOP3 _ASM_MK_NOP(K7_NOP3)
-#define ASM_NOP4 _ASM_MK_NOP(K7_NOP4)
-#define ASM_NOP5 _ASM_MK_NOP(K7_NOP5)
-#define ASM_NOP6 _ASM_MK_NOP(K7_NOP6)
-#define ASM_NOP7 _ASM_MK_NOP(K7_NOP7)
-#define ASM_NOP8 _ASM_MK_NOP(K7_NOP8)
-#define ASM_NOP5_ATOMIC _ASM_MK_NOP(K7_NOP5_ATOMIC)
-#elif defined(CONFIG_X86_P6_NOP)
-#define ASM_NOP1 _ASM_MK_NOP(P6_NOP1)
-#define ASM_NOP2 _ASM_MK_NOP(P6_NOP2)
-#define ASM_NOP3 _ASM_MK_NOP(P6_NOP3)
-#define ASM_NOP4 _ASM_MK_NOP(P6_NOP4)
-#define ASM_NOP5 _ASM_MK_NOP(P6_NOP5)
-#define ASM_NOP6 _ASM_MK_NOP(P6_NOP6)
-#define ASM_NOP7 _ASM_MK_NOP(P6_NOP7)
-#define ASM_NOP8 _ASM_MK_NOP(P6_NOP8)
-#define ASM_NOP5_ATOMIC _ASM_MK_NOP(P6_NOP5_ATOMIC)
-#elif defined(CONFIG_X86_64)
-#define ASM_NOP1 _ASM_MK_NOP(K8_NOP1)
-#define ASM_NOP2 _ASM_MK_NOP(K8_NOP2)
-#define ASM_NOP3 _ASM_MK_NOP(K8_NOP3)
-#define ASM_NOP4 _ASM_MK_NOP(K8_NOP4)
-#define ASM_NOP5 _ASM_MK_NOP(K8_NOP5)
-#define ASM_NOP6 _ASM_MK_NOP(K8_NOP6)
-#define ASM_NOP7 _ASM_MK_NOP(K8_NOP7)
-#define ASM_NOP8 _ASM_MK_NOP(K8_NOP8)
-#define ASM_NOP5_ATOMIC _ASM_MK_NOP(K8_NOP5_ATOMIC)
-#else
-#define ASM_NOP1 _ASM_MK_NOP(GENERIC_NOP1)
-#define ASM_NOP2 _ASM_MK_NOP(GENERIC_NOP2)
-#define ASM_NOP3 _ASM_MK_NOP(GENERIC_NOP3)
-#define ASM_NOP4 _ASM_MK_NOP(GENERIC_NOP4)
-#define ASM_NOP5 _ASM_MK_NOP(GENERIC_NOP5)
-#define ASM_NOP6 _ASM_MK_NOP(GENERIC_NOP6)
-#define ASM_NOP7 _ASM_MK_NOP(GENERIC_NOP7)
-#define ASM_NOP8 _ASM_MK_NOP(GENERIC_NOP8)
-#define ASM_NOP5_ATOMIC _ASM_MK_NOP(GENERIC_NOP5_ATOMIC)
-#endif
+#define ASM_NOP1 _ASM_MK_NOP(BYTES_NOP1)
+#define ASM_NOP2 _ASM_MK_NOP(BYTES_NOP2)
+#define ASM_NOP3 _ASM_MK_NOP(BYTES_NOP3)
+#define ASM_NOP4 _ASM_MK_NOP(BYTES_NOP4)
+#define ASM_NOP5 _ASM_MK_NOP(BYTES_NOP5)
+#define ASM_NOP6 _ASM_MK_NOP(BYTES_NOP6)
+#define ASM_NOP7 _ASM_MK_NOP(BYTES_NOP7)
+#define ASM_NOP8 _ASM_MK_NOP(BYTES_NOP8)
#define ASM_NOP_MAX 8
-#define NOP_ATOMIC5 (ASM_NOP_MAX+1) /* Entry for the 5-byte atomic NOP */
#ifndef __ASSEMBLY__
-extern const unsigned char * const *ideal_nops;
-extern void arch_init_ideal_nops(void);
+extern const unsigned char * const x86_nops[];
#endif
#endif /* _ASM_X86_NOPS_H */
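Collapsing the GENERIC/K7/K8/P6 families into one BYTES_NOPn set also retires ideal_nops/arch_init_ideal_nops; C code now indexes the single x86_nops[] table by length. A minimal sketch, assuming x86_nops[len] points at one canonical NOP of exactly len bytes (which is how the alternatives code consumes it):

#include <linux/string.h>
#include <asm/nops.h>

static void example_fill_with_nops(u8 *dest, unsigned int len)
{
	/* Emit one canonical NOP per chunk of at most ASM_NOP_MAX bytes. */
	while (len) {
		unsigned int chunk = len > ASM_NOP_MAX ? ASM_NOP_MAX : len;

		memcpy(dest, x86_nops[chunk], chunk);
		dest += chunk;
		len -= chunk;
	}
}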
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index cb9ad6b73973..3ad8c6d3cbb3 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -7,7 +7,6 @@
#include <linux/objtool.h>
#include <asm/alternative.h>
-#include <asm/alternative-asm.h>
#include <asm/cpufeatures.h>
#include <asm/msr-index.h>
#include <asm/unwind_hints.h>
@@ -33,7 +32,7 @@
/*
* Google experimented with loop-unrolling and this turned out to be
- * the optimal version — two calls, each with their own speculation
+ * the optimal version - two calls, each with their own speculation
* trap should their return address end up getting used, in a loop.
*/
#define __FILL_RETURN_BUFFER(reg, nr, sp) \
@@ -81,7 +80,7 @@
.macro JMP_NOSPEC reg:req
#ifdef CONFIG_RETPOLINE
ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \
- __stringify(jmp __x86_retpoline_\reg), X86_FEATURE_RETPOLINE, \
+ __stringify(jmp __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \
__stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_AMD
#else
jmp *%\reg
@@ -91,7 +90,7 @@
.macro CALL_NOSPEC reg:req
#ifdef CONFIG_RETPOLINE
ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; call *%\reg), \
- __stringify(call __x86_retpoline_\reg), X86_FEATURE_RETPOLINE, \
+ __stringify(call __x86_indirect_thunk_\reg), X86_FEATURE_RETPOLINE, \
__stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *%\reg), X86_FEATURE_RETPOLINE_AMD
#else
call *%\reg
@@ -129,7 +128,7 @@
ALTERNATIVE_2( \
ANNOTATE_RETPOLINE_SAFE \
"call *%[thunk_target]\n", \
- "call __x86_retpoline_%V[thunk_target]\n", \
+ "call __x86_indirect_thunk_%V[thunk_target]\n", \
X86_FEATURE_RETPOLINE, \
"lfence;\n" \
ANNOTATE_RETPOLINE_SAFE \
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 4abf110e2243..da3a1ac82be5 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -15,11 +15,20 @@
#include <linux/bug.h>
#include <linux/types.h>
#include <linux/cpumask.h>
+#include <linux/static_call_types.h>
#include <asm/frame.h>
-static inline unsigned long long paravirt_sched_clock(void)
+u64 dummy_steal_clock(int cpu);
+u64 dummy_sched_clock(void);
+
+DECLARE_STATIC_CALL(pv_steal_clock, dummy_steal_clock);
+DECLARE_STATIC_CALL(pv_sched_clock, dummy_sched_clock);
+
+void paravirt_set_sched_clock(u64 (*func)(void));
+
+static inline u64 paravirt_sched_clock(void)
{
- return PVOP_CALL0(unsigned long long, time.sched_clock);
+ return static_call(pv_sched_clock)();
}
struct static_key;
@@ -33,9 +42,13 @@ bool pv_is_native_vcpu_is_preempted(void);
static inline u64 paravirt_steal_clock(int cpu)
{
- return PVOP_CALL1(u64, time.steal_clock, cpu);
+ return static_call(pv_steal_clock)(cpu);
}
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+void __init paravirt_set_cap(void);
+#endif
+
/* The paravirtualized I/O functions */
static inline void slow_down_io(void)
{
@@ -50,7 +63,7 @@ static inline void slow_down_io(void)
void native_flush_tlb_local(void);
void native_flush_tlb_global(void);
void native_flush_tlb_one_user(unsigned long addr);
-void native_flush_tlb_others(const struct cpumask *cpumask,
+void native_flush_tlb_multi(const struct cpumask *cpumask,
const struct flush_tlb_info *info);
static inline void __flush_tlb_local(void)
@@ -68,10 +81,10 @@ static inline void __flush_tlb_one_user(unsigned long addr)
PVOP_VCALL1(mmu.flush_tlb_one_user, addr);
}
-static inline void __flush_tlb_others(const struct cpumask *cpumask,
+static inline void __flush_tlb_multi(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
- PVOP_VCALL2(mmu.flush_tlb_others, cpumask, info);
+ PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info);
}
static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
@@ -122,7 +135,9 @@ static inline void write_cr0(unsigned long x)
static inline unsigned long read_cr2(void)
{
- return PVOP_CALLEE0(unsigned long, mmu.read_cr2);
+ return PVOP_ALT_CALLEE0(unsigned long, mmu.read_cr2,
+ "mov %%cr2, %%rax;",
+ ALT_NOT(X86_FEATURE_XENPV));
}
static inline void write_cr2(unsigned long x)
@@ -132,12 +147,14 @@ static inline void write_cr2(unsigned long x)
static inline unsigned long __read_cr3(void)
{
- return PVOP_CALL0(unsigned long, mmu.read_cr3);
+ return PVOP_ALT_CALL0(unsigned long, mmu.read_cr3,
+ "mov %%cr3, %%rax;", ALT_NOT(X86_FEATURE_XENPV));
}
static inline void write_cr3(unsigned long x)
{
- PVOP_VCALL1(mmu.write_cr3, x);
+ PVOP_ALT_VCALL1(mmu.write_cr3, x,
+ "mov %%rdi, %%cr3", ALT_NOT(X86_FEATURE_XENPV));
}
static inline void __write_cr4(unsigned long x)
@@ -157,7 +174,7 @@ static inline void halt(void)
static inline void wbinvd(void)
{
- PVOP_VCALL0(cpu.wbinvd);
+ PVOP_ALT_VCALL0(cpu.wbinvd, "wbinvd", ALT_NOT(X86_FEATURE_XENPV));
}
static inline u64 paravirt_read_msr(unsigned msr)
@@ -371,22 +388,28 @@ static inline void paravirt_release_p4d(unsigned long pfn)
static inline pte_t __pte(pteval_t val)
{
- return (pte_t) { PVOP_CALLEE1(pteval_t, mmu.make_pte, val) };
+ return (pte_t) { PVOP_ALT_CALLEE1(pteval_t, mmu.make_pte, val,
+ "mov %%rdi, %%rax",
+ ALT_NOT(X86_FEATURE_XENPV)) };
}
static inline pteval_t pte_val(pte_t pte)
{
- return PVOP_CALLEE1(pteval_t, mmu.pte_val, pte.pte);
+ return PVOP_ALT_CALLEE1(pteval_t, mmu.pte_val, pte.pte,
+ "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
}
static inline pgd_t __pgd(pgdval_t val)
{
- return (pgd_t) { PVOP_CALLEE1(pgdval_t, mmu.make_pgd, val) };
+ return (pgd_t) { PVOP_ALT_CALLEE1(pgdval_t, mmu.make_pgd, val,
+ "mov %%rdi, %%rax",
+ ALT_NOT(X86_FEATURE_XENPV)) };
}
static inline pgdval_t pgd_val(pgd_t pgd)
{
- return PVOP_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd);
+ return PVOP_ALT_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd,
+ "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
}
#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
@@ -419,12 +442,15 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
static inline pmd_t __pmd(pmdval_t val)
{
- return (pmd_t) { PVOP_CALLEE1(pmdval_t, mmu.make_pmd, val) };
+ return (pmd_t) { PVOP_ALT_CALLEE1(pmdval_t, mmu.make_pmd, val,
+ "mov %%rdi, %%rax",
+ ALT_NOT(X86_FEATURE_XENPV)) };
}
static inline pmdval_t pmd_val(pmd_t pmd)
{
- return PVOP_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd);
+ return PVOP_ALT_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd,
+ "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
}
static inline void set_pud(pud_t *pudp, pud_t pud)
@@ -436,14 +462,16 @@ static inline pud_t __pud(pudval_t val)
{
pudval_t ret;
- ret = PVOP_CALLEE1(pudval_t, mmu.make_pud, val);
+ ret = PVOP_ALT_CALLEE1(pudval_t, mmu.make_pud, val,
+ "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
return (pud_t) { ret };
}
static inline pudval_t pud_val(pud_t pud)
{
- return PVOP_CALLEE1(pudval_t, mmu.pud_val, pud.pud);
+ return PVOP_ALT_CALLEE1(pudval_t, mmu.pud_val, pud.pud,
+ "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
}
static inline void pud_clear(pud_t *pudp)
@@ -462,14 +490,17 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
static inline p4d_t __p4d(p4dval_t val)
{
- p4dval_t ret = PVOP_CALLEE1(p4dval_t, mmu.make_p4d, val);
+ p4dval_t ret = PVOP_ALT_CALLEE1(p4dval_t, mmu.make_p4d, val,
+ "mov %%rdi, %%rax",
+ ALT_NOT(X86_FEATURE_XENPV));
return (p4d_t) { ret };
}
static inline p4dval_t p4d_val(p4d_t p4d)
{
- return PVOP_CALLEE1(p4dval_t, mmu.p4d_val, p4d.p4d);
+ return PVOP_ALT_CALLEE1(p4dval_t, mmu.p4d_val, p4d.p4d,
+ "mov %%rdi, %%rax", ALT_NOT(X86_FEATURE_XENPV));
}
static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd)
@@ -556,7 +587,9 @@ static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock,
static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock)
{
- PVOP_VCALLEE1(lock.queued_spin_unlock, lock);
+ PVOP_ALT_VCALLEE1(lock.queued_spin_unlock, lock,
+ "movb $0, (%%" _ASM_ARG1 ");",
+ ALT_NOT(X86_FEATURE_PVUNLOCK));
}
static __always_inline void pv_wait(u8 *ptr, u8 val)
@@ -571,7 +604,9 @@ static __always_inline void pv_kick(int cpu)
static __always_inline bool pv_vcpu_is_preempted(long cpu)
{
- return PVOP_CALLEE1(bool, lock.vcpu_is_preempted, cpu);
+ return PVOP_ALT_CALLEE1(bool, lock.vcpu_is_preempted, cpu,
+ "xor %%" _ASM_AX ", %%" _ASM_AX ";",
+ ALT_NOT(X86_FEATURE_VCPUPREEMPT));
}
void __raw_callee_save___native_queued_spin_unlock(struct qspinlock *lock);
@@ -645,17 +680,18 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu);
#ifdef CONFIG_PARAVIRT_XXL
static inline notrace unsigned long arch_local_save_flags(void)
{
- return PVOP_CALLEE0(unsigned long, irq.save_fl);
+ return PVOP_ALT_CALLEE0(unsigned long, irq.save_fl, "pushf; pop %%rax;",
+ ALT_NOT(X86_FEATURE_XENPV));
}
static inline notrace void arch_local_irq_disable(void)
{
- PVOP_VCALLEE0(irq.irq_disable);
+ PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT(X86_FEATURE_XENPV));
}
static inline notrace void arch_local_irq_enable(void)
{
- PVOP_VCALLEE0(irq.irq_enable);
+ PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT(X86_FEATURE_XENPV));
}
static inline notrace unsigned long arch_local_irq_save(void)
@@ -700,84 +736,27 @@ extern void default_banner(void);
.popsection
-#define COND_PUSH(set, mask, reg) \
- .if ((~(set)) & mask); push %reg; .endif
-#define COND_POP(set, mask, reg) \
- .if ((~(set)) & mask); pop %reg; .endif
-
#ifdef CONFIG_X86_64
-
-#define PV_SAVE_REGS(set) \
- COND_PUSH(set, CLBR_RAX, rax); \
- COND_PUSH(set, CLBR_RCX, rcx); \
- COND_PUSH(set, CLBR_RDX, rdx); \
- COND_PUSH(set, CLBR_RSI, rsi); \
- COND_PUSH(set, CLBR_RDI, rdi); \
- COND_PUSH(set, CLBR_R8, r8); \
- COND_PUSH(set, CLBR_R9, r9); \
- COND_PUSH(set, CLBR_R10, r10); \
- COND_PUSH(set, CLBR_R11, r11)
-#define PV_RESTORE_REGS(set) \
- COND_POP(set, CLBR_R11, r11); \
- COND_POP(set, CLBR_R10, r10); \
- COND_POP(set, CLBR_R9, r9); \
- COND_POP(set, CLBR_R8, r8); \
- COND_POP(set, CLBR_RDI, rdi); \
- COND_POP(set, CLBR_RSI, rsi); \
- COND_POP(set, CLBR_RDX, rdx); \
- COND_POP(set, CLBR_RCX, rcx); \
- COND_POP(set, CLBR_RAX, rax)
+#ifdef CONFIG_PARAVIRT_XXL
#define PARA_PATCH(off) ((off) / 8)
#define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .quad, 8)
#define PARA_INDIRECT(addr) *addr(%rip)
-#else
-#define PV_SAVE_REGS(set) \
- COND_PUSH(set, CLBR_EAX, eax); \
- COND_PUSH(set, CLBR_EDI, edi); \
- COND_PUSH(set, CLBR_ECX, ecx); \
- COND_PUSH(set, CLBR_EDX, edx)
-#define PV_RESTORE_REGS(set) \
- COND_POP(set, CLBR_EDX, edx); \
- COND_POP(set, CLBR_ECX, ecx); \
- COND_POP(set, CLBR_EDI, edi); \
- COND_POP(set, CLBR_EAX, eax)
-
-#define PARA_PATCH(off) ((off) / 4)
-#define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .long, 4)
-#define PARA_INDIRECT(addr) *%cs:addr
-#endif
-#ifdef CONFIG_PARAVIRT_XXL
#define INTERRUPT_RETURN \
- PARA_SITE(PARA_PATCH(PV_CPU_iret), \
- ANNOTATE_RETPOLINE_SAFE; \
- jmp PARA_INDIRECT(pv_ops+PV_CPU_iret);)
-
-#define DISABLE_INTERRUPTS(clobbers) \
- PARA_SITE(PARA_PATCH(PV_IRQ_irq_disable), \
- PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
- ANNOTATE_RETPOLINE_SAFE; \
- call PARA_INDIRECT(pv_ops+PV_IRQ_irq_disable); \
- PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
-
-#define ENABLE_INTERRUPTS(clobbers) \
- PARA_SITE(PARA_PATCH(PV_IRQ_irq_enable), \
- PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
- ANNOTATE_RETPOLINE_SAFE; \
- call PARA_INDIRECT(pv_ops+PV_IRQ_irq_enable); \
- PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
-#endif
+ ANNOTATE_RETPOLINE_SAFE; \
+ ALTERNATIVE_TERNARY("jmp *paravirt_iret(%rip);", \
+ X86_FEATURE_XENPV, "jmp xen_iret;", "jmp native_iret;")
-#ifdef CONFIG_X86_64
-#ifdef CONFIG_PARAVIRT_XXL
#ifdef CONFIG_DEBUG_ENTRY
-#define SAVE_FLAGS(clobbers) \
- PARA_SITE(PARA_PATCH(PV_IRQ_save_fl), \
- PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \
- ANNOTATE_RETPOLINE_SAFE; \
- call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl); \
- PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
+.macro PARA_IRQ_save_fl
+ PARA_SITE(PARA_PATCH(PV_IRQ_save_fl),
+ ANNOTATE_RETPOLINE_SAFE;
+ call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);)
+.endm
+
+#define SAVE_FLAGS ALTERNATIVE "PARA_IRQ_save_fl;", "pushf; pop %rax;", \
+ ALT_NOT(X86_FEATURE_XENPV)
#endif
#endif /* CONFIG_PARAVIRT_XXL */
#endif /* CONFIG_X86_64 */
@@ -800,5 +779,11 @@ static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
{
}
#endif
+
+#ifndef CONFIG_PARAVIRT_SPINLOCKS
+static inline void paravirt_set_cap(void)
+{
+}
+#endif
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_PARAVIRT_H */
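With pv_ops.time gone, sched_clock and steal_clock are plain static calls, so a guest registers its clock source by updating the static-call key instead of writing into pv_ops. A hedged sketch with illustrative function names:

#include <linux/static_call.h>
#include <asm/paravirt.h>

static u64 example_guest_sched_clock(void)
{
	return 0;	/* read the hypervisor's clocksource here */
}

static u64 example_guest_steal_clock(int cpu)
{
	return 0;	/* per-CPU stolen time in nanoseconds */
}

static void __init example_guest_time_init(void)
{
	/* Replaces "pv_ops.time.sched_clock = example_guest_sched_clock;". */
	paravirt_set_sched_clock(example_guest_sched_clock);

	/* steal_clock is updated directly on its static-call key. */
	static_call_update(pv_steal_clock, example_guest_steal_clock);
}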
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index de87087d3bde..d9d6b0203ec4 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -3,7 +3,6 @@
#define _ASM_X86_PARAVIRT_TYPES_H
/* Bitmask of what can be clobbered: usually at least eax. */
-#define CLBR_NONE 0
#define CLBR_EAX (1 << 0)
#define CLBR_ECX (1 << 1)
#define CLBR_EDX (1 << 2)
@@ -15,7 +14,6 @@
#define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX)
#define CLBR_RET_REG (CLBR_EAX | CLBR_EDX)
-#define CLBR_SCRATCH (0)
#else
#define CLBR_RAX CLBR_EAX
#define CLBR_RCX CLBR_ECX
@@ -32,12 +30,9 @@
#define CLBR_ARG_REGS (CLBR_RDI | CLBR_RSI | CLBR_RDX | \
CLBR_RCX | CLBR_R8 | CLBR_R9)
#define CLBR_RET_REG (CLBR_RAX)
-#define CLBR_SCRATCH (CLBR_R10 | CLBR_R11)
#endif /* X86_64 */
-#define CLBR_CALLEE_SAVE ((CLBR_ARG_REGS | CLBR_SCRATCH) & ~CLBR_RET_REG)
-
#ifndef __ASSEMBLY__
#include <asm/desc_defs.h>
@@ -73,19 +68,6 @@ struct pv_info {
const char *name;
};
-struct pv_init_ops {
- /*
- * Patch may replace one of the defined code sequences with
- * arbitrary code, subject to the same register constraints.
- * This generally means the code is not free to clobber any
- * registers other than EAX. The patch function should return
- * the number of bytes of code generated, as we nop pad the
- * rest in generic code.
- */
- unsigned (*patch)(u8 type, void *insn_buff,
- unsigned long addr, unsigned len);
-} __no_randomize_layout;
-
#ifdef CONFIG_PARAVIRT_XXL
struct pv_lazy_ops {
/* Set deferred update mode, used for batching operations. */
@@ -95,11 +77,6 @@ struct pv_lazy_ops {
} __no_randomize_layout;
#endif
-struct pv_time_ops {
- unsigned long long (*sched_clock)(void);
- unsigned long long (*steal_clock)(int cpu);
-} __no_randomize_layout;
-
struct pv_cpu_ops {
/* hooks for various privileged instructions */
void (*io_delay)(void);
@@ -156,10 +133,6 @@ struct pv_cpu_ops {
u64 (*read_pmc)(int counter);
- /* Normal iret. Jump to this with the standard iret stack
- frame set up. */
- void (*iret)(void);
-
void (*start_context_switch)(struct task_struct *prev);
void (*end_context_switch)(struct task_struct *next);
#endif
@@ -188,8 +161,8 @@ struct pv_mmu_ops {
void (*flush_tlb_user)(void);
void (*flush_tlb_kernel)(void);
void (*flush_tlb_one_user)(unsigned long addr);
- void (*flush_tlb_others)(const struct cpumask *cpus,
- const struct flush_tlb_info *info);
+ void (*flush_tlb_multi)(const struct cpumask *cpus,
+ const struct flush_tlb_info *info);
void (*tlb_remove_table)(struct mmu_gather *tlb, void *table);
@@ -290,8 +263,6 @@ struct pv_lock_ops {
* number for each function using the offset which we use to indicate
* what to patch. */
struct paravirt_patch_template {
- struct pv_init_ops init;
- struct pv_time_ops time;
struct pv_cpu_ops cpu;
struct pv_irq_ops irq;
struct pv_mmu_ops mmu;
@@ -300,6 +271,7 @@ struct paravirt_patch_template {
extern struct pv_info pv_info;
extern struct paravirt_patch_template pv_ops;
+extern void (*paravirt_iret)(void);
#define PARAVIRT_PATCH(x) \
(offsetof(struct paravirt_patch_template, x) / sizeof(void *))
@@ -331,11 +303,7 @@ extern struct paravirt_patch_template pv_ops;
/* Simple instruction patching code. */
#define NATIVE_LABEL(a,x,b) "\n\t.globl " a #x "_" #b "\n" a #x "_" #b ":\n\t"
-unsigned paravirt_patch_ident_64(void *insn_buff, unsigned len);
-unsigned paravirt_patch_default(u8 type, void *insn_buff, unsigned long addr, unsigned len);
-unsigned paravirt_patch_insns(void *insn_buff, unsigned len, const char *start, const char *end);
-
-unsigned native_patch(u8 type, void *insn_buff, unsigned long addr, unsigned len);
+unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr, unsigned int len);
int paravirt_disable_iospace(void);
@@ -371,7 +339,7 @@ int paravirt_disable_iospace(void);
* on the stack. All caller-save registers (eax,edx,ecx) are expected
* to be modified (either clobbered or used for return values).
* X86_64, on the other hand, already specifies a register-based calling
- * conventions, returning at %rax, with parameteres going on %rdi, %rsi,
+ * conventions, returning at %rax, with parameters going on %rdi, %rsi,
* %rdx, and %rcx. Note that for this reason, x86_64 does not need any
* special handling for dealing with 4 arguments, unlike i386.
* However, x86_64 also have to clobber all caller saved registers, which
@@ -414,11 +382,9 @@ int paravirt_disable_iospace(void);
* makes sure the incoming and outgoing types are always correct.
*/
#ifdef CONFIG_X86_32
-#define PVOP_VCALL_ARGS \
+#define PVOP_CALL_ARGS \
unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx;
-#define PVOP_CALL_ARGS PVOP_VCALL_ARGS
-
#define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x))
#define PVOP_CALL_ARG2(x) "d" ((unsigned long)(x))
#define PVOP_CALL_ARG3(x) "c" ((unsigned long)(x))
@@ -434,12 +400,10 @@ int paravirt_disable_iospace(void);
#define VEXTRA_CLOBBERS
#else /* CONFIG_X86_64 */
/* [re]ax isn't an arg, but the return val */
-#define PVOP_VCALL_ARGS \
+#define PVOP_CALL_ARGS \
unsigned long __edi = __edi, __esi = __esi, \
__edx = __edx, __ecx = __ecx, __eax = __eax;
-#define PVOP_CALL_ARGS PVOP_VCALL_ARGS
-
#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x))
#define PVOP_CALL_ARG2(x) "S" ((unsigned long)(x))
#define PVOP_CALL_ARG3(x) "d" ((unsigned long)(x))
@@ -464,152 +428,138 @@ int paravirt_disable_iospace(void);
#define PVOP_TEST_NULL(op) ((void)pv_ops.op)
#endif
-#define PVOP_RETMASK(rettype) \
+#define PVOP_RETVAL(rettype) \
({ unsigned long __mask = ~0UL; \
+ BUILD_BUG_ON(sizeof(rettype) > sizeof(unsigned long)); \
switch (sizeof(rettype)) { \
case 1: __mask = 0xffUL; break; \
case 2: __mask = 0xffffUL; break; \
case 4: __mask = 0xffffffffUL; break; \
default: break; \
} \
- __mask; \
+ __mask & __eax; \
})
-#define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr, \
- pre, post, ...) \
+#define ____PVOP_CALL(ret, op, clbr, call_clbr, extra_clbr, ...) \
({ \
- rettype __ret; \
PVOP_CALL_ARGS; \
PVOP_TEST_NULL(op); \
- /* This is 32-bit specific, but is okay in 64-bit */ \
- /* since this condition will never hold */ \
- if (sizeof(rettype) > sizeof(unsigned long)) { \
- asm volatile(pre \
- paravirt_alt(PARAVIRT_CALL) \
- post \
- : call_clbr, ASM_CALL_CONSTRAINT \
- : paravirt_type(op), \
- paravirt_clobber(clbr), \
- ##__VA_ARGS__ \
- : "memory", "cc" extra_clbr); \
- __ret = (rettype)((((u64)__edx) << 32) | __eax); \
- } else { \
- asm volatile(pre \
- paravirt_alt(PARAVIRT_CALL) \
- post \
- : call_clbr, ASM_CALL_CONSTRAINT \
- : paravirt_type(op), \
- paravirt_clobber(clbr), \
- ##__VA_ARGS__ \
- : "memory", "cc" extra_clbr); \
- __ret = (rettype)(__eax & PVOP_RETMASK(rettype)); \
- } \
- __ret; \
+ asm volatile(paravirt_alt(PARAVIRT_CALL) \
+ : call_clbr, ASM_CALL_CONSTRAINT \
+ : paravirt_type(op), \
+ paravirt_clobber(clbr), \
+ ##__VA_ARGS__ \
+ : "memory", "cc" extra_clbr); \
+ ret; \
})
-#define __PVOP_CALL(rettype, op, pre, post, ...) \
- ____PVOP_CALL(rettype, op, CLBR_ANY, PVOP_CALL_CLOBBERS, \
- EXTRA_CLOBBERS, pre, post, ##__VA_ARGS__)
-
-#define __PVOP_CALLEESAVE(rettype, op, pre, post, ...) \
- ____PVOP_CALL(rettype, op.func, CLBR_RET_REG, \
- PVOP_CALLEE_CLOBBERS, , \
- pre, post, ##__VA_ARGS__)
-
-
-#define ____PVOP_VCALL(op, clbr, call_clbr, extra_clbr, pre, post, ...) \
+#define ____PVOP_ALT_CALL(ret, op, alt, cond, clbr, call_clbr, \
+ extra_clbr, ...) \
({ \
- PVOP_VCALL_ARGS; \
+ PVOP_CALL_ARGS; \
PVOP_TEST_NULL(op); \
- asm volatile(pre \
- paravirt_alt(PARAVIRT_CALL) \
- post \
+ asm volatile(ALTERNATIVE(paravirt_alt(PARAVIRT_CALL), \
+ alt, cond) \
: call_clbr, ASM_CALL_CONSTRAINT \
: paravirt_type(op), \
paravirt_clobber(clbr), \
##__VA_ARGS__ \
: "memory", "cc" extra_clbr); \
+ ret; \
})
-#define __PVOP_VCALL(op, pre, post, ...) \
- ____PVOP_VCALL(op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \
- VEXTRA_CLOBBERS, \
- pre, post, ##__VA_ARGS__)
+#define __PVOP_CALL(rettype, op, ...) \
+ ____PVOP_CALL(PVOP_RETVAL(rettype), op, CLBR_ANY, \
+ PVOP_CALL_CLOBBERS, EXTRA_CLOBBERS, ##__VA_ARGS__)
+
+#define __PVOP_ALT_CALL(rettype, op, alt, cond, ...) \
+ ____PVOP_ALT_CALL(PVOP_RETVAL(rettype), op, alt, cond, CLBR_ANY,\
+ PVOP_CALL_CLOBBERS, EXTRA_CLOBBERS, \
+ ##__VA_ARGS__)
+
+#define __PVOP_CALLEESAVE(rettype, op, ...) \
+ ____PVOP_CALL(PVOP_RETVAL(rettype), op.func, CLBR_RET_REG, \
+ PVOP_CALLEE_CLOBBERS, , ##__VA_ARGS__)
+
+#define __PVOP_ALT_CALLEESAVE(rettype, op, alt, cond, ...) \
+ ____PVOP_ALT_CALL(PVOP_RETVAL(rettype), op.func, alt, cond, \
+ CLBR_RET_REG, PVOP_CALLEE_CLOBBERS, , ##__VA_ARGS__)
+
+
+#define __PVOP_VCALL(op, ...) \
+ (void)____PVOP_CALL(, op, CLBR_ANY, PVOP_VCALL_CLOBBERS, \
+ VEXTRA_CLOBBERS, ##__VA_ARGS__)
+
+#define __PVOP_ALT_VCALL(op, alt, cond, ...) \
+ (void)____PVOP_ALT_CALL(, op, alt, cond, CLBR_ANY, \
+ PVOP_VCALL_CLOBBERS, VEXTRA_CLOBBERS, \
+ ##__VA_ARGS__)
-#define __PVOP_VCALLEESAVE(op, pre, post, ...) \
- ____PVOP_VCALL(op.func, CLBR_RET_REG, \
- PVOP_VCALLEE_CLOBBERS, , \
- pre, post, ##__VA_ARGS__)
+#define __PVOP_VCALLEESAVE(op, ...) \
+ (void)____PVOP_CALL(, op.func, CLBR_RET_REG, \
+ PVOP_VCALLEE_CLOBBERS, , ##__VA_ARGS__)
+#define __PVOP_ALT_VCALLEESAVE(op, alt, cond, ...) \
+ (void)____PVOP_ALT_CALL(, op.func, alt, cond, CLBR_RET_REG, \
+ PVOP_VCALLEE_CLOBBERS, , ##__VA_ARGS__)
#define PVOP_CALL0(rettype, op) \
- __PVOP_CALL(rettype, op, "", "")
+ __PVOP_CALL(rettype, op)
#define PVOP_VCALL0(op) \
- __PVOP_VCALL(op, "", "")
+ __PVOP_VCALL(op)
+#define PVOP_ALT_CALL0(rettype, op, alt, cond) \
+ __PVOP_ALT_CALL(rettype, op, alt, cond)
+#define PVOP_ALT_VCALL0(op, alt, cond) \
+ __PVOP_ALT_VCALL(op, alt, cond)
#define PVOP_CALLEE0(rettype, op) \
- __PVOP_CALLEESAVE(rettype, op, "", "")
+ __PVOP_CALLEESAVE(rettype, op)
#define PVOP_VCALLEE0(op) \
- __PVOP_VCALLEESAVE(op, "", "")
+ __PVOP_VCALLEESAVE(op)
+#define PVOP_ALT_CALLEE0(rettype, op, alt, cond) \
+ __PVOP_ALT_CALLEESAVE(rettype, op, alt, cond)
+#define PVOP_ALT_VCALLEE0(op, alt, cond) \
+ __PVOP_ALT_VCALLEESAVE(op, alt, cond)
#define PVOP_CALL1(rettype, op, arg1) \
- __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1))
+ __PVOP_CALL(rettype, op, PVOP_CALL_ARG1(arg1))
#define PVOP_VCALL1(op, arg1) \
- __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1))
+ __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1))
+#define PVOP_ALT_VCALL1(op, arg1, alt, cond) \
+ __PVOP_ALT_VCALL(op, alt, cond, PVOP_CALL_ARG1(arg1))
#define PVOP_CALLEE1(rettype, op, arg1) \
- __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1))
+ __PVOP_CALLEESAVE(rettype, op, PVOP_CALL_ARG1(arg1))
#define PVOP_VCALLEE1(op, arg1) \
- __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1))
+ __PVOP_VCALLEESAVE(op, PVOP_CALL_ARG1(arg1))
+#define PVOP_ALT_CALLEE1(rettype, op, arg1, alt, cond) \
+ __PVOP_ALT_CALLEESAVE(rettype, op, alt, cond, PVOP_CALL_ARG1(arg1))
+#define PVOP_ALT_VCALLEE1(op, arg1, alt, cond) \
+ __PVOP_ALT_VCALLEESAVE(op, alt, cond, PVOP_CALL_ARG1(arg1))
#define PVOP_CALL2(rettype, op, arg1, arg2) \
- __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \
- PVOP_CALL_ARG2(arg2))
+ __PVOP_CALL(rettype, op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2))
#define PVOP_VCALL2(op, arg1, arg2) \
- __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \
- PVOP_CALL_ARG2(arg2))
-
-#define PVOP_CALLEE2(rettype, op, arg1, arg2) \
- __PVOP_CALLEESAVE(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \
- PVOP_CALL_ARG2(arg2))
-#define PVOP_VCALLEE2(op, arg1, arg2) \
- __PVOP_VCALLEESAVE(op, "", "", PVOP_CALL_ARG1(arg1), \
- PVOP_CALL_ARG2(arg2))
-
+ __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2))
#define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \
- __PVOP_CALL(rettype, op, "", "", PVOP_CALL_ARG1(arg1), \
+ __PVOP_CALL(rettype, op, PVOP_CALL_ARG1(arg1), \
PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3))
#define PVOP_VCALL3(op, arg1, arg2, arg3) \
- __PVOP_VCALL(op, "", "", PVOP_CALL_ARG1(arg1), \
+ __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), \
PVOP_CALL_ARG2(arg2), PVOP_CALL_ARG3(arg3))
-/* This is the only difference in x86_64. We can make it much simpler */
-#ifdef CONFIG_X86_32
#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \
__PVOP_CALL(rettype, op, \
- "push %[_arg4];", "lea 4(%%esp),%%esp;", \
- PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
- PVOP_CALL_ARG3(arg3), [_arg4] "mr" ((u32)(arg4)))
-#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \
- __PVOP_VCALL(op, \
- "push %[_arg4];", "lea 4(%%esp),%%esp;", \
- "0" ((u32)(arg1)), "1" ((u32)(arg2)), \
- "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))
-#else
-#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \
- __PVOP_CALL(rettype, op, "", "", \
PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \
- __PVOP_VCALL(op, "", "", \
- PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
+ __PVOP_VCALL(op, PVOP_CALL_ARG1(arg1), PVOP_CALL_ARG2(arg2), \
PVOP_CALL_ARG3(arg3), PVOP_CALL_ARG4(arg4))
-#endif
/* Lazy mode for batching updates / context switch */
enum paravirt_lazy_mode {
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index a02c67291cfc..b1099f2d9800 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1244,7 +1244,7 @@ static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp)
/*
* clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
*
- * dst - pointer to pgd range anwhere on a pgd page
+ * dst - pointer to pgd range anywhere on a pgd page
* src - ""
* count - the number of pgds to copy.
*
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index f1b9ed5efaa9..154321d29050 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -314,11 +314,6 @@ struct x86_hw_tss {
struct x86_hw_tss {
u32 reserved1;
u64 sp0;
-
- /*
- * We store cpu_current_top_of_stack in sp1 so it's always accessible.
- * Linux does not use ring 1, so sp1 is not otherwise needed.
- */
u64 sp1;
/*
@@ -426,12 +421,7 @@ struct irq_stack {
char stack[IRQ_STACK_SIZE];
} __aligned(IRQ_STACK_SIZE);
-#ifdef CONFIG_X86_32
DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
-#else
-/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
-#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
-#endif
#ifdef CONFIG_X86_64
struct fixed_percpu_data {
@@ -439,6 +429,9 @@ struct fixed_percpu_data {
* GCC hardcodes the stack canary as %gs:40. Since the
* irq_stack is the object at %gs:0, we reserve the bottom
* 48 bytes of the irq stack for the canary.
+ *
+ * Once we are willing to require -mstack-protector-guard-symbol=
+ * support for x86_64 stackprotector, we can get rid of this.
*/
char gs_base[40];
unsigned long stack_canary;
@@ -460,17 +453,7 @@ extern asmlinkage void ignore_sysret(void);
void current_save_fsgs(void);
#else /* X86_64 */
#ifdef CONFIG_STACKPROTECTOR
-/*
- * Make sure stack canary segment base is cached-aligned:
- * "For Intel Atom processors, avoid non zero segment base address
- * that is not aligned to cache line boundary at all cost."
- * (Optim Ref Manual Assembly/Compiler Coding Rule 15.)
- */
-struct stack_canary {
- char __pad[20]; /* canary at %gs:20 */
- unsigned long canary;
-};
-DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
+DECLARE_PER_CPU(unsigned long, __stack_chk_guard);
#endif
DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr);
@@ -527,7 +510,7 @@ struct thread_struct {
struct io_bitmap *io_bitmap;
/*
- * IOPL. Priviledge level dependent I/O permission which is
+ * IOPL. Privilege level dependent I/O permission which is
* emulated via the I/O bitmap to prevent user space from disabling
* interrupts.
*/
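The 32-bit stack protector moves from a dedicated GDT segment plus struct stack_canary to an ordinary per-CPU __stack_chk_guard variable (the matching GDT entry is removed in the segment.h hunk below). A sketch of how the canary would then be refreshed on context switch, assuming CONFIG_STACKPROTECTOR; illustrative only:

#include <linux/percpu.h>
#include <linux/sched.h>

DEFINE_PER_CPU(unsigned long, __stack_chk_guard);

static void example_switch_canary(struct task_struct *next)
{
	/* Compiler-emitted __stack_chk_fail() checks compare against this value. */
	this_cpu_write(__stack_chk_guard, next->stack_canary);
}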
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index b6a9d51d1d79..8c5d1910a848 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -4,6 +4,8 @@
#include <asm/ldt.h>
+struct task_struct;
+
/* misc architecture specific prototypes */
void syscall_init(void);
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 409f661481e1..b94f615600d5 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -37,7 +37,10 @@ struct pt_regs {
unsigned short __esh;
unsigned short fs;
unsigned short __fsh;
- /* On interrupt, gs and __gsh store the vector number. */
+ /*
+ * On interrupt, gs and __gsh store the vector number. They never
+ * store gs any more.
+ */
unsigned short gs;
unsigned short __gsh;
/* On interrupt, this is the error code. */
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 7fdd4facfce7..72044026eb3c 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -95,7 +95,7 @@
*
* 26 - ESPFIX small SS
* 27 - per-cpu [ offset to per-cpu data area ]
- * 28 - stack_canary-20 [ for stack protector ] <=== cacheline #8
+ * 28 - unused
* 29 - unused
* 30 - unused
* 31 - TSS for double fault handler
@@ -118,7 +118,6 @@
#define GDT_ENTRY_ESPFIX_SS 26
#define GDT_ENTRY_PERCPU 27
-#define GDT_ENTRY_STACK_CANARY 28
#define GDT_ENTRY_DOUBLEFAULT_TSS 31
@@ -158,12 +157,6 @@
# define __KERNEL_PERCPU 0
#endif
-#ifdef CONFIG_STACKPROTECTOR
-# define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY*8)
-#else
-# define __KERNEL_STACK_CANARY 0
-#endif
-
#else /* 64-bit: */
#include <asm/cache.h>
@@ -364,22 +357,15 @@ static inline void __loadsegment_fs(unsigned short value)
asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
/*
- * x86-32 user GS accessors:
+ * x86-32 user GS accessors. This is ugly and could do with some cleaning up.
*/
#ifdef CONFIG_X86_32
-# ifdef CONFIG_X86_32_LAZY_GS
-# define get_user_gs(regs) (u16)({ unsigned long v; savesegment(gs, v); v; })
-# define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v))
-# define task_user_gs(tsk) ((tsk)->thread.gs)
-# define lazy_save_gs(v) savesegment(gs, (v))
-# define lazy_load_gs(v) loadsegment(gs, (v))
-# else /* X86_32_LAZY_GS */
-# define get_user_gs(regs) (u16)((regs)->gs)
-# define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0)
-# define task_user_gs(tsk) (task_pt_regs(tsk)->gs)
-# define lazy_save_gs(v) do { } while (0)
-# define lazy_load_gs(v) do { } while (0)
-# endif /* X86_32_LAZY_GS */
+# define get_user_gs(regs) (u16)({ unsigned long v; savesegment(gs, v); v; })
+# define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v))
+# define task_user_gs(tsk) ((tsk)->thread.gs)
+# define lazy_save_gs(v) savesegment(gs, (v))
+# define lazy_load_gs(v) loadsegment(gs, (v))
+# define load_gs_index(v) loadsegment(gs, (v))
#endif /* X86_32 */
#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index 4352f08bfbb5..43fa081a1adb 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -8,8 +8,8 @@
/*
* The set_memory_* API can be used to change various attributes of a virtual
* address range. The attributes include:
- * Cachability : UnCached, WriteCombining, WriteThrough, WriteBack
- * Executability : eXeutable, NoteXecutable
+ * Cacheability : UnCached, WriteCombining, WriteThrough, WriteBack
+ * Executability : eXecutable, NoteXecutable
* Read/Write : ReadOnly, ReadWrite
* Presence : NotPresent
* Encryption : Encrypted, Decrypted
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 389d851a02c4..a12458a7a8d4 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -130,11 +130,6 @@ void *extend_brk(size_t size, size_t align);
: : "i" (sz)); \
}
-/* Helper for reserving space for arrays of things */
-#define RESERVE_BRK_ARRAY(type, name, entries) \
- type *name; \
- RESERVE_BRK(name, sizeof(type) * entries)
-
extern void probe_roms(void);
#ifdef __i386__
diff --git a/arch/x86/kernel/cpu/sgx/arch.h b/arch/x86/include/asm/sgx.h
index dd7602c44c72..9c31e0ebc55b 100644
--- a/arch/x86/kernel/cpu/sgx/arch.h
+++ b/arch/x86/include/asm/sgx.h
@@ -2,15 +2,20 @@
/**
* Copyright(c) 2016-20 Intel Corporation.
*
- * Contains data structures defined by the SGX architecture. Data structures
- * defined by the Linux software stack should not be placed here.
+ * Intel Software Guard Extensions (SGX) support.
*/
-#ifndef _ASM_X86_SGX_ARCH_H
-#define _ASM_X86_SGX_ARCH_H
+#ifndef _ASM_X86_SGX_H
+#define _ASM_X86_SGX_H
#include <linux/bits.h>
#include <linux/types.h>
+/*
+ * This file contains both data structures defined by the SGX architecture and
+ * Linux-defined software data structures and functions. The two should not be
+ * mixed together, for better readability. The architectural definitions come first.
+ */
+
/* The SGX specific CPUID function. */
#define SGX_CPUID 0x12
/* EPC enumeration. */
@@ -22,16 +27,36 @@
/* The bitmask for the EPC section type. */
#define SGX_CPUID_EPC_MASK GENMASK(3, 0)
+enum sgx_encls_function {
+ ECREATE = 0x00,
+ EADD = 0x01,
+ EINIT = 0x02,
+ EREMOVE = 0x03,
+ EDGBRD = 0x04,
+ EDGBWR = 0x05,
+ EEXTEND = 0x06,
+ ELDU = 0x08,
+ EBLOCK = 0x09,
+ EPA = 0x0A,
+ EWB = 0x0B,
+ ETRACK = 0x0C,
+ EAUG = 0x0D,
+ EMODPR = 0x0E,
+ EMODT = 0x0F,
+};
+
/**
* enum sgx_return_code - The return code type for ENCLS, ENCLU and ENCLV
* %SGX_NOT_TRACKED: Previous ETRACK's shootdown sequence has not
* been completed yet.
+ * %SGX_CHILD_PRESENT: SECS has child pages present in the EPC.
* %SGX_INVALID_EINITTOKEN: EINITTOKEN is invalid and enclave signer's
* public key does not match IA32_SGXLEPUBKEYHASH.
* %SGX_UNMASKED_EVENT: An unmasked event, e.g. INTR, was received
*/
enum sgx_return_code {
SGX_NOT_TRACKED = 11,
+ SGX_CHILD_PRESENT = 13,
SGX_INVALID_EINITTOKEN = 16,
SGX_UNMASKED_EVENT = 128,
};
@@ -271,7 +296,7 @@ struct sgx_pcmd {
* @header1: constant byte string
* @vendor: must be either 0x0000 or 0x8086
* @date: YYYYMMDD in BCD
- * @header2: costant byte string
+ * @header2: constant byte string
* @swdefined: software defined value
*/
struct sgx_sigstruct_header {
@@ -335,4 +360,19 @@ struct sgx_sigstruct {
#define SGX_LAUNCH_TOKEN_SIZE 304
-#endif /* _ASM_X86_SGX_ARCH_H */
+/*
+ * Do not put any hardware-defined SGX structure representations below this
+ * comment!
+ */
+
+#ifdef CONFIG_X86_SGX_KVM
+int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
+ int *trapnr);
+int sgx_virt_einit(void __user *sigstruct, void __user *token,
+ void __user *secs, u64 *lepubkeyhash, int *trapnr);
+#endif
+
+int sgx_set_attribute(unsigned long *allowed_attributes,
+ unsigned int attribute_fd);
+
+#endif /* _ASM_X86_SGX_H */
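
As background for the enum sgx_encls_function values added above: the leaf number is placed in EAX and the architecturally defined operands in RBX/RCX before executing ENCLS. The sketch below is an illustration only; ENCLS is a ring-0 instruction, __encls_2() is a made-up name, and the kernel's real helpers additionally handle faults:

	static inline int __encls_2(enum sgx_encls_function leaf, void *rbx,
				    void *rcx)
	{
		int ret;

		/* ENCLS is encoded as 0f 01 cf; the leaf is selected via EAX */
		asm volatile(".byte 0x0f, 0x01, 0xcf"
			     : "=a" (ret)
			     : "a" (leaf), "b" (rbx), "c" (rcx)
			     : "memory", "cc");
		return ret;
	}

	/* e.g. EREMOVE takes only the EPC page address, in RCX:	*/
	/*	ret = __encls_2(EREMOVE, NULL, epc_page_va);		*/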
diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h
index 0bc9b0895f33..d17b39893b79 100644
--- a/arch/x86/include/asm/smap.h
+++ b/arch/x86/include/asm/smap.h
@@ -11,6 +11,7 @@
#include <asm/nops.h>
#include <asm/cpufeatures.h>
+#include <asm/alternative.h>
/* "Raw" instruction opcodes */
#define __ASM_CLAC ".byte 0x0f,0x01,0xca"
@@ -18,8 +19,6 @@
#ifdef __ASSEMBLY__
-#include <asm/alternative-asm.h>
-
#ifdef CONFIG_X86_SMAP
#define ASM_CLAC \
@@ -37,8 +36,6 @@
#else /* __ASSEMBLY__ */
-#include <asm/alternative.h>
-
#ifdef CONFIG_X86_SMAP
static __always_inline void clac(void)
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 1d3cbaef4bb7..2acd6cb62328 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -214,7 +214,7 @@ static inline void clflush(volatile void *__p)
static inline void clflushopt(volatile void *__p)
{
- alternative_io(".byte " __stringify(NOP_DS_PREFIX) "; clflush %P0",
+ alternative_io(".byte 0x3e; clflush %P0",
".byte 0x66; clflush %P0",
X86_FEATURE_CLFLUSHOPT,
"+m" (*(volatile char __force *)__p));
@@ -225,7 +225,7 @@ static inline void clwb(volatile void *__p)
volatile struct { char x[64]; } *p = __p;
asm volatile(ALTERNATIVE_2(
- ".byte " __stringify(NOP_DS_PREFIX) "; clflush (%[pax])",
+ ".byte 0x3e; clflush (%[pax])",
".byte 0x66; clflush (%[pax])", /* clflushopt (%%rax) */
X86_FEATURE_CLFLUSHOPT,
".byte 0x66, 0x0f, 0xae, 0x30", /* clwb (%%rax) */
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index 7fb482f0f25b..b6ffe58c70fa 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -5,30 +5,23 @@
* Stack protector works by putting predefined pattern at the start of
* the stack frame and verifying that it hasn't been overwritten when
* returning from the function. The pattern is called stack canary
- * and unfortunately gcc requires it to be at a fixed offset from %gs.
- * On x86_64, the offset is 40 bytes and on x86_32 20 bytes. x86_64
- * and x86_32 use segment registers differently and thus handles this
- * requirement differently.
+ * and unfortunately gcc historically required it to be at a fixed offset
+ * from the percpu segment base. On x86_64, the offset is 40 bytes.
*
- * On x86_64, %gs is shared by percpu area and stack canary. All
- * percpu symbols are zero based and %gs points to the base of percpu
- * area. The first occupant of the percpu area is always
- * fixed_percpu_data which contains stack_canary at offset 40. Userland
- * %gs is always saved and restored on kernel entry and exit using
- * swapgs, so stack protector doesn't add any complexity there.
+ * The same segment is shared by percpu area and stack canary. On
+ * x86_64, percpu symbols are zero based and %gs (64-bit) points to the
+ * base of percpu area. The first occupant of the percpu area is always
+ * fixed_percpu_data which contains stack_canary at the appropriate
+ * offset. On x86_32, the stack canary is just a regular percpu
+ * variable.
*
- * On x86_32, it's slightly more complicated. As in x86_64, %gs is
- * used for userland TLS. Unfortunately, some processors are much
- * slower at loading segment registers with different value when
- * entering and leaving the kernel, so the kernel uses %fs for percpu
- * area and manages %gs lazily so that %gs is switched only when
- * necessary, usually during task switch.
+ * Putting percpu data in %fs on 32-bit is a minor optimization compared to
+ * using %gs. Since 32-bit userspace normally has %fs == 0, we are likely
+ * to load 0 into %fs on exit to usermode, whereas with percpu data in
+ * %gs, we are likely to load a non-null %gs on return to user mode.
*
- * As gcc requires the stack canary at %gs:20, %gs can't be managed
- * lazily if stack protector is enabled, so the kernel saves and
- * restores userland %gs on kernel entry and exit. This behavior is
- * controlled by CONFIG_X86_32_LAZY_GS and accessors are defined in
- * system.h to hide the details.
+ * Once we are willing to require GCC 8.1 or better for 64-bit stackprotector
+ * support, we can remove some of this complexity.
*/
#ifndef _ASM_STACKPROTECTOR_H
@@ -45,14 +38,6 @@
#include <linux/sched.h>
/*
- * 24 byte read-only segment initializer for stack canary. Linker
- * can't handle the address bit shifting. Address will be set in
- * head_32 for boot CPU and setup_per_cpu_areas() for others.
- */
-#define GDT_STACK_CANARY_INIT \
- [GDT_ENTRY_STACK_CANARY] = GDT_ENTRY_INIT(0x4090, 0, 0x18),
-
-/*
* Initialize the stackprotector canary value.
*
* NOTE: this must only be called from functions that never return
@@ -86,7 +71,7 @@ static __always_inline void boot_init_stack_canary(void)
#ifdef CONFIG_X86_64
this_cpu_write(fixed_percpu_data.stack_canary, canary);
#else
- this_cpu_write(stack_canary.canary, canary);
+ this_cpu_write(__stack_chk_guard, canary);
#endif
}
@@ -95,48 +80,16 @@ static inline void cpu_init_stack_canary(int cpu, struct task_struct *idle)
#ifdef CONFIG_X86_64
per_cpu(fixed_percpu_data.stack_canary, cpu) = idle->stack_canary;
#else
- per_cpu(stack_canary.canary, cpu) = idle->stack_canary;
-#endif
-}
-
-static inline void setup_stack_canary_segment(int cpu)
-{
-#ifdef CONFIG_X86_32
- unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu);
- struct desc_struct *gdt_table = get_cpu_gdt_rw(cpu);
- struct desc_struct desc;
-
- desc = gdt_table[GDT_ENTRY_STACK_CANARY];
- set_desc_base(&desc, canary);
- write_gdt_entry(gdt_table, GDT_ENTRY_STACK_CANARY, &desc, DESCTYPE_S);
-#endif
-}
-
-static inline void load_stack_canary_segment(void)
-{
-#ifdef CONFIG_X86_32
- asm("mov %0, %%gs" : : "r" (__KERNEL_STACK_CANARY) : "memory");
+ per_cpu(__stack_chk_guard, cpu) = idle->stack_canary;
#endif
}
#else /* STACKPROTECTOR */
-#define GDT_STACK_CANARY_INIT
-
/* dummy boot_init_stack_canary() is defined in linux/stackprotector.h */
-static inline void setup_stack_canary_segment(int cpu)
-{ }
-
static inline void cpu_init_stack_canary(int cpu, struct task_struct *idle)
{ }
-static inline void load_stack_canary_segment(void)
-{
-#ifdef CONFIG_X86_32
- asm volatile ("mov %0, %%gs" : : "r" (0));
-#endif
-}
-
#endif /* STACKPROTECTOR */
#endif /* _ASM_STACKPROTECTOR_H */
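
To make the effect of moving to a plain __stack_chk_guard concrete, here is a conceptual, hand-written approximation of the canary check the compiler emits when the guard is an ordinary symbol rather than a fixed %gs/%fs offset (example() is hypothetical, and real compiler output keeps the copy in a stack slot it manages itself):

	extern unsigned long __stack_chk_guard;	/* randomized at boot */
	extern void __stack_chk_fail(void);	/* called on mismatch, never returns */

	void example(void)
	{
		unsigned long canary = __stack_chk_guard;	/* prologue */
		char buf[64];

		/* ... code that could overflow buf ... */

		if (canary != __stack_chk_guard)		/* epilogue */
			__stack_chk_fail();
	}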
diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h
index fdbd9d7b7bca..7b132d0312eb 100644
--- a/arch/x86/include/asm/suspend_32.h
+++ b/arch/x86/include/asm/suspend_32.h
@@ -13,12 +13,10 @@
/* image of the saved processor state */
struct saved_context {
/*
- * On x86_32, all segment registers, with the possible exception of
- * gs, are saved at kernel entry in pt_regs.
+ * On x86_32, all segment registers except gs are saved at kernel
+ * entry in pt_regs.
*/
-#ifdef CONFIG_X86_32_LAZY_GS
u16 gs;
-#endif
unsigned long cr0, cr2, cr3, cr4;
u64 misc_enable;
bool misc_enable_saved;
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 9f69cc497f4b..b5f0d2ff47e4 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -71,12 +71,7 @@ static inline void update_task_stack(struct task_struct *task)
else
this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
#else
- /*
- * x86-64 updates x86_tss.sp1 via cpu_current_top_of_stack. That
- * doesn't work on x86-32 because sp1 and
- * cpu_current_top_of_stack have different values (because of
- * the non-zero stack-padding on 32bit).
- */
+ /* Xen PV enters the kernel on the thread stack. */
if (static_cpu_has(X86_FEATURE_XENPV))
load_sp0(task_top_of_stack(task));
#endif
diff --git a/arch/x86/include/asm/syscall_wrapper.h b/arch/x86/include/asm/syscall_wrapper.h
index a84333adeef2..80c08c7d5e72 100644
--- a/arch/x86/include/asm/syscall_wrapper.h
+++ b/arch/x86/include/asm/syscall_wrapper.h
@@ -80,6 +80,7 @@ extern long __ia32_sys_ni_syscall(const struct pt_regs *regs);
}
#define __COND_SYSCALL(abi, name) \
+ __weak long __##abi##_##name(const struct pt_regs *__unused); \
__weak long __##abi##_##name(const struct pt_regs *__unused) \
{ \
return sys_ni_syscall(); \
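
With the added forward declaration (presumably to satisfy missing-prototype warnings), __COND_SYSCALL(x64, sys_foo), using a hypothetical abi/name pair, now expands to roughly:

	__weak long __x64_sys_foo(const struct pt_regs *__unused);
	__weak long __x64_sys_foo(const struct pt_regs *__unused)
	{
		return sys_ni_syscall();
	}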
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 06b740bae431..de406d93b515 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -197,13 +197,7 @@ static inline int arch_within_stack_frames(const void * const stack,
#endif
}
-#else /* !__ASSEMBLY__ */
-
-#ifdef CONFIG_X86_64
-# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)
-#endif
-
-#endif
+#endif /* !__ASSEMBLY__ */
/*
* Thread-synchronous status.
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 8c87a2e0b660..fa952eadbc2e 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -90,23 +90,6 @@ struct tlb_state {
u16 next_asid;
/*
- * We can be in one of several states:
- *
- * - Actively using an mm. Our CPU's bit will be set in
- * mm_cpumask(loaded_mm) and is_lazy == false;
- *
- * - Not using a real mm. loaded_mm == &init_mm. Our CPU's bit
- * will not be set in mm_cpumask(&init_mm) and is_lazy == false.
- *
- * - Lazily using a real mm. loaded_mm != &init_mm, our bit
- * is set in mm_cpumask(loaded_mm), but is_lazy == true.
- * We're heuristically guessing that the CR3 load we
- * skipped more than makes up for the overhead added by
- * lazy mode.
- */
- bool is_lazy;
-
- /*
* If set we changed the page tables in such a way that we
* needed an invalidation of all contexts (aka. PCIDs / ASIDs).
* This tells us to go invalidate all the non-loaded ctxs[]
@@ -151,7 +134,27 @@ struct tlb_state {
*/
struct tlb_context ctxs[TLB_NR_DYN_ASIDS];
};
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
+DECLARE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate);
+
+struct tlb_state_shared {
+ /*
+ * We can be in one of several states:
+ *
+ * - Actively using an mm. Our CPU's bit will be set in
+ * mm_cpumask(loaded_mm) and is_lazy == false;
+ *
+ * - Not using a real mm. loaded_mm == &init_mm. Our CPU's bit
+ * will not be set in mm_cpumask(&init_mm) and is_lazy == false.
+ *
+ * - Lazily using a real mm. loaded_mm != &init_mm, our bit
+ * is set in mm_cpumask(loaded_mm), but is_lazy == true.
+ * We're heuristically guessing that the CR3 load we
+ * skipped more than makes up for the overhead added by
+ * lazy mode.
+ */
+ bool is_lazy;
+};
+DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared);
bool nmi_uaccess_okay(void);
#define nmi_uaccess_okay nmi_uaccess_okay
@@ -175,7 +178,7 @@ extern void initialize_tlbstate_and_flush(void);
* - flush_tlb_page(vma, vmaddr) flushes one page
* - flush_tlb_range(vma, start, end) flushes a range of pages
* - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
- * - flush_tlb_others(cpumask, info) flushes TLBs on other cpus
+ * - flush_tlb_multi(cpumask, info) flushes TLBs on multiple cpus
*
* ..but the i386 has somewhat limited tlb flushing capabilities,
* and page-granular flushes are available only on i486 and up.
@@ -201,14 +204,15 @@ struct flush_tlb_info {
unsigned long start;
unsigned long end;
u64 new_tlb_gen;
- unsigned int stride_shift;
- bool freed_tables;
+ unsigned int initiating_cpu;
+ u8 stride_shift;
+ u8 freed_tables;
};
void flush_tlb_local(void);
void flush_tlb_one_user(unsigned long addr);
void flush_tlb_one_kernel(unsigned long addr);
-void flush_tlb_others(const struct cpumask *cpumask,
+void flush_tlb_multi(const struct cpumask *cpumask,
const struct flush_tlb_info *info);
#ifdef CONFIG_PARAVIRT
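
A minimal sketch of how the split-out flag is consulted after this change; the same pattern appears later in this diff in alternative.c:

	/* Lazy-TLB state now lives in its own shared-aligned per-CPU struct. */
	if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
		leave_mm(smp_processor_id());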
diff --git a/arch/x86/include/asm/trace/hyperv.h b/arch/x86/include/asm/trace/hyperv.h
index 4d705cb4d63b..a8e5a7a2b460 100644
--- a/arch/x86/include/asm/trace/hyperv.h
+++ b/arch/x86/include/asm/trace/hyperv.h
@@ -8,7 +8,7 @@
#if IS_ENABLED(CONFIG_HYPERV)
-TRACE_EVENT(hyperv_mmu_flush_tlb_others,
+TRACE_EVENT(hyperv_mmu_flush_tlb_multi,
TP_PROTO(const struct cpumask *cpus,
const struct flush_tlb_info *info),
TP_ARGS(cpus, info),
diff --git a/arch/x86/include/asm/uv/uv_geo.h b/arch/x86/include/asm/uv/uv_geo.h
index f241451035fb..027a9258dbca 100644
--- a/arch/x86/include/asm/uv/uv_geo.h
+++ b/arch/x86/include/asm/uv/uv_geo.h
@@ -10,7 +10,7 @@
#ifndef _ASM_UV_GEO_H
#define _ASM_UV_GEO_H
-/* Type declaractions */
+/* Type declarations */
/* Size of a geoid_s structure (must be before decl. of geoid_u) */
#define GEOID_SIZE 8
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 5002f52be332..d3e3197917be 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -353,7 +353,7 @@ union uvh_apicid {
*
* Note there are NO leds on a UV system. This register is only
* used by the system controller to monitor system-wide operation.
- * There are 64 regs per node. With Nahelem cpus (2 cores per node,
+ * There are 64 regs per node. With Nehalem cpus (2 cores per node,
* 8 cpus per core, 2 threads per cpu) there are 32 cpu threads on
* a node.
*
diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h
index df01d7349d79..1936f21ed8cd 100644
--- a/arch/x86/include/asm/vdso/gettimeofday.h
+++ b/arch/x86/include/asm/vdso/gettimeofday.h
@@ -58,7 +58,8 @@ extern struct ms_hyperv_tsc_page hvclock_page
#endif
#ifdef CONFIG_TIME_NS
-static __always_inline const struct vdso_data *__arch_get_timens_vdso_data(void)
+static __always_inline
+const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd)
{
return __timens_vdso_data;
}
diff --git a/arch/x86/include/asm/vmalloc.h b/arch/x86/include/asm/vmalloc.h
index 29837740b520..49ce331f3ac6 100644
--- a/arch/x86/include/asm/vmalloc.h
+++ b/arch/x86/include/asm/vmalloc.h
@@ -1,6 +1,26 @@
#ifndef _ASM_X86_VMALLOC_H
#define _ASM_X86_VMALLOC_H
+#include <asm/cpufeature.h>
+#include <asm/page.h>
#include <asm/pgtable_areas.h>
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+
+#ifdef CONFIG_X86_64
+#define arch_vmap_pud_supported arch_vmap_pud_supported
+static inline bool arch_vmap_pud_supported(pgprot_t prot)
+{
+ return boot_cpu_has(X86_FEATURE_GBPAGES);
+}
+#endif
+
+#define arch_vmap_pmd_supported arch_vmap_pmd_supported
+static inline bool arch_vmap_pmd_supported(pgprot_t prot)
+{
+ return boot_cpu_has(X86_FEATURE_PSE);
+}
+
+#endif
+
#endif /* _ASM_X86_VMALLOC_H */
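
A hedged sketch of how a generic caller might consult the new hooks before attempting a huge mapping; the wrapper below is an assumption for illustration, only the arch_vmap_*_supported() helpers come from the hunk above:

	static bool can_map_pmd_huge(pgprot_t prot)
	{
	#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
		return arch_vmap_pmd_supported(prot);	/* true if PSE is available on x86 */
	#else
		return false;
	#endif
	}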
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index 600a141c8805..b25d3f82c2f3 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -234,7 +234,7 @@ struct boot_params {
* handling of page tables.
*
* These enums should only ever be used by x86 code, and the code that uses
- * it should be well contained and compartamentalized.
+ * it should be well contained and compartmentalized.
*
* KVM and Xen HVM do not have a subarch as these are expected to follow
* standard x86 boot entries. If there is a genuine need for "hypervisor" type
@@ -252,7 +252,7 @@ struct boot_params {
* @X86_SUBARCH_XEN: Used for Xen guest types which follow the PV boot path,
* which start at asm startup_xen() entry point and later jump to the C
* xen_start_kernel() entry point. Both domU and dom0 type of guests are
- * currently supportd through this PV boot path.
+ * currently supported through this PV boot path.
* @X86_SUBARCH_INTEL_MID: Used for Intel MID (Mobile Internet Device) platform
* systems which do not have the PCI legacy interfaces.
* @X86_SUBARCH_CE4100: Used for Intel CE media processor (CE4100) SoC
diff --git a/arch/x86/include/uapi/asm/debugreg.h b/arch/x86/include/uapi/asm/debugreg.h
index d95d080b30e3..0007ba077c0c 100644
--- a/arch/x86/include/uapi/asm/debugreg.h
+++ b/arch/x86/include/uapi/asm/debugreg.h
@@ -24,6 +24,7 @@
#define DR_TRAP3 (0x8) /* db3 */
#define DR_TRAP_BITS (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)
+#define DR_BUS_LOCK (0x800) /* bus_lock */
#define DR_STEP (0x4000) /* single-step */
#define DR_SWITCH (0x8000) /* task switch */
diff --git a/arch/x86/include/uapi/asm/msgbuf.h b/arch/x86/include/uapi/asm/msgbuf.h
index b3d0664fadc9..ac83e25bbf37 100644
--- a/arch/x86/include/uapi/asm/msgbuf.h
+++ b/arch/x86/include/uapi/asm/msgbuf.h
@@ -12,7 +12,7 @@
* The msqid64_ds structure for x86 architecture with x32 ABI.
*
* On x86-32 and x86-64 we can just use the generic definition, but
- * x32 uses the same binary layout as x86_64, which is differnet
+ * x32 uses the same binary layout as x86_64, which is different
* from other 32-bit architectures.
*/
diff --git a/arch/x86/include/uapi/asm/sgx.h b/arch/x86/include/uapi/asm/sgx.h
index 9034f3007c4e..9690d6899ad9 100644
--- a/arch/x86/include/uapi/asm/sgx.h
+++ b/arch/x86/include/uapi/asm/sgx.h
@@ -152,7 +152,7 @@ struct sgx_enclave_run {
* Most exceptions reported on ENCLU, including those that occur within the
* enclave, are fixed up and reported synchronously instead of being delivered
* via a standard signal. Debug Exceptions (#DB) and Breakpoints (#BP) are
- * never fixed up and are always delivered via standard signals. On synchrously
+ * never fixed up and are always delivered via standard signals. On synchronously
* reported exceptions, -EFAULT is returned and details about the exception are
* recorded in @run.exception, the optional sgx_enclave_exception struct.
*
diff --git a/arch/x86/include/uapi/asm/shmbuf.h b/arch/x86/include/uapi/asm/shmbuf.h
index f0305dc660c9..fce18eaa070c 100644
--- a/arch/x86/include/uapi/asm/shmbuf.h
+++ b/arch/x86/include/uapi/asm/shmbuf.h
@@ -9,7 +9,7 @@
* The shmid64_ds structure for x86 architecture with x32 ABI.
*
* On x86-32 and x86-64 we can just use the generic definition, but
- * x32 uses the same binary layout as x86_64, which is differnet
+ * x32 uses the same binary layout as x86_64, which is different
* from other 32-bit architectures.
*/
diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h
index 844d60eb1882..d0d9b331d3a1 100644
--- a/arch/x86/include/uapi/asm/sigcontext.h
+++ b/arch/x86/include/uapi/asm/sigcontext.h
@@ -139,7 +139,7 @@ struct _fpstate_32 {
* The 64-bit FPU frame. (FXSAVE format and later)
*
* Note1: If sw_reserved.magic1 == FP_XSTATE_MAGIC1 then the structure is
- * larger: 'struct _xstate'. Note that 'struct _xstate' embedds
+ * larger: 'struct _xstate'. Note that 'struct _xstate' embeds
* 'struct _fpstate' so that you can always assume the _fpstate portion
* exists so that you can check the magic value.
*
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 2ddf08351f0b..0704c2a94272 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -35,7 +35,6 @@ KASAN_SANITIZE_sev-es.o := n
KCSAN_SANITIZE := n
OBJECT_FILES_NON_STANDARD_test_nx.o := y
-OBJECT_FILES_NON_STANDARD_paravirt_patch.o := y
ifdef CONFIG_FRAME_POINTER
OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y
@@ -121,7 +120,7 @@ obj-$(CONFIG_AMD_NB) += amd_nb.o
obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o
-obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch.o
+obj-$(CONFIG_PARAVIRT) += paravirt.o
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 14cd3186dc77..e90310cbe73a 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -830,7 +830,7 @@ int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base)
EXPORT_SYMBOL(acpi_unregister_ioapic);
/**
- * acpi_ioapic_registered - Check whether IOAPIC assoicatied with @gsi_base
+ * acpi_ioapic_registered - Check whether IOAPIC associated with @gsi_base
* has been registered
* @handle: ACPI handle of the IOAPIC device
* @gsi_base: GSI base associated with the IOAPIC
@@ -1656,7 +1656,7 @@ static int __init parse_acpi(char *arg)
else if (strcmp(arg, "noirq") == 0) {
acpi_noirq_set();
}
- /* "acpi=copy_dsdt" copys DSDT */
+ /* "acpi=copy_dsdt" copies DSDT */
else if (strcmp(arg, "copy_dsdt") == 0) {
acpi_gbl_copy_dsdt_locally = 1;
}
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index cc1fea76aab0..3f85fcae450c 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -41,7 +41,7 @@ unsigned long acpi_get_wakeup_address(void)
* x86_acpi_enter_sleep_state - enter sleep state
* @state: Sleep state to enter.
*
- * Wrapper around acpi_enter_sleep_state() to be called by assmebly.
+ * Wrapper around acpi_enter_sleep_state() to be called by assembly.
*/
asmlinkage acpi_status __visible x86_acpi_enter_sleep_state(u8 state)
{
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index 56b6865afb2a..d5d8a352eafa 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -115,7 +115,7 @@ SYM_FUNC_START(do_suspend_lowlevel)
movq pt_regs_r14(%rax), %r14
movq pt_regs_r15(%rax), %r15
-#if defined(CONFIG_KASAN) && CONFIG_KASAN_STACK
+#if defined(CONFIG_KASAN) && defined(CONFIG_KASAN_STACK)
/*
* The suspend path may have poisoned some areas deeper in the stack,
* which we now need to unpoison.
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 8d778e46725d..6974b5174495 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -28,6 +28,7 @@
#include <asm/insn.h>
#include <asm/io.h>
#include <asm/fixmap.h>
+#include <asm/paravirt.h>
int __read_mostly alternatives_patched;
@@ -74,186 +75,30 @@ do { \
} \
} while (0)
-/*
- * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
- * that correspond to that nop. Getting from one nop to the next, we
- * add to the array the offset that is equal to the sum of all sizes of
- * nops preceding the one we are after.
- *
- * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the
- * nice symmetry of sizes of the previous nops.
- */
-#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
-static const unsigned char intelnops[] =
-{
- GENERIC_NOP1,
- GENERIC_NOP2,
- GENERIC_NOP3,
- GENERIC_NOP4,
- GENERIC_NOP5,
- GENERIC_NOP6,
- GENERIC_NOP7,
- GENERIC_NOP8,
- GENERIC_NOP5_ATOMIC
-};
-static const unsigned char * const intel_nops[ASM_NOP_MAX+2] =
+const unsigned char x86nops[] =
{
- NULL,
- intelnops,
- intelnops + 1,
- intelnops + 1 + 2,
- intelnops + 1 + 2 + 3,
- intelnops + 1 + 2 + 3 + 4,
- intelnops + 1 + 2 + 3 + 4 + 5,
- intelnops + 1 + 2 + 3 + 4 + 5 + 6,
- intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
- intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
+ BYTES_NOP1,
+ BYTES_NOP2,
+ BYTES_NOP3,
+ BYTES_NOP4,
+ BYTES_NOP5,
+ BYTES_NOP6,
+ BYTES_NOP7,
+ BYTES_NOP8,
};
-#endif
-#ifdef K8_NOP1
-static const unsigned char k8nops[] =
-{
- K8_NOP1,
- K8_NOP2,
- K8_NOP3,
- K8_NOP4,
- K8_NOP5,
- K8_NOP6,
- K8_NOP7,
- K8_NOP8,
- K8_NOP5_ATOMIC
-};
-static const unsigned char * const k8_nops[ASM_NOP_MAX+2] =
+const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
{
NULL,
- k8nops,
- k8nops + 1,
- k8nops + 1 + 2,
- k8nops + 1 + 2 + 3,
- k8nops + 1 + 2 + 3 + 4,
- k8nops + 1 + 2 + 3 + 4 + 5,
- k8nops + 1 + 2 + 3 + 4 + 5 + 6,
- k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
- k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
+ x86nops,
+ x86nops + 1,
+ x86nops + 1 + 2,
+ x86nops + 1 + 2 + 3,
+ x86nops + 1 + 2 + 3 + 4,
+ x86nops + 1 + 2 + 3 + 4 + 5,
+ x86nops + 1 + 2 + 3 + 4 + 5 + 6,
+ x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
};
-#endif
-
-#if defined(K7_NOP1) && !defined(CONFIG_X86_64)
-static const unsigned char k7nops[] =
-{
- K7_NOP1,
- K7_NOP2,
- K7_NOP3,
- K7_NOP4,
- K7_NOP5,
- K7_NOP6,
- K7_NOP7,
- K7_NOP8,
- K7_NOP5_ATOMIC
-};
-static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
-{
- NULL,
- k7nops,
- k7nops + 1,
- k7nops + 1 + 2,
- k7nops + 1 + 2 + 3,
- k7nops + 1 + 2 + 3 + 4,
- k7nops + 1 + 2 + 3 + 4 + 5,
- k7nops + 1 + 2 + 3 + 4 + 5 + 6,
- k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
- k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
-};
-#endif
-
-#ifdef P6_NOP1
-static const unsigned char p6nops[] =
-{
- P6_NOP1,
- P6_NOP2,
- P6_NOP3,
- P6_NOP4,
- P6_NOP5,
- P6_NOP6,
- P6_NOP7,
- P6_NOP8,
- P6_NOP5_ATOMIC
-};
-static const unsigned char * const p6_nops[ASM_NOP_MAX+2] =
-{
- NULL,
- p6nops,
- p6nops + 1,
- p6nops + 1 + 2,
- p6nops + 1 + 2 + 3,
- p6nops + 1 + 2 + 3 + 4,
- p6nops + 1 + 2 + 3 + 4 + 5,
- p6nops + 1 + 2 + 3 + 4 + 5 + 6,
- p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
- p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
-};
-#endif
-
-/* Initialize these to a safe default */
-#ifdef CONFIG_X86_64
-const unsigned char * const *ideal_nops = p6_nops;
-#else
-const unsigned char * const *ideal_nops = intel_nops;
-#endif
-
-void __init arch_init_ideal_nops(void)
-{
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_INTEL:
- /*
- * Due to a decoder implementation quirk, some
- * specific Intel CPUs actually perform better with
- * the "k8_nops" than with the SDM-recommended NOPs.
- */
- if (boot_cpu_data.x86 == 6 &&
- boot_cpu_data.x86_model >= 0x0f &&
- boot_cpu_data.x86_model != 0x1c &&
- boot_cpu_data.x86_model != 0x26 &&
- boot_cpu_data.x86_model != 0x27 &&
- boot_cpu_data.x86_model < 0x30) {
- ideal_nops = k8_nops;
- } else if (boot_cpu_has(X86_FEATURE_NOPL)) {
- ideal_nops = p6_nops;
- } else {
-#ifdef CONFIG_X86_64
- ideal_nops = k8_nops;
-#else
- ideal_nops = intel_nops;
-#endif
- }
- break;
-
- case X86_VENDOR_HYGON:
- ideal_nops = p6_nops;
- return;
-
- case X86_VENDOR_AMD:
- if (boot_cpu_data.x86 > 0xf) {
- ideal_nops = p6_nops;
- return;
- }
-
- fallthrough;
-
- default:
-#ifdef CONFIG_X86_64
- ideal_nops = k8_nops;
-#else
- if (boot_cpu_has(X86_FEATURE_K8))
- ideal_nops = k8_nops;
- else if (boot_cpu_has(X86_FEATURE_K7))
- ideal_nops = k7_nops;
- else
- ideal_nops = intel_nops;
-#endif
- }
-}
/* Use this to add nops to a buffer, then text_poke the whole buffer. */
static void __init_or_module add_nops(void *insns, unsigned int len)
@@ -262,7 +107,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
unsigned int noplen = len;
if (noplen > ASM_NOP_MAX)
noplen = ASM_NOP_MAX;
- memcpy(insns, ideal_nops[noplen], noplen);
+ memcpy(insns, x86_nops[noplen], noplen);
insns += noplen;
len -= noplen;
}
@@ -344,19 +189,35 @@ done:
static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr)
{
unsigned long flags;
- int i;
+ struct insn insn;
+ int nop, i = 0;
+
+ /*
+ * Jump over the non-NOP insns; the remaining bytes must be single-byte
+ * NOPs, so optimize them.
+ */
+ for (;;) {
+ if (insn_decode_kernel(&insn, &instr[i]))
+ return;
+
+ if (insn.length == 1 && insn.opcode.bytes[0] == 0x90)
+ break;
+
+ if ((i += insn.length) >= a->instrlen)
+ return;
+ }
- for (i = 0; i < a->padlen; i++) {
- if (instr[i] != 0x90)
+ for (nop = i; i < a->instrlen; i++) {
+ if (WARN_ONCE(instr[i] != 0x90, "Not a NOP at 0x%px\n", &instr[i]))
return;
}
local_irq_save(flags);
- add_nops(instr + (a->instrlen - a->padlen), a->padlen);
+ add_nops(instr + nop, i - nop);
local_irq_restore(flags);
DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ",
- instr, a->instrlen - a->padlen, a->padlen);
+ instr, nop, a->instrlen);
}
/*
@@ -388,23 +249,29 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
*/
for (a = start; a < end; a++) {
int insn_buff_sz = 0;
+ /* Mask away "NOT" flag bit for feature to test. */
+ u16 feature = a->cpuid & ~ALTINSTR_FLAG_INV;
instr = (u8 *)&a->instr_offset + a->instr_offset;
replacement = (u8 *)&a->repl_offset + a->repl_offset;
BUG_ON(a->instrlen > sizeof(insn_buff));
- BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
- if (!boot_cpu_has(a->cpuid)) {
- if (a->padlen > 1)
- optimize_nops(a, instr);
+ BUG_ON(feature >= (NCAPINTS + NBUGINTS) * 32);
- continue;
- }
+ /*
+ * Patch if either:
+ * - feature is present
+ * - feature not present but ALTINSTR_FLAG_INV is set, which means:
+ * patch if the feature is *NOT* present.
+ */
+ if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV))
+ goto next;
- DPRINTK("feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d), pad: %d",
- a->cpuid >> 5,
- a->cpuid & 0x1f,
+ DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
+ (a->cpuid & ALTINSTR_FLAG_INV) ? "!" : "",
+ feature >> 5,
+ feature & 0x1f,
instr, instr, a->instrlen,
- replacement, a->replacementlen, a->padlen);
+ replacement, a->replacementlen);
DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr);
DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement);
@@ -428,14 +295,15 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
if (a->replacementlen && is_jmp(replacement[0]))
recompute_jump(a, instr, replacement, insn_buff);
- if (a->instrlen > a->replacementlen) {
- add_nops(insn_buff + a->replacementlen,
- a->instrlen - a->replacementlen);
- insn_buff_sz += a->instrlen - a->replacementlen;
- }
+ for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
+ insn_buff[insn_buff_sz] = 0x90;
+
DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
text_poke_early(instr, insn_buff, insn_buff_sz);
+
+next:
+ optimize_nops(a, instr);
}
}
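
The patch/skip decision above is an XNOR of the feature test and the inversion flag; spelling it out as a table (a restatement of the code, not new semantics):

	/*
	 *  boot_cpu_has(feature) | ALTINSTR_FLAG_INV | action
	 *  ----------------------+-------------------+------------------------------
	 *          yes           |       clear       | patch in the replacement
	 *          no            |       clear       | goto next (NOP-optimize only)
	 *          yes           |        set        | goto next (NOP-optimize only)
	 *          no            |        set        | patch in the replacement
	 */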
@@ -605,7 +473,7 @@ void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
BUG_ON(p->len > MAX_PATCH_LEN);
/* prep the buffer with the original instructions */
memcpy(insn_buff, p->instr, p->len);
- used = pv_ops.init.patch(p->type, insn_buff, (unsigned long)p->instr, p->len);
+ used = paravirt_patch(p->type, insn_buff, (unsigned long)p->instr, p->len);
BUG_ON(used > p->len);
@@ -723,6 +591,33 @@ void __init alternative_instructions(void)
* patching.
*/
+ /*
+ * Paravirt patching and alternative patching can be combined to
+ * replace a function call with a short direct code sequence (e.g.
+ * by setting a constant return value instead of doing that in an
+ * external function).
+ * In order to make this work the following sequence is required:
+ * 1. set (artificial) features depending on used paravirt
+ * functions which can later influence alternative patching
+ * 2. apply paravirt patching (generally replacing an indirect
+ * function call with a direct one)
+ * 3. apply alternative patching (e.g. replacing a direct function
+ * call with a custom code sequence)
+ * Doing paravirt patching after alternative patching would clobber
+ * the optimization of the custom code with a function call again.
+ */
+ paravirt_set_cap();
+
+ /*
+ * First patch paravirt functions, such that we overwrite the indirect
+ * call with the direct call.
+ */
+ apply_paravirt(__parainstructions, __parainstructions_end);
+
+ /*
+ * Then patch alternatives, such that those paravirt calls that are in
+ * alternatives can be overwritten by their immediate fragments.
+ */
apply_alternatives(__alt_instructions, __alt_instructions_end);
#ifdef CONFIG_SMP
@@ -741,8 +636,6 @@ void __init alternative_instructions(void)
}
#endif
- apply_paravirt(__parainstructions, __parainstructions_end);
-
restart_nmi();
alternatives_patched = 1;
}
@@ -813,7 +706,7 @@ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
* with a stale address space WITHOUT being in lazy mode after
* restoring the previous mm.
*/
- if (this_cpu_read(cpu_tlbstate.is_lazy))
+ if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
leave_mm(smp_processor_id());
temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
@@ -1274,15 +1167,15 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
const void *opcode, size_t len, const void *emulate)
{
struct insn insn;
+ int ret;
memcpy((void *)tp->text, opcode, len);
if (!emulate)
emulate = opcode;
- kernel_insn_init(&insn, emulate, MAX_INSN_SIZE);
- insn_get_length(&insn);
+ ret = insn_decode_kernel(&insn, emulate);
- BUG_ON(!insn_complete(&insn));
+ BUG_ON(ret < 0);
BUG_ON(len != insn.length);
tp->rel_addr = addr - (void *)_stext;
@@ -1302,13 +1195,13 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
default: /* assume NOP */
switch (len) {
case 2: /* NOP2 -- emulate as JMP8+0 */
- BUG_ON(memcmp(emulate, ideal_nops[len], len));
+ BUG_ON(memcmp(emulate, x86_nops[len], len));
tp->opcode = JMP8_INSN_OPCODE;
tp->rel32 = 0;
break;
case 5: /* NOP5 -- emulate as JMP32+0 */
- BUG_ON(memcmp(emulate, ideal_nops[NOP_ATOMIC5], len));
+ BUG_ON(memcmp(emulate, x86_nops[len], len));
tp->opcode = JMP32_INSN_OPCODE;
tp->rel32 = 0;
break;
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index b4396952c9a6..09083094eb57 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * Shared support code for AMD K8 northbridges and derivates.
+ * Shared support code for AMD K8 northbridges and derivatives.
* Copyright 2006 Andi Kleen, SUSE Labs.
*/
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 4f26700f314d..4a39fb429f15 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -619,7 +619,7 @@ static void setup_APIC_timer(void)
if (this_cpu_has(X86_FEATURE_ARAT)) {
lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP;
- /* Make LAPIC timer preferrable over percpu HPET */
+ /* Make LAPIC timer preferable over percpu HPET */
lapic_clockevent.rating = 150;
}
@@ -666,7 +666,7 @@ void lapic_update_tsc_freq(void)
* In this functions we calibrate APIC bus clocks to the external timer.
*
* We want to do the calibration only once since we want to have local timer
- * irqs syncron. CPUs connected by the same APIC bus have the very same bus
+ * irqs synchronous. CPUs connected by the same APIC bus have the very same bus
* frequency.
*
* This was previously done by reading the PIT/HPET and waiting for a wrap
@@ -1532,7 +1532,7 @@ static bool apic_check_and_ack(union apic_ir *irr, union apic_ir *isr)
* Most probably by now the CPU has serviced that pending interrupt and it
* might not have done the ack_APIC_irq() because it thought, interrupt
* came from i8259 as ExtInt. LAPIC did not get EOI so it does not clear
- * the ISR bit and cpu thinks it has already serivced the interrupt. Hence
+ * the ISR bit and cpu thinks it has already serviced the interrupt. Hence
* a vector might get locked. It was noticed for timer irq (vector
* 0x31). Issue an extra EOI to clear ISR.
*
@@ -1657,7 +1657,7 @@ static void setup_local_APIC(void)
*/
/*
* Actually disabling the focus CPU check just makes the hang less
- * frequent as it makes the interrupt distributon model be more
+ * frequent as it makes the interrupt distribution model be more
* like LRU than MRU (the short-term load is more even across CPUs).
*/
@@ -1875,7 +1875,7 @@ static __init void try_to_enable_x2apic(int remap_mode)
/*
* Without IR, all CPUs can be addressed by IOAPIC/MSI only
- * in physical mode, and CPUs with an APIC ID that cannnot
+ * in physical mode, and CPUs with an APIC ID that cannot
* be addressed must not be brought online.
*/
x2apic_set_max_apicid(apic_limit);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 73ff4dd426a8..d5c691a3208b 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -928,7 +928,7 @@ static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info)
/*
* setup_IO_APIC_irqs() programs all legacy IRQs with default trigger
- * and polarity attirbutes. So allow the first user to reprogram the
+ * and polarity attributes. So allow the first user to reprogram the
* pin with real trigger and polarity attributes.
*/
if (irq < nr_legacy_irqs() && data->count == 1) {
@@ -994,7 +994,7 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain,
/*
* Legacy ISA IRQ has already been allocated, just add pin to
- * the pin list assoicated with this IRQ and program the IOAPIC
+ * the pin list associated with this IRQ and program the IOAPIC
* entry. The IOAPIC entry
*/
if (irq_data && irq_data->parent_data) {
@@ -1752,7 +1752,7 @@ static inline void ioapic_finish_move(struct irq_data *data, bool moveit)
* with masking the ioapic entry and then polling until
* Remote IRR was clear before reprogramming the
* ioapic I don't trust the Remote IRR bit to be
- * completey accurate.
+ * completely accurate.
*
* However there appears to be no other way to plug
* this race, so if the Remote IRR bit is not
@@ -1830,7 +1830,7 @@ static void ioapic_ack_level(struct irq_data *irq_data)
/*
* Tail end of clearing remote IRR bit (either by delivering the EOI
* message via io-apic EOI register write or simulating it using
- * mask+edge followed by unnask+level logic) manually when the
+ * mask+edge followed by unmask+level logic) manually when the
* level triggered interrupt is seen as the edge triggered interrupt
* at the cpu.
*/
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 3c9c7492252f..6dbdc7c22bb7 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -543,6 +543,14 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
if ((info->flags & X86_IRQ_ALLOC_CONTIGUOUS_VECTORS) && nr_irqs > 1)
return -ENOSYS;
+ /*
+ * Catch any attempt to touch the cascade interrupt on a PIC
+ * equipped system.
+ */
+ if (WARN_ON_ONCE(info->flags & X86_IRQ_ALLOC_LEGACY &&
+ virq == PIC_CASCADE_IR))
+ return -EINVAL;
+
for (i = 0; i < nr_irqs; i++) {
irqd = irq_domain_get_irq_data(domain, virq + i);
BUG_ON(!irqd);
@@ -745,6 +753,11 @@ void __init lapic_assign_system_vectors(void)
/* Mark the preallocated legacy interrupts */
for (i = 0; i < nr_legacy_irqs(); i++) {
+ /*
+ * Don't touch the cascade interrupt. It's unusable
+ * on PIC equipped machines. See the large comment
+ * in the IO/APIC code.
+ */
if (i != PIC_CASCADE_IR)
irq_matrix_assign(vector_matrix, ISA_IRQ_VECTOR(i));
}
@@ -1045,7 +1058,7 @@ void irq_force_complete_move(struct irq_desc *desc)
*
* But in case of cpu hotplug this should be a non issue
* because if the affinity update happens right before all
- * cpus rendevouz in stop machine, there is no way that the
+ * cpus rendezvous in stop machine, there is no way that the
* interrupt can be blocked on the target cpu because all cpus
* loops first with interrupts enabled in stop machine, so the
* old vector is not yet cleaned up when the interrupt fires.
@@ -1054,7 +1067,7 @@ void irq_force_complete_move(struct irq_desc *desc)
* of the interrupt on the apic/system bus would be delayed
* beyond the point where the target cpu disables interrupts
* in stop machine. I doubt that it can happen, but at least
- * there is a theroretical chance. Virtualization might be
+ * there is a theoretical chance. Virtualization might be
* able to expose this, but AFAICT the IOAPIC emulation is not
* as stupid as the real hardware.
*
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 52bc217ca8c3..f5a48e66e4f5 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -369,6 +369,15 @@ static int __init early_get_arch_type(void)
return ret;
}
+/* UV system found, check which APIC MODE BIOS already selected */
+static void __init early_set_apic_mode(void)
+{
+ if (x2apic_enabled())
+ uv_system_type = UV_X2APIC;
+ else
+ uv_system_type = UV_LEGACY_APIC;
+}
+
static int __init uv_set_system_type(char *_oem_id, char *_oem_table_id)
{
/* Save OEM_ID passed from ACPI MADT */
@@ -404,11 +413,12 @@ static int __init uv_set_system_type(char *_oem_id, char *_oem_table_id)
else
uv_hubless_system |= 0x8;
- /* Copy APIC type */
+ /* Copy OEM Table ID */
uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id);
pr_info("UV: OEM IDs %s/%s, SystemType %d, HUBLESS ID %x\n",
oem_id, oem_table_id, uv_system_type, uv_hubless_system);
+
return 0;
}
@@ -453,6 +463,7 @@ static int __init uv_set_system_type(char *_oem_id, char *_oem_table_id)
early_set_hub_type();
/* Other UV setup functions */
+ early_set_apic_mode();
early_get_pnodeid();
early_get_apic_socketid_shift();
x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
@@ -472,29 +483,14 @@ static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id)
if (uv_set_system_type(_oem_id, _oem_table_id) == 0)
return 0;
- /* Save and Decode OEM Table ID */
+ /* Save for display of the OEM Table ID */
uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id);
- /* This is the most common hardware variant, x2apic mode */
- if (!strcmp(oem_table_id, "UVX"))
- uv_system_type = UV_X2APIC;
-
- /* Only used for very small systems, usually 1 chassis, legacy mode */
- else if (!strcmp(oem_table_id, "UVL"))
- uv_system_type = UV_LEGACY_APIC;
-
- else
- goto badbios;
-
pr_info("UV: OEM IDs %s/%s, System/UVType %d/0x%x, HUB RevID %d\n",
oem_id, oem_table_id, uv_system_type, is_uv(UV_ANY),
uv_min_hub_revision_id);
return 0;
-
-badbios:
- pr_err("UV: UVarchtype:%s not supported\n", uv_archtype);
- BUG();
}
enum uv_system_type get_uv_system_type(void)
@@ -1671,6 +1667,9 @@ static __init int uv_system_init_hubless(void)
if (rc < 0)
return rc;
+ /* Set section block size for current node memory */
+ set_block_size();
+
/* Create user access node */
if (rc >= 0)
uv_setup_proc_files(1);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 660270359d39..241dda687eb9 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -94,7 +94,7 @@
* Remove APM dependencies in arch/i386/kernel/process.c
* Remove APM dependencies in drivers/char/sysrq.c
* Reset time across standby.
- * Allow more inititialisation on SMP.
+ * Allow more initialisation on SMP.
* Remove CONFIG_APM_POWER_OFF and make it boot time
* configurable (default on).
* Make debug only a boot time parameter (remove APM_DEBUG).
@@ -766,7 +766,7 @@ static int apm_driver_version(u_short *val)
* not cleared until it is acknowledged.
*
* Additional information is returned in the info pointer, providing
- * that APM 1.2 is in use. If no messges are pending the value 0x80
+ * that APM 1.2 is in use. If no messages are pending the value 0x80
* is returned (No power management events pending).
*/
static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info)
@@ -1025,7 +1025,7 @@ static int apm_enable_power_management(int enable)
* status which gives the rough battery status, and current power
* source. The bat value returned give an estimate as a percentage
* of life and a status value for the battery. The estimated life
- * if reported is a lifetime in secodnds/minutes at current powwer
+ * if reported is a lifetime in seconds/minutes at current power
* consumption.
*/
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 60b9f42ce3c1..ecd3fd6993d1 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -61,13 +61,6 @@ static void __used common(void)
OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext);
#endif
-#ifdef CONFIG_PARAVIRT_XXL
- BLANK();
- OFFSET(PV_IRQ_irq_disable, paravirt_patch_template, irq.irq_disable);
- OFFSET(PV_IRQ_irq_enable, paravirt_patch_template, irq.irq_enable);
- OFFSET(PV_CPU_iret, paravirt_patch_template, cpu.iret);
-#endif
-
#ifdef CONFIG_XEN
BLANK();
OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 6e043f295a60..2b411cd00a4e 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -53,11 +53,6 @@ void foo(void)
offsetof(struct cpu_entry_area, tss.x86_tss.sp1) -
offsetofend(struct cpu_entry_area, entry_stack_page.stack));
-#ifdef CONFIG_STACKPROTECTOR
- BLANK();
- OFFSET(stack_canary_offset, stack_canary, canary);
-#endif
-
BLANK();
DEFINE(EFI_svam, offsetof(efi_runtime_services_t, set_virtual_address_map));
}
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 347a956f71ca..2d11384dc9ab 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -628,11 +628,6 @@ static void early_init_amd(struct cpuinfo_x86 *c)
early_init_amd_mc(c);
-#ifdef CONFIG_X86_32
- if (c->x86 == 6)
- set_cpu_cap(c, X86_FEATURE_K7);
-#endif
-
if (c->x86 >= 0xf)
set_cpu_cap(c, X86_FEATURE_K8);
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c
index 3ca9be482a9e..d66af2950e06 100644
--- a/arch/x86/kernel/cpu/cacheinfo.c
+++ b/arch/x86/kernel/cpu/cacheinfo.c
@@ -877,7 +877,7 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c)
static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
struct _cpuid4_info_regs *base)
{
- struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+ struct cpu_cacheinfo *this_cpu_ci;
struct cacheinfo *this_leaf;
int i, sibling;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index ab640abe26b6..6bdb69a9a7dc 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -161,7 +161,6 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
[GDT_ENTRY_ESPFIX_SS] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
[GDT_ENTRY_PERCPU] = GDT_ENTRY_INIT(0xc092, 0, 0xfffff),
- GDT_STACK_CANARY_INIT
#endif
} };
EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
@@ -482,7 +481,7 @@ static __always_inline void setup_pku(struct cpuinfo_x86 *c)
if (pk)
pk->pkru = init_pkru_value;
/*
- * Seting X86_CR4_PKE will cause the X86_FEATURE_OSPKE
+ * Setting X86_CR4_PKE will cause the X86_FEATURE_OSPKE
* cpuid bit to be set. We need to ensure that we
* update that bit in this CPU's "cpu_info".
*/
@@ -599,7 +598,6 @@ void load_percpu_segment(int cpu)
__loadsegment_simple(gs, 0);
wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu));
#endif
- load_stack_canary_segment();
}
#ifdef CONFIG_X86_32
@@ -1330,7 +1328,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
cpu_set_bug_bits(c);
- cpu_set_core_cap_bits(c);
+ sld_setup(c);
fpu__init_system(c);
@@ -1404,7 +1402,7 @@ static void detect_null_seg_behavior(struct cpuinfo_x86 *c)
* where GS is unused by the prev and next threads.
*
* Since neither vendor documents this anywhere that I can see,
- * detect it directly instead of hardcoding the choice by
+ * detect it directly instead of hard-coding the choice by
* vendor.
*
* I've designated AMD's behavior as the "bug" because it's
@@ -1748,6 +1746,8 @@ DEFINE_PER_CPU(bool, hardirq_stack_inuse);
DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
EXPORT_PER_CPU_SYMBOL(__preempt_count);
+DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK;
+
/* May not be marked __init: used by software suspend */
void syscall_init(void)
{
@@ -1796,7 +1796,8 @@ DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) =
EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack);
#ifdef CONFIG_STACKPROTECTOR
-DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
+DEFINE_PER_CPU(unsigned long, __stack_chk_guard);
+EXPORT_PER_CPU_SYMBOL(__stack_chk_guard);
#endif
#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index 42af31b64c2c..defda61f372d 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -72,6 +72,9 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_AVX512_FP16, X86_FEATURE_AVX512BW },
{ X86_FEATURE_ENQCMD, X86_FEATURE_XSAVES },
{ X86_FEATURE_PER_THREAD_MBA, X86_FEATURE_MBA },
+ { X86_FEATURE_SGX_LC, X86_FEATURE_SGX },
+ { X86_FEATURE_SGX1, X86_FEATURE_SGX },
+ { X86_FEATURE_SGX2, X86_FEATURE_SGX1 },
{}
};
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 1d9b8aaea06c..7227c15299d0 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -291,7 +291,7 @@ static void init_cyrix(struct cpuinfo_x86 *c)
mark_tsc_unstable("cyrix 5510/5520 detected");
}
#endif
- c->x86_cache_size = 16; /* Yep 16K integrated cache thats it */
+ c->x86_cache_size = 16; /* Yep 16K integrated cache that's it */
/* GXm supports extended cpuid levels 'ala' AMD */
if (c->cpuid_level == 2) {
diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c
index 3b1b01f2b248..da696eb4821a 100644
--- a/arch/x86/kernel/cpu/feat_ctl.c
+++ b/arch/x86/kernel/cpu/feat_ctl.c
@@ -93,15 +93,9 @@ static void init_vmx_capabilities(struct cpuinfo_x86 *c)
}
#endif /* CONFIG_X86_VMX_FEATURE_NAMES */
-static void clear_sgx_caps(void)
-{
- setup_clear_cpu_cap(X86_FEATURE_SGX);
- setup_clear_cpu_cap(X86_FEATURE_SGX_LC);
-}
-
static int __init nosgx(char *str)
{
- clear_sgx_caps();
+ setup_clear_cpu_cap(X86_FEATURE_SGX);
return 0;
}
@@ -110,23 +104,30 @@ early_param("nosgx", nosgx);
void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
{
+ bool enable_sgx_kvm = false, enable_sgx_driver = false;
bool tboot = tboot_enabled();
- bool enable_sgx;
+ bool enable_vmx;
u64 msr;
if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) {
clear_cpu_cap(c, X86_FEATURE_VMX);
- clear_sgx_caps();
+ clear_cpu_cap(c, X86_FEATURE_SGX);
return;
}
- /*
- * Enable SGX if and only if the kernel supports SGX and Launch Control
- * is supported, i.e. disable SGX if the LE hash MSRs can't be written.
- */
- enable_sgx = cpu_has(c, X86_FEATURE_SGX) &&
- cpu_has(c, X86_FEATURE_SGX_LC) &&
- IS_ENABLED(CONFIG_X86_SGX);
+ enable_vmx = cpu_has(c, X86_FEATURE_VMX) &&
+ IS_ENABLED(CONFIG_KVM_INTEL);
+
+ if (cpu_has(c, X86_FEATURE_SGX) && IS_ENABLED(CONFIG_X86_SGX)) {
+ /*
+ * Separate out SGX driver enabling from KVM. This allows KVM
+ * guests to use SGX even if the kernel SGX driver refuses to
+ * use it. This happens if flexible Launch Control is not
+ * available.
+ */
+ enable_sgx_driver = cpu_has(c, X86_FEATURE_SGX_LC);
+ enable_sgx_kvm = enable_vmx && IS_ENABLED(CONFIG_X86_SGX_KVM);
+ }
if (msr & FEAT_CTL_LOCKED)
goto update_caps;
@@ -142,15 +143,18 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
* i.e. KVM is enabled, to avoid unnecessarily adding an attack vector
* for the kernel, e.g. using VMX to hide malicious code.
*/
- if (cpu_has(c, X86_FEATURE_VMX) && IS_ENABLED(CONFIG_KVM_INTEL)) {
+ if (enable_vmx) {
msr |= FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
if (tboot)
msr |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX;
}
- if (enable_sgx)
- msr |= FEAT_CTL_SGX_ENABLED | FEAT_CTL_SGX_LC_ENABLED;
+ if (enable_sgx_kvm || enable_sgx_driver) {
+ msr |= FEAT_CTL_SGX_ENABLED;
+ if (enable_sgx_driver)
+ msr |= FEAT_CTL_SGX_LC_ENABLED;
+ }
wrmsrl(MSR_IA32_FEAT_CTL, msr);
@@ -173,10 +177,29 @@ update_caps:
}
update_sgx:
- if (!(msr & FEAT_CTL_SGX_ENABLED) ||
- !(msr & FEAT_CTL_SGX_LC_ENABLED) || !enable_sgx) {
- if (enable_sgx)
- pr_err_once("SGX disabled by BIOS\n");
- clear_sgx_caps();
+ if (!(msr & FEAT_CTL_SGX_ENABLED)) {
+ if (enable_sgx_kvm || enable_sgx_driver)
+ pr_err_once("SGX disabled by BIOS.\n");
+ clear_cpu_cap(c, X86_FEATURE_SGX);
+ return;
+ }
+
+ /*
+ * VMX feature bit may be cleared due to being disabled in BIOS,
+ * in which case SGX virtualization cannot be supported either.
+ */
+ if (!cpu_has(c, X86_FEATURE_VMX) && enable_sgx_kvm) {
+ pr_err_once("SGX virtualization disabled due to lack of VMX.\n");
+ enable_sgx_kvm = 0;
+ }
+
+ if (!(msr & FEAT_CTL_SGX_LC_ENABLED) && enable_sgx_driver) {
+ if (!enable_sgx_kvm) {
+ pr_err_once("SGX Launch Control is locked. Disable SGX.\n");
+ clear_cpu_cap(c, X86_FEATURE_SGX);
+ } else {
+ pr_err_once("SGX Launch Control is locked. Support SGX virtualization only.\n");
+ clear_cpu_cap(c, X86_FEATURE_SGX_LC);
+ }
}
}
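
A summary of the resulting SGX enablement implied by the logic above, assuming the CPU advertises SGX and the relevant CONFIG_X86_SGX / CONFIG_X86_SGX_KVM / CONFIG_KVM_INTEL options are enabled:

	/*
	 *  FEAT_CTL SGX_ENABLED | FEAT_CTL SGX_LC_ENABLED | native driver | SGX for KVM guests
	 *  ---------------------+-------------------------+---------------+-------------------
	 *        clear          |            -            |      no       |        no
	 *        set            |          clear          |      no       |  yes (VMX required)
	 *        set            |           set           |      yes      |  yes (VMX required)
	 */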
diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c
index ae59115d18f9..0bd6c74e3ba1 100644
--- a/arch/x86/kernel/cpu/hygon.c
+++ b/arch/x86/kernel/cpu/hygon.c
@@ -215,12 +215,12 @@ static void bsp_init_hygon(struct cpuinfo_x86 *c)
u32 ecx;
ecx = cpuid_ecx(0x8000001e);
- nodes_per_socket = ((ecx >> 8) & 7) + 1;
+ __max_die_per_package = nodes_per_socket = ((ecx >> 8) & 7) + 1;
} else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) {
u64 value;
rdmsrl(MSR_FAM10H_NODE_ID, value);
- nodes_per_socket = ((value >> 3) & 7) + 1;
+ __max_die_per_package = nodes_per_socket = ((value >> 3) & 7) + 1;
}
if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) &&
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 0e422a544835..8adffc17fa8b 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -44,9 +44,9 @@ enum split_lock_detect_state {
};
/*
- * Default to sld_off because most systems do not support split lock detection
- * split_lock_setup() will switch this to sld_warn on systems that support
- * split lock detect, unless there is a command line override.
+ * Default to sld_off because most systems do not support split lock detection.
+ * sld_state_setup() will switch this to sld_warn on systems that support
+ * split lock/bus lock detect, unless there is a command line override.
*/
static enum split_lock_detect_state sld_state __ro_after_init = sld_off;
static u64 msr_test_ctrl_cache __ro_after_init;
@@ -301,7 +301,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
* The operating system must reload CR3 to cause the TLB to be flushed"
*
* As a result, boot_cpu_has(X86_FEATURE_PGE) in arch/x86/include/asm/tlbflush.h
- * should be false so that __flush_tlb_all() causes CR3 insted of CR4.PGE
+ * should be false so that __flush_tlb_all() causes CR3 instead of CR4.PGE
* to be modified.
*/
if (c->x86 == 5 && c->x86_model == 9) {
@@ -603,6 +603,7 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c)
}
static void split_lock_init(void);
+static void bus_lock_init(void);
static void init_intel(struct cpuinfo_x86 *c)
{
@@ -720,6 +721,7 @@ static void init_intel(struct cpuinfo_x86 *c)
tsx_disable();
split_lock_init();
+ bus_lock_init();
intel_init_thermal(c);
}
@@ -1020,16 +1022,15 @@ static bool split_lock_verify_msr(bool on)
return ctrl == tmp;
}
-static void __init split_lock_setup(void)
+static void __init sld_state_setup(void)
{
enum split_lock_detect_state state = sld_warn;
char arg[20];
int i, ret;
- if (!split_lock_verify_msr(false)) {
- pr_info("MSR access failed: Disabled\n");
+ if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) &&
+ !boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
return;
- }
ret = cmdline_find_option(boot_command_line, "split_lock_detect",
arg, sizeof(arg));
@@ -1041,17 +1042,14 @@ static void __init split_lock_setup(void)
}
}
}
+ sld_state = state;
+}
- switch (state) {
- case sld_off:
- pr_info("disabled\n");
+static void __init __split_lock_setup(void)
+{
+ if (!split_lock_verify_msr(false)) {
+ pr_info("MSR access failed: Disabled\n");
return;
- case sld_warn:
- pr_info("warning about user-space split_locks\n");
- break;
- case sld_fatal:
- pr_info("sending SIGBUS on user-space split_locks\n");
- break;
}
rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache);
@@ -1061,7 +1059,9 @@ static void __init split_lock_setup(void)
return;
}
- sld_state = state;
+ /* Restore the MSR to its cached value. */
+ wrmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache);
+
setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT);
}
@@ -1118,6 +1118,29 @@ bool handle_guest_split_lock(unsigned long ip)
}
EXPORT_SYMBOL_GPL(handle_guest_split_lock);
+static void bus_lock_init(void)
+{
+ u64 val;
+
+ /*
+ * Warn and fatal are handled by #AC for split lock if #AC for
+ * split lock is supported.
+ */
+ if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) ||
+ (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) &&
+ (sld_state == sld_warn || sld_state == sld_fatal)) ||
+ sld_state == sld_off)
+ return;
+
+ /*
+ * Enable #DB for bus lock. All bus locks are handled in #DB except
+ * split locks, which are handled in #AC in the fatal case.
+ */
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, val);
+ val |= DEBUGCTLMSR_BUS_LOCK_DETECT;
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, val);
+}
+
bool handle_user_split_lock(struct pt_regs *regs, long error_code)
{
if ((regs->flags & X86_EFLAGS_AC) || sld_state == sld_fatal)
@@ -1126,6 +1149,21 @@ bool handle_user_split_lock(struct pt_regs *regs, long error_code)
return true;
}
+void handle_bus_lock(struct pt_regs *regs)
+{
+ switch (sld_state) {
+ case sld_off:
+ break;
+ case sld_warn:
+ pr_warn_ratelimited("#DB: %s/%d took a bus_lock trap at address: 0x%lx\n",
+ current->comm, current->pid, regs->ip);
+ break;
+ case sld_fatal:
+ force_sig_fault(SIGBUS, BUS_ADRALN, NULL);
+ break;
+ }
+}
+
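For illustration of what ends up in handle_bus_lock()/handle_user_split_lock(): a minimal, hypothetical user-space snippet (not part of this patch) that issues a LOCK-prefixed read-modify-write straddling a cache line. On CPUs with split lock detect it is reported via #AC; on CPUs with only bus lock detect it reaches the #DB path enabled by bus_lock_init() above. The 64-byte line size and the GCC __atomic builtin are assumptions of the sketch.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Buffer aligned to an assumed 64-byte cache line. */
	static _Alignas(64) unsigned char buf[128];

	/*
	 * Point 2 bytes before the line boundary so the 4-byte locked
	 * access spans two cache lines. The misaligned cast is undefined
	 * behaviour in C, but it is exactly the access that x86 tolerates
	 * and that the detection machinery is meant to flag.
	 */
	uint32_t *p = (uint32_t *)(buf + 62);

	__atomic_fetch_add(p, 1, __ATOMIC_SEQ_CST);	/* emits a LOCK ADD */

	printf("issued a locked RMW across a cache line\n");
	return 0;
}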
/*
* This function is called only when switching between tasks with
* different split-lock detection modes. It sets the MSR for the
@@ -1166,7 +1204,7 @@ static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = {
{}
};
-void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c)
+static void __init split_lock_setup(struct cpuinfo_x86 *c)
{
const struct x86_cpu_id *m;
u64 ia32_core_caps;
@@ -1193,5 +1231,56 @@ void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c)
}
cpu_model_supports_sld = true;
- split_lock_setup();
+ __split_lock_setup();
+}
+
+static void sld_state_show(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
+ !boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
+ return;
+
+ switch (sld_state) {
+ case sld_off:
+ pr_info("disabled\n");
+ break;
+ case sld_warn:
+ if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
+ pr_info("#AC: crashing the kernel on kernel split_locks and warning on user-space split_locks\n");
+ else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT))
+ pr_info("#DB: warning on user-space bus_locks\n");
+ break;
+ case sld_fatal:
+ if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) {
+ pr_info("#AC: crashing the kernel on kernel split_locks and sending SIGBUS on user-space split_locks\n");
+ } else if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) {
+ pr_info("#DB: sending SIGBUS on user-space bus_locks%s\n",
+ boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) ?
+ " from non-WB" : "");
+ }
+ break;
+ }
+}
+
+void __init sld_setup(struct cpuinfo_x86 *c)
+{
+ split_lock_setup(c);
+ sld_state_setup();
+ sld_state_show();
+}
+
+#define X86_HYBRID_CPU_TYPE_ID_SHIFT 24
+
+/**
+ * get_this_hybrid_cpu_type() - Get the type of this hybrid CPU
+ *
+ * Returns the CPU type [31:24] (i.e., Atom or Core) of a CPU in
+ * a hybrid processor. If the processor is not hybrid, returns 0.
+ */
+u8 get_this_hybrid_cpu_type(void)
+{
+ if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
+ return 0;
+
+ return cpuid_eax(0x0000001a) >> X86_HYBRID_CPU_TYPE_ID_SHIFT;
}
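As a rough user-space counterpart to get_this_hybrid_cpu_type() (a sketch, not part of the patch): read the same CPUID leaf 0x1A field with GCC's <cpuid.h> helper. The 0x20/0x40 Atom/Core encodings mentioned in the comment are the values commonly documented for hybrid parts and are an assumption here.

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/*
	 * Leaf 0x1A is only meaningful when the hybrid bit
	 * (CPUID.(7,0):EDX[15]) is set; on older CPUs the helper
	 * returns 0 because the leaf is out of range.
	 */
	if (!__get_cpuid_count(0x1a, 0, &eax, &ebx, &ecx, &edx)) {
		puts("CPUID leaf 0x1a not available");
		return 0;
	}

	/* Bits 31:24 hold the core type, e.g. 0x20 = Atom, 0x40 = Core. */
	printf("hybrid core type: 0x%x\n", eax >> 24);
	return 0;
}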
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 7962355436da..bf7fe87a7e88 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -529,7 +529,7 @@ static void mce_irq_work_cb(struct irq_work *entry)
* Check if the address reported by the CPU is in a format we can parse.
* It would be possible to add code for most other cases, but all would
* be somewhat complicated (e.g. segment offset would require an instruction
- * parser). So only support physical addresses up to page granuality for now.
+ * parser). So only support physical addresses up to page granularity for now.
*/
int mce_usable_address(struct mce *m)
{
diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c
index 7b360731fc2d..4e86d97f9653 100644
--- a/arch/x86/kernel/cpu/mce/inject.c
+++ b/arch/x86/kernel/cpu/mce/inject.c
@@ -74,6 +74,7 @@ MCE_INJECT_SET(status);
MCE_INJECT_SET(misc);
MCE_INJECT_SET(addr);
MCE_INJECT_SET(synd);
+MCE_INJECT_SET(ipid);
#define MCE_INJECT_GET(reg) \
static int inj_##reg##_get(void *data, u64 *val) \
@@ -88,11 +89,13 @@ MCE_INJECT_GET(status);
MCE_INJECT_GET(misc);
MCE_INJECT_GET(addr);
MCE_INJECT_GET(synd);
+MCE_INJECT_GET(ipid);
DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n");
DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n");
DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n");
DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(ipid_fops, inj_ipid_get, inj_ipid_set, "%llx\n");
static void setup_inj_struct(struct mce *m)
{
@@ -629,6 +632,8 @@ static const char readme_msg[] =
"\t is present in hardware. \n"
"\t - \"th\": Trigger APIC interrupt for Threshold errors. Causes threshold \n"
"\t APIC interrupt handler to handle the error. \n"
+"\n"
+"ipid:\t IPID (AMD-specific)\n"
"\n";
static ssize_t
@@ -652,6 +657,7 @@ static struct dfs_node {
{ .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "synd", .fops = &synd_fops, .perm = S_IRUSR | S_IWUSR },
+ { .name = "ipid", .fops = &ipid_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR },
{ .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR },
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index e309476743b7..acfd5d9f93c6 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -486,6 +486,7 @@ static void intel_ppin_init(struct cpuinfo_x86 *c)
case INTEL_FAM6_BROADWELL_X:
case INTEL_FAM6_SKYLAKE_X:
case INTEL_FAM6_ICELAKE_X:
+ case INTEL_FAM6_SAPPHIRERAPIDS_X:
case INTEL_FAM6_XEON_PHI_KNL:
case INTEL_FAM6_XEON_PHI_KNM:
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
index 83df991314c5..17e631443116 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -142,7 +142,7 @@ static struct severity {
MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR)
),
MCESEV(
- KEEP, "Non signalled machine check",
+ KEEP, "Non signaled machine check",
SER, BITCLR(MCI_STATUS_S)
),
@@ -218,15 +218,15 @@ static struct severity {
static bool is_copy_from_user(struct pt_regs *regs)
{
u8 insn_buf[MAX_INSN_SIZE];
- struct insn insn;
unsigned long addr;
+ struct insn insn;
+ int ret;
if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip, MAX_INSN_SIZE))
return false;
- kernel_insn_init(&insn, insn_buf, MAX_INSN_SIZE);
- insn_get_opcode(&insn);
- if (!insn.opcode.got)
+ ret = insn_decode_kernel(&insn, insn_buf);
+ if (ret < 0)
return false;
switch (insn.opcode.value) {
@@ -234,10 +234,6 @@ static bool is_copy_from_user(struct pt_regs *regs)
case 0x8A: case 0x8B:
/* MOVZ mem,reg */
case 0xB60F: case 0xB70F:
- insn_get_modrm(&insn);
- insn_get_sib(&insn);
- if (!insn.modrm.got || !insn.sib.got)
- return false;
addr = (unsigned long)insn_get_addr_ref(&insn, regs);
break;
/* REP MOVS */
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index b935e1b5f115..6a6318e9590c 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -629,16 +629,16 @@ static ssize_t reload_store(struct device *dev,
if (val != 1)
return size;
- tmp_ret = microcode_ops->request_microcode_fw(bsp, &microcode_pdev->dev, true);
- if (tmp_ret != UCODE_NEW)
- return size;
-
get_online_cpus();
ret = check_online_cpus();
if (ret)
goto put;
+ tmp_ret = microcode_ops->request_microcode_fw(bsp, &microcode_pdev->dev, true);
+ if (tmp_ret != UCODE_NEW)
+ goto put;
+
mutex_lock(&microcode_mutex);
ret = microcode_reload_late();
mutex_unlock(&microcode_mutex);
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index e88bc296afca..22f13343b5da 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -60,23 +60,18 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback)
set_irq_regs(old_regs);
}
-int hv_setup_vmbus_irq(int irq, void (*handler)(void))
+void hv_setup_vmbus_handler(void (*handler)(void))
{
- /*
- * The 'irq' argument is ignored on x86/x64 because a hard-coded
- * interrupt vector is used for Hyper-V interrupts.
- */
vmbus_handler = handler;
- return 0;
}
+EXPORT_SYMBOL_GPL(hv_setup_vmbus_handler);
-void hv_remove_vmbus_irq(void)
+void hv_remove_vmbus_handler(void)
{
/* We have no way to deallocate the interrupt gate */
vmbus_handler = NULL;
}
-EXPORT_SYMBOL_GPL(hv_setup_vmbus_irq);
-EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq);
+EXPORT_SYMBOL_GPL(hv_remove_vmbus_handler);
/*
* Routines to do per-architecture handling of stimer0
@@ -95,21 +90,17 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0)
set_irq_regs(old_regs);
}
-int hv_setup_stimer0_irq(int *irq, int *vector, void (*handler)(void))
+/* For x86/x64, override weak placeholders in hyperv_timer.c */
+void hv_setup_stimer0_handler(void (*handler)(void))
{
- *vector = HYPERV_STIMER0_VECTOR;
- *irq = -1; /* Unused on x86/x64 */
hv_stimer0_handler = handler;
- return 0;
}
-EXPORT_SYMBOL_GPL(hv_setup_stimer0_irq);
-void hv_remove_stimer0_irq(int irq)
+void hv_remove_stimer0_handler(void)
{
/* We have no way to deallocate the interrupt gate */
hv_stimer0_handler = NULL;
}
-EXPORT_SYMBOL_GPL(hv_remove_stimer0_irq);
void hv_setup_kexec_handler(void (*handler)(void))
{
@@ -197,7 +188,7 @@ static unsigned char hv_get_nmi_reason(void)
#ifdef CONFIG_X86_LOCAL_APIC
/*
* Prior to WS2016 Debug-VM sends NMIs to all CPUs which makes
- * it dificult to process CHANNELMSG_UNLOAD in case of crash. Handle
+ * it difficult to process CHANNELMSG_UNLOAD in case of crash. Handle
* unknown NMI on the first CPU which gets it.
*/
static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs)
@@ -274,12 +265,13 @@ static void __init ms_hyperv_init_platform(void)
* Extract the features and hints
*/
ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
- ms_hyperv.features_b = cpuid_ebx(HYPERV_CPUID_FEATURES);
+ ms_hyperv.priv_high = cpuid_ebx(HYPERV_CPUID_FEATURES);
ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
- pr_info("Hyper-V: features 0x%x, hints 0x%x, misc 0x%x\n",
- ms_hyperv.features, ms_hyperv.hints, ms_hyperv.misc_features);
+ pr_info("Hyper-V: privilege flags low 0x%x, high 0x%x, hints 0x%x, misc 0x%x\n",
+ ms_hyperv.features, ms_hyperv.priv_high, ms_hyperv.hints,
+ ms_hyperv.misc_features);
ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS);
ms_hyperv.max_lp_index = cpuid_ebx(HYPERV_CPUID_IMPLEMENT_LIMITS);
@@ -325,7 +317,7 @@ static void __init ms_hyperv_init_platform(void)
x86_platform.calibrate_cpu = hv_get_tsc_khz;
}
- if (ms_hyperv.features_b & HV_ISOLATION) {
+ if (ms_hyperv.priv_high & HV_ISOLATION) {
ms_hyperv.isolation_config_a = cpuid_eax(HYPERV_CPUID_ISOLATION_CONFIG);
ms_hyperv.isolation_config_b = cpuid_ebx(HYPERV_CPUID_ISOLATION_CONFIG);
@@ -428,7 +420,7 @@ static void __init ms_hyperv_init_platform(void)
/*
* Hyper-V doesn't provide irq remapping for IO-APIC. To enable x2apic,
- * set x2apic destination mode to physcial mode when x2apic is available
+ * set x2apic destination mode to physical mode when x2apic is available
* and Hyper-V IOMMU driver makes sure cpus assigned with IO-APIC irqs
* have 8-bit APIC id.
*/
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 9231640782fa..0c3b372318b7 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -434,7 +434,7 @@ set_var_mtrr_range(struct var_mtrr_state *state, unsigned long base_pfn,
state->range_sizek = sizek - second_sizek;
}
-/* Mininum size of mtrr block that can take hole: */
+/* Minimum size of mtrr block that can take hole: */
static u64 mtrr_chunk_size __initdata = (256ULL<<20);
static int __init parse_mtrr_chunk_size_opt(char *p)
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c
index 28c8a23aa42e..a76694bffe86 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.c
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.c
@@ -799,7 +799,7 @@ void mtrr_ap_init(void)
*
* This routine is called in two cases:
*
- * 1. very earily time of software resume, when there absolutely
+ * 1. very early time of software resume, when there absolutely
* isn't mtrr entry changes;
*
* 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 698bb26aeb6e..23001ae03e82 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -192,7 +192,7 @@ static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid)
* Intel(R) Xeon(R) CPU E5-2608L v3 @ 2.00GHz
* Intel(R) Xeon(R) CPU E5-2658A v3 @ 2.20GHz
*
- * Probe by trying to write the first of the L3 cach mask registers
+ * Probe by trying to write the first of the L3 cache mask registers
* and checking that the bits stick. Max CLOSids is always 4 and max cbm length
* is always 20 on hsw server parts. The minimum cache bitmask length
* allowed for HSW server is always 2 bits. Hardcode all of them.
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 7ac31210e452..dbeaa8409313 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -387,7 +387,7 @@ void mon_event_count(void *info)
* adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so
* that:
*
- * current bandwdith(cur_bw) < user specified bandwidth(user_bw)
+ * current bandwidth(cur_bw) < user specified bandwidth(user_bw)
*
* This uses the MBM counters to measure the bandwidth and MBA throttle
* MSRs to control the bandwidth for a particular rdtgrp. It builds on the
@@ -397,7 +397,7 @@ void mon_event_count(void *info)
* timer. Having 1s interval makes the calculation of bandwidth simpler.
*
* Although MBA's goal is to restrict the bandwidth to a maximum, there may
- * be a need to increase the bandwidth to avoid uncecessarily restricting
+ * be a need to increase the bandwidth to avoid unnecessarily restricting
* the L2 <-> L3 traffic.
*
* Since MBA controls the L2 external bandwidth where as MBM measures the
@@ -480,7 +480,7 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
/*
* Delta values are updated dynamically package wise for each
- * rdtgrp everytime the throttle MSR changes value.
+ * rdtgrp every time the throttle MSR changes value.
*
* This is because (1)the increase in bandwidth is not perfectly
* linear and only "approximately" linear even when the hardware
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index e916646adc69..05a89e33fde2 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -1307,7 +1307,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
* If the thread does not get on the CPU for whatever
* reason and the process which sets up the region is
* interrupted then this will leave the thread in runnable
- * state and once it gets on the CPU it will derefence
+ * state and once it gets on the CPU it will dereference
* the cleared, but not freed, plr struct resulting in an
* empty pseudo-locking loop.
*/
@@ -1391,7 +1391,7 @@ out:
* group is removed from user space via a "rmdir" from userspace or the
* unmount of the resctrl filesystem. On removal the resource group does
* not go back to pseudo-locksetup mode before it is removed, instead it is
- * removed directly. There is thus assymmetry with the creation where the
+ * removed directly. There is thus asymmetry with the creation where the
* &struct pseudo_lock_region is removed here while it was not created in
* rdtgroup_pseudo_lock_create().
*
@@ -1458,7 +1458,7 @@ static int pseudo_lock_dev_release(struct inode *inode, struct file *filp)
return 0;
}
-static int pseudo_lock_dev_mremap(struct vm_area_struct *area, unsigned long flags)
+static int pseudo_lock_dev_mremap(struct vm_area_struct *area)
{
/* Not supported */
return -EINVAL;
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index f9190adc52cb..01fd30e7829d 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * User interface for Resource Alloction in Resource Director Technology(RDT)
+ * User interface for Resource Allocation in Resource Director Technology(RDT)
*
* Copyright (C) 2016 Intel Corporation
*
@@ -294,7 +294,7 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of,
/*
* This is safe against resctrl_sched_in() called from __switch_to()
* because __switch_to() is executed with interrupts disabled. A local call
- * from update_closid_rmid() is proteced against __switch_to() because
+ * from update_closid_rmid() is protected against __switch_to() because
* preemption is disabled.
*/
static void update_cpu_closid_rmid(void *info)
@@ -2555,7 +2555,7 @@ static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn,
/*
* This creates a directory mon_data which contains the monitored data.
*
- * mon_data has one directory for each domain whic are named
+ * mon_data has one directory for each domain which are named
* in the format mon_<domain_name>_<domain_id>. For ex: A mon_data
* with L3 domain looks as below:
* ./mon_data:
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 972ec3bfa9c0..21d1f062895a 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -36,6 +36,8 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_CDP_L2, CPUID_ECX, 2, 0x00000010, 2 },
{ X86_FEATURE_MBA, CPUID_EBX, 3, 0x00000010, 0 },
{ X86_FEATURE_PER_THREAD_MBA, CPUID_ECX, 0, 0x00000010, 3 },
+ { X86_FEATURE_SGX1, CPUID_EAX, 0, 0x00000012, 0 },
+ { X86_FEATURE_SGX2, CPUID_EAX, 1, 0x00000012, 0 },
{ X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
{ X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile
index 91d3dc784a29..9c1656779b2a 100644
--- a/arch/x86/kernel/cpu/sgx/Makefile
+++ b/arch/x86/kernel/cpu/sgx/Makefile
@@ -3,3 +3,4 @@ obj-y += \
encl.o \
ioctl.o \
main.o
+obj-$(CONFIG_X86_SGX_KVM) += virt.o
diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c
index 8ce6d8371cfb..aa9b8b868867 100644
--- a/arch/x86/kernel/cpu/sgx/driver.c
+++ b/arch/x86/kernel/cpu/sgx/driver.c
@@ -136,10 +136,6 @@ static const struct file_operations sgx_encl_fops = {
.get_unmapped_area = sgx_get_unmapped_area,
};
-const struct file_operations sgx_provision_fops = {
- .owner = THIS_MODULE,
-};
-
static struct miscdevice sgx_dev_enclave = {
.minor = MISC_DYNAMIC_MINOR,
.name = "sgx_enclave",
@@ -147,13 +143,6 @@ static struct miscdevice sgx_dev_enclave = {
.fops = &sgx_encl_fops,
};
-static struct miscdevice sgx_dev_provision = {
- .minor = MISC_DYNAMIC_MINOR,
- .name = "sgx_provision",
- .nodename = "sgx_provision",
- .fops = &sgx_provision_fops,
-};
-
int __init sgx_drv_init(void)
{
unsigned int eax, ebx, ecx, edx;
@@ -187,11 +176,5 @@ int __init sgx_drv_init(void)
if (ret)
return ret;
- ret = misc_register(&sgx_dev_provision);
- if (ret) {
- misc_deregister(&sgx_dev_enclave);
- return ret;
- }
-
return 0;
}
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 7449ef33f081..3be203297988 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -7,7 +7,7 @@
#include <linux/shmem_fs.h>
#include <linux/suspend.h>
#include <linux/sched/mm.h>
-#include "arch.h"
+#include <asm/sgx.h>
#include "encl.h"
#include "encls.h"
#include "sgx.h"
@@ -78,7 +78,7 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page,
ret = __sgx_encl_eldu(encl_page, epc_page, secs_page);
if (ret) {
- sgx_free_epc_page(epc_page);
+ sgx_encl_free_epc_page(epc_page);
return ERR_PTR(ret);
}
@@ -404,7 +404,7 @@ void sgx_encl_release(struct kref *ref)
if (sgx_unmark_page_reclaimable(entry->epc_page))
continue;
- sgx_free_epc_page(entry->epc_page);
+ sgx_encl_free_epc_page(entry->epc_page);
encl->secs_child_cnt--;
entry->epc_page = NULL;
}
@@ -415,7 +415,7 @@ void sgx_encl_release(struct kref *ref)
xa_destroy(&encl->page_array);
if (!encl->secs_child_cnt && encl->secs.epc_page) {
- sgx_free_epc_page(encl->secs.epc_page);
+ sgx_encl_free_epc_page(encl->secs.epc_page);
encl->secs.epc_page = NULL;
}
@@ -423,7 +423,7 @@ void sgx_encl_release(struct kref *ref)
va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
list);
list_del(&va_page->list);
- sgx_free_epc_page(va_page->epc_page);
+ sgx_encl_free_epc_page(va_page->epc_page);
kfree(va_page);
}
@@ -686,7 +686,7 @@ struct sgx_epc_page *sgx_alloc_va_page(void)
ret = __epa(sgx_get_epc_virt_addr(epc_page));
if (ret) {
WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret);
- sgx_free_epc_page(epc_page);
+ sgx_encl_free_epc_page(epc_page);
return ERR_PTR(-EFAULT);
}
@@ -735,3 +735,24 @@ bool sgx_va_page_full(struct sgx_va_page *va_page)
return slot == SGX_VA_SLOT_COUNT;
}
+
+/**
+ * sgx_encl_free_epc_page - free an EPC page assigned to an enclave
+ * @page: EPC page to be freed
+ *
+ * Free an EPC page assigned to an enclave. It does EREMOVE for the page, and
+ * only upon success, it puts the page back to the free page list. Otherwise, it
+ * gives a WARNING to indicate the page is leaked.
+ */
+void sgx_encl_free_epc_page(struct sgx_epc_page *page)
+{
+ int ret;
+
+ WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);
+
+ ret = __eremove(sgx_get_epc_virt_addr(page));
+ if (WARN_ONCE(ret, EREMOVE_ERROR_MESSAGE, ret, ret))
+ return;
+
+ sgx_free_epc_page(page);
+}
diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h
index d8d30ccbef4c..6e74f85b6264 100644
--- a/arch/x86/kernel/cpu/sgx/encl.h
+++ b/arch/x86/kernel/cpu/sgx/encl.h
@@ -115,5 +115,6 @@ struct sgx_epc_page *sgx_alloc_va_page(void);
unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page);
void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset);
bool sgx_va_page_full(struct sgx_va_page *va_page);
+void sgx_encl_free_epc_page(struct sgx_epc_page *page);
#endif /* _X86_ENCL_H */
diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h
index 443188fe7e70..9b204843b78d 100644
--- a/arch/x86/kernel/cpu/sgx/encls.h
+++ b/arch/x86/kernel/cpu/sgx/encls.h
@@ -11,21 +11,6 @@
#include <asm/traps.h>
#include "sgx.h"
-enum sgx_encls_function {
- ECREATE = 0x00,
- EADD = 0x01,
- EINIT = 0x02,
- EREMOVE = 0x03,
- EDGBRD = 0x04,
- EDGBWR = 0x05,
- EEXTEND = 0x06,
- ELDU = 0x08,
- EBLOCK = 0x09,
- EPA = 0x0A,
- EWB = 0x0B,
- ETRACK = 0x0C,
-};
-
/**
* ENCLS_FAULT_FLAG - flag signifying an ENCLS return code is a trapnr
*
@@ -55,6 +40,19 @@ enum sgx_encls_function {
} while (0); \
}
+/*
+ * encls_faulted() - Check if an ENCLS leaf faulted given an error code
+ * @ret: the return value of an ENCLS leaf function call
+ *
+ * Return:
+ * - true: ENCLS leaf faulted.
+ * - false: Otherwise.
+ */
+static inline bool encls_faulted(int ret)
+{
+ return ret & ENCLS_FAULT_FLAG;
+}
+
/**
* encls_failed() - Check if an ENCLS function failed
* @ret: the return value of an ENCLS function call
@@ -65,7 +63,7 @@ enum sgx_encls_function {
*/
static inline bool encls_failed(int ret)
{
- if (ret & ENCLS_FAULT_FLAG)
+ if (encls_faulted(ret))
return ENCLS_TRAPNR(ret) != X86_TRAP_PF;
return !!ret;
diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c
index 90a5caf76939..83df20e3e633 100644
--- a/arch/x86/kernel/cpu/sgx/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/ioctl.c
@@ -2,6 +2,7 @@
/* Copyright(c) 2016-20 Intel Corporation. */
#include <asm/mman.h>
+#include <asm/sgx.h>
#include <linux/mman.h>
#include <linux/delay.h>
#include <linux/file.h>
@@ -47,7 +48,7 @@ static void sgx_encl_shrink(struct sgx_encl *encl, struct sgx_va_page *va_page)
encl->page_cnt--;
if (va_page) {
- sgx_free_epc_page(va_page->epc_page);
+ sgx_encl_free_epc_page(va_page->epc_page);
list_del(&va_page->list);
kfree(va_page);
}
@@ -117,7 +118,7 @@ static int sgx_encl_create(struct sgx_encl *encl, struct sgx_secs *secs)
return 0;
err_out:
- sgx_free_epc_page(encl->secs.epc_page);
+ sgx_encl_free_epc_page(encl->secs.epc_page);
encl->secs.epc_page = NULL;
err_out_backing:
@@ -365,7 +366,7 @@ err_out_unlock:
mmap_read_unlock(current->mm);
err_out_free:
- sgx_free_epc_page(epc_page);
+ sgx_encl_free_epc_page(epc_page);
kfree(encl_page);
return ret;
@@ -495,7 +496,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
void *token)
{
u64 mrsigner[4];
- int i, j, k;
+ int i, j;
void *addr;
int ret;
@@ -544,8 +545,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
preempt_disable();
- for (k = 0; k < 4; k++)
- wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + k, mrsigner[k]);
+ sgx_update_lepubkeyhash(mrsigner);
ret = __einit(sigstruct, token, addr);
@@ -568,7 +568,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct,
}
}
- if (ret & ENCLS_FAULT_FLAG) {
+ if (encls_faulted(ret)) {
if (encls_failed(ret))
ENCLS_WARN(ret, "EINIT");
@@ -604,7 +604,6 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg)
{
struct sgx_sigstruct *sigstruct;
struct sgx_enclave_init init_arg;
- struct page *initp_page;
void *token;
int ret;
@@ -615,11 +614,15 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg)
if (copy_from_user(&init_arg, arg, sizeof(init_arg)))
return -EFAULT;
- initp_page = alloc_page(GFP_KERNEL);
- if (!initp_page)
+ /*
+ * 'sigstruct' must be on a page boundary and 'token' on a 512 byte
+ * boundary. kmalloc() will give this alignment when allocating
+ * PAGE_SIZE bytes.
+ */
+ sigstruct = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!sigstruct)
return -ENOMEM;
- sigstruct = kmap(initp_page);
token = (void *)((unsigned long)sigstruct + PAGE_SIZE / 2);
memset(token, 0, SGX_LAUNCH_TOKEN_SIZE);
@@ -645,8 +648,7 @@ static long sgx_ioc_enclave_init(struct sgx_encl *encl, void __user *arg)
ret = sgx_encl_init(encl, sigstruct, token);
out:
- kunmap(initp_page);
- __free_page(initp_page);
+ kfree(sigstruct);
return ret;
}
@@ -665,24 +667,11 @@ out:
static long sgx_ioc_enclave_provision(struct sgx_encl *encl, void __user *arg)
{
struct sgx_enclave_provision params;
- struct file *file;
if (copy_from_user(&params, arg, sizeof(params)))
return -EFAULT;
- file = fget(params.fd);
- if (!file)
- return -EINVAL;
-
- if (file->f_op != &sgx_provision_fops) {
- fput(file);
- return -EINVAL;
- }
-
- encl->attributes_mask |= SGX_ATTR_PROVISIONKEY;
-
- fput(file);
- return 0;
+ return sgx_set_attribute(&encl->attributes_mask, params.fd);
}
long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 8df81a3ed945..63d3de02bbcc 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -1,14 +1,17 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2016-20 Intel Corporation. */
+#include <linux/file.h>
#include <linux/freezer.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
+#include <linux/miscdevice.h>
#include <linux/pagemap.h>
#include <linux/ratelimit.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
+#include <asm/sgx.h>
#include "driver.h"
#include "encl.h"
#include "encls.h"
@@ -23,42 +26,58 @@ static DECLARE_WAIT_QUEUE_HEAD(ksgxd_waitq);
* with sgx_reclaimer_lock acquired.
*/
static LIST_HEAD(sgx_active_page_list);
-
static DEFINE_SPINLOCK(sgx_reclaimer_lock);
+/* The free page list lock protected variables prepend the lock. */
+static unsigned long sgx_nr_free_pages;
+
+/* Nodes with one or more EPC sections. */
+static nodemask_t sgx_numa_mask;
+
+/*
+ * Array with one list_head for each possible NUMA node. Each
+ * list contains all the sgx_epc_section's which are on that
+ * node.
+ */
+static struct sgx_numa_node *sgx_numa_nodes;
+
+static LIST_HEAD(sgx_dirty_page_list);
+
/*
- * Reset dirty EPC pages to uninitialized state. Laundry can be left with SECS
- * pages whose child pages blocked EREMOVE.
+ * Reset post-kexec EPC pages to the uninitialized state. The pages are removed
+ * from the input list, and made available for the page allocator. SECS pages
+ * that precede their children in the input list are left intact.
*/
-static void sgx_sanitize_section(struct sgx_epc_section *section)
+static void __sgx_sanitize_pages(struct list_head *dirty_page_list)
{
struct sgx_epc_page *page;
LIST_HEAD(dirty);
int ret;
- /* init_laundry_list is thread-local, no need for a lock: */
- while (!list_empty(&section->init_laundry_list)) {
+ /* dirty_page_list is thread-local, no need for a lock: */
+ while (!list_empty(dirty_page_list)) {
if (kthread_should_stop())
return;
- /* needed for access to ->page_list: */
- spin_lock(&section->lock);
-
- page = list_first_entry(&section->init_laundry_list,
- struct sgx_epc_page, list);
+ page = list_first_entry(dirty_page_list, struct sgx_epc_page, list);
ret = __eremove(sgx_get_epc_virt_addr(page));
- if (!ret)
- list_move(&page->list, &section->page_list);
- else
+ if (!ret) {
+ /*
+ * page is now sanitized. Make it available via the SGX
+ * page allocator:
+ */
+ list_del(&page->list);
+ sgx_free_epc_page(page);
+ } else {
+ /* The page is not yet clean - move to the dirty list. */
list_move_tail(&page->list, &dirty);
-
- spin_unlock(&section->lock);
+ }
cond_resched();
}
- list_splice(&dirty, &section->init_laundry_list);
+ list_splice(&dirty, dirty_page_list);
}
static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page)
@@ -195,10 +214,10 @@ static const cpumask_t *sgx_encl_ewb_cpumask(struct sgx_encl *encl)
/*
* Swap page to the regular memory transformed to the blocked state by using
- * EBLOCK, which means that it can no loger be referenced (no new TLB entries).
+ * EBLOCK, which means that it can no longer be referenced (no new TLB entries).
*
* The first trial just tries to write the page assuming that some other thread
- * has reset the count for threads inside the enlave by using ETRACK, and
+ * has reset the count for threads inside the enclave by using ETRACK, and
* previous thread count has been zeroed out. The second trial calls ETRACK
* before EWB. If that fails we kick all the HW threads out, and then do EWB,
* which should be guaranteed the succeed.
@@ -278,7 +297,7 @@ static void sgx_reclaimer_write(struct sgx_epc_page *epc_page,
sgx_encl_ewb(encl->secs.epc_page, &secs_backing);
- sgx_free_epc_page(encl->secs.epc_page);
+ sgx_encl_free_epc_page(encl->secs.epc_page);
encl->secs.epc_page = NULL;
sgx_encl_put_backing(&secs_backing, true);
@@ -308,6 +327,7 @@ static void sgx_reclaim_pages(void)
struct sgx_epc_section *section;
struct sgx_encl_page *encl_page;
struct sgx_epc_page *epc_page;
+ struct sgx_numa_node *node;
pgoff_t page_index;
int cnt = 0;
int ret;
@@ -379,50 +399,33 @@ skip:
epc_page->flags &= ~SGX_EPC_PAGE_RECLAIMER_TRACKED;
section = &sgx_epc_sections[epc_page->section];
- spin_lock(&section->lock);
- list_add_tail(&epc_page->list, &section->page_list);
- section->free_cnt++;
- spin_unlock(&section->lock);
- }
-}
-
-static unsigned long sgx_nr_free_pages(void)
-{
- unsigned long cnt = 0;
- int i;
-
- for (i = 0; i < sgx_nr_epc_sections; i++)
- cnt += sgx_epc_sections[i].free_cnt;
+ node = section->node;
- return cnt;
+ spin_lock(&node->lock);
+ list_add_tail(&epc_page->list, &node->free_page_list);
+ sgx_nr_free_pages++;
+ spin_unlock(&node->lock);
+ }
}
static bool sgx_should_reclaim(unsigned long watermark)
{
- return sgx_nr_free_pages() < watermark &&
- !list_empty(&sgx_active_page_list);
+ return sgx_nr_free_pages < watermark && !list_empty(&sgx_active_page_list);
}
static int ksgxd(void *p)
{
- int i;
-
set_freezable();
/*
* Sanitize pages in order to recover from kexec(). The 2nd pass is
* required for SECS pages, whose child pages blocked EREMOVE.
*/
- for (i = 0; i < sgx_nr_epc_sections; i++)
- sgx_sanitize_section(&sgx_epc_sections[i]);
-
- for (i = 0; i < sgx_nr_epc_sections; i++) {
- sgx_sanitize_section(&sgx_epc_sections[i]);
+ __sgx_sanitize_pages(&sgx_dirty_page_list);
+ __sgx_sanitize_pages(&sgx_dirty_page_list);
- /* Should never happen. */
- if (!list_empty(&sgx_epc_sections[i].init_laundry_list))
- WARN(1, "EPC section %d has unsanitized pages.\n", i);
- }
+ /* sanity check: */
+ WARN_ON(!list_empty(&sgx_dirty_page_list));
while (!kthread_should_stop()) {
if (try_to_freeze())
@@ -454,45 +457,56 @@ static bool __init sgx_page_reclaimer_init(void)
return true;
}
-static struct sgx_epc_page *__sgx_alloc_epc_page_from_section(struct sgx_epc_section *section)
+static struct sgx_epc_page *__sgx_alloc_epc_page_from_node(int nid)
{
- struct sgx_epc_page *page;
+ struct sgx_numa_node *node = &sgx_numa_nodes[nid];
+ struct sgx_epc_page *page = NULL;
- spin_lock(&section->lock);
+ spin_lock(&node->lock);
- if (list_empty(&section->page_list)) {
- spin_unlock(&section->lock);
+ if (list_empty(&node->free_page_list)) {
+ spin_unlock(&node->lock);
return NULL;
}
- page = list_first_entry(&section->page_list, struct sgx_epc_page, list);
+ page = list_first_entry(&node->free_page_list, struct sgx_epc_page, list);
list_del_init(&page->list);
- section->free_cnt--;
+ sgx_nr_free_pages--;
+
+ spin_unlock(&node->lock);
- spin_unlock(&section->lock);
return page;
}
/**
* __sgx_alloc_epc_page() - Allocate an EPC page
*
- * Iterate through EPC sections and borrow a free EPC page to the caller. When a
- * page is no longer needed it must be released with sgx_free_epc_page().
+ * Iterate through NUMA nodes and reserve a free EPC page for the caller. Start
+ * from the NUMA node where the caller is executing.
*
* Return:
- * an EPC page,
- * -errno on error
+ * - an EPC page: A borrowed EPC page was available.
+ * - NULL: Out of EPC pages.
*/
struct sgx_epc_page *__sgx_alloc_epc_page(void)
{
- struct sgx_epc_section *section;
struct sgx_epc_page *page;
- int i;
+ int nid_of_current = numa_node_id();
+ int nid = nid_of_current;
- for (i = 0; i < sgx_nr_epc_sections; i++) {
- section = &sgx_epc_sections[i];
+ if (node_isset(nid_of_current, sgx_numa_mask)) {
+ page = __sgx_alloc_epc_page_from_node(nid_of_current);
+ if (page)
+ return page;
+ }
+
+ /* Fall back to the non-local NUMA nodes: */
+ while (true) {
+ nid = next_node_in(nid, sgx_numa_mask);
+ if (nid == nid_of_current)
+ break;
- page = __sgx_alloc_epc_page_from_section(section);
+ page = __sgx_alloc_epc_page_from_node(nid);
if (page)
return page;
}
@@ -598,23 +612,22 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
* sgx_free_epc_page() - Free an EPC page
* @page: an EPC page
*
- * Call EREMOVE for an EPC page and insert it back to the list of free pages.
+ * Put the EPC page back to the list of free pages. It's the caller's
+ * responsibility to make sure that the page is in the uninitialized state. In other
+ * words, do EREMOVE, EWB or whatever operation is necessary before calling
+ * this function.
*/
void sgx_free_epc_page(struct sgx_epc_page *page)
{
struct sgx_epc_section *section = &sgx_epc_sections[page->section];
- int ret;
+ struct sgx_numa_node *node = section->node;
- WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);
+ spin_lock(&node->lock);
- ret = __eremove(sgx_get_epc_virt_addr(page));
- if (WARN_ONCE(ret, "EREMOVE returned %d (0x%x)", ret, ret))
- return;
+ list_add_tail(&page->list, &node->free_page_list);
+ sgx_nr_free_pages++;
- spin_lock(&section->lock);
- list_add_tail(&page->list, &section->page_list);
- section->free_cnt++;
- spin_unlock(&section->lock);
+ spin_unlock(&node->lock);
}
static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
@@ -635,18 +648,14 @@ static bool __init sgx_setup_epc_section(u64 phys_addr, u64 size,
}
section->phys_addr = phys_addr;
- spin_lock_init(&section->lock);
- INIT_LIST_HEAD(&section->page_list);
- INIT_LIST_HEAD(&section->init_laundry_list);
for (i = 0; i < nr_pages; i++) {
section->pages[i].section = index;
section->pages[i].flags = 0;
section->pages[i].owner = NULL;
- list_add_tail(&section->pages[i].list, &section->init_laundry_list);
+ list_add_tail(&section->pages[i].list, &sgx_dirty_page_list);
}
- section->free_cnt = nr_pages;
return true;
}
@@ -665,8 +674,13 @@ static bool __init sgx_page_cache_init(void)
{
u32 eax, ebx, ecx, edx, type;
u64 pa, size;
+ int nid;
int i;
+ sgx_numa_nodes = kmalloc_array(num_possible_nodes(), sizeof(*sgx_numa_nodes), GFP_KERNEL);
+ if (!sgx_numa_nodes)
+ return false;
+
for (i = 0; i < ARRAY_SIZE(sgx_epc_sections); i++) {
cpuid_count(SGX_CPUID, i + SGX_CPUID_EPC, &eax, &ebx, &ecx, &edx);
@@ -689,6 +703,21 @@ static bool __init sgx_page_cache_init(void)
break;
}
+ nid = numa_map_to_online_node(phys_to_target_node(pa));
+ if (nid == NUMA_NO_NODE) {
+ /* The physical address is already printed above. */
+ pr_warn(FW_BUG "Unable to map EPC section to online node. Fallback to the NUMA node 0.\n");
+ nid = 0;
+ }
+
+ if (!node_isset(nid, sgx_numa_mask)) {
+ spin_lock_init(&sgx_numa_nodes[nid].lock);
+ INIT_LIST_HEAD(&sgx_numa_nodes[nid].free_page_list);
+ node_set(nid, sgx_numa_mask);
+ }
+
+ sgx_epc_sections[i].node = &sgx_numa_nodes[nid];
+
sgx_nr_epc_sections++;
}
@@ -700,6 +729,67 @@ static bool __init sgx_page_cache_init(void)
return true;
}
+/*
+ * Update the SGX_LEPUBKEYHASH MSRs to the values specified by caller.
+ * Bare-metal driver requires to update them to hash of enclave's signer
+ * before EINIT. KVM needs to update them to guest's virtual MSR values
+ * before doing EINIT from guest.
+ */
+void sgx_update_lepubkeyhash(u64 *lepubkeyhash)
+{
+ int i;
+
+ WARN_ON_ONCE(preemptible());
+
+ for (i = 0; i < 4; i++)
+ wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]);
+}
+
+const struct file_operations sgx_provision_fops = {
+ .owner = THIS_MODULE,
+};
+
+static struct miscdevice sgx_dev_provision = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "sgx_provision",
+ .nodename = "sgx_provision",
+ .fops = &sgx_provision_fops,
+};
+
+/**
+ * sgx_set_attribute() - Update allowed attributes given file descriptor
+ * @allowed_attributes: Pointer to allowed enclave attributes
+ * @attribute_fd: File descriptor for specific attribute
+ *
+ * Append enclave attribute indicated by file descriptor to allowed
+ * attributes. Currently only SGX_ATTR_PROVISIONKEY indicated by
+ * /dev/sgx_provision is supported.
+ *
+ * Return:
+ * -0: SGX_ATTR_PROVISIONKEY is appended to allowed_attributes
+ * -EINVAL: Invalid, or not supported file descriptor
+ */
+int sgx_set_attribute(unsigned long *allowed_attributes,
+ unsigned int attribute_fd)
+{
+ struct file *file;
+
+ file = fget(attribute_fd);
+ if (!file)
+ return -EINVAL;
+
+ if (file->f_op != &sgx_provision_fops) {
+ fput(file);
+ return -EINVAL;
+ }
+
+ *allowed_attributes |= SGX_ATTR_PROVISIONKEY;
+
+ fput(file);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sgx_set_attribute);
+
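To show where sgx_set_attribute() is reached from, a hedged user-space sketch: the enclave owner opens /dev/sgx_provision and passes that descriptor to SGX_IOC_ENCLAVE_PROVISION on the enclave fd. The struct and ioctl names are taken from the SGX uapi header; the include path and the error handling are simplified assumptions.

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <asm/sgx.h>	/* SGX_IOC_ENCLAVE_PROVISION, struct sgx_enclave_provision */

/* Allow an already-created enclave (enclave_fd) to use the provision key. */
static int allow_provision_key(int enclave_fd)
{
	struct sgx_enclave_provision params = { 0 };
	int provision_fd, ret;

	provision_fd = open("/dev/sgx_provision", O_RDONLY);
	if (provision_fd < 0)
		return -1;

	params.fd = provision_fd;

	/*
	 * The kernel checks that params.fd really refers to
	 * /dev/sgx_provision (sgx_provision_fops) and, if so, adds
	 * SGX_ATTR_PROVISIONKEY to the enclave's allowed attributes.
	 */
	ret = ioctl(enclave_fd, SGX_IOC_ENCLAVE_PROVISION, &params);

	close(provision_fd);
	return ret;
}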
static int __init sgx_init(void)
{
int ret;
@@ -716,12 +806,28 @@ static int __init sgx_init(void)
goto err_page_cache;
}
- ret = sgx_drv_init();
+ ret = misc_register(&sgx_dev_provision);
if (ret)
goto err_kthread;
+ /*
+ * Always try to initialize the native *and* KVM drivers.
+ * The KVM driver is less picky than the native one and
+ * can function if the native one is not supported on the
+ * current system or fails to initialize.
+ *
+ * Error out only if both fail to initialize.
+ */
+ ret = sgx_drv_init();
+
+ if (sgx_vepc_init() && ret)
+ goto err_provision;
+
return 0;
+err_provision:
+ misc_deregister(&sgx_dev_provision);
+
err_kthread:
kthread_stop(ksgxd_tsk);
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
index 5fa42d143feb..4628acec0009 100644
--- a/arch/x86/kernel/cpu/sgx/sgx.h
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -8,11 +8,15 @@
#include <linux/rwsem.h>
#include <linux/types.h>
#include <asm/asm.h>
-#include "arch.h"
+#include <asm/sgx.h>
#undef pr_fmt
#define pr_fmt(fmt) "sgx: " fmt
+#define EREMOVE_ERROR_MESSAGE \
+ "EREMOVE returned %d (0x%x) and an EPC page was leaked. SGX may become unusable. " \
+ "Refer to Documentation/x86/sgx.rst for more information."
+
#define SGX_MAX_EPC_SECTIONS 8
#define SGX_EEXTEND_BLOCK_SIZE 256
#define SGX_NR_TO_SCAN 16
@@ -30,28 +34,25 @@ struct sgx_epc_page {
};
/*
+ * Contains the tracking data for NUMA nodes having EPC pages. Most importantly,
+ * the free page list local to the node is stored here.
+ */
+struct sgx_numa_node {
+ struct list_head free_page_list;
+ spinlock_t lock;
+};
+
+/*
* The firmware can define multiple chunks of EPC to the different areas of the
* physical memory e.g. for memory areas of the each node. This structure is
* used to store EPC pages for one EPC section and virtual memory area where
* the pages have been mapped.
- *
- * 'lock' must be held before accessing 'page_list' or 'free_cnt'.
*/
struct sgx_epc_section {
unsigned long phys_addr;
void *virt_addr;
struct sgx_epc_page *pages;
-
- spinlock_t lock;
- struct list_head page_list;
- unsigned long free_cnt;
-
- /*
- * Pages which need EREMOVE run on them before they can be
- * used. Only safe to be accessed in ksgxd and init code.
- * Not protected by locks.
- */
- struct list_head init_laundry_list;
+ struct sgx_numa_node *node;
};
extern struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
@@ -83,4 +84,15 @@ void sgx_mark_page_reclaimable(struct sgx_epc_page *page);
int sgx_unmark_page_reclaimable(struct sgx_epc_page *page);
struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim);
+#ifdef CONFIG_X86_SGX_KVM
+int __init sgx_vepc_init(void);
+#else
+static inline int __init sgx_vepc_init(void)
+{
+ return -ENODEV;
+}
+#endif
+
+void sgx_update_lepubkeyhash(u64 *lepubkeyhash);
+
#endif /* _X86_SGX_H */
diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
new file mode 100644
index 000000000000..6ad165a5c0cc
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -0,0 +1,376 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Device driver to expose SGX enclave memory to KVM guests.
+ *
+ * Copyright(c) 2021 Intel Corporation.
+ */
+
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/slab.h>
+#include <linux/xarray.h>
+#include <asm/sgx.h>
+#include <uapi/asm/sgx.h>
+
+#include "encls.h"
+#include "sgx.h"
+
+struct sgx_vepc {
+ struct xarray page_array;
+ struct mutex lock;
+};
+
+/*
+ * Temporary SECS pages that cannot be EREMOVE'd due to having children in other
+ * virtual EPC instances, and the lock that protects the list.
+ */
+static struct mutex zombie_secs_pages_lock;
+static struct list_head zombie_secs_pages;
+
+static int __sgx_vepc_fault(struct sgx_vepc *vepc,
+ struct vm_area_struct *vma, unsigned long addr)
+{
+ struct sgx_epc_page *epc_page;
+ unsigned long index, pfn;
+ int ret;
+
+ WARN_ON(!mutex_is_locked(&vepc->lock));
+
+ /* Calculate index of EPC page in virtual EPC's page_array */
+ index = vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start);
+
+ epc_page = xa_load(&vepc->page_array, index);
+ if (epc_page)
+ return 0;
+
+ epc_page = sgx_alloc_epc_page(vepc, false);
+ if (IS_ERR(epc_page))
+ return PTR_ERR(epc_page);
+
+ ret = xa_err(xa_store(&vepc->page_array, index, epc_page, GFP_KERNEL));
+ if (ret)
+ goto err_free;
+
+ pfn = PFN_DOWN(sgx_get_epc_phys_addr(epc_page));
+
+ ret = vmf_insert_pfn(vma, addr, pfn);
+ if (ret != VM_FAULT_NOPAGE) {
+ ret = -EFAULT;
+ goto err_delete;
+ }
+
+ return 0;
+
+err_delete:
+ xa_erase(&vepc->page_array, index);
+err_free:
+ sgx_free_epc_page(epc_page);
+ return ret;
+}
+
+static vm_fault_t sgx_vepc_fault(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct sgx_vepc *vepc = vma->vm_private_data;
+ int ret;
+
+ mutex_lock(&vepc->lock);
+ ret = __sgx_vepc_fault(vepc, vma, vmf->address);
+ mutex_unlock(&vepc->lock);
+
+ if (!ret)
+ return VM_FAULT_NOPAGE;
+
+ if (ret == -EBUSY && (vmf->flags & FAULT_FLAG_ALLOW_RETRY)) {
+ mmap_read_unlock(vma->vm_mm);
+ return VM_FAULT_RETRY;
+ }
+
+ return VM_FAULT_SIGBUS;
+}
+
+static const struct vm_operations_struct sgx_vepc_vm_ops = {
+ .fault = sgx_vepc_fault,
+};
+
+static int sgx_vepc_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct sgx_vepc *vepc = file->private_data;
+
+ if (!(vma->vm_flags & VM_SHARED))
+ return -EINVAL;
+
+ vma->vm_ops = &sgx_vepc_vm_ops;
+ /* Don't copy VMA in fork() */
+ vma->vm_flags |= VM_PFNMAP | VM_IO | VM_DONTDUMP | VM_DONTCOPY;
+ vma->vm_private_data = vepc;
+
+ return 0;
+}
+
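A sketch of how a VMM is expected to consume the device (illustrative assumptions, not part of the patch): each virtual EPC section is simply an open() plus a shared mmap() of /dev/sgx_vepc, and the pages are populated lazily by sgx_vepc_fault() above when the mapping is touched or installed into the guest.

#include <fcntl.h>
#include <stddef.h>
#include <unistd.h>
#include <sys/mman.h>

/* Map one 'size'-byte virtual EPC section for a guest. */
static void *map_guest_epc(size_t size)
{
	void *epc;
	int fd;

	fd = open("/dev/sgx_vepc", O_RDWR);
	if (fd < 0)
		return NULL;

	/* MAP_SHARED is mandatory; sgx_vepc_mmap() rejects private mappings. */
	epc = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	/* The mapping keeps the vepc instance alive; the fd itself may be closed. */
	close(fd);

	return epc == MAP_FAILED ? NULL : epc;
}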
+static int sgx_vepc_free_page(struct sgx_epc_page *epc_page)
+{
+ int ret;
+
+ /*
+ * Take a previously guest-owned EPC page and return it to the
+ * general EPC page pool.
+ *
+ * Guests can not be trusted to have left this page in a good
+ * state, so run EREMOVE on the page unconditionally. In the
+ * case that a guest properly EREMOVE'd this page, a superfluous
+ * EREMOVE is harmless.
+ */
+ ret = __eremove(sgx_get_epc_virt_addr(epc_page));
+ if (ret) {
+ /*
+ * Only SGX_CHILD_PRESENT is expected, which happens when
+ * EREMOVE'ing an SECS page that still has children. This can
+ * be handled by EREMOVE'ing the SECS again after all pages in
+ * the virtual EPC have been EREMOVE'd. See the comments below in
+ * sgx_vepc_release().
+ *
+ * The user of virtual EPC (KVM) needs to guarantee that no
+ * logical processor is still running in the enclave in the guest,
+ * otherwise EREMOVE will get SGX_ENCLAVE_ACT which cannot be
+ * handled here.
+ */
+ WARN_ONCE(ret != SGX_CHILD_PRESENT, EREMOVE_ERROR_MESSAGE,
+ ret, ret);
+ return ret;
+ }
+
+ sgx_free_epc_page(epc_page);
+
+ return 0;
+}
+
+static int sgx_vepc_release(struct inode *inode, struct file *file)
+{
+ struct sgx_vepc *vepc = file->private_data;
+ struct sgx_epc_page *epc_page, *tmp, *entry;
+ unsigned long index;
+
+ LIST_HEAD(secs_pages);
+
+ xa_for_each(&vepc->page_array, index, entry) {
+ /*
+ * Remove all normal, child pages. sgx_vepc_free_page()
+ * will fail if EREMOVE fails, but this is OK and expected on
+ * SECS pages. Those can only be EREMOVE'd *after* all their
+ * child pages. Retries below will clean them up.
+ */
+ if (sgx_vepc_free_page(entry))
+ continue;
+
+ xa_erase(&vepc->page_array, index);
+ }
+
+ /*
+ * Retry EREMOVE'ing pages. This will clean up any SECS pages that
+ * only had children in this 'epc' area.
+ */
+ xa_for_each(&vepc->page_array, index, entry) {
+ epc_page = entry;
+ /*
+ * An EREMOVE failure here means that the SECS page still
+ * has children. But, since all children in this 'sgx_vepc'
+ * have been removed, the SECS page must have a child on
+ * another instance.
+ */
+ if (sgx_vepc_free_page(epc_page))
+ list_add_tail(&epc_page->list, &secs_pages);
+
+ xa_erase(&vepc->page_array, index);
+ }
+
+ /*
+ * SECS pages are "pinned" by child pages, and "unpinned" once all
+ * children have been EREMOVE'd. A child page in this instance
+ * may have pinned an SECS page encountered in an earlier release(),
+ * creating a zombie. Since some children were EREMOVE'd above,
+ * try to EREMOVE all zombies in the hopes that one was unpinned.
+ */
+ mutex_lock(&zombie_secs_pages_lock);
+ list_for_each_entry_safe(epc_page, tmp, &zombie_secs_pages, list) {
+ /*
+ * Speculatively remove the page from the list of zombies,
+ * if the page is successfully EREMOVE'd it will be added to
+ * the list of free pages. If EREMOVE fails, throw the page
+ * on the local list, which will be spliced on at the end.
+ */
+ list_del(&epc_page->list);
+
+ if (sgx_vepc_free_page(epc_page))
+ list_add_tail(&epc_page->list, &secs_pages);
+ }
+
+ if (!list_empty(&secs_pages))
+ list_splice_tail(&secs_pages, &zombie_secs_pages);
+ mutex_unlock(&zombie_secs_pages_lock);
+
+ kfree(vepc);
+
+ return 0;
+}
+
+static int sgx_vepc_open(struct inode *inode, struct file *file)
+{
+ struct sgx_vepc *vepc;
+
+ vepc = kzalloc(sizeof(struct sgx_vepc), GFP_KERNEL);
+ if (!vepc)
+ return -ENOMEM;
+ mutex_init(&vepc->lock);
+ xa_init(&vepc->page_array);
+
+ file->private_data = vepc;
+
+ return 0;
+}
+
+static const struct file_operations sgx_vepc_fops = {
+ .owner = THIS_MODULE,
+ .open = sgx_vepc_open,
+ .release = sgx_vepc_release,
+ .mmap = sgx_vepc_mmap,
+};
+
+static struct miscdevice sgx_vepc_dev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "sgx_vepc",
+ .nodename = "sgx_vepc",
+ .fops = &sgx_vepc_fops,
+};
+
+int __init sgx_vepc_init(void)
+{
+ /* SGX virtualization requires KVM to work */
+ if (!cpu_feature_enabled(X86_FEATURE_VMX))
+ return -ENODEV;
+
+ INIT_LIST_HEAD(&zombie_secs_pages);
+ mutex_init(&zombie_secs_pages_lock);
+
+ return misc_register(&sgx_vepc_dev);
+}
+
+/**
+ * sgx_virt_ecreate() - Run ECREATE on behalf of guest
+ * @pageinfo: Pointer to PAGEINFO structure
+ * @secs: Userspace pointer to SECS page
+ * @trapnr: trap number injected to guest in case of ECREATE error
+ *
+ * Run ECREATE on behalf of guest after KVM traps ECREATE for the purpose
+ * of enforcing policies of guest's enclaves, and return the trap number
+ * which should be injected to guest in case of any ECREATE error.
+ *
+ * Return:
+ * - 0: ECREATE was successful.
+ * - <0: on error.
+ */
+int sgx_virt_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs,
+ int *trapnr)
+{
+ int ret;
+
+ /*
+ * @secs is an untrusted, userspace-provided address. It comes from
+ * KVM and is assumed to be a valid pointer which points somewhere in
+ * userspace. This can fault and call SGX or other fault handlers when
+ * the userspace mapping for @secs doesn't exist.
+ *
+ * Add a WARN() to make sure @secs is already a valid userspace pointer
+ * from the caller (KVM), who should already have handled the invalid
+ * pointer case (for instance, one made by a malicious guest). All other checks,
+ * such as alignment of @secs, are deferred to ENCLS itself.
+ */
+ if (WARN_ON_ONCE(!access_ok(secs, PAGE_SIZE)))
+ return -EINVAL;
+
+ __uaccess_begin();
+ ret = __ecreate(pageinfo, (void *)secs);
+ __uaccess_end();
+
+ if (encls_faulted(ret)) {
+ *trapnr = ENCLS_TRAPNR(ret);
+ return -EFAULT;
+ }
+
+ /* ECREATE doesn't return an error code, it faults or succeeds. */
+ WARN_ON_ONCE(ret);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sgx_virt_ecreate);
+
+static int __sgx_virt_einit(void __user *sigstruct, void __user *token,
+ void __user *secs)
+{
+ int ret;
+
+ /*
+ * Make sure all userspace pointers from caller (KVM) are valid.
+ * All other checks deferred to ENCLS itself. Also see comment
+ * for @secs in sgx_virt_ecreate().
+ */
+#define SGX_EINITTOKEN_SIZE 304
+ if (WARN_ON_ONCE(!access_ok(sigstruct, sizeof(struct sgx_sigstruct)) ||
+ !access_ok(token, SGX_EINITTOKEN_SIZE) ||
+ !access_ok(secs, PAGE_SIZE)))
+ return -EINVAL;
+
+ __uaccess_begin();
+ ret = __einit((void *)sigstruct, (void *)token, (void *)secs);
+ __uaccess_end();
+
+ return ret;
+}
+
+/**
+ * sgx_virt_einit() - Run EINIT on behalf of guest
+ * @sigstruct: Userspace pointer to SIGSTRUCT structure
+ * @token: Userspace pointer to EINITTOKEN structure
+ * @secs: Userspace pointer to SECS page
+ * @lepubkeyhash: Pointer to guest's *virtual* SGX_LEPUBKEYHASH MSR values
+ * @trapnr: trap number injected to guest in case of EINIT error
+ *
+ * Run EINIT on behalf of guest after KVM traps EINIT. If SGX_LC is available
+ * in the host, the SGX driver may rewrite the hardware values at will, therefore KVM
+ * needs to update hardware values to guest's virtual MSR values in order to
+ * ensure EINIT is executed with expected hardware values.
+ *
+ * Return:
+ * - 0: EINIT was successful.
+ * - <0: on error.
+ */
+int sgx_virt_einit(void __user *sigstruct, void __user *token,
+ void __user *secs, u64 *lepubkeyhash, int *trapnr)
+{
+ int ret;
+
+ if (!cpu_feature_enabled(X86_FEATURE_SGX_LC)) {
+ ret = __sgx_virt_einit(sigstruct, token, secs);
+ } else {
+ preempt_disable();
+
+ sgx_update_lepubkeyhash(lepubkeyhash);
+
+ ret = __sgx_virt_einit(sigstruct, token, secs);
+ preempt_enable();
+ }
+
+ /* Propagate up the error from the WARN_ON_ONCE in __sgx_virt_einit() */
+ if (ret == -EINVAL)
+ return ret;
+
+ if (encls_faulted(ret)) {
+ *trapnr = ENCLS_TRAPNR(ret);
+ return -EFAULT;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sgx_virt_einit);
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index 8678864ce712..132a2de44d2f 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -30,7 +30,7 @@ EXPORT_SYMBOL(__max_die_per_package);
#ifdef CONFIG_SMP
/*
- * Check if given CPUID extended toplogy "leaf" is implemented
+ * Check if given CPUID extended topology "leaf" is implemented
*/
static int check_extended_topology_leaf(int leaf)
{
@@ -44,7 +44,7 @@ static int check_extended_topology_leaf(int leaf)
return 0;
}
/*
- * Return best CPUID Extended Toplogy Leaf supported
+ * Return best CPUID Extended Topology Leaf supported
*/
static int detect_extended_topology_leaf(struct cpuinfo_x86 *c)
{
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index c6ede3b3d302..c04b933f48d3 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -27,6 +27,7 @@
#include <linux/clocksource.h>
#include <linux/cpu.h>
#include <linux/reboot.h>
+#include <linux/static_call.h>
#include <asm/div64.h>
#include <asm/x86_init.h>
#include <asm/hypervisor.h>
@@ -336,11 +337,11 @@ static void __init vmware_paravirt_ops_setup(void)
vmware_cyc2ns_setup();
if (vmw_sched_clock)
- pv_ops.time.sched_clock = vmware_sched_clock;
+ paravirt_set_sched_clock(vmware_sched_clock);
if (vmware_is_stealclock_available()) {
has_steal_clock = true;
- pv_ops.time.steal_clock = vmware_steal_clock;
+ static_call_update(pv_steal_clock, vmware_steal_clock);
/* We use reboot notifier only to disable steal clock */
register_reboot_notifier(&vmware_pv_reboot_nb);
@@ -378,6 +379,8 @@ static void __init vmware_set_capabilities(void)
{
setup_force_cpu_cap(X86_FEATURE_CONSTANT_TSC);
setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+ if (vmware_tsc_khz)
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMCALL)
setup_force_cpu_cap(X86_FEATURE_VMCALL);
else if (vmware_hypercall_mode == CPUID_VMWARE_FEATURES_ECX_VMMCALL)
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index a8f3af257e26..54ce999ed321 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -323,8 +323,8 @@ static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem,
cmem->nr_ranges = 1;
/* Exclude elf header region */
- start = image->arch.elf_load_addr;
- end = start + image->arch.elf_headers_sz - 1;
+ start = image->elf_load_addr;
+ end = start + image->elf_headers_sz - 1;
return crash_exclude_mem_range(cmem, start, end);
}
@@ -337,7 +337,7 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
struct crash_memmap_data cmd;
struct crash_mem *cmem;
- cmem = vzalloc(sizeof(struct crash_mem));
+ cmem = vzalloc(struct_size(cmem, ranges, 1));
if (!cmem)
return -ENOMEM;
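For readers not familiar with the helper used above: struct_size(cmem, ranges, 1) computes the bytes needed for a structure that ends in a flexible array, saturating to SIZE_MAX on overflow in the kernel implementation. A stand-alone sketch of the arithmetic (the field types here are illustrative, not the real struct crash_mem layout):

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

/* Same general shape as struct crash_mem: a count plus a flexible array. */
struct demo_mem {
	unsigned int nr_ranges;
	struct { uint64_t start, end; } ranges[];
};

/*
 * Unchecked equivalent of struct_size(m, ranges, n); the kernel macro in
 * <linux/overflow.h> additionally saturates to SIZE_MAX when the
 * multiplication or addition would overflow.
 */
static size_t demo_struct_size(size_t n)
{
	return sizeof(struct demo_mem) + n * sizeof(((struct demo_mem *)0)->ranges[0]);
}

int main(void)
{
	printf("bytes for one range: %zu\n", demo_struct_size(1));
	return 0;
}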
@@ -407,20 +407,20 @@ int crash_load_segments(struct kimage *image)
if (ret)
return ret;
- image->arch.elf_headers = kbuf.buffer;
- image->arch.elf_headers_sz = kbuf.bufsz;
+ image->elf_headers = kbuf.buffer;
+ image->elf_headers_sz = kbuf.bufsz;
kbuf.memsz = kbuf.bufsz;
kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
kbuf.mem = KEXEC_BUF_MEM_UNKNOWN;
ret = kexec_add_buffer(&kbuf);
if (ret) {
- vfree((void *)image->arch.elf_headers);
+ vfree((void *)image->elf_headers);
return ret;
}
- image->arch.elf_load_addr = kbuf.mem;
+ image->elf_load_addr = kbuf.mem;
pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
- image->arch.elf_load_addr, kbuf.bufsz, kbuf.bufsz);
+ image->elf_load_addr, kbuf.bufsz, kbuf.bufsz);
return ret;
}
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
index 759d392cbe9f..d1d49e3d536b 100644
--- a/arch/x86/kernel/doublefault_32.c
+++ b/arch/x86/kernel/doublefault_32.c
@@ -100,9 +100,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = {
.ss = __KERNEL_DS,
.ds = __USER_DS,
.fs = __KERNEL_PERCPU,
-#ifndef CONFIG_X86_32_LAZY_GS
- .gs = __KERNEL_STACK_CANARY,
-#endif
+ .gs = 0,
.__cr3 = __pa_nodebug(swapper_pg_dir),
},
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 22aad412f965..bc0657f0deed 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -31,8 +31,8 @@
* - inform the user about the firmware's notion of memory layout
* via /sys/firmware/memmap
*
- * - the hibernation code uses it to generate a kernel-independent MD5
- * fingerprint of the physical memory layout of a system.
+ * - the hibernation code uses it to generate a kernel-independent CRC32
+ * checksum of the physical memory layout of a system.
*
* - 'e820_table_kexec': a slightly modified (by the kernel) firmware version
* passed to us by the bootloader - the major difference between
@@ -793,7 +793,7 @@ core_initcall(e820__register_nvs_regions);
#endif
/*
- * Allocate the requested number of bytes with the requsted alignment
+ * Allocate the requested number of bytes with the requested alignment
* and return (the physical address) to the caller. Also register this
* range in the 'kexec' E820 table as a reserved range.
*
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index a4b5af03dcc1..6edd1e2ee8af 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -551,6 +551,7 @@ static const struct pci_device_id intel_early_ids[] __initconst = {
INTEL_EHL_IDS(&gen11_early_ops),
INTEL_TGL_12_IDS(&gen11_early_ops),
INTEL_RKL_IDS(&gen11_early_ops),
+ INTEL_ADLS_IDS(&gen11_early_ops),
};
struct resource intel_graphics_stolen_res __ro_after_init = DEFINE_RES_MEM(0, 0);
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 683749b80ae2..a85c64000218 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -253,7 +253,7 @@ static bool xfeature_enabled(enum xfeature xfeature)
static void __init setup_xstate_features(void)
{
u32 eax, ebx, ecx, edx, i;
- /* start at the beginnning of the "extended state" */
+ /* start at the beginning of the "extended state" */
unsigned int last_good_offset = offsetof(struct xregs_state,
extended_state_area);
/*
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 7edbd5ee5ed4..1b3ce3b4a2a2 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -66,7 +66,7 @@ int ftrace_arch_code_modify_post_process(void)
static const char *ftrace_nop_replace(void)
{
- return ideal_nops[NOP_ATOMIC5];
+ return x86_nops[5];
}
static const char *ftrace_call_replace(unsigned long ip, unsigned long addr)
@@ -377,7 +377,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
ip = trampoline + (jmp_offset - start_offset);
if (WARN_ON(*(char *)ip != 0x75))
goto fail;
- ret = copy_from_kernel_nofault(ip, ideal_nops[2], 2);
+ ret = copy_from_kernel_nofault(ip, x86_nops[2], 2);
if (ret < 0)
goto fail;
}
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 5e9beb77cafd..18be44163a50 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -104,7 +104,7 @@ static unsigned int __head *fixup_int(void *ptr, unsigned long physaddr)
static bool __head check_la57_support(unsigned long physaddr)
{
/*
- * 5-level paging is detected and enabled at kernel decomression
+ * 5-level paging is detected and enabled at kernel decompression
* stage. Only check if it has been enabled there.
*/
if (!(native_read_cr4() & X86_CR4_LA57))
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 7ed84c282233..67f590425d90 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -318,8 +318,8 @@ SYM_FUNC_START(startup_32_smp)
movl $(__KERNEL_PERCPU), %eax
movl %eax,%fs # set this cpu's percpu
- movl $(__KERNEL_STACK_CANARY),%eax
- movl %eax,%gs
+ xorl %eax,%eax
+ movl %eax,%gs # clear possible garbage in %gs
xorl %eax,%eax # Clear LDT
lldt %ax
@@ -339,20 +339,6 @@ SYM_FUNC_END(startup_32_smp)
*/
__INIT
setup_once:
-#ifdef CONFIG_STACKPROTECTOR
- /*
- * Configure the stack canary. The linker can't handle this by
- * relocation. Manually set base address in stack canary
- * segment descriptor.
- */
- movl $gdt_page,%eax
- movl $stack_canary,%ecx
- movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
- shrl $16, %ecx
- movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
- movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax)
-#endif
-
andl $0,setup_once_ref /* Once is enough, thanks */
ret
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index ee1a283f8e96..d552f177eca0 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -245,7 +245,7 @@ static const __initconst struct idt_data ist_idts[] = {
* after that.
*
* Note, that X86_64 cannot install the real #PF handler in
- * idt_setup_early_traps() because the memory intialization needs the #PF
+ * idt_setup_early_traps() because the memory initialization needs the #PF
* handler from the early_idt_handler_array to initialize the early page
* tables.
*/
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 58aa712973ac..e28f6a5d14f1 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -338,7 +338,7 @@ void fixup_irqs(void)
irq_migrate_all_off_this_cpu();
/*
- * We can remove mdelay() and then send spuriuous interrupts to
+ * We can remove mdelay() and then send spurious interrupts to
* new cpu targets for all the irqs that were handled previously by
* this cpu. While it works, I have seen spurious interrupt messages
* (nothing wrong but still...).
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index 5ba8477c2cb7..6a2eb62c85e6 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -28,10 +28,8 @@ static void bug_at(const void *ip, int line)
}
static const void *
-__jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type, int init)
+__jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type)
{
- const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
- const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
const void *expect, *code;
const void *addr, *dest;
int line;
@@ -41,10 +39,8 @@ __jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type,
code = text_gen_insn(JMP32_INSN_OPCODE, addr, dest);
- if (init) {
- expect = default_nop; line = __LINE__;
- } else if (type == JUMP_LABEL_JMP) {
- expect = ideal_nop; line = __LINE__;
+ if (type == JUMP_LABEL_JMP) {
+ expect = x86_nops[5]; line = __LINE__;
} else {
expect = code; line = __LINE__;
}
@@ -53,7 +49,7 @@ __jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type,
bug_at(addr, line);
if (type == JUMP_LABEL_NOP)
- code = ideal_nop;
+ code = x86_nops[5];
return code;
}
@@ -62,7 +58,7 @@ static inline void __jump_label_transform(struct jump_entry *entry,
enum jump_label_type type,
int init)
{
- const void *opcode = __jump_label_set_jump_code(entry, type, init);
+ const void *opcode = __jump_label_set_jump_code(entry, type);
/*
* As long as only a single processor is running and the code is still
@@ -113,7 +109,7 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry,
}
mutex_lock(&text_mutex);
- opcode = __jump_label_set_jump_code(entry, type, 0);
+ opcode = __jump_label_set_jump_code(entry, type);
text_poke_queue((void *)jump_entry_code(entry),
opcode, JUMP_LABEL_NOP_SIZE, NULL);
mutex_unlock(&text_mutex);
@@ -136,22 +132,6 @@ static enum {
__init_or_module void arch_jump_label_transform_static(struct jump_entry *entry,
enum jump_label_type type)
{
- /*
- * This function is called at boot up and when modules are
- * first loaded. Check if the default nop, the one that is
- * inserted at compile time, is the ideal nop. If it is, then
- * we do not need to update the nop, and we can leave it as is.
- * If it is not, then we need to update the nop to the ideal nop.
- */
- if (jlstate == JL_STATE_START) {
- const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
- const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
-
- if (memcmp(ideal_nop, default_nop, 5) != 0)
- jlstate = JL_STATE_UPDATE;
- else
- jlstate = JL_STATE_NO_UPDATE;
- }
if (jlstate == JL_STATE_UPDATE)
jump_label_transform(entry, type, 1);
}
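
With the init-time special case gone, a jump-label site above always holds either x86_nops[5] or the 5-byte near JMP produced by text_gen_insn(JMP32_INSN_OPCODE, ...). A standalone sketch of the rel32 arithmetic behind that encoding (example addresses only; nothing is actually patched here):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define JMP32_INSN_SIZE   5
#define JMP32_INSN_OPCODE 0xe9

/* encode "jmp target" as placed at address 'site' */
static void gen_jmp32(uint8_t buf[JMP32_INSN_SIZE], uint64_t site, uint64_t target)
{
        int32_t rel = (int32_t)(target - (site + JMP32_INSN_SIZE));

        buf[0] = JMP32_INSN_OPCODE;
        memcpy(&buf[1], &rel, sizeof(rel));     /* little-endian, as on x86 */
}

int main(void)
{
        uint8_t insn[JMP32_INSN_SIZE];

        gen_jmp32(insn, 0x401000, 0x401020);    /* forward jump of 0x1b bytes */
        for (int i = 0; i < JMP32_INSN_SIZE; i++)
                printf("%02x ", insn[i]);       /* prints: e9 1b 00 00 00 */
        printf("\n");
        return 0;
}
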
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index ce831f9448e7..170d0fd68b1f 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -75,7 +75,7 @@ static int setup_cmdline(struct kimage *image, struct boot_params *params,
if (image->type == KEXEC_TYPE_CRASH) {
len = sprintf(cmdline_ptr,
- "elfcorehdr=0x%lx ", image->arch.elf_load_addr);
+ "elfcorehdr=0x%lx ", image->elf_load_addr);
}
memcpy(cmdline_ptr + len, cmdline, cmdline_len);
cmdline_len += len;
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index ff7878df96b4..3a43a2dee658 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -17,7 +17,7 @@
* Updated by: Tom Rini <trini@kernel.crashing.org>
* Updated by: Jason Wessel <jason.wessel@windriver.com>
* Modified for 386 by Jim Kingdon, Cygnus Support.
- * Origianl kgdb, compatibility with 2.1.xx kernel by
+ * Original kgdb, compatibility with 2.1.xx kernel by
* David Grothe <dave@gcom.com>
* Integrated into 2.2.5 kernel by Tigran Aivazian <tigran@sco.com>
* X86_64 changes from Andi Kleen's patch merged by Jim Houston
@@ -642,7 +642,7 @@ void kgdb_arch_late(void)
struct perf_event **pevent;
/*
- * Pre-allocate the hw breakpoint structions in the non-atomic
+ * Pre-allocate the hw breakpoint instructions in the non-atomic
* portion of kgdb because this operation requires mutexs to
* complete.
*/
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index df776cdca327..d3d65545cb8b 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -139,6 +139,8 @@ NOKPROBE_SYMBOL(synthesize_relcall);
int can_boost(struct insn *insn, void *addr)
{
kprobe_opcode_t opcode;
+ insn_byte_t prefix;
+ int i;
if (search_exception_tables((unsigned long)addr))
return 0; /* Page fault may occur on this address. */
@@ -151,35 +153,39 @@ int can_boost(struct insn *insn, void *addr)
if (insn->opcode.nbytes != 1)
return 0;
- /* Can't boost Address-size override prefix */
- if (unlikely(inat_is_address_size_prefix(insn->attr)))
- return 0;
+ for_each_insn_prefix(insn, i, prefix) {
+ insn_attr_t attr;
+
+ attr = inat_get_opcode_attribute(prefix);
+ /* Can't boost Address-size override prefix and CS override prefix */
+ if (prefix == 0x2e || inat_is_address_size_prefix(attr))
+ return 0;
+ }
opcode = insn->opcode.bytes[0];
- switch (opcode & 0xf0) {
- case 0x60:
- /* can't boost "bound" */
- return (opcode != 0x62);
- case 0x70:
- return 0; /* can't boost conditional jump */
- case 0x90:
- return opcode != 0x9a; /* can't boost call far */
- case 0xc0:
- /* can't boost software-interruptions */
- return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf;
- case 0xd0:
- /* can boost AA* and XLAT */
- return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7);
- case 0xe0:
- /* can boost in/out and absolute jmps */
- return ((opcode & 0x04) || opcode == 0xea);
- case 0xf0:
- /* clear and set flags are boostable */
- return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe));
+ switch (opcode) {
+ case 0x62: /* bound */
+ case 0x70 ... 0x7f: /* Conditional jumps */
+ case 0x9a: /* Call far */
+ case 0xc0 ... 0xc1: /* Grp2 */
+ case 0xcc ... 0xce: /* software exceptions */
+ case 0xd0 ... 0xd3: /* Grp2 */
+ case 0xd6: /* (UD) */
+ case 0xd8 ... 0xdf: /* ESC */
+ case 0xe0 ... 0xe3: /* LOOP*, JCXZ */
+ case 0xe8 ... 0xe9: /* near Call, JMP */
+ case 0xeb: /* Short JMP */
+ case 0xf0 ... 0xf4: /* LOCK/REP, HLT */
+ case 0xf6 ... 0xf7: /* Grp3 */
+ case 0xfe: /* Grp4 */
+ /* ... are not boostable */
+ return 0;
+ case 0xff: /* Grp5 */
+ /* Only indirect jmp is boostable */
+ return X86_MODRM_REG(insn->modrm.bytes[0]) == 4;
default:
- /* CS override prefix and call are not boostable */
- return (opcode != 0x2e && opcode != 0x9a);
+ return 1;
}
}
@@ -229,7 +235,7 @@ __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
return 0UL;
if (faddr)
- memcpy(buf, ideal_nops[NOP_ATOMIC5], 5);
+ memcpy(buf, x86_nops[5], 5);
else
buf[0] = kp->opcode;
return (unsigned long)buf;
@@ -265,6 +271,8 @@ static int can_probe(unsigned long paddr)
/* Decode instructions */
addr = paddr - offset;
while (addr < paddr) {
+ int ret;
+
/*
* Check if the instruction has been modified by another
* kprobe, in which case we replace the breakpoint by the
@@ -276,8 +284,10 @@ static int can_probe(unsigned long paddr)
__addr = recover_probed_instruction(buf, addr);
if (!__addr)
return 0;
- kernel_insn_init(&insn, (void *)__addr, MAX_INSN_SIZE);
- insn_get_length(&insn);
+
+ ret = insn_decode_kernel(&insn, (void *)__addr);
+ if (ret < 0)
+ return 0;
/*
* Another debugging subsystem might insert this breakpoint.
@@ -301,8 +311,8 @@ static int can_probe(unsigned long paddr)
int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn)
{
kprobe_opcode_t buf[MAX_INSN_SIZE];
- unsigned long recovered_insn =
- recover_probed_instruction(buf, (unsigned long)src);
+ unsigned long recovered_insn = recover_probed_instruction(buf, (unsigned long)src);
+ int ret;
if (!recovered_insn || !insn)
return 0;
@@ -312,8 +322,9 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn)
MAX_INSN_SIZE))
return 0;
- kernel_insn_init(insn, dest, MAX_INSN_SIZE);
- insn_get_length(insn);
+ ret = insn_decode_kernel(insn, dest);
+ if (ret < 0)
+ return 0;
/* We can not probe force emulate prefixed instruction */
if (insn_has_emulate_prefix(insn))
@@ -357,13 +368,14 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn)
return insn->length;
}
-/* Prepare reljump right after instruction to boost */
-static int prepare_boost(kprobe_opcode_t *buf, struct kprobe *p,
- struct insn *insn)
+/* Prepare reljump or int3 right after instruction */
+static int prepare_singlestep(kprobe_opcode_t *buf, struct kprobe *p,
+ struct insn *insn)
{
int len = insn->length;
- if (can_boost(insn, p->addr) &&
+ if (!IS_ENABLED(CONFIG_PREEMPTION) &&
+ !p->post_handler && can_boost(insn, p->addr) &&
MAX_INSN_SIZE - len >= JMP32_INSN_SIZE) {
/*
* These instructions can be executed directly if it
@@ -374,7 +386,12 @@ static int prepare_boost(kprobe_opcode_t *buf, struct kprobe *p,
len += JMP32_INSN_SIZE;
p->ainsn.boostable = 1;
} else {
- p->ainsn.boostable = 0;
+ /* Otherwise, put an int3 for trapping singlestep */
+ if (MAX_INSN_SIZE - len < INT3_INSN_SIZE)
+ return -ENOSPC;
+
+ buf[len] = INT3_INSN_OPCODE;
+ len += INT3_INSN_SIZE;
}
return len;
@@ -411,86 +428,290 @@ void free_insn_page(void *page)
module_memfree(page);
}
-static void set_resume_flags(struct kprobe *p, struct insn *insn)
+/* Kprobe x86 instruction emulation - only regs->ip or IF flag modifiers */
+
+static void kprobe_emulate_ifmodifiers(struct kprobe *p, struct pt_regs *regs)
+{
+ switch (p->ainsn.opcode) {
+ case 0xfa: /* cli */
+ regs->flags &= ~(X86_EFLAGS_IF);
+ break;
+ case 0xfb: /* sti */
+ regs->flags |= X86_EFLAGS_IF;
+ break;
+ case 0x9c: /* pushf */
+ int3_emulate_push(regs, regs->flags);
+ break;
+ case 0x9d: /* popf */
+ regs->flags = int3_emulate_pop(regs);
+ break;
+ }
+ regs->ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
+}
+NOKPROBE_SYMBOL(kprobe_emulate_ifmodifiers);
+
+static void kprobe_emulate_ret(struct kprobe *p, struct pt_regs *regs)
+{
+ int3_emulate_ret(regs);
+}
+NOKPROBE_SYMBOL(kprobe_emulate_ret);
+
+static void kprobe_emulate_call(struct kprobe *p, struct pt_regs *regs)
+{
+ unsigned long func = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
+
+ func += p->ainsn.rel32;
+ int3_emulate_call(regs, func);
+}
+NOKPROBE_SYMBOL(kprobe_emulate_call);
+
+static nokprobe_inline
+void __kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs, bool cond)
+{
+ unsigned long ip = regs->ip - INT3_INSN_SIZE + p->ainsn.size;
+
+ if (cond)
+ ip += p->ainsn.rel32;
+ int3_emulate_jmp(regs, ip);
+}
+
+static void kprobe_emulate_jmp(struct kprobe *p, struct pt_regs *regs)
+{
+ __kprobe_emulate_jmp(p, regs, true);
+}
+NOKPROBE_SYMBOL(kprobe_emulate_jmp);
+
+static const unsigned long jcc_mask[6] = {
+ [0] = X86_EFLAGS_OF,
+ [1] = X86_EFLAGS_CF,
+ [2] = X86_EFLAGS_ZF,
+ [3] = X86_EFLAGS_CF | X86_EFLAGS_ZF,
+ [4] = X86_EFLAGS_SF,
+ [5] = X86_EFLAGS_PF,
+};
+
+static void kprobe_emulate_jcc(struct kprobe *p, struct pt_regs *regs)
+{
+ bool invert = p->ainsn.jcc.type & 1;
+ bool match;
+
+ if (p->ainsn.jcc.type < 0xc) {
+ match = regs->flags & jcc_mask[p->ainsn.jcc.type >> 1];
+ } else {
+ match = ((regs->flags & X86_EFLAGS_SF) >> X86_EFLAGS_SF_BIT) ^
+ ((regs->flags & X86_EFLAGS_OF) >> X86_EFLAGS_OF_BIT);
+ if (p->ainsn.jcc.type >= 0xe)
+ match = match && (regs->flags & X86_EFLAGS_ZF);
+ }
+ __kprobe_emulate_jmp(p, regs, (match && !invert) || (!match && invert));
+}
+NOKPROBE_SYMBOL(kprobe_emulate_jcc);
+
+static void kprobe_emulate_loop(struct kprobe *p, struct pt_regs *regs)
+{
+ bool match;
+
+ if (p->ainsn.loop.type != 3) { /* LOOP* */
+ if (p->ainsn.loop.asize == 32)
+ match = ((*(u32 *)&regs->cx)--) != 0;
+#ifdef CONFIG_X86_64
+ else if (p->ainsn.loop.asize == 64)
+ match = ((*(u64 *)&regs->cx)--) != 0;
+#endif
+ else
+ match = ((*(u16 *)&regs->cx)--) != 0;
+ } else { /* JCXZ */
+ if (p->ainsn.loop.asize == 32)
+ match = *(u32 *)(&regs->cx) == 0;
+#ifdef CONFIG_X86_64
+ else if (p->ainsn.loop.asize == 64)
+ match = *(u64 *)(&regs->cx) == 0;
+#endif
+ else
+ match = *(u16 *)(&regs->cx) == 0;
+ }
+
+ if (p->ainsn.loop.type == 0) /* LOOPNE */
+ match = match && !(regs->flags & X86_EFLAGS_ZF);
+ else if (p->ainsn.loop.type == 1) /* LOOPE */
+ match = match && (regs->flags & X86_EFLAGS_ZF);
+
+ __kprobe_emulate_jmp(p, regs, match);
+}
+NOKPROBE_SYMBOL(kprobe_emulate_loop);
+
+static const int addrmode_regoffs[] = {
+ offsetof(struct pt_regs, ax),
+ offsetof(struct pt_regs, cx),
+ offsetof(struct pt_regs, dx),
+ offsetof(struct pt_regs, bx),
+ offsetof(struct pt_regs, sp),
+ offsetof(struct pt_regs, bp),
+ offsetof(struct pt_regs, si),
+ offsetof(struct pt_regs, di),
+#ifdef CONFIG_X86_64
+ offsetof(struct pt_regs, r8),
+ offsetof(struct pt_regs, r9),
+ offsetof(struct pt_regs, r10),
+ offsetof(struct pt_regs, r11),
+ offsetof(struct pt_regs, r12),
+ offsetof(struct pt_regs, r13),
+ offsetof(struct pt_regs, r14),
+ offsetof(struct pt_regs, r15),
+#endif
+};
+
+static void kprobe_emulate_call_indirect(struct kprobe *p, struct pt_regs *regs)
+{
+ unsigned long offs = addrmode_regoffs[p->ainsn.indirect.reg];
+
+ int3_emulate_call(regs, regs_get_register(regs, offs));
+}
+NOKPROBE_SYMBOL(kprobe_emulate_call_indirect);
+
+static void kprobe_emulate_jmp_indirect(struct kprobe *p, struct pt_regs *regs)
+{
+ unsigned long offs = addrmode_regoffs[p->ainsn.indirect.reg];
+
+ int3_emulate_jmp(regs, regs_get_register(regs, offs));
+}
+NOKPROBE_SYMBOL(kprobe_emulate_jmp_indirect);
+
+static int prepare_emulation(struct kprobe *p, struct insn *insn)
{
insn_byte_t opcode = insn->opcode.bytes[0];
switch (opcode) {
case 0xfa: /* cli */
case 0xfb: /* sti */
+ case 0x9c: /* pushfl */
case 0x9d: /* popf/popfd */
- /* Check whether the instruction modifies Interrupt Flag or not */
- p->ainsn.if_modifier = 1;
- break;
- case 0x9c: /* pushfl */
- p->ainsn.is_pushf = 1;
+ /*
+ * IF modifiers must be emulated because they can enable interrupts
+ * during int3 single stepping.
+ */
+ p->ainsn.emulate_op = kprobe_emulate_ifmodifiers;
+ p->ainsn.opcode = opcode;
break;
- case 0xcf: /* iret */
- p->ainsn.if_modifier = 1;
- fallthrough;
case 0xc2: /* ret/lret */
case 0xc3:
case 0xca:
case 0xcb:
- case 0xea: /* jmp absolute -- ip is correct */
- /* ip is already adjusted, no more changes required */
- p->ainsn.is_abs_ip = 1;
- /* Without resume jump, this is boostable */
- p->ainsn.boostable = 1;
+ p->ainsn.emulate_op = kprobe_emulate_ret;
break;
- case 0xe8: /* call relative - Fix return addr */
- p->ainsn.is_call = 1;
+ case 0x9a: /* far call absolute -- segment is not supported */
+ case 0xea: /* far jmp absolute -- segment is not supported */
+ case 0xcc: /* int3 */
+ case 0xcf: /* iret -- in-kernel IRET is not supported */
+ return -EOPNOTSUPP;
break;
-#ifdef CONFIG_X86_32
- case 0x9a: /* call absolute -- same as call absolute, indirect */
- p->ainsn.is_call = 1;
- p->ainsn.is_abs_ip = 1;
+ case 0xe8: /* near call relative */
+ p->ainsn.emulate_op = kprobe_emulate_call;
+ if (insn->immediate.nbytes == 2)
+ p->ainsn.rel32 = *(s16 *)&insn->immediate.value;
+ else
+ p->ainsn.rel32 = *(s32 *)&insn->immediate.value;
break;
-#endif
- case 0xff:
+ case 0xeb: /* short jump relative */
+ case 0xe9: /* near jump relative */
+ p->ainsn.emulate_op = kprobe_emulate_jmp;
+ if (insn->immediate.nbytes == 1)
+ p->ainsn.rel32 = *(s8 *)&insn->immediate.value;
+ else if (insn->immediate.nbytes == 2)
+ p->ainsn.rel32 = *(s16 *)&insn->immediate.value;
+ else
+ p->ainsn.rel32 = *(s32 *)&insn->immediate.value;
+ break;
+ case 0x70 ... 0x7f:
+ /* 1 byte conditional jump */
+ p->ainsn.emulate_op = kprobe_emulate_jcc;
+ p->ainsn.jcc.type = opcode & 0xf;
+ p->ainsn.rel32 = *(char *)insn->immediate.bytes;
+ break;
+ case 0x0f:
opcode = insn->opcode.bytes[1];
+ if ((opcode & 0xf0) == 0x80) {
+ /* 2 bytes Conditional Jump */
+ p->ainsn.emulate_op = kprobe_emulate_jcc;
+ p->ainsn.jcc.type = opcode & 0xf;
+ if (insn->immediate.nbytes == 2)
+ p->ainsn.rel32 = *(s16 *)&insn->immediate.value;
+ else
+ p->ainsn.rel32 = *(s32 *)&insn->immediate.value;
+ } else if (opcode == 0x01 &&
+ X86_MODRM_REG(insn->modrm.bytes[0]) == 0 &&
+ X86_MODRM_MOD(insn->modrm.bytes[0]) == 3) {
+ /* VM extensions - not supported */
+ return -EOPNOTSUPP;
+ }
+ break;
+ case 0xe0: /* Loop NZ */
+ case 0xe1: /* Loop */
+ case 0xe2: /* Loop */
+ case 0xe3: /* J*CXZ */
+ p->ainsn.emulate_op = kprobe_emulate_loop;
+ p->ainsn.loop.type = opcode & 0x3;
+ p->ainsn.loop.asize = insn->addr_bytes * 8;
+ p->ainsn.rel32 = *(s8 *)&insn->immediate.value;
+ break;
+ case 0xff:
+ /*
+ * Since the 0xff is an extended group opcode, the instruction
+ * is determined by the MOD/RM byte.
+ */
+ opcode = insn->modrm.bytes[0];
if ((opcode & 0x30) == 0x10) {
- /*
- * call absolute, indirect
- * Fix return addr; ip is correct.
- * But this is not boostable
- */
- p->ainsn.is_call = 1;
- p->ainsn.is_abs_ip = 1;
+ if ((opcode & 0x8) == 0x8)
+ return -EOPNOTSUPP; /* far call */
+ /* call absolute, indirect */
+ p->ainsn.emulate_op = kprobe_emulate_call_indirect;
+ } else if ((opcode & 0x30) == 0x20) {
+ if ((opcode & 0x8) == 0x8)
+ return -EOPNOTSUPP; /* far jmp */
+ /* jmp near absolute indirect */
+ p->ainsn.emulate_op = kprobe_emulate_jmp_indirect;
+ } else
break;
- } else if (((opcode & 0x31) == 0x20) ||
- ((opcode & 0x31) == 0x21)) {
- /*
- * jmp near and far, absolute indirect
- * ip is correct.
- */
- p->ainsn.is_abs_ip = 1;
- /* Without resume jump, this is boostable */
- p->ainsn.boostable = 1;
- }
+
+ if (insn->addr_bytes != sizeof(unsigned long))
+ return -EOPNOTSUPP; /* Don't support different size */
+ if (X86_MODRM_MOD(opcode) != 3)
+ return -EOPNOTSUPP; /* TODO: support memory addressing */
+
+ p->ainsn.indirect.reg = X86_MODRM_RM(opcode);
+#ifdef CONFIG_X86_64
+ if (X86_REX_B(insn->rex_prefix.value))
+ p->ainsn.indirect.reg += 8;
+#endif
+ break;
+ default:
break;
}
+ p->ainsn.size = insn->length;
+
+ return 0;
}
static int arch_copy_kprobe(struct kprobe *p)
{
struct insn insn;
kprobe_opcode_t buf[MAX_INSN_SIZE];
- int len;
+ int ret, len;
/* Copy an instruction with recovering if other optprobe modifies it.*/
len = __copy_instruction(buf, p->addr, p->ainsn.insn, &insn);
if (!len)
return -EINVAL;
- /*
- * __copy_instruction can modify the displacement of the instruction,
- * but it doesn't affect boostable check.
- */
- len = prepare_boost(buf, p, &insn);
+ /* Analyze the opcode and setup emulate functions */
+ ret = prepare_emulation(p, &insn);
+ if (ret < 0)
+ return ret;
- /* Analyze the opcode and set resume flags */
- set_resume_flags(p, &insn);
+ /* Add int3 for single-step or booster jmp */
+ len = prepare_singlestep(buf, p, &insn);
+ if (len < 0)
+ return len;
/* Also, displacement change doesn't affect the first byte */
p->opcode = buf[0];
@@ -583,29 +804,7 @@ set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
{
__this_cpu_write(current_kprobe, p);
kcb->kprobe_saved_flags = kcb->kprobe_old_flags
- = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
- if (p->ainsn.if_modifier)
- kcb->kprobe_saved_flags &= ~X86_EFLAGS_IF;
-}
-
-static nokprobe_inline void clear_btf(void)
-{
- if (test_thread_flag(TIF_BLOCKSTEP)) {
- unsigned long debugctl = get_debugctlmsr();
-
- debugctl &= ~DEBUGCTLMSR_BTF;
- update_debugctlmsr(debugctl);
- }
-}
-
-static nokprobe_inline void restore_btf(void)
-{
- if (test_thread_flag(TIF_BLOCKSTEP)) {
- unsigned long debugctl = get_debugctlmsr();
-
- debugctl |= DEBUGCTLMSR_BTF;
- update_debugctlmsr(debugctl);
- }
+ = (regs->flags & X86_EFLAGS_IF);
}
void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
@@ -620,6 +819,22 @@ void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
}
NOKPROBE_SYMBOL(arch_prepare_kretprobe);
+static void kprobe_post_process(struct kprobe *cur, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
+{
+ if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
+ kcb->kprobe_status = KPROBE_HIT_SSDONE;
+ cur->post_handler(cur, regs, 0);
+ }
+
+ /* Restore back the original saved kprobes variables and continue. */
+ if (kcb->kprobe_status == KPROBE_REENTER)
+ restore_previous_kprobe(kcb);
+ else
+ reset_current_kprobe();
+}
+NOKPROBE_SYMBOL(kprobe_post_process);
+
static void setup_singlestep(struct kprobe *p, struct pt_regs *regs,
struct kprobe_ctlblk *kcb, int reenter)
{
@@ -627,7 +842,7 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs,
return;
#if !defined(CONFIG_PREEMPTION)
- if (p->ainsn.boostable && !p->post_handler) {
+ if (p->ainsn.boostable) {
/* Boost up -- we can execute copied instructions directly */
if (!reenter)
reset_current_kprobe();
@@ -646,19 +861,51 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs,
kcb->kprobe_status = KPROBE_REENTER;
} else
kcb->kprobe_status = KPROBE_HIT_SS;
- /* Prepare real single stepping */
- clear_btf();
- regs->flags |= X86_EFLAGS_TF;
+
+ if (p->ainsn.emulate_op) {
+ p->ainsn.emulate_op(p, regs);
+ kprobe_post_process(p, regs, kcb);
+ return;
+ }
+
+ /* Disable interrupts and set the ip register to the trampoline */
regs->flags &= ~X86_EFLAGS_IF;
- /* single step inline if the instruction is an int3 */
- if (p->opcode == INT3_INSN_OPCODE)
- regs->ip = (unsigned long)p->addr;
- else
- regs->ip = (unsigned long)p->ainsn.insn;
+ regs->ip = (unsigned long)p->ainsn.insn;
}
NOKPROBE_SYMBOL(setup_singlestep);
/*
+ * Called after single-stepping. p->addr is the address of the
+ * instruction whose first byte has been replaced by the "int3"
+ * instruction. To avoid the SMP problems that can occur when we
+ * temporarily put back the original opcode to single-step, we
+ * single-stepped a copy of the instruction. The address of this
+ * copy is p->ainsn.insn. Instead of a trap, another "int3" is placed
+ * right after the copied instruction.
+ * Unlike the trap single-step, "int3" single-step cannot handle
+ * instructions that change the ip register, e.g. jmp, call and
+ * conditional jmp, nor instructions that change the IF flag, because
+ * interrupts must be disabled around the single-stepping.
+ * Such instructions are emulated in software; the others are
+ * single-stepped using "int3".
+ *
+ * When the 2nd "int3" is handled, regs->ip and regs->flags need to
+ * be adjusted so that execution resumes on the correct code.
+ */
+static void resume_singlestep(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
+{
+ unsigned long copy_ip = (unsigned long)p->ainsn.insn;
+ unsigned long orig_ip = (unsigned long)p->addr;
+
+ /* Restore saved interrupt flag and ip register */
+ regs->flags |= kcb->kprobe_saved_flags;
+ /* Note that regs->ip points after the executed int3, so step back over it */
+ regs->ip += (orig_ip - copy_ip) - INT3_INSN_SIZE;
+}
+NOKPROBE_SYMBOL(resume_singlestep);
+
+/*
* We have reentered the kprobe_handler(), since another probe was hit while
* within the handler. We save the original kprobes variables and just single
* step on the instruction of the new probe without calling any user handlers.
@@ -693,6 +940,12 @@ static int reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
}
NOKPROBE_SYMBOL(reenter_kprobe);
+static nokprobe_inline int kprobe_is_ss(struct kprobe_ctlblk *kcb)
+{
+ return (kcb->kprobe_status == KPROBE_HIT_SS ||
+ kcb->kprobe_status == KPROBE_REENTER);
+}
+
/*
* Interrupts are disabled on entry as trap3 is an interrupt gate and they
* remain disabled throughout this function.
@@ -737,7 +990,18 @@ int kprobe_int3_handler(struct pt_regs *regs)
reset_current_kprobe();
return 1;
}
- } else if (*addr != INT3_INSN_OPCODE) {
+ } else if (kprobe_is_ss(kcb)) {
+ p = kprobe_running();
+ if ((unsigned long)p->ainsn.insn < regs->ip &&
+ (unsigned long)p->ainsn.insn + MAX_INSN_SIZE > regs->ip) {
+ /* Most probably this is the second int3 for singlestep */
+ resume_singlestep(p, regs, kcb);
+ kprobe_post_process(p, regs, kcb);
+ return 1;
+ }
+ }
+
+ if (*addr != INT3_INSN_OPCODE) {
/*
* The breakpoint instruction was removed right
* after we hit it. Another cpu has removed
@@ -810,91 +1074,6 @@ __used __visible void *trampoline_handler(struct pt_regs *regs)
}
NOKPROBE_SYMBOL(trampoline_handler);
-/*
- * Called after single-stepping. p->addr is the address of the
- * instruction whose first byte has been replaced by the "int 3"
- * instruction. To avoid the SMP problems that can occur when we
- * temporarily put back the original opcode to single-step, we
- * single-stepped a copy of the instruction. The address of this
- * copy is p->ainsn.insn.
- *
- * This function prepares to return from the post-single-step
- * interrupt. We have to fix up the stack as follows:
- *
- * 0) Except in the case of absolute or indirect jump or call instructions,
- * the new ip is relative to the copied instruction. We need to make
- * it relative to the original instruction.
- *
- * 1) If the single-stepped instruction was pushfl, then the TF and IF
- * flags are set in the just-pushed flags, and may need to be cleared.
- *
- * 2) If the single-stepped instruction was a call, the return address
- * that is atop the stack is the address following the copied instruction.
- * We need to make it the address following the original instruction.
- */
-static void resume_execution(struct kprobe *p, struct pt_regs *regs,
- struct kprobe_ctlblk *kcb)
-{
- unsigned long *tos = stack_addr(regs);
- unsigned long copy_ip = (unsigned long)p->ainsn.insn;
- unsigned long orig_ip = (unsigned long)p->addr;
-
- regs->flags &= ~X86_EFLAGS_TF;
-
- /* Fixup the contents of top of stack */
- if (p->ainsn.is_pushf) {
- *tos &= ~(X86_EFLAGS_TF | X86_EFLAGS_IF);
- *tos |= kcb->kprobe_old_flags;
- } else if (p->ainsn.is_call) {
- *tos = orig_ip + (*tos - copy_ip);
- }
-
- if (!p->ainsn.is_abs_ip)
- regs->ip += orig_ip - copy_ip;
-
- restore_btf();
-}
-NOKPROBE_SYMBOL(resume_execution);
-
-/*
- * Interrupts are disabled on entry as trap1 is an interrupt gate and they
- * remain disabled throughout this function.
- */
-int kprobe_debug_handler(struct pt_regs *regs)
-{
- struct kprobe *cur = kprobe_running();
- struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
-
- if (!cur)
- return 0;
-
- resume_execution(cur, regs, kcb);
- regs->flags |= kcb->kprobe_saved_flags;
-
- if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
- kcb->kprobe_status = KPROBE_HIT_SSDONE;
- cur->post_handler(cur, regs, 0);
- }
-
- /* Restore back the original saved kprobes variables and continue. */
- if (kcb->kprobe_status == KPROBE_REENTER) {
- restore_previous_kprobe(kcb);
- goto out;
- }
- reset_current_kprobe();
-out:
- /*
- * if somebody else is singlestepping across a probe point, flags
- * will have TF set, in which case, continue the remaining processing
- * of do_debug, as if this is not a probe hit.
- */
- if (regs->flags & X86_EFLAGS_TF)
- return 0;
-
- return 1;
-}
-NOKPROBE_SYMBOL(kprobe_debug_handler);
-
int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
{
struct kprobe *cur = kprobe_running();
@@ -912,20 +1091,9 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
* normal page fault.
*/
regs->ip = (unsigned long)cur->addr;
- /*
- * Trap flag (TF) has been set here because this fault
- * happened where the single stepping will be done.
- * So clear it by resetting the current kprobe:
- */
- regs->flags &= ~X86_EFLAGS_TF;
- /*
- * Since the single step (trap) has been cancelled,
- * we need to restore BTF here.
- */
- restore_btf();
/*
- * If the TF flag was set before the kprobe hit,
+ * If the IF flag was set before the kprobe hit,
* don't touch it:
*/
regs->flags |= kcb->kprobe_old_flags;
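
Much of the kprobes/core.c rework above replaces TF-based single-stepping with a second int3 placed right after the copied instruction; resume_singlestep() then maps the trap address inside the copy buffer back onto the original code. A standalone sketch of that address arithmetic (the addresses are invented for the example):

#include <stdio.h>

#define INT3_INSN_SIZE 1

int main(void)
{
        unsigned long orig_ip  = 0x1010;  /* probed instruction in kernel text */
        unsigned long copy_ip  = 0x9000;  /* copy buffer (p->ainsn.insn)       */
        unsigned long insn_len = 3;       /* length of the copied instruction  */

        /* ip reported when the trailing int3 in the copy buffer traps: */
        unsigned long trap_ip = copy_ip + insn_len + INT3_INSN_SIZE;

        /* the fixup done by resume_singlestep(): shift back into the
         * original text and step back over the already-executed int3 */
        unsigned long resumed = trap_ip + (orig_ip - copy_ip) - INT3_INSN_SIZE;

        printf("resume at %#lx (expected %#lx)\n", resumed, orig_ip + insn_len);
        return 0;
}
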
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 51c7f5271aee..596de2f6d3a5 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -12,7 +12,7 @@
#include "common.h"
-/* Ftrace callback handler for kprobes -- called under preepmt disabled */
+/* Ftrace callback handler for kprobes -- called under preempt disabled */
void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 08eb23074f92..71425ebba98a 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -312,6 +312,8 @@ static int can_optimize(unsigned long paddr)
addr = paddr - offset;
while (addr < paddr - offset + size) { /* Decode until function end */
unsigned long recovered_insn;
+ int ret;
+
if (search_exception_tables(addr))
/*
* Since some fixup code will jumps into this function,
@@ -321,8 +323,11 @@ static int can_optimize(unsigned long paddr)
recovered_insn = recover_probed_instruction(buf, addr);
if (!recovered_insn)
return 0;
- kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
- insn_get_length(&insn);
+
+ ret = insn_decode_kernel(&insn, (void *)recovered_insn);
+ if (ret < 0)
+ return 0;
+
/*
* In the case of detecting unknown breakpoint, this could be
* a padding INT3 between functions. Let's check that all the
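
Both can_probe() and can_optimize() above now bail out as soon as insn_decode_kernel() fails, while still walking instruction by instruction from the start of the function. A standalone sketch of that boundary walk; decode_len() is an invented stand-in for the real decoder, which can also report failure:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* stand-in decoder: pretend each byte value encodes its own length */
static int decode_len(const unsigned char *ip)
{
        return (ip[0] % 15) + 1;        /* a real decoder may return an error */
}

/* does 'offset' land exactly on an instruction boundary? */
static bool lands_on_boundary(const unsigned char *func, size_t offset)
{
        size_t pos = 0;

        while (pos < offset) {
                int len = decode_len(func + pos);

                if (len <= 0)
                        return false;   /* decode failure: refuse to probe */
                pos += len;
        }
        return pos == offset;
}

int main(void)
{
        unsigned char code[32] = { 3, 2, 5, 1 };        /* fake opcode bytes */

        printf("offset 3 on a boundary? %d\n", lands_on_boundary(code, 3));
        printf("offset 4 on a boundary? %d\n", lands_on_boundary(code, 4));
        return 0;
}
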
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 78bb0fae3982..5d32fa477a62 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -613,7 +613,7 @@ static int kvm_cpu_down_prepare(unsigned int cpu)
}
#endif
-static void kvm_flush_tlb_others(const struct cpumask *cpumask,
+static void kvm_flush_tlb_multi(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
u8 state;
@@ -627,6 +627,11 @@ static void kvm_flush_tlb_others(const struct cpumask *cpumask,
* queue flush_on_enter for pre-empted vCPUs
*/
for_each_cpu(cpu, flushmask) {
+ /*
+ * The local vCPU is never preempted, so there is no need to
+ * explicitly skip it here - it will never be cleared from
+ * flushmask.
+ */
src = &per_cpu(steal_time, cpu);
state = READ_ONCE(src->preempted);
if ((state & KVM_VCPU_PREEMPTED)) {
@@ -636,7 +641,7 @@ static void kvm_flush_tlb_others(const struct cpumask *cpumask,
}
}
- native_flush_tlb_others(flushmask, info);
+ native_flush_tlb_multi(flushmask, info);
}
static void __init kvm_guest_init(void)
@@ -650,11 +655,11 @@ static void __init kvm_guest_init(void)
if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
has_steal_clock = 1;
- pv_ops.time.steal_clock = kvm_steal_clock;
+ static_call_update(pv_steal_clock, kvm_steal_clock);
}
if (pv_tlb_flush_supported()) {
- pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others;
+ pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi;
pv_ops.mmu.tlb_remove_table = tlb_remove_table;
pr_info("KVM setup pv remote TLB flush\n");
}
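
The kvm_flush_tlb_multi() change above keeps the local CPU in the mask and only defers the flush for vCPUs the hypervisor reports as preempted. A userspace sketch of that mask-trimming idea (plain arrays stand in for the kernel's cpumask and steal-time structures):

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

static bool preempted[NR_CPUS] = { false, true, false, false };
static bool flush_on_enter[NR_CPUS];

static void flush_tlb_multi(bool *mask)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                if (!mask[cpu] || !preempted[cpu])
                        continue;
                /* defer: the hypervisor flushes when this vCPU runs again */
                flush_on_enter[cpu] = true;
                mask[cpu] = false;
        }
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                if (mask[cpu])
                        printf("sending flush IPI to cpu %d\n", cpu);
}

int main(void)
{
        bool mask[NR_CPUS] = { true, true, true, false };

        flush_tlb_multi(mask);
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                if (flush_on_enter[cpu])
                        printf("cpu %d will flush on next VM entry\n", cpu);
        return 0;
}
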
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 1fc0962c89c0..d37ed4e1d033 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -106,7 +106,7 @@ static inline void kvm_sched_clock_init(bool stable)
if (!stable)
clear_sched_clock_stable();
kvm_sched_clock_offset = kvm_clock_read();
- pv_ops.time.sched_clock = kvm_sched_clock_read;
+ paravirt_set_sched_clock(kvm_sched_clock_read);
pr_info("kvm-clock: using sched offset of %llu cycles",
kvm_sched_clock_offset);
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index a29a44a98e5b..c078b0d3ab0e 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -260,7 +260,7 @@ static void set_idt(void *newidt, u16 limit)
{
struct desc_ptr curidt;
- /* x86-64 supports unaliged loads & stores */
+ /* x86-64 supports unaligned loads & stores */
curidt.size = limit;
curidt.address = (unsigned long)newidt;
@@ -402,8 +402,8 @@ void machine_kexec(struct kimage *image)
#ifdef CONFIG_KEXEC_FILE
void *arch_kexec_kernel_image_load(struct kimage *image)
{
- vfree(image->arch.elf_headers);
- image->arch.elf_headers = NULL;
+ vfree(image->elf_headers);
+ image->elf_headers = NULL;
if (!image->fops || !image->fops->load)
return ERR_PTR(-ENOEXEC);
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 4f75d0cf6305..9e1ea99ad9df 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -32,3 +32,12 @@ bool pv_is_native_vcpu_is_preempted(void)
return pv_ops.lock.vcpu_is_preempted.func ==
__raw_callee_save___native_vcpu_is_preempted;
}
+
+void __init paravirt_set_cap(void)
+{
+ if (!pv_is_native_spin_unlock())
+ setup_force_cpu_cap(X86_FEATURE_PVUNLOCK);
+
+ if (!pv_is_native_vcpu_is_preempted())
+ setup_force_cpu_cap(X86_FEATURE_VCPUPREEMPT);
+}
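
paravirt_set_cap() above turns "this lock op is still the native one" into synthetic CPU feature bits that later boot code (and, in the kernel, alternatives patching) can test like any hardware capability. A toy sketch of that pattern; the feature names and helpers are stand-ins, not the kernel's API:

#include <stdbool.h>
#include <stdio.h>

enum { FEAT_PVUNLOCK, FEAT_VCPUPREEMPT, NR_FEATS };

static bool cpu_caps[NR_FEATS];

static void setup_force_cap(int feat) { cpu_caps[feat] = true; }
static bool cpu_has(int feat)         { return cpu_caps[feat]; }

/* pretend these report whether the pv ops were left at their native defaults */
static bool pv_is_native_spin_unlock(void)       { return false; }
static bool pv_is_native_vcpu_is_preempted(void) { return true;  }

int main(void)
{
        if (!pv_is_native_spin_unlock())
                setup_force_cap(FEAT_PVUNLOCK);
        if (!pv_is_native_vcpu_is_preempted())
                setup_force_cap(FEAT_VCPUPREEMPT);

        printf("PVUNLOCK=%d VCPUPREEMPT=%d\n",
               cpu_has(FEAT_PVUNLOCK), cpu_has(FEAT_VCPUPREEMPT));
        return 0;
}
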
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index c60222ab8ab9..04cafc057bed 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -14,6 +14,7 @@
#include <linux/highmem.h>
#include <linux/kprobes.h>
#include <linux/pgtable.h>
+#include <linux/static_call.h>
#include <asm/bug.h>
#include <asm/paravirt.h>
@@ -52,7 +53,10 @@ void __init default_banner(void)
}
/* Undefined instruction for dealing with missing ops pointers. */
-static const unsigned char ud2a[] = { 0x0f, 0x0b };
+static void paravirt_BUG(void)
+{
+ BUG();
+}
struct branch {
unsigned char opcode;
@@ -85,25 +89,6 @@ u64 notrace _paravirt_ident_64(u64 x)
{
return x;
}
-
-static unsigned paravirt_patch_jmp(void *insn_buff, const void *target,
- unsigned long addr, unsigned len)
-{
- struct branch *b = insn_buff;
- unsigned long delta = (unsigned long)target - (addr+5);
-
- if (len < 5) {
-#ifdef CONFIG_RETPOLINE
- WARN_ONCE(1, "Failing to patch indirect JMP in %ps\n", (void *)addr);
-#endif
- return len; /* call too long for patch site */
- }
-
- b->opcode = 0xe9; /* jmp */
- b->delta = delta;
-
- return 5;
-}
#endif
DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key);
@@ -114,8 +99,8 @@ void __init native_pv_lock_init(void)
static_branch_disable(&virt_spin_lock_key);
}
-unsigned paravirt_patch_default(u8 type, void *insn_buff,
- unsigned long addr, unsigned len)
+unsigned int paravirt_patch(u8 type, void *insn_buff, unsigned long addr,
+ unsigned int len)
{
/*
* Neat trick to map patch type back to the call within the
@@ -125,20 +110,10 @@ unsigned paravirt_patch_default(u8 type, void *insn_buff,
unsigned ret;
if (opfunc == NULL)
- /* If there's no function, patch it with a ud2a (BUG) */
- ret = paravirt_patch_insns(insn_buff, len, ud2a, ud2a+sizeof(ud2a));
+ /* If there's no function, patch it with paravirt_BUG() */
+ ret = paravirt_patch_call(insn_buff, paravirt_BUG, addr, len);
else if (opfunc == _paravirt_nop)
ret = 0;
-
-#ifdef CONFIG_PARAVIRT_XXL
- /* identity functions just return their single argument */
- else if (opfunc == _paravirt_ident_64)
- ret = paravirt_patch_ident_64(insn_buff, len);
-
- else if (type == PARAVIRT_PATCH(cpu.iret))
- /* If operation requires a jmp, then jmp */
- ret = paravirt_patch_jmp(insn_buff, opfunc, addr, len);
-#endif
else
/* Otherwise call the function. */
ret = paravirt_patch_call(insn_buff, opfunc, addr, len);
@@ -146,19 +121,6 @@ unsigned paravirt_patch_default(u8 type, void *insn_buff,
return ret;
}
-unsigned paravirt_patch_insns(void *insn_buff, unsigned len,
- const char *start, const char *end)
-{
- unsigned insn_len = end - start;
-
- /* Alternative instruction is too large for the patch site and we cannot continue: */
- BUG_ON(insn_len > len || start == NULL);
-
- memcpy(insn_buff, start, insn_len);
-
- return insn_len;
-}
-
struct static_key paravirt_steal_enabled;
struct static_key paravirt_steal_rq_enabled;
@@ -167,6 +129,14 @@ static u64 native_steal_clock(int cpu)
return 0;
}
+DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock);
+DEFINE_STATIC_CALL(pv_sched_clock, native_sched_clock);
+
+void paravirt_set_sched_clock(u64 (*func)(void))
+{
+ static_call_update(pv_sched_clock, func);
+}
+
/* These are in entry.S */
extern void native_iret(void);
@@ -269,13 +239,6 @@ struct pv_info pv_info = {
#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64)
struct paravirt_patch_template pv_ops = {
- /* Init ops. */
- .init.patch = native_patch,
-
- /* Time ops. */
- .time.sched_clock = native_sched_clock,
- .time.steal_clock = native_steal_clock,
-
/* Cpu ops. */
.cpu.io_delay = native_io_delay,
@@ -308,8 +271,6 @@ struct paravirt_patch_template pv_ops = {
.cpu.load_sp0 = native_load_sp0,
- .cpu.iret = native_iret,
-
#ifdef CONFIG_X86_IOPL_IOPERM
.cpu.invalidate_io_bitmap = native_tss_invalidate_io_bitmap,
.cpu.update_io_bitmap = native_tss_update_io_bitmap,
@@ -330,7 +291,7 @@ struct paravirt_patch_template pv_ops = {
.mmu.flush_tlb_user = native_flush_tlb_local,
.mmu.flush_tlb_kernel = native_flush_tlb_global,
.mmu.flush_tlb_one_user = native_flush_tlb_one_user,
- .mmu.flush_tlb_others = native_flush_tlb_others,
+ .mmu.flush_tlb_multi = native_flush_tlb_multi,
.mmu.tlb_remove_table =
(void (*)(struct mmu_gather *, void *))tlb_remove_page,
@@ -414,6 +375,8 @@ struct paravirt_patch_template pv_ops = {
NOKPROBE_SYMBOL(native_get_debugreg);
NOKPROBE_SYMBOL(native_set_debugreg);
NOKPROBE_SYMBOL(native_load_idt);
+
+void (*paravirt_iret)(void) = native_iret;
#endif
EXPORT_SYMBOL(pv_ops);
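
The paravirt.c hunks above retire the bespoke patching paths for sched_clock and steal_clock in favour of static calls updated through paravirt_set_sched_clock() and static_call_update(). Plain C cannot patch call sites, so the sketch below only emulates the setter-plus-dispatch shape with a function pointer; the kernel's static_call additionally rewrites the call site into a direct call:

#include <stdio.h>

typedef unsigned long long u64;

static u64 native_sched_clock(void)   { return 1000; }
static u64 kvm_sched_clock_read(void) { return 2000; }

/* stands in for DEFINE_STATIC_CALL(pv_sched_clock, native_sched_clock) */
static u64 (*pv_sched_clock)(void) = native_sched_clock;

static void paravirt_set_sched_clock(u64 (*func)(void))
{
        /* in the kernel: static_call_update(pv_sched_clock, func) */
        pv_sched_clock = func;
}

int main(void)
{
        printf("before: %llu\n", pv_sched_clock());
        paravirt_set_sched_clock(kvm_sched_clock_read);
        printf("after : %llu\n", pv_sched_clock());
        return 0;
}
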
diff --git a/arch/x86/kernel/paravirt_patch.c b/arch/x86/kernel/paravirt_patch.c
deleted file mode 100644
index abd27ec67397..000000000000
--- a/arch/x86/kernel/paravirt_patch.c
+++ /dev/null
@@ -1,99 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/stringify.h>
-
-#include <asm/paravirt.h>
-#include <asm/asm-offsets.h>
-
-#define PSTART(d, m) \
- patch_data_##d.m
-
-#define PEND(d, m) \
- (PSTART(d, m) + sizeof(patch_data_##d.m))
-
-#define PATCH(d, m, insn_buff, len) \
- paravirt_patch_insns(insn_buff, len, PSTART(d, m), PEND(d, m))
-
-#define PATCH_CASE(ops, m, data, insn_buff, len) \
- case PARAVIRT_PATCH(ops.m): \
- return PATCH(data, ops##_##m, insn_buff, len)
-
-#ifdef CONFIG_PARAVIRT_XXL
-struct patch_xxl {
- const unsigned char irq_irq_disable[1];
- const unsigned char irq_irq_enable[1];
- const unsigned char irq_save_fl[2];
- const unsigned char mmu_read_cr2[3];
- const unsigned char mmu_read_cr3[3];
- const unsigned char mmu_write_cr3[3];
- const unsigned char cpu_wbinvd[2];
- const unsigned char mov64[3];
-};
-
-static const struct patch_xxl patch_data_xxl = {
- .irq_irq_disable = { 0xfa }, // cli
- .irq_irq_enable = { 0xfb }, // sti
- .irq_save_fl = { 0x9c, 0x58 }, // pushf; pop %[re]ax
- .mmu_read_cr2 = { 0x0f, 0x20, 0xd0 }, // mov %cr2, %[re]ax
- .mmu_read_cr3 = { 0x0f, 0x20, 0xd8 }, // mov %cr3, %[re]ax
- .mmu_write_cr3 = { 0x0f, 0x22, 0xdf }, // mov %rdi, %cr3
- .cpu_wbinvd = { 0x0f, 0x09 }, // wbinvd
- .mov64 = { 0x48, 0x89, 0xf8 }, // mov %rdi, %rax
-};
-
-unsigned int paravirt_patch_ident_64(void *insn_buff, unsigned int len)
-{
- return PATCH(xxl, mov64, insn_buff, len);
-}
-# endif /* CONFIG_PARAVIRT_XXL */
-
-#ifdef CONFIG_PARAVIRT_SPINLOCKS
-struct patch_lock {
- unsigned char queued_spin_unlock[3];
- unsigned char vcpu_is_preempted[2];
-};
-
-static const struct patch_lock patch_data_lock = {
- .vcpu_is_preempted = { 0x31, 0xc0 }, // xor %eax, %eax
-
-# ifdef CONFIG_X86_64
- .queued_spin_unlock = { 0xc6, 0x07, 0x00 }, // movb $0, (%rdi)
-# else
- .queued_spin_unlock = { 0xc6, 0x00, 0x00 }, // movb $0, (%eax)
-# endif
-};
-#endif /* CONFIG_PARAVIRT_SPINLOCKS */
-
-unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr,
- unsigned int len)
-{
- switch (type) {
-
-#ifdef CONFIG_PARAVIRT_XXL
- PATCH_CASE(irq, save_fl, xxl, insn_buff, len);
- PATCH_CASE(irq, irq_enable, xxl, insn_buff, len);
- PATCH_CASE(irq, irq_disable, xxl, insn_buff, len);
-
- PATCH_CASE(mmu, read_cr2, xxl, insn_buff, len);
- PATCH_CASE(mmu, read_cr3, xxl, insn_buff, len);
- PATCH_CASE(mmu, write_cr3, xxl, insn_buff, len);
-
- PATCH_CASE(cpu, wbinvd, xxl, insn_buff, len);
-#endif
-
-#ifdef CONFIG_PARAVIRT_SPINLOCKS
- case PARAVIRT_PATCH(lock.queued_spin_unlock):
- if (pv_is_native_spin_unlock())
- return PATCH(lock, queued_spin_unlock, insn_buff, len);
- break;
-
- case PARAVIRT_PATCH(lock.vcpu_is_preempted):
- if (pv_is_native_vcpu_is_preempted())
- return PATCH(lock, vcpu_is_preempted, insn_buff, len);
- break;
-#endif
- default:
- break;
- }
-
- return paravirt_patch_default(type, insn_buff, addr, len);
-}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 9c214d7085a4..43cbfc84153a 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -63,14 +63,9 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = {
*/
.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
- /*
- * .sp1 is cpu_current_top_of_stack. The init task never
- * runs user code, but cpu_current_top_of_stack should still
- * be well defined before the first context switch.
- */
+#ifdef CONFIG_X86_32
.sp1 = TOP_OF_INIT_STACK,
-#ifdef CONFIG_X86_32
.ss0 = __KERNEL_DS,
.ss1 = __KERNEL_CS,
#endif
@@ -451,7 +446,7 @@ void speculative_store_bypass_ht_init(void)
* First HT sibling to come up on the core. Link shared state of
* the first HT sibling to itself. The siblings on the same core
* which come up later will see the shared state pointer and link
- * themself to the state of this CPU.
+ * themselves to the state of this CPU.
*/
st->shared_state = st;
}
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 11065dc03f5b..eda37df016f0 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -89,7 +89,7 @@ u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
/*
* Assumption here is that last_value, a global accumulator, always goes
* forward. If we are less than that, we should not be much smaller.
- * We assume there is an error marging we're inside, and then the correction
+ * We assume there is an error margin we're inside, and then the correction
* does not sacrifice accuracy.
*
* For reads: global may have changed between test and return,
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index 94b33885f8d2..f469153eca8a 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -107,7 +107,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
* - Write protect disabled
* - No task switch
* - Don't do FP software emulation.
- * - Proctected mode enabled
+ * - Protected mode enabled
*/
movl %cr0, %eax
andl $~(X86_CR0_PG | X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %eax
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index a4d9a261425b..c53271aebb64 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -121,7 +121,7 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped)
* - Write protect disabled
* - No task switch
* - Don't do FP software emulation.
- * - Proctected mode enabled
+ * - Protected mode enabled
*/
movq %cr0, %rax
andq $~(X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %rax
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 5ecd69a48393..72920af0b3c0 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -65,7 +65,7 @@ RESERVE_BRK(dmi_alloc, 65536);
/*
* Range of the BSS area. The size of the BSS area is determined
- * at link time, with RESERVE_BRK*() facility reserving additional
+ * at link time, with RESERVE_BRK() facility reserving additional
* chunks.
*/
unsigned long _brk_start = (unsigned long)__brk_base;
@@ -633,11 +633,16 @@ static void __init trim_snb_memory(void)
printk(KERN_DEBUG "reserving inaccessible SNB gfx pages\n");
/*
- * Reserve all memory below the 1 MB mark that has not
- * already been reserved.
+ * SandyBridge integrated graphics devices have a bug that prevents
+ * them from accessing certain memory ranges, namely anything below
+ * 1M and in the pages listed in bad_pages[] above.
+ *
+ * To avoid these pages ever being accessed by SNB gfx devices,
+ * reserve all memory below the 1 MB mark and the bad_pages that
+ * have not already been reserved at boot time.
*/
memblock_reserve(0, 1<<20);
-
+
for (i = 0; i < ARRAY_SIZE(bad_pages); i++) {
if (memblock_reserve(bad_pages[i], PAGE_SIZE))
printk(KERN_WARNING "failed to reserve 0x%08lx\n",
@@ -645,18 +650,6 @@ static void __init trim_snb_memory(void)
}
}
-/*
- * Here we put platform-specific memory range workarounds, i.e.
- * memory known to be corrupt or otherwise in need to be reserved on
- * specific platforms.
- *
- * If this gets used more widely it could use a real dispatch mechanism.
- */
-static void __init trim_platform_memory_ranges(void)
-{
- trim_snb_memory();
-}
-
static void __init trim_bios_range(void)
{
/*
@@ -725,11 +718,41 @@ static int __init parse_reservelow(char *p)
early_param("reservelow", parse_reservelow);
-static void __init trim_low_memory_range(void)
+static void __init early_reserve_memory(void)
{
+ /*
+ * Reserve the memory occupied by the kernel between _text and
+ * __end_of_kernel_reserve symbols. Any kernel sections after the
+ * __end_of_kernel_reserve symbol must be explicitly reserved with a
+ * separate memblock_reserve() or they will be discarded.
+ */
+ memblock_reserve(__pa_symbol(_text),
+ (unsigned long)__end_of_kernel_reserve - (unsigned long)_text);
+
+ /*
+ * The first 4Kb of memory is a BIOS owned area, but generally it is
+ * not listed as such in the E820 table.
+ *
+ * Reserve the first memory page and typically some additional
+ * memory (64KiB by default) since some BIOSes are known to corrupt
+ * low memory. See the Kconfig help text for X86_RESERVE_LOW.
+ *
+ * In addition, make sure page 0 is always reserved because on
+ * systems with L1TF its contents can be leaked to user processes.
+ */
memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE));
+
+ early_reserve_initrd();
+
+ if (efi_enabled(EFI_BOOT))
+ efi_memblock_x86_reserve_range();
+
+ memblock_x86_reserve_range_setup_data();
+
+ reserve_ibft_region();
+ reserve_bios_regions();
}
-
+
/*
* Dump out kernel offset information on panic.
*/
@@ -764,29 +787,6 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
void __init setup_arch(char **cmdline_p)
{
- /*
- * Reserve the memory occupied by the kernel between _text and
- * __end_of_kernel_reserve symbols. Any kernel sections after the
- * __end_of_kernel_reserve symbol must be explicitly reserved with a
- * separate memblock_reserve() or they will be discarded.
- */
- memblock_reserve(__pa_symbol(_text),
- (unsigned long)__end_of_kernel_reserve - (unsigned long)_text);
-
- /*
- * Make sure page 0 is always reserved because on systems with
- * L1TF its contents can be leaked to user processes.
- */
- memblock_reserve(0, PAGE_SIZE);
-
- early_reserve_initrd();
-
- /*
- * At this point everything still needed from the boot loader
- * or BIOS or kernel text should be early reserved or marked not
- * RAM in e820. All other memory is free game.
- */
-
#ifdef CONFIG_X86_32
memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
@@ -822,7 +822,6 @@ void __init setup_arch(char **cmdline_p)
idt_setup_early_traps();
early_cpu_init();
- arch_init_ideal_nops();
jump_label_init();
static_call_init();
early_ioremap_init();
@@ -910,8 +909,18 @@ void __init setup_arch(char **cmdline_p)
parse_early_param();
- if (efi_enabled(EFI_BOOT))
- efi_memblock_x86_reserve_range();
+ /*
+ * Do some memory reservations *before* memory is added to
+ * memblock, so memblock allocations won't overwrite it.
+ * Do it after early param, so that an (unlikely) panic can still be
+ * reported over the serial console.
+ *
+ * After this point everything still needed from the boot loader or
+ * firmware or kernel text should be early reserved or marked not
+ * RAM in e820. All other memory is free game.
+ */
+ early_reserve_memory();
+
#ifdef CONFIG_MEMORY_HOTPLUG
/*
* Memory used by the kernel cannot be hot-removed because Linux
@@ -938,9 +947,6 @@ void __init setup_arch(char **cmdline_p)
x86_report_nx();
- /* after early param, so could get panic from serial */
- memblock_x86_reserve_range_setup_data();
-
if (acpi_mps_check()) {
#ifdef CONFIG_X86_LOCAL_APIC
disable_apic = 1;
@@ -1032,22 +1038,17 @@ void __init setup_arch(char **cmdline_p)
*/
find_smp_config();
- reserve_ibft_region();
-
early_alloc_pgt_buf();
/*
* Need to conclude brk, before e820__memblock_setup()
- * it could use memblock_find_in_range, could overlap with
- * brk area.
+ * it could use memblock_find_in_range, could overlap with
+ * brk area.
*/
reserve_brk();
cleanup_highmap();
- /* Look for ACPI tables and reserve memory occupied by them. */
- acpi_boot_table_init();
-
memblock_set_current_limit(ISA_END_ADDRESS);
e820__memblock_setup();
@@ -1057,8 +1058,6 @@ void __init setup_arch(char **cmdline_p)
*/
sev_setup_arch();
- reserve_bios_regions();
-
efi_fake_memmap();
efi_find_mirror();
efi_esrt_init();
@@ -1084,8 +1083,12 @@ void __init setup_arch(char **cmdline_p)
reserve_real_mode();
- trim_platform_memory_ranges();
- trim_low_memory_range();
+ /*
+ * Reserving memory causing GPU hangs on Sandy Bridge integrated
+ * graphics devices should be done after we allocated memory under
+ * 1M for the real mode trampoline.
+ */
+ trim_snb_memory();
init_mem_mapping();
@@ -1132,6 +1135,8 @@ void __init setup_arch(char **cmdline_p)
reserve_initrd();
acpi_table_upgrade();
+ /* Look for ACPI tables and reserve memory occupied by them. */
+ acpi_boot_table_init();
vsmp_init();
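
The consolidation into early_reserve_memory() above is an ordering fix: firmware- and kernel-owned ranges have to be marked reserved before the first memblock allocation can hand them out. A toy userspace illustration of that hazard (the page map and allocator are invented, not kernel code):

#include <stdbool.h>
#include <stdio.h>

#define NR_PAGES 16

static bool claimed[NR_PAGES];          /* page already reserved or allocated */

static int toy_alloc_page(void)
{
        for (int i = 0; i < NR_PAGES; i++) {
                if (!claimed[i]) {
                        claimed[i] = true;
                        return i;
                }
        }
        return -1;
}

int main(void)
{
        /* Page 0 is firmware-owned, but it has not been reserved yet, so
         * the allocator happily hands it out. */
        int page = toy_alloc_page();

        printf("allocated page %d (should have been reserved first)\n", page);

        /* Reserving it afterwards cannot undo the damage. */
        claimed[0] = true;
        return 0;
}
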
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index fd945ce78554..0941d2f44f2a 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -224,7 +224,6 @@ void __init setup_per_cpu_areas(void)
per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
per_cpu(cpu_number, cpu) = cpu;
setup_percpu_segment(cpu);
- setup_stack_canary_segment(cpu);
/*
* Copy data used in early init routines from the
* initial arrays to the per cpu data areas. These
diff --git a/arch/x86/kernel/sev-es-shared.c b/arch/x86/kernel/sev-es-shared.c
index cdc04d091242..0aa9f13efd57 100644
--- a/arch/x86/kernel/sev-es-shared.c
+++ b/arch/x86/kernel/sev-es-shared.c
@@ -24,7 +24,7 @@ static bool __init sev_es_check_cpu_features(void)
return true;
}
-static void sev_es_terminate(unsigned int reason)
+static void __noreturn sev_es_terminate(unsigned int reason)
{
u64 val = GHCB_SEV_TERMINATE;
@@ -186,7 +186,6 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
* make it accessible to the hypervisor.
*
* In particular, check for:
- * - Hypervisor CPUID bit
* - Availability of CPUID leaf 0x8000001f
* - SEV CPUID bit.
*
@@ -194,10 +193,7 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
* can't be checked here.
*/
- if ((fn == 1 && !(regs->cx & BIT(31))))
- /* Hypervisor bit */
- goto fail;
- else if (fn == 0x80000000 && (regs->ax < 0x8000001f))
+ if (fn == 0x80000000 && (regs->ax < 0x8000001f))
/* SEV leaf check */
goto fail;
else if ((fn == 0x8000001f && !(regs->ax & BIT(1))))
@@ -210,12 +206,8 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code)
return;
fail:
- sev_es_wr_ghcb_msr(GHCB_SEV_TERMINATE);
- VMGEXIT();
-
- /* Shouldn't get here - if we do halt the machine */
- while (true)
- asm volatile("hlt\n");
+ /* Terminate the guest */
+ sev_es_terminate(GHCB_SEV_ES_REASON_GENERAL_REQUEST);
}
static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt,
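
Marking sev_es_terminate() __noreturn, as the hunk above does, is what lets the do_vc_no_ghcb() failure path drop its defensive hlt-loop. A tiny standalone C11 sketch of the same idea using the standard _Noreturn specifier (the function names here are illustrative):

#include <stdio.h>
#include <stdlib.h>

static _Noreturn void terminate_guest(unsigned int reason)
{
        fprintf(stderr, "terminating, reason %u\n", reason);
        exit(1);        /* never returns */
}

int main(void)
{
        int sev_feature_present = 0;    /* pretend the CPUID check failed */

        if (!sev_feature_present)
                terminate_guest(1);

        /* No "halt forever" fallback is needed after the call above:
         * the compiler knows terminate_guest() never returns. */
        return 0;
}
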
diff --git a/arch/x86/kernel/sev-es.c b/arch/x86/kernel/sev-es.c
index 04a780abb512..73873b007838 100644
--- a/arch/x86/kernel/sev-es.c
+++ b/arch/x86/kernel/sev-es.c
@@ -137,29 +137,41 @@ static __always_inline bool on_vc_stack(struct pt_regs *regs)
}
/*
- * This function handles the case when an NMI is raised in the #VC exception
- * handler entry code. In this case, the IST entry for #VC must be adjusted, so
- * that any subsequent #VC exception will not overwrite the stack contents of the
- * interrupted #VC handler.
+ * This function handles the case when an NMI is raised in the #VC
+ * exception handler entry code, before the #VC handler has switched off
+ * its IST stack. In this case, the IST entry for #VC must be adjusted,
+ * so that any nested #VC exception will not overwrite the stack
+ * contents of the interrupted #VC handler.
*
* The IST entry is adjusted unconditionally so that it can be also be
- * unconditionally adjusted back in sev_es_ist_exit(). Otherwise a nested
- * sev_es_ist_exit() call may adjust back the IST entry too early.
+ * unconditionally adjusted back in __sev_es_ist_exit(). Otherwise a
+ * nested sev_es_ist_exit() call may adjust back the IST entry too
+ * early.
+ *
+ * The __sev_es_ist_enter() and __sev_es_ist_exit() functions always run
+ * on the NMI IST stack, as they are only called from NMI handling code
+ * right now.
*/
void noinstr __sev_es_ist_enter(struct pt_regs *regs)
{
unsigned long old_ist, new_ist;
/* Read old IST entry */
- old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
+ new_ist = old_ist = __this_cpu_read(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC]);
- /* Make room on the IST stack */
+ /*
+ * If NMI happened while on the #VC IST stack, set the new IST
+ * value below regs->sp, so that the interrupted stack frame is
+ * not overwritten by subsequent #VC exceptions.
+ */
if (on_vc_stack(regs))
- new_ist = ALIGN_DOWN(regs->sp, 8) - sizeof(old_ist);
- else
- new_ist = old_ist - sizeof(old_ist);
+ new_ist = regs->sp;
- /* Store old IST entry */
+ /*
+ * Reserve additional 8 bytes and store old IST value so this
+ * adjustment can be unrolled in __sev_es_ist_exit().
+ */
+ new_ist -= sizeof(old_ist);
*(unsigned long *)new_ist = old_ist;
/* Set new IST entry */
@@ -251,39 +263,54 @@ static int vc_fetch_insn_kernel(struct es_em_ctxt *ctxt,
return copy_from_kernel_nofault(buffer, (unsigned char *)ctxt->regs->ip, MAX_INSN_SIZE);
}
-static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
+static enum es_result __vc_decode_user_insn(struct es_em_ctxt *ctxt)
{
char buffer[MAX_INSN_SIZE];
- enum es_result ret;
int res;
- if (user_mode(ctxt->regs)) {
- res = insn_fetch_from_user_inatomic(ctxt->regs, buffer);
- if (!res) {
- ctxt->fi.vector = X86_TRAP_PF;
- ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER;
- ctxt->fi.cr2 = ctxt->regs->ip;
- return ES_EXCEPTION;
- }
+ res = insn_fetch_from_user_inatomic(ctxt->regs, buffer);
+ if (!res) {
+ ctxt->fi.vector = X86_TRAP_PF;
+ ctxt->fi.error_code = X86_PF_INSTR | X86_PF_USER;
+ ctxt->fi.cr2 = ctxt->regs->ip;
+ return ES_EXCEPTION;
+ }
- if (!insn_decode(&ctxt->insn, ctxt->regs, buffer, res))
- return ES_DECODE_FAILED;
- } else {
- res = vc_fetch_insn_kernel(ctxt, buffer);
- if (res) {
- ctxt->fi.vector = X86_TRAP_PF;
- ctxt->fi.error_code = X86_PF_INSTR;
- ctxt->fi.cr2 = ctxt->regs->ip;
- return ES_EXCEPTION;
- }
+ if (!insn_decode_from_regs(&ctxt->insn, ctxt->regs, buffer, res))
+ return ES_DECODE_FAILED;
- insn_init(&ctxt->insn, buffer, MAX_INSN_SIZE - res, 1);
- insn_get_length(&ctxt->insn);
+ if (ctxt->insn.immediate.got)
+ return ES_OK;
+ else
+ return ES_DECODE_FAILED;
+}
+
+static enum es_result __vc_decode_kern_insn(struct es_em_ctxt *ctxt)
+{
+ char buffer[MAX_INSN_SIZE];
+ int res, ret;
+
+ res = vc_fetch_insn_kernel(ctxt, buffer);
+ if (res) {
+ ctxt->fi.vector = X86_TRAP_PF;
+ ctxt->fi.error_code = X86_PF_INSTR;
+ ctxt->fi.cr2 = ctxt->regs->ip;
+ return ES_EXCEPTION;
}
- ret = ctxt->insn.immediate.got ? ES_OK : ES_DECODE_FAILED;
+ ret = insn_decode(&ctxt->insn, buffer, MAX_INSN_SIZE, INSN_MODE_64);
+ if (ret < 0)
+ return ES_DECODE_FAILED;
+ else
+ return ES_OK;
+}
- return ret;
+static enum es_result vc_decode_insn(struct es_em_ctxt *ctxt)
+{
+ if (user_mode(ctxt->regs))
+ return __vc_decode_user_insn(ctxt);
+ else
+ return __vc_decode_kern_insn(ctxt);
}
static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
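The rework above splits vc_decode_insn() into one helper per privilege level and keeps only the mode check in the dispatcher. A compact user-space sketch of that shape, with purely illustrative names and return values (the real helpers fetch via insn_fetch_from_user_inatomic()/vc_fetch_insn_kernel() and decode via insn_decode_from_regs()/insn_decode()):

    #include <stdbool.h>
    #include <stdio.h>

    enum result { RES_OK, RES_DECODE_FAILED, RES_EXCEPTION };

    struct ctxt {
            bool user_mode;
            const unsigned char *insn;      /* NULL models a fetch fault */
            int len;                        /* 0 models a decode failure */
    };

    static enum result decode_user_insn(struct ctxt *c)
    {
            if (!c->insn)
                    return RES_EXCEPTION;   /* user fetch faulted */
            return c->len ? RES_OK : RES_DECODE_FAILED;
    }

    static enum result decode_kern_insn(struct ctxt *c)
    {
            if (!c->insn)
                    return RES_EXCEPTION;   /* kernel fetch faulted */
            return c->len ? RES_OK : RES_DECODE_FAILED;
    }

    /* The dispatcher mirrors the new vc_decode_insn(): mode check only. */
    static enum result decode_insn(struct ctxt *c)
    {
            return c->user_mode ? decode_user_insn(c) : decode_kern_insn(c);
    }

    int main(void)
    {
            struct ctxt c = { .user_mode = false,
                              .insn = (const unsigned char *)"\x90", .len = 1 };
            printf("decode result: %d\n", decode_insn(&c));
            return 0;
    }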
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index f306e85a08a6..a06cb107c0e8 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -492,7 +492,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
* SS descriptor, but we do need SS to be valid. It's possible
* that the old SS is entirely bogus -- this can happen if the
* signal we're trying to deliver is #GP or #SS caused by a bad
- * SS value. We also have a compatbility issue here: DOSEMU
+ * SS value. We also have a compatibility issue here: DOSEMU
* relies on the contents of the SS register indicating the
* SS value at the time of the signal, even though that code in
* DOSEMU predates sigreturn's ability to restore SS. (DOSEMU
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
index a5330ff498f0..0e5d0a7e203b 100644
--- a/arch/x86/kernel/signal_compat.c
+++ b/arch/x86/kernel/signal_compat.c
@@ -29,7 +29,7 @@ static inline void signal_compat_build_tests(void)
BUILD_BUG_ON(NSIGFPE != 15);
BUILD_BUG_ON(NSIGSEGV != 9);
BUILD_BUG_ON(NSIGBUS != 5);
- BUILD_BUG_ON(NSIGTRAP != 5);
+ BUILD_BUG_ON(NSIGTRAP != 6);
BUILD_BUG_ON(NSIGCHLD != 6);
BUILD_BUG_ON(NSIGSYS != 2);
@@ -138,6 +138,9 @@ static inline void signal_compat_build_tests(void)
BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x20);
BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pkey) != 0x14);
+ BUILD_BUG_ON(offsetof(siginfo_t, si_perf) != 0x18);
+ BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf) != 0x10);
+
CHECK_CSI_OFFSET(_sigpoll);
CHECK_CSI_SIZE (_sigpoll, 2*sizeof(int));
CHECK_SI_SIZE (_sigpoll, 4*sizeof(int));
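The new si_perf checks above pin ABI-visible offsets at build time. The same technique is available in plain C via _Static_assert(); a self-contained sketch with a made-up structure (the offsets below describe only this demo, not the real siginfo_t layout):

    #include <stddef.h>
    #include <stdint.h>

    struct demo_siginfo {
            int             si_signo;
            int             si_errno;
            int             si_code;
            uint32_t        pad;
            uint64_t        si_perf;        /* expected at offset 0x10 here */
    };

    /* Fails the build, not the boot, if someone reshuffles the fields. */
    _Static_assert(offsetof(struct demo_siginfo, si_perf) == 0x10,
                   "si_perf moved; ABI layout broken");

    int main(void)
    {
            return 0;
    }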
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index eff4ce3b10da..06db901fabe8 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -67,7 +67,7 @@
* 5AP. symmetric IO mode (normal Linux operation) not affected.
* 'noapic' mode has vector 0xf filled out properly.
* 6AP. 'noapic' mode might be affected - fixed in later steppings
- * 7AP. We do not assume writes to the LVT deassering IRQs
+ * 7AP. We do not assume writes to the LVT deasserting IRQs
* 8AP. We do not enable low power mode (deep sleep) during MP bootup
* 9AP. We do not use mixed mode
*
@@ -204,7 +204,7 @@ static void native_stop_other_cpus(int wait)
}
/*
* Don't wait longer than 10 ms if the caller didn't
- * reqeust it. If wait is true, the machine hangs here if
+ * request it. If wait is true, the machine hangs here if
* one or more CPUs do not reach shutdown state.
*/
timeout = USEC_PER_MSEC * 10;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 16703c35a944..7ffb0cf3f997 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -458,29 +458,52 @@ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
return false;
}
+static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+ if (c->phys_proc_id == o->phys_proc_id &&
+ c->cpu_die_id == o->cpu_die_id)
+ return true;
+ return false;
+}
+
/*
- * Define snc_cpu[] for SNC (Sub-NUMA Cluster) CPUs.
+ * Unlike the other levels, we do not enforce keeping a
+ * multicore group inside a NUMA node. If this happens, we will
+ * discard the MC level of the topology later.
+ */
+static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
+{
+ if (c->phys_proc_id == o->phys_proc_id)
+ return true;
+ return false;
+}
+
+/*
+ * Define intel_cod_cpu[] for Intel COD (Cluster-on-Die) CPUs.
*
- * These are Intel CPUs that enumerate an LLC that is shared by
- * multiple NUMA nodes. The LLC on these systems is shared for
- * off-package data access but private to the NUMA node (half
- * of the package) for on-package access.
+ * Any Intel CPU that has multiple nodes per package and does not
+ * match intel_cod_cpu[] has the SNC (Sub-NUMA Cluster) topology.
*
- * CPUID (the source of the information about the LLC) can only
- * enumerate the cache as being shared *or* unshared, but not
- * this particular configuration. The CPU in this case enumerates
- * the cache to be shared across the entire package (spanning both
- * NUMA nodes).
+ * When in SNC mode, these CPUs enumerate an LLC that is shared
+ * by multiple NUMA nodes. The LLC is shared for off-package data
+ * access but private to the NUMA node (half of the package) for
+ * on-package access. CPUID (the source of the information about
+ * the LLC) can only enumerate the cache as shared or unshared,
+ * but not this particular configuration.
*/
-static const struct x86_cpu_id snc_cpu[] = {
- X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, NULL),
+static const struct x86_cpu_id intel_cod_cpu[] = {
+ X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, 0), /* COD */
+ X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, 0), /* COD */
+ X86_MATCH_INTEL_FAM6_MODEL(ANY, 1), /* SNC */
{}
};
static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
{
+ const struct x86_cpu_id *id = x86_match_cpu(intel_cod_cpu);
int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
+ bool intel_snc = id && id->driver_data;
/* Do not match if we do not have a valid APICID for cpu: */
if (per_cpu(cpu_llc_id, cpu1) == BAD_APICID)
@@ -495,32 +518,12 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
* means 'c' does not share the LLC of 'o'. This will be
* reflected to userspace.
*/
- if (!topology_same_node(c, o) && x86_match_cpu(snc_cpu))
+ if (match_pkg(c, o) && !topology_same_node(c, o) && intel_snc)
return false;
return topology_sane(c, o, "llc");
}
-/*
- * Unlike the other levels, we do not enforce keeping a
- * multicore group inside a NUMA node. If this happens, we will
- * discard the MC level of the topology later.
- */
-static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
-{
- if (c->phys_proc_id == o->phys_proc_id)
- return true;
- return false;
-}
-
-static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
-{
- if ((c->phys_proc_id == o->phys_proc_id) &&
- (c->cpu_die_id == o->cpu_die_id))
- return true;
- return false;
-}
-
#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
static inline int x86_sched_itmt_flags(void)
@@ -592,14 +595,23 @@ void set_cpu_sibling_map(int cpu)
for_each_cpu(i, cpu_sibling_setup_mask) {
o = &cpu_data(i);
+ if (match_pkg(c, o) && !topology_same_node(c, o))
+ x86_has_numa_in_package = true;
+
if ((i == cpu) || (has_smt && match_smt(c, o)))
link_mask(topology_sibling_cpumask, cpu, i);
if ((i == cpu) || (has_mp && match_llc(c, o)))
link_mask(cpu_llc_shared_mask, cpu, i);
+ if ((i == cpu) || (has_mp && match_die(c, o)))
+ link_mask(topology_die_cpumask, cpu, i);
}
+ threads = cpumask_weight(topology_sibling_cpumask(cpu));
+ if (threads > __max_smt_threads)
+ __max_smt_threads = threads;
+
/*
* This needs a separate iteration over the cpus because we rely on all
* topology_sibling_cpumask links to be set-up.
@@ -613,8 +625,7 @@ void set_cpu_sibling_map(int cpu)
/*
* Does this new cpu bringup a new core?
*/
- if (cpumask_weight(
- topology_sibling_cpumask(cpu)) == 1) {
+ if (threads == 1) {
/*
* for each core in package, increment
* the booted_cores for this new cpu
@@ -631,16 +642,7 @@ void set_cpu_sibling_map(int cpu)
} else if (i != cpu && !c->booted_cores)
c->booted_cores = cpu_data(i).booted_cores;
}
- if (match_pkg(c, o) && !topology_same_node(c, o))
- x86_has_numa_in_package = true;
-
- if ((i == cpu) || (has_mp && match_die(c, o)))
- link_mask(topology_die_cpumask, cpu, i);
}
-
- threads = cpumask_weight(topology_sibling_cpumask(cpu));
- if (threads > __max_smt_threads)
- __max_smt_threads = threads;
}
/* maps the cpu to the sched domain representing multi-core */
@@ -1407,7 +1409,7 @@ void __init calculate_max_logical_packages(void)
int ncpus;
/*
- * Today neither Intel nor AMD support heterogenous systems so
+ * Today neither Intel nor AMD support heterogeneous systems so
* extrapolate the boot cpu's data to all packages.
*/
ncpus = cpu_data(0).booted_cores * topology_max_smt_threads();
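The intel_cod_cpu[] rework above keys each table entry with driver_data so a single lookup separates the Cluster-on-Die parts from the SNC case. A small user-space sketch of that table-plus-flag pattern; the model numbers and names below are made up for illustration:

    #include <stdbool.h>
    #include <stdio.h>

    struct cpu_id {
            int model;                      /* -1 acts as a wildcard */
            unsigned long driver_data;      /* 0 = COD quirk, 1 = SNC */
    };

    static const struct cpu_id cod_table[] = {
            { .model = 0x3f, .driver_data = 0 },
            { .model = 0x4f, .driver_data = 0 },
            { .model = -1,   .driver_data = 1 },
            { }                             /* terminator */
    };

    static const struct cpu_id *match_cpu(const struct cpu_id *tbl, int model)
    {
            for (; tbl->model; tbl++)
                    if (tbl->model == -1 || tbl->model == model)
                            return tbl;
            return NULL;
    }

    int main(void)
    {
            const struct cpu_id *id = match_cpu(cod_table, 0x55);
            bool snc = id && id->driver_data;

            printf("topology quirk: %s\n", snc ? "SNC" : "COD or none");
            return 0;
    }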
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 8627fda8d993..15b058eefc4e 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -29,12 +29,6 @@ void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie,
}
}
-/*
- * This function returns an error if it detects any unreliable features of the
- * stack. Otherwise it guarantees that the stack trace is reliable.
- *
- * If the task is not 'current', the caller *must* ensure the task is inactive.
- */
int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry,
void *cookie, struct task_struct *task)
{
diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
index 9442c4136c38..ea028e736831 100644
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -34,7 +34,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, void
break;
case NOP:
- code = ideal_nops[NOP_ATOMIC5];
+ code = x86_nops[5];
break;
case JMP:
@@ -66,7 +66,7 @@ static void __static_call_validate(void *insn, bool tail)
return;
} else {
if (opcode == CALL_INSN_OPCODE ||
- !memcmp(insn, ideal_nops[NOP_ATOMIC5], 5) ||
+ !memcmp(insn, x86_nops[5], 5) ||
!memcmp(insn, xor5rax, 5))
return;
}
diff --git a/arch/x86/kernel/sysfb_efi.c b/arch/x86/kernel/sysfb_efi.c
index 653b7f617b61..8a56a6d80098 100644
--- a/arch/x86/kernel/sysfb_efi.c
+++ b/arch/x86/kernel/sysfb_efi.c
@@ -10,7 +10,7 @@
* EFI Quirks
* Several EFI systems do not correctly advertise their boot framebuffers.
* Hence, we use this static table of known broken machines and fix up the
- * information so framebuffer drivers can load corectly.
+ * information so framebuffer drivers can load correctly.
*/
#include <linux/dmi.h>
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 4c09ba110204..f9af561c3cd4 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -49,6 +49,30 @@ bool tboot_enabled(void)
return tboot != NULL;
}
+/* noinline to prevent gcc from warning about dereferencing constant fixaddr */
+static noinline __init bool check_tboot_version(void)
+{
+ if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) {
+ pr_warn("tboot at 0x%llx is invalid\n", boot_params.tboot_addr);
+ return false;
+ }
+
+ if (tboot->version < 5) {
+ pr_warn("tboot version is invalid: %u\n", tboot->version);
+ return false;
+ }
+
+ pr_info("found shared page at phys addr 0x%llx:\n",
+ boot_params.tboot_addr);
+ pr_debug("version: %d\n", tboot->version);
+ pr_debug("log_addr: 0x%08x\n", tboot->log_addr);
+ pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry);
+ pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base);
+ pr_debug("tboot_size: 0x%x\n", tboot->tboot_size);
+
+ return true;
+}
+
void __init tboot_probe(void)
{
/* Look for valid page-aligned address for shared page. */
@@ -66,25 +90,9 @@ void __init tboot_probe(void)
/* Map and check for tboot UUID. */
set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr);
- tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE);
- if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) {
- pr_warn("tboot at 0x%llx is invalid\n", boot_params.tboot_addr);
+ tboot = (void *)fix_to_virt(FIX_TBOOT_BASE);
+ if (!check_tboot_version())
tboot = NULL;
- return;
- }
- if (tboot->version < 5) {
- pr_warn("tboot version is invalid: %u\n", tboot->version);
- tboot = NULL;
- return;
- }
-
- pr_info("found shared page at phys addr 0x%llx:\n",
- boot_params.tboot_addr);
- pr_debug("version: %d\n", tboot->version);
- pr_debug("log_addr: 0x%08x\n", tboot->log_addr);
- pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry);
- pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base);
- pr_debug("tboot_size: 0x%x\n", tboot->tboot_size);
}
static pgd_t *tboot_pg_dir;
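The tboot change above moves all validation and logging into one bool helper so the caller only has to reset its pointer on failure. A minimal sketch of that structure; the shared-page layout and UUID here are invented for the example:

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    struct shared_page {
            char uuid[8];
            unsigned int version;
    };

    static const char expected_uuid[8] = "TBOOT";
    static struct shared_page *page;

    static bool check_shared_page(void)
    {
            if (memcmp(expected_uuid, page->uuid, sizeof(page->uuid))) {
                    fprintf(stderr, "shared page is invalid\n");
                    return false;
            }
            if (page->version < 5) {
                    fprintf(stderr, "shared page version %u is too old\n",
                            page->version);
                    return false;
            }
            printf("found shared page, version %u\n", page->version);
            return true;
    }

    int main(void)
    {
            static struct shared_page demo = { "TBOOT", 6 };

            page = &demo;
            if (!check_shared_page())
                    page = NULL;            /* caller-side cleanup only */
            return 0;
    }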
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index 64a496a0687f..3c883e064242 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -164,17 +164,11 @@ int do_set_thread_area(struct task_struct *p, int idx,
savesegment(fs, sel);
if (sel == modified_sel)
loadsegment(fs, sel);
-
- savesegment(gs, sel);
- if (sel == modified_sel)
- load_gs_index(sel);
#endif
-#ifdef CONFIG_X86_32_LAZY_GS
savesegment(gs, sel);
if (sel == modified_sel)
- loadsegment(gs, sel);
-#endif
+ load_gs_index(sel);
} else {
#ifdef CONFIG_X86_64
if (p->thread.fsindex == modified_sel)
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index f5477eab5692..bd83748e2bde 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -113,7 +113,7 @@ int arch_register_cpu(int num)
* Two known BSP/CPU0 dependencies: Resume from suspend/hibernate
* depends on BSP. PIC interrupts depend on BSP.
*
- * If the BSP depencies are under control, one can tell kernel to
+ * If the BSP dependencies are under control, one can tell kernel to
* enable BSP hotplug. This basically adds a control file and
* one can attempt to offline BSP.
*/
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 651e3e508959..853ea7a80806 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -395,7 +395,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
/*
* Adjust our frame so that we return straight to the #GP
* vector with the expected RSP value. This is safe because
- * we won't enable interupts or schedule before we invoke
+ * we won't enable interrupts or schedule before we invoke
* general_protection, so nothing will clobber the stack
* frame we just set up.
*
@@ -498,14 +498,15 @@ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs,
{
u8 insn_buf[MAX_INSN_SIZE];
struct insn insn;
+ int ret;
if (copy_from_kernel_nofault(insn_buf, (void *)regs->ip,
MAX_INSN_SIZE))
return GP_NO_HINT;
- kernel_insn_init(&insn, insn_buf, MAX_INSN_SIZE);
- insn_get_modrm(&insn);
- insn_get_sib(&insn);
+ ret = insn_decode_kernel(&insn, insn_buf);
+ if (ret < 0)
+ return GP_NO_HINT;
*addr = (unsigned long)insn_get_addr_ref(&insn, regs);
if (*addr == -1UL)
@@ -889,9 +890,6 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs,
if ((dr6 & DR_STEP) && is_sysenter_singlestep(regs))
dr6 &= ~DR_STEP;
- if (kprobe_debug_handler(regs))
- goto out;
-
/*
* The kernel doesn't use INT1
*/
@@ -978,6 +976,10 @@ static __always_inline void exc_debug_user(struct pt_regs *regs,
goto out_irq;
}
+ /* #DB for bus lock can only be triggered from userspace. */
+ if (dr6 & DR_BUS_LOCK)
+ handle_bus_lock(regs);
+
/* Add the virtual_dr6 bits for signals. */
dr6 |= current->thread.virtual_dr6;
if (dr6 & (DR_STEP | DR_TRAP_BITS) || icebp)
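The get_kernel_gp_address() hunk above stops working on a half-decoded instruction: the new decoder call reports failure, and the function bails out with "no hint" instead. A tiny user-space sketch of that early-out pattern; demo_decode() and the hint values are stand-ins, not kernel interfaces:

    #include <stdio.h>

    enum gp_hint { GP_NO_HINT, GP_CANONICAL };

    /* Pretend decoder: 0 on success, negative on failure. */
    static int demo_decode(const unsigned char *buf, unsigned long *addr)
    {
            if (!buf)
                    return -1;
            *addr = 0x1000;         /* pretend the memory operand lives here */
            return 0;
    }

    static enum gp_hint kernel_gp_hint(const unsigned char *buf,
                                       unsigned long *addr)
    {
            if (demo_decode(buf, addr) < 0)
                    return GP_NO_HINT;      /* don't guess from partial state */
            return GP_CANONICAL;
    }

    int main(void)
    {
            unsigned long addr;

            printf("hint: %d\n",
                   kernel_gp_hint((const unsigned char *)"\x90", &addr));
            return 0;
    }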
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index f70dffc2771f..57ec01192180 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -14,6 +14,7 @@
#include <linux/percpu.h>
#include <linux/timex.h>
#include <linux/static_key.h>
+#include <linux/static_call.h>
#include <asm/hpet.h>
#include <asm/timer.h>
@@ -254,7 +255,7 @@ unsigned long long sched_clock(void)
bool using_native_sched_clock(void)
{
- return pv_ops.time.sched_clock == native_sched_clock;
+ return static_call_query(pv_sched_clock) == native_sched_clock;
}
#else
unsigned long long
@@ -739,7 +740,7 @@ static unsigned long pit_hpet_ptimer_calibrate_cpu(void)
* 2) Reference counter. If available we use the HPET or the
* PMTIMER as a reference to check the sanity of that value.
* We use separate TSC readouts and check inside of the
- * reference read for any possible disturbance. We dicard
+ * reference read for any possible disturbance. We discard
* disturbed values here as well. We do that around the PIT
* calibration delay loop as we have to wait for a certain
* amount of time anyway.
@@ -1079,7 +1080,7 @@ static void tsc_resume(struct clocksource *cs)
* very small window right after one CPU updated cycle_last under
* xtime/vsyscall_gtod lock and the other CPU reads a TSC value which
* is smaller than the cycle_last reference value due to a TSC which
- * is slighty behind. This delta is nowhere else observable, but in
+ * is slightly behind. This delta is nowhere else observable, but in
* that case it results in a forward time jump in the range of hours
* due to the unsigned delta calculation of the time keeping core
* code, which is necessary to support wrapping clocksources like pm
@@ -1264,7 +1265,7 @@ EXPORT_SYMBOL(convert_art_to_tsc);
* corresponding clocksource
* @cycles: System counter value
* @cs: Clocksource corresponding to system counter value. Used
- * by timekeeping code to verify comparibility of two cycle
+ * by timekeeping code to verify comparability of two cycle
* values.
*/
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 3d3c761eb74a..50a4515fe0ad 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -472,7 +472,7 @@ retry:
/*
* Add the result to the previous adjustment value.
*
- * The adjustement value is slightly off by the overhead of the
+ * The adjustment value is slightly off by the overhead of the
* sync mechanism (observed values are ~200 TSC cycles), but this
* really depends on CPU, node distance and frequency. So
* compensating for this is hard to get right. Experiments show
diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c
index f6225bf22c02..8daa70b0d2da 100644
--- a/arch/x86/kernel/umip.c
+++ b/arch/x86/kernel/umip.c
@@ -272,7 +272,7 @@ static int emulate_umip_insn(struct insn *insn, int umip_inst,
* by whether the operand is a register or a memory location.
* If operand is a register, return as many bytes as the operand
* size. If operand is memory, return only the two least
- * siginificant bytes.
+ * significant bytes.
*/
if (X86_MODRM_MOD(insn->modrm.value) == 3)
*data_size = insn->opnd_bytes;
@@ -356,7 +356,7 @@ bool fixup_umip_exception(struct pt_regs *regs)
if (!nr_copied)
return false;
- if (!insn_decode(&insn, regs, buf, nr_copied))
+ if (!insn_decode_from_regs(&insn, regs, buf, nr_copied))
return false;
umip_inst = identify_insn(&insn);
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index a2b413394917..b63cf8f7745e 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -276,12 +276,12 @@ static bool is_prefix_bad(struct insn *insn)
static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool x86_64)
{
+ enum insn_mode m = x86_64 ? INSN_MODE_64 : INSN_MODE_32;
u32 volatile *good_insns;
+ int ret;
- insn_init(insn, auprobe->insn, sizeof(auprobe->insn), x86_64);
- /* has the side-effect of processing the entire instruction */
- insn_get_length(insn);
- if (!insn_complete(insn))
+ ret = insn_decode(insn, auprobe->insn, sizeof(auprobe->insn), m);
+ if (ret < 0)
return -ENOEXEC;
if (is_prefix_bad(insn))
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index a788d5120d4d..f6b93a35ce14 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -84,6 +84,18 @@ config KVM_INTEL
To compile this as a module, choose M here: the module
will be called kvm-intel.
+config X86_SGX_KVM
+ bool "Software Guard eXtensions (SGX) Virtualization"
+ depends on X86_SGX && KVM_INTEL
+ help
+
+ Enables KVM guests to create SGX enclaves.
+
+ This includes support to expose "raw" unreclaimable enclave memory to
+ guests via a device node, e.g. /dev/sgx_vepc.
+
+ If unsure, say N.
+
config KVM_AMD
tristate "KVM for AMD processors support"
depends on KVM
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 6bd2f8b830e4..c02466a1410b 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -1033,7 +1033,7 @@ EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
* - Centaur: 0xc0000000 - 0xcfffffff
*
* The Hypervisor class is further subdivided into sub-classes that each act as
- * their own indepdent class associated with a 0x100 byte range. E.g. if Qemu
+ * their own independent class associated with a 0x100 byte range. E.g. if Qemu
* is advertising support for both HyperV and KVM, the resulting Hypervisor
* CPUID sub-classes are:
*
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index f7970ba6219f..cdd2a2b6550e 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3222,7 +3222,7 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
}
/*
- * Now load segment descriptors. If fault happenes at this stage
+ * Now load segment descriptors. If fault happens at this stage
* it is handled in a context of new task
*/
ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR,
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 8a4de3f12820..d5b72a08e566 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -269,7 +269,7 @@ int kvm_set_routing_entry(struct kvm *kvm,
const struct kvm_irq_routing_entry *ue)
{
/* We can't check irqchip_in_kernel() here as some callers are
- * currently inititalizing the irqchip. Other callers should therefore
+ * currently initializing the irqchip. Other callers should therefore
* check kvm_arch_can_set_irq_routing() before calling this function.
*/
switch (ue->type) {
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 951dae4e7175..62b1729277ef 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4961,7 +4961,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
/*
* No need to care whether allocation memory is successful
- * or not since pte prefetch is skiped if it does not have
+ * or not since pte prefetch is skipped if it does not have
* enough objects in the cache.
*/
mmu_topup_memory_caches(vcpu, true);
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 1f6f98c76bdf..360983865398 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -59,7 +59,7 @@ struct kvm_mmu_page {
#ifdef CONFIG_X86_64
bool tdp_mmu_page;
- /* Used for freeing the page asyncronously if it is a TDP MMU page. */
+ /* Used for freeing the page asynchronously if it is a TDP MMU page. */
struct rcu_head rcu_head;
#endif
};
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 018d82e73e31..34207b874886 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -404,7 +404,7 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
* If this warning were to trigger it would indicate that there was a
* missing MMU notifier or a race with some notifier handler.
* A present, leaf SPTE should never be directly replaced with another
- * present leaf SPTE pointing to a differnt PFN. A notifier handler
+ * present leaf SPTE pointing to a different PFN. A notifier handler
* should be zapping the SPTE before the main MM's page table is
* changed, or the SPTE should be zeroed, and the TLBs flushed by the
* thread before replacement.
@@ -418,7 +418,7 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
/*
* Crash the host to prevent error propagation and guest data
- * courruption.
+ * corruption.
*/
BUG();
}
@@ -529,7 +529,7 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm,
/*
* No other thread can overwrite the removed SPTE as they
* must either wait on the MMU lock or use
- * tdp_mmu_set_spte_atomic which will not overrite the
+ * tdp_mmu_set_spte_atomic which will not overwrite the
* special removed SPTE value. No bookkeeping is needed
* here since the SPTE is going from non-present
* to non-present.
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 7b30bc967af3..67e753edfa22 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -103,7 +103,7 @@ static inline bool kvm_valid_perf_global_ctrl(struct kvm_pmu *pmu,
/* returns general purpose PMC with the specified MSR. Note that it can be
* used for both PERFCTRn and EVNTSELn; that is why it accepts base as a
- * paramenter to tell them apart.
+ * parameter to tell them apart.
*/
static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
u32 base)
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 78bdcfac4e40..3e55674098be 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -727,7 +727,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
struct amd_svm_iommu_ir *ir;
/**
- * In some cases, the existing irte is updaed and re-set,
+ * In some cases, the existing irte is updated and re-set,
* so we need to check here if it's already been * added
* to the ir_list.
*/
@@ -838,7 +838,7 @@ int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
* Here, we setup with legacy mode in the following cases:
* 1. When cannot target interrupt to a specific vcpu.
* 2. Unsetting posted interrupt.
- * 3. APIC virtialization is disabled for the vcpu.
+ * 3. APIC virtualization is disabled for the vcpu.
* 4. IRQ has incompatible delivery mode (SMI, INIT, etc)
*/
if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 874ea309279f..415a49b8b8f8 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -14,6 +14,7 @@
#include <linux/psp-sev.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
+#include <linux/misc_cgroup.h>
#include <linux/processor.h>
#include <linux/trace_events.h>
#include <asm/fpu/internal.h>
@@ -28,6 +29,21 @@
#define __ex(x) __kvm_handle_fault_on_reboot(x)
+#ifndef CONFIG_KVM_AMD_SEV
+/*
+ * When this config is not defined, SEV feature is not supported and APIs in
+ * this file are not used but this file still gets compiled into the KVM AMD
+ * module.
+ *
+ * We will not have MISC_CG_RES_SEV and MISC_CG_RES_SEV_ES entries in the enum
+ * misc_res_type {} defined in linux/misc_cgroup.h.
+ *
+ * Below macros allow compilation to succeed.
+ */
+#define MISC_CG_RES_SEV MISC_CG_RES_TYPES
+#define MISC_CG_RES_SEV_ES MISC_CG_RES_TYPES
+#endif
+
static u8 sev_enc_bit;
static int sev_flush_asids(void);
static DECLARE_RWSEM(sev_deactivate_lock);
@@ -89,8 +105,19 @@ static bool __sev_recycle_asids(int min_asid, int max_asid)
static int sev_asid_new(struct kvm_sev_info *sev)
{
- int pos, min_asid, max_asid;
+ int pos, min_asid, max_asid, ret;
bool retry = true;
+ enum misc_res_type type;
+
+ type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
+ WARN_ON(sev->misc_cg);
+ sev->misc_cg = get_current_misc_cg();
+ ret = misc_cg_try_charge(type, sev->misc_cg, 1);
+ if (ret) {
+ put_misc_cg(sev->misc_cg);
+ sev->misc_cg = NULL;
+ return ret;
+ }
mutex_lock(&sev_bitmap_lock);
@@ -108,7 +135,8 @@ again:
goto again;
}
mutex_unlock(&sev_bitmap_lock);
- return -EBUSY;
+ ret = -EBUSY;
+ goto e_uncharge;
}
__set_bit(pos, sev_asid_bitmap);
@@ -116,6 +144,11 @@ again:
mutex_unlock(&sev_bitmap_lock);
return pos + 1;
+e_uncharge:
+ misc_cg_uncharge(type, sev->misc_cg, 1);
+ put_misc_cg(sev->misc_cg);
+ sev->misc_cg = NULL;
+ return ret;
}
static int sev_get_asid(struct kvm *kvm)
@@ -125,14 +158,15 @@ static int sev_get_asid(struct kvm *kvm)
return sev->asid;
}
-static void sev_asid_free(int asid)
+static void sev_asid_free(struct kvm_sev_info *sev)
{
struct svm_cpu_data *sd;
int cpu, pos;
+ enum misc_res_type type;
mutex_lock(&sev_bitmap_lock);
- pos = asid - 1;
+ pos = sev->asid - 1;
__set_bit(pos, sev_reclaim_asid_bitmap);
for_each_possible_cpu(cpu) {
@@ -141,6 +175,11 @@ static void sev_asid_free(int asid)
}
mutex_unlock(&sev_bitmap_lock);
+
+ type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
+ misc_cg_uncharge(type, sev->misc_cg, 1);
+ put_misc_cg(sev->misc_cg);
+ sev->misc_cg = NULL;
}
static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
@@ -188,19 +227,20 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
asid = sev_asid_new(sev);
if (asid < 0)
return ret;
+ sev->asid = asid;
ret = sev_platform_init(&argp->error);
if (ret)
goto e_free;
sev->active = true;
- sev->asid = asid;
INIT_LIST_HEAD(&sev->regions_list);
return 0;
e_free:
- sev_asid_free(asid);
+ sev_asid_free(sev);
+ sev->asid = 0;
return ret;
}
@@ -1315,12 +1355,12 @@ void sev_vm_destroy(struct kvm *kvm)
mutex_unlock(&kvm->lock);
sev_unbind_asid(kvm, sev->handle);
- sev_asid_free(sev->asid);
+ sev_asid_free(sev);
}
void __init sev_hardware_setup(void)
{
- unsigned int eax, ebx, ecx, edx;
+ unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count;
bool sev_es_supported = false;
bool sev_supported = false;
@@ -1352,7 +1392,11 @@ void __init sev_hardware_setup(void)
if (!sev_reclaim_asid_bitmap)
goto out;
- pr_info("SEV supported: %u ASIDs\n", max_sev_asid - min_sev_asid + 1);
+ sev_asid_count = max_sev_asid - min_sev_asid + 1;
+ if (misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count))
+ goto out;
+
+ pr_info("SEV supported: %u ASIDs\n", sev_asid_count);
sev_supported = true;
/* SEV-ES support requested? */
@@ -1367,7 +1411,11 @@ void __init sev_hardware_setup(void)
if (min_sev_asid == 1)
goto out;
- pr_info("SEV-ES supported: %u ASIDs\n", min_sev_asid - 1);
+ sev_es_asid_count = min_sev_asid - 1;
+ if (misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count))
+ goto out;
+
+ pr_info("SEV-ES supported: %u ASIDs\n", sev_es_asid_count);
sev_es_supported = true;
out:
@@ -1382,6 +1430,8 @@ void sev_hardware_teardown(void)
bitmap_free(sev_asid_bitmap);
bitmap_free(sev_reclaim_asid_bitmap);
+ misc_cg_set_capacity(MISC_CG_RES_SEV, 0);
+ misc_cg_set_capacity(MISC_CG_RES_SEV_ES, 0);
sev_flush_asids();
}
@@ -2082,7 +2132,7 @@ void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu)
hostsa = (struct vmcb_save_area *)(page_address(sd->save_area) + 0x400);
hostsa->xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
- /* PKRU is restored on VMEXIT, save the curent host value */
+ /* PKRU is restored on VMEXIT, save the current host value */
hostsa->pkru = read_pkru();
/* MSR_IA32_XSS is restored on VMEXIT, save the currnet host value */
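The SEV changes above charge the misc cgroup before an ASID is handed out and uncharge it on every failure path as well as on free. A compact user-space sketch of that charge/rollback discipline; the capacity, counters and helpers below are illustrative, not the misc cgroup API:

    #include <stdio.h>

    static unsigned int capacity = 2, charged;

    static int try_charge(void)
    {
            if (charged >= capacity)
                    return -1;              /* over the configured limit */
            charged++;
            return 0;
    }

    static void uncharge(void)
    {
            charged--;
    }

    static int asid_new(int *next_asid)
    {
            int asid;

            if (try_charge())
                    return -1;
            asid = (*next_asid)++;
            if (asid > 8) {                 /* pretend the ASID pool ran dry */
                    uncharge();             /* roll the charge back */
                    return -1;
            }
            return asid;
    }

    static void asid_free(void)
    {
            uncharge();                     /* freeing always returns the charge */
    }

    int main(void)
    {
            int next = 1;
            int asid = asid_new(&next);

            printf("got asid %d, charged %u\n", asid, charged);
            if (asid > 0)
                    asid_free();
            return 0;
    }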
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 58a45bb139f8..6dad89248312 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4400,7 +4400,7 @@ static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int i
*
* This happens because CPU microcode reading instruction bytes
* uses a special opcode which attempts to read data using CPL=0
- * priviledges. The microcode reads CS:RIP and if it hits a SMAP
+ * privileges. The microcode reads CS:RIP and if it hits a SMAP
* fault, it gives up and returns no instruction bytes.
*
* Detection:
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 39e071fdab0c..9806aaebc37f 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -65,6 +65,7 @@ struct kvm_sev_info {
unsigned long pages_locked; /* Number of pages locked */
struct list_head regions_list; /* List of registered regions */
u64 ap_jump_table; /* SEV-ES AP Jump Table address */
+ struct misc_cg *misc_cg; /* For misc cgroup accounting */
};
struct kvm_svm {
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index bcca0b80e0d0..1e069aac7410 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3537,7 +3537,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
* snapshot restore (migration).
*
* In this flow, it is assumed that vmcs12 cache was
- * trasferred as part of captured nVMX state and should
+ * transferred as part of captured nVMX state and should
* therefore not be read from guest memory (which may not
* exist on destination host yet).
*/
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 4831bc44ce66..459748680daf 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -10,7 +10,7 @@
#include "vmx.h"
/*
- * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we
+ * We maintain a per-CPU linked-list of vCPU, so in wakeup_handler() we
* can find which vCPU should be waken up.
*/
static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 32cf8287d4a7..bcbf0d2139e9 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1529,7 +1529,7 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
/*
* MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that
- * utilize encodings marked reserved will casue a #GP fault.
+ * utilize encodings marked reserved will cause a #GP fault.
*/
value = intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc_periods);
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_mtc) &&
@@ -2761,7 +2761,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
/*
- * Update real mode segment cache. It may be not up-to-date if sement
+ * Update real mode segment cache. It may be not up-to-date if segment
* register was written while vcpu was in a guest mode.
*/
vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
@@ -6027,19 +6027,19 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
exit_reason.basic != EXIT_REASON_PML_FULL &&
exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
exit_reason.basic != EXIT_REASON_TASK_SWITCH)) {
+ int ndata = 3;
+
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
- vcpu->run->internal.ndata = 3;
vcpu->run->internal.data[0] = vectoring_info;
vcpu->run->internal.data[1] = exit_reason.full;
vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) {
- vcpu->run->internal.ndata++;
- vcpu->run->internal.data[3] =
+ vcpu->run->internal.data[ndata++] =
vmcs_read64(GUEST_PHYSICAL_ADDRESS);
}
- vcpu->run->internal.data[vcpu->run->internal.ndata++] =
- vcpu->arch.last_vmentry_cpu;
+ vcpu->run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu;
+ vcpu->run->internal.ndata = ndata;
return 0;
}
@@ -7252,7 +7252,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output))
vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;
- /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabircEn can be set */
+ /* If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set */
if (intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_output_subsys))
vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;
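The __vmx_handle_exit() hunk above builds the internal-error payload with a local index and publishes the element count once at the end, rather than bumping the shared ndata field as it goes. A small sketch of that pattern with an invented report structure:

    #include <stdio.h>

    struct report {
            unsigned long data[8];
            int ndata;
    };

    static void fill_report(struct report *r, int have_gpa)
    {
            int ndata = 0;

            r->data[ndata++] = 0x11;        /* vectoring info */
            r->data[ndata++] = 0x22;        /* exit reason */
            r->data[ndata++] = 0x33;        /* exit qualification */
            if (have_gpa)
                    r->data[ndata++] = 0x44;        /* guest physical address */
            r->data[ndata++] = 7;           /* last vm-entry cpu */

            r->ndata = ndata;               /* single, final store */
    }

    int main(void)
    {
            struct report r;

            fill_report(&r, 1);
            printf("ndata = %d\n", r.ndata);
            return 0;
    }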
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index eca63625aee4..efc7a82ab140 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -156,9 +156,9 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
/*
* lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables
- * adaptive tuning starting from default advancment of 1000ns. '0' disables
+ * adaptive tuning starting from default advancement of 1000ns. '0' disables
* advancement entirely. Any other value is used as-is and disables adaptive
- * tuning, i.e. allows priveleged userspace to set an exact advancement time.
+ * tuning, i.e. allows privileged userspace to set an exact advancement time.
*/
static int __read_mostly lapic_timer_advance_ns = -1;
module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
@@ -1287,7 +1287,7 @@ static const u32 emulated_msrs_all[] = {
MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
MSR_IA32_TSC_ADJUST,
- MSR_IA32_TSCDEADLINE,
+ MSR_IA32_TSC_DEADLINE,
MSR_IA32_ARCH_CAPABILITIES,
MSR_IA32_PERF_CAPABILITIES,
MSR_IA32_MISC_ENABLE,
@@ -1372,7 +1372,7 @@ static u64 kvm_get_arch_capabilities(void)
/*
* If nx_huge_pages is enabled, KVM's shadow paging will ensure that
* the nested hypervisor runs with NX huge pages. If it is not,
- * L1 is anyway vulnerable to ITLB_MULTIHIT explots from other
+ * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other
* L1 guests, so it need not worry about its own (L2) guests.
*/
data |= ARCH_CAP_PSCHANGE_MC_NO;
@@ -1849,7 +1849,7 @@ fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
ret = EXIT_FASTPATH_EXIT_HANDLED;
}
break;
- case MSR_IA32_TSCDEADLINE:
+ case MSR_IA32_TSC_DEADLINE:
data = kvm_read_edx_eax(vcpu);
if (!handle_fastpath_set_tscdeadline(vcpu, data)) {
kvm_skip_emulated_instruction(vcpu);
@@ -3087,7 +3087,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return kvm_set_apic_base(vcpu, msr_info);
case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
return kvm_x2apic_msr_write(vcpu, msr, data);
- case MSR_IA32_TSCDEADLINE:
+ case MSR_IA32_TSC_DEADLINE:
kvm_set_lapic_tscdeadline_msr(vcpu, data);
break;
case MSR_IA32_TSC_ADJUST:
@@ -3449,7 +3449,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
- case MSR_IA32_TSCDEADLINE:
+ case MSR_IA32_TSC_DEADLINE:
msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
break;
case MSR_IA32_TSC_ADJUST:
@@ -4025,7 +4025,6 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
{
struct kvm_host_map map;
struct kvm_steal_time *st;
- int idx;
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
@@ -4033,15 +4032,9 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
if (vcpu->arch.st.preempted)
return;
- /*
- * Take the srcu lock as memslots will be accessed to check the gfn
- * cache generation against the memslots generation.
- */
- idx = srcu_read_lock(&vcpu->kvm->srcu);
-
if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
&vcpu->arch.st.cache, true))
- goto out;
+ return;
st = map.hva +
offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
@@ -4049,20 +4042,25 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
-
-out:
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
+ int idx;
+
if (vcpu->preempted && !vcpu->arch.guest_state_protected)
vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
+ /*
+ * Take the srcu lock as memslots will be accessed to check the gfn
+ * cache generation against the memslots generation.
+ */
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
if (kvm_xen_msr_enabled(vcpu->kvm))
kvm_xen_runstate_set_preempted(vcpu);
else
kvm_steal_time_set_preempted(vcpu);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
static_call(kvm_x86_vcpu_put)(vcpu);
vcpu->arch.last_host_tsc = rdtsc();
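The kvm_arch_vcpu_put() change above hoists the srcu read-side critical section into the caller so that both the Xen and the steal-time paths run under it. A user-space sketch of the same lock-scope move, using a pthread rwlock purely as a stand-in for SRCU:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_rwlock_t slots_lock = PTHREAD_RWLOCK_INITIALIZER;
    static int use_alternate_path;

    static void set_preempted_primary(void)
    {
            puts("primary path, under the read lock");
    }

    static void set_preempted_alternate(void)
    {
            puts("alternate path, under the read lock");
    }

    static void vcpu_put(void)
    {
            /* One lock scope now covers both callees. */
            pthread_rwlock_rdlock(&slots_lock);
            if (use_alternate_path)
                    set_preempted_alternate();
            else
                    set_preempted_primary();
            pthread_rwlock_unlock(&slots_lock);
    }

    int main(void)
    {
            vcpu_put();             /* build with -pthread */
            return 0;
    }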
diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S
index 3b6544111ac9..16bc9130e7a5 100644
--- a/arch/x86/lib/atomic64_386_32.S
+++ b/arch/x86/lib/atomic64_386_32.S
@@ -6,7 +6,7 @@
*/
#include <linux/linkage.h>
-#include <asm/alternative-asm.h>
+#include <asm/alternative.h>
/* if you want SMP support, implement these with real spinlocks */
.macro LOCK reg
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
index 1c5c81c16b06..ce6935690766 100644
--- a/arch/x86/lib/atomic64_cx8_32.S
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -6,7 +6,7 @@
*/
#include <linux/linkage.h>
-#include <asm/alternative-asm.h>
+#include <asm/alternative.h>
.macro read64 reg
movl %ebx, %eax
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 2402d4c489d2..db4b4f9197c7 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -3,7 +3,7 @@
#include <linux/linkage.h>
#include <asm/cpufeatures.h>
-#include <asm/alternative-asm.h>
+#include <asm/alternative.h>
#include <asm/export.h>
/*
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 77b9b2a3b5c8..57b79c577496 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -11,7 +11,7 @@
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/cpufeatures.h>
-#include <asm/alternative-asm.h>
+#include <asm/alternative.h>
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/export.h>
diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c
index 12539fca75c4..b0f3b2a62ae2 100644
--- a/arch/x86/lib/inat.c
+++ b/arch/x86/lib/inat.c
@@ -4,7 +4,7 @@
*
* Written by Masami Hiramatsu <mhiramat@redhat.com>
*/
-#include <asm/insn.h>
+#include <asm/insn.h> /* __ignore_sync_check__ */
/* Attribute tables are generated from opcode map */
#include "inat-tables.c"
diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c
index bb0b3fe1e0a0..a67afd74232c 100644
--- a/arch/x86/lib/insn-eval.c
+++ b/arch/x86/lib/insn-eval.c
@@ -232,7 +232,7 @@ static int resolve_default_seg(struct insn *insn, struct pt_regs *regs, int off)
* resolve_seg_reg() - obtain segment register index
* @insn: Instruction with operands
* @regs: Register values as seen when entering kernel mode
- * @regoff: Operand offset, in pt_regs, used to deterimine segment register
+ * @regoff: Operand offset, in pt_regs, used to determine segment register
*
* Determine the segment register associated with the operands and, if
* applicable, prefixes and the instruction pointed by @insn.
@@ -404,10 +404,6 @@ static short get_segment_selector(struct pt_regs *regs, int seg_reg_idx)
case INAT_SEG_REG_FS:
return (unsigned short)(regs->fs & 0xffff);
case INAT_SEG_REG_GS:
- /*
- * GS may or may not be in regs as per CONFIG_X86_32_LAZY_GS.
- * The macro below takes care of both cases.
- */
return get_user_gs(regs);
case INAT_SEG_REG_IGNORE:
default:
@@ -517,7 +513,7 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs,
* @insn: Instruction containing ModRM byte
* @regs: Register values as seen when entering kernel mode
* @offs1: Offset of the first operand register
- * @offs2: Offset of the second opeand register, if applicable
+ * @offs2: Offset of the second operand register, if applicable
*
* Obtain the offset, in pt_regs, of the registers indicated by the ModRM byte
* in @insn. This function is to be used with 16-bit address encodings. The
@@ -576,7 +572,7 @@ static int get_reg_offset_16(struct insn *insn, struct pt_regs *regs,
* If ModRM.mod is 0 and ModRM.rm is 110b, then we use displacement-
* only addressing. This means that no registers are involved in
* computing the effective address. Thus, ensure that the first
- * register offset is invalild. The second register offset is already
+ * register offset is invalid. The second register offset is already
* invalid under the aforementioned conditions.
*/
if ((X86_MODRM_MOD(insn->modrm.value) == 0) &&
@@ -928,10 +924,11 @@ static int get_seg_base_limit(struct insn *insn, struct pt_regs *regs,
static int get_eff_addr_reg(struct insn *insn, struct pt_regs *regs,
int *regoff, long *eff_addr)
{
- insn_get_modrm(insn);
+ int ret;
- if (!insn->modrm.nbytes)
- return -EINVAL;
+ ret = insn_get_modrm(insn);
+ if (ret)
+ return ret;
if (X86_MODRM_MOD(insn->modrm.value) != 3)
return -EINVAL;
@@ -977,14 +974,14 @@ static int get_eff_addr_modrm(struct insn *insn, struct pt_regs *regs,
int *regoff, long *eff_addr)
{
long tmp;
+ int ret;
if (insn->addr_bytes != 8 && insn->addr_bytes != 4)
return -EINVAL;
- insn_get_modrm(insn);
-
- if (!insn->modrm.nbytes)
- return -EINVAL;
+ ret = insn_get_modrm(insn);
+ if (ret)
+ return ret;
if (X86_MODRM_MOD(insn->modrm.value) > 2)
return -EINVAL;
@@ -1106,18 +1103,21 @@ static int get_eff_addr_modrm_16(struct insn *insn, struct pt_regs *regs,
* @base_offset will have a register, as an offset from the base of pt_regs,
* that can be used to resolve the associated segment.
*
- * -EINVAL on error.
+ * Negative value on error.
*/
static int get_eff_addr_sib(struct insn *insn, struct pt_regs *regs,
int *base_offset, long *eff_addr)
{
long base, indx;
int indx_offset;
+ int ret;
if (insn->addr_bytes != 8 && insn->addr_bytes != 4)
return -EINVAL;
- insn_get_modrm(insn);
+ ret = insn_get_modrm(insn);
+ if (ret)
+ return ret;
if (!insn->modrm.nbytes)
return -EINVAL;
@@ -1125,7 +1125,9 @@ static int get_eff_addr_sib(struct insn *insn, struct pt_regs *regs,
if (X86_MODRM_MOD(insn->modrm.value) > 2)
return -EINVAL;
- insn_get_sib(insn);
+ ret = insn_get_sib(insn);
+ if (ret)
+ return ret;
if (!insn->sib.nbytes)
return -EINVAL;
@@ -1194,8 +1196,8 @@ static void __user *get_addr_ref_16(struct insn *insn, struct pt_regs *regs)
short eff_addr;
long tmp;
- insn_get_modrm(insn);
- insn_get_displacement(insn);
+ if (insn_get_displacement(insn))
+ goto out;
if (insn->addr_bytes != 2)
goto out;
@@ -1492,7 +1494,7 @@ int insn_fetch_from_user_inatomic(struct pt_regs *regs, unsigned char buf[MAX_IN
}
/**
- * insn_decode() - Decode an instruction
+ * insn_decode_from_regs() - Decode an instruction
* @insn: Structure to store decoded instruction
* @regs: Structure with register values as seen when entering kernel mode
* @buf: Buffer containing the instruction bytes
@@ -1505,8 +1507,8 @@ int insn_fetch_from_user_inatomic(struct pt_regs *regs, unsigned char buf[MAX_IN
*
* True if instruction was decoded, False otherwise.
*/
-bool insn_decode(struct insn *insn, struct pt_regs *regs,
- unsigned char buf[MAX_INSN_SIZE], int buf_size)
+bool insn_decode_from_regs(struct insn *insn, struct pt_regs *regs,
+ unsigned char buf[MAX_INSN_SIZE], int buf_size)
{
int seg_defs;
@@ -1529,7 +1531,9 @@ bool insn_decode(struct insn *insn, struct pt_regs *regs,
insn->addr_bytes = INSN_CODE_SEG_ADDR_SZ(seg_defs);
insn->opnd_bytes = INSN_CODE_SEG_OPND_SZ(seg_defs);
- insn_get_length(insn);
+ if (insn_get_length(insn))
+ return false;
+
if (buf_size < insn->length)
return false;
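The insn-eval.c conversion above assumes the instruction collectors now return 0 or a negative error, and every caller checks the result before touching the collected field. A self-contained sketch of that propagation style; demo_insn and its helpers are invented for illustration:

    #include <errno.h>
    #include <stdio.h>

    struct demo_insn {
            int modrm, sib;
            int have_modrm, have_sib;
    };

    static int get_modrm(struct demo_insn *in, const unsigned char *buf)
    {
            if (!buf)
                    return -ENODATA;
            in->modrm = buf[0];
            in->have_modrm = 1;
            return 0;
    }

    static int get_sib(struct demo_insn *in, const unsigned char *buf)
    {
            int ret;

            if (!in->have_modrm) {
                    ret = get_modrm(in, buf);
                    if (ret)
                            return ret;     /* propagate, don't guess */
            }
            in->sib = buf[1];
            in->have_sib = 1;
            return 0;
    }

    int main(void)
    {
            struct demo_insn in = { 0 };
            const unsigned char buf[2] = { 0x04, 0x25 };

            printf("sib collect: %d\n", get_sib(&in, buf));
            return 0;
    }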
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
index 435630a6ec97..058f19b20465 100644
--- a/arch/x86/lib/insn.c
+++ b/arch/x86/lib/insn.c
@@ -11,10 +11,13 @@
#else
#include <string.h>
#endif
-#include <asm/inat.h>
-#include <asm/insn.h>
+#include <asm/inat.h> /* __ignore_sync_check__ */
+#include <asm/insn.h> /* __ignore_sync_check__ */
-#include <asm/emulate_prefix.h>
+#include <linux/errno.h>
+#include <linux/kconfig.h>
+
+#include <asm/emulate_prefix.h> /* __ignore_sync_check__ */
#define leXX_to_cpu(t, r) \
({ \
@@ -51,6 +54,7 @@
* insn_init() - initialize struct insn
* @insn: &struct insn to be initialized
* @kaddr: address (in kernel memory) of instruction (or copy thereof)
+ * @buf_len: length of the insn buffer at @kaddr
* @x86_64: !0 for 64-bit kernel or 64-bit app
*/
void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64)
@@ -111,8 +115,12 @@ static void insn_get_emulate_prefix(struct insn *insn)
* Populates the @insn->prefixes bitmap, and updates @insn->next_byte
* to point to the (first) opcode. No effect if @insn->prefixes.got
* is already set.
+ *
+ * Returns:
+ * 0: on success
+ * < 0: on error
*/
-void insn_get_prefixes(struct insn *insn)
+int insn_get_prefixes(struct insn *insn)
{
struct insn_field *prefixes = &insn->prefixes;
insn_attr_t attr;
@@ -120,7 +128,7 @@ void insn_get_prefixes(struct insn *insn)
int i, nb;
if (prefixes->got)
- return;
+ return 0;
insn_get_emulate_prefix(insn);
@@ -230,8 +238,10 @@ vex_end:
prefixes->got = 1;
+ return 0;
+
err_out:
- return;
+ return -ENODATA;
}
/**
@@ -243,16 +253,25 @@ err_out:
* If necessary, first collects any preceding (prefix) bytes.
* Sets @insn->opcode.value = opcode1. No effect if @insn->opcode.got
* is already 1.
+ *
+ * Returns:
+ * 0: on success
+ * < 0: on error
*/
-void insn_get_opcode(struct insn *insn)
+int insn_get_opcode(struct insn *insn)
{
struct insn_field *opcode = &insn->opcode;
+ int pfx_id, ret;
insn_byte_t op;
- int pfx_id;
+
if (opcode->got)
- return;
- if (!insn->prefixes.got)
- insn_get_prefixes(insn);
+ return 0;
+
+ if (!insn->prefixes.got) {
+ ret = insn_get_prefixes(insn);
+ if (ret)
+ return ret;
+ }
/* Get first opcode */
op = get_next(insn_byte_t, insn);
@@ -267,9 +286,13 @@ void insn_get_opcode(struct insn *insn)
insn->attr = inat_get_avx_attribute(op, m, p);
if ((inat_must_evex(insn->attr) && !insn_is_evex(insn)) ||
(!inat_accept_vex(insn->attr) &&
- !inat_is_group(insn->attr)))
- insn->attr = 0; /* This instruction is bad */
- goto end; /* VEX has only 1 byte for opcode */
+ !inat_is_group(insn->attr))) {
+ /* This instruction is bad */
+ insn->attr = 0;
+ return -EINVAL;
+ }
+ /* VEX has only 1 byte for opcode */
+ goto end;
}
insn->attr = inat_get_opcode_attribute(op);
@@ -280,13 +303,18 @@ void insn_get_opcode(struct insn *insn)
pfx_id = insn_last_prefix_id(insn);
insn->attr = inat_get_escape_attribute(op, pfx_id, insn->attr);
}
- if (inat_must_vex(insn->attr))
- insn->attr = 0; /* This instruction is bad */
+
+ if (inat_must_vex(insn->attr)) {
+ /* This instruction is bad */
+ insn->attr = 0;
+ return -EINVAL;
+ }
end:
opcode->got = 1;
+ return 0;
err_out:
- return;
+ return -ENODATA;
}
/**
@@ -296,15 +324,25 @@ err_out:
* Populates @insn->modrm and updates @insn->next_byte to point past the
* ModRM byte, if any. If necessary, first collects the preceding bytes
* (prefixes and opcode(s)). No effect if @insn->modrm.got is already 1.
+ *
+ * Returns:
+ * 0: on success
+ * < 0: on error
*/
-void insn_get_modrm(struct insn *insn)
+int insn_get_modrm(struct insn *insn)
{
struct insn_field *modrm = &insn->modrm;
insn_byte_t pfx_id, mod;
+ int ret;
+
if (modrm->got)
- return;
- if (!insn->opcode.got)
- insn_get_opcode(insn);
+ return 0;
+
+ if (!insn->opcode.got) {
+ ret = insn_get_opcode(insn);
+ if (ret)
+ return ret;
+ }
if (inat_has_modrm(insn->attr)) {
mod = get_next(insn_byte_t, insn);
@@ -313,17 +351,22 @@ void insn_get_modrm(struct insn *insn)
pfx_id = insn_last_prefix_id(insn);
insn->attr = inat_get_group_attribute(mod, pfx_id,
insn->attr);
- if (insn_is_avx(insn) && !inat_accept_vex(insn->attr))
- insn->attr = 0; /* This is bad */
+ if (insn_is_avx(insn) && !inat_accept_vex(insn->attr)) {
+ /* Bad insn */
+ insn->attr = 0;
+ return -EINVAL;
+ }
}
}
if (insn->x86_64 && inat_is_force64(insn->attr))
insn->opnd_bytes = 8;
+
modrm->got = 1;
+ return 0;
err_out:
- return;
+ return -ENODATA;
}
@@ -337,11 +380,16 @@ err_out:
int insn_rip_relative(struct insn *insn)
{
struct insn_field *modrm = &insn->modrm;
+ int ret;
if (!insn->x86_64)
return 0;
- if (!modrm->got)
- insn_get_modrm(insn);
+
+ if (!modrm->got) {
+ ret = insn_get_modrm(insn);
+ if (ret)
+ return 0;
+ }
/*
* For rip-relative instructions, the mod field (top 2 bits)
* is zero and the r/m field (bottom 3 bits) is 0x5.
@@ -355,15 +403,25 @@ int insn_rip_relative(struct insn *insn)
*
* If necessary, first collects the instruction up to and including the
* ModRM byte.
+ *
+ * Returns:
+ * 0: if decoding succeeded
+ * < 0: otherwise.
*/
-void insn_get_sib(struct insn *insn)
+int insn_get_sib(struct insn *insn)
{
insn_byte_t modrm;
+ int ret;
if (insn->sib.got)
- return;
- if (!insn->modrm.got)
- insn_get_modrm(insn);
+ return 0;
+
+ if (!insn->modrm.got) {
+ ret = insn_get_modrm(insn);
+ if (ret)
+ return ret;
+ }
+
if (insn->modrm.nbytes) {
modrm = insn->modrm.bytes[0];
if (insn->addr_bytes != 2 &&
@@ -374,8 +432,10 @@ void insn_get_sib(struct insn *insn)
}
insn->sib.got = 1;
+ return 0;
+
err_out:
- return;
+ return -ENODATA;
}
@@ -386,15 +446,25 @@ err_out:
* If necessary, first collects the instruction up to and including the
* SIB byte.
* Displacement value is sign-expanded.
+ *
+ * Returns:
+ * 0: if decoding succeeded
+ * < 0: otherwise.
*/
-void insn_get_displacement(struct insn *insn)
+int insn_get_displacement(struct insn *insn)
{
insn_byte_t mod, rm, base;
+ int ret;
if (insn->displacement.got)
- return;
- if (!insn->sib.got)
- insn_get_sib(insn);
+ return 0;
+
+ if (!insn->sib.got) {
+ ret = insn_get_sib(insn);
+ if (ret)
+ return ret;
+ }
+
if (insn->modrm.nbytes) {
/*
* Interpreting the modrm byte:
@@ -436,9 +506,10 @@ void insn_get_displacement(struct insn *insn)
}
out:
insn->displacement.got = 1;
+ return 0;
err_out:
- return;
+ return -ENODATA;
}
/* Decode moffset16/32/64. Return 0 if failed */
@@ -537,20 +608,30 @@ err_out:
}
/**
- * insn_get_immediate() - Get the immediates of instruction
+ * insn_get_immediate() - Get the immediate in an instruction
* @insn: &struct insn containing instruction
*
* If necessary, first collects the instruction up to and including the
* displacement bytes.
* Basically, most of immediates are sign-expanded. Unsigned-value can be
- * get by bit masking with ((1 << (nbytes * 8)) - 1)
+ * computed by bit masking with ((1 << (nbytes * 8)) - 1)
+ *
+ * Returns:
+ * 0: on success
+ * < 0: on error
*/
-void insn_get_immediate(struct insn *insn)
+int insn_get_immediate(struct insn *insn)
{
+ int ret;
+
if (insn->immediate.got)
- return;
- if (!insn->displacement.got)
- insn_get_displacement(insn);
+ return 0;
+
+ if (!insn->displacement.got) {
+ ret = insn_get_displacement(insn);
+ if (ret)
+ return ret;
+ }
if (inat_has_moffset(insn->attr)) {
if (!__get_moffset(insn))
@@ -597,9 +678,10 @@ void insn_get_immediate(struct insn *insn)
}
done:
insn->immediate.got = 1;
+ return 0;
err_out:
- return;
+ return -ENODATA;
}
/**
@@ -608,13 +690,65 @@ err_out:
*
* If necessary, first collects the instruction up to and including the
* immediates bytes.
- */
-void insn_get_length(struct insn *insn)
+ *
+ * Returns:
+ * - 0 on success
+ * - < 0 on error
+ */
+int insn_get_length(struct insn *insn)
{
+ int ret;
+
if (insn->length)
- return;
- if (!insn->immediate.got)
- insn_get_immediate(insn);
+ return 0;
+
+ if (!insn->immediate.got) {
+ ret = insn_get_immediate(insn);
+ if (ret)
+ return ret;
+ }
+
insn->length = (unsigned char)((unsigned long)insn->next_byte
- (unsigned long)insn->kaddr);
+
+ return 0;
+}
+
+/* Ensure this instruction is decoded completely */
+static inline int insn_complete(struct insn *insn)
+{
+ return insn->opcode.got && insn->modrm.got && insn->sib.got &&
+ insn->displacement.got && insn->immediate.got;
+}
+
+/**
+ * insn_decode() - Decode an x86 instruction
+ * @insn: &struct insn to be initialized
+ * @kaddr: address (in kernel memory) of instruction (or copy thereof)
+ * @buf_len: length of the insn buffer at @kaddr
+ * @m: insn mode, see enum insn_mode
+ *
+ * Returns:
+ * 0: if decoding succeeded
+ * < 0: otherwise.
+ */
+int insn_decode(struct insn *insn, const void *kaddr, int buf_len, enum insn_mode m)
+{
+ int ret;
+
+/* #define INSN_MODE_KERN -1 __ignore_sync_check__ mode is only valid in the kernel */
+
+ if (m == INSN_MODE_KERN)
+ insn_init(insn, kaddr, buf_len, IS_ENABLED(CONFIG_X86_64));
+ else
+ insn_init(insn, kaddr, buf_len, m == INSN_MODE_64);
+
+ ret = insn_get_length(insn);
+ if (ret)
+ return ret;
+
+ if (insn_complete(insn))
+ return 0;
+
+ return -EINVAL;
}
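
For orientation, here is a minimal caller sketch of the insn_decode() API added above, assuming the in-kernel <asm/insn.h> declarations from this patch (struct insn, enum insn_mode); the example_decode() wrapper and its buffer are made up for illustration.

#include <asm/insn.h>

/* Hypothetical helper: decode one instruction and report its length. */
static int example_decode(const u8 *buf, int buf_len)
{
	struct insn insn;
	int ret;

	/* INSN_MODE_64 forces 64-bit decoding; INSN_MODE_KERN follows the kernel's bitness. */
	ret = insn_decode(&insn, buf, buf_len, INSN_MODE_64);
	if (ret < 0)
		return ret;	/* truncated buffer or malformed instruction */

	/* On success, opcode, ModRM, SIB, displacement and immediate are all populated. */
	return insn.length;
}
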
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 1e299ac73c86..1cc9da6e29c7 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -4,7 +4,7 @@
#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
-#include <asm/alternative-asm.h>
+#include <asm/alternative.h>
#include <asm/export.h>
.pushsection .noinstr.text, "ax"
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 41902fe8b859..64801010d312 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -8,7 +8,7 @@
*/
#include <linux/linkage.h>
#include <asm/cpufeatures.h>
-#include <asm/alternative-asm.h>
+#include <asm/alternative.h>
#include <asm/export.h>
#undef memmove
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 0bfd26e4ca9e..9827ae267f96 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -3,7 +3,7 @@
#include <linux/linkage.h>
#include <asm/cpufeatures.h>
-#include <asm/alternative-asm.h>
+#include <asm/alternative.h>
#include <asm/export.h>
/*
diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c
index 419365c48b2a..cc5f4ea943d3 100644
--- a/arch/x86/lib/mmx_32.c
+++ b/arch/x86/lib/mmx_32.c
@@ -14,7 +14,7 @@
* tested so far for any MMX solution figured.
*
* 22/09/2000 - Arjan van de Ven
- * Improved for non-egineering-sample Athlons
+ * Improved for non-engineering-sample Athlons
*
*/
#include <linux/hardirq.h>
diff --git a/arch/x86/lib/msr-smp.c b/arch/x86/lib/msr-smp.c
index 75a0915b0d01..40bbe56bde32 100644
--- a/arch/x86/lib/msr-smp.c
+++ b/arch/x86/lib/msr-smp.c
@@ -252,7 +252,7 @@ static void __wrmsr_safe_regs_on_cpu(void *info)
rv->err = wrmsr_safe_regs(rv->regs);
}
-int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs)
+int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8])
{
int err;
struct msr_regs_info rv;
@@ -265,7 +265,7 @@ int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs)
}
EXPORT_SYMBOL(rdmsr_safe_regs_on_cpu);
-int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs)
+int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8])
{
int err;
struct msr_regs_info rv;
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index 3bd905e10ee2..b09cd2ad426c 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -36,7 +36,7 @@ EXPORT_SYMBOL(msrs_free);
* argument @m.
*
*/
-int msr_read(u32 msr, struct msr *m)
+static int msr_read(u32 msr, struct msr *m)
{
int err;
u64 val;
@@ -54,7 +54,7 @@ int msr_read(u32 msr, struct msr *m)
* @msr: MSR to write
* @m: value to write
*/
-int msr_write(u32 msr, struct msr *m)
+static int msr_write(u32 msr, struct msr *m)
{
return wrmsrl_safe(msr, m->q);
}
diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
index f6fb1d218dcc..4d32cb06ffd5 100644
--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -4,33 +4,65 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/cpufeatures.h>
-#include <asm/alternative-asm.h>
+#include <asm/alternative.h>
#include <asm/export.h>
#include <asm/nospec-branch.h>
#include <asm/unwind_hints.h>
#include <asm/frame.h>
-.macro THUNK reg
.section .text.__x86.indirect_thunk
- .align 32
-SYM_FUNC_START(__x86_indirect_thunk_\reg)
- JMP_NOSPEC \reg
-SYM_FUNC_END(__x86_indirect_thunk_\reg)
-
-SYM_FUNC_START_NOALIGN(__x86_retpoline_\reg)
+.macro RETPOLINE reg
ANNOTATE_INTRA_FUNCTION_CALL
- call .Ldo_rop_\@
+ call .Ldo_rop_\@
.Lspec_trap_\@:
UNWIND_HINT_EMPTY
pause
lfence
- jmp .Lspec_trap_\@
+ jmp .Lspec_trap_\@
.Ldo_rop_\@:
- mov %\reg, (%_ASM_SP)
+ mov %\reg, (%_ASM_SP)
UNWIND_HINT_FUNC
ret
-SYM_FUNC_END(__x86_retpoline_\reg)
+.endm
+
+.macro THUNK reg
+
+ .align 32
+
+SYM_FUNC_START(__x86_indirect_thunk_\reg)
+
+ ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \
+ __stringify(RETPOLINE \reg), X86_FEATURE_RETPOLINE, \
+ __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_AMD
+
+SYM_FUNC_END(__x86_indirect_thunk_\reg)
+
+.endm
+
+/*
+ * This generates .altinstr_replacement symbols for use by objtool. They,
+ * however, must not actually live in .altinstr_replacement since that will be
+ * discarded after init, but module alternatives will also reference these
+ * symbols.
+ *
+ * Their names match the "__x86_indirect_" prefix to mark them as retpolines.
+ */
+.macro ALT_THUNK reg
+
+ .align 1
+
+SYM_FUNC_START_NOALIGN(__x86_indirect_alt_call_\reg)
+ ANNOTATE_RETPOLINE_SAFE
+1: call *%\reg
+2: .skip 5-(2b-1b), 0x90
+SYM_FUNC_END(__x86_indirect_alt_call_\reg)
+
+SYM_FUNC_START_NOALIGN(__x86_indirect_alt_jmp_\reg)
+ ANNOTATE_RETPOLINE_SAFE
+1: jmp *%\reg
+2: .skip 5-(2b-1b), 0x90
+SYM_FUNC_END(__x86_indirect_alt_jmp_\reg)
.endm
@@ -48,7 +80,6 @@ SYM_FUNC_END(__x86_retpoline_\reg)
#define __EXPORT_THUNK(sym) _ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym)
#define EXPORT_THUNK(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg)
-#define EXPORT_RETPOLINE(reg) __EXPORT_THUNK(__x86_retpoline_ ## reg)
#undef GEN
#define GEN(reg) THUNK reg
@@ -59,5 +90,13 @@ SYM_FUNC_END(__x86_retpoline_\reg)
#include <asm/GEN-for-each-reg.h>
#undef GEN
-#define GEN(reg) EXPORT_RETPOLINE(reg)
+#define GEN(reg) ALT_THUNK reg
+#include <asm/GEN-for-each-reg.h>
+
+#undef GEN
+#define GEN(reg) __EXPORT_THUNK(__x86_indirect_alt_call_ ## reg)
+#include <asm/GEN-for-each-reg.h>
+
+#undef GEN
+#define GEN(reg) __EXPORT_THUNK(__x86_indirect_alt_jmp_ ## reg)
#include <asm/GEN-for-each-reg.h>
diff --git a/arch/x86/math-emu/fpu_trig.c b/arch/x86/math-emu/fpu_trig.c
index 4a9887851ad8..990d847ae902 100644
--- a/arch/x86/math-emu/fpu_trig.c
+++ b/arch/x86/math-emu/fpu_trig.c
@@ -547,7 +547,7 @@ static void frndint_(FPU_REG *st0_ptr, u_char st0_tag)
single_arg_error(st0_ptr, st0_tag);
}
-static int fsin(FPU_REG *st0_ptr, u_char tag)
+static int f_sin(FPU_REG *st0_ptr, u_char tag)
{
u_char arg_sign = getsign(st0_ptr);
@@ -608,6 +608,11 @@ static int fsin(FPU_REG *st0_ptr, u_char tag)
}
}
+static void fsin(FPU_REG *st0_ptr, u_char tag)
+{
+ f_sin(st0_ptr, tag);
+}
+
static int f_cos(FPU_REG *st0_ptr, u_char tag)
{
u_char st0_sign;
@@ -724,7 +729,7 @@ static void fsincos(FPU_REG *st0_ptr, u_char st0_tag)
}
reg_copy(st0_ptr, &arg);
- if (!fsin(st0_ptr, st0_tag)) {
+ if (!f_sin(st0_ptr, st0_tag)) {
push();
FPU_copy_to_reg0(&arg, st0_tag);
f_cos(&st(0), st0_tag);
@@ -1635,7 +1640,7 @@ void FPU_triga(void)
}
static FUNC_ST0 const trig_table_b[] = {
- fprem, fyl2xp1, fsqrt_, fsincos, frndint_, fscale, (FUNC_ST0) fsin, fcos
+ fprem, fyl2xp1, fsqrt_, fsincos, frndint_, fscale, fsin, fcos
};
void FPU_trigb(void)
diff --git a/arch/x86/math-emu/reg_ld_str.c b/arch/x86/math-emu/reg_ld_str.c
index fe6246ff9887..7ca6417c0c8d 100644
--- a/arch/x86/math-emu/reg_ld_str.c
+++ b/arch/x86/math-emu/reg_ld_str.c
@@ -964,7 +964,7 @@ int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d)
/* The return value (in eax) is zero if the result is exact,
if bits are changed due to rounding, truncation, etc, then
a non-zero value is returned */
-/* Overflow is signalled by a non-zero return value (in eax).
+/* Overflow is signaled by a non-zero return value (in eax).
In the case of overflow, the returned significand always has the
largest possible value */
int FPU_round_to_int(FPU_REG *r, u_char tag)
diff --git a/arch/x86/math-emu/reg_round.S b/arch/x86/math-emu/reg_round.S
index 11a1f798451b..4a9fc3cc5a4d 100644
--- a/arch/x86/math-emu/reg_round.S
+++ b/arch/x86/math-emu/reg_round.S
@@ -575,7 +575,7 @@ Normalise_result:
#ifdef PECULIAR_486
/*
* This implements a special feature of 80486 behaviour.
- * Underflow will be signalled even if the number is
+ * Underflow will be signaled even if the number is
* not a denormal after rounding.
* This difference occurs only for masked underflow, and not
* in the unmasked case.
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a73347e2cdfc..1c548ad00752 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1497,7 +1497,7 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
* userspace task is trying to access some valid (from guest's point of
* view) memory which is not currently mapped by the host (e.g. the
* memory is swapped out). Note, the corresponding "page ready" event
- * which is injected when the memory becomes available, is delived via
+ * which is injected when the memory becomes available, is delivered via
* an interrupt mechanism and not a #PF exception
* (see arch/x86/kernel/kvm.c: sysvec_kvm_asyncpf_interrupt()).
*
@@ -1523,7 +1523,7 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
*
* In case the fault hit a RCU idle region the conditional entry
* code reenabled RCU to avoid subsequent wreckage which helps
- * debugability.
+ * debuggability.
*/
state = irqentry_enter(regs);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index dd694fb93916..75ef19aa8903 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -29,7 +29,7 @@
/*
* We need to define the tracepoints somewhere, and tlb.c
- * is only compied when SMP=y.
+ * is only compiled when SMP=y.
*/
#define CREATE_TRACE_POINTS
#include <trace/events/tlb.h>
@@ -756,7 +756,7 @@ void __init init_mem_mapping(void)
#ifdef CONFIG_X86_64
if (max_pfn > max_low_pfn) {
- /* can we preseve max_low_pfn ?*/
+ /* can we preserve max_low_pfn ?*/
max_low_pfn = max_pfn;
}
#else
@@ -939,7 +939,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end)
{
/*
* end could be not aligned, and We can not align that,
- * decompresser could be confused by aligned initrd_end
+ * decompressor could be confused by aligned initrd_end
* We already reserve the end partial page before in
* - i386_start_kernel()
* - x86_64_start_kernel()
@@ -1017,7 +1017,7 @@ void __init zone_sizes_init(void)
free_area_init(max_zone_pfns);
}
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
+__visible DEFINE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate) = {
.loaded_mm = &init_mm,
.next_asid = 1,
.cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index da31c2635ee4..21ffb03f6c72 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -755,8 +755,6 @@ void __init mem_init(void)
after_bootmem = 1;
x86_init.hyper.init_after_bootmem();
- mem_init_print_info(NULL);
-
/*
* Check boundaries twice: Some fundamental inconsistencies can
* be detected at build time already.
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index b5a3fa4033d3..e527d829e1ed 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -172,7 +172,7 @@ static void sync_global_pgds_l4(unsigned long start, unsigned long end)
/*
* With folded p4d, pgd_none() is always false, we need to
- * handle synchonization on p4d level.
+ * handle synchronization on p4d level.
*/
MAYBE_BUILD_BUG_ON(pgd_none(*pgd_ref));
p4d_ref = p4d_offset(pgd_ref, addr);
@@ -826,6 +826,106 @@ void __init paging_init(void)
zone_sizes_init();
}
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+#define PAGE_UNUSED 0xFD
+
+/*
+ * The unused vmemmap range, which was not yet memset(PAGE_UNUSED), ranges
+ * from unused_pmd_start to next PMD_SIZE boundary.
+ */
+static unsigned long unused_pmd_start __meminitdata;
+
+static void __meminit vmemmap_flush_unused_pmd(void)
+{
+ if (!unused_pmd_start)
+ return;
+ /*
+ * Clears (unused_pmd_start, PMD_END]
+ */
+ memset((void *)unused_pmd_start, PAGE_UNUSED,
+ ALIGN(unused_pmd_start, PMD_SIZE) - unused_pmd_start);
+ unused_pmd_start = 0;
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+/* Returns true if the PMD is completely unused and thus it can be freed */
+static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end)
+{
+ unsigned long start = ALIGN_DOWN(addr, PMD_SIZE);
+
+ /*
+ * Flush the unused range cache to ensure that memchr_inv() will work
+ * for the whole range.
+ */
+ vmemmap_flush_unused_pmd();
+ memset((void *)addr, PAGE_UNUSED, end - addr);
+
+ return !memchr_inv((void *)start, PAGE_UNUSED, PMD_SIZE);
+}
+#endif
+
+static void __meminit __vmemmap_use_sub_pmd(unsigned long start)
+{
+ /*
+ * As we expect to add in the same granularity as we remove, it's
+ * sufficient to mark only some piece used to block the memmap page from
+ * getting removed when removing some other adjacent memmap (just in
+ * case the first memmap never gets initialized e.g., because the memory
+ * block never gets onlined).
+ */
+ memset((void *)start, 0, sizeof(struct page));
+}
+
+static void __meminit vmemmap_use_sub_pmd(unsigned long start, unsigned long end)
+{
+ /*
+ * We only optimize if the new used range directly follows the
+ * previously unused range (esp., when populating consecutive sections).
+ */
+ if (unused_pmd_start == start) {
+ if (likely(IS_ALIGNED(end, PMD_SIZE)))
+ unused_pmd_start = 0;
+ else
+ unused_pmd_start = end;
+ return;
+ }
+
+ /*
+ * If the range does not contiguously follow the previous one, make sure
+ * to mark the unused range of the previous one so it can be removed.
+ */
+ vmemmap_flush_unused_pmd();
+ __vmemmap_use_sub_pmd(start);
+}
+
+
+static void __meminit vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end)
+{
+ vmemmap_flush_unused_pmd();
+
+ /*
+ * Could be our memmap page is filled with PAGE_UNUSED already from a
+ * previous remove. Make sure to reset it.
+ */
+ __vmemmap_use_sub_pmd(start);
+
+ /*
+ * Mark with PAGE_UNUSED the unused parts of the new memmap range
+ */
+ if (!IS_ALIGNED(start, PMD_SIZE))
+ memset((void *)start, PAGE_UNUSED,
+ start - ALIGN_DOWN(start, PMD_SIZE));
+
+ /*
+ * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of
+ * consecutive sections. Remember for the last added PMD where the
+ * unused range begins.
+ */
+ if (!IS_ALIGNED(end, PMD_SIZE))
+ unused_pmd_start = end;
+}
+#endif
+
/*
* Memory hotplug specific functions
*/
@@ -871,8 +971,6 @@ int arch_add_memory(int nid, u64 start, u64 size,
return add_pages(nid, start_pfn, nr_pages, params);
}
-#define PAGE_INUSE 0xFD
-
static void __meminit free_pagetable(struct page *page, int order)
{
unsigned long magic;
@@ -962,7 +1060,6 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
{
unsigned long next, pages = 0;
pte_t *pte;
- void *page_addr;
phys_addr_t phys_addr;
pte = pte_start + pte_index(addr);
@@ -983,42 +1080,15 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
if (phys_addr < (phys_addr_t)0x40000000)
return;
- if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
- /*
- * Do not free direct mapping pages since they were
- * freed when offlining, or simplely not in use.
- */
- if (!direct)
- free_pagetable(pte_page(*pte), 0);
-
- spin_lock(&init_mm.page_table_lock);
- pte_clear(&init_mm, addr, pte);
- spin_unlock(&init_mm.page_table_lock);
-
- /* For non-direct mapping, pages means nothing. */
- pages++;
- } else {
- /*
- * If we are here, we are freeing vmemmap pages since
- * direct mapped memory ranges to be freed are aligned.
- *
- * If we are not removing the whole page, it means
- * other page structs in this page are being used and
- * we canot remove them. So fill the unused page_structs
- * with 0xFD, and remove the page when it is wholly
- * filled with 0xFD.
- */
- memset((void *)addr, PAGE_INUSE, next - addr);
+ if (!direct)
+ free_pagetable(pte_page(*pte), 0);
- page_addr = page_address(pte_page(*pte));
- if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
- free_pagetable(pte_page(*pte), 0);
+ spin_lock(&init_mm.page_table_lock);
+ pte_clear(&init_mm, addr, pte);
+ spin_unlock(&init_mm.page_table_lock);
- spin_lock(&init_mm.page_table_lock);
- pte_clear(&init_mm, addr, pte);
- spin_unlock(&init_mm.page_table_lock);
- }
- }
+ /* For non-direct mapping, pages means nothing. */
+ pages++;
}
/* Call free_pte_table() in remove_pmd_table(). */
@@ -1034,7 +1104,6 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
unsigned long next, pages = 0;
pte_t *pte_base;
pmd_t *pmd;
- void *page_addr;
pmd = pmd_start + pmd_index(addr);
for (; addr < end; addr = next, pmd++) {
@@ -1054,22 +1123,16 @@ remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
pmd_clear(pmd);
spin_unlock(&init_mm.page_table_lock);
pages++;
- } else {
- /* If here, we are freeing vmemmap pages. */
- memset((void *)addr, PAGE_INUSE, next - addr);
-
- page_addr = page_address(pmd_page(*pmd));
- if (!memchr_inv(page_addr, PAGE_INUSE,
- PMD_SIZE)) {
+ }
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ else if (vmemmap_pmd_is_unused(addr, next)) {
free_hugepage_table(pmd_page(*pmd),
altmap);
-
spin_lock(&init_mm.page_table_lock);
pmd_clear(pmd);
spin_unlock(&init_mm.page_table_lock);
- }
}
-
+#endif
continue;
}
@@ -1090,7 +1153,6 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
unsigned long next, pages = 0;
pmd_t *pmd_base;
pud_t *pud;
- void *page_addr;
pud = pud_start + pud_index(addr);
for (; addr < end; addr = next, pud++) {
@@ -1099,33 +1161,13 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
if (!pud_present(*pud))
continue;
- if (pud_large(*pud)) {
- if (IS_ALIGNED(addr, PUD_SIZE) &&
- IS_ALIGNED(next, PUD_SIZE)) {
- if (!direct)
- free_pagetable(pud_page(*pud),
- get_order(PUD_SIZE));
-
- spin_lock(&init_mm.page_table_lock);
- pud_clear(pud);
- spin_unlock(&init_mm.page_table_lock);
- pages++;
- } else {
- /* If here, we are freeing vmemmap pages. */
- memset((void *)addr, PAGE_INUSE, next - addr);
-
- page_addr = page_address(pud_page(*pud));
- if (!memchr_inv(page_addr, PAGE_INUSE,
- PUD_SIZE)) {
- free_pagetable(pud_page(*pud),
- get_order(PUD_SIZE));
-
- spin_lock(&init_mm.page_table_lock);
- pud_clear(pud);
- spin_unlock(&init_mm.page_table_lock);
- }
- }
-
+ if (pud_large(*pud) &&
+ IS_ALIGNED(addr, PUD_SIZE) &&
+ IS_ALIGNED(next, PUD_SIZE)) {
+ spin_lock(&init_mm.page_table_lock);
+ pud_clear(pud);
+ spin_unlock(&init_mm.page_table_lock);
+ pages++;
continue;
}
@@ -1197,6 +1239,9 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct,
void __ref vmemmap_free(unsigned long start, unsigned long end,
struct vmem_altmap *altmap)
{
+ VM_BUG_ON(!IS_ALIGNED(start, PAGE_SIZE));
+ VM_BUG_ON(!IS_ALIGNED(end, PAGE_SIZE));
+
remove_pagetable(start, end, false, altmap);
}
@@ -1306,8 +1351,6 @@ void __init mem_init(void)
kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, PAGE_SIZE, KCORE_USER);
preallocate_vmalloc_pages();
-
- mem_init_print_info(NULL);
}
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
@@ -1538,11 +1581,17 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
addr_end = addr + PMD_SIZE;
p_end = p + PMD_SIZE;
+
+ if (!IS_ALIGNED(addr, PMD_SIZE) ||
+ !IS_ALIGNED(next, PMD_SIZE))
+ vmemmap_use_new_sub_pmd(addr, next);
+
continue;
} else if (altmap)
return -ENOMEM; /* no fallback */
} else if (pmd_large(*pmd)) {
vmemmap_verify((pte_t *)pmd, node, addr, next);
+ vmemmap_use_sub_pmd(addr, next);
continue;
}
if (vmemmap_populate_basepages(addr, next, node, NULL))
@@ -1556,6 +1605,9 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
{
int err;
+ VM_BUG_ON(!IS_ALIGNED(start, PAGE_SIZE));
+ VM_BUG_ON(!IS_ALIGNED(end, PAGE_SIZE));
+
if (end - start < PAGES_PER_SECTION * sizeof(struct page))
err = vmemmap_populate_basepages(start, end, node, NULL);
else if (boot_cpu_has(X86_FEATURE_PSE))
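
As a side note, the PAGE_UNUSED bookkeeping added to init_64.c above boils down to a simple sentinel scheme: poison the parts of a PMD-backed vmemmap range that are no longer needed, and free the whole PMD only once every byte carries the sentinel. The stand-alone, user-space sketch below illustrates just that idea; BLOCK_SIZE, SENTINEL and the helper names are illustrative stand-ins, not the kernel's symbols.

#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>

#define BLOCK_SIZE	(2UL * 1024 * 1024)	/* stand-in for PMD_SIZE */
#define SENTINEL	0xFD			/* stand-in for PAGE_UNUSED */

/* Poison [off, off + len) of the block once it stops being used. */
static void mark_unused(unsigned char *block, size_t off, size_t len)
{
	memset(block + off, SENTINEL, len);
}

/* The whole block may be freed only when no byte differs from the sentinel. */
static bool block_is_unused(const unsigned char *block)
{
	for (size_t i = 0; i < BLOCK_SIZE; i++)
		if (block[i] != SENTINEL)
			return false;	/* something here is still live */
	return true;
}

int main(void)
{
	unsigned char *block = calloc(1, BLOCK_SIZE);

	if (!block)
		return 1;

	mark_unused(block, 0, BLOCK_SIZE / 2);
	/* Half poisoned: block_is_unused() is false, keep the block. */
	mark_unused(block, BLOCK_SIZE / 2, BLOCK_SIZE / 2);
	/* Fully poisoned: block_is_unused() is true, safe to release. */
	if (block_is_unused(block))
		free(block);
	return 0;
}
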
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 9e5ccc56f8e0..12c686c65ea9 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -481,25 +481,6 @@ void iounmap(volatile void __iomem *addr)
}
EXPORT_SYMBOL(iounmap);
-int __init arch_ioremap_p4d_supported(void)
-{
- return 0;
-}
-
-int __init arch_ioremap_pud_supported(void)
-{
-#ifdef CONFIG_X86_64
- return boot_cpu_has(X86_FEATURE_GBPAGES);
-#else
- return 0;
-#endif
-}
-
-int __init arch_ioremap_pmd_supported(void)
-{
- return boot_cpu_has(X86_FEATURE_PSE);
-}
-
/*
* Convert a physical pointer to a virtual kernel pointer for /dev/mem
* access
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 6e6b39710e5f..557f0fe25dff 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -96,7 +96,7 @@ void __init kernel_randomize_memory(void)
memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) +
CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
- /* Adapt phyiscal memory region size based on available memory */
+ /* Adapt physical memory region size based on available memory */
if (memory_tb < kaslr_regions[0].size_tb)
kaslr_regions[0].size_tb = memory_tb;
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index be020a7bc414..d3efbc5b3449 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/* Support for MMIO probes.
- * Benfit many code from kprobes
+ * Benefit many code from kprobes
* (C) 2002 Louis Zhuang <louis.zhuang@intel.com>.
* 2007 Alexander Eichner
* 2008 Pekka Paalanen <pq@iki.fi>
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index ae78cef79980..f633f9e23b8f 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -19,6 +19,7 @@
#include <linux/kernel.h>
#include <linux/bitops.h>
#include <linux/dma-mapping.h>
+#include <linux/virtio_config.h>
#include <asm/tlbflush.h>
#include <asm/fixmap.h>
@@ -484,3 +485,8 @@ void __init mem_encrypt_init(void)
print_mem_encrypt_feature_info();
}
+int arch_has_restricted_virtio_memory_access(void)
+{
+ return sev_active();
+}
+EXPORT_SYMBOL_GPL(arch_has_restricted_virtio_memory_access);
diff --git a/arch/x86/mm/mem_encrypt_boot.S b/arch/x86/mm/mem_encrypt_boot.S
index 7a84fc8bc5c3..17d292b7072f 100644
--- a/arch/x86/mm/mem_encrypt_boot.S
+++ b/arch/x86/mm/mem_encrypt_boot.S
@@ -27,7 +27,7 @@ SYM_FUNC_START(sme_encrypt_execute)
* - stack page (PAGE_SIZE)
* - encryption routine page (PAGE_SIZE)
* - intermediate copy buffer (PMD_PAGE_SIZE)
- * R8 - physcial address of the pagetables to use for encryption
+ * R8 - physical address of the pagetables to use for encryption
*/
push %rbp
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c
index 6c5eb6f3f14f..a19374d26101 100644
--- a/arch/x86/mm/mem_encrypt_identity.c
+++ b/arch/x86/mm/mem_encrypt_identity.c
@@ -503,14 +503,10 @@ void __init sme_enable(struct boot_params *bp)
#define AMD_SME_BIT BIT(0)
#define AMD_SEV_BIT BIT(1)
- /*
- * Set the feature mask (SME or SEV) based on whether we are
- * running under a hypervisor.
- */
- eax = 1;
- ecx = 0;
- native_cpuid(&eax, &ebx, &ecx, &edx);
- feature_mask = (ecx & BIT(31)) ? AMD_SEV_BIT : AMD_SME_BIT;
+
+ /* Check the SEV MSR whether SEV or SME is enabled */
+ sev_status = __rdmsr(MSR_AMD64_SEV);
+ feature_mask = (sev_status & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT;
/*
* Check for the SME/SEV feature:
@@ -530,19 +526,26 @@ void __init sme_enable(struct boot_params *bp)
/* Check if memory encryption is enabled */
if (feature_mask == AMD_SME_BIT) {
+ /*
+ * No SME if Hypervisor bit is set. This check is here to
+ * prevent a guest from trying to enable SME. For running as a
+ * KVM guest the MSR_K8_SYSCFG will be sufficient, but there
+ * might be other hypervisors which emulate that MSR as non-zero
+ * or even pass it through to the guest.
+ * A malicious hypervisor can still trick a guest into this
+ * path, but there is no way to protect against that.
+ */
+ eax = 1;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ if (ecx & BIT(31))
+ return;
+
/* For SME, check the SYSCFG MSR */
msr = __rdmsr(MSR_K8_SYSCFG);
if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
return;
} else {
- /* For SEV, check the SEV MSR */
- msr = __rdmsr(MSR_AMD64_SEV);
- if (!(msr & MSR_AMD64_SEV_ENABLED))
- return;
-
- /* Save SEV_STATUS to avoid reading MSR again */
- sev_status = msr;
-
/* SEV state cannot be controlled by a command line option */
sme_me_mask = me_mask;
sev_enabled = true;
diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index ca311aaa67b8..3112ca7786ed 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -695,7 +695,7 @@ int memtype_free(u64 start, u64 end)
/**
- * lookup_memtype - Looksup the memory type for a physical address
+ * lookup_memtype - Looks up the memory type for a physical address
* @paddr: physical address of which memory type needs to be looked up
*
* Only to be called when PAT is enabled
@@ -800,6 +800,7 @@ void memtype_free_io(resource_size_t start, resource_size_t end)
memtype_free(start, end);
}
+#ifdef CONFIG_X86_PAT
int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size)
{
enum page_cache_mode type = _PAGE_CACHE_MODE_WC;
@@ -813,6 +814,7 @@ void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size)
memtype_free_io(start, start + size);
}
EXPORT_SYMBOL(arch_io_free_memtype_wc);
+#endif
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
unsigned long size, pgprot_t vma_prot)
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 16f878c26667..427980617557 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -680,7 +680,7 @@ pmd_t *lookup_pmd_address(unsigned long address)
* end up in this kind of memory, for instance.
*
* This could be optimized, but it is only intended to be
- * used at inititalization time, and keeping it
+ * used at initialization time, and keeping it
* unoptimized should increase the testing coverage for
* the more obscure platforms.
*/
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index f6a9e2e36642..d27cf69e811d 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -780,14 +780,6 @@ int pmd_clear_huge(pmd_t *pmd)
return 0;
}
-/*
- * Until we support 512GB pages, skip them in the vmap area.
- */
-int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
-{
- return 0;
-}
-
#ifdef CONFIG_X86_64
/**
* pud_free_pmd_page - Clear pud entry and free pmd page.
@@ -861,11 +853,6 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
#else /* !CONFIG_X86_64 */
-int pud_free_pmd_page(pud_t *pud, unsigned long addr)
-{
- return pud_none(*pud);
-}
-
/*
* Disable free page handling on x86-PAE. This assures that ioremap()
* does not update sync'd pmd entries. See vmalloc_sync_one().
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index 8873ed1438a9..a2332eef66e9 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -128,7 +128,7 @@ u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) |
/*
* Called from the FPU code when creating a fresh set of FPU
* registers. This is called from a very specific context where
- * we know the FPU regstiers are safe for use and we can use PKRU
+ * we know the FPU registers are safe for use and we can use PKRU
* directly.
*/
void copy_init_pkru_to_fpregs(void)
diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 1aab92930569..5d5c7bb50ce9 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -361,7 +361,7 @@ pti_clone_pgtable(unsigned long start, unsigned long end,
* global, so set it as global in both copies. Note:
* the X86_FEATURE_PGE check is not _required_ because
* the CPU ignores _PAGE_GLOBAL when PGE is not
- * supported. The check keeps consistentency with
+ * supported. The check keeps consistency with
* code that only set this bit when supported.
*/
if (boot_cpu_has(X86_FEATURE_PGE))
@@ -440,10 +440,9 @@ static void __init pti_clone_user_shared(void)
for_each_possible_cpu(cpu) {
/*
- * The SYSCALL64 entry code needs to be able to find the
- * thread stack and needs one word of scratch space in which
- * to spill a register. All of this lives in the TSS, in
- * the sp1 and sp2 slots.
+ * The SYSCALL64 entry code needs one word of scratch space
+ * in which to spill a register. It lives in the sp2 slot
+ * of the CPU's TSS.
*
* This is done for all possible CPUs during boot to ensure
* that it's propagated to all mms.
@@ -512,7 +511,7 @@ static void pti_clone_entry_text(void)
static inline bool pti_kernel_image_global_ok(void)
{
/*
- * Systems with PCIDs get litlle benefit from global
+ * Systems with PCIDs get little benefit from global
* kernel text and are not worth the downsides.
*/
if (cpu_feature_enabled(X86_FEATURE_PCID))
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 569ac1d57f55..78804680e923 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -24,7 +24,7 @@
# define __flush_tlb_local native_flush_tlb_local
# define __flush_tlb_global native_flush_tlb_global
# define __flush_tlb_one_user(addr) native_flush_tlb_one_user(addr)
-# define __flush_tlb_others(msk, info) native_flush_tlb_others(msk, info)
+# define __flush_tlb_multi(msk, info) native_flush_tlb_multi(msk, info)
#endif
/*
@@ -106,7 +106,7 @@ static inline u16 kern_pcid(u16 asid)
#ifdef CONFIG_PAGE_TABLE_ISOLATION
/*
- * Make sure that the dynamic ASID space does not confict with the
+ * Make sure that the dynamic ASID space does not conflict with the
* bit we are using to switch between user and kernel ASIDs.
*/
BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT));
@@ -300,7 +300,7 @@ void leave_mm(int cpu)
return;
/* Warn if we're not lazy. */
- WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy));
+ WARN_ON(!this_cpu_read(cpu_tlbstate_shared.is_lazy));
switch_mm(NULL, &init_mm, NULL);
}
@@ -316,7 +316,7 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
local_irq_restore(flags);
}
-static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
+static unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
{
unsigned long next_tif = task_thread_info(next)->flags;
unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB;
@@ -424,7 +424,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
{
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
- bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
+ bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy);
unsigned cpu = smp_processor_id();
u64 next_tlb_gen;
bool need_flush;
@@ -439,7 +439,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* NB: leave_mm() calls us with prev == NULL and tsk == NULL.
*/
- /* We don't want flush_tlb_func_* to run concurrently with us. */
+ /* We don't want flush_tlb_func() to run concurrently with us. */
if (IS_ENABLED(CONFIG_PROVE_LOCKING))
WARN_ON_ONCE(!irqs_disabled());
@@ -469,7 +469,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
__flush_tlb_all();
}
#endif
- this_cpu_write(cpu_tlbstate.is_lazy, false);
+ if (was_lazy)
+ this_cpu_write(cpu_tlbstate_shared.is_lazy, false);
/*
* The membarrier system call requires a full memory barrier and
@@ -490,7 +491,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
/*
* Even in lazy TLB mode, the CPU should stay set in the
* mm_cpumask. The TLB shootdown code can figure out from
- * from cpu_tlbstate.is_lazy whether or not to send an IPI.
+ * cpu_tlbstate_shared.is_lazy whether or not to send an IPI.
*/
if (WARN_ON_ONCE(real_prev != &init_mm &&
!cpumask_test_cpu(cpu, mm_cpumask(next))))
@@ -598,7 +599,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
return;
- this_cpu_write(cpu_tlbstate.is_lazy, true);
+ this_cpu_write(cpu_tlbstate_shared.is_lazy, true);
}
/*
@@ -647,14 +648,13 @@ void initialize_tlbstate_and_flush(void)
}
/*
- * flush_tlb_func_common()'s memory ordering requirement is that any
+ * flush_tlb_func()'s memory ordering requirement is that any
* TLB fills that happen after we flush the TLB are ordered after we
* read active_mm's tlb_gen. We don't need any explicit barriers
* because all x86 flush operations are serializing and the
* atomic64_read operation won't be reordered by the compiler.
*/
-static void flush_tlb_func_common(const struct flush_tlb_info *f,
- bool local, enum tlb_flush_reason reason)
+static void flush_tlb_func(void *info)
{
/*
* We have three different tlb_gen values in here. They are:
@@ -665,28 +665,40 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
* - f->new_tlb_gen: the generation that the requester of the flush
* wants us to catch up to.
*/
+ const struct flush_tlb_info *f = info;
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
+ bool local = smp_processor_id() == f->initiating_cpu;
+ unsigned long nr_invalidate = 0;
/* This code cannot presently handle being reentered. */
VM_WARN_ON(!irqs_disabled());
+ if (!local) {
+ inc_irq_stat(irq_tlb_count);
+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+
+ /* Can only happen on remote CPUs */
+ if (f->mm && f->mm != loaded_mm)
+ return;
+ }
+
if (unlikely(loaded_mm == &init_mm))
return;
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
loaded_mm->context.ctx_id);
- if (this_cpu_read(cpu_tlbstate.is_lazy)) {
+ if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) {
/*
* We're in lazy mode. We need to at least flush our
* paging-structure cache to avoid speculatively reading
* garbage into our TLB. Since switching to init_mm is barely
* slower than a minimal flush, just switch to init_mm.
*
- * This should be rare, with native_flush_tlb_others skipping
+ * This should be rare, with native_flush_tlb_multi() skipping
* IPIs to lazy TLB mode CPUs.
*/
switch_mm_irqs_off(NULL, &init_mm, NULL);
@@ -700,8 +712,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
* be handled can catch us all the way up, leaving no work for
* the second flush.
*/
- trace_tlb_flush(reason, 0);
- return;
+ goto done;
}
WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
@@ -736,7 +747,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
* 3, we'd be break the invariant: we'd update local_tlb_gen above
* 1 without the full flush that's needed for tlb_gen 2.
*
- * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimiation.
+ * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimization.
* Partial TLB flushes are not all that much cheaper than full TLB
* flushes, so it seems unlikely that it would be a performance win
* to do a partial flush if that won't bring our TLB fully up to
@@ -748,56 +759,54 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
f->new_tlb_gen == local_tlb_gen + 1 &&
f->new_tlb_gen == mm_tlb_gen) {
/* Partial flush */
- unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift;
unsigned long addr = f->start;
+ nr_invalidate = (f->end - f->start) >> f->stride_shift;
+
while (addr < f->end) {
flush_tlb_one_user(addr);
addr += 1UL << f->stride_shift;
}
if (local)
count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate);
- trace_tlb_flush(reason, nr_invalidate);
} else {
/* Full flush. */
+ nr_invalidate = TLB_FLUSH_ALL;
+
flush_tlb_local();
if (local)
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
- trace_tlb_flush(reason, TLB_FLUSH_ALL);
}
/* Both paths above update our state to mm_tlb_gen. */
this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
-}
-
-static void flush_tlb_func_local(const void *info, enum tlb_flush_reason reason)
-{
- const struct flush_tlb_info *f = info;
- flush_tlb_func_common(f, true, reason);
+ /* Tracing is done in a unified manner to reduce the code size */
+done:
+ trace_tlb_flush(!local ? TLB_REMOTE_SHOOTDOWN :
+ (f->mm == NULL) ? TLB_LOCAL_SHOOTDOWN :
+ TLB_LOCAL_MM_SHOOTDOWN,
+ nr_invalidate);
}
-static void flush_tlb_func_remote(void *info)
+static bool tlb_is_not_lazy(int cpu)
{
- const struct flush_tlb_info *f = info;
-
- inc_irq_stat(irq_tlb_count);
-
- if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm))
- return;
-
- count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
- flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
+ return !per_cpu(cpu_tlbstate_shared.is_lazy, cpu);
}
-static bool tlb_is_not_lazy(int cpu, void *data)
-{
- return !per_cpu(cpu_tlbstate.is_lazy, cpu);
-}
+static DEFINE_PER_CPU(cpumask_t, flush_tlb_mask);
+
+DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared);
+EXPORT_PER_CPU_SYMBOL(cpu_tlbstate_shared);
-STATIC_NOPV void native_flush_tlb_others(const struct cpumask *cpumask,
+STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
+ /*
+ * Do accounting and tracing. Note that there are (and have always been)
+ * cases in which a remote TLB flush is traced, but then does not
+ * actually happen.
+ */
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
if (info->end == TLB_FLUSH_ALL)
trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
@@ -815,18 +824,42 @@ STATIC_NOPV void native_flush_tlb_others(const struct cpumask *cpumask,
* up on the new contents of what used to be page tables, while
* doing a speculative memory access.
*/
- if (info->freed_tables)
- smp_call_function_many(cpumask, flush_tlb_func_remote,
- (void *)info, 1);
- else
- on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote,
- (void *)info, 1, cpumask);
+ if (info->freed_tables) {
+ on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true);
+ } else {
+ /*
+ * Although we could have used on_each_cpu_cond_mask(),
+ * open-coding it has performance advantages, as it eliminates
+ * the need for indirect calls or retpolines. In addition, it
+ * allows the use of a designated cpumask for evaluating the
+ * condition instead of allocating one.
+ *
+ * This code works under the assumption that there are no nested
+ * TLB flushes, an assumption that is already made in
+ * flush_tlb_mm_range().
+ *
+ * cond_cpumask is logically a stack-local variable, but it is
+ * more efficient to have it off the stack and not to allocate
+ * it on demand. Preemption is disabled and this code is
+ * non-reentrant.
+ */
+ struct cpumask *cond_cpumask = this_cpu_ptr(&flush_tlb_mask);
+ int cpu;
+
+ cpumask_clear(cond_cpumask);
+
+ for_each_cpu(cpu, cpumask) {
+ if (tlb_is_not_lazy(cpu))
+ __cpumask_set_cpu(cpu, cond_cpumask);
+ }
+ on_each_cpu_mask(cond_cpumask, flush_tlb_func, (void *)info, true);
+ }
}
-void flush_tlb_others(const struct cpumask *cpumask,
+void flush_tlb_multi(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
- __flush_tlb_others(cpumask, info);
+ __flush_tlb_multi(cpumask, info);
}
/*
@@ -847,7 +880,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);
static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
#endif
-static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
+static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
unsigned long start, unsigned long end,
unsigned int stride_shift, bool freed_tables,
u64 new_tlb_gen)
@@ -869,14 +902,15 @@ static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
info->stride_shift = stride_shift;
info->freed_tables = freed_tables;
info->new_tlb_gen = new_tlb_gen;
+ info->initiating_cpu = smp_processor_id();
return info;
}
-static inline void put_flush_tlb_info(void)
+static void put_flush_tlb_info(void)
{
#ifdef CONFIG_DEBUG_VM
- /* Complete reentrency prevention checks */
+ /* Complete reentrancy prevention checks */
barrier();
this_cpu_dec(flush_tlb_info_idx);
#endif
@@ -905,16 +939,20 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables,
new_tlb_gen);
- if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
+ /*
+ * flush_tlb_multi() is not optimized for the common case in which only
+ * a local TLB flush is needed. Optimize this use-case by calling
+ * flush_tlb_func() directly in this case.
+ */
+ if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
+ flush_tlb_multi(mm_cpumask(mm), info);
+ } else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
lockdep_assert_irqs_enabled();
local_irq_disable();
- flush_tlb_func_local(info, TLB_LOCAL_MM_SHOOTDOWN);
+ flush_tlb_func(info);
local_irq_enable();
}
- if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
- flush_tlb_others(mm_cpumask(mm), info);
-
put_flush_tlb_info();
put_cpu();
}
@@ -1119,34 +1157,30 @@ void __flush_tlb_all(void)
}
EXPORT_SYMBOL_GPL(__flush_tlb_all);
-/*
- * arch_tlbbatch_flush() performs a full TLB flush regardless of the active mm.
- * This means that the 'struct flush_tlb_info' that describes which mappings to
- * flush is actually fixed. We therefore set a single fixed struct and use it in
- * arch_tlbbatch_flush().
- */
-static const struct flush_tlb_info full_flush_tlb_info = {
- .mm = NULL,
- .start = 0,
- .end = TLB_FLUSH_ALL,
-};
-
void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
{
+ struct flush_tlb_info *info;
+
int cpu = get_cpu();
- if (cpumask_test_cpu(cpu, &batch->cpumask)) {
+ info = get_flush_tlb_info(NULL, 0, TLB_FLUSH_ALL, 0, false, 0);
+ /*
+ * flush_tlb_multi() is not optimized for the common case in which only
+ * a local TLB flush is needed. Optimize this use-case by calling
+ * flush_tlb_func() directly in this case.
+ */
+ if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
+ flush_tlb_multi(&batch->cpumask, info);
+ } else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
lockdep_assert_irqs_enabled();
local_irq_disable();
- flush_tlb_func_local(&full_flush_tlb_info, TLB_LOCAL_SHOOTDOWN);
+ flush_tlb_func(info);
local_irq_enable();
}
- if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
- flush_tlb_others(&batch->cpumask, &full_flush_tlb_info);
-
cpumask_clear(&batch->cpumask);
+ put_flush_tlb_info();
put_cpu();
}
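
The comment in native_flush_tlb_multi() above argues for open-coding on_each_cpu_cond_mask(): filter the targets into a preallocated per-CPU scratch mask, then make one direct on_each_cpu_mask() call. Below is a hedged, generic sketch of that shape; the cpumask/smp helpers are the documented kernel APIs, while example_scratch_mask, should_notify() and example_notify() are made-up names, and the caller is assumed to run with preemption disabled, as the TLB flush path does.

#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/smp.h>

static DEFINE_PER_CPU(cpumask_t, example_scratch_mask);

/* Placeholder condition; the TLB code checks per-CPU laziness here. */
static bool should_notify(int cpu)
{
	return true;
}

static void example_notify(const struct cpumask *targets,
			   smp_call_func_t func, void *info)
{
	struct cpumask *cond = this_cpu_ptr(&example_scratch_mask);
	int cpu;

	cpumask_clear(cond);
	for_each_cpu(cpu, targets) {
		if (should_notify(cpu))
			__cpumask_set_cpu(cpu, cond);
	}

	/* One plain call, no indirect condition callback per target CPU. */
	on_each_cpu_mask(cond, func, info, true);
}
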
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 7f1b3a862e14..2a2e290fa5d8 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -282,7 +282,7 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
/* BPF trampoline can be made to work without these nops,
* but let's waste 5 bytes for now and optimize later
*/
- memcpy(prog, ideal_nops[NOP_ATOMIC5], cnt);
+ memcpy(prog, x86_nops[5], cnt);
prog += cnt;
if (!ebpf_from_cbpf) {
if (tail_call_reachable && !is_subprog)
@@ -330,7 +330,7 @@ static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
void *old_addr, void *new_addr,
const bool text_live)
{
- const u8 *nop_insn = ideal_nops[NOP_ATOMIC5];
+ const u8 *nop_insn = x86_nops[5];
u8 old_insn[X86_PATCH_SIZE];
u8 new_insn[X86_PATCH_SIZE];
u8 *prog;
@@ -560,7 +560,7 @@ static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke,
if (stack_depth)
EMIT3_off32(0x48, 0x81, 0xC4, round_up(stack_depth, 8));
- memcpy(prog, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE);
+ memcpy(prog, x86_nops[5], X86_PATCH_SIZE);
prog += X86_PATCH_SIZE;
/* out: */
@@ -881,7 +881,7 @@ static int emit_nops(u8 **pprog, int len)
noplen = ASM_NOP_MAX;
for (i = 0; i < noplen; i++)
- EMIT1(ideal_nops[noplen][i]);
+ EMIT1(x86_nops[noplen][i]);
len -= noplen;
}
@@ -1556,7 +1556,7 @@ emit_cond_jmp: /* Convert BPF opcode to x86 */
if (is_imm8(jmp_offset)) {
if (jmp_padding) {
/* To keep the jmp_offset valid, the extra bytes are
- * padded before the jump insn, so we substract the
+ * padded before the jump insn, so we subtract the
* 2 bytes of jmp_cond insn from INSN_SZ_DIFF.
*
* If the previous pass already emits an imm8
@@ -1631,7 +1631,7 @@ emit_jmp:
if (jmp_padding) {
/* To avoid breaking jmp_offset, the extra bytes
* are padded before the actual jmp insn, so
- * 2 bytes is substracted from INSN_SZ_DIFF.
+ * 2 bytes is subtracted from INSN_SZ_DIFF.
*
* If the previous pass already emits an imm8
* jmp, there is nothing to pad (0 byte).
@@ -2021,7 +2021,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
/* remember return value in a stack for bpf prog to access */
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
im->ip_after_call = prog;
- memcpy(prog, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE);
+ memcpy(prog, x86_nops[5], X86_PATCH_SIZE);
prog += X86_PATCH_SIZE;
}
@@ -2355,3 +2355,8 @@ out:
tmp : orig_prog);
return prog;
}
+
+bool bpf_jit_supports_kfunc_call(void)
+{
+ return true;
+}
diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
index 6a99def7d315..3da88ded6ee3 100644
--- a/arch/x86/net/bpf_jit_comp32.c
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -1390,6 +1390,19 @@ static inline void emit_push_r64(const u8 src[], u8 **pprog)
*pprog = prog;
}
+static void emit_push_r32(const u8 src[], u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+
+ /* mov ecx,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src_lo));
+ /* push ecx */
+ EMIT1(0x51);
+
+ *pprog = prog;
+}
+
static u8 get_cond_jmp_opcode(const u8 op, bool is_cmp_lo)
{
u8 jmp_cond;
@@ -1459,6 +1472,174 @@ static u8 get_cond_jmp_opcode(const u8 op, bool is_cmp_lo)
return jmp_cond;
}
+/* i386 kernel compiles with "-mregparm=3". From gcc document:
+ *
+ * ==== snippet ====
+ * regparm (number)
+ * On x86-32 targets, the regparm attribute causes the compiler
+ * to pass arguments number one to (number) if they are of integral
+ * type in registers EAX, EDX, and ECX instead of on the stack.
+ * Functions that take a variable number of arguments continue
+ * to be passed all of their arguments on the stack.
+ * ==== snippet ====
+ *
+ * The first three args of a function will be considered for
+ * putting into the 32bit registers EAX, EDX, and ECX.
+ *
+ * Two 32bit registers are used to pass a 64bit arg.
+ *
+ * For example,
+ * void foo(u32 a, u32 b, u32 c, u32 d):
+ * u32 a: EAX
+ * u32 b: EDX
+ * u32 c: ECX
+ * u32 d: stack
+ *
+ * void foo(u64 a, u32 b, u32 c):
+ * u64 a: EAX (lo32) EDX (hi32)
+ * u32 b: ECX
+ * u32 c: stack
+ *
+ * void foo(u32 a, u64 b, u32 c):
+ * u32 a: EAX
+ * u64 b: EDX (lo32) ECX (hi32)
+ * u32 c: stack
+ *
+ * void foo(u32 a, u32 b, u64 c):
+ * u32 a: EAX
+ * u32 b: EDX
+ * u64 c: stack
+ *
+ * The return value will be stored in EAX (and EDX for a 64bit value).
+ *
+ * For example,
+ * u32 foo(u32 a, u32 b, u32 c):
+ * return value: EAX
+ *
+ * u64 foo(u32 a, u32 b, u32 c):
+ * return value: EAX (lo32) EDX (hi32)
+ *
+ * Notes:
+ * The verifier only accepts functions whose args and return value
+ * are integers and pointers, so there is no struct-by-value to
+ * handle.
+ *
+ * emit_kfunc_call() finds out the btf_func_model by calling
+ * bpf_jit_find_kfunc_model(). A btf_func_model
+ * has the details about the number of args, size of each arg,
+ * and the size of the return value.
+ *
+ * It first decides how many args can be passed by EAX, EDX, and ECX.
+ * That will decide what args should be pushed to the stack:
+ * [first_stack_regno, last_stack_regno] are the bpf regnos
+ * that should be pushed to the stack.
+ *
+ * It will first push all args to the stack because the push
+ * will need to use ECX. Then, it moves
+ * [BPF_REG_1, first_stack_regno) to EAX, EDX, and ECX.
+ *
+ * When emitting a call (0xE8), it needs to figure out
+ * the jmp_offset relative to the jit-insn address immediately
+ * following the call (0xE8) instruction. At this point, it knows
+ * the jit-insn address that follows the completely translated
+ * current (BPF_JMP | BPF_CALL) bpf-insn. It is passed as "end_addr"
+ * to emit_kfunc_call(). Thus, it can learn the "immediately-follows-call"
+ * address by counting how many jit-insns are generated between
+ * the call (0xE8) and the end_addr:
+ * - 0-1 jit-insn (3 bytes each) to restore the esp pointer if any
+ * args were pushed to the stack.
+ * - 0-2 jit-insns (3 bytes each) to handle the return value.
+ */
+static int emit_kfunc_call(const struct bpf_prog *bpf_prog, u8 *end_addr,
+ const struct bpf_insn *insn, u8 **pprog)
+{
+ const u8 arg_regs[] = { IA32_EAX, IA32_EDX, IA32_ECX };
+ int i, cnt = 0, first_stack_regno, last_stack_regno;
+ int free_arg_regs = ARRAY_SIZE(arg_regs);
+ const struct btf_func_model *fm;
+ int bytes_in_stack = 0;
+ const u8 *cur_arg_reg;
+ u8 *prog = *pprog;
+ s64 jmp_offset;
+
+ fm = bpf_jit_find_kfunc_model(bpf_prog, insn);
+ if (!fm)
+ return -EINVAL;
+
+ first_stack_regno = BPF_REG_1;
+ for (i = 0; i < fm->nr_args; i++) {
+ int regs_needed = fm->arg_size[i] > sizeof(u32) ? 2 : 1;
+
+ if (regs_needed > free_arg_regs)
+ break;
+
+ free_arg_regs -= regs_needed;
+ first_stack_regno++;
+ }
+
+ /* Push the args to the stack */
+ last_stack_regno = BPF_REG_0 + fm->nr_args;
+ for (i = last_stack_regno; i >= first_stack_regno; i--) {
+ if (fm->arg_size[i - 1] > sizeof(u32)) {
+ emit_push_r64(bpf2ia32[i], &prog);
+ bytes_in_stack += 8;
+ } else {
+ emit_push_r32(bpf2ia32[i], &prog);
+ bytes_in_stack += 4;
+ }
+ }
+
+ cur_arg_reg = &arg_regs[0];
+ for (i = BPF_REG_1; i < first_stack_regno; i++) {
+ /* mov e[adc]x,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, *cur_arg_reg++),
+ STACK_VAR(bpf2ia32[i][0]));
+ if (fm->arg_size[i - 1] > sizeof(u32))
+ /* mov e[adc]x,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, *cur_arg_reg++),
+ STACK_VAR(bpf2ia32[i][1]));
+ }
+
+ if (bytes_in_stack)
+ /* add esp,"bytes_in_stack" */
+ end_addr -= 3;
+
+ /* mov dword ptr [ebp+off],edx */
+ if (fm->ret_size > sizeof(u32))
+ end_addr -= 3;
+
+ /* mov dword ptr [ebp+off],eax */
+ if (fm->ret_size)
+ end_addr -= 3;
+
+ jmp_offset = (u8 *)__bpf_call_base + insn->imm - end_addr;
+ if (!is_simm32(jmp_offset)) {
+ pr_err("unsupported BPF kernel function jmp_offset:%lld\n",
+ jmp_offset);
+ return -EINVAL;
+ }
+
+ EMIT1_off32(0xE8, jmp_offset);
+
+ if (fm->ret_size)
+ /* mov dword ptr [ebp+off],eax */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(bpf2ia32[BPF_REG_0][0]));
+
+ if (fm->ret_size > sizeof(u32))
+ /* mov dword ptr [ebp+off],edx */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(bpf2ia32[BPF_REG_0][1]));
+
+ if (bytes_in_stack)
+ /* add esp,"bytes_in_stack" */
+ EMIT3(0x83, add_1reg(0xC0, IA32_ESP), bytes_in_stack);
+
+ *pprog = prog;
+
+ return 0;
+}
+
static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
int oldproglen, struct jit_context *ctx)
{
@@ -1888,6 +2069,18 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
if (insn->src_reg == BPF_PSEUDO_CALL)
goto notyet;
+ if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
+ int err;
+
+ err = emit_kfunc_call(bpf_prog,
+ image + addrs[i],
+ insn, &prog);
+
+ if (err)
+ return err;
+ break;
+ }
+
func = (u8 *) __bpf_call_base + imm32;
jmp_offset = func - (image + addrs[i]);
@@ -2402,3 +2595,8 @@ out:
tmp : orig_prog);
return prog;
}
+
+bool bpf_jit_supports_kfunc_call(void)
+{
+ return true;
+}
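
For readers unfamiliar with it, the regparm(3) convention that the emit_kfunc_call() comment above relies on can be observed directly with GCC's documented function attribute. The user-space fragment below (built with -m32) is purely illustrative; demo() is a made-up function whose comments simply restate the register assignments from that comment.

#include <stdint.h>

__attribute__((regparm(3)))
uint64_t demo(uint32_t a, uint64_t b, uint32_t c)
{
	/*
	 * With regparm(3) on 32-bit x86:
	 *   a -> EAX
	 *   b -> EDX (lo32) and ECX (hi32), using the remaining two slots
	 *   c -> stack, since no argument registers are left
	 * The 64-bit result is returned in EAX (lo32) / EDX (hi32).
	 */
	return (uint64_t)a + b + c;
}
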
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 0a0e168be1cb..02dc64625e64 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -375,7 +375,7 @@ static const struct dmi_system_id msi_k8t_dmi_table[] = {
* The BIOS only gives options "DISABLED" and "AUTO". This code sets
* the corresponding register-value to enable the soundcard.
*
- * The soundcard is only enabled, if the mainborad is identified
+ * The soundcard is only enabled, if the mainboard is identified
* via DMI-tables and the soundcard is detected to be off.
*/
static void pci_fixup_msi_k8t_onboard_sound(struct pci_dev *dev)
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 1b82d77019b1..df7b5477fc4f 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -195,7 +195,7 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
}
/*
- * Certain firmware versions are way too sentimential and still believe
+ * Certain firmware versions are way too sentimental and still believe
* they are exclusive and unquestionable owners of the first physical page,
* even though they explicitly mark it as EFI_CONVENTIONAL_MEMORY
* (but then write-access it later during SetVirtualAddressMap()).
@@ -457,7 +457,7 @@ void __init efi_dump_pagetable(void)
* in a kernel thread and user context. Preemption needs to remain disabled
* while the EFI-mm is borrowed. mmgrab()/mmdrop() is not used because the mm
* can not change under us.
- * It should be ensured that there are no concurent calls to this function.
+ * It should be ensured that there are no concurrent calls to this function.
*/
void efi_enter_mm(void)
{
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index 67d93a243c35..7850111008a8 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -441,7 +441,7 @@ void __init efi_free_boot_services(void)
* 1.4.4 with SGX enabled booting Linux via Fedora 24's
* grub2-efi on a hard disk. (And no, I don't know why
* this happened, but Linux should still try to boot rather
- * panicing early.)
+ * panicking early.)
*/
rm_size = real_mode_size_needed();
if (rm_size && (start + rm_size) < (1<<20) && size >= rm_size) {
@@ -726,7 +726,7 @@ void efi_crash_gracefully_on_page_fault(unsigned long phys_addr)
* Buggy efi_reset_system() is handled differently from other EFI
* Runtime Services as it doesn't use efi_rts_wq. Although,
* native_machine_emergency_restart() says that machine_real_restart()
- * could fail, it's better not to compilcate this fault handler
+ * could fail, it's better not to complicate this fault handler
* because this case occurs *very* rarely and hence could be improved
* on a need by basis.
*/
diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c
index 0286fe1b14b5..d3d456925b2a 100644
--- a/arch/x86/platform/intel-quark/imr.c
+++ b/arch/x86/platform/intel-quark/imr.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-/**
+/*
* imr.c -- Intel Isolated Memory Region driver
*
* Copyright(c) 2013 Intel Corporation.
@@ -551,7 +551,7 @@ static void __init imr_fixup_memmap(struct imr_device *idev)
/*
* Setup an unlocked IMR around the physical extent of the kernel
- * from the beginning of the .text secton to the end of the
+ * from the beginning of the .text section to the end of the
* .rodata section as one physically contiguous block.
*
* We don't round up @size since it is already PAGE_SIZE aligned.
diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c
index 570e3062faac..761f3689f60a 100644
--- a/arch/x86/platform/intel-quark/imr_selftest.c
+++ b/arch/x86/platform/intel-quark/imr_selftest.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-/**
+/*
* imr_selftest.c -- Intel Isolated Memory Region self-test driver
*
* Copyright(c) 2013 Intel Corporation.
diff --git a/arch/x86/platform/intel/iosf_mbi.c b/arch/x86/platform/intel/iosf_mbi.c
index 526f70f27c1c..fdd49d70b437 100644
--- a/arch/x86/platform/intel/iosf_mbi.c
+++ b/arch/x86/platform/intel/iosf_mbi.c
@@ -187,7 +187,7 @@ bool iosf_mbi_available(void)
EXPORT_SYMBOL(iosf_mbi_available);
/*
- **************** P-Unit/kernel shared I2C bus arbritration ****************
+ **************** P-Unit/kernel shared I2C bus arbitration ****************
*
* Some Bay Trail and Cherry Trail devices have the P-Unit and us (the kernel)
* share a single I2C bus to the PMIC. Below are helpers to arbitrate the
@@ -493,7 +493,7 @@ static void iosf_sideband_debug_init(void)
/* mcrx */
debugfs_create_x32("mcrx", 0660, iosf_dbg, &dbg_mcrx);
- /* mcr - initiates mailbox tranaction */
+ /* mcr - initiates mailbox transaction */
debugfs_create_file("mcr", 0660, iosf_dbg, &dbg_mcr, &iosf_mcr_fops);
}
diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c
index 85f4638764d6..994a229cb79f 100644
--- a/arch/x86/platform/olpc/olpc-xo15-sci.c
+++ b/arch/x86/platform/olpc/olpc-xo15-sci.c
@@ -27,7 +27,7 @@ static bool lid_wake_on_close;
* wake-on-close. This is implemented as standard by the XO-1.5 DSDT.
*
* We provide here a sysfs attribute that will additionally enable
- * wake-on-close behavior. This is useful (e.g.) when we oportunistically
+ * wake-on-close behavior. This is useful (e.g.) when we opportunistically
* suspend with the display running; if the lid is then closed, we want to
* wake up to turn the display off.
*
diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c
index 26d1f6693789..75e3319e8bee 100644
--- a/arch/x86/platform/olpc/olpc_dt.c
+++ b/arch/x86/platform/olpc/olpc_dt.c
@@ -131,7 +131,7 @@ void * __init prom_early_alloc(unsigned long size)
const size_t chunk_size = max(PAGE_SIZE, size);
/*
- * To mimimize the number of allocations, grab at least
+ * To minimize the number of allocations, grab at least
* PAGE_SIZE of memory (that's an arbitrary choice that's
* fast enough on the platforms we care about while minimizing
* wasted bootmem) and hand off chunks of it to callers.
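
The comment above describes the allocation strategy rather than any single call site: grab at least a page at a time and carve per-request chunks out of it. A generic sketch of that strategy, under stated assumptions (PAGE_SIZE from the usual headers, get_pages() as a stand-in backend); this is not the olpc_dt code itself:

/*
 * Keep a cursor into the current chunk and only go back to the page
 * allocator when the remaining space cannot satisfy a request.
 */
static unsigned char *chunk_base;	/* current chunk */
static size_t chunk_used;		/* bytes already handed out */
static size_t chunk_size;		/* size of the current chunk */

static void *chunked_alloc(size_t size, void *(*get_pages)(size_t))
{
	void *res;

	if (!chunk_base || chunk_used + size > chunk_size) {
		chunk_size = size > PAGE_SIZE ? size : PAGE_SIZE;
		chunk_base = get_pages(chunk_size);
		if (!chunk_base)
			return NULL;
		chunk_used = 0;
	}

	res = chunk_base + chunk_used;
	chunk_used += size;
	return res;
}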
diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S
index d2ccadc247e6..72c1e42d121d 100644
--- a/arch/x86/platform/pvh/head.S
+++ b/arch/x86/platform/pvh/head.S
@@ -30,10 +30,10 @@
* the boot start info structure.
* - `cr0`: bit 0 (PE) must be set. All the other writeable bits are cleared.
* - `cr4`: all bits are cleared.
- * - `cs `: must be a 32-bit read/execute code segment with a base of ‘0’
- * and a limit of ‘0xFFFFFFFF’. The selector value is unspecified.
+ * - `cs `: must be a 32-bit read/execute code segment with a base of `0`
+ * and a limit of `0xFFFFFFFF`. The selector value is unspecified.
* - `ds`, `es`: must be a 32-bit read/write data segment with a base of
- * ‘0’ and a limit of ‘0xFFFFFFFF’. The selector values are all
+ * `0` and a limit of `0xFFFFFFFF`. The selector values are all
* unspecified.
* - `tr`: must be a 32-bit TSS (active) with a base of '0' and a limit
* of '0x67'.
@@ -46,10 +46,8 @@
#define PVH_GDT_ENTRY_CS 1
#define PVH_GDT_ENTRY_DS 2
-#define PVH_GDT_ENTRY_CANARY 3
#define PVH_CS_SEL (PVH_GDT_ENTRY_CS * 8)
#define PVH_DS_SEL (PVH_GDT_ENTRY_DS * 8)
-#define PVH_CANARY_SEL (PVH_GDT_ENTRY_CANARY * 8)
SYM_CODE_START_LOCAL(pvh_start_xen)
cld
@@ -111,17 +109,6 @@ SYM_CODE_START_LOCAL(pvh_start_xen)
#else /* CONFIG_X86_64 */
- /* Set base address in stack canary descriptor. */
- movl $_pa(gdt_start),%eax
- movl $_pa(canary),%ecx
- movw %cx, (PVH_GDT_ENTRY_CANARY * 8) + 2(%eax)
- shrl $16, %ecx
- movb %cl, (PVH_GDT_ENTRY_CANARY * 8) + 4(%eax)
- movb %ch, (PVH_GDT_ENTRY_CANARY * 8) + 7(%eax)
-
- mov $PVH_CANARY_SEL,%eax
- mov %eax,%gs
-
call mk_early_pgtbl_32
mov $_pa(initial_page_table), %eax
@@ -165,7 +152,6 @@ SYM_DATA_START_LOCAL(gdt_start)
.quad GDT_ENTRY(0xc09a, 0, 0xfffff) /* PVH_CS_SEL */
#endif
.quad GDT_ENTRY(0xc092, 0, 0xfffff) /* PVH_DS_SEL */
- .quad GDT_ENTRY(0x4090, 0, 0x18) /* PVH_CANARY_SEL */
SYM_DATA_END_LABEL(gdt_start, SYM_L_LOCAL, gdt_end)
.balign 16
diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c
index eafc530c8767..1e9ff28bc2e0 100644
--- a/arch/x86/platform/uv/uv_nmi.c
+++ b/arch/x86/platform/uv/uv_nmi.c
@@ -24,6 +24,7 @@
#include <asm/kdebug.h>
#include <asm/local64.h>
#include <asm/nmi.h>
+#include <asm/reboot.h>
#include <asm/traps.h>
#include <asm/uv/uv.h>
#include <asm/uv/uv_hub.h>
@@ -91,6 +92,8 @@ static atomic_t uv_nmi_cpus_in_nmi = ATOMIC_INIT(-1);
static atomic_t uv_nmi_slave_continue;
static cpumask_var_t uv_nmi_cpu_mask;
+static atomic_t uv_nmi_kexec_failed;
+
/* Values for uv_nmi_slave_continue */
#define SLAVE_CLEAR 0
#define SLAVE_CONTINUE 1
@@ -834,38 +837,35 @@ static void uv_nmi_touch_watchdogs(void)
touch_nmi_watchdog();
}
-static atomic_t uv_nmi_kexec_failed;
-
-#if defined(CONFIG_KEXEC_CORE)
-static void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
+static void uv_nmi_kdump(int cpu, int main, struct pt_regs *regs)
{
+ /* Check if kdump kernel loaded for both main and secondary CPUs */
+ if (!kexec_crash_image) {
+ if (main)
+ pr_err("UV: NMI error: kdump kernel not loaded\n");
+ return;
+ }
+
/* Call crash to dump system state */
- if (master) {
+ if (main) {
pr_emerg("UV: NMI executing crash_kexec on CPU%d\n", cpu);
crash_kexec(regs);
- pr_emerg("UV: crash_kexec unexpectedly returned, ");
+ pr_emerg("UV: crash_kexec unexpectedly returned\n");
atomic_set(&uv_nmi_kexec_failed, 1);
- if (!kexec_crash_image) {
- pr_cont("crash kernel not loaded\n");
- return;
- }
- pr_cont("kexec busy, stalling cpus while waiting\n");
- }
- /* If crash exec fails the slaves should return, otherwise stall */
- while (atomic_read(&uv_nmi_kexec_failed) == 0)
- mdelay(10);
-}
+ } else { /* secondary */
-#else /* !CONFIG_KEXEC_CORE */
-static inline void uv_nmi_kdump(int cpu, int master, struct pt_regs *regs)
-{
- if (master)
- pr_err("UV: NMI kdump: KEXEC not supported in this kernel\n");
- atomic_set(&uv_nmi_kexec_failed, 1);
+ /* If kdump kernel fails, secondaries will exit this loop */
+ while (atomic_read(&uv_nmi_kexec_failed) == 0) {
+
+ /* Once the shootdown of CPUs starts, they do not return */
+ run_crash_ipi_callback(regs);
+
+ mdelay(10);
+ }
+ }
}
-#endif /* !CONFIG_KEXEC_CORE */
#ifdef CONFIG_KGDB
#ifdef CONFIG_KGDB_KDB
@@ -889,7 +889,7 @@ static inline int uv_nmi_kdb_reason(void)
* Call KGDB/KDB from NMI handler
*
* Note that if both KGDB and KDB are configured, then the action of 'kgdb' or
- * 'kdb' has no affect on which is used. See the KGDB documention for further
+ * 'kdb' has no effect on which is used. See the KGDB documentation for further
* information.
*/
static void uv_call_kgdb_kdb(int cpu, struct pt_regs *regs, int master)
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index db1378c6ff26..3a070e7cdb8b 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -99,11 +99,8 @@ static void __save_processor_state(struct saved_context *ctxt)
/*
* segment registers
*/
-#ifdef CONFIG_X86_32_LAZY_GS
savesegment(gs, ctxt->gs);
-#endif
#ifdef CONFIG_X86_64
- savesegment(gs, ctxt->gs);
savesegment(fs, ctxt->fs);
savesegment(ds, ctxt->ds);
savesegment(es, ctxt->es);
@@ -232,7 +229,6 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
wrmsrl(MSR_GS_BASE, ctxt->kernelmode_gs_base);
#else
loadsegment(fs, __KERNEL_PERCPU);
- loadsegment(gs, __KERNEL_STACK_CANARY);
#endif
/* Restore the TSS, RO GDT, LDT, and usermode-relevant MSRs. */
@@ -255,7 +251,7 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
*/
wrmsrl(MSR_FS_BASE, ctxt->fs_base);
wrmsrl(MSR_KERNEL_GS_BASE, ctxt->usermode_gs_base);
-#elif defined(CONFIG_X86_32_LAZY_GS)
+#else
loadsegment(gs, ctxt->gs);
#endif
@@ -321,7 +317,7 @@ int hibernate_resume_nonboot_cpu_disable(void)
/*
* When bsp_check() is called in hibernate and suspend, cpu hotplug
- * is disabled already. So it's unnessary to handle race condition between
+ * is disabled already. So it's unnecessary to handle race condition between
* cpumask query and cpu hotplug.
*/
static int bsp_check(void)
diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c
index cd3914fc9f3d..e94e0050a583 100644
--- a/arch/x86/power/hibernate.c
+++ b/arch/x86/power/hibernate.c
@@ -13,8 +13,8 @@
#include <linux/kdebug.h>
#include <linux/cpu.h>
#include <linux/pgtable.h>
-
-#include <crypto/hash.h>
+#include <linux/types.h>
+#include <linux/crc32.h>
#include <asm/e820/api.h>
#include <asm/init.h>
@@ -54,95 +54,33 @@ int pfn_is_nosave(unsigned long pfn)
return pfn >= nosave_begin_pfn && pfn < nosave_end_pfn;
}
-
-#define MD5_DIGEST_SIZE 16
-
struct restore_data_record {
unsigned long jump_address;
unsigned long jump_address_phys;
unsigned long cr3;
unsigned long magic;
- u8 e820_digest[MD5_DIGEST_SIZE];
+ unsigned long e820_checksum;
};
-#if IS_BUILTIN(CONFIG_CRYPTO_MD5)
/**
- * get_e820_md5 - calculate md5 according to given e820 table
+ * compute_e820_crc32 - calculate crc32 of a given e820 table
*
* @table: the e820 table to be calculated
- * @buf: the md5 result to be stored to
+ *
+ * Return: the resulting checksum
*/
-static int get_e820_md5(struct e820_table *table, void *buf)
+static inline u32 compute_e820_crc32(struct e820_table *table)
{
- struct crypto_shash *tfm;
- struct shash_desc *desc;
- int size;
- int ret = 0;
-
- tfm = crypto_alloc_shash("md5", 0, 0);
- if (IS_ERR(tfm))
- return -ENOMEM;
-
- desc = kmalloc(sizeof(struct shash_desc) + crypto_shash_descsize(tfm),
- GFP_KERNEL);
- if (!desc) {
- ret = -ENOMEM;
- goto free_tfm;
- }
-
- desc->tfm = tfm;
-
- size = offsetof(struct e820_table, entries) +
+ int size = offsetof(struct e820_table, entries) +
sizeof(struct e820_entry) * table->nr_entries;
- if (crypto_shash_digest(desc, (u8 *)table, size, buf))
- ret = -EINVAL;
-
- kfree_sensitive(desc);
-
-free_tfm:
- crypto_free_shash(tfm);
- return ret;
-}
-
-static int hibernation_e820_save(void *buf)
-{
- return get_e820_md5(e820_table_firmware, buf);
-}
-
-static bool hibernation_e820_mismatch(void *buf)
-{
- int ret;
- u8 result[MD5_DIGEST_SIZE];
-
- memset(result, 0, MD5_DIGEST_SIZE);
- /* If there is no digest in suspend kernel, let it go. */
- if (!memcmp(result, buf, MD5_DIGEST_SIZE))
- return false;
-
- ret = get_e820_md5(e820_table_firmware, result);
- if (ret)
- return true;
-
- return memcmp(result, buf, MD5_DIGEST_SIZE) ? true : false;
-}
-#else
-static int hibernation_e820_save(void *buf)
-{
- return 0;
-}
-
-static bool hibernation_e820_mismatch(void *buf)
-{
- /* If md5 is not builtin for restore kernel, let it go. */
- return false;
+ return ~crc32_le(~0, (unsigned char const *)table, size);
}
-#endif
#ifdef CONFIG_X86_64
-#define RESTORE_MAGIC 0x23456789ABCDEF01UL
+#define RESTORE_MAGIC 0x23456789ABCDEF02UL
#else
-#define RESTORE_MAGIC 0x12345678UL
+#define RESTORE_MAGIC 0x12345679UL
#endif
/**
@@ -179,7 +117,8 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size)
*/
rdr->cr3 = restore_cr3 & ~CR3_PCID_MASK;
- return hibernation_e820_save(rdr->e820_digest);
+ rdr->e820_checksum = compute_e820_crc32(e820_table_firmware);
+ return 0;
}
/**
@@ -200,7 +139,7 @@ int arch_hibernation_header_restore(void *addr)
jump_address_phys = rdr->jump_address_phys;
restore_cr3 = rdr->cr3;
- if (hibernation_e820_mismatch(rdr->e820_digest)) {
+ if (rdr->e820_checksum != compute_e820_crc32(e820_table_firmware)) {
pr_crit("Hibernate inconsistent memory map detected!\n");
return -ENODEV;
}
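
The hunks above replace the optional MD5 digest of the firmware e820 table with an unconditional CRC32 stored inline in the restore header; RESTORE_MAGIC is bumped because the header layout changed. A minimal sketch of the resulting consistency check, assuming crc32_le() from <linux/crc32.h> and the e820 types from <asm/e820/types.h> (the helper name is illustrative):

#include <linux/crc32.h>
#include <linux/stddef.h>
#include <asm/e820/types.h>

/* Same convention as the patch: pre- and post-invert the running CRC. */
static u32 e820_crc32(const struct e820_table *table)
{
	int size = offsetof(struct e820_table, entries) +
		   sizeof(struct e820_entry) * table->nr_entries;

	return ~crc32_le(~0, (const unsigned char *)table, size);
}

/*
 * On image save:    the checksum of e820_table_firmware is recorded.
 * On image restore: a mismatch against the table the boot kernel sees
 *                   means the memory map changed, so resume is refused.
 */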
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index 22fda7d99159..1be71ef5e4c4 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -103,7 +103,7 @@ static void __init setup_real_mode(void)
*ptr += phys_base;
}
- /* Must be perfomed *after* relocation. */
+ /* Must be performed *after* relocation. */
trampoline_header = (struct trampoline_header *)
__va(real_mode_header->trampoline_header);
diff --git a/arch/x86/tools/insn_decoder_test.c b/arch/x86/tools/insn_decoder_test.c
index 34eda63c124b..472540aeabc2 100644
--- a/arch/x86/tools/insn_decoder_test.c
+++ b/arch/x86/tools/insn_decoder_test.c
@@ -120,7 +120,7 @@ int main(int argc, char **argv)
while (fgets(line, BUFSIZE, stdin)) {
char copy[BUFSIZE], *s, *tab1, *tab2;
- int nb = 0;
+ int nb = 0, ret;
unsigned int b;
if (line[0] == '<') {
@@ -148,10 +148,12 @@ int main(int argc, char **argv)
} else
break;
}
+
/* Decode an instruction */
- insn_init(&insn, insn_buff, sizeof(insn_buff), x86_64);
- insn_get_length(&insn);
- if (insn.length != nb) {
+ ret = insn_decode(&insn, insn_buff, sizeof(insn_buff),
+ x86_64 ? INSN_MODE_64 : INSN_MODE_32);
+
+ if (ret < 0 || insn.length != nb) {
warnings++;
pr_warn("Found an x86 instruction decoder bug, "
"please report this.\n", sym);
diff --git a/arch/x86/tools/insn_sanity.c b/arch/x86/tools/insn_sanity.c
index c6a0000ae635..213f35f94feb 100644
--- a/arch/x86/tools/insn_sanity.c
+++ b/arch/x86/tools/insn_sanity.c
@@ -218,8 +218,8 @@ static void parse_args(int argc, char **argv)
int main(int argc, char **argv)
{
+ int insns = 0, ret;
struct insn insn;
- int insns = 0;
int errors = 0;
unsigned long i;
unsigned char insn_buff[MAX_INSN_SIZE * 2];
@@ -237,15 +237,15 @@ int main(int argc, char **argv)
continue;
/* Decode an instruction */
- insn_init(&insn, insn_buff, sizeof(insn_buff), x86_64);
- insn_get_length(&insn);
+ ret = insn_decode(&insn, insn_buff, sizeof(insn_buff),
+ x86_64 ? INSN_MODE_64 : INSN_MODE_32);
if (insn.next_byte <= insn.kaddr ||
insn.kaddr + MAX_INSN_SIZE < insn.next_byte) {
/* Access out-of-range memory */
dump_stream(stderr, "Error: Found an access violation", i, insn_buff, &insn);
errors++;
- } else if (verbose && !insn_complete(&insn))
+ } else if (verbose && ret < 0)
dump_stream(stdout, "Info: Found an undecodable input", i, insn_buff, &insn);
else if (verbose >= 2)
dump_insn(stdout, &insn);
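
Both tools are converted from the two-step insn_init()/insn_get_length() pattern to the single insn_decode() entry point, which reports failure through its return value instead of requiring a separate insn_complete() check. A minimal sketch of the new calling convention, assuming only the insn_decode() API shown in the hunks (the header path and helper name are illustrative):

#include <asm/insn.h>	/* struct insn, insn_decode(), INSN_MODE_32/64 */

/* Decode one instruction from a byte buffer and return its length,
 * or a negative value if the bytes do not decode. */
static int decode_one(const unsigned char *buf, int buf_len, int x86_64)
{
	struct insn insn;
	int ret;

	ret = insn_decode(&insn, buf, buf_len,
			  x86_64 ? INSN_MODE_64 : INSN_MODE_32);
	if (ret < 0)
		return ret;		/* undecodable input */

	return insn.length;		/* decoded length in bytes */
}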
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index dc0a337f985b..17503fed2017 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -1070,8 +1070,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.read_pmc = xen_read_pmc,
- .iret = xen_iret,
-
.load_tr_desc = paravirt_nop,
.set_ldt = xen_set_ldt,
.load_gdt = xen_load_gdt,
@@ -1204,7 +1202,6 @@ static void __init xen_setup_gdt(int cpu)
pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry_boot;
pv_ops.cpu.load_gdt = xen_load_gdt_boot;
- setup_stack_canary_segment(cpu);
switch_to_new_gdt(cpu);
pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry;
@@ -1233,8 +1230,8 @@ asmlinkage __visible void __init xen_start_kernel(void)
/* Install Xen paravirt ops */
pv_info = xen_info;
- pv_ops.init.patch = paravirt_patch_default;
pv_ops.cpu = xen_cpu_ops;
+ paravirt_iret = xen_iret;
xen_init_irq_ops();
/*
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index cf2ade864c30..ade789e73ee4 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -1247,8 +1247,8 @@ static void xen_flush_tlb_one_user(unsigned long addr)
preempt_enable();
}
-static void xen_flush_tlb_others(const struct cpumask *cpus,
- const struct flush_tlb_info *info)
+static void xen_flush_tlb_multi(const struct cpumask *cpus,
+ const struct flush_tlb_info *info)
{
struct {
struct mmuext_op op;
@@ -1258,7 +1258,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
const size_t mc_entry_size = sizeof(args->op) +
sizeof(args->mask[0]) * BITS_TO_LONGS(num_possible_cpus());
- trace_xen_mmu_flush_tlb_others(cpus, info->mm, info->start, info->end);
+ trace_xen_mmu_flush_tlb_multi(cpus, info->mm, info->start, info->end);
if (cpumask_empty(cpus))
return; /* nothing to do */
@@ -1267,9 +1267,8 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
args = mcs.args;
args->op.arg2.vcpumask = to_cpumask(args->mask);
- /* Remove us, and any offline CPUS. */
+ /* Remove any offline CPUs */
cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
- cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
if (info->end != TLB_FLUSH_ALL &&
@@ -2086,7 +2085,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
.flush_tlb_user = xen_flush_tlb,
.flush_tlb_kernel = xen_flush_tlb,
.flush_tlb_one_user = xen_flush_tlb_one_user,
- .flush_tlb_others = xen_flush_tlb_others,
+ .flush_tlb_multi = xen_flush_tlb_multi,
.tlb_remove_table = tlb_remove_table,
.pgd_alloc = xen_pgd_alloc,
@@ -2410,7 +2409,7 @@ int xen_remap_pfn(struct vm_area_struct *vma, unsigned long addr,
rmd.prot = prot;
/*
* We use the err_ptr to indicate if there we are doing a contiguous
- * mapping or a discontigious mapping.
+ * mapping or a discontiguous mapping.
*/
rmd.contiguous = !err_ptr;
rmd.no_translate = no_translate;
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 91f5b330dcc6..d9c945ee1100 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -379,11 +379,6 @@ void xen_timer_resume(void)
}
}
-static const struct pv_time_ops xen_time_ops __initconst = {
- .sched_clock = xen_sched_clock,
- .steal_clock = xen_steal_clock,
-};
-
static struct pvclock_vsyscall_time_info *xen_clock __read_mostly;
static u64 xen_clock_value_saved;
@@ -525,17 +520,24 @@ static void __init xen_time_init(void)
pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier);
}
-void __init xen_init_time_ops(void)
+static void __init xen_init_time_common(void)
{
xen_sched_clock_offset = xen_clocksource_read();
- pv_ops.time = xen_time_ops;
+ static_call_update(pv_steal_clock, xen_steal_clock);
+ paravirt_set_sched_clock(xen_sched_clock);
+
+ x86_platform.calibrate_tsc = xen_tsc_khz;
+ x86_platform.get_wallclock = xen_get_wallclock;
+}
+
+void __init xen_init_time_ops(void)
+{
+ xen_init_time_common();
x86_init.timers.timer_init = xen_time_init;
x86_init.timers.setup_percpu_clockev = x86_init_noop;
x86_cpuinit.setup_percpu_clockev = x86_init_noop;
- x86_platform.calibrate_tsc = xen_tsc_khz;
- x86_platform.get_wallclock = xen_get_wallclock;
/* Dom0 uses the native method to set the hardware RTC. */
if (!xen_initial_domain())
x86_platform.set_wallclock = xen_set_wallclock;
@@ -569,13 +571,11 @@ void __init xen_hvm_init_time_ops(void)
return;
}
- xen_sched_clock_offset = xen_clocksource_read();
- pv_ops.time = xen_time_ops;
+ xen_init_time_common();
+
x86_init.timers.setup_percpu_clockev = xen_time_init;
x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents;
- x86_platform.calibrate_tsc = xen_tsc_khz;
- x86_platform.get_wallclock = xen_get_wallclock;
x86_platform.set_wallclock = xen_set_wallclock;
}
#endif