aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig118
-rw-r--r--arch/x86/boot/Makefile2
-rw-r--r--arch/x86/boot/compressed/Makefile2
-rw-r--r--arch/x86/boot/compressed/eboot.c65
-rw-r--r--arch/x86/boot/compressed/head_64.S3
-rw-r--r--arch/x86/boot/cpu.c6
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c705
-rw-r--r--arch/x86/crypto/crc32c-intel_glue.c22
-rw-r--r--arch/x86/crypto/fpu.c207
-rw-r--r--arch/x86/crypto/glue_helper.c74
-rw-r--r--arch/x86/crypto/sha1-mb/sha1_mb.c2
-rw-r--r--arch/x86/crypto/sha1-mb/sha1_mb_ctx.h2
-rw-r--r--arch/x86/crypto/sha256-mb/sha256_mb.c2
-rw-r--r--arch/x86/crypto/sha256-mb/sha256_mb_ctx.h2
-rw-r--r--arch/x86/crypto/sha512-mb/sha512_mb.c2
-rw-r--r--arch/x86/crypto/sha512-mb/sha512_mb_ctx.h2
-rw-r--r--arch/x86/entry/calling.h33
-rw-r--r--arch/x86/entry/entry_32.S145
-rw-r--r--arch/x86/entry/entry_64.S16
-rw-r--r--arch/x86/entry/vdso/vma.c14
-rw-r--r--arch/x86/events/core.c10
-rw-r--r--arch/x86/events/intel/core.c30
-rw-r--r--arch/x86/events/intel/cqm.c23
-rw-r--r--arch/x86/events/intel/pt.c45
-rw-r--r--arch/x86/events/intel/uncore_snbep.c2
-rw-r--r--arch/x86/events/perf_event.h2
-rw-r--r--arch/x86/include/asm/Kbuild4
-rw-r--r--arch/x86/include/asm/acpi.h2
-rw-r--r--arch/x86/include/asm/amd_nb.h23
-rw-r--r--arch/x86/include/asm/apic.h5
-rw-r--r--arch/x86/include/asm/asm-prototypes.h16
-rw-r--r--arch/x86/include/asm/cpufeature.h1
-rw-r--r--arch/x86/include/asm/cpufeatures.h12
-rw-r--r--arch/x86/include/asm/crypto/glue_helper.h39
-rw-r--r--arch/x86/include/asm/e820.h12
-rw-r--r--arch/x86/include/asm/efi.h16
-rw-r--r--arch/x86/include/asm/floppy.h20
-rw-r--r--arch/x86/include/asm/fpu/api.h10
-rw-r--r--arch/x86/include/asm/fpu/internal.h139
-rw-r--r--arch/x86/include/asm/fpu/types.h34
-rw-r--r--arch/x86/include/asm/fpu/xstate.h17
-rw-r--r--arch/x86/include/asm/idle.h22
-rw-r--r--arch/x86/include/asm/intel_rdt.h224
-rw-r--r--arch/x86/include/asm/intel_rdt_common.h27
-rw-r--r--arch/x86/include/asm/kdebug.h1
-rw-r--r--arch/x86/include/asm/kvm_host.h19
-rw-r--r--arch/x86/include/asm/kvm_page_track.h14
-rw-r--r--arch/x86/include/asm/lguest_hcall.h1
-rw-r--r--arch/x86/include/asm/mce.h34
-rw-r--r--arch/x86/include/asm/microcode.h18
-rw-r--r--arch/x86/include/asm/microcode_amd.h30
-rw-r--r--arch/x86/include/asm/microcode_intel.h4
-rw-r--r--arch/x86/include/asm/mmu.h4
-rw-r--r--arch/x86/include/asm/mmu_context.h2
-rw-r--r--arch/x86/include/asm/mpx.h4
-rw-r--r--arch/x86/include/asm/msr-index.h4
-rw-r--r--arch/x86/include/asm/msr.h46
-rw-r--r--arch/x86/include/asm/mutex.h5
-rw-r--r--arch/x86/include/asm/mutex_32.h110
-rw-r--r--arch/x86/include/asm/mutex_64.h127
-rw-r--r--arch/x86/include/asm/paravirt.h10
-rw-r--r--arch/x86/include/asm/paravirt_types.h18
-rw-r--r--arch/x86/include/asm/percpu.h11
-rw-r--r--arch/x86/include/asm/pgtable_64.h3
-rw-r--r--arch/x86/include/asm/preempt.h8
-rw-r--r--arch/x86/include/asm/processor.h99
-rw-r--r--arch/x86/include/asm/qspinlock.h6
-rw-r--r--arch/x86/include/asm/special_insns.h13
-rw-r--r--arch/x86/include/asm/stacktrace.h8
-rw-r--r--arch/x86/include/asm/topology.h32
-rw-r--r--arch/x86/include/asm/trace/exceptions.h2
-rw-r--r--arch/x86/include/asm/trace/fpu.h5
-rw-r--r--arch/x86/include/asm/trace/irq_vectors.h2
-rw-r--r--arch/x86/include/asm/tsc.h9
-rw-r--r--arch/x86/include/asm/uaccess.h13
-rw-r--r--arch/x86/include/asm/unwind.h18
-rw-r--r--arch/x86/include/asm/vgtod.h7
-rw-r--r--arch/x86/include/asm/vmx.h37
-rw-r--r--arch/x86/include/asm/x86_init.h26
-rw-r--r--arch/x86/include/uapi/asm/bootparam.h1
-rw-r--r--arch/x86/include/uapi/asm/kvm_para.h4
-rw-r--r--arch/x86/include/uapi/asm/mce.h1
-rw-r--r--arch/x86/include/uapi/asm/prctl.h8
-rw-r--r--arch/x86/include/uapi/asm/vmx.h5
-rw-r--r--arch/x86/kernel/Makefile3
-rw-r--r--arch/x86/kernel/acpi/apei.c3
-rw-r--r--arch/x86/kernel/acpi/boot.c11
-rw-r--r--arch/x86/kernel/acpi/wakeup_64.S9
-rw-r--r--arch/x86/kernel/alternative.c15
-rw-r--r--arch/x86/kernel/amd_nb.c164
-rw-r--r--arch/x86/kernel/apic/apic.c19
-rw-r--r--arch/x86/kernel/apic/io_apic.c1
-rw-r--r--arch/x86/kernel/apm_32.c4
-rw-r--r--arch/x86/kernel/cpu/Makefile6
-rw-r--r--arch/x86/kernel/cpu/amd.c51
-rw-r--r--arch/x86/kernel/cpu/bugs.c26
-rw-r--r--arch/x86/kernel/cpu/bugs_64.c33
-rw-r--r--arch/x86/kernel/cpu/common.c92
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c22
-rw-r--r--arch/x86/kernel/cpu/intel_rdt.c403
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_rdtgroup.c1115
-rw-r--r--arch/x86/kernel/cpu/intel_rdt_schemata.c245
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-severity.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c202
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c357
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c41
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c57
-rw-r--r--arch/x86/kernel/cpu/mcheck/threshold.c1
-rw-r--r--arch/x86/kernel/cpu/microcode/Makefile2
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c431
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c109
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c858
-rw-r--r--arch/x86/kernel/cpu/microcode/intel_lib.c184
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c25
-rw-r--r--arch/x86/kernel/cpu/scattered.c60
-rw-r--r--arch/x86/kernel/cpu/vmware.c86
-rw-r--r--arch/x86/kernel/cpuid.c73
-rw-r--r--arch/x86/kernel/crash.c37
-rw-r--r--arch/x86/kernel/dumpstack.c68
-rw-r--r--arch/x86/kernel/dumpstack_32.c56
-rw-r--r--arch/x86/kernel/dumpstack_64.c79
-rw-r--r--arch/x86/kernel/fpu/bugs.c7
-rw-r--r--arch/x86/kernel/fpu/core.c74
-rw-r--r--arch/x86/kernel/fpu/init.c107
-rw-r--r--arch/x86/kernel/fpu/signal.c8
-rw-r--r--arch/x86/kernel/fpu/xstate.c11
-rw-r--r--arch/x86/kernel/head_32.S49
-rw-r--r--arch/x86/kernel/head_64.S61
-rw-r--r--arch/x86/kernel/irq.c1
-rw-r--r--arch/x86/kernel/irq_64.c1
-rw-r--r--arch/x86/kernel/itmt.c215
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c48
-rw-r--r--arch/x86/kernel/kvm.c20
-rw-r--r--arch/x86/kernel/ldt.c14
-rw-r--r--arch/x86/kernel/machine_kexec_64.c6
-rw-r--r--arch/x86/kernel/msr.c69
-rw-r--r--arch/x86/kernel/paravirt-spinlocks.c14
-rw-r--r--arch/x86/kernel/paravirt.c1
-rw-r--r--arch/x86/kernel/paravirt_patch_32.c14
-rw-r--r--arch/x86/kernel/paravirt_patch_64.c14
-rw-r--r--arch/x86/kernel/platform-quirks.c5
-rw-r--r--arch/x86/kernel/process.c150
-rw-r--r--arch/x86/kernel/process_32.c17
-rw-r--r--arch/x86/kernel/process_64.c23
-rw-r--r--arch/x86/kernel/rtc.c9
-rw-r--r--arch/x86/kernel/setup.c24
-rw-r--r--arch/x86/kernel/setup_percpu.c3
-rw-r--r--arch/x86/kernel/smp.c2
-rw-r--r--arch/x86/kernel/smpboot.c116
-rw-r--r--arch/x86/kernel/tracepoint.c3
-rw-r--r--arch/x86/kernel/traps.c20
-rw-r--r--arch/x86/kernel/tsc.c42
-rw-r--r--arch/x86/kernel/tsc_msr.c19
-rw-r--r--arch/x86/kernel/tsc_sync.c290
-rw-r--r--arch/x86/kernel/unwind_frame.c195
-rw-r--r--arch/x86/kernel/vmlinux.lds.S2
-rw-r--r--arch/x86/kernel/x86_init.c2
-rw-r--r--arch/x86/kvm/cpuid.c39
-rw-r--r--arch/x86/kvm/emulate.c200
-rw-r--r--arch/x86/kvm/hyperv.c26
-rw-r--r--arch/x86/kvm/i8254.c15
-rw-r--r--arch/x86/kvm/i8254.h3
-rw-r--r--arch/x86/kvm/lapic.c212
-rw-r--r--arch/x86/kvm/lapic.h2
-rw-r--r--arch/x86/kvm/mmu.c43
-rw-r--r--arch/x86/kvm/page_track.c31
-rw-r--r--arch/x86/kvm/svm.c21
-rw-r--r--arch/x86/kvm/vmx.c1117
-rw-r--r--arch/x86/kvm/x86.c147
-rw-r--r--arch/x86/lguest/boot.c29
-rw-r--r--arch/x86/lib/copy_user_64.S47
-rw-r--r--arch/x86/lib/msr.c4
-rw-r--r--arch/x86/lib/usercopy.c49
-rw-r--r--arch/x86/lib/usercopy_32.c49
-rw-r--r--arch/x86/mm/fault.c5
-rw-r--r--arch/x86/mm/init_64.c24
-rw-r--r--arch/x86/mm/mpx.c10
-rw-r--r--arch/x86/mm/numa.c2
-rw-r--r--arch/x86/mm/pat.c7
-rw-r--r--arch/x86/mm/pkeys.c3
-rw-r--r--arch/x86/net/bpf_jit_comp.c2
-rw-r--r--arch/x86/oprofile/nmi_int.c71
-rw-r--r--arch/x86/pci/amd_bus.c34
-rw-r--r--arch/x86/pci/xen.c4
-rw-r--r--arch/x86/platform/Makefile1
-rw-r--r--arch/x86/platform/ce4100/ce4100.c8
-rw-r--r--arch/x86/platform/intel-mid/device_libs/Makefile2
-rw-r--r--arch/x86/platform/intel-mid/intel-mid.c7
-rw-r--r--arch/x86/platform/intel-mid/mfld.c9
-rw-r--r--arch/x86/platform/intel-mid/mrfld.c8
-rw-r--r--arch/x86/platform/intel-mid/pwr.c1
-rw-r--r--arch/x86/platform/intel-quark/imr_selftest.c3
-rw-r--r--arch/x86/platform/mellanox/Makefile1
-rw-r--r--arch/x86/platform/mellanox/mlx-platform.c266
-rw-r--r--arch/x86/platform/uv/tlb_uv.c1
-rw-r--r--arch/x86/platform/uv/uv_nmi.c4
-rw-r--r--arch/x86/power/cpu.c1
-rw-r--r--arch/x86/power/hibernate_64.c94
-rw-r--r--arch/x86/ras/mce_amd_inj.c2
-rw-r--r--arch/x86/realmode/rm/Makefile2
-rw-r--r--arch/x86/tools/insn_sanity.c3
-rw-r--r--arch/x86/tools/relocs.c3
-rw-r--r--arch/x86/tools/test_get_len.c2
-rw-r--r--arch/x86/um/asm/processor.h1
-rw-r--r--arch/x86/xen/enlighten.c13
-rw-r--r--arch/x86/xen/pci-swiotlb-xen.c1
-rw-r--r--arch/x86/xen/setup.c6
-rw-r--r--arch/x86/xen/smp.c6
-rw-r--r--arch/x86/xen/spinlock.c2
209 files changed, 7498 insertions, 4679 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index bada636d1065..e487493bbd47 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -9,28 +9,50 @@ config 64BIT
config X86_32
def_bool y
depends on !64BIT
+ # Options that are inherently 32-bit kernel only:
+ select ARCH_WANT_IPC_PARSE_VERSION
+ select CLKSRC_I8253
+ select CLONE_BACKWARDS
+ select HAVE_AOUT
+ select HAVE_GENERIC_DMA_COHERENT
+ select MODULES_USE_ELF_REL
+ select OLD_SIGACTION
config X86_64
def_bool y
depends on 64BIT
+ # Options that are inherently 64-bit kernel only:
+ select ARCH_HAS_GIGANTIC_PAGE
+ select ARCH_SUPPORTS_INT128
+ select ARCH_USE_CMPXCHG_LOCKREF
+ select HAVE_ARCH_SOFT_DIRTY
+ select MODULES_USE_ELF_RELA
+ select X86_DEV_DMA_OPS
-### Arch settings
+#
+# Arch settings
+#
+# ( Note that options that are marked 'if X86_64' could in principle be
+# ported to 32-bit as well. )
+#
config X86
def_bool y
+ #
+ # Note: keep this list sorted alphabetically
+ #
select ACPI_LEGACY_TABLES_LOOKUP if ACPI
select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
select ANON_INODES
select ARCH_CLOCKSOURCE_DATA
select ARCH_DISCARD_MEMBLOCK
- select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
+ select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FAST_MULTIPLIER
select ARCH_HAS_GCOV_PROFILE_ALL
- select ARCH_HAS_GIGANTIC_PAGE if X86_64
select ARCH_HAS_KCOV if X86_64
- select ARCH_HAS_PMEM_API if X86_64
select ARCH_HAS_MMIO_FLUSH
+ select ARCH_HAS_PMEM_API if X86_64
select ARCH_HAS_SG_CHAIN
select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARCH_HAVE_NMI_SAFE_CMPXCHG
@@ -39,23 +61,17 @@ config X86
select ARCH_MIGHT_HAVE_PC_SERIO
select ARCH_SUPPORTS_ATOMIC_RMW
select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
- select ARCH_SUPPORTS_INT128 if X86_64
select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
select ARCH_USE_BUILTIN_BSWAP
- select ARCH_USE_CMPXCHG_LOCKREF if X86_64
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP
- select ARCH_WANTS_DYNAMIC_TASK_STRUCT
select ARCH_WANT_FRAME_POINTERS
- select ARCH_WANT_IPC_PARSE_VERSION if X86_32
+ select ARCH_WANTS_DYNAMIC_TASK_STRUCT
select BUILDTIME_EXTABLE_SORT
select CLKEVT_I8253
- select CLKSRC_I8253 if X86_32
select CLOCKSOURCE_VALIDATE_LAST_CYCLE
select CLOCKSOURCE_WATCHDOG
- select CLONE_BACKWARDS if X86_32
- select COMPAT_OLD_SIGACTION if IA32_EMULATION
select DCACHE_WORD_ACCESS
select EDAC_ATOMIC_SCRUB
select EDAC_SUPPORT
@@ -77,7 +93,6 @@ config X86
select HAVE_ACPI_APEI if ACPI
select HAVE_ACPI_APEI_NMI if ACPI
select HAVE_ALIGNED_STRUCT_PAGE if SLUB
- select HAVE_AOUT if X86_32
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_HARDENED_USERCOPY
select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE
@@ -88,12 +103,10 @@ config X86
select HAVE_ARCH_MMAP_RND_BITS if MMU
select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT
select HAVE_ARCH_SECCOMP_FILTER
- select HAVE_ARCH_SOFT_DIRTY if X86_64
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
- select HAVE_ARCH_WITHIN_STACK_FRAMES
- select HAVE_EBPF_JIT if X86_64
select HAVE_ARCH_VMAP_STACK if X86_64
+ select HAVE_ARCH_WITHIN_STACK_FRAMES
select HAVE_CC_STACKPROTECTOR
select HAVE_CMPXCHG_DOUBLE
select HAVE_CMPXCHG_LOCAL
@@ -106,6 +119,7 @@ config X86
select HAVE_DMA_CONTIGUOUS
select HAVE_DYNAMIC_FTRACE
select HAVE_DYNAMIC_FTRACE_WITH_REGS
+ select HAVE_EBPF_JIT if X86_64
select HAVE_EFFICIENT_UNALIGNED_ACCESS
select HAVE_EXIT_THREAD
select HAVE_FENTRY if X86_64
@@ -113,7 +127,6 @@ config X86
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER
select HAVE_GCC_PLUGINS
- select HAVE_GENERIC_DMA_COHERENT if X86_32
select HAVE_HW_BREAKPOINT
select HAVE_IDE
select HAVE_IOREMAP_PROT
@@ -142,15 +155,11 @@ config X86
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
select HAVE_REGS_AND_STACK_ACCESS_API
+ select HAVE_STACK_VALIDATION if X86_64
select HAVE_SYSCALL_TRACEPOINTS
- select HAVE_UID16 if X86_32 || IA32_EMULATION
select HAVE_UNSTABLE_SCHED_CLOCK
select HAVE_USER_RETURN_NOTIFIER
select IRQ_FORCED_THREADING
- select MODULES_USE_ELF_RELA if X86_64
- select MODULES_USE_ELF_REL if X86_32
- select OLD_SIGACTION if X86_32
- select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION
select PERF_EVENTS
select RTC_LIB
select RTC_MC146818_LIB
@@ -160,11 +169,7 @@ config X86
select THREAD_INFO_IN_TASK
select USER_STACKTRACE_SUPPORT
select VIRT_TO_BUS
- select X86_DEV_DMA_OPS if X86_64
select X86_FEATURE_NAMES if PROC_FS
- select HAVE_STACK_VALIDATION if X86_64
- select ARCH_USES_HIGH_VMA_FLAGS if X86_INTEL_MEMORY_PROTECTION_KEYS
- select ARCH_HAS_PKEYS if X86_INTEL_MEMORY_PROTECTION_KEYS
config INSTRUCTION_DECODER
def_bool y
@@ -407,6 +412,19 @@ config GOLDFISH
def_bool y
depends on X86_GOLDFISH
+config INTEL_RDT_A
+ bool "Intel Resource Director Technology Allocation support"
+ default n
+ depends on X86 && CPU_SUP_INTEL
+ select KERNFS
+ help
+ Select to enable resource allocation which is a sub-feature of
+ Intel Resource Director Technology(RDT). More information about
+ RDT can be found in the Intel x86 Architecture Software
+ Developer Manual.
+
+ Say N if unsure.
+
if X86_32
config X86_EXTENDED_PLATFORM
bool "Support for extended (non-PC) x86 platforms"
@@ -550,18 +568,6 @@ config X86_INTEL_QUARK
Say Y here if you have a Quark based system such as the Arduino
compatible Intel Galileo.
-config MLX_PLATFORM
- tristate "Mellanox Technologies platform support"
- depends on X86_64
- depends on X86_EXTENDED_PLATFORM
- ---help---
- This option enables system support for the Mellanox Technologies
- platform.
-
- Say Y here if you are building a kernel for Mellanox system.
-
- Otherwise, say N.
-
config X86_INTEL_LPSS
bool "Intel Low Power Subsystem Support"
depends on X86 && ACPI
@@ -939,6 +945,27 @@ config SCHED_MC
making when dealing with multi-core CPU chips at a cost of slightly
increased overhead in some places. If unsure say N here.
+config SCHED_MC_PRIO
+ bool "CPU core priorities scheduler support"
+ depends on SCHED_MC && CPU_SUP_INTEL
+ select X86_INTEL_PSTATE
+ select CPU_FREQ
+ default y
+ ---help---
+ Intel Turbo Boost Max Technology 3.0 enabled CPUs have a
+ core ordering determined at manufacturing time, which allows
+ certain cores to reach higher turbo frequencies (when running
+ single threaded workloads) than others.
+
+ Enabling this kernel feature teaches the scheduler about
+ the TBM3 (aka ITMT) priority order of the CPU cores and adjusts the
+ scheduler's CPU selection logic accordingly, so that higher
+ overall system performance can be achieved.
+
+ This feature will have no effect on CPUs without this feature.
+
+ If unsure say Y here.
+
source "kernel/Kconfig.preempt"
config UP_LATE_INIT
@@ -1025,7 +1052,7 @@ config X86_MCE_INTEL
config X86_MCE_AMD
def_bool y
prompt "AMD MCE features"
- depends on X86_MCE && X86_LOCAL_APIC
+ depends on X86_MCE && X86_LOCAL_APIC && AMD_NB
---help---
Additional support for AMD specific MCE features such as
the DRAM Error Threshold.
@@ -1525,7 +1552,7 @@ config X86_CHECK_BIOS_CORRUPTION
line. By default it scans the low 64k of memory every 60
seconds; see the memory_corruption_check_size and
memory_corruption_check_period parameters in
- Documentation/kernel-parameters.txt to adjust this.
+ Documentation/admin-guide/kernel-parameters.rst to adjust this.
When enabled with the default parameters, this option has
almost no overhead, as it reserves a relatively small amount
@@ -1737,6 +1764,8 @@ config X86_INTEL_MEMORY_PROTECTION_KEYS
def_bool y
# Note: only available in 64-bit mode
depends on CPU_SUP_INTEL && X86_64
+ select ARCH_USES_HIGH_VMA_FLAGS
+ select ARCH_HAS_PKEYS
---help---
Memory Protection Keys provides a mechanism for enforcing
page-based protections, but without requiring modification of the
@@ -2092,7 +2121,7 @@ config DEBUG_HOTPLUG_CPU0
config COMPAT_VDSO
def_bool n
prompt "Disable the 32-bit vDSO (needed for glibc 2.3.3)"
- depends on X86_32 || IA32_EMULATION
+ depends on COMPAT_32
---help---
Certain buggy versions of glibc will crash if they are
presented with a 32-bit vDSO that is not mapped at the address
@@ -2694,9 +2723,10 @@ source "fs/Kconfig.binfmt"
config IA32_EMULATION
bool "IA32 Emulation"
depends on X86_64
+ select ARCH_WANT_OLD_COMPAT_IPC
select BINFMT_ELF
select COMPAT_BINFMT_ELF
- select ARCH_WANT_OLD_COMPAT_IPC
+ select COMPAT_OLD_SIGACTION
---help---
Include code to run legacy 32-bit programs under a
64-bit kernel. You should likely turn this on, unless you're
@@ -2721,6 +2751,12 @@ config X86_X32
elf32_x86_64 support enabled to compile a kernel with this
option set.
+config COMPAT_32
+ def_bool y
+ depends on IA32_EMULATION || X86_32
+ select HAVE_UID16
+ select OLD_SIGSUSPEND3
+
config COMPAT
def_bool y
depends on IA32_EMULATION || X86_X32
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 12ea8f8384f4..0d810fb15eac 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -65,7 +65,7 @@ clean-files += cpustr.h
# ---------------------------------------------------------------------------
-KBUILD_CFLAGS := $(USERINCLUDE) $(REALMODE_CFLAGS) -D_SETUP
+KBUILD_CFLAGS := $(REALMODE_CFLAGS) -D_SETUP
KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
GCOV_PROFILE := n
UBSAN_SANITIZE := n
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 34d9e15857c3..44163e8c3868 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -25,7 +25,7 @@ KCOV_INSTRUMENT := n
targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \
vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4
-KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2
+KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ -O2
KBUILD_CFLAGS += -fno-strict-aliasing $(call cc-option, -fPIE, -fPIC)
KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
cflags-$(CONFIG_X86_32) := -march=i386
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index cc69e37548db..ff01c8fc76f7 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -537,6 +537,69 @@ free_handle:
efi_call_early(free_pool, pci_handle);
}
+static void retrieve_apple_device_properties(struct boot_params *boot_params)
+{
+ efi_guid_t guid = APPLE_PROPERTIES_PROTOCOL_GUID;
+ struct setup_data *data, *new;
+ efi_status_t status;
+ u32 size = 0;
+ void *p;
+
+ status = efi_call_early(locate_protocol, &guid, NULL, &p);
+ if (status != EFI_SUCCESS)
+ return;
+
+ if (efi_table_attr(apple_properties_protocol, version, p) != 0x10000) {
+ efi_printk(sys_table, "Unsupported properties proto version\n");
+ return;
+ }
+
+ efi_call_proto(apple_properties_protocol, get_all, p, NULL, &size);
+ if (!size)
+ return;
+
+ do {
+ status = efi_call_early(allocate_pool, EFI_LOADER_DATA,
+ size + sizeof(struct setup_data), &new);
+ if (status != EFI_SUCCESS) {
+ efi_printk(sys_table,
+ "Failed to alloc mem for properties\n");
+ return;
+ }
+
+ status = efi_call_proto(apple_properties_protocol, get_all, p,
+ new->data, &size);
+
+ if (status == EFI_BUFFER_TOO_SMALL)
+ efi_call_early(free_pool, new);
+ } while (status == EFI_BUFFER_TOO_SMALL);
+
+ new->type = SETUP_APPLE_PROPERTIES;
+ new->len = size;
+ new->next = 0;
+
+ data = (struct setup_data *)(unsigned long)boot_params->hdr.setup_data;
+ if (!data)
+ boot_params->hdr.setup_data = (unsigned long)new;
+ else {
+ while (data->next)
+ data = (struct setup_data *)(unsigned long)data->next;
+ data->next = (unsigned long)new;
+ }
+}
+
+static void setup_quirks(struct boot_params *boot_params)
+{
+ efi_char16_t const apple[] = { 'A', 'p', 'p', 'l', 'e', 0 };
+ efi_char16_t *fw_vendor = (efi_char16_t *)(unsigned long)
+ efi_table_attr(efi_system_table, fw_vendor, sys_table);
+
+ if (!memcmp(fw_vendor, apple, sizeof(apple))) {
+ if (IS_ENABLED(CONFIG_APPLE_PROPERTIES))
+ retrieve_apple_device_properties(boot_params);
+ }
+}
+
static efi_status_t
setup_uga32(void **uga_handle, unsigned long size, u32 *width, u32 *height)
{
@@ -1098,6 +1161,8 @@ struct boot_params *efi_main(struct efi_config *c,
setup_efi_pci(boot_params);
+ setup_quirks(boot_params);
+
status = efi_call_early(allocate_pool, EFI_LOADER_DATA,
sizeof(*gdt), (void **)&gdt);
if (status != EFI_SUCCESS) {
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index efdfba21a5b2..4d85e600db78 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -119,8 +119,7 @@ ENTRY(startup_32)
*/
/* Load new GDT with the 64bit segments using 32bit descriptor */
- leal gdt(%ebp), %eax
- movl %eax, gdt+2(%ebp)
+ addl %ebp, gdt+2(%ebp)
lgdt gdt(%ebp)
/* Enable PAE mode */
diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c
index 4224ede43b4e..26240dde081e 100644
--- a/arch/x86/boot/cpu.c
+++ b/arch/x86/boot/cpu.c
@@ -87,12 +87,6 @@ int validate_cpu(void)
return -1;
}
- if (CONFIG_X86_MINIMUM_CPU_FAMILY <= 4 && !IS_ENABLED(CONFIG_M486) &&
- !has_eflag(X86_EFLAGS_ID)) {
- printf("This kernel requires a CPU with the CPUID instruction. Build with CONFIG_M486=y to run on this CPU.\n");
- return -1;
- }
-
if (err_flags) {
puts("This kernel requires the following features "
"not present on the CPU:\n");
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index aa8b0672f87a..31c34ee131f3 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -21,7 +21,6 @@
#include <linux/hardirq.h>
#include <linux/types.h>
-#include <linux/crypto.h>
#include <linux/module.h>
#include <linux/err.h>
#include <crypto/algapi.h>
@@ -29,14 +28,14 @@
#include <crypto/cryptd.h>
#include <crypto/ctr.h>
#include <crypto/b128ops.h>
-#include <crypto/lrw.h>
#include <crypto/xts.h>
#include <asm/cpu_device_id.h>
#include <asm/fpu/api.h>
#include <asm/crypto/aes.h>
-#include <crypto/ablk_helper.h>
#include <crypto/scatterwalk.h>
#include <crypto/internal/aead.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
#include <linux/workqueue.h>
#include <linux/spinlock.h>
#ifdef CONFIG_X86_64
@@ -45,28 +44,26 @@
#define AESNI_ALIGN 16
+#define AESNI_ALIGN_ATTR __attribute__ ((__aligned__(AESNI_ALIGN)))
#define AES_BLOCK_MASK (~(AES_BLOCK_SIZE - 1))
#define RFC4106_HASH_SUBKEY_SIZE 16
+#define AESNI_ALIGN_EXTRA ((AESNI_ALIGN - 1) & ~(CRYPTO_MINALIGN - 1))
+#define CRYPTO_AES_CTX_SIZE (sizeof(struct crypto_aes_ctx) + AESNI_ALIGN_EXTRA)
+#define XTS_AES_CTX_SIZE (sizeof(struct aesni_xts_ctx) + AESNI_ALIGN_EXTRA)
/* This data is stored at the end of the crypto_tfm struct.
* It's a type of per "session" data storage location.
* This needs to be 16 byte aligned.
*/
struct aesni_rfc4106_gcm_ctx {
- u8 hash_subkey[16] __attribute__ ((__aligned__(AESNI_ALIGN)));
- struct crypto_aes_ctx aes_key_expanded
- __attribute__ ((__aligned__(AESNI_ALIGN)));
+ u8 hash_subkey[16] AESNI_ALIGN_ATTR;
+ struct crypto_aes_ctx aes_key_expanded AESNI_ALIGN_ATTR;
u8 nonce[4];
};
-struct aesni_lrw_ctx {
- struct lrw_table_ctx lrw_table;
- u8 raw_aes_ctx[sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1];
-};
-
struct aesni_xts_ctx {
- u8 raw_tweak_ctx[sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1];
- u8 raw_crypt_ctx[sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1];
+ u8 raw_tweak_ctx[sizeof(struct crypto_aes_ctx)] AESNI_ALIGN_ATTR;
+ u8 raw_crypt_ctx[sizeof(struct crypto_aes_ctx)] AESNI_ALIGN_ATTR;
};
asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
@@ -360,96 +357,95 @@ static void __aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
aesni_dec(ctx, dst, src);
}
-static int ecb_encrypt(struct blkcipher_desc *desc,
- struct scatterlist *dst, struct scatterlist *src,
- unsigned int nbytes)
+static int aesni_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key,
+ unsigned int len)
+{
+ return aes_set_key_common(crypto_skcipher_tfm(tfm),
+ crypto_skcipher_ctx(tfm), key, len);
+}
+
+static int ecb_encrypt(struct skcipher_request *req)
{
- struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm));
- struct blkcipher_walk walk;
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+ struct skcipher_walk walk;
+ unsigned int nbytes;
int err;
- blkcipher_walk_init(&walk, dst, src, nbytes);
- err = blkcipher_walk_virt(desc, &walk);
- desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ err = skcipher_walk_virt(&walk, req, true);
kernel_fpu_begin();
while ((nbytes = walk.nbytes)) {
aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
nbytes & AES_BLOCK_MASK);
nbytes &= AES_BLOCK_SIZE - 1;
- err = blkcipher_walk_done(desc, &walk, nbytes);
+ err = skcipher_walk_done(&walk, nbytes);
}
kernel_fpu_end();
return err;
}
-static int ecb_decrypt(struct blkcipher_desc *desc,
- struct scatterlist *dst, struct scatterlist *src,
- unsigned int nbytes)
+static int ecb_decrypt(struct skcipher_request *req)
{
- struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm));
- struct blkcipher_walk walk;
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+ struct skcipher_walk walk;
+ unsigned int nbytes;
int err;
- blkcipher_walk_init(&walk, dst, src, nbytes);
- err = blkcipher_walk_virt(desc, &walk);
- desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ err = skcipher_walk_virt(&walk, req, true);
kernel_fpu_begin();
while ((nbytes = walk.nbytes)) {
aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
nbytes & AES_BLOCK_MASK);
nbytes &= AES_BLOCK_SIZE - 1;
- err = blkcipher_walk_done(desc, &walk, nbytes);
+ err = skcipher_walk_done(&walk, nbytes);
}
kernel_fpu_end();
return err;
}
-static int cbc_encrypt(struct blkcipher_desc *desc,
- struct scatterlist *dst, struct scatterlist *src,
- unsigned int nbytes)
+static int cbc_encrypt(struct skcipher_request *req)
{
- struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm));
- struct blkcipher_walk walk;
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+ struct skcipher_walk walk;
+ unsigned int nbytes;
int err;
- blkcipher_walk_init(&walk, dst, src, nbytes);
- err = blkcipher_walk_virt(desc, &walk);
- desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ err = skcipher_walk_virt(&walk, req, true);
kernel_fpu_begin();
while ((nbytes = walk.nbytes)) {
aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
nbytes & AES_BLOCK_MASK, walk.iv);
nbytes &= AES_BLOCK_SIZE - 1;
- err = blkcipher_walk_done(desc, &walk, nbytes);
+ err = skcipher_walk_done(&walk, nbytes);
}
kernel_fpu_end();
return err;
}
-static int cbc_decrypt(struct blkcipher_desc *desc,
- struct scatterlist *dst, struct scatterlist *src,
- unsigned int nbytes)
+static int cbc_decrypt(struct skcipher_request *req)
{
- struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm));
- struct blkcipher_walk walk;
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+ struct skcipher_walk walk;
+ unsigned int nbytes;
int err;
- blkcipher_walk_init(&walk, dst, src, nbytes);
- err = blkcipher_walk_virt(desc, &walk);
- desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ err = skcipher_walk_virt(&walk, req, true);
kernel_fpu_begin();
while ((nbytes = walk.nbytes)) {
aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr,
nbytes & AES_BLOCK_MASK, walk.iv);
nbytes &= AES_BLOCK_SIZE - 1;
- err = blkcipher_walk_done(desc, &walk, nbytes);
+ err = skcipher_walk_done(&walk, nbytes);
}
kernel_fpu_end();
@@ -458,7 +454,7 @@ static int cbc_decrypt(struct blkcipher_desc *desc,
#ifdef CONFIG_X86_64
static void ctr_crypt_final(struct crypto_aes_ctx *ctx,
- struct blkcipher_walk *walk)
+ struct skcipher_walk *walk)
{
u8 *ctrblk = walk->iv;
u8 keystream[AES_BLOCK_SIZE];
@@ -491,157 +487,53 @@ static void aesni_ctr_enc_avx_tfm(struct crypto_aes_ctx *ctx, u8 *out,
}
#endif
-static int ctr_crypt(struct blkcipher_desc *desc,
- struct scatterlist *dst, struct scatterlist *src,
- unsigned int nbytes)
+static int ctr_crypt(struct skcipher_request *req)
{
- struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm));
- struct blkcipher_walk walk;
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_aes_ctx *ctx = aes_ctx(crypto_skcipher_ctx(tfm));
+ struct skcipher_walk walk;
+ unsigned int nbytes;
int err;
- blkcipher_walk_init(&walk, dst, src, nbytes);
- err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
- desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+ err = skcipher_walk_virt(&walk, req, true);
kernel_fpu_begin();
while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr,
nbytes & AES_BLOCK_MASK, walk.iv);
nbytes &= AES_BLOCK_SIZE - 1;
- err = blkcipher_walk_done(desc, &walk, nbytes);
+ err = skcipher_walk_done(&walk, nbytes);
}
if (walk.nbytes) {
ctr_crypt_final(ctx, &walk);
- err = blkcipher_walk_done(desc, &walk, 0);
+ err = skcipher_walk_done(&walk, 0);
}
kernel_fpu_end();
return err;
}
-#endif
-
-static int ablk_ecb_init(struct crypto_tfm *tfm)
-{
- return ablk_init_common(tfm, "__driver-ecb-aes-aesni");
-}
-
-static int ablk_cbc_init(struct crypto_tfm *tfm)
-{
- return ablk_init_common(tfm, "__driver-cbc-aes-aesni");
-}
-
-#ifdef CONFIG_X86_64
-static int ablk_ctr_init(struct crypto_tfm *tfm)
-{
- return ablk_init_common(tfm, "__driver-ctr-aes-aesni");
-}
-
-#endif
-
-#if IS_ENABLED(CONFIG_CRYPTO_PCBC)
-static int ablk_pcbc_init(struct crypto_tfm *tfm)
-{
- return ablk_init_common(tfm, "fpu(pcbc(__driver-aes-aesni))");
-}
-#endif
-
-static void lrw_xts_encrypt_callback(void *ctx, u8 *blks, unsigned int nbytes)
-{
- aesni_ecb_enc(ctx, blks, blks, nbytes);
-}
-static void lrw_xts_decrypt_callback(void *ctx, u8 *blks, unsigned int nbytes)
-{
- aesni_ecb_dec(ctx, blks, blks, nbytes);
-}
-
-static int lrw_aesni_setkey(struct crypto_tfm *tfm, const u8 *key,
+static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int keylen)
{
- struct aesni_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
+ struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
int err;
- err = aes_set_key_common(tfm, ctx->raw_aes_ctx, key,
- keylen - AES_BLOCK_SIZE);
+ err = xts_verify_key(tfm, key, keylen);
if (err)
return err;
- return lrw_init_table(&ctx->lrw_table, key + keylen - AES_BLOCK_SIZE);
-}
-
-static void lrw_aesni_exit_tfm(struct crypto_tfm *tfm)
-{
- struct aesni_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
-
- lrw_free_table(&ctx->lrw_table);
-}
-
-static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
- struct scatterlist *src, unsigned int nbytes)
-{
- struct aesni_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
- be128 buf[8];
- struct lrw_crypt_req req = {
- .tbuf = buf,
- .tbuflen = sizeof(buf),
-
- .table_ctx = &ctx->lrw_table,
- .crypt_ctx = aes_ctx(ctx->raw_aes_ctx),
- .crypt_fn = lrw_xts_encrypt_callback,
- };
- int ret;
-
- desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-
- kernel_fpu_begin();
- ret = lrw_crypt(desc, dst, src, nbytes, &req);
- kernel_fpu_end();
-
- return ret;
-}
-
-static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
- struct scatterlist *src, unsigned int nbytes)
-{
- struct aesni_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
- be128 buf[8];
- struct lrw_crypt_req req = {
- .tbuf = buf,
- .tbuflen = sizeof(buf),
-
- .table_ctx = &ctx->lrw_table,
- .crypt_ctx = aes_ctx(ctx->raw_aes_ctx),
- .crypt_fn = lrw_xts_decrypt_callback,
- };
- int ret;
-
- desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-
- kernel_fpu_begin();
- ret = lrw_crypt(desc, dst, src, nbytes, &req);
- kernel_fpu_end();
-
- return ret;
-}
-
-static int xts_aesni_setkey(struct crypto_tfm *tfm, const u8 *key,
- unsigned int keylen)
-{
- struct aesni_xts_ctx *ctx = crypto_tfm_ctx(tfm);
- int err;
-
- err = xts_check_key(tfm, key, keylen);
- if (err)
- return err;
+ keylen /= 2;
/* first half of xts-key is for crypt */
- err = aes_set_key_common(tfm, ctx->raw_crypt_ctx, key, keylen / 2);
+ err = aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_crypt_ctx,
+ key, keylen);
if (err)
return err;
/* second half of xts-key is for tweak */
- return aes_set_key_common(tfm, ctx->raw_tweak_ctx, key + keylen / 2,
- keylen / 2);
+ return aes_set_key_common(crypto_skcipher_tfm(tfm), ctx->raw_tweak_ctx,
+ key + keylen, keylen);
}
@@ -650,8 +542,6 @@ static void aesni_xts_tweak(void *ctx, u8 *out, const u8 *in)
aesni_enc(ctx, out, in);
}
-#ifdef CONFIG_X86_64
-
static void aesni_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
{
glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_enc));
@@ -698,83 +588,28 @@ static const struct common_glue_ctx aesni_dec_xts = {
} }
};
-static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
- struct scatterlist *src, unsigned int nbytes)
+static int xts_encrypt(struct skcipher_request *req)
{
- struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
- return glue_xts_crypt_128bit(&aesni_enc_xts, desc, dst, src, nbytes,
- XTS_TWEAK_CAST(aesni_xts_tweak),
- aes_ctx(ctx->raw_tweak_ctx),
- aes_ctx(ctx->raw_crypt_ctx));
+ return glue_xts_req_128bit(&aesni_enc_xts, req,
+ XTS_TWEAK_CAST(aesni_xts_tweak),
+ aes_ctx(ctx->raw_tweak_ctx),
+ aes_ctx(ctx->raw_crypt_ctx));
}
-static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
- struct scatterlist *src, unsigned int nbytes)
+static int xts_decrypt(struct skcipher_request *req)
{
- struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
-
- return glue_xts_crypt_128bit(&aesni_dec_xts, desc, dst, src, nbytes,
- XTS_TWEAK_CAST(aesni_xts_tweak),
- aes_ctx(ctx->raw_tweak_ctx),
- aes_ctx(ctx->raw_crypt_ctx));
-}
-
-#else
-
-static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
- struct scatterlist *src, unsigned int nbytes)
-{
- struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
- be128 buf[8];
- struct xts_crypt_req req = {
- .tbuf = buf,
- .tbuflen = sizeof(buf),
-
- .tweak_ctx = aes_ctx(ctx->raw_tweak_ctx),
- .tweak_fn = aesni_xts_tweak,
- .crypt_ctx = aes_ctx(ctx->raw_crypt_ctx),
- .crypt_fn = lrw_xts_encrypt_callback,
- };
- int ret;
-
- desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-
- kernel_fpu_begin();
- ret = xts_crypt(desc, dst, src, nbytes, &req);
- kernel_fpu_end();
-
- return ret;
-}
-
-static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
- struct scatterlist *src, unsigned int nbytes)
-{
- struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
- be128 buf[8];
- struct xts_crypt_req req = {
- .tbuf = buf,
- .tbuflen = sizeof(buf),
-
- .tweak_ctx = aes_ctx(ctx->raw_tweak_ctx),
- .tweak_fn = aesni_xts_tweak,
- .crypt_ctx = aes_ctx(ctx->raw_crypt_ctx),
- .crypt_fn = lrw_xts_decrypt_callback,
- };
- int ret;
-
- desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
-
- kernel_fpu_begin();
- ret = xts_crypt(desc, dst, src, nbytes, &req);
- kernel_fpu_end();
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
- return ret;
+ return glue_xts_req_128bit(&aesni_dec_xts, req,
+ XTS_TWEAK_CAST(aesni_xts_tweak),
+ aes_ctx(ctx->raw_tweak_ctx),
+ aes_ctx(ctx->raw_crypt_ctx));
}
-#endif
-
-#ifdef CONFIG_X86_64
static int rfc4106_init(struct crypto_aead *aead)
{
struct cryptd_aead *cryptd_tfm;
@@ -1077,9 +912,7 @@ static struct crypto_alg aesni_algs[] = { {
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_CIPHER,
.cra_blocksize = AES_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct crypto_aes_ctx) +
- AESNI_ALIGN - 1,
- .cra_alignmask = 0,
+ .cra_ctxsize = CRYPTO_AES_CTX_SIZE,
.cra_module = THIS_MODULE,
.cra_u = {
.cipher = {
@@ -1091,14 +924,12 @@ static struct crypto_alg aesni_algs[] = { {
}
}
}, {
- .cra_name = "__aes-aesni",
- .cra_driver_name = "__driver-aes-aesni",
- .cra_priority = 0,
+ .cra_name = "__aes",
+ .cra_driver_name = "__aes-aesni",
+ .cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_CIPHER | CRYPTO_ALG_INTERNAL,
.cra_blocksize = AES_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct crypto_aes_ctx) +
- AESNI_ALIGN - 1,
- .cra_alignmask = 0,
+ .cra_ctxsize = CRYPTO_AES_CTX_SIZE,
.cra_module = THIS_MODULE,
.cra_u = {
.cipher = {
@@ -1109,250 +940,94 @@ static struct crypto_alg aesni_algs[] = { {
.cia_decrypt = __aes_decrypt
}
}
-}, {
- .cra_name = "__ecb-aes-aesni",
- .cra_driver_name = "__driver-ecb-aes-aesni",
- .cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
- CRYPTO_ALG_INTERNAL,
- .cra_blocksize = AES_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct crypto_aes_ctx) +
- AESNI_ALIGN - 1,
- .cra_alignmask = 0,
- .cra_type = &crypto_blkcipher_type,
- .cra_module = THIS_MODULE,
- .cra_u = {
- .blkcipher = {
- .min_keysize = AES_MIN_KEY_SIZE,
- .max_keysize = AES_MAX_KEY_SIZE,
- .setkey = aes_set_key,
- .encrypt = ecb_encrypt,
- .decrypt = ecb_decrypt,
- },
- },
-}, {
- .cra_name = "__cbc-aes-aesni",
- .cra_driver_name = "__driver-cbc-aes-aesni",
- .cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
- CRYPTO_ALG_INTERNAL,
- .cra_blocksize = AES_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct crypto_aes_ctx) +
- AESNI_ALIGN - 1,
- .cra_alignmask = 0,
- .cra_type = &crypto_blkcipher_type,
- .cra_module = THIS_MODULE,
- .cra_u = {
- .blkcipher = {
- .min_keysize = AES_MIN_KEY_SIZE,
- .max_keysize = AES_MAX_KEY_SIZE,
- .setkey = aes_set_key,
- .encrypt = cbc_encrypt,
- .decrypt = cbc_decrypt,
- },
- },
-}, {
- .cra_name = "ecb(aes)",
- .cra_driver_name = "ecb-aes-aesni",
- .cra_priority = 400,
- .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
- .cra_blocksize = AES_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct async_helper_ctx),
- .cra_alignmask = 0,
- .cra_type = &crypto_ablkcipher_type,
- .cra_module = THIS_MODULE,
- .cra_init = ablk_ecb_init,
- .cra_exit = ablk_exit,
- .cra_u = {
- .ablkcipher = {
- .min_keysize = AES_MIN_KEY_SIZE,
- .max_keysize = AES_MAX_KEY_SIZE,
- .setkey = ablk_set_key,
- .encrypt = ablk_encrypt,
- .decrypt = ablk_decrypt,
+} };
+
+static struct skcipher_alg aesni_skciphers[] = {
+ {
+ .base = {
+ .cra_name = "__ecb(aes)",
+ .cra_driver_name = "__ecb-aes-aesni",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = CRYPTO_AES_CTX_SIZE,
+ .cra_module = THIS_MODULE,
},
- },
-}, {
- .cra_name = "cbc(aes)",
- .cra_driver_name = "cbc-aes-aesni",
- .cra_priority = 400,
- .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
- .cra_blocksize = AES_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct async_helper_ctx),
- .cra_alignmask = 0,
- .cra_type = &crypto_ablkcipher_type,
- .cra_module = THIS_MODULE,
- .cra_init = ablk_cbc_init,
- .cra_exit = ablk_exit,
- .cra_u = {
- .ablkcipher = {
- .min_keysize = AES_MIN_KEY_SIZE,
- .max_keysize = AES_MAX_KEY_SIZE,
- .ivsize = AES_BLOCK_SIZE,
- .setkey = ablk_set_key,
- .encrypt = ablk_encrypt,
- .decrypt = ablk_decrypt,
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .setkey = aesni_skcipher_setkey,
+ .encrypt = ecb_encrypt,
+ .decrypt = ecb_decrypt,
+ }, {
+ .base = {
+ .cra_name = "__cbc(aes)",
+ .cra_driver_name = "__cbc-aes-aesni",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = CRYPTO_AES_CTX_SIZE,
+ .cra_module = THIS_MODULE,
},
- },
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = aesni_skcipher_setkey,
+ .encrypt = cbc_encrypt,
+ .decrypt = cbc_decrypt,
#ifdef CONFIG_X86_64
-}, {
- .cra_name = "__ctr-aes-aesni",
- .cra_driver_name = "__driver-ctr-aes-aesni",
- .cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
- CRYPTO_ALG_INTERNAL,
- .cra_blocksize = 1,
- .cra_ctxsize = sizeof(struct crypto_aes_ctx) +
- AESNI_ALIGN - 1,
- .cra_alignmask = 0,
- .cra_type = &crypto_blkcipher_type,
- .cra_module = THIS_MODULE,
- .cra_u = {
- .blkcipher = {
- .min_keysize = AES_MIN_KEY_SIZE,
- .max_keysize = AES_MAX_KEY_SIZE,
- .ivsize = AES_BLOCK_SIZE,
- .setkey = aes_set_key,
- .encrypt = ctr_crypt,
- .decrypt = ctr_crypt,
+ }, {
+ .base = {
+ .cra_name = "__ctr(aes)",
+ .cra_driver_name = "__ctr-aes-aesni",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = 1,
+ .cra_ctxsize = CRYPTO_AES_CTX_SIZE,
+ .cra_module = THIS_MODULE,
},
- },
-}, {
- .cra_name = "ctr(aes)",
- .cra_driver_name = "ctr-aes-aesni",
- .cra_priority = 400,
- .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
- .cra_blocksize = 1,
- .cra_ctxsize = sizeof(struct async_helper_ctx),
- .cra_alignmask = 0,
- .cra_type = &crypto_ablkcipher_type,
- .cra_module = THIS_MODULE,
- .cra_init = ablk_ctr_init,
- .cra_exit = ablk_exit,
- .cra_u = {
- .ablkcipher = {
- .min_keysize = AES_MIN_KEY_SIZE,
- .max_keysize = AES_MAX_KEY_SIZE,
- .ivsize = AES_BLOCK_SIZE,
- .setkey = ablk_set_key,
- .encrypt = ablk_encrypt,
- .decrypt = ablk_encrypt,
- .geniv = "chainiv",
+ .min_keysize = AES_MIN_KEY_SIZE,
+ .max_keysize = AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .chunksize = AES_BLOCK_SIZE,
+ .setkey = aesni_skcipher_setkey,
+ .encrypt = ctr_crypt,
+ .decrypt = ctr_crypt,
+ }, {
+ .base = {
+ .cra_name = "__xts(aes)",
+ .cra_driver_name = "__xts-aes-aesni",
+ .cra_priority = 401,
+ .cra_flags = CRYPTO_ALG_INTERNAL,
+ .cra_blocksize = AES_BLOCK_SIZE,
+ .cra_ctxsize = XTS_AES_CTX_SIZE,
+ .cra_module = THIS_MODULE,
},
- },
+ .min_keysize = 2 * AES_MIN_KEY_SIZE,
+ .max_keysize = 2 * AES_MAX_KEY_SIZE,
+ .ivsize = AES_BLOCK_SIZE,
+ .setkey = xts_aesni_setkey,
+ .encrypt = xts_encrypt,
+ .decrypt = xts_decrypt,
#endif
+ }
+};
+
+struct simd_skcipher_alg *aesni_simd_skciphers[ARRAY_SIZE(aesni_skciphers)];
+
+struct {
+ const char *algname;
+ const char *drvname;
+ const char *basename;
+ struct simd_skcipher_alg *simd;
+} aesni_simd_skciphers2[] = {
#if IS_ENABLED(CONFIG_CRYPTO_PCBC)
-}, {
- .cra_name = "pcbc(aes)",
- .cra_driver_name = "pcbc-aes-aesni",
- .cra_priority = 400,
- .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
- .cra_blocksize = AES_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct async_helper_ctx),
- .cra_alignmask = 0,
- .cra_type = &crypto_ablkcipher_type,
- .cra_module = THIS_MODULE,
- .cra_init = ablk_pcbc_init,
- .cra_exit = ablk_exit,
- .cra_u = {
- .ablkcipher = {
- .min_keysize = AES_MIN_KEY_SIZE,
- .max_keysize = AES_MAX_KEY_SIZE,
- .ivsize = AES_BLOCK_SIZE,
- .setkey = ablk_set_key,
- .encrypt = ablk_encrypt,
- .decrypt = ablk_decrypt,
- },
+ {
+ .algname = "pcbc(aes)",
+ .drvname = "pcbc-aes-aesni",
+ .basename = "fpu(pcbc(__aes-aesni))",
},
#endif
-}, {
- .cra_name = "__lrw-aes-aesni",
- .cra_driver_name = "__driver-lrw-aes-aesni",
- .cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
- CRYPTO_ALG_INTERNAL,
- .cra_blocksize = AES_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct aesni_lrw_ctx),
- .cra_alignmask = 0,
- .cra_type = &crypto_blkcipher_type,
- .cra_module = THIS_MODULE,
- .cra_exit = lrw_aesni_exit_tfm,
- .cra_u = {
- .blkcipher = {
- .min_keysize = AES_MIN_KEY_SIZE + AES_BLOCK_SIZE,
- .max_keysize = AES_MAX_KEY_SIZE + AES_BLOCK_SIZE,
- .ivsize = AES_BLOCK_SIZE,
- .setkey = lrw_aesni_setkey,
- .encrypt = lrw_encrypt,
- .decrypt = lrw_decrypt,
- },
- },
-}, {
- .cra_name = "__xts-aes-aesni",
- .cra_driver_name = "__driver-xts-aes-aesni",
- .cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
- CRYPTO_ALG_INTERNAL,
- .cra_blocksize = AES_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct aesni_xts_ctx),
- .cra_alignmask = 0,
- .cra_type = &crypto_blkcipher_type,
- .cra_module = THIS_MODULE,
- .cra_u = {
- .blkcipher = {
- .min_keysize = 2 * AES_MIN_KEY_SIZE,
- .max_keysize = 2 * AES_MAX_KEY_SIZE,
- .ivsize = AES_BLOCK_SIZE,
- .setkey = xts_aesni_setkey,
- .encrypt = xts_encrypt,
- .decrypt = xts_decrypt,
- },
- },
-}, {
- .cra_name = "lrw(aes)",
- .cra_driver_name = "lrw-aes-aesni",
- .cra_priority = 400,
- .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
- .cra_blocksize = AES_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct async_helper_ctx),
- .cra_alignmask = 0,
- .cra_type = &crypto_ablkcipher_type,
- .cra_module = THIS_MODULE,
- .cra_init = ablk_init,
- .cra_exit = ablk_exit,
- .cra_u = {
- .ablkcipher = {
- .min_keysize = AES_MIN_KEY_SIZE + AES_BLOCK_SIZE,
- .max_keysize = AES_MAX_KEY_SIZE + AES_BLOCK_SIZE,
- .ivsize = AES_BLOCK_SIZE,
- .setkey = ablk_set_key,
- .encrypt = ablk_encrypt,
- .decrypt = ablk_decrypt,
- },
- },
-}, {
- .cra_name = "xts(aes)",
- .cra_driver_name = "xts-aes-aesni",
- .cra_priority = 400,
- .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
- .cra_blocksize = AES_BLOCK_SIZE,
- .cra_ctxsize = sizeof(struct async_helper_ctx),
- .cra_alignmask = 0,
- .cra_type = &crypto_ablkcipher_type,
- .cra_module = THIS_MODULE,
- .cra_init = ablk_init,
- .cra_exit = ablk_exit,
- .cra_u = {
- .ablkcipher = {
- .min_keysize = 2 * AES_MIN_KEY_SIZE,
- .max_keysize = 2 * AES_MAX_KEY_SIZE,
- .ivsize = AES_BLOCK_SIZE,
- .setkey = ablk_set_key,
- .encrypt = ablk_encrypt,
- .decrypt = ablk_decrypt,
- },
- },
-} };
+};
#ifdef CONFIG_X86_64
static struct aead_alg aesni_aead_algs[] = { {
@@ -1401,9 +1076,27 @@ static const struct x86_cpu_id aesni_cpu_id[] = {
};
MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id);
+static void aesni_free_simds(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(aesni_simd_skciphers) &&
+ aesni_simd_skciphers[i]; i++)
+ simd_skcipher_free(aesni_simd_skciphers[i]);
+
+ for (i = 0; i < ARRAY_SIZE(aesni_simd_skciphers2) &&
+ aesni_simd_skciphers2[i].simd; i++)
+ simd_skcipher_free(aesni_simd_skciphers2[i].simd);
+}
+
static int __init aesni_init(void)
{
+ struct simd_skcipher_alg *simd;
+ const char *basename;
+ const char *algname;
+ const char *drvname;
int err;
+ int i;
if (!x86_match_cpu(aesni_cpu_id))
return -ENODEV;
@@ -1445,13 +1138,48 @@ static int __init aesni_init(void)
if (err)
goto fpu_exit;
+ err = crypto_register_skciphers(aesni_skciphers,
+ ARRAY_SIZE(aesni_skciphers));
+ if (err)
+ goto unregister_algs;
+
err = crypto_register_aeads(aesni_aead_algs,
ARRAY_SIZE(aesni_aead_algs));
if (err)
- goto unregister_algs;
+ goto unregister_skciphers;
+
+ for (i = 0; i < ARRAY_SIZE(aesni_skciphers); i++) {
+ algname = aesni_skciphers[i].base.cra_name + 2;
+ drvname = aesni_skciphers[i].base.cra_driver_name + 2;
+ basename = aesni_skciphers[i].base.cra_driver_name;
+ simd = simd_skcipher_create_compat(algname, drvname, basename);
+ err = PTR_ERR(simd);
+ if (IS_ERR(simd))
+ goto unregister_simds;
+
+ aesni_simd_skciphers[i] = simd;
+ }
- return err;
+ for (i = 0; i < ARRAY_SIZE(aesni_simd_skciphers2); i++) {
+ algname = aesni_simd_skciphers2[i].algname;
+ drvname = aesni_simd_skciphers2[i].drvname;
+ basename = aesni_simd_skciphers2[i].basename;
+ simd = simd_skcipher_create_compat(algname, drvname, basename);
+ err = PTR_ERR(simd);
+ if (IS_ERR(simd))
+ goto unregister_simds;
+ aesni_simd_skciphers2[i].simd = simd;
+ }
+
+ return 0;
+
+unregister_simds:
+ aesni_free_simds();
+ crypto_unregister_aeads(aesni_aead_algs, ARRAY_SIZE(aesni_aead_algs));
+unregister_skciphers:
+ crypto_unregister_skciphers(aesni_skciphers,
+ ARRAY_SIZE(aesni_skciphers));
unregister_algs:
crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs));
fpu_exit:
@@ -1461,7 +1189,10 @@ fpu_exit:
static void __exit aesni_exit(void)
{
+ aesni_free_simds();
crypto_unregister_aeads(aesni_aead_algs, ARRAY_SIZE(aesni_aead_algs));
+ crypto_unregister_skciphers(aesni_skciphers,
+ ARRAY_SIZE(aesni_skciphers));
crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs));
crypto_fpu_exit();
diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c
index 0857b1a1de3b..c194d5717ae5 100644
--- a/arch/x86/crypto/crc32c-intel_glue.c
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -48,26 +48,13 @@
#ifdef CONFIG_X86_64
/*
* use carryless multiply version of crc32c when buffer
- * size is >= 512 (when eager fpu is enabled) or
- * >= 1024 (when eager fpu is disabled) to account
+ * size is >= 512 to account
* for fpu state save/restore overhead.
*/
-#define CRC32C_PCL_BREAKEVEN_EAGERFPU 512
-#define CRC32C_PCL_BREAKEVEN_NOEAGERFPU 1024
+#define CRC32C_PCL_BREAKEVEN 512
asmlinkage unsigned int crc_pcl(const u8 *buffer, int len,
unsigned int crc_init);
-static int crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_EAGERFPU;
-#if defined(X86_FEATURE_EAGER_FPU)
-#define set_pcl_breakeven_point() \
-do { \
- if (!use_eager_fpu()) \
- crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU; \
-} while (0)
-#else
-#define set_pcl_breakeven_point() \
- (crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU)
-#endif
#endif /* CONFIG_X86_64 */
static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
@@ -190,7 +177,7 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
* use faster PCL version if datasize is large enough to
* overcome kernel fpu state save/restore overhead
*/
- if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) {
+ if (len >= CRC32C_PCL_BREAKEVEN && irq_fpu_usable()) {
kernel_fpu_begin();
*crcp = crc_pcl(data, len, *crcp);
kernel_fpu_end();
@@ -202,7 +189,7 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
u8 *out)
{
- if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) {
+ if (len >= CRC32C_PCL_BREAKEVEN && irq_fpu_usable()) {
kernel_fpu_begin();
*(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp));
kernel_fpu_end();
@@ -261,7 +248,6 @@ static int __init crc32c_intel_mod_init(void)
alg.update = crc32c_pcl_intel_update;
alg.finup = crc32c_pcl_intel_finup;
alg.digest = crc32c_pcl_intel_digest;
- set_pcl_breakeven_point();
}
#endif
return crypto_register_shash(&alg);
diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c
index e7d679e2a018..406680476c52 100644
--- a/arch/x86/crypto/fpu.c
+++ b/arch/x86/crypto/fpu.c
@@ -11,143 +11,186 @@
*
*/
-#include <crypto/algapi.h>
+#include <crypto/internal/skcipher.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
-#include <linux/crypto.h>
#include <asm/fpu/api.h>
struct crypto_fpu_ctx {
- struct crypto_blkcipher *child;
+ struct crypto_skcipher *child;
};
-static int crypto_fpu_setkey(struct crypto_tfm *parent, const u8 *key,
+static int crypto_fpu_setkey(struct crypto_skcipher *parent, const u8 *key,
unsigned int keylen)
{
- struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(parent);
- struct crypto_blkcipher *child = ctx->child;
+ struct crypto_fpu_ctx *ctx = crypto_skcipher_ctx(parent);
+ struct crypto_skcipher *child = ctx->child;
int err;
- crypto_blkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
- crypto_blkcipher_set_flags(child, crypto_tfm_get_flags(parent) &
- CRYPTO_TFM_REQ_MASK);
- err = crypto_blkcipher_setkey(child, key, keylen);
- crypto_tfm_set_flags(parent, crypto_blkcipher_get_flags(child) &
- CRYPTO_TFM_RES_MASK);
+ crypto_skcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK);
+ crypto_skcipher_set_flags(child, crypto_skcipher_get_flags(parent) &
+ CRYPTO_TFM_REQ_MASK);
+ err = crypto_skcipher_setkey(child, key, keylen);
+ crypto_skcipher_set_flags(parent, crypto_skcipher_get_flags(child) &
+ CRYPTO_TFM_RES_MASK);
return err;
}
-static int crypto_fpu_encrypt(struct blkcipher_desc *desc_in,
- struct scatterlist *dst, struct scatterlist *src,
- unsigned int nbytes)
+static int crypto_fpu_encrypt(struct skcipher_request *req)
{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_fpu_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct crypto_skcipher *child = ctx->child;
+ SKCIPHER_REQUEST_ON_STACK(subreq, child);
int err;
- struct crypto_fpu_ctx *ctx = crypto_blkcipher_ctx(desc_in->tfm);
- struct crypto_blkcipher *child = ctx->child;
- struct blkcipher_desc desc = {
- .tfm = child,
- .info = desc_in->info,
- .flags = desc_in->flags & ~CRYPTO_TFM_REQ_MAY_SLEEP,
- };
+
+ skcipher_request_set_tfm(subreq, child);
+ skcipher_request_set_callback(subreq, 0, NULL, NULL);
+ skcipher_request_set_crypt(subreq, req->src, req->dst, req->cryptlen,
+ req->iv);
kernel_fpu_begin();
- err = crypto_blkcipher_crt(desc.tfm)->encrypt(&desc, dst, src, nbytes);
+ err = crypto_skcipher_encrypt(subreq);
kernel_fpu_end();
+
+ skcipher_request_zero(subreq);
return err;
}
-static int crypto_fpu_decrypt(struct blkcipher_desc *desc_in,
- struct scatterlist *dst, struct scatterlist *src,
- unsigned int nbytes)
+static int crypto_fpu_decrypt(struct skcipher_request *req)
{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct crypto_fpu_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct crypto_skcipher *child = ctx->child;
+ SKCIPHER_REQUEST_ON_STACK(subreq, child);
int err;
- struct crypto_fpu_ctx *ctx = crypto_blkcipher_ctx(desc_in->tfm);
- struct crypto_blkcipher *child = ctx->child;
- struct blkcipher_desc desc = {
- .tfm = child,
- .info = desc_in->info,
- .flags = desc_in->flags & ~CRYPTO_TFM_REQ_MAY_SLEEP,
- };
+
+ skcipher_request_set_tfm(subreq, child);
+ skcipher_request_set_callback(subreq, 0, NULL, NULL);
+ skcipher_request_set_crypt(subreq, req->src, req->dst, req->cryptlen,
+ req->iv);
kernel_fpu_begin();
- err = crypto_blkcipher_crt(desc.tfm)->decrypt(&desc, dst, src, nbytes);
+ err = crypto_skcipher_decrypt(subreq);
kernel_fpu_end();
+
+ skcipher_request_zero(subreq);
return err;
}
-static int crypto_fpu_init_tfm(struct crypto_tfm *tfm)
+static int crypto_fpu_init_tfm(struct crypto_skcipher *tfm)
{
- struct crypto_instance *inst = crypto_tfm_alg_instance(tfm);
- struct crypto_spawn *spawn = crypto_instance_ctx(inst);
- struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(tfm);
- struct crypto_blkcipher *cipher;
+ struct skcipher_instance *inst = skcipher_alg_instance(tfm);
+ struct crypto_fpu_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct crypto_skcipher_spawn *spawn;
+ struct crypto_skcipher *cipher;
- cipher = crypto_spawn_blkcipher(spawn);
+ spawn = skcipher_instance_ctx(inst);
+ cipher = crypto_spawn_skcipher(spawn);
if (IS_ERR(cipher))
return PTR_ERR(cipher);
ctx->child = cipher;
+
return 0;
}
-static void crypto_fpu_exit_tfm(struct crypto_tfm *tfm)
+static void crypto_fpu_exit_tfm(struct crypto_skcipher *tfm)
+{
+ struct crypto_fpu_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+ crypto_free_skcipher(ctx->child);
+}
+
+static void crypto_fpu_free(struct skcipher_instance *inst)
{
- struct crypto_fpu_ctx *ctx = crypto_tfm_ctx(tfm);
- crypto_free_blkcipher(ctx->child);
+ crypto_drop_skcipher(skcipher_instance_ctx(inst));
+ kfree(inst);
}
-static struct crypto_instance *crypto_fpu_alloc(struct rtattr **tb)
+static int crypto_fpu_create(struct crypto_template *tmpl, struct rtattr **tb)
{
- struct crypto_instance *inst;
- struct crypto_alg *alg;
+ struct crypto_skcipher_spawn *spawn;
+ struct skcipher_instance *inst;
+ struct crypto_attr_type *algt;
+ struct skcipher_alg *alg;
+ const char *cipher_name;
int err;
- err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_BLKCIPHER);
+ algt = crypto_get_attr_type(tb);
+ if (IS_ERR(algt))
+ return PTR_ERR(algt);
+
+ if ((algt->type ^ (CRYPTO_ALG_INTERNAL | CRYPTO_ALG_TYPE_SKCIPHER)) &
+ algt->mask)
+ return -EINVAL;
+
+ if (!(algt->mask & CRYPTO_ALG_INTERNAL))
+ return -EINVAL;
+
+ cipher_name = crypto_attr_alg_name(tb[1]);
+ if (IS_ERR(cipher_name))
+ return PTR_ERR(cipher_name);
+
+ inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL);
+ if (!inst)
+ return -ENOMEM;
+
+ spawn = skcipher_instance_ctx(inst);
+
+ crypto_set_skcipher_spawn(spawn, skcipher_crypto_instance(inst));
+ err = crypto_grab_skcipher(spawn, cipher_name, CRYPTO_ALG_INTERNAL,
+ CRYPTO_ALG_INTERNAL | CRYPTO_ALG_ASYNC);
if (err)
- return ERR_PTR(err);
-
- alg = crypto_get_attr_alg(tb, CRYPTO_ALG_TYPE_BLKCIPHER,
- CRYPTO_ALG_TYPE_MASK);
- if (IS_ERR(alg))
- return ERR_CAST(alg);
-
- inst = crypto_alloc_instance("fpu", alg);
- if (IS_ERR(inst))
- goto out_put_alg;
-
- inst->alg.cra_flags = alg->cra_flags;
- inst->alg.cra_priority = alg->cra_priority;
- inst->alg.cra_blocksize = alg->cra_blocksize;
- inst->alg.cra_alignmask = alg->cra_alignmask;
- inst->alg.cra_type = alg->cra_type;
- inst->alg.cra_blkcipher.ivsize = alg->cra_blkcipher.ivsize;
- inst->alg.cra_blkcipher.min_keysize = alg->cra_blkcipher.min_keysize;
- inst->alg.cra_blkcipher.max_keysize = alg->cra_blkcipher.max_keysize;
- inst->alg.cra_ctxsize = sizeof(struct crypto_fpu_ctx);
- inst->alg.cra_init = crypto_fpu_init_tfm;
- inst->alg.cra_exit = crypto_fpu_exit_tfm;
- inst->alg.cra_blkcipher.setkey = crypto_fpu_setkey;
- inst->alg.cra_blkcipher.encrypt = crypto_fpu_encrypt;
- inst->alg.cra_blkcipher.decrypt = crypto_fpu_decrypt;
-
-out_put_alg:
- crypto_mod_put(alg);
- return inst;
-}
+ goto out_free_inst;
-static void crypto_fpu_free(struct crypto_instance *inst)
-{
- crypto_drop_spawn(crypto_instance_ctx(inst));
+ alg = crypto_skcipher_spawn_alg(spawn);
+
+ err = crypto_inst_setname(skcipher_crypto_instance(inst), "fpu",
+ &alg->base);
+ if (err)
+ goto out_drop_skcipher;
+
+ inst->alg.base.cra_flags = CRYPTO_ALG_INTERNAL;
+ inst->alg.base.cra_priority = alg->base.cra_priority;
+ inst->alg.base.cra_blocksize = alg->base.cra_blocksize;
+ inst->alg.base.cra_alignmask = alg->base.cra_alignmask;
+
+ inst->alg.ivsize = crypto_skcipher_alg_ivsize(alg);
+ inst->alg.min_keysize = crypto_skcipher_alg_min_keysize(alg);
+ inst->alg.max_keysize = crypto_skcipher_alg_max_keysize(alg);
+
+ inst->alg.base.cra_ctxsize = sizeof(struct crypto_fpu_ctx);
+
+ inst->alg.init = crypto_fpu_init_tfm;
+ inst->alg.exit = crypto_fpu_exit_tfm;
+
+ inst->alg.setkey = crypto_fpu_setkey;
+ inst->alg.encrypt = crypto_fpu_encrypt;
+ inst->alg.decrypt = crypto_fpu_decrypt;
+
+ inst->free = crypto_fpu_free;
+
+ err = skcipher_register_instance(tmpl, inst);
+ if (err)
+ goto out_drop_skcipher;
+
+out:
+ return err;
+
+out_drop_skcipher:
+ crypto_drop_skcipher(spawn);
+out_free_inst:
kfree(inst);
+ goto out;
}
static struct crypto_template crypto_fpu_tmpl = {
.name = "fpu",
- .alloc = crypto_fpu_alloc,
- .free = crypto_fpu_free,
+ .create = crypto_fpu_create,
.module = THIS_MODULE,
};
diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
index 6a85598931b5..260a060d7275 100644
--- a/arch/x86/crypto/glue_helper.c
+++ b/arch/x86/crypto/glue_helper.c
@@ -27,10 +27,10 @@
#include <linux/module.h>
#include <crypto/b128ops.h>
+#include <crypto/internal/skcipher.h>
#include <crypto/lrw.h>
#include <crypto/xts.h>
#include <asm/crypto/glue_helper.h>
-#include <crypto/scatterwalk.h>
static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
struct blkcipher_desc *desc,
@@ -339,6 +339,41 @@ done:
return nbytes;
}
+static unsigned int __glue_xts_req_128bit(const struct common_glue_ctx *gctx,
+ void *ctx,
+ struct skcipher_walk *walk)
+{
+ const unsigned int bsize = 128 / 8;
+ unsigned int nbytes = walk->nbytes;
+ u128 *src = walk->src.virt.addr;
+ u128 *dst = walk->dst.virt.addr;
+ unsigned int num_blocks, func_bytes;
+ unsigned int i;
+
+ /* Process multi-block batch */
+ for (i = 0; i < gctx->num_funcs; i++) {
+ num_blocks = gctx->funcs[i].num_blocks;
+ func_bytes = bsize * num_blocks;
+
+ if (nbytes >= func_bytes) {
+ do {
+ gctx->funcs[i].fn_u.xts(ctx, dst, src,
+ walk->iv);
+
+ src += num_blocks;
+ dst += num_blocks;
+ nbytes -= func_bytes;
+ } while (nbytes >= func_bytes);
+
+ if (nbytes < bsize)
+ goto done;
+ }
+ }
+
+done:
+ return nbytes;
+}
+
/* for implementations implementing faster XTS IV generator */
int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
struct blkcipher_desc *desc, struct scatterlist *dst,
@@ -379,6 +414,43 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
}
EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit);
+int glue_xts_req_128bit(const struct common_glue_ctx *gctx,
+ struct skcipher_request *req,
+ common_glue_func_t tweak_fn, void *tweak_ctx,
+ void *crypt_ctx)
+{
+ const unsigned int bsize = 128 / 8;
+ struct skcipher_walk walk;
+ bool fpu_enabled = false;
+ unsigned int nbytes;
+ int err;
+
+ err = skcipher_walk_virt(&walk, req, false);
+ nbytes = walk.nbytes;
+ if (!nbytes)
+ return err;
+
+ /* set minimum length to bsize, for tweak_fn */
+ fpu_enabled = glue_skwalk_fpu_begin(bsize, gctx->fpu_blocks_limit,
+ &walk, fpu_enabled,
+ nbytes < bsize ? bsize : nbytes);
+
+ /* calculate first value of T */
+ tweak_fn(tweak_ctx, walk.iv, walk.iv);
+
+ while (nbytes) {
+ nbytes = __glue_xts_req_128bit(gctx, crypt_ctx, &walk);
+
+ err = skcipher_walk_done(&walk, nbytes);
+ nbytes = walk.nbytes;
+ }
+
+ glue_fpu_end(fpu_enabled);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(glue_xts_req_128bit);
+
void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src, le128 *iv,
common_glue_func_t fn)
{
diff --git a/arch/x86/crypto/sha1-mb/sha1_mb.c b/arch/x86/crypto/sha1-mb/sha1_mb.c
index 9e5b67127a09..acf9fdf01671 100644
--- a/arch/x86/crypto/sha1-mb/sha1_mb.c
+++ b/arch/x86/crypto/sha1-mb/sha1_mb.c
@@ -114,7 +114,7 @@ static inline void sha1_init_digest(uint32_t *digest)
}
static inline uint32_t sha1_pad(uint8_t padblock[SHA1_BLOCK_SIZE * 2],
- uint32_t total_len)
+ uint64_t total_len)
{
uint32_t i = total_len & (SHA1_BLOCK_SIZE - 1);
diff --git a/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h b/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h
index 98a35bcc6f4a..13590ccf965c 100644
--- a/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h
+++ b/arch/x86/crypto/sha1-mb/sha1_mb_ctx.h
@@ -125,7 +125,7 @@ struct sha1_hash_ctx {
/* error flag */
int error;
- uint32_t total_length;
+ uint64_t total_length;
const void *incoming_buffer;
uint32_t incoming_buffer_length;
uint8_t partial_block_buffer[SHA1_BLOCK_SIZE * 2];
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb.c b/arch/x86/crypto/sha256-mb/sha256_mb.c
index 6f97fb33ae21..7926a226b120 100644
--- a/arch/x86/crypto/sha256-mb/sha256_mb.c
+++ b/arch/x86/crypto/sha256-mb/sha256_mb.c
@@ -115,7 +115,7 @@ inline void sha256_init_digest(uint32_t *digest)
}
inline uint32_t sha256_pad(uint8_t padblock[SHA256_BLOCK_SIZE * 2],
- uint32_t total_len)
+ uint64_t total_len)
{
uint32_t i = total_len & (SHA256_BLOCK_SIZE - 1);
diff --git a/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h b/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h
index edd252b73206..aabb30320af0 100644
--- a/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h
+++ b/arch/x86/crypto/sha256-mb/sha256_mb_ctx.h
@@ -125,7 +125,7 @@ struct sha256_hash_ctx {
/* error flag */
int error;
- uint32_t total_length;
+ uint64_t total_length;
const void *incoming_buffer;
uint32_t incoming_buffer_length;
uint8_t partial_block_buffer[SHA256_BLOCK_SIZE * 2];
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb.c b/arch/x86/crypto/sha512-mb/sha512_mb.c
index d210174a52b0..9c1bb6d58141 100644
--- a/arch/x86/crypto/sha512-mb/sha512_mb.c
+++ b/arch/x86/crypto/sha512-mb/sha512_mb.c
@@ -117,7 +117,7 @@ inline void sha512_init_digest(uint64_t *digest)
}
inline uint32_t sha512_pad(uint8_t padblock[SHA512_BLOCK_SIZE * 2],
- uint32_t total_len)
+ uint64_t total_len)
{
uint32_t i = total_len & (SHA512_BLOCK_SIZE - 1);
diff --git a/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h b/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h
index 9d4b2c8208d5..e4653f5eec3f 100644
--- a/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h
+++ b/arch/x86/crypto/sha512-mb/sha512_mb_ctx.h
@@ -119,7 +119,7 @@ struct sha512_hash_ctx {
/* error flag */
int error;
- uint32_t total_length;
+ uint64_t total_length;
const void *incoming_buffer;
uint32_t incoming_buffer_length;
uint8_t partial_block_buffer[SHA512_BLOCK_SIZE * 2];
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 9a9e5884066c..05ed3d393da7 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -90,8 +90,8 @@ For 32-bit we have the following conventions - kernel is built with
#define SIZEOF_PTREGS 21*8
- .macro ALLOC_PT_GPREGS_ON_STACK addskip=0
- addq $-(15*8+\addskip), %rsp
+ .macro ALLOC_PT_GPREGS_ON_STACK
+ addq $-(15*8), %rsp
.endm
.macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1
@@ -147,15 +147,6 @@ For 32-bit we have the following conventions - kernel is built with
movq 5*8+\offset(%rsp), %rbx
.endm
- .macro ZERO_EXTRA_REGS
- xorl %r15d, %r15d
- xorl %r14d, %r14d
- xorl %r13d, %r13d
- xorl %r12d, %r12d
- xorl %ebp, %ebp
- xorl %ebx, %ebx
- .endm
-
.macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
.if \rstor_r11
movq 6*8(%rsp), %r11
@@ -201,6 +192,26 @@ For 32-bit we have the following conventions - kernel is built with
.byte 0xf1
.endm
+/*
+ * This is a sneaky trick to help the unwinder find pt_regs on the stack. The
+ * frame pointer is replaced with an encoded pointer to pt_regs. The encoding
+ * is just setting the LSB, which makes it an invalid stack address and is also
+ * a signal to the unwinder that it's a pt_regs pointer in disguise.
+ *
+ * NOTE: This macro must be used *after* SAVE_EXTRA_REGS because it corrupts
+ * the original rbp.
+ */
+.macro ENCODE_FRAME_POINTER ptregs_offset=0
+#ifdef CONFIG_FRAME_POINTER
+ .if \ptregs_offset
+ leaq \ptregs_offset(%rsp), %rbp
+ .else
+ mov %rsp, %rbp
+ .endif
+ orq $0x1, %rbp
+#endif
+.endm
+
#endif /* CONFIG_X86_64 */
/*
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 21b352a11b49..701d29f8e4d3 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -45,6 +45,7 @@
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/export.h>
+#include <asm/frame.h>
.section .entry.text, "ax"
@@ -175,6 +176,22 @@
SET_KERNEL_GS %edx
.endm
+/*
+ * This is a sneaky trick to help the unwinder find pt_regs on the stack. The
+ * frame pointer is replaced with an encoded pointer to pt_regs. The encoding
+ * is just setting the LSB, which makes it an invalid stack address and is also
+ * a signal to the unwinder that it's a pt_regs pointer in disguise.
+ *
+ * NOTE: This macro must be used *after* SAVE_ALL because it corrupts the
+ * original rbp.
+ */
+.macro ENCODE_FRAME_POINTER
+#ifdef CONFIG_FRAME_POINTER
+ mov %esp, %ebp
+ orl $0x1, %ebp
+#endif
+.endm
+
.macro RESTORE_INT_REGS
popl %ebx
popl %ecx
@@ -238,6 +255,23 @@ ENTRY(__switch_to_asm)
END(__switch_to_asm)
/*
+ * The unwinder expects the last frame on the stack to always be at the same
+ * offset from the end of the page, which allows it to validate the stack.
+ * Calling schedule_tail() directly would break that convention because its an
+ * asmlinkage function so its argument has to be pushed on the stack. This
+ * wrapper creates a proper "end of stack" frame header before the call.
+ */
+ENTRY(schedule_tail_wrapper)
+ FRAME_BEGIN
+
+ pushl %eax
+ call schedule_tail
+ popl %eax
+
+ FRAME_END
+ ret
+ENDPROC(schedule_tail_wrapper)
+/*
* A newly forked process directly context switches into this address.
*
* eax: prev task we switched from
@@ -245,9 +279,7 @@ END(__switch_to_asm)
* edi: kernel thread arg
*/
ENTRY(ret_from_fork)
- pushl %eax
- call schedule_tail
- popl %eax
+ call schedule_tail_wrapper
testl %ebx, %ebx
jnz 1f /* kernel threads are uncommon */
@@ -307,13 +339,13 @@ END(ret_from_exception)
#ifdef CONFIG_PREEMPT
ENTRY(resume_kernel)
DISABLE_INTERRUPTS(CLBR_ANY)
-need_resched:
+.Lneed_resched:
cmpl $0, PER_CPU_VAR(__preempt_count)
jnz restore_all
testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ?
jz restore_all
call preempt_schedule_irq
- jmp need_resched
+ jmp .Lneed_resched
END(resume_kernel)
#endif
@@ -334,7 +366,7 @@ GLOBAL(__begin_SYSENTER_singlestep_region)
*/
ENTRY(xen_sysenter_target)
addl $5*4, %esp /* remove xen-provided frame */
- jmp sysenter_past_esp
+ jmp .Lsysenter_past_esp
#endif
/*
@@ -371,7 +403,7 @@ ENTRY(xen_sysenter_target)
*/
ENTRY(entry_SYSENTER_32)
movl TSS_sysenter_sp0(%esp), %esp
-sysenter_past_esp:
+.Lsysenter_past_esp:
pushl $__USER_DS /* pt_regs->ss */
pushl %ebp /* pt_regs->sp (stashed in bp) */
pushfl /* pt_regs->flags (except IF = 0) */
@@ -504,9 +536,9 @@ ENTRY(entry_INT80_32)
restore_all:
TRACE_IRQS_IRET
-restore_all_notrace:
+.Lrestore_all_notrace:
#ifdef CONFIG_X86_ESPFIX32
- ALTERNATIVE "jmp restore_nocheck", "", X86_BUG_ESPFIX
+ ALTERNATIVE "jmp .Lrestore_nocheck", "", X86_BUG_ESPFIX
movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
/*
@@ -518,22 +550,23 @@ restore_all_notrace:
movb PT_CS(%esp), %al
andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
- je ldt_ss # returning to user-space with LDT SS
+ je .Lldt_ss # returning to user-space with LDT SS
#endif
-restore_nocheck:
+.Lrestore_nocheck:
RESTORE_REGS 4 # skip orig_eax/error_code
-irq_return:
+.Lirq_return:
INTERRUPT_RETURN
+
.section .fixup, "ax"
ENTRY(iret_exc )
pushl $0 # no error code
pushl $do_iret_error
- jmp error_code
+ jmp common_exception
.previous
- _ASM_EXTABLE(irq_return, iret_exc)
+ _ASM_EXTABLE(.Lirq_return, iret_exc)
#ifdef CONFIG_X86_ESPFIX32
-ldt_ss:
+.Lldt_ss:
/*
* Setup and switch to ESPFIX stack
*
@@ -562,7 +595,7 @@ ldt_ss:
*/
DISABLE_INTERRUPTS(CLBR_EAX)
lss (%esp), %esp /* switch to espfix segment */
- jmp restore_nocheck
+ jmp .Lrestore_nocheck
#endif
ENDPROC(entry_INT80_32)
@@ -624,6 +657,7 @@ common_interrupt:
ASM_CLAC
addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */
SAVE_ALL
+ ENCODE_FRAME_POINTER
TRACE_IRQS_OFF
movl %esp, %eax
call do_IRQ
@@ -635,6 +669,7 @@ ENTRY(name) \
ASM_CLAC; \
pushl $~(nr); \
SAVE_ALL; \
+ ENCODE_FRAME_POINTER; \
TRACE_IRQS_OFF \
movl %esp, %eax; \
call fn; \
@@ -659,7 +694,7 @@ ENTRY(coprocessor_error)
ASM_CLAC
pushl $0
pushl $do_coprocessor_error
- jmp error_code
+ jmp common_exception
END(coprocessor_error)
ENTRY(simd_coprocessor_error)
@@ -673,14 +708,14 @@ ENTRY(simd_coprocessor_error)
#else
pushl $do_simd_coprocessor_error
#endif
- jmp error_code
+ jmp common_exception
END(simd_coprocessor_error)
ENTRY(device_not_available)
ASM_CLAC
pushl $-1 # mark this as an int
pushl $do_device_not_available
- jmp error_code
+ jmp common_exception
END(device_not_available)
#ifdef CONFIG_PARAVIRT
@@ -694,59 +729,59 @@ ENTRY(overflow)
ASM_CLAC
pushl $0
pushl $do_overflow
- jmp error_code
+ jmp common_exception
END(overflow)
ENTRY(bounds)
ASM_CLAC
pushl $0
pushl $do_bounds
- jmp error_code
+ jmp common_exception
END(bounds)
ENTRY(invalid_op)
ASM_CLAC
pushl $0
pushl $do_invalid_op
- jmp error_code
+ jmp common_exception
END(invalid_op)
ENTRY(coprocessor_segment_overrun)
ASM_CLAC
pushl $0
pushl $do_coprocessor_segment_overrun
- jmp error_code
+ jmp common_exception
END(coprocessor_segment_overrun)
ENTRY(invalid_TSS)
ASM_CLAC
pushl $do_invalid_TSS
- jmp error_code
+ jmp common_exception
END(invalid_TSS)
ENTRY(segment_not_present)
ASM_CLAC
pushl $do_segment_not_present
- jmp error_code
+ jmp common_exception
END(segment_not_present)
ENTRY(stack_segment)
ASM_CLAC
pushl $do_stack_segment
- jmp error_code
+ jmp common_exception
END(stack_segment)
ENTRY(alignment_check)
ASM_CLAC
pushl $do_alignment_check
- jmp error_code
+ jmp common_exception
END(alignment_check)
ENTRY(divide_error)
ASM_CLAC
pushl $0 # no error code
pushl $do_divide_error
- jmp error_code
+ jmp common_exception
END(divide_error)
#ifdef CONFIG_X86_MCE
@@ -754,7 +789,7 @@ ENTRY(machine_check)
ASM_CLAC
pushl $0
pushl machine_check_vector
- jmp error_code
+ jmp common_exception
END(machine_check)
#endif
@@ -762,13 +797,14 @@ ENTRY(spurious_interrupt_bug)
ASM_CLAC
pushl $0
pushl $do_spurious_interrupt_bug
- jmp error_code
+ jmp common_exception
END(spurious_interrupt_bug)
#ifdef CONFIG_XEN
ENTRY(xen_hypervisor_callback)
pushl $-1 /* orig_ax = -1 => not a system call */
SAVE_ALL
+ ENCODE_FRAME_POINTER
TRACE_IRQS_OFF
/*
@@ -823,6 +859,7 @@ ENTRY(xen_failsafe_callback)
jmp iret_exc
5: pushl $-1 /* orig_ax = -1 => not a system call */
SAVE_ALL
+ ENCODE_FRAME_POINTER
jmp ret_from_exception
.section .fixup, "ax"
@@ -882,15 +919,15 @@ ftrace_call:
popl %edx
popl %ecx
popl %eax
-ftrace_ret:
+.Lftrace_ret:
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
.globl ftrace_graph_call
ftrace_graph_call:
jmp ftrace_stub
#endif
-.globl ftrace_stub
-ftrace_stub:
+/* This is weak to keep gas from relaxing the jumps */
+WEAK(ftrace_stub)
ret
END(ftrace_caller)
@@ -952,7 +989,7 @@ GLOBAL(ftrace_regs_call)
popl %gs
addl $8, %esp /* Skip orig_ax and ip */
popf /* Pop flags at end (no addl to corrupt flags) */
- jmp ftrace_ret
+ jmp .Lftrace_ret
popf
jmp ftrace_stub
@@ -963,7 +1000,7 @@ ENTRY(mcount)
jb ftrace_stub /* Paging not enabled yet? */
cmpl $ftrace_stub, ftrace_trace_function
- jnz trace
+ jnz .Ltrace
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
cmpl $ftrace_stub, ftrace_graph_return
jnz ftrace_graph_caller
@@ -976,7 +1013,7 @@ ftrace_stub:
ret
/* taken from glibc */
-trace:
+.Ltrace:
pushl %eax
pushl %ecx
pushl %edx
@@ -1027,7 +1064,7 @@ return_to_handler:
ENTRY(trace_page_fault)
ASM_CLAC
pushl $trace_do_page_fault
- jmp error_code
+ jmp common_exception
END(trace_page_fault)
#endif
@@ -1035,7 +1072,10 @@ ENTRY(page_fault)
ASM_CLAC
pushl $do_page_fault
ALIGN
-error_code:
+ jmp common_exception
+END(page_fault)
+
+common_exception:
/* the function address is in %gs's slot on the stack */
pushl %fs
pushl %es
@@ -1047,6 +1087,7 @@ error_code:
pushl %edx
pushl %ecx
pushl %ebx
+ ENCODE_FRAME_POINTER
cld
movl $(__KERNEL_PERCPU), %ecx
movl %ecx, %fs
@@ -1064,7 +1105,7 @@ error_code:
movl %esp, %eax # pt_regs pointer
call *%edi
jmp ret_from_exception
-END(page_fault)
+END(common_exception)
ENTRY(debug)
/*
@@ -1079,6 +1120,7 @@ ENTRY(debug)
ASM_CLAC
pushl $-1 # mark this as an int
SAVE_ALL
+ ENCODE_FRAME_POINTER
xorl %edx, %edx # error code 0
movl %esp, %eax # pt_regs pointer
@@ -1094,11 +1136,11 @@ ENTRY(debug)
.Ldebug_from_sysenter_stack:
/* We're on the SYSENTER stack. Switch off. */
- movl %esp, %ebp
+ movl %esp, %ebx
movl PER_CPU_VAR(cpu_current_top_of_stack), %esp
TRACE_IRQS_OFF
call do_debug
- movl %ebp, %esp
+ movl %ebx, %esp
jmp ret_from_exception
END(debug)
@@ -1116,11 +1158,12 @@ ENTRY(nmi)
movl %ss, %eax
cmpw $__ESPFIX_SS, %ax
popl %eax
- je nmi_espfix_stack
+ je .Lnmi_espfix_stack
#endif
pushl %eax # pt_regs->orig_ax
SAVE_ALL
+ ENCODE_FRAME_POINTER
xorl %edx, %edx # zero error code
movl %esp, %eax # pt_regs pointer
@@ -1132,21 +1175,21 @@ ENTRY(nmi)
/* Not on SYSENTER stack. */
call do_nmi
- jmp restore_all_notrace
+ jmp .Lrestore_all_notrace
.Lnmi_from_sysenter_stack:
/*
* We're on the SYSENTER stack. Switch off. No one (not even debug)
* is using the thread stack right now, so it's safe for us to use it.
*/
- movl %esp, %ebp
+ movl %esp, %ebx
movl PER_CPU_VAR(cpu_current_top_of_stack), %esp
call do_nmi
- movl %ebp, %esp
- jmp restore_all_notrace
+ movl %ebx, %esp
+ jmp .Lrestore_all_notrace
#ifdef CONFIG_X86_ESPFIX32
-nmi_espfix_stack:
+.Lnmi_espfix_stack:
/*
* create the pointer to lss back
*/
@@ -1159,12 +1202,13 @@ nmi_espfix_stack:
.endr
pushl %eax
SAVE_ALL
+ ENCODE_FRAME_POINTER
FIXUP_ESPFIX_STACK # %eax == %esp
xorl %edx, %edx # zero error code
call do_nmi
RESTORE_REGS
lss 12+4(%esp), %esp # back to espfix stack
- jmp irq_return
+ jmp .Lirq_return
#endif
END(nmi)
@@ -1172,6 +1216,7 @@ ENTRY(int3)
ASM_CLAC
pushl $-1 # mark this as an int
SAVE_ALL
+ ENCODE_FRAME_POINTER
TRACE_IRQS_OFF
xorl %edx, %edx # zero error code
movl %esp, %eax # pt_regs pointer
@@ -1181,14 +1226,14 @@ END(int3)
ENTRY(general_protection)
pushl $do_general_protection
- jmp error_code
+ jmp common_exception
END(general_protection)
#ifdef CONFIG_KVM_GUEST
ENTRY(async_page_fault)
ASM_CLAC
pushl $do_async_page_fault
- jmp error_code
+ jmp common_exception
END(async_page_fault)
#endif
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index ef766a358b37..5b219707c2f2 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -38,12 +38,6 @@
#include <asm/export.h>
#include <linux/err.h>
-/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
-#include <linux/elf-em.h>
-#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
-#define __AUDIT_ARCH_64BIT 0x80000000
-#define __AUDIT_ARCH_LE 0x40000000
-
.code64
.section .entry.text, "ax"
@@ -469,6 +463,7 @@ END(irq_entries_start)
ALLOC_PT_GPREGS_ON_STACK
SAVE_C_REGS
SAVE_EXTRA_REGS
+ ENCODE_FRAME_POINTER
testb $3, CS(%rsp)
jz 1f
@@ -985,6 +980,7 @@ ENTRY(xen_failsafe_callback)
ALLOC_PT_GPREGS_ON_STACK
SAVE_C_REGS
SAVE_EXTRA_REGS
+ ENCODE_FRAME_POINTER
jmp error_exit
END(xen_failsafe_callback)
@@ -1028,6 +1024,7 @@ ENTRY(paranoid_entry)
cld
SAVE_C_REGS 8
SAVE_EXTRA_REGS 8
+ ENCODE_FRAME_POINTER 8
movl $1, %ebx
movl $MSR_GS_BASE, %ecx
rdmsr
@@ -1075,6 +1072,7 @@ ENTRY(error_entry)
cld
SAVE_C_REGS 8
SAVE_EXTRA_REGS 8
+ ENCODE_FRAME_POINTER 8
xorl %ebx, %ebx
testb $3, CS+8(%rsp)
jz .Lerror_kernelspace
@@ -1257,6 +1255,7 @@ ENTRY(nmi)
pushq %r13 /* pt_regs->r13 */
pushq %r14 /* pt_regs->r14 */
pushq %r15 /* pt_regs->r15 */
+ ENCODE_FRAME_POINTER
/*
* At this point we no longer need to worry about stack damage
@@ -1270,11 +1269,10 @@ ENTRY(nmi)
/*
* Return back to user mode. We must *not* do the normal exit
- * work, because we don't want to enable interrupts. Fortunately,
- * do_nmi doesn't modify pt_regs.
+ * work, because we don't want to enable interrupts.
*/
SWAPGS
- jmp restore_c_regs_and_iret
+ jmp restore_regs_and_iret
.Lnmi_from_kernel:
/*
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 23c881caabd1..40121d14d34d 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -109,7 +109,7 @@ static int vvar_fault(const struct vm_special_mapping *sm,
return VM_FAULT_SIGBUS;
if (sym_offset == image->sym_vvar_page) {
- ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address,
+ ret = vm_insert_pfn(vma, vmf->address,
__pa_symbol(&__vvar_page) >> PAGE_SHIFT);
} else if (sym_offset == image->sym_pvclock_page) {
struct pvclock_vsyscall_time_info *pvti =
@@ -117,7 +117,7 @@ static int vvar_fault(const struct vm_special_mapping *sm,
if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) {
ret = vm_insert_pfn(
vma,
- (unsigned long)vmf->virtual_address,
+ vmf->address,
__pa(pvti) >> PAGE_SHIFT);
}
}
@@ -161,8 +161,6 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr)
}
text_start = addr - image->sym_vvar_start;
- current->mm->context.vdso = (void __user *)text_start;
- current->mm->context.vdso_image = image;
/*
* MAYWRITE to allow gdb to COW and set breakpoints
@@ -189,14 +187,12 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr)
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
do_munmap(mm, text_start, image->size);
+ } else {
+ current->mm->context.vdso = (void __user *)text_start;
+ current->mm->context.vdso_image = image;
}
up_fail:
- if (ret) {
- current->mm->context.vdso = NULL;
- current->mm->context.vdso_image = NULL;
- }
-
up_write(&mm->mmap_sem);
return ret;
}
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 6e395c996900..f1c22584a46f 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -365,7 +365,11 @@ int x86_add_exclusive(unsigned int what)
{
int i;
- if (x86_pmu.lbr_pt_coexist)
+ /*
+ * When lbr_pt_coexist we allow PT to coexist with either LBR or BTS.
+ * LBR and BTS are still mutually exclusive.
+ */
+ if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt)
return 0;
if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) {
@@ -388,7 +392,7 @@ fail_unlock:
void x86_del_exclusive(unsigned int what)
{
- if (x86_pmu.lbr_pt_coexist)
+ if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt)
return;
atomic_dec(&x86_pmu.lbr_exclusive[what]);
@@ -2299,7 +2303,7 @@ valid_user_frame(const void __user *fp, unsigned long size)
static unsigned long get_segment_base(unsigned int segment)
{
struct desc_struct *desc;
- int idx = segment >> 3;
+ unsigned int idx = segment >> 3;
if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
#ifdef CONFIG_MODIFY_LDT_SYSCALL
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index cb8522290e6a..86138267b68a 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2110,6 +2110,27 @@ again:
GLOBAL_STATUS_LBRS_FROZEN);
if (!status)
goto done;
+ /*
+ * In case multiple PEBS events are sampled at the same time,
+ * it is possible to have GLOBAL_STATUS bit 62 set indicating
+ * PEBS buffer overflow and also seeing at most 3 PEBS counters
+ * having their bits set in the status register. This is a sign
+ * that there was at least one PEBS record pending at the time
+ * of the PMU interrupt. PEBS counters must only be processed
+ * via the drain_pebs() calls and not via the regular sample
+ * processing loop coming after that the function, otherwise
+ * phony regular samples may be generated in the sampling buffer
+ * not marked with the EXACT tag. Another possibility is to have
+ * one PEBS event and at least one non-PEBS event whic hoverflows
+ * while PEBS has armed. In this case, bit 62 of GLOBAL_STATUS will
+ * not be set, yet the overflow status bit for the PEBS counter will
+ * be on Skylake.
+ *
+ * To avoid this problem, we systematically ignore the PEBS-enabled
+ * counters from the GLOBAL_STATUS mask and we always process PEBS
+ * events via drain_pebs().
+ */
+ status &= ~cpuc->pebs_enabled;
/*
* PEBS overflow sets bit 62 in the global status register
@@ -2117,15 +2138,6 @@ again:
if (__test_and_clear_bit(62, (unsigned long *)&status)) {
handled++;
x86_pmu.drain_pebs(regs);
- /*
- * There are cases where, even though, the PEBS ovfl bit is set
- * in GLOBAL_OVF_STATUS, the PEBS events may also have their
- * overflow bits set for their counters. We must clear them
- * here because they have been processed as exact samples in
- * the drain_pebs() routine. They must not be processed again
- * in the for_each_bit_set() loop for regular samples below.
- */
- status &= ~cpuc->pebs_enabled;
status &= x86_pmu.intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI;
}
diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c
index 8f82b02934fa..0c45cc8e64ba 100644
--- a/arch/x86/events/intel/cqm.c
+++ b/arch/x86/events/intel/cqm.c
@@ -7,9 +7,9 @@
#include <linux/perf_event.h>
#include <linux/slab.h>
#include <asm/cpu_device_id.h>
+#include <asm/intel_rdt_common.h>
#include "../perf_event.h"
-#define MSR_IA32_PQR_ASSOC 0x0c8f
#define MSR_IA32_QM_CTR 0x0c8e
#define MSR_IA32_QM_EVTSEL 0x0c8d
@@ -24,32 +24,13 @@ static unsigned int cqm_l3_scale; /* supposedly cacheline size */
static bool cqm_enabled, mbm_enabled;
unsigned int mbm_socket_max;
-/**
- * struct intel_pqr_state - State cache for the PQR MSR
- * @rmid: The cached Resource Monitoring ID
- * @closid: The cached Class Of Service ID
- * @rmid_usecnt: The usage counter for rmid
- *
- * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the
- * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always
- * contains both parts, so we need to cache them.
- *
- * The cache also helps to avoid pointless updates if the value does
- * not change.
- */
-struct intel_pqr_state {
- u32 rmid;
- u32 closid;
- int rmid_usecnt;
-};
-
/*
* The cached intel_pqr_state is strictly per CPU and can never be
* updated from a remote CPU. Both functions which modify the state
* (intel_cqm_event_start and intel_cqm_event_stop) are called with
* interrupts disabled, which is sufficient for the protection.
*/
-static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
+DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
static struct hrtimer *mbm_timers;
/**
* struct sample - mbm event's (local or total) data
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index c5047b8f777b..1c1b9fe705c8 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -36,13 +36,6 @@ static DEFINE_PER_CPU(struct pt, pt_ctx);
static struct pt_pmu pt_pmu;
-enum cpuid_regs {
- CR_EAX = 0,
- CR_ECX,
- CR_EDX,
- CR_EBX
-};
-
/*
* Capabilities of Intel PT hardware, such as number of address bits or
* supported output schemes, are cached and exported to userspace as "caps"
@@ -64,21 +57,21 @@ static struct pt_cap_desc {
u8 reg;
u32 mask;
} pt_caps[] = {
- PT_CAP(max_subleaf, 0, CR_EAX, 0xffffffff),
- PT_CAP(cr3_filtering, 0, CR_EBX, BIT(0)),
- PT_CAP(psb_cyc, 0, CR_EBX, BIT(1)),
- PT_CAP(ip_filtering, 0, CR_EBX, BIT(2)),
- PT_CAP(mtc, 0, CR_EBX, BIT(3)),
- PT_CAP(ptwrite, 0, CR_EBX, BIT(4)),
- PT_CAP(power_event_trace, 0, CR_EBX, BIT(5)),
- PT_CAP(topa_output, 0, CR_ECX, BIT(0)),
- PT_CAP(topa_multiple_entries, 0, CR_ECX, BIT(1)),
- PT_CAP(single_range_output, 0, CR_ECX, BIT(2)),
- PT_CAP(payloads_lip, 0, CR_ECX, BIT(31)),
- PT_CAP(num_address_ranges, 1, CR_EAX, 0x3),
- PT_CAP(mtc_periods, 1, CR_EAX, 0xffff0000),
- PT_CAP(cycle_thresholds, 1, CR_EBX, 0xffff),
- PT_CAP(psb_periods, 1, CR_EBX, 0xffff0000),
+ PT_CAP(max_subleaf, 0, CPUID_EAX, 0xffffffff),
+ PT_CAP(cr3_filtering, 0, CPUID_EBX, BIT(0)),
+ PT_CAP(psb_cyc, 0, CPUID_EBX, BIT(1)),
+ PT_CAP(ip_filtering, 0, CPUID_EBX, BIT(2)),
+ PT_CAP(mtc, 0, CPUID_EBX, BIT(3)),
+ PT_CAP(ptwrite, 0, CPUID_EBX, BIT(4)),
+ PT_CAP(power_event_trace, 0, CPUID_EBX, BIT(5)),
+ PT_CAP(topa_output, 0, CPUID_ECX, BIT(0)),
+ PT_CAP(topa_multiple_entries, 0, CPUID_ECX, BIT(1)),
+ PT_CAP(single_range_output, 0, CPUID_ECX, BIT(2)),
+ PT_CAP(payloads_lip, 0, CPUID_ECX, BIT(31)),
+ PT_CAP(num_address_ranges, 1, CPUID_EAX, 0x3),
+ PT_CAP(mtc_periods, 1, CPUID_EAX, 0xffff0000),
+ PT_CAP(cycle_thresholds, 1, CPUID_EBX, 0xffff),
+ PT_CAP(psb_periods, 1, CPUID_EBX, 0xffff0000),
};
static u32 pt_cap_get(enum pt_capabilities cap)
@@ -213,10 +206,10 @@ static int __init pt_pmu_hw_init(void)
for (i = 0; i < PT_CPUID_LEAVES; i++) {
cpuid_count(20, i,
- &pt_pmu.caps[CR_EAX + i*PT_CPUID_REGS_NUM],
- &pt_pmu.caps[CR_EBX + i*PT_CPUID_REGS_NUM],
- &pt_pmu.caps[CR_ECX + i*PT_CPUID_REGS_NUM],
- &pt_pmu.caps[CR_EDX + i*PT_CPUID_REGS_NUM]);
+ &pt_pmu.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM],
+ &pt_pmu.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM],
+ &pt_pmu.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM],
+ &pt_pmu.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM]);
}
ret = -ENOMEM;
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 272427700d48..e6832be714bc 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -669,7 +669,7 @@ static struct event_constraint snbep_uncore_cbox_constraints[] = {
UNCORE_EVENT_CONSTRAINT(0x1c, 0xc),
UNCORE_EVENT_CONSTRAINT(0x1d, 0xc),
UNCORE_EVENT_CONSTRAINT(0x1e, 0xc),
- EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff),
+ UNCORE_EVENT_CONSTRAINT(0x1f, 0xe),
UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index a77ee026643d..bcbb1d2ae10b 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -604,7 +604,7 @@ struct x86_pmu {
u64 lbr_sel_mask; /* LBR_SELECT valid bits */
const int *lbr_sel_map; /* lbr_select mappings */
bool lbr_double_abort; /* duplicated lbr aborts */
- bool lbr_pt_coexist; /* LBR may coexist with PT */
+ bool lbr_pt_coexist; /* (LBR|BTS) may coexist with PT */
/*
* Intel PT/LBR/BTS are exclusive
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 2cfed174e3c9..2b892e2313a9 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -6,10 +6,6 @@ generated-y += unistd_32_ia32.h
generated-y += unistd_64_x32.h
generated-y += xen-hypercalls.h
-genhdr-y += unistd_32.h
-genhdr-y += unistd_64.h
-genhdr-y += unistd_x32.h
-
generic-y += clkdev.h
generic-y += cputime.h
generic-y += dma-contiguous.h
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 5391b0ae7cc3..395b69551fce 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -94,7 +94,7 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
boot_cpu_data.x86_model <= 0x05 &&
boot_cpu_data.x86_mask < 0x0A)
return 1;
- else if (amd_e400_c1e_detected)
+ else if (boot_cpu_has(X86_BUG_AMD_APIC_C1E))
return 1;
else
return max_cstate;
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 5e828da2e18f..00c88a01301d 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -21,6 +21,10 @@ extern int amd_numa_init(void);
extern int amd_get_subcaches(int);
extern int amd_set_subcaches(int, unsigned long);
+extern int amd_smn_read(u16 node, u32 address, u32 *value);
+extern int amd_smn_write(u16 node, u32 address, u32 value);
+extern int amd_df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo);
+
struct amd_l3_cache {
unsigned indices;
u8 subcaches[4];
@@ -55,6 +59,7 @@ struct threshold_bank {
};
struct amd_northbridge {
+ struct pci_dev *root;
struct pci_dev *misc;
struct pci_dev *link;
struct amd_l3_cache l3_cache;
@@ -66,7 +71,6 @@ struct amd_northbridge_info {
u64 flags;
struct amd_northbridge *nb;
};
-extern struct amd_northbridge_info amd_northbridges;
#define AMD_NB_GART BIT(0)
#define AMD_NB_L3_INDEX_DISABLE BIT(1)
@@ -74,20 +78,9 @@ extern struct amd_northbridge_info amd_northbridges;
#ifdef CONFIG_AMD_NB
-static inline u16 amd_nb_num(void)
-{
- return amd_northbridges.num;
-}
-
-static inline bool amd_nb_has_feature(unsigned feature)
-{
- return ((amd_northbridges.flags & feature) == feature);
-}
-
-static inline struct amd_northbridge *node_to_amd_nb(int node)
-{
- return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL;
-}
+u16 amd_nb_num(void);
+bool amd_nb_has_feature(unsigned int feature);
+struct amd_northbridge *node_to_amd_nb(int node);
static inline u16 amd_pci_dev_to_node_id(struct pci_dev *pdev)
{
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index f5aaf6c83222..0c5fbc68e82d 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -11,7 +11,6 @@
#include <asm/fixmap.h>
#include <asm/mpspec.h>
#include <asm/msr.h>
-#include <asm/idle.h>
#define ARCH_APICTIMER_STOPS_ON_C3 1
@@ -196,7 +195,7 @@ static inline void native_apic_msr_write(u32 reg, u32 v)
static inline void native_apic_msr_eoi_write(u32 reg, u32 v)
{
- wrmsr(APIC_BASE_MSR + (APIC_EOI >> 4), APIC_EOI_ACK, 0);
+ wrmsr_notrace(APIC_BASE_MSR + (APIC_EOI >> 4), APIC_EOI_ACK, 0);
}
static inline u32 native_apic_msr_read(u32 reg)
@@ -332,6 +331,7 @@ struct apic {
* on write for EOI.
*/
void (*eoi_write)(u32 reg, u32 v);
+ void (*native_eoi_write)(u32 reg, u32 v);
u64 (*icr_read)(void);
void (*icr_write)(u32 low, u32 high);
void (*wait_icr_idle)(void);
@@ -639,7 +639,6 @@ extern void irq_exit(void);
static inline void entering_irq(void)
{
irq_enter();
- exit_idle();
}
static inline void entering_ack_irq(void)
diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h
new file mode 100644
index 000000000000..44b8762fa0c7
--- /dev/null
+++ b/arch/x86/include/asm/asm-prototypes.h
@@ -0,0 +1,16 @@
+#include <asm/ftrace.h>
+#include <asm/uaccess.h>
+#include <asm/string.h>
+#include <asm/page.h>
+#include <asm/checksum.h>
+
+#include <asm-generic/asm-prototypes.h>
+
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/special_insns.h>
+#include <asm/preempt.h>
+
+#ifndef CONFIG_X86_CMPXCHG64
+extern void cmpxchg8b_emu(void);
+#endif
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 1d2b69fc0ceb..d59c15c3defd 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -204,6 +204,7 @@ static __always_inline __pure bool _static_cpu_has(u16 bit)
#define static_cpu_has_bug(bit) static_cpu_has((bit))
#define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit))
+#define boot_cpu_set_bug(bit) set_cpu_cap(&boot_cpu_data, (bit))
#define MAX_CPU_FEATURES (NCAPINTS * 32)
#define cpu_have_feature boot_cpu_has
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index a39629206864..eafee3161d1c 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -104,8 +104,8 @@
#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */
#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */
#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */
-#define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */
#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
+#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */
/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */
@@ -189,10 +189,14 @@
#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
+#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */
+#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */
+#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */
#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */
#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
+#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */
#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
#define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */
#define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */
@@ -221,11 +225,13 @@
#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */
#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */
#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */
+#define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */
#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */
#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */
#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */
#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */
#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */
+#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */
#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */
#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */
@@ -279,8 +285,10 @@
#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */
+#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */
#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */
+#define X86_FEATURE_RDPID (16*32+ 22) /* RDPID instruction */
/* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */
#define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */
@@ -311,4 +319,6 @@
#define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */
#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */
#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */
+#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */
+
#endif /* _ASM_X86_CPUFEATURES_H */
diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h
index 03bb1065c335..29e53ea7d764 100644
--- a/arch/x86/include/asm/crypto/glue_helper.h
+++ b/arch/x86/include/asm/crypto/glue_helper.h
@@ -5,8 +5,8 @@
#ifndef _CRYPTO_GLUE_HELPER_H
#define _CRYPTO_GLUE_HELPER_H
+#include <crypto/internal/skcipher.h>
#include <linux/kernel.h>
-#include <linux/crypto.h>
#include <asm/fpu/api.h>
#include <crypto/b128ops.h>
@@ -69,6 +69,31 @@ static inline bool glue_fpu_begin(unsigned int bsize, int fpu_blocks_limit,
return true;
}
+static inline bool glue_skwalk_fpu_begin(unsigned int bsize,
+ int fpu_blocks_limit,
+ struct skcipher_walk *walk,
+ bool fpu_enabled, unsigned int nbytes)
+{
+ if (likely(fpu_blocks_limit < 0))
+ return false;
+
+ if (fpu_enabled)
+ return true;
+
+ /*
+ * Vector-registers are only used when chunk to be processed is large
+ * enough, so do not enable FPU until it is necessary.
+ */
+ if (nbytes < bsize * (unsigned int)fpu_blocks_limit)
+ return false;
+
+ /* prevent sleeping if FPU is in use */
+ skcipher_walk_atomise(walk);
+
+ kernel_fpu_begin();
+ return true;
+}
+
static inline void glue_fpu_end(bool fpu_enabled)
{
if (fpu_enabled)
@@ -139,6 +164,18 @@ extern int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
common_glue_func_t tweak_fn, void *tweak_ctx,
void *crypt_ctx);
+extern int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx,
+ struct blkcipher_desc *desc,
+ struct scatterlist *dst,
+ struct scatterlist *src, unsigned int nbytes,
+ common_glue_func_t tweak_fn, void *tweak_ctx,
+ void *crypt_ctx);
+
+extern int glue_xts_req_128bit(const struct common_glue_ctx *gctx,
+ struct skcipher_request *req,
+ common_glue_func_t tweak_fn, void *tweak_ctx,
+ void *crypt_ctx);
+
extern void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src,
le128 *iv, common_glue_func_t fn);
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 476b574de99e..ec23d8e1297c 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -1,13 +1,17 @@
#ifndef _ASM_X86_E820_H
#define _ASM_X86_E820_H
-#ifdef CONFIG_EFI
+/*
+ * E820_X_MAX is the maximum size of the extended E820 table. The extended
+ * table may contain up to 3 extra E820 entries per possible NUMA node, so we
+ * make room for 3 * MAX_NUMNODES possible entries, beyond the standard 128.
+ * Also note that E820_X_MAX *must* be defined before we include uapi/asm/e820.h.
+ */
#include <linux/numa.h>
#define E820_X_MAX (E820MAX + 3 * MAX_NUMNODES)
-#else /* ! CONFIG_EFI */
-#define E820_X_MAX E820MAX
-#endif
+
#include <uapi/asm/e820.h>
+
#ifndef __ASSEMBLY__
/* see comment in arch/x86/kernel/e820.c */
extern struct e820map *e820;
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 389d700b961e..e99675b9c861 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -210,12 +210,18 @@ static inline bool efi_is_64bit(void)
return __efi_early()->is64;
}
+#define efi_table_attr(table, attr, instance) \
+ (efi_is_64bit() ? \
+ ((table##_64_t *)(unsigned long)instance)->attr : \
+ ((table##_32_t *)(unsigned long)instance)->attr)
+
+#define efi_call_proto(protocol, f, instance, ...) \
+ __efi_early()->call(efi_table_attr(protocol, f, instance), \
+ instance, ##__VA_ARGS__)
+
#define efi_call_early(f, ...) \
- __efi_early()->call(efi_is_64bit() ? \
- ((efi_boot_services_64_t *)(unsigned long) \
- __efi_early()->boot_services)->f : \
- ((efi_boot_services_32_t *)(unsigned long) \
- __efi_early()->boot_services)->f, __VA_ARGS__)
+ __efi_early()->call(efi_table_attr(efi_boot_services, f, \
+ __efi_early()->boot_services), __VA_ARGS__)
#define __efi_call_early(f, ...) \
__efi_early()->call((unsigned long)f, __VA_ARGS__);
diff --git a/arch/x86/include/asm/floppy.h b/arch/x86/include/asm/floppy.h
index 1c7eefe32502..7ec59edde154 100644
--- a/arch/x86/include/asm/floppy.h
+++ b/arch/x86/include/asm/floppy.h
@@ -229,18 +229,18 @@ static struct fd_routine_l {
int (*_dma_setup)(char *addr, unsigned long size, int mode, int io);
} fd_routine[] = {
{
- request_dma,
- free_dma,
- get_dma_residue,
- dma_mem_alloc,
- hard_dma_setup
+ ._request_dma = request_dma,
+ ._free_dma = free_dma,
+ ._get_dma_residue = get_dma_residue,
+ ._dma_mem_alloc = dma_mem_alloc,
+ ._dma_setup = hard_dma_setup
},
{
- vdma_request_dma,
- vdma_nop,
- vdma_get_dma_residue,
- vdma_mem_alloc,
- vdma_dma_setup
+ ._request_dma = vdma_request_dma,
+ ._free_dma = vdma_nop,
+ ._get_dma_residue = vdma_get_dma_residue,
+ ._dma_mem_alloc = vdma_mem_alloc,
+ ._dma_setup = vdma_dma_setup
}
};
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index 1429a7c736db..0877ae018fc9 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -27,16 +27,6 @@ extern void kernel_fpu_end(void);
extern bool irq_fpu_usable(void);
/*
- * Some instructions like VIA's padlock instructions generate a spurious
- * DNA fault but don't modify SSE registers. And these instructions
- * get used from interrupt context as well. To prevent these kernel instructions
- * in interrupt context interacting wrongly with other user/kernel fpu usage, we
- * should use them only in the context of irq_ts_save/restore()
- */
-extern int irq_ts_save(void);
-extern void irq_ts_restore(int TS_state);
-
-/*
* Query the presence of one or more xfeatures. Works on any legacy CPU as well.
*
* If 'feature_name' is set then put a human-readable description of
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 2737366ea583..d4a684997497 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -60,11 +60,6 @@ extern u64 fpu__get_supported_xfeatures_mask(void);
/*
* FPU related CPU feature flag helper routines:
*/
-static __always_inline __pure bool use_eager_fpu(void)
-{
- return static_cpu_has(X86_FEATURE_EAGER_FPU);
-}
-
static __always_inline __pure bool use_xsaveopt(void)
{
return static_cpu_has(X86_FEATURE_XSAVEOPT);
@@ -484,42 +479,42 @@ extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size)
DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);
/*
- * Must be run with preemption disabled: this clears the fpu_fpregs_owner_ctx,
- * on this CPU.
+ * The in-register FPU state for an FPU context on a CPU is assumed to be
+ * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx
+ * matches the FPU.
*
- * This will disable any lazy FPU state restore of the current FPU state,
- * but if the current thread owns the FPU, it will still be saved by.
+ * If the FPU register state is valid, the kernel can skip restoring the
+ * FPU state from memory.
+ *
+ * Any code that clobbers the FPU registers or updates the in-memory
+ * FPU state for a task MUST let the rest of the kernel know that the
+ * FPU registers are no longer valid for this task.
+ *
+ * Either one of these invalidation functions is enough. Invalidate
+ * a resource you control: CPU if using the CPU for something else
+ * (with preemption disabled), FPU for the current task, or a task that
+ * is prevented from running by the current task.
*/
-static inline void __cpu_disable_lazy_restore(unsigned int cpu)
+static inline void __cpu_invalidate_fpregs_state(void)
{
- per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL;
+ __this_cpu_write(fpu_fpregs_owner_ctx, NULL);
}
-static inline int fpu_want_lazy_restore(struct fpu *fpu, unsigned int cpu)
-{
- return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu;
-}
-
-
-/*
- * Wrap lazy FPU TS handling in a 'hw fpregs activation/deactivation'
- * idiom, which is then paired with the sw-flag (fpregs_active) later on:
- */
-
-static inline void __fpregs_activate_hw(void)
+static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu)
{
- if (!use_eager_fpu())
- clts();
+ fpu->last_cpu = -1;
}
-static inline void __fpregs_deactivate_hw(void)
+static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu)
{
- if (!use_eager_fpu())
- stts();
+ return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu;
}
-/* Must be paired with an 'stts' (fpregs_deactivate_hw()) after! */
-static inline void __fpregs_deactivate(struct fpu *fpu)
+/*
+ * These generally need preemption protection to work,
+ * do try to avoid using these on their own:
+ */
+static inline void fpregs_deactivate(struct fpu *fpu)
{
WARN_ON_FPU(!fpu->fpregs_active);
@@ -528,8 +523,7 @@ static inline void __fpregs_deactivate(struct fpu *fpu)
trace_x86_fpu_regs_deactivated(fpu);
}
-/* Must be paired with a 'clts' (fpregs_activate_hw()) before! */
-static inline void __fpregs_activate(struct fpu *fpu)
+static inline void fpregs_activate(struct fpu *fpu)
{
WARN_ON_FPU(fpu->fpregs_active);
@@ -554,51 +548,19 @@ static inline int fpregs_active(void)
}
/*
- * Encapsulate the CR0.TS handling together with the
- * software flag.
- *
- * These generally need preemption protection to work,
- * do try to avoid using these on their own.
- */
-static inline void fpregs_activate(struct fpu *fpu)
-{
- __fpregs_activate_hw();
- __fpregs_activate(fpu);
-}
-
-static inline void fpregs_deactivate(struct fpu *fpu)
-{
- __fpregs_deactivate(fpu);
- __fpregs_deactivate_hw();
-}
-
-/*
* FPU state switching for scheduling.
*
* This is a two-stage process:
*
- * - switch_fpu_prepare() saves the old state and
- * sets the new state of the CR0.TS bit. This is
- * done within the context of the old process.
+ * - switch_fpu_prepare() saves the old state.
+ * This is done within the context of the old process.
*
* - switch_fpu_finish() restores the new state as
* necessary.
*/
-typedef struct { int preload; } fpu_switch_t;
-
-static inline fpu_switch_t
-switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu)
+static inline void
+switch_fpu_prepare(struct fpu *old_fpu, int cpu)
{
- fpu_switch_t fpu;
-
- /*
- * If the task has used the math, pre-load the FPU on xsave processors
- * or if the past 5 consecutive context-switches used math.
- */
- fpu.preload = static_cpu_has(X86_FEATURE_FPU) &&
- new_fpu->fpstate_active &&
- (use_eager_fpu() || new_fpu->counter > 5);
-
if (old_fpu->fpregs_active) {
if (!copy_fpregs_to_fpstate(old_fpu))
old_fpu->last_cpu = -1;
@@ -608,29 +570,8 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu)
/* But leave fpu_fpregs_owner_ctx! */
old_fpu->fpregs_active = 0;
trace_x86_fpu_regs_deactivated(old_fpu);
-
- /* Don't change CR0.TS if we just switch! */
- if (fpu.preload) {
- new_fpu->counter++;
- __fpregs_activate(new_fpu);
- trace_x86_fpu_regs_activated(new_fpu);
- prefetch(&new_fpu->state);
- } else {
- __fpregs_deactivate_hw();
- }
- } else {
- old_fpu->counter = 0;
+ } else
old_fpu->last_cpu = -1;
- if (fpu.preload) {
- new_fpu->counter++;
- if (fpu_want_lazy_restore(new_fpu, cpu))
- fpu.preload = 0;
- else
- prefetch(&new_fpu->state);
- fpregs_activate(new_fpu);
- }
- }
- return fpu;
}
/*
@@ -638,15 +579,19 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu)
*/
/*
- * By the time this gets called, we've already cleared CR0.TS and
- * given the process the FPU if we are going to preload the FPU
- * state - all we need to do is to conditionally restore the register
- * state itself.
+ * Set up the userspace FPU context for the new task, if the task
+ * has used the FPU.
*/
-static inline void switch_fpu_finish(struct fpu *new_fpu, fpu_switch_t fpu_switch)
+static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu)
{
- if (fpu_switch.preload)
- copy_kernel_to_fpregs(&new_fpu->state);
+ bool preload = static_cpu_has(X86_FEATURE_FPU) &&
+ new_fpu->fpstate_active;
+
+ if (preload) {
+ if (!fpregs_state_valid(new_fpu, cpu))
+ copy_kernel_to_fpregs(&new_fpu->state);
+ fpregs_activate(new_fpu);
+ }
}
/*
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 48df486b02f9..3c80f5b9c09d 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -322,17 +322,6 @@ struct fpu {
unsigned char fpregs_active;
/*
- * @counter:
- *
- * This counter contains the number of consecutive context switches
- * during which the FPU stays used. If this is over a threshold, the
- * lazy FPU restore logic becomes eager, to save the trap overhead.
- * This is an unsigned char so that after 256 iterations the counter
- * wraps and the context switch behavior turns lazy again; this is to
- * deal with bursty apps that only use the FPU for a short time:
- */
- unsigned char counter;
- /*
* @state:
*
* In-memory copy of all FPU registers that we save/restore
@@ -340,29 +329,6 @@ struct fpu {
* the registers in the FPU are more recent than this state
* copy. If the task context-switches away then they get
* saved here and represent the FPU state.
- *
- * After context switches there may be a (short) time period
- * during which the in-FPU hardware registers are unchanged
- * and still perfectly match this state, if the tasks
- * scheduled afterwards are not using the FPU.
- *
- * This is the 'lazy restore' window of optimization, which
- * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'.
- *
- * We detect whether a subsequent task uses the FPU via setting
- * CR0::TS to 1, which causes any FPU use to raise a #NM fault.
- *
- * During this window, if the task gets scheduled again, we
- * might be able to skip having to do a restore from this
- * memory buffer to the hardware registers - at the cost of
- * incurring the overhead of #NM fault traps.
- *
- * Note that on modern CPUs that support the XSAVEOPT (or other
- * optimized XSAVE instructions), we don't use #NM traps anymore,
- * as the hardware can track whether FPU registers need saving
- * or not. On such CPUs we activate the non-lazy ('eagerfpu')
- * logic, which unconditionally saves/restores all FPU state
- * across context switches. (if FPU state exists.)
*/
union fpregs_state state;
/*
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index 430bacf73074..1b2799e0699a 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -21,21 +21,16 @@
/* Supervisor features */
#define XFEATURE_MASK_SUPERVISOR (XFEATURE_MASK_PT)
-/* Supported features which support lazy state saving */
-#define XFEATURE_MASK_LAZY (XFEATURE_MASK_FP | \
+/* All currently supported features */
+#define XCNTXT_MASK (XFEATURE_MASK_FP | \
XFEATURE_MASK_SSE | \
XFEATURE_MASK_YMM | \
XFEATURE_MASK_OPMASK | \
XFEATURE_MASK_ZMM_Hi256 | \
- XFEATURE_MASK_Hi16_ZMM)
-
-/* Supported features which require eager state saving */
-#define XFEATURE_MASK_EAGER (XFEATURE_MASK_BNDREGS | \
- XFEATURE_MASK_BNDCSR | \
- XFEATURE_MASK_PKRU)
-
-/* All currently supported features */
-#define XCNTXT_MASK (XFEATURE_MASK_LAZY | XFEATURE_MASK_EAGER)
+ XFEATURE_MASK_Hi16_ZMM | \
+ XFEATURE_MASK_PKRU | \
+ XFEATURE_MASK_BNDREGS | \
+ XFEATURE_MASK_BNDCSR)
#ifdef CONFIG_X86_64
#define REX_PREFIX "0x48, "
diff --git a/arch/x86/include/asm/idle.h b/arch/x86/include/asm/idle.h
deleted file mode 100644
index c5d1785373ed..000000000000
--- a/arch/x86/include/asm/idle.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifndef _ASM_X86_IDLE_H
-#define _ASM_X86_IDLE_H
-
-#define IDLE_START 1
-#define IDLE_END 2
-
-struct notifier_block;
-void idle_notifier_register(struct notifier_block *n);
-void idle_notifier_unregister(struct notifier_block *n);
-
-#ifdef CONFIG_X86_64
-void enter_idle(void);
-void exit_idle(void);
-#else /* !CONFIG_X86_64 */
-static inline void enter_idle(void) { }
-static inline void exit_idle(void) { }
-static inline void __exit_idle(void) { }
-#endif /* CONFIG_X86_64 */
-
-void amd_e400_remove_cpu(int cpu);
-
-#endif /* _ASM_X86_IDLE_H */
diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h
new file mode 100644
index 000000000000..95ce5c85b009
--- /dev/null
+++ b/arch/x86/include/asm/intel_rdt.h
@@ -0,0 +1,224 @@
+#ifndef _ASM_X86_INTEL_RDT_H
+#define _ASM_X86_INTEL_RDT_H
+
+#ifdef CONFIG_INTEL_RDT_A
+
+#include <linux/kernfs.h>
+#include <linux/jump_label.h>
+
+#include <asm/intel_rdt_common.h>
+
+#define IA32_L3_QOS_CFG 0xc81
+#define IA32_L3_CBM_BASE 0xc90
+#define IA32_L2_CBM_BASE 0xd10
+
+#define L3_QOS_CDP_ENABLE 0x01ULL
+
+/**
+ * struct rdtgroup - store rdtgroup's data in resctrl file system.
+ * @kn: kernfs node
+ * @rdtgroup_list: linked list for all rdtgroups
+ * @closid: closid for this rdtgroup
+ * @cpu_mask: CPUs assigned to this rdtgroup
+ * @flags: status bits
+ * @waitcount: how many cpus expect to find this
+ * group when they acquire rdtgroup_mutex
+ */
+struct rdtgroup {
+ struct kernfs_node *kn;
+ struct list_head rdtgroup_list;
+ int closid;
+ struct cpumask cpu_mask;
+ int flags;
+ atomic_t waitcount;
+};
+
+/* rdtgroup.flags */
+#define RDT_DELETED 1
+
+/* List of all resource groups */
+extern struct list_head rdt_all_groups;
+
+int __init rdtgroup_init(void);
+
+/**
+ * struct rftype - describe each file in the resctrl file system
+ * @name: file name
+ * @mode: access mode
+ * @kf_ops: operations
+ * @seq_show: show content of the file
+ * @write: write to the file
+ */
+struct rftype {
+ char *name;
+ umode_t mode;
+ struct kernfs_ops *kf_ops;
+
+ int (*seq_show)(struct kernfs_open_file *of,
+ struct seq_file *sf, void *v);
+ /*
+ * write() is the generic write callback which maps directly to
+ * kernfs write operation and overrides all other operations.
+ * Maximum write size is determined by ->max_write_len.
+ */
+ ssize_t (*write)(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off);
+};
+
+/**
+ * struct rdt_resource - attributes of an RDT resource
+ * @enabled: Is this feature enabled on this machine
+ * @capable: Is this feature available on this machine
+ * @name: Name to use in "schemata" file
+ * @num_closid: Number of CLOSIDs available
+ * @max_cbm: Largest Cache Bit Mask allowed
+ * @min_cbm_bits: Minimum number of consecutive bits to be set
+ * in a cache bit mask
+ * @domains: All domains for this resource
+ * @num_domains: Number of domains active
+ * @msr_base: Base MSR address for CBMs
+ * @tmp_cbms: Scratch space when updating schemata
+ * @num_tmp_cbms: Number of CBMs in tmp_cbms
+ * @cache_level: Which cache level defines scope of this domain
+ * @cbm_idx_multi: Multiplier of CBM index
+ * @cbm_idx_offset: Offset of CBM index. CBM index is computed by:
+ * closid * cbm_idx_multi + cbm_idx_offset
+ */
+struct rdt_resource {
+ bool enabled;
+ bool capable;
+ char *name;
+ int num_closid;
+ int cbm_len;
+ int min_cbm_bits;
+ u32 max_cbm;
+ struct list_head domains;
+ int num_domains;
+ int msr_base;
+ u32 *tmp_cbms;
+ int num_tmp_cbms;
+ int cache_level;
+ int cbm_idx_multi;
+ int cbm_idx_offset;
+};
+
+/**
+ * struct rdt_domain - group of cpus sharing an RDT resource
+ * @list: all instances of this resource
+ * @id: unique id for this instance
+ * @cpu_mask: which cpus share this resource
+ * @cbm: array of cache bit masks (indexed by CLOSID)
+ */
+struct rdt_domain {
+ struct list_head list;
+ int id;
+ struct cpumask cpu_mask;
+ u32 *cbm;
+};
+
+/**
+ * struct msr_param - set a range of MSRs from a domain
+ * @res: The resource to use
+ * @low: Beginning index from base MSR
+ * @high: End index
+ */
+struct msr_param {
+ struct rdt_resource *res;
+ int low;
+ int high;
+};
+
+extern struct mutex rdtgroup_mutex;
+
+extern struct rdt_resource rdt_resources_all[];
+extern struct rdtgroup rdtgroup_default;
+DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
+
+int __init rdtgroup_init(void);
+
+enum {
+ RDT_RESOURCE_L3,
+ RDT_RESOURCE_L3DATA,
+ RDT_RESOURCE_L3CODE,
+ RDT_RESOURCE_L2,
+
+ /* Must be the last */
+ RDT_NUM_RESOURCES,
+};
+
+#define for_each_capable_rdt_resource(r) \
+ for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
+ r++) \
+ if (r->capable)
+
+#define for_each_enabled_rdt_resource(r) \
+ for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
+ r++) \
+ if (r->enabled)
+
+/* CPUID.(EAX=10H, ECX=ResID=1).EAX */
+union cpuid_0x10_1_eax {
+ struct {
+ unsigned int cbm_len:5;
+ } split;
+ unsigned int full;
+};
+
+/* CPUID.(EAX=10H, ECX=ResID=1).EDX */
+union cpuid_0x10_1_edx {
+ struct {
+ unsigned int cos_max:16;
+ } split;
+ unsigned int full;
+};
+
+DECLARE_PER_CPU_READ_MOSTLY(int, cpu_closid);
+
+void rdt_cbm_update(void *arg);
+struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn);
+void rdtgroup_kn_unlock(struct kernfs_node *kn);
+ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off);
+int rdtgroup_schemata_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v);
+
+/*
+ * intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR
+ *
+ * Following considerations are made so that this has minimal impact
+ * on scheduler hot path:
+ * - This will stay as no-op unless we are running on an Intel SKU
+ * which supports resource control and we enable by mounting the
+ * resctrl file system.
+ * - Caches the per cpu CLOSid values and does the MSR write only
+ * when a task with a different CLOSid is scheduled in.
+ *
+ * Must be called with preemption disabled.
+ */
+static inline void intel_rdt_sched_in(void)
+{
+ if (static_branch_likely(&rdt_enable_key)) {
+ struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+ int closid;
+
+ /*
+ * If this task has a closid assigned, use it.
+ * Else use the closid assigned to this cpu.
+ */
+ closid = current->closid;
+ if (closid == 0)
+ closid = this_cpu_read(cpu_closid);
+
+ if (closid != state->closid) {
+ state->closid = closid;
+ wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, closid);
+ }
+ }
+}
+
+#else
+
+static inline void intel_rdt_sched_in(void) {}
+
+#endif /* CONFIG_INTEL_RDT_A */
+#endif /* _ASM_X86_INTEL_RDT_H */
diff --git a/arch/x86/include/asm/intel_rdt_common.h b/arch/x86/include/asm/intel_rdt_common.h
new file mode 100644
index 000000000000..b31081b89407
--- /dev/null
+++ b/arch/x86/include/asm/intel_rdt_common.h
@@ -0,0 +1,27 @@
+#ifndef _ASM_X86_INTEL_RDT_COMMON_H
+#define _ASM_X86_INTEL_RDT_COMMON_H
+
+#define MSR_IA32_PQR_ASSOC 0x0c8f
+
+/**
+ * struct intel_pqr_state - State cache for the PQR MSR
+ * @rmid: The cached Resource Monitoring ID
+ * @closid: The cached Class Of Service ID
+ * @rmid_usecnt: The usage counter for rmid
+ *
+ * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the
+ * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always
+ * contains both parts, so we need to cache them.
+ *
+ * The cache also helps to avoid pointless updates if the value does
+ * not change.
+ */
+struct intel_pqr_state {
+ u32 rmid;
+ u32 closid;
+ int rmid_usecnt;
+};
+
+DECLARE_PER_CPU(struct intel_pqr_state, pqr_state);
+
+#endif /* _ASM_X86_INTEL_RDT_COMMON_H */
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index d31881188431..29a594a3b82a 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -21,7 +21,6 @@ enum die_val {
DIE_NMIUNKNOWN,
};
-extern void printk_address(unsigned long address);
extern void die(const char *, struct pt_regs *,long);
extern int __must_check __die(const char *, struct pt_regs *, long);
extern void show_stack_regs(struct pt_regs *regs);
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index bdde80731f49..2e25038dbd93 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -191,6 +191,8 @@ enum {
#define PFERR_RSVD_BIT 3
#define PFERR_FETCH_BIT 4
#define PFERR_PK_BIT 5
+#define PFERR_GUEST_FINAL_BIT 32
+#define PFERR_GUEST_PAGE_BIT 33
#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT)
#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT)
@@ -198,6 +200,13 @@ enum {
#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
#define PFERR_PK_MASK (1U << PFERR_PK_BIT)
+#define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT)
+#define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT)
+
+#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \
+ PFERR_USER_MASK | \
+ PFERR_WRITE_MASK | \
+ PFERR_PRESENT_MASK)
/* apic attention bits */
#define KVM_APIC_CHECK_VAPIC 0
@@ -695,6 +704,7 @@ struct kvm_apic_map {
/* Hyper-V emulation context */
struct kvm_hv {
+ struct mutex hv_lock;
u64 hv_guest_os_id;
u64 hv_hypercall;
u64 hv_tsc_page;
@@ -1062,6 +1072,7 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3);
+bool pdptrs_changed(struct kvm_vcpu *vcpu);
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
const void *val, int bytes);
@@ -1124,7 +1135,8 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
struct x86_emulate_ctxt;
int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port);
-void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
+int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, unsigned short port);
+int kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
int kvm_emulate_halt(struct kvm_vcpu *vcpu);
int kvm_vcpu_halt(struct kvm_vcpu *vcpu);
int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
@@ -1203,7 +1215,7 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu);
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code,
void *insn, int insn_len);
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu);
@@ -1358,7 +1370,8 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu);
extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
-void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
+int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu);
+int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
int kvm_is_in_guest(void);
diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h
index c2b8d24a235c..d74747b031ec 100644
--- a/arch/x86/include/asm/kvm_page_track.h
+++ b/arch/x86/include/asm/kvm_page_track.h
@@ -29,9 +29,20 @@ struct kvm_page_track_notifier_node {
* @gpa: the physical address written by guest.
* @new: the data was written to the address.
* @bytes: the written length.
+ * @node: this node
*/
void (*track_write)(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
- int bytes);
+ int bytes, struct kvm_page_track_notifier_node *node);
+ /*
+ * It is called when memory slot is being moved or removed
+ * users can drop write-protection for the pages in that memory slot
+ *
+ * @kvm: the kvm where memory slot being moved or removed
+ * @slot: the memory slot being moved or removed
+ * @node: this node
+ */
+ void (*track_flush_slot)(struct kvm *kvm, struct kvm_memory_slot *slot,
+ struct kvm_page_track_notifier_node *node);
};
void kvm_page_track_init(struct kvm *kvm);
@@ -58,4 +69,5 @@ kvm_page_track_unregister_notifier(struct kvm *kvm,
struct kvm_page_track_notifier_node *n);
void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
int bytes);
+void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot);
#endif
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index ef01fef3eebc..6c119cfae218 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -9,7 +9,6 @@
#define LHCALL_FLUSH_TLB 5
#define LHCALL_LOAD_IDT_ENTRY 6
#define LHCALL_SET_STACK 7
-#define LHCALL_TS 8
#define LHCALL_SET_CLOCKEVENT 9
#define LHCALL_HALT 10
#define LHCALL_SET_PMD 13
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 9bd7ff5ffbcc..5132f2a6c0a2 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -252,8 +252,10 @@ static inline void cmci_recheck(void) {}
#ifdef CONFIG_X86_MCE_AMD
void mce_amd_feature_init(struct cpuinfo_x86 *c);
+int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr);
#else
static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
+static inline int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) { return -EINVAL; };
#endif
int mce_available(struct cpuinfo_x86 *c);
@@ -293,9 +295,7 @@ void do_machine_check(struct pt_regs *, long);
/*
* Threshold handler
*/
-
extern void (*mce_threshold_vector)(void);
-extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
/* Deferred error interrupt handler */
extern void (*deferred_error_int_vector)(void);
@@ -356,27 +356,31 @@ enum smca_bank_types {
N_SMCA_BANK_TYPES
};
-struct smca_bank_name {
- const char *name; /* Short name for sysfs */
- const char *long_name; /* Long name for pretty-printing */
-};
-
-extern struct smca_bank_name smca_bank_names[N_SMCA_BANK_TYPES];
+#define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype))
-#define HWID_MCATYPE(hwid, mcatype) ((hwid << 16) | mcatype)
-
-struct smca_hwid_mcatype {
+struct smca_hwid {
unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */
u32 hwid_mcatype; /* (hwid,mcatype) tuple */
u32 xec_bitmap; /* Bitmap of valid ExtErrorCodes; current max is 21. */
};
-struct smca_bank_info {
- struct smca_hwid_mcatype *type;
- u32 type_instance;
+struct smca_bank {
+ struct smca_hwid *hwid;
+ /* Instance ID */
+ u32 id;
};
-extern struct smca_bank_info smca_banks[MAX_NR_BANKS];
+extern struct smca_bank smca_banks[MAX_NR_BANKS];
+
+extern const char *smca_get_long_name(enum smca_bank_types t);
+
+extern int mce_threshold_create_device(unsigned int cpu);
+extern int mce_threshold_remove_device(unsigned int cpu);
+
+#else
+
+static inline int mce_threshold_create_device(unsigned int cpu) { return 0; };
+static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; };
#endif
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index da0d81fa0b54..38711df3bcb5 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -20,6 +20,15 @@ do { \
(u32)((u64)(val)), \
(u32)((u64)(val) >> 32))
+struct ucode_patch {
+ struct list_head plist;
+ void *data; /* Intel uses only this one */
+ u32 patch_id;
+ u16 equiv_cpu;
+};
+
+extern struct list_head microcode_cache;
+
struct cpu_signature {
unsigned int sig;
unsigned int pf;
@@ -55,12 +64,7 @@ struct ucode_cpu_info {
void *mc;
};
extern struct ucode_cpu_info ucode_cpu_info[];
-
-#ifdef CONFIG_MICROCODE
-int __init microcode_init(void);
-#else
-static inline int __init microcode_init(void) { return 0; };
-#endif
+struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa);
#ifdef CONFIG_MICROCODE_INTEL
extern struct microcode_ops * __init init_intel_microcode(void);
@@ -131,11 +135,13 @@ static inline unsigned int x86_cpuid_family(void)
}
#ifdef CONFIG_MICROCODE
+int __init microcode_init(void);
extern void __init load_ucode_bsp(void);
extern void load_ucode_ap(void);
void reload_early_microcode(void);
extern bool get_builtin_firmware(struct cpio_data *cd, const char *name);
#else
+static inline int __init microcode_init(void) { return 0; };
static inline void __init load_ucode_bsp(void) { }
static inline void load_ucode_ap(void) { }
static inline void reload_early_microcode(void) { }
diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h
index 15eb75484cc0..3e3e20be829a 100644
--- a/arch/x86/include/asm/microcode_amd.h
+++ b/arch/x86/include/asm/microcode_amd.h
@@ -40,38 +40,18 @@ struct microcode_amd {
unsigned int mpb[0];
};
-static inline u16 find_equiv_id(struct equiv_cpu_entry *equiv_cpu_table,
- unsigned int sig)
-{
- int i = 0;
-
- if (!equiv_cpu_table)
- return 0;
-
- while (equiv_cpu_table[i].installed_cpu != 0) {
- if (sig == equiv_cpu_table[i].installed_cpu)
- return equiv_cpu_table[i].equiv_cpu;
-
- i++;
- }
- return 0;
-}
-
-extern int __apply_microcode_amd(struct microcode_amd *mc_amd);
-extern int apply_microcode_amd(int cpu);
-extern enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size);
-
#define PATCH_MAX_SIZE PAGE_SIZE
#ifdef CONFIG_MICROCODE_AMD
extern void __init load_ucode_amd_bsp(unsigned int family);
-extern void load_ucode_amd_ap(void);
-extern int __init save_microcode_in_initrd_amd(void);
+extern void load_ucode_amd_ap(unsigned int family);
+extern int __init save_microcode_in_initrd_amd(unsigned int family);
void reload_ucode_amd(void);
#else
static inline void __init load_ucode_amd_bsp(unsigned int family) {}
-static inline void load_ucode_amd_ap(void) {}
-static inline int __init save_microcode_in_initrd_amd(void) { return -EINVAL; }
+static inline void load_ucode_amd_ap(unsigned int family) {}
+static inline int __init
+save_microcode_in_initrd_amd(unsigned int family) { return -EINVAL; }
void reload_ucode_amd(void) {}
#endif
diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h
index 5e69154c9f07..195becc6f780 100644
--- a/arch/x86/include/asm/microcode_intel.h
+++ b/arch/x86/include/asm/microcode_intel.h
@@ -52,10 +52,6 @@ struct extended_sigtable {
#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
-extern int has_newer_microcode(void *mc, unsigned int csig, int cpf, int rev);
-extern int microcode_sanity_check(void *mc, int print_err);
-extern int find_matching_signature(void *mc, unsigned int csig, int cpf);
-
#ifdef CONFIG_MICROCODE_INTEL
extern void __init load_ucode_intel_bsp(void);
extern void load_ucode_intel_ap(void);
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 72198c64e646..f9813b6d8b80 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -31,6 +31,10 @@ typedef struct {
u16 pkey_allocation_map;
s16 execute_only_pkey;
#endif
+#ifdef CONFIG_X86_INTEL_MPX
+ /* address of the bounds directory */
+ void __user *bd_addr;
+#endif
} mm_context_t;
#ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 8e0a9fe86de4..306c7e12af55 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -47,7 +47,7 @@ struct ldt_struct {
* allocations, but it's not worth trying to optimize.
*/
struct desc_struct *entries;
- int size;
+ unsigned int size;
};
/*
diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h
index 7a35495275a9..0b416d4cf73b 100644
--- a/arch/x86/include/asm/mpx.h
+++ b/arch/x86/include/asm/mpx.h
@@ -59,7 +59,7 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs);
int mpx_handle_bd_fault(void);
static inline int kernel_managing_mpx_tables(struct mm_struct *mm)
{
- return (mm->bd_addr != MPX_INVALID_BOUNDS_DIR);
+ return (mm->context.bd_addr != MPX_INVALID_BOUNDS_DIR);
}
static inline void mpx_mm_init(struct mm_struct *mm)
{
@@ -67,7 +67,7 @@ static inline void mpx_mm_init(struct mm_struct *mm)
* NULL is theoretically a valid place to put the bounds
* directory, so point this at an invalid address.
*/
- mm->bd_addr = MPX_INVALID_BOUNDS_DIR;
+ mm->context.bd_addr = MPX_INVALID_BOUNDS_DIR;
}
void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long start, unsigned long end);
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 78f3760ca1f2..710273c617b8 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -37,6 +37,10 @@
#define EFER_FFXSR (1<<_EFER_FFXSR)
/* Intel MSRs. Some also available on other CPUs */
+
+#define MSR_PPIN_CTL 0x0000004e
+#define MSR_PPIN 0x0000004f
+
#define MSR_IA32_PERFCTR0 0x000000c1
#define MSR_IA32_PERFCTR1 0x000000c2
#define MSR_FSB_FREQ 0x000000cd
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index b5fee97813cd..db0b90c3b03e 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -70,14 +70,14 @@ extern struct tracepoint __tracepoint_read_msr;
extern struct tracepoint __tracepoint_write_msr;
extern struct tracepoint __tracepoint_rdpmc;
#define msr_tracepoint_active(t) static_key_false(&(t).key)
-extern void do_trace_write_msr(unsigned msr, u64 val, int failed);
-extern void do_trace_read_msr(unsigned msr, u64 val, int failed);
-extern void do_trace_rdpmc(unsigned msr, u64 val, int failed);
+extern void do_trace_write_msr(unsigned int msr, u64 val, int failed);
+extern void do_trace_read_msr(unsigned int msr, u64 val, int failed);
+extern void do_trace_rdpmc(unsigned int msr, u64 val, int failed);
#else
#define msr_tracepoint_active(t) false
-static inline void do_trace_write_msr(unsigned msr, u64 val, int failed) {}
-static inline void do_trace_read_msr(unsigned msr, u64 val, int failed) {}
-static inline void do_trace_rdpmc(unsigned msr, u64 val, int failed) {}
+static inline void do_trace_write_msr(unsigned int msr, u64 val, int failed) {}
+static inline void do_trace_read_msr(unsigned int msr, u64 val, int failed) {}
+static inline void do_trace_rdpmc(unsigned int msr, u64 val, int failed) {}
#endif
static inline unsigned long long native_read_msr(unsigned int msr)
@@ -115,22 +115,36 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr,
}
/* Can be uninlined because referenced by paravirt */
-notrace static inline void native_write_msr(unsigned int msr,
- unsigned low, unsigned high)
+static inline void notrace
+__native_write_msr_notrace(unsigned int msr, u32 low, u32 high)
{
asm volatile("1: wrmsr\n"
"2:\n"
_ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_unsafe)
: : "c" (msr), "a"(low), "d" (high) : "memory");
+}
+
+/* Can be uninlined because referenced by paravirt */
+static inline void notrace
+native_write_msr(unsigned int msr, u32 low, u32 high)
+{
+ __native_write_msr_notrace(msr, low, high);
if (msr_tracepoint_active(__tracepoint_write_msr))
do_trace_write_msr(msr, ((u64)high << 32 | low), 0);
}
+static inline void
+wrmsr_notrace(unsigned int msr, u32 low, u32 high)
+{
+ __native_write_msr_notrace(msr, low, high);
+}
+
/* Can be uninlined because referenced by paravirt */
-notrace static inline int native_write_msr_safe(unsigned int msr,
- unsigned low, unsigned high)
+static inline int notrace
+native_write_msr_safe(unsigned int msr, u32 low, u32 high)
{
int err;
+
asm volatile("2: wrmsr ; xor %[err],%[err]\n"
"1:\n\t"
".section .fixup,\"ax\"\n\t"
@@ -223,7 +237,7 @@ do { \
(void)((high) = (u32)(__val >> 32)); \
} while (0)
-static inline void wrmsr(unsigned msr, unsigned low, unsigned high)
+static inline void wrmsr(unsigned int msr, u32 low, u32 high)
{
native_write_msr(msr, low, high);
}
@@ -231,13 +245,13 @@ static inline void wrmsr(unsigned msr, unsigned low, unsigned high)
#define rdmsrl(msr, val) \
((val) = native_read_msr((msr)))
-static inline void wrmsrl(unsigned msr, u64 val)
+static inline void wrmsrl(unsigned int msr, u64 val)
{
native_write_msr(msr, (u32)(val & 0xffffffffULL), (u32)(val >> 32));
}
/* wrmsr with exception handling */
-static inline int wrmsr_safe(unsigned msr, unsigned low, unsigned high)
+static inline int wrmsr_safe(unsigned int msr, u32 low, u32 high)
{
return native_write_msr_safe(msr, low, high);
}
@@ -252,7 +266,7 @@ static inline int wrmsr_safe(unsigned msr, unsigned low, unsigned high)
__err; \
})
-static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
+static inline int rdmsrl_safe(unsigned int msr, unsigned long long *p)
{
int err;
@@ -325,12 +339,12 @@ static inline int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q)
static inline void rdmsr_on_cpus(const struct cpumask *m, u32 msr_no,
struct msr *msrs)
{
- rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h));
+ rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h));
}
static inline void wrmsr_on_cpus(const struct cpumask *m, u32 msr_no,
struct msr *msrs)
{
- wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h);
+ wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h);
}
static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no,
u32 *l, u32 *h)
diff --git a/arch/x86/include/asm/mutex.h b/arch/x86/include/asm/mutex.h
deleted file mode 100644
index 7d3a48275394..000000000000
--- a/arch/x86/include/asm/mutex.h
+++ /dev/null
@@ -1,5 +0,0 @@
-#ifdef CONFIG_X86_32
-# include <asm/mutex_32.h>
-#else
-# include <asm/mutex_64.h>
-#endif
diff --git a/arch/x86/include/asm/mutex_32.h b/arch/x86/include/asm/mutex_32.h
deleted file mode 100644
index e9355a84fc67..000000000000
--- a/arch/x86/include/asm/mutex_32.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Assembly implementation of the mutex fastpath, based on atomic
- * decrement/increment.
- *
- * started by Ingo Molnar:
- *
- * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- */
-#ifndef _ASM_X86_MUTEX_32_H
-#define _ASM_X86_MUTEX_32_H
-
-#include <asm/alternative.h>
-
-/**
- * __mutex_fastpath_lock - try to take the lock by moving the count
- * from 1 to a 0 value
- * @count: pointer of type atomic_t
- * @fn: function to call if the original value was not 1
- *
- * Change the count from 1 to a value lower than 1, and call <fn> if it
- * wasn't 1 originally. This function MUST leave the value lower than 1
- * even when the "1" assertion wasn't true.
- */
-#define __mutex_fastpath_lock(count, fail_fn) \
-do { \
- unsigned int dummy; \
- \
- typecheck(atomic_t *, count); \
- typecheck_fn(void (*)(atomic_t *), fail_fn); \
- \
- asm volatile(LOCK_PREFIX " decl (%%eax)\n" \
- " jns 1f \n" \
- " call " #fail_fn "\n" \
- "1:\n" \
- : "=a" (dummy) \
- : "a" (count) \
- : "memory", "ecx", "edx"); \
-} while (0)
-
-
-/**
- * __mutex_fastpath_lock_retval - try to take the lock by moving the count
- * from 1 to a 0 value
- * @count: pointer of type atomic_t
- *
- * Change the count from 1 to a value lower than 1. This function returns 0
- * if the fastpath succeeds, or -1 otherwise.
- */
-static inline int __mutex_fastpath_lock_retval(atomic_t *count)
-{
- if (unlikely(atomic_dec_return(count) < 0))
- return -1;
- else
- return 0;
-}
-
-/**
- * __mutex_fastpath_unlock - try to promote the mutex from 0 to 1
- * @count: pointer of type atomic_t
- * @fail_fn: function to call if the original value was not 0
- *
- * try to promote the mutex from 0 to 1. if it wasn't 0, call <fail_fn>.
- * In the failure case, this function is allowed to either set the value
- * to 1, or to set it to a value lower than 1.
- *
- * If the implementation sets it to a value of lower than 1, the
- * __mutex_slowpath_needs_to_unlock() macro needs to return 1, it needs
- * to return 0 otherwise.
- */
-#define __mutex_fastpath_unlock(count, fail_fn) \
-do { \
- unsigned int dummy; \
- \
- typecheck(atomic_t *, count); \
- typecheck_fn(void (*)(atomic_t *), fail_fn); \
- \
- asm volatile(LOCK_PREFIX " incl (%%eax)\n" \
- " jg 1f\n" \
- " call " #fail_fn "\n" \
- "1:\n" \
- : "=a" (dummy) \
- : "a" (count) \
- : "memory", "ecx", "edx"); \
-} while (0)
-
-#define __mutex_slowpath_needs_to_unlock() 1
-
-/**
- * __mutex_fastpath_trylock - try to acquire the mutex, without waiting
- *
- * @count: pointer of type atomic_t
- * @fail_fn: fallback function
- *
- * Change the count from 1 to a value lower than 1, and return 0 (failure)
- * if it wasn't 1 originally, or return 1 (success) otherwise. This function
- * MUST leave the value lower than 1 even when the "1" assertion wasn't true.
- * Additionally, if the value was < 0 originally, this function must not leave
- * it to 0 on failure.
- */
-static inline int __mutex_fastpath_trylock(atomic_t *count,
- int (*fail_fn)(atomic_t *))
-{
- /* cmpxchg because it never induces a false contention state. */
- if (likely(atomic_read(count) == 1 && atomic_cmpxchg(count, 1, 0) == 1))
- return 1;
-
- return 0;
-}
-
-#endif /* _ASM_X86_MUTEX_32_H */
diff --git a/arch/x86/include/asm/mutex_64.h b/arch/x86/include/asm/mutex_64.h
deleted file mode 100644
index d9850758464e..000000000000
--- a/arch/x86/include/asm/mutex_64.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Assembly implementation of the mutex fastpath, based on atomic
- * decrement/increment.
- *
- * started by Ingo Molnar:
- *
- * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
- */
-#ifndef _ASM_X86_MUTEX_64_H
-#define _ASM_X86_MUTEX_64_H
-
-/**
- * __mutex_fastpath_lock - decrement and call function if negative
- * @v: pointer of type atomic_t
- * @fail_fn: function to call if the result is negative
- *
- * Atomically decrements @v and calls <fail_fn> if the result is negative.
- */
-#ifdef CC_HAVE_ASM_GOTO
-static inline void __mutex_fastpath_lock(atomic_t *v,
- void (*fail_fn)(atomic_t *))
-{
- asm_volatile_goto(LOCK_PREFIX " decl %0\n"
- " jns %l[exit]\n"
- : : "m" (v->counter)
- : "memory", "cc"
- : exit);
- fail_fn(v);
-exit:
- return;
-}
-#else
-#define __mutex_fastpath_lock(v, fail_fn) \
-do { \
- unsigned long dummy; \
- \
- typecheck(atomic_t *, v); \
- typecheck_fn(void (*)(atomic_t *), fail_fn); \
- \
- asm volatile(LOCK_PREFIX " decl (%%rdi)\n" \
- " jns 1f \n" \
- " call " #fail_fn "\n" \
- "1:" \
- : "=D" (dummy) \
- : "D" (v) \
- : "rax", "rsi", "rdx", "rcx", \
- "r8", "r9", "r10", "r11", "memory"); \
-} while (0)
-#endif
-
-/**
- * __mutex_fastpath_lock_retval - try to take the lock by moving the count
- * from 1 to a 0 value
- * @count: pointer of type atomic_t
- *
- * Change the count from 1 to a value lower than 1. This function returns 0
- * if the fastpath succeeds, or -1 otherwise.
- */
-static inline int __mutex_fastpath_lock_retval(atomic_t *count)
-{
- if (unlikely(atomic_dec_return(count) < 0))
- return -1;
- else
- return 0;
-}
-
-/**
- * __mutex_fastpath_unlock - increment and call function if nonpositive
- * @v: pointer of type atomic_t
- * @fail_fn: function to call if the result is nonpositive
- *
- * Atomically increments @v and calls <fail_fn> if the result is nonpositive.
- */
-#ifdef CC_HAVE_ASM_GOTO
-static inline void __mutex_fastpath_unlock(atomic_t *v,
- void (*fail_fn)(atomic_t *))
-{
- asm_volatile_goto(LOCK_PREFIX " incl %0\n"
- " jg %l[exit]\n"
- : : "m" (v->counter)
- : "memory", "cc"
- : exit);
- fail_fn(v);
-exit:
- return;
-}
-#else
-#define __mutex_fastpath_unlock(v, fail_fn) \
-do { \
- unsigned long dummy; \
- \
- typecheck(atomic_t *, v); \
- typecheck_fn(void (*)(atomic_t *), fail_fn); \
- \
- asm volatile(LOCK_PREFIX " incl (%%rdi)\n" \
- " jg 1f\n" \
- " call " #fail_fn "\n" \
- "1:" \
- : "=D" (dummy) \
- : "D" (v) \
- : "rax", "rsi", "rdx", "rcx", \
- "r8", "r9", "r10", "r11", "memory"); \
-} while (0)
-#endif
-
-#define __mutex_slowpath_needs_to_unlock() 1
-
-/**
- * __mutex_fastpath_trylock - try to acquire the mutex, without waiting
- *
- * @count: pointer of type atomic_t
- * @fail_fn: fallback function
- *
- * Change the count from 1 to 0 and return 1 (success), or return 0 (failure)
- * if it wasn't 1 originally. [the fallback function is never used on
- * x86_64, because all x86_64 CPUs have a CMPXCHG instruction.]
- */
-static inline int __mutex_fastpath_trylock(atomic_t *count,
- int (*fail_fn)(atomic_t *))
-{
- if (likely(atomic_read(count) == 1 && atomic_cmpxchg(count, 1, 0) == 1))
- return 1;
-
- return 0;
-}
-
-#endif /* _ASM_X86_MUTEX_64_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index ce932812f142..1eea6ca40694 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -41,11 +41,6 @@ static inline void set_debugreg(unsigned long val, int reg)
PVOP_VCALL2(pv_cpu_ops.set_debugreg, reg, val);
}
-static inline void clts(void)
-{
- PVOP_VCALL0(pv_cpu_ops.clts);
-}
-
static inline unsigned long read_cr0(void)
{
return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr0);
@@ -678,6 +673,11 @@ static __always_inline void pv_kick(int cpu)
PVOP_VCALL1(pv_lock_ops.kick, cpu);
}
+static __always_inline bool pv_vcpu_is_preempted(int cpu)
+{
+ return PVOP_CALLEE1(bool, pv_lock_ops.vcpu_is_preempted, cpu);
+}
+
#endif /* SMP && PARAVIRT_SPINLOCKS */
#ifdef CONFIG_X86_32
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 0f400c0e4979..bb2de45a60f2 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -103,8 +103,6 @@ struct pv_cpu_ops {
unsigned long (*get_debugreg)(int regno);
void (*set_debugreg)(int regno, unsigned long value);
- void (*clts)(void);
-
unsigned long (*read_cr0)(void);
void (*write_cr0)(unsigned long);
@@ -310,6 +308,8 @@ struct pv_lock_ops {
void (*wait)(u8 *ptr, u8 val);
void (*kick)(int cpu);
+
+ struct paravirt_callee_save vcpu_is_preempted;
};
/* This contains all the paravirt structures: we get a convenient
@@ -508,6 +508,18 @@ int paravirt_disable_iospace(void);
#define PVOP_TEST_NULL(op) ((void)op)
#endif
+#define PVOP_RETMASK(rettype) \
+ ({ unsigned long __mask = ~0UL; \
+ switch (sizeof(rettype)) { \
+ case 1: __mask = 0xffUL; break; \
+ case 2: __mask = 0xffffUL; break; \
+ case 4: __mask = 0xffffffffUL; break; \
+ default: break; \
+ } \
+ __mask; \
+ })
+
+
#define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr, \
pre, post, ...) \
({ \
@@ -535,7 +547,7 @@ int paravirt_disable_iospace(void);
paravirt_clobber(clbr), \
##__VA_ARGS__ \
: "memory", "cc" extra_clbr); \
- __ret = (rettype)__eax; \
+ __ret = (rettype)(__eax & PVOP_RETMASK(rettype)); \
} \
__ret; \
})
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 84f58de08c2b..9fa03604b2b3 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -507,17 +507,6 @@ do { \
#endif
-/* This is not atomic against other CPUs -- CPU preemption needs to be off */
-#define x86_test_and_clear_bit_percpu(bit, var) \
-({ \
- bool old__; \
- asm volatile("btr %2,"__percpu_arg(1)"\n\t" \
- CC_SET(c) \
- : CC_OUT(c) (old__), "+m" (var) \
- : "dIr" (bit)); \
- old__; \
-})
-
static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr,
const unsigned long __percpu *addr)
{
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 1cc82ece9ac1..62b775926045 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -116,8 +116,7 @@ static inline void native_pgd_clear(pgd_t *pgd)
native_set_pgd(pgd, native_make_pgd(0));
}
-extern void sync_global_pgds(unsigned long start, unsigned long end,
- int removed);
+extern void sync_global_pgds(unsigned long start, unsigned long end);
/*
* Conversion functions: convert a page and protection to a page entry,
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 17f218645701..ec1f3c651150 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -24,7 +24,13 @@ static __always_inline int preempt_count(void)
static __always_inline void preempt_count_set(int pc)
{
- raw_cpu_write_4(__preempt_count, pc);
+ int old, new;
+
+ do {
+ old = raw_cpu_read_4(__preempt_count);
+ new = (old & PREEMPT_NEED_RESCHED) |
+ (pc & ~PREEMPT_NEED_RESCHED);
+ } while (raw_cpu_cmpxchg_4(__preempt_count, old, new) != old);
}
/*
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 984a7bf17f6a..eaf100508c36 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -137,6 +137,17 @@ struct cpuinfo_x86 {
u32 microcode;
};
+struct cpuid_regs {
+ u32 eax, ebx, ecx, edx;
+};
+
+enum cpuid_regs_idx {
+ CPUID_EAX = 0,
+ CPUID_EBX,
+ CPUID_ECX,
+ CPUID_EDX,
+};
+
#define X86_VENDOR_INTEL 0
#define X86_VENDOR_CYRIX 1
#define X86_VENDOR_AMD 2
@@ -178,6 +189,9 @@ extern void identify_secondary_cpu(struct cpuinfo_x86 *);
extern void print_cpu_info(struct cpuinfo_x86 *);
void print_cpu_msr(struct cpuinfo_x86 *);
extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
+extern u32 get_scattered_cpuid_leaf(unsigned int level,
+ unsigned int sub_leaf,
+ enum cpuid_regs_idx reg);
extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
extern void init_amd_cacheinfo(struct cpuinfo_x86 *c);
@@ -588,43 +602,76 @@ static __always_inline void cpu_relax(void)
rep_nop();
}
-#define cpu_relax_lowlatency() cpu_relax()
-
-/* Stop speculative execution and prefetching of modified code. */
+/*
+ * This function forces the icache and prefetched instruction stream to
+ * catch up with reality in two very specific cases:
+ *
+ * a) Text was modified using one virtual address and is about to be executed
+ * from the same physical page at a different virtual address.
+ *
+ * b) Text was modified on a different CPU, may subsequently be
+ * executed on this CPU, and you want to make sure the new version
+ * gets executed. This generally means you're calling this in a IPI.
+ *
+ * If you're calling this for a different reason, you're probably doing
+ * it wrong.
+ */
static inline void sync_core(void)
{
- int tmp;
-
-#ifdef CONFIG_M486
/*
- * Do a CPUID if available, otherwise do a jump. The jump
- * can conveniently enough be the jump around CPUID.
+ * There are quite a few ways to do this. IRET-to-self is nice
+ * because it works on every CPU, at any CPL (so it's compatible
+ * with paravirtualization), and it never exits to a hypervisor.
+ * The only down sides are that it's a bit slow (it seems to be
+ * a bit more than 2x slower than the fastest options) and that
+ * it unmasks NMIs. The "push %cs" is needed because, in
+ * paravirtual environments, __KERNEL_CS may not be a valid CS
+ * value when we do IRET directly.
+ *
+ * In case NMI unmasking or performance ever becomes a problem,
+ * the next best option appears to be MOV-to-CR2 and an
+ * unconditional jump. That sequence also works on all CPUs,
+ * but it will fault at CPL3 (i.e. Xen PV and lguest).
+ *
+ * CPUID is the conventional way, but it's nasty: it doesn't
+ * exist on some 486-like CPUs, and it usually exits to a
+ * hypervisor.
+ *
+ * Like all of Linux's memory ordering operations, this is a
+ * compiler barrier as well.
*/
- asm volatile("cmpl %2,%1\n\t"
- "jl 1f\n\t"
- "cpuid\n"
- "1:"
- : "=a" (tmp)
- : "rm" (boot_cpu_data.cpuid_level), "ri" (0), "0" (1)
- : "ebx", "ecx", "edx", "memory");
+ register void *__sp asm(_ASM_SP);
+
+#ifdef CONFIG_X86_32
+ asm volatile (
+ "pushfl\n\t"
+ "pushl %%cs\n\t"
+ "pushl $1f\n\t"
+ "iret\n\t"
+ "1:"
+ : "+r" (__sp) : : "memory");
#else
- /*
- * CPUID is a barrier to speculative execution.
- * Prefetched instructions are automatically
- * invalidated when modified.
- */
- asm volatile("cpuid"
- : "=a" (tmp)
- : "0" (1)
- : "ebx", "ecx", "edx", "memory");
+ unsigned int tmp;
+
+ asm volatile (
+ "mov %%ss, %0\n\t"
+ "pushq %q0\n\t"
+ "pushq %%rsp\n\t"
+ "addq $8, (%%rsp)\n\t"
+ "pushfq\n\t"
+ "mov %%cs, %0\n\t"
+ "pushq %q0\n\t"
+ "pushq $1f\n\t"
+ "iretq\n\t"
+ "1:"
+ : "=&r" (tmp), "+r" (__sp) : : "cc", "memory");
#endif
}
extern void select_idle_routine(const struct cpuinfo_x86 *c);
-extern void init_amd_e400_c1e_mask(void);
+extern void amd_e400_c1e_apic_setup(void);
extern unsigned long boot_option_idle_override;
-extern bool amd_e400_c1e_detected;
enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT,
IDLE_POLL};
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
index eaba08076030..c343ab52579f 100644
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -32,6 +32,12 @@ static inline void queued_spin_unlock(struct qspinlock *lock)
{
pv_queued_spin_unlock(lock);
}
+
+#define vcpu_is_preempted vcpu_is_preempted
+static inline bool vcpu_is_preempted(int cpu)
+{
+ return pv_vcpu_is_preempted(cpu);
+}
#else
static inline void queued_spin_unlock(struct qspinlock *lock)
{
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 19a2224f9e16..12af3e35edfa 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -6,11 +6,6 @@
#include <asm/nops.h>
-static inline void native_clts(void)
-{
- asm volatile("clts");
-}
-
/*
* Volatile isn't enough to prevent the compiler from reordering the
* read/write functions for the control registers and messing everything up.
@@ -208,16 +203,8 @@ static inline void load_gs_index(unsigned selector)
#endif
-/* Clear the 'TS' bit */
-static inline void clts(void)
-{
- native_clts();
-}
-
#endif/* CONFIG_PARAVIRT */
-#define stts() write_cr0(read_cr0() | X86_CR0_TS)
-
static inline void clflush(volatile void *__p)
{
asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 37f2e0b377ad..a3269c897ec5 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -30,8 +30,7 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task,
int get_stack_info(unsigned long *stack, struct task_struct *task,
struct stack_info *info, unsigned long *visit_mask);
-void stack_type_str(enum stack_type type, const char **begin,
- const char **end);
+const char *stack_type_name(enum stack_type type);
static inline bool on_stack(struct stack_info *info, void *addr, size_t len)
{
@@ -43,8 +42,6 @@ static inline bool on_stack(struct stack_info *info, void *addr, size_t len)
addr + len > begin && addr + len <= end);
}
-extern int kstack_depth_to_print;
-
#ifdef CONFIG_X86_32
#define STACKSLOTS_PER_LINE 8
#else
@@ -86,9 +83,6 @@ get_stack_pointer(struct task_struct *task, struct pt_regs *regs)
void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, char *log_lvl);
-void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, char *log_lvl);
-
extern unsigned int code_bytes;
/* The form of the top of the frame on the stack */
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index cf75871d2f81..6358a85e2270 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -146,4 +146,36 @@ struct pci_bus;
int x86_pci_root_bus_node(int bus);
void x86_pci_root_bus_resources(int bus, struct list_head *resources);
+extern bool x86_topology_update;
+
+#ifdef CONFIG_SCHED_MC_PRIO
+#include <asm/percpu.h>
+
+DECLARE_PER_CPU_READ_MOSTLY(int, sched_core_priority);
+extern unsigned int __read_mostly sysctl_sched_itmt_enabled;
+
+/* Interface to set priority of a cpu */
+void sched_set_itmt_core_prio(int prio, int core_cpu);
+
+/* Interface to notify scheduler that system supports ITMT */
+int sched_set_itmt_support(void);
+
+/* Interface to notify scheduler that system revokes ITMT support */
+void sched_clear_itmt_support(void);
+
+#else /* CONFIG_SCHED_MC_PRIO */
+
+#define sysctl_sched_itmt_enabled 0
+static inline void sched_set_itmt_core_prio(int prio, int core_cpu)
+{
+}
+static inline int sched_set_itmt_support(void)
+{
+ return 0;
+}
+static inline void sched_clear_itmt_support(void)
+{
+}
+#endif /* CONFIG_SCHED_MC_PRIO */
+
#endif /* _ASM_X86_TOPOLOGY_H */
diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h
index 2fbc66c7885b..2422b14c50a7 100644
--- a/arch/x86/include/asm/trace/exceptions.h
+++ b/arch/x86/include/asm/trace/exceptions.h
@@ -6,7 +6,7 @@
#include <linux/tracepoint.h>
-extern void trace_irq_vector_regfunc(void);
+extern int trace_irq_vector_regfunc(void);
extern void trace_irq_vector_unregfunc(void);
DECLARE_EVENT_CLASS(x86_exceptions,
diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h
index 9217ab1f5bf6..342e59789fcd 100644
--- a/arch/x86/include/asm/trace/fpu.h
+++ b/arch/x86/include/asm/trace/fpu.h
@@ -14,7 +14,6 @@ DECLARE_EVENT_CLASS(x86_fpu,
__field(struct fpu *, fpu)
__field(bool, fpregs_active)
__field(bool, fpstate_active)
- __field(int, counter)
__field(u64, xfeatures)
__field(u64, xcomp_bv)
),
@@ -23,17 +22,15 @@ DECLARE_EVENT_CLASS(x86_fpu,
__entry->fpu = fpu;
__entry->fpregs_active = fpu->fpregs_active;
__entry->fpstate_active = fpu->fpstate_active;
- __entry->counter = fpu->counter;
if (boot_cpu_has(X86_FEATURE_OSXSAVE)) {
__entry->xfeatures = fpu->state.xsave.header.xfeatures;
__entry->xcomp_bv = fpu->state.xsave.header.xcomp_bv;
}
),
- TP_printk("x86/fpu: %p fpregs_active: %d fpstate_active: %d counter: %d xfeatures: %llx xcomp_bv: %llx",
+ TP_printk("x86/fpu: %p fpregs_active: %d fpstate_active: %d xfeatures: %llx xcomp_bv: %llx",
__entry->fpu,
__entry->fpregs_active,
__entry->fpstate_active,
- __entry->counter,
__entry->xfeatures,
__entry->xcomp_bv
)
diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h
index 38a09a13a9bc..32dd6a9e343c 100644
--- a/arch/x86/include/asm/trace/irq_vectors.h
+++ b/arch/x86/include/asm/trace/irq_vectors.h
@@ -6,7 +6,7 @@
#include <linux/tracepoint.h>
-extern void trace_irq_vector_regfunc(void);
+extern int trace_irq_vector_regfunc(void);
extern void trace_irq_vector_unregfunc(void);
DECLARE_EVENT_CLASS(x86_irq_vector,
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 33b6365c22fe..abb1fdcc545a 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -45,8 +45,17 @@ extern int tsc_clocksource_reliable;
* Boot-time check whether the TSCs are synchronized across
* all CPUs/cores:
*/
+#ifdef CONFIG_X86_TSC
+extern bool tsc_store_and_check_tsc_adjust(bool bootcpu);
+extern void tsc_verify_tsc_adjust(bool resume);
extern void check_tsc_sync_source(int cpu);
extern void check_tsc_sync_target(void);
+#else
+static inline bool tsc_store_and_check_tsc_adjust(bool bootcpu) { return false; }
+static inline void tsc_verify_tsc_adjust(bool resume) { }
+static inline void check_tsc_sync_source(int cpu) { }
+static inline void check_tsc_sync_target(void) { }
+#endif
extern int notsc_setup(char *);
extern void tsc_save_sched_clock_state(void);
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index faf3687f1035..ea148313570f 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -68,6 +68,12 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
__chk_range_not_ok((unsigned long __force)(addr), size, limit); \
})
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+# define WARN_ON_IN_IRQ() WARN_ON_ONCE(!in_task())
+#else
+# define WARN_ON_IN_IRQ()
+#endif
+
/**
* access_ok: - Checks if a user space pointer is valid
* @type: Type of access: %VERIFY_READ or %VERIFY_WRITE. Note that
@@ -88,8 +94,11 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
* checks that the pointer is in the user space range - after calling
* this function, memory access functions may still return -EFAULT.
*/
-#define access_ok(type, addr, size) \
- likely(!__range_not_ok(addr, size, user_addr_max()))
+#define access_ok(type, addr, size) \
+({ \
+ WARN_ON_IN_IRQ(); \
+ likely(!__range_not_ok(addr, size, user_addr_max())); \
+})
/*
* These are the main single-value transfer routines. They automatically
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
index 46de9ac4b990..6fa75b17aec3 100644
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -12,7 +12,8 @@ struct unwind_state {
struct task_struct *task;
int graph_idx;
#ifdef CONFIG_FRAME_POINTER
- unsigned long *bp;
+ unsigned long *bp, *orig_sp;
+ struct pt_regs *regs;
#else
unsigned long *sp;
#endif
@@ -47,7 +48,15 @@ unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
if (unwind_done(state))
return NULL;
- return state->bp + 1;
+ return state->regs ? &state->regs->ip : state->bp + 1;
+}
+
+static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
+{
+ if (unwind_done(state))
+ return NULL;
+
+ return state->regs;
}
#else /* !CONFIG_FRAME_POINTER */
@@ -58,6 +67,11 @@ unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
return NULL;
}
+static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
+{
+ return NULL;
+}
+
#endif /* CONFIG_FRAME_POINTER */
#endif /* _ASM_X86_UNWIND_H */
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index e728699db774..3a01996db58f 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -89,8 +89,13 @@ static inline unsigned int __getcpu(void)
* works on all CPUs. This is volatile so that it orders
* correctly wrt barrier() and to keep gcc from cleverly
* hoisting it out of the calling function.
+ *
+ * If RDPID is available, use it.
*/
- asm volatile ("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
+ alternative_io ("lsl %[p],%[seg]",
+ ".byte 0xf3,0x0f,0xc7,0xf8", /* RDPID %eax/rax */
+ X86_FEATURE_RDPID,
+ [p] "=a" (p), [seg] "r" (__PER_CPU_SEG));
return p;
}
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index a002b07a7099..2b5b2d4b924e 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -25,6 +25,7 @@
#define VMX_H
+#include <linux/bitops.h>
#include <linux/types.h>
#include <uapi/asm/vmx.h>
@@ -60,6 +61,7 @@
*/
#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
#define SECONDARY_EXEC_ENABLE_EPT 0x00000002
+#define SECONDARY_EXEC_DESC 0x00000004
#define SECONDARY_EXEC_RDTSCP 0x00000008
#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE 0x00000010
#define SECONDARY_EXEC_ENABLE_VPID 0x00000020
@@ -110,6 +112,36 @@
#define VMX_MISC_SAVE_EFER_LMA 0x00000020
#define VMX_MISC_ACTIVITY_HLT 0x00000040
+static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic)
+{
+ return vmx_basic & GENMASK_ULL(30, 0);
+}
+
+static inline u32 vmx_basic_vmcs_size(u64 vmx_basic)
+{
+ return (vmx_basic & GENMASK_ULL(44, 32)) >> 32;
+}
+
+static inline int vmx_misc_preemption_timer_rate(u64 vmx_misc)
+{
+ return vmx_misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
+}
+
+static inline int vmx_misc_cr3_count(u64 vmx_misc)
+{
+ return (vmx_misc & GENMASK_ULL(24, 16)) >> 16;
+}
+
+static inline int vmx_misc_max_msr(u64 vmx_misc)
+{
+ return (vmx_misc & GENMASK_ULL(27, 25)) >> 25;
+}
+
+static inline int vmx_misc_mseg_revid(u64 vmx_misc)
+{
+ return (vmx_misc & GENMASK_ULL(63, 32)) >> 32;
+}
+
/* VMCS Encodings */
enum vmcs_field {
VIRTUAL_PROCESSOR_ID = 0x00000000,
@@ -399,10 +431,11 @@ enum vmcs_field {
#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 2)
#define VMX_NR_VPIDS (1 << 16)
+#define VMX_VPID_EXTENT_INDIVIDUAL_ADDR 0
#define VMX_VPID_EXTENT_SINGLE_CONTEXT 1
#define VMX_VPID_EXTENT_ALL_CONTEXT 2
+#define VMX_VPID_EXTENT_SINGLE_NON_GLOBAL 3
-#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0
#define VMX_EPT_EXTENT_CONTEXT 1
#define VMX_EPT_EXTENT_GLOBAL 2
#define VMX_EPT_EXTENT_SHIFT 24
@@ -419,8 +452,10 @@ enum vmcs_field {
#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
#define VMX_VPID_INVVPID_BIT (1ull << 0) /* (32 - 32) */
+#define VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT (1ull << 8) /* (40 - 32) */
#define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */
#define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */
+#define VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT (1ull << 11) /* (43 - 32) */
#define VMX_EPT_DEFAULT_GAW 3
#define VMX_EPT_MAX_GAW 0x4
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 6ba793178441..7ba7e90a9ad6 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -59,7 +59,7 @@ struct x86_init_irqs {
/**
* struct x86_init_oem - oem platform specific customizing functions
- * @arch_setup: platform specific architecure setup
+ * @arch_setup: platform specific architecture setup
* @banner: print a platform specific banner
*/
struct x86_init_oem {
@@ -165,8 +165,25 @@ struct x86_legacy_devices {
};
/**
+ * enum x86_legacy_i8042_state - i8042 keyboard controller state
+ * @X86_LEGACY_I8042_PLATFORM_ABSENT: the controller is always absent on
+ * given platform/subarch.
+ * @X86_LEGACY_I8042_FIRMWARE_ABSENT: firmware reports that the controller
+ * is absent.
+ * @X86_LEGACY_i8042_EXPECTED_PRESENT: the controller is likely to be
+ * present, the i8042 driver should probe for controller existence.
+ */
+enum x86_legacy_i8042_state {
+ X86_LEGACY_I8042_PLATFORM_ABSENT,
+ X86_LEGACY_I8042_FIRMWARE_ABSENT,
+ X86_LEGACY_I8042_EXPECTED_PRESENT,
+};
+
+/**
* struct x86_legacy_features - legacy x86 features
*
+ * @i8042: indicated if we expect the device to have i8042 controller
+ * present.
* @rtc: this device has a CMOS real-time clock present
* @reserve_bios_regions: boot code will search for the EBDA address and the
* start of the 640k - 1M BIOS region. If false, the platform must
@@ -175,6 +192,7 @@ struct x86_legacy_devices {
* documentation for further details.
*/
struct x86_legacy_features {
+ enum x86_legacy_i8042_state i8042;
int rtc;
int reserve_bios_regions;
struct x86_legacy_devices devices;
@@ -188,15 +206,14 @@ struct x86_legacy_features {
* @set_wallclock: set time back to HW clock
* @is_untracked_pat_range exclude from PAT logic
* @nmi_init enable NMI on cpus
- * @i8042_detect pre-detect if i8042 controller exists
* @save_sched_clock_state: save state for sched_clock() on suspend
* @restore_sched_clock_state: restore state for sched_clock() on resume
- * @apic_post_init: adjust apic if neeeded
+ * @apic_post_init: adjust apic if needed
* @legacy: legacy features
* @set_legacy_features: override legacy features. Use of this callback
* is highly discouraged. You should only need
* this if your hardware platform requires further
- * custom fine tuning far beyong what may be
+ * custom fine tuning far beyond what may be
* possible in x86_early_init_platform_quirks() by
* only using the current x86_hardware_subarch
* semantics.
@@ -210,7 +227,6 @@ struct x86_platform_ops {
bool (*is_untracked_pat_range)(u64 start, u64 end);
void (*nmi_init)(void);
unsigned char (*get_nmi_reason)(void);
- int (*i8042_detect)(void);
void (*save_sched_clock_state)(void);
void (*restore_sched_clock_state)(void);
void (*apic_post_init)(void);
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index c18ce67495fa..b10bf319ed20 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -7,6 +7,7 @@
#define SETUP_DTB 2
#define SETUP_PCI 3
#define SETUP_EFI 4
+#define SETUP_APPLE_PROPERTIES 5
/* ram_size flags */
#define RAMDISK_IMAGE_START_MASK 0x07FF
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 94dc8ca434e0..1421a6585126 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -45,7 +45,9 @@ struct kvm_steal_time {
__u64 steal;
__u32 version;
__u32 flags;
- __u32 pad[12];
+ __u8 preempted;
+ __u8 u8_pad[3];
+ __u32 pad[11];
};
#define KVM_STEAL_ALIGNMENT_BITS 5
diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h
index 69a6e07e3149..eb6247a7009b 100644
--- a/arch/x86/include/uapi/asm/mce.h
+++ b/arch/x86/include/uapi/asm/mce.h
@@ -28,6 +28,7 @@ struct mce {
__u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */
__u64 synd; /* MCA_SYND MSR: only valid on SMCA systems */
__u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */
+ __u64 ppin; /* Protected Processor Inventory Number */
};
#define MCE_GET_RECORD_LEN _IOR('M', 1, int)
diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h
index ae135de547f5..835aa51c7f6e 100644
--- a/arch/x86/include/uapi/asm/prctl.h
+++ b/arch/x86/include/uapi/asm/prctl.h
@@ -6,10 +6,8 @@
#define ARCH_GET_FS 0x1003
#define ARCH_GET_GS 0x1004
-#ifdef CONFIG_CHECKPOINT_RESTORE
-# define ARCH_MAP_VDSO_X32 0x2001
-# define ARCH_MAP_VDSO_32 0x2002
-# define ARCH_MAP_VDSO_64 0x2003
-#endif
+#define ARCH_MAP_VDSO_X32 0x2001
+#define ARCH_MAP_VDSO_32 0x2002
+#define ARCH_MAP_VDSO_64 0x2003
#endif /* _ASM_X86_PRCTL_H */
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 37fee272618f..14458658e988 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -65,6 +65,8 @@
#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
#define EXIT_REASON_APIC_ACCESS 44
#define EXIT_REASON_EOI_INDUCED 45
+#define EXIT_REASON_GDTR_IDTR 46
+#define EXIT_REASON_LDTR_TR 47
#define EXIT_REASON_EPT_VIOLATION 48
#define EXIT_REASON_EPT_MISCONFIG 49
#define EXIT_REASON_INVEPT 50
@@ -113,6 +115,8 @@
{ EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \
{ EXIT_REASON_TPR_BELOW_THRESHOLD, "TPR_BELOW_THRESHOLD" }, \
{ EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \
+ { EXIT_REASON_GDTR_IDTR, "GDTR_IDTR" }, \
+ { EXIT_REASON_LDTR_TR, "LDTR_TR" }, \
{ EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \
{ EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \
{ EXIT_REASON_INVEPT, "INVEPT" }, \
@@ -129,6 +133,7 @@
{ EXIT_REASON_XRSTORS, "XRSTORS" }
#define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1
+#define VMX_ABORT_LOAD_HOST_PDPTE_FAIL 2
#define VMX_ABORT_LOAD_HOST_MSR_FAIL 4
#endif /* _UAPIVMX_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 79076d75bdbf..581386c7e429 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -75,7 +75,7 @@ apm-y := apm_32.o
obj-$(CONFIG_APM) += apm.o
obj-$(CONFIG_SMP) += smp.o
obj-$(CONFIG_SMP) += smpboot.o
-obj-$(CONFIG_SMP) += tsc_sync.o
+obj-$(CONFIG_X86_TSC) += tsc_sync.o
obj-$(CONFIG_SMP) += setup_percpu.o
obj-$(CONFIG_X86_MPPARSE) += mpparse.o
obj-y += apic/
@@ -123,6 +123,7 @@ obj-$(CONFIG_EFI) += sysfb_efi.o
obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
obj-$(CONFIG_TRACING) += tracepoint.o
+obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o
ifdef CONFIG_FRAME_POINTER
obj-y += unwind_frame.o
diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c
index c280df6b2aa2..ea3046e0b0cf 100644
--- a/arch/x86/kernel/acpi/apei.c
+++ b/arch/x86/kernel/acpi/apei.c
@@ -24,9 +24,6 @@ int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data)
struct acpi_hest_ia_corrected *cmc;
struct acpi_hest_ia_error_bank *mc_bank;
- if (hest_hdr->type != ACPI_HEST_TYPE_IA32_CORRECTED_CHECK)
- return 0;
-
cmc = (struct acpi_hest_ia_corrected *)hest_hdr;
if (!cmc->enabled)
return 0;
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 931ced8ca345..64422f850e95 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -76,6 +76,7 @@ int acpi_fix_pin2_polarity __initdata;
static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
#endif
+#ifdef CONFIG_X86_IO_APIC
/*
* Locks related to IOAPIC hotplug
* Hotplug side:
@@ -88,6 +89,7 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
* ->ioapic_lock
*/
static DEFINE_MUTEX(acpi_ioapic_lock);
+#endif
/* --------------------------------------------------------------------------
Boot-time Configuration
@@ -713,7 +715,7 @@ int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
int nid;
nid = acpi_get_node(handle);
- if (nid != -1) {
+ if (nid != NUMA_NO_NODE) {
set_apicid_to_node(physid, nid);
numa_set_node(cpu, nid);
}
@@ -928,6 +930,13 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table)
x86_platform.legacy.devices.pnpbios = 0;
}
+ if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
+ !(acpi_gbl_FADT.boot_flags & ACPI_FADT_8042) &&
+ x86_platform.legacy.i8042 != X86_LEGACY_I8042_PLATFORM_ABSENT) {
+ pr_debug("ACPI: i8042 controller is absent\n");
+ x86_platform.legacy.i8042 = X86_LEGACY_I8042_FIRMWARE_ABSENT;
+ }
+
if (acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC) {
pr_debug("ACPI: not registering RTC platform device\n");
x86_platform.legacy.rtc = 0;
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index 169963f471bb..50b8ed0317a3 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -109,6 +109,15 @@ ENTRY(do_suspend_lowlevel)
movq pt_regs_r14(%rax), %r14
movq pt_regs_r15(%rax), %r15
+#ifdef CONFIG_KASAN
+ /*
+ * The suspend path may have poisoned some areas deeper in the stack,
+ * which we now need to unpoison.
+ */
+ movq %rsp, %rdi
+ call kasan_unpoison_task_stack_below
+#endif
+
xorl %eax, %eax
addq $8, %rsp
FRAME_END
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 5cb272a7a5a3..c5b8f760473c 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -337,7 +337,11 @@ done:
n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
}
-static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr)
+/*
+ * "noinline" to cause control flow change and thus invalidate I$ and
+ * cause refetch after modification.
+ */
+static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr)
{
unsigned long flags;
@@ -346,7 +350,6 @@ static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr)
local_irq_save(flags);
add_nops(instr + (a->instrlen - a->padlen), a->padlen);
- sync_core();
local_irq_restore(flags);
DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ",
@@ -359,9 +362,12 @@ static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr)
* This implies that asymmetric systems where APs have less capabilities than
* the boot processor are not handled. Tough. Make sure you disable such
* features by hand.
+ *
+ * Marked "noinline" to cause control flow change and thus insn cache
+ * to refetch changed I$ lines.
*/
-void __init_or_module apply_alternatives(struct alt_instr *start,
- struct alt_instr *end)
+void __init_or_module noinline apply_alternatives(struct alt_instr *start,
+ struct alt_instr *end)
{
struct alt_instr *a;
u8 *instr, *replacement;
@@ -667,7 +673,6 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode,
unsigned long flags;
local_irq_save(flags);
memcpy(addr, opcode, len);
- sync_core();
local_irq_restore(flags);
/* Could also do a CLFLUSH here to speed up CPU recovery; but
that causes hangs on some VIA CPUs. */
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 4fdf6230d93c..458da8509b75 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -13,8 +13,20 @@
#include <linux/spinlock.h>
#include <asm/amd_nb.h>
+#define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450
+#define PCI_DEVICE_ID_AMD_17H_DF_F3 0x1463
+#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464
+
+/* Protect the PCI config register pairs used for SMN and DF indirect access. */
+static DEFINE_MUTEX(smn_mutex);
+
static u32 *flush_words;
+static const struct pci_device_id amd_root_ids[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_ROOT) },
+ {}
+};
+
const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
@@ -24,9 +36,10 @@ const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) },
{}
};
-EXPORT_SYMBOL(amd_nb_misc_ids);
+EXPORT_SYMBOL_GPL(amd_nb_misc_ids);
static const struct pci_device_id amd_nb_link_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) },
@@ -34,6 +47,7 @@ static const struct pci_device_id amd_nb_link_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) },
{}
};
@@ -44,8 +58,25 @@ const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = {
{ }
};
-struct amd_northbridge_info amd_northbridges;
-EXPORT_SYMBOL(amd_northbridges);
+static struct amd_northbridge_info amd_northbridges;
+
+u16 amd_nb_num(void)
+{
+ return amd_northbridges.num;
+}
+EXPORT_SYMBOL_GPL(amd_nb_num);
+
+bool amd_nb_has_feature(unsigned int feature)
+{
+ return ((amd_northbridges.flags & feature) == feature);
+}
+EXPORT_SYMBOL_GPL(amd_nb_has_feature);
+
+struct amd_northbridge *node_to_amd_nb(int node)
+{
+ return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL;
+}
+EXPORT_SYMBOL_GPL(node_to_amd_nb);
static struct pci_dev *next_northbridge(struct pci_dev *dev,
const struct pci_device_id *ids)
@@ -58,13 +89,106 @@ static struct pci_dev *next_northbridge(struct pci_dev *dev,
return dev;
}
+static int __amd_smn_rw(u16 node, u32 address, u32 *value, bool write)
+{
+ struct pci_dev *root;
+ int err = -ENODEV;
+
+ if (node >= amd_northbridges.num)
+ goto out;
+
+ root = node_to_amd_nb(node)->root;
+ if (!root)
+ goto out;
+
+ mutex_lock(&smn_mutex);
+
+ err = pci_write_config_dword(root, 0x60, address);
+ if (err) {
+ pr_warn("Error programming SMN address 0x%x.\n", address);
+ goto out_unlock;
+ }
+
+ err = (write ? pci_write_config_dword(root, 0x64, *value)
+ : pci_read_config_dword(root, 0x64, value));
+ if (err)
+ pr_warn("Error %s SMN address 0x%x.\n",
+ (write ? "writing to" : "reading from"), address);
+
+out_unlock:
+ mutex_unlock(&smn_mutex);
+
+out:
+ return err;
+}
+
+int amd_smn_read(u16 node, u32 address, u32 *value)
+{
+ return __amd_smn_rw(node, address, value, false);
+}
+EXPORT_SYMBOL_GPL(amd_smn_read);
+
+int amd_smn_write(u16 node, u32 address, u32 value)
+{
+ return __amd_smn_rw(node, address, &value, true);
+}
+EXPORT_SYMBOL_GPL(amd_smn_write);
+
+/*
+ * Data Fabric Indirect Access uses FICAA/FICAD.
+ *
+ * Fabric Indirect Configuration Access Address (FICAA): Constructed based
+ * on the device's Instance Id and the PCI function and register offset of
+ * the desired register.
+ *
+ * Fabric Indirect Configuration Access Data (FICAD): There are FICAD LO
+ * and FICAD HI registers but so far we only need the LO register.
+ */
+int amd_df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo)
+{
+ struct pci_dev *F4;
+ u32 ficaa;
+ int err = -ENODEV;
+
+ if (node >= amd_northbridges.num)
+ goto out;
+
+ F4 = node_to_amd_nb(node)->link;
+ if (!F4)
+ goto out;
+
+ ficaa = 1;
+ ficaa |= reg & 0x3FC;
+ ficaa |= (func & 0x7) << 11;
+ ficaa |= instance_id << 16;
+
+ mutex_lock(&smn_mutex);
+
+ err = pci_write_config_dword(F4, 0x5C, ficaa);
+ if (err) {
+ pr_warn("Error writing DF Indirect FICAA, FICAA=0x%x\n", ficaa);
+ goto out_unlock;
+ }
+
+ err = pci_read_config_dword(F4, 0x98, lo);
+ if (err)
+ pr_warn("Error reading DF Indirect FICAD LO, FICAA=0x%x.\n", ficaa);
+
+out_unlock:
+ mutex_unlock(&smn_mutex);
+
+out:
+ return err;
+}
+EXPORT_SYMBOL_GPL(amd_df_indirect_read);
+
int amd_cache_northbridges(void)
{
u16 i = 0;
struct amd_northbridge *nb;
- struct pci_dev *misc, *link;
+ struct pci_dev *root, *misc, *link;
- if (amd_nb_num())
+ if (amd_northbridges.num)
return 0;
misc = NULL;
@@ -74,15 +198,17 @@ int amd_cache_northbridges(void)
if (!i)
return -ENODEV;
- nb = kzalloc(i * sizeof(struct amd_northbridge), GFP_KERNEL);
+ nb = kcalloc(i, sizeof(struct amd_northbridge), GFP_KERNEL);
if (!nb)
return -ENOMEM;
amd_northbridges.nb = nb;
amd_northbridges.num = i;
- link = misc = NULL;
- for (i = 0; i != amd_nb_num(); i++) {
+ link = misc = root = NULL;
+ for (i = 0; i != amd_northbridges.num; i++) {
+ node_to_amd_nb(i)->root = root =
+ next_northbridge(root, amd_root_ids);
node_to_amd_nb(i)->misc = misc =
next_northbridge(misc, amd_nb_misc_ids);
node_to_amd_nb(i)->link = link =
@@ -139,13 +265,13 @@ struct resource *amd_get_mmconfig_range(struct resource *res)
{
u32 address;
u64 base, msr;
- unsigned segn_busn_bits;
+ unsigned int segn_busn_bits;
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
return NULL;
/* assume all cpus from fam10h have mmconfig */
- if (boot_cpu_data.x86 < 0x10)
+ if (boot_cpu_data.x86 < 0x10)
return NULL;
address = MSR_FAM10H_MMIO_CONF_BASE;
@@ -226,14 +352,14 @@ static void amd_cache_gart(void)
if (!amd_nb_has_feature(AMD_NB_GART))
return;
- flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL);
+ flush_words = kmalloc_array(amd_northbridges.num, sizeof(u32), GFP_KERNEL);
if (!flush_words) {
amd_northbridges.flags &= ~AMD_NB_GART;
pr_notice("Cannot initialize GART flush words, GART support disabled\n");
return;
}
- for (i = 0; i != amd_nb_num(); i++)
+ for (i = 0; i != amd_northbridges.num; i++)
pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, &flush_words[i]);
}
@@ -246,18 +372,20 @@ void amd_flush_garts(void)
if (!amd_nb_has_feature(AMD_NB_GART))
return;
- /* Avoid races between AGP and IOMMU. In theory it's not needed
- but I'm not sure if the hardware won't lose flush requests
- when another is pending. This whole thing is so expensive anyways
- that it doesn't matter to serialize more. -AK */
+ /*
+ * Avoid races between AGP and IOMMU. In theory it's not needed
+ * but I'm not sure if the hardware won't lose flush requests
+ * when another is pending. This whole thing is so expensive anyways
+ * that it doesn't matter to serialize more. -AK
+ */
spin_lock_irqsave(&gart_lock, flags);
flushed = 0;
- for (i = 0; i < amd_nb_num(); i++) {
+ for (i = 0; i < amd_northbridges.num; i++) {
pci_write_config_dword(node_to_amd_nb(i)->misc, 0x9c,
flush_words[i] | 1);
flushed++;
}
- for (i = 0; i < amd_nb_num(); i++) {
+ for (i = 0; i < amd_northbridges.num; i++) {
u32 w;
/* Make sure the hardware actually executed the flush*/
for (;;) {
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 88c657b057e2..5b7e43eff139 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -48,7 +48,6 @@
#include <asm/io_apic.h>
#include <asm/desc.h>
#include <asm/hpet.h>
-#include <asm/idle.h>
#include <asm/mtrr.h>
#include <asm/time.h>
#include <asm/smp.h>
@@ -894,11 +893,13 @@ void __init setup_boot_APIC_clock(void)
/* Setup the lapic or request the broadcast */
setup_APIC_timer();
+ amd_e400_c1e_apic_setup();
}
void setup_secondary_APIC_clock(void)
{
setup_APIC_timer();
+ amd_e400_c1e_apic_setup();
}
/*
@@ -2159,21 +2160,6 @@ int __generic_processor_info(int apicid, int version, bool enabled)
}
/*
- * This can happen on physical hotplug. The sanity check at boot time
- * is done from native_smp_prepare_cpus() after num_possible_cpus() is
- * established.
- */
- if (topology_update_package_map(apicid, cpu) < 0) {
- int thiscpu = max + disabled_cpus;
-
- pr_warning("APIC: Package limit reached. Processor %d/0x%x ignored.\n",
- thiscpu, apicid);
-
- disabled_cpus++;
- return -ENOSPC;
- }
-
- /*
* Validate version
*/
if (version == 0x0) {
@@ -2263,6 +2249,7 @@ void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v))
for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
/* Should happen once for each apic */
WARN_ON((*drv)->eoi_write == eoi_write);
+ (*drv)->native_eoi_write = (*drv)->eoi_write;
(*drv)->eoi_write = eoi_write;
}
}
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 48e6d84f173e..945e512a112a 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -48,7 +48,6 @@
#include <linux/bootmem.h>
#include <asm/irqdomain.h>
-#include <asm/idle.h>
#include <asm/io.h>
#include <asm/smp.h>
#include <asm/cpu.h>
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 51287cd90bf6..643818a7688b 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -906,14 +906,14 @@ static int apm_cpu_idle(struct cpuidle_device *dev,
static int use_apm_idle; /* = 0 */
static unsigned int last_jiffies; /* = 0 */
static unsigned int last_stime; /* = 0 */
- cputime_t stime;
+ cputime_t stime, utime;
int apm_idle_done = 0;
unsigned int jiffies_since_last_check = jiffies - last_jiffies;
unsigned int bucket;
recalc:
- task_cputime(current, NULL, &stime);
+ task_cputime(current, &utime, &stime);
if (jiffies_since_last_check > IDLE_CALC_LIMIT) {
use_apm_idle = 0;
} else if (jiffies_since_last_check > idle_period) {
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 4a8697f7d4ef..52000010c62e 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -20,13 +20,11 @@ obj-y := intel_cacheinfo.o scattered.o topology.o
obj-y += common.o
obj-y += rdrand.o
obj-y += match.o
+obj-y += bugs.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
-obj-$(CONFIG_X86_32) += bugs.o
-obj-$(CONFIG_X86_64) += bugs_64.o
-
obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
obj-$(CONFIG_CPU_SUP_AMD) += amd.o
obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
@@ -34,6 +32,8 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
+obj-$(CONFIG_INTEL_RDT_A) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o
+
obj-$(CONFIG_X86_MCE) += mcheck/
obj-$(CONFIG_MTRR) += mtrr/
obj-$(CONFIG_MICROCODE) += microcode/
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 1e81a37c034e..71cae73a5076 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -20,6 +20,10 @@
#include "cpu.h"
+static const int amd_erratum_383[];
+static const int amd_erratum_400[];
+static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum);
+
/*
* nodes_per_socket: Stores the number of nodes per socket.
* Refer to Fam15h Models 00-0fh BKDG - CPUID Fn8000_001E_ECX
@@ -314,11 +318,30 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
smp_num_siblings = ((ebx >> 8) & 3) + 1;
c->x86_max_cores /= smp_num_siblings;
c->cpu_core_id = ebx & 0xff;
+
+ /*
+ * We may have multiple LLCs if L3 caches exist, so check if we
+ * have an L3 cache by looking at the L3 cache CPUID leaf.
+ */
+ if (cpuid_edx(0x80000006)) {
+ if (c->x86 == 0x17) {
+ /*
+ * LLC is at the core complex level.
+ * Core complex id is ApicId[3].
+ */
+ per_cpu(cpu_llc_id, cpu) = c->apicid >> 3;
+ } else {
+ /* LLC is at the node level. */
+ per_cpu(cpu_llc_id, cpu) = node_id;
+ }
+ }
} else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
u64 value;
rdmsrl(MSR_FAM10H_NODE_ID, value);
node_id = value & 7;
+
+ per_cpu(cpu_llc_id, cpu) = node_id;
} else
return;
@@ -329,9 +352,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_AMD_DCM);
cus_per_node = c->x86_max_cores / nodes_per_socket;
- /* store NodeID, use llc_shared_map to store sibling info */
- per_cpu(cpu_llc_id, cpu) = node_id;
-
/* core id has to be in the [0 .. cores_per_node - 1] range */
c->cpu_core_id %= cus_per_node;
}
@@ -356,15 +376,6 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c)
/* use socket ID also for last level cache */
per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
amd_get_topology(c);
-
- /*
- * Fix percpu cpu_llc_id here as LLC topology is different
- * for Fam17h systems.
- */
- if (c->x86 != 0x17 || !cpuid_edx(0x80000006))
- return;
-
- per_cpu(cpu_llc_id, cpu) = c->apicid >> 3;
#endif
}
@@ -585,11 +596,16 @@ static void early_init_amd(struct cpuinfo_x86 *c)
/* F16h erratum 793, CVE-2013-6885 */
if (c->x86 == 0x16 && c->x86_model <= 0xf)
msr_set_bit(MSR_AMD64_LS_CFG, 15);
-}
-static const int amd_erratum_383[];
-static const int amd_erratum_400[];
-static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum);
+ /*
+ * Check whether the machine is affected by erratum 400. This is
+ * used to select the proper idle routine and to enable the check
+ * whether the machine is affected in arch_post_acpi_init(), which
+ * sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check.
+ */
+ if (cpu_has_amd_erratum(c, amd_erratum_400))
+ set_cpu_bug(c, X86_BUG_AMD_E400);
+}
static void init_amd_k8(struct cpuinfo_x86 *c)
{
@@ -770,9 +786,6 @@ static void init_amd(struct cpuinfo_x86 *c)
if (c->x86 > 0x11)
set_cpu_cap(c, X86_FEATURE_ARAT);
- if (cpu_has_amd_erratum(c, amd_erratum_400))
- set_cpu_bug(c, X86_BUG_AMD_APIC_C1E);
-
rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
/* 3DNow or LM implies PREFETCHW */
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index bd17db15a2c1..a44ef52184df 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -16,15 +16,19 @@
#include <asm/msr.h>
#include <asm/paravirt.h>
#include <asm/alternative.h>
+#include <asm/pgtable.h>
+#include <asm/cacheflush.h>
void __init check_bugs(void)
{
identify_boot_cpu();
-#ifndef CONFIG_SMP
- pr_info("CPU: ");
- print_cpu_info(&boot_cpu_data);
-#endif
+ if (!IS_ENABLED(CONFIG_SMP)) {
+ pr_info("CPU: ");
+ print_cpu_info(&boot_cpu_data);
+ }
+
+#ifdef CONFIG_X86_32
/*
* Check whether we are able to run this kernel safely on SMP.
*
@@ -40,4 +44,18 @@ void __init check_bugs(void)
alternative_instructions();
fpu__init_check_bugs();
+#else /* CONFIG_X86_64 */
+ alternative_instructions();
+
+ /*
+ * Make sure the first 2MB area is not mapped by huge pages
+ * There are typically fixed size MTRRs in there and overlapping
+ * MTRRs into large pages causes slow downs.
+ *
+ * Right now we don't do that with gbpages because there seems
+ * very little benefit for that case.
+ */
+ if (!direct_gbpages)
+ set_memory_4k((unsigned long)__va(0), 1);
+#endif
}
diff --git a/arch/x86/kernel/cpu/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c
deleted file mode 100644
index a972ac4c7e7d..000000000000
--- a/arch/x86/kernel/cpu/bugs_64.c
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (C) 1994 Linus Torvalds
- * Copyright (C) 2000 SuSE
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <asm/alternative.h>
-#include <asm/bugs.h>
-#include <asm/processor.h>
-#include <asm/mtrr.h>
-#include <asm/cacheflush.h>
-
-void __init check_bugs(void)
-{
- identify_boot_cpu();
-#if !defined(CONFIG_SMP)
- pr_info("CPU: ");
- print_cpu_info(&boot_cpu_data);
-#endif
- alternative_instructions();
-
- /*
- * Make sure the first 2MB area is not mapped by huge pages
- * There are typically fixed size MTRRs in there and overlapping
- * MTRRs into large pages causes slow downs.
- *
- * Right now we don't do that with gbpages because there seems
- * very little benefit for that case.
- */
- if (!direct_gbpages)
- set_memory_4k((unsigned long)__va(0), 1);
-}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cc9e980c68ec..dc1697ca5191 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -667,13 +667,14 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
c->x86_capability[CPUID_1_EDX] = edx;
}
+ /* Thermal and Power Management Leaf: level 0x00000006 (eax) */
+ if (c->cpuid_level >= 0x00000006)
+ c->x86_capability[CPUID_6_EAX] = cpuid_eax(0x00000006);
+
/* Additional Intel-defined flags: level 0x00000007 */
if (c->cpuid_level >= 0x00000007) {
cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
-
c->x86_capability[CPUID_7_0_EBX] = ebx;
-
- c->x86_capability[CPUID_6_EAX] = cpuid_eax(0x00000006);
c->x86_capability[CPUID_7_ECX] = ecx;
}
@@ -979,29 +980,21 @@ static void x86_init_cache_qos(struct cpuinfo_x86 *c)
}
/*
- * The physical to logical package id mapping is initialized from the
- * acpi/mptables information. Make sure that CPUID actually agrees with
- * that.
+ * Validate that ACPI/mptables have the same information about the
+ * effective APIC id and update the package map.
*/
-static void sanitize_package_id(struct cpuinfo_x86 *c)
+static void validate_apic_and_package_id(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
- unsigned int pkg, apicid, cpu = smp_processor_id();
+ unsigned int apicid, cpu = smp_processor_id();
apicid = apic->cpu_present_to_apicid(cpu);
- pkg = apicid >> boot_cpu_data.x86_coreid_bits;
- if (apicid != c->initial_apicid) {
- pr_err(FW_BUG "CPU%u: APIC id mismatch. Firmware: %x CPUID: %x\n",
+ if (apicid != c->apicid) {
+ pr_err(FW_BUG "CPU%u: APIC id mismatch. Firmware: %x APIC: %x\n",
cpu, apicid, c->initial_apicid);
- c->initial_apicid = apicid;
- }
- if (pkg != c->phys_proc_id) {
- pr_err(FW_BUG "CPU%u: Using firmware package id %u instead of %u\n",
- cpu, pkg, c->phys_proc_id);
- c->phys_proc_id = pkg;
}
- c->logical_proc_id = topology_phys_to_logical_pkg(pkg);
+ BUG_ON(topology_update_package_map(c->phys_proc_id, cpu));
#else
c->logical_proc_id = 0;
#endif
@@ -1132,7 +1125,6 @@ static void identify_cpu(struct cpuinfo_x86 *c)
#ifdef CONFIG_NUMA
numa_add_cpu(smp_processor_id());
#endif
- sanitize_package_id(c);
}
/*
@@ -1172,7 +1164,6 @@ void enable_sep_cpu(void)
void __init identify_boot_cpu(void)
{
identify_cpu(&boot_cpu_data);
- init_amd_e400_c1e_mask();
#ifdef CONFIG_X86_32
sysenter_setup();
enable_sep_cpu();
@@ -1188,53 +1179,9 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c)
enable_sep_cpu();
#endif
mtrr_ap_init();
+ validate_apic_and_package_id(c);
}
-struct msr_range {
- unsigned min;
- unsigned max;
-};
-
-static const struct msr_range msr_range_array[] = {
- { 0x00000000, 0x00000418},
- { 0xc0000000, 0xc000040b},
- { 0xc0010000, 0xc0010142},
- { 0xc0011000, 0xc001103b},
-};
-
-static void __print_cpu_msr(void)
-{
- unsigned index_min, index_max;
- unsigned index;
- u64 val;
- int i;
-
- for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
- index_min = msr_range_array[i].min;
- index_max = msr_range_array[i].max;
-
- for (index = index_min; index < index_max; index++) {
- if (rdmsrl_safe(index, &val))
- continue;
- pr_info(" MSR%08x: %016llx\n", index, val);
- }
- }
-}
-
-static int show_msr;
-
-static __init int setup_show_msr(char *arg)
-{
- int num;
-
- get_option(&arg, &num);
-
- if (num > 0)
- show_msr = num;
- return 1;
-}
-__setup("show_msr=", setup_show_msr);
-
static __init int setup_noclflush(char *arg)
{
setup_clear_cpu_cap(X86_FEATURE_CLFLUSH);
@@ -1268,14 +1215,6 @@ void print_cpu_info(struct cpuinfo_x86 *c)
pr_cont(", stepping: 0x%x)\n", c->x86_mask);
else
pr_cont(")\n");
-
- print_cpu_msr(c);
-}
-
-void print_cpu_msr(struct cpuinfo_x86 *c)
-{
- if (c->cpu_index < show_msr)
- __print_cpu_msr();
}
static __init int setup_disablecpuid(char *arg)
@@ -1490,11 +1429,8 @@ void cpu_init(void)
*/
cr4_init_shadow();
- /*
- * Load microcode on this cpu if a valid microcode is available.
- * This is early microcode loading procedure.
- */
- load_ucode_ap();
+ if (cpu)
+ load_ucode_ap();
t = &per_cpu(cpu_tss, cpu);
oist = &per_cpu(orig_ist, cpu);
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index de6626c18e42..0282b0df004a 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -153,6 +153,7 @@ struct _cpuid4_info_regs {
union _cpuid4_leaf_eax eax;
union _cpuid4_leaf_ebx ebx;
union _cpuid4_leaf_ecx ecx;
+ unsigned int id;
unsigned long size;
struct amd_northbridge *nb;
};
@@ -894,6 +895,8 @@ static void __cache_cpumap_setup(unsigned int cpu, int index,
static void ci_leaf_init(struct cacheinfo *this_leaf,
struct _cpuid4_info_regs *base)
{
+ this_leaf->id = base->id;
+ this_leaf->attributes = CACHE_ID;
this_leaf->level = base->eax.split.level;
this_leaf->type = cache_type_map[base->eax.split.type];
this_leaf->coherency_line_size =
@@ -920,6 +923,22 @@ static int __init_cache_level(unsigned int cpu)
return 0;
}
+/*
+ * The max shared threads number comes from CPUID.4:EAX[25-14] with input
+ * ECX as cache index. Then right shift apicid by the number's order to get
+ * cache id for this cache node.
+ */
+static void get_cache_id(int cpu, struct _cpuid4_info_regs *id4_regs)
+{
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ unsigned long num_threads_sharing;
+ int index_msb;
+
+ num_threads_sharing = 1 + id4_regs->eax.split.num_threads_sharing;
+ index_msb = get_count_order(num_threads_sharing);
+ id4_regs->id = c->apicid >> index_msb;
+}
+
static int __populate_cache_leaves(unsigned int cpu)
{
unsigned int idx, ret;
@@ -931,9 +950,12 @@ static int __populate_cache_leaves(unsigned int cpu)
ret = cpuid4_cache_lookup_regs(idx, &id4_regs);
if (ret)
return ret;
+ get_cache_id(cpu, &id4_regs);
ci_leaf_init(this_leaf++, &id4_regs);
__cache_cpumap_setup(cpu, idx, &id4_regs);
}
+ this_cpu_ci->cpu_map_populated = true;
+
return 0;
}
diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c
new file mode 100644
index 000000000000..5a533fefefa0
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -0,0 +1,403 @@
+/*
+ * Resource Director Technology(RDT)
+ * - Cache Allocation code.
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Authors:
+ * Fenghua Yu <fenghua.yu@intel.com>
+ * Tony Luck <tony.luck@intel.com>
+ * Vikas Shivappa <vikas.shivappa@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * More information about RDT be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/cacheinfo.h>
+#include <linux/cpuhotplug.h>
+
+#include <asm/intel-family.h>
+#include <asm/intel_rdt.h>
+
+/* Mutex to protect rdtgroup access. */
+DEFINE_MUTEX(rdtgroup_mutex);
+
+DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid);
+
+#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains)
+
+struct rdt_resource rdt_resources_all[] = {
+ {
+ .name = "L3",
+ .domains = domain_init(RDT_RESOURCE_L3),
+ .msr_base = IA32_L3_CBM_BASE,
+ .min_cbm_bits = 1,
+ .cache_level = 3,
+ .cbm_idx_multi = 1,
+ .cbm_idx_offset = 0
+ },
+ {
+ .name = "L3DATA",
+ .domains = domain_init(RDT_RESOURCE_L3DATA),
+ .msr_base = IA32_L3_CBM_BASE,
+ .min_cbm_bits = 1,
+ .cache_level = 3,
+ .cbm_idx_multi = 2,
+ .cbm_idx_offset = 0
+ },
+ {
+ .name = "L3CODE",
+ .domains = domain_init(RDT_RESOURCE_L3CODE),
+ .msr_base = IA32_L3_CBM_BASE,
+ .min_cbm_bits = 1,
+ .cache_level = 3,
+ .cbm_idx_multi = 2,
+ .cbm_idx_offset = 1
+ },
+ {
+ .name = "L2",
+ .domains = domain_init(RDT_RESOURCE_L2),
+ .msr_base = IA32_L2_CBM_BASE,
+ .min_cbm_bits = 1,
+ .cache_level = 2,
+ .cbm_idx_multi = 1,
+ .cbm_idx_offset = 0
+ },
+};
+
+static int cbm_idx(struct rdt_resource *r, int closid)
+{
+ return closid * r->cbm_idx_multi + r->cbm_idx_offset;
+}
+
+/*
+ * cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs
+ * as they do not have CPUID enumeration support for Cache allocation.
+ * The check for Vendor/Family/Model is not enough to guarantee that
+ * the MSRs won't #GP fault because only the following SKUs support
+ * CAT:
+ * Intel(R) Xeon(R) CPU E5-2658 v3 @ 2.20GHz
+ * Intel(R) Xeon(R) CPU E5-2648L v3 @ 1.80GHz
+ * Intel(R) Xeon(R) CPU E5-2628L v3 @ 2.00GHz
+ * Intel(R) Xeon(R) CPU E5-2618L v3 @ 2.30GHz
+ * Intel(R) Xeon(R) CPU E5-2608L v3 @ 2.00GHz
+ * Intel(R) Xeon(R) CPU E5-2658A v3 @ 2.20GHz
+ *
+ * Probe by trying to write the first of the L3 cach mask registers
+ * and checking that the bits stick. Max CLOSids is always 4 and max cbm length
+ * is always 20 on hsw server parts. The minimum cache bitmask length
+ * allowed for HSW server is always 2 bits. Hardcode all of them.
+ */
+static inline bool cache_alloc_hsw_probe(void)
+{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
+ boot_cpu_data.x86 == 6 &&
+ boot_cpu_data.x86_model == INTEL_FAM6_HASWELL_X) {
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
+ u32 l, h, max_cbm = BIT_MASK(20) - 1;
+
+ if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0))
+ return false;
+ rdmsr(IA32_L3_CBM_BASE, l, h);
+
+ /* If all the bits were set in MSR, return success */
+ if (l != max_cbm)
+ return false;
+
+ r->num_closid = 4;
+ r->cbm_len = 20;
+ r->max_cbm = max_cbm;
+ r->min_cbm_bits = 2;
+ r->capable = true;
+ r->enabled = true;
+
+ return true;
+ }
+
+ return false;
+}
+
+static void rdt_get_config(int idx, struct rdt_resource *r)
+{
+ union cpuid_0x10_1_eax eax;
+ union cpuid_0x10_1_edx edx;
+ u32 ebx, ecx;
+
+ cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx, &edx.full);
+ r->num_closid = edx.split.cos_max + 1;
+ r->cbm_len = eax.split.cbm_len + 1;
+ r->max_cbm = BIT_MASK(eax.split.cbm_len + 1) - 1;
+ r->capable = true;
+ r->enabled = true;
+}
+
+static void rdt_get_cdp_l3_config(int type)
+{
+ struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3];
+ struct rdt_resource *r = &rdt_resources_all[type];
+
+ r->num_closid = r_l3->num_closid / 2;
+ r->cbm_len = r_l3->cbm_len;
+ r->max_cbm = r_l3->max_cbm;
+ r->capable = true;
+ /*
+ * By default, CDP is disabled. CDP can be enabled by mount parameter
+ * "cdp" during resctrl file system mount time.
+ */
+ r->enabled = false;
+}
+
+static inline bool get_rdt_resources(void)
+{
+ bool ret = false;
+
+ if (cache_alloc_hsw_probe())
+ return true;
+
+ if (!boot_cpu_has(X86_FEATURE_RDT_A))
+ return false;
+
+ if (boot_cpu_has(X86_FEATURE_CAT_L3)) {
+ rdt_get_config(1, &rdt_resources_all[RDT_RESOURCE_L3]);
+ if (boot_cpu_has(X86_FEATURE_CDP_L3)) {
+ rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA);
+ rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE);
+ }
+ ret = true;
+ }
+ if (boot_cpu_has(X86_FEATURE_CAT_L2)) {
+ /* CPUID 0x10.2 fields are same format at 0x10.1 */
+ rdt_get_config(2, &rdt_resources_all[RDT_RESOURCE_L2]);
+ ret = true;
+ }
+
+ return ret;
+}
+
+static int get_cache_id(int cpu, int level)
+{
+ struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu);
+ int i;
+
+ for (i = 0; i < ci->num_leaves; i++) {
+ if (ci->info_list[i].level == level)
+ return ci->info_list[i].id;
+ }
+
+ return -1;
+}
+
+void rdt_cbm_update(void *arg)
+{
+ struct msr_param *m = (struct msr_param *)arg;
+ struct rdt_resource *r = m->res;
+ int i, cpu = smp_processor_id();
+ struct rdt_domain *d;
+
+ list_for_each_entry(d, &r->domains, list) {
+ /* Find the domain that contains this CPU */
+ if (cpumask_test_cpu(cpu, &d->cpu_mask))
+ goto found;
+ }
+ pr_info_once("cpu %d not found in any domain for resource %s\n",
+ cpu, r->name);
+
+ return;
+
+found:
+ for (i = m->low; i < m->high; i++) {
+ int idx = cbm_idx(r, i);
+
+ wrmsrl(r->msr_base + idx, d->cbm[i]);
+ }
+}
+
+/*
+ * rdt_find_domain - Find a domain in a resource that matches input resource id
+ *
+ * Search resource r's domain list to find the resource id. If the resource
+ * id is found in a domain, return the domain. Otherwise, if requested by
+ * caller, return the first domain whose id is bigger than the input id.
+ * The domain list is sorted by id in ascending order.
+ */
+static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
+ struct list_head **pos)
+{
+ struct rdt_domain *d;
+ struct list_head *l;
+
+ if (id < 0)
+ return ERR_PTR(id);
+
+ list_for_each(l, &r->domains) {
+ d = list_entry(l, struct rdt_domain, list);
+ /* When id is found, return its domain. */
+ if (id == d->id)
+ return d;
+ /* Stop searching when finding id's position in sorted list. */
+ if (id < d->id)
+ break;
+ }
+
+ if (pos)
+ *pos = l;
+
+ return NULL;
+}
+
+/*
+ * domain_add_cpu - Add a cpu to a resource's domain list.
+ *
+ * If an existing domain in the resource r's domain list matches the cpu's
+ * resource id, add the cpu in the domain.
+ *
+ * Otherwise, a new domain is allocated and inserted into the right position
+ * in the domain list sorted by id in ascending order.
+ *
+ * The order in the domain list is visible to users when we print entries
+ * in the schemata file and schemata input is validated to have the same order
+ * as this list.
+ */
+static void domain_add_cpu(int cpu, struct rdt_resource *r)
+{
+ int i, id = get_cache_id(cpu, r->cache_level);
+ struct list_head *add_pos = NULL;
+ struct rdt_domain *d;
+
+ d = rdt_find_domain(r, id, &add_pos);
+ if (IS_ERR(d)) {
+ pr_warn("Could't find cache id for cpu %d\n", cpu);
+ return;
+ }
+
+ if (d) {
+ cpumask_set_cpu(cpu, &d->cpu_mask);
+ return;
+ }
+
+ d = kzalloc_node(sizeof(*d), GFP_KERNEL, cpu_to_node(cpu));
+ if (!d)
+ return;
+
+ d->id = id;
+
+ d->cbm = kmalloc_array(r->num_closid, sizeof(*d->cbm), GFP_KERNEL);
+ if (!d->cbm) {
+ kfree(d);
+ return;
+ }
+
+ for (i = 0; i < r->num_closid; i++) {
+ int idx = cbm_idx(r, i);
+
+ d->cbm[i] = r->max_cbm;
+ wrmsrl(r->msr_base + idx, d->cbm[i]);
+ }
+
+ cpumask_set_cpu(cpu, &d->cpu_mask);
+ list_add_tail(&d->list, add_pos);
+ r->num_domains++;
+}
+
+static void domain_remove_cpu(int cpu, struct rdt_resource *r)
+{
+ int id = get_cache_id(cpu, r->cache_level);
+ struct rdt_domain *d;
+
+ d = rdt_find_domain(r, id, NULL);
+ if (IS_ERR_OR_NULL(d)) {
+ pr_warn("Could't find cache id for cpu %d\n", cpu);
+ return;
+ }
+
+ cpumask_clear_cpu(cpu, &d->cpu_mask);
+ if (cpumask_empty(&d->cpu_mask)) {
+ r->num_domains--;
+ kfree(d->cbm);
+ list_del(&d->list);
+ kfree(d);
+ }
+}
+
+static void clear_closid(int cpu)
+{
+ struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+
+ per_cpu(cpu_closid, cpu) = 0;
+ state->closid = 0;
+ wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, 0);
+}
+
+static int intel_rdt_online_cpu(unsigned int cpu)
+{
+ struct rdt_resource *r;
+
+ mutex_lock(&rdtgroup_mutex);
+ for_each_capable_rdt_resource(r)
+ domain_add_cpu(cpu, r);
+ /* The cpu is set in default rdtgroup after online. */
+ cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask);
+ clear_closid(cpu);
+ mutex_unlock(&rdtgroup_mutex);
+
+ return 0;
+}
+
+static int intel_rdt_offline_cpu(unsigned int cpu)
+{
+ struct rdtgroup *rdtgrp;
+ struct rdt_resource *r;
+
+ mutex_lock(&rdtgroup_mutex);
+ for_each_capable_rdt_resource(r)
+ domain_remove_cpu(cpu, r);
+ list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
+ if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask))
+ break;
+ }
+ clear_closid(cpu);
+ mutex_unlock(&rdtgroup_mutex);
+
+ return 0;
+}
+
+static int __init intel_rdt_late_init(void)
+{
+ struct rdt_resource *r;
+ int state, ret;
+
+ if (!get_rdt_resources())
+ return -ENODEV;
+
+ state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+ "x86/rdt/cat:online:",
+ intel_rdt_online_cpu, intel_rdt_offline_cpu);
+ if (state < 0)
+ return state;
+
+ ret = rdtgroup_init();
+ if (ret) {
+ cpuhp_remove_state(state);
+ return ret;
+ }
+
+ for_each_capable_rdt_resource(r)
+ pr_info("Intel RDT %s allocation detected\n", r->name);
+
+ return 0;
+}
+
+late_initcall(intel_rdt_late_init);
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
new file mode 100644
index 000000000000..8af04afdfcb9
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -0,0 +1,1115 @@
+/*
+ * User interface for Resource Alloction in Resource Director Technology(RDT)
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Author: Fenghua Yu <fenghua.yu@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * More information about RDT be found in the Intel (R) x86 Architecture
+ * Software Developer Manual.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cpu.h>
+#include <linux/fs.h>
+#include <linux/sysfs.h>
+#include <linux/kernfs.h>
+#include <linux/seq_file.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/cpu.h>
+#include <linux/task_work.h>
+
+#include <uapi/linux/magic.h>
+
+#include <asm/intel_rdt.h>
+#include <asm/intel_rdt_common.h>
+
+DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
+struct kernfs_root *rdt_root;
+struct rdtgroup rdtgroup_default;
+LIST_HEAD(rdt_all_groups);
+
+/* Kernel fs node for "info" directory under root */
+static struct kernfs_node *kn_info;
+
+/*
+ * Trivial allocator for CLOSIDs. Since h/w only supports a small number,
+ * we can keep a bitmap of free CLOSIDs in a single integer.
+ *
+ * Using a global CLOSID across all resources has some advantages and
+ * some drawbacks:
+ * + We can simply set "current->closid" to assign a task to a resource
+ * group.
+ * + Context switch code can avoid extra memory references deciding which
+ * CLOSID to load into the PQR_ASSOC MSR
+ * - We give up some options in configuring resource groups across multi-socket
+ * systems.
+ * - Our choices on how to configure each resource become progressively more
+ * limited as the number of resources grows.
+ */
+static int closid_free_map;
+
+static void closid_init(void)
+{
+ struct rdt_resource *r;
+ int rdt_min_closid = 32;
+
+ /* Compute rdt_min_closid across all resources */
+ for_each_enabled_rdt_resource(r)
+ rdt_min_closid = min(rdt_min_closid, r->num_closid);
+
+ closid_free_map = BIT_MASK(rdt_min_closid) - 1;
+
+ /* CLOSID 0 is always reserved for the default group */
+ closid_free_map &= ~1;
+}
+
+int closid_alloc(void)
+{
+ int closid = ffs(closid_free_map);
+
+ if (closid == 0)
+ return -ENOSPC;
+ closid--;
+ closid_free_map &= ~(1 << closid);
+
+ return closid;
+}
+
+static void closid_free(int closid)
+{
+ closid_free_map |= 1 << closid;
+}
+
+/* set uid and gid of rdtgroup dirs and files to that of the creator */
+static int rdtgroup_kn_set_ugid(struct kernfs_node *kn)
+{
+ struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
+ .ia_uid = current_fsuid(),
+ .ia_gid = current_fsgid(), };
+
+ if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
+ gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
+ return 0;
+
+ return kernfs_setattr(kn, &iattr);
+}
+
+static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft)
+{
+ struct kernfs_node *kn;
+ int ret;
+
+ kn = __kernfs_create_file(parent_kn, rft->name, rft->mode,
+ 0, rft->kf_ops, rft, NULL, NULL);
+ if (IS_ERR(kn))
+ return PTR_ERR(kn);
+
+ ret = rdtgroup_kn_set_ugid(kn);
+ if (ret) {
+ kernfs_remove(kn);
+ return ret;
+ }
+
+ return 0;
+}
+
+static int rdtgroup_add_files(struct kernfs_node *kn, struct rftype *rfts,
+ int len)
+{
+ struct rftype *rft;
+ int ret;
+
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ for (rft = rfts; rft < rfts + len; rft++) {
+ ret = rdtgroup_add_file(kn, rft);
+ if (ret)
+ goto error;
+ }
+
+ return 0;
+error:
+ pr_warn("Failed to add %s, err=%d\n", rft->name, ret);
+ while (--rft >= rfts)
+ kernfs_remove_by_name(kn, rft->name);
+ return ret;
+}
+
+static int rdtgroup_seqfile_show(struct seq_file *m, void *arg)
+{
+ struct kernfs_open_file *of = m->private;
+ struct rftype *rft = of->kn->priv;
+
+ if (rft->seq_show)
+ return rft->seq_show(of, m, arg);
+ return 0;
+}
+
+static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct rftype *rft = of->kn->priv;
+
+ if (rft->write)
+ return rft->write(of, buf, nbytes, off);
+
+ return -EINVAL;
+}
+
+static struct kernfs_ops rdtgroup_kf_single_ops = {
+ .atomic_write_len = PAGE_SIZE,
+ .write = rdtgroup_file_write,
+ .seq_show = rdtgroup_seqfile_show,
+};
+
+static int rdtgroup_cpus_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct rdtgroup *rdtgrp;
+ int ret = 0;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+
+ if (rdtgrp)
+ seq_printf(s, "%*pb\n", cpumask_pr_args(&rdtgrp->cpu_mask));
+ else
+ ret = -ENOENT;
+ rdtgroup_kn_unlock(of->kn);
+
+ return ret;
+}
+
+/*
+ * This is safe against intel_rdt_sched_in() called from __switch_to()
+ * because __switch_to() is executed with interrupts disabled. A local call
+ * from rdt_update_closid() is proteced against __switch_to() because
+ * preemption is disabled.
+ */
+static void rdt_update_cpu_closid(void *closid)
+{
+ if (closid)
+ this_cpu_write(cpu_closid, *(int *)closid);
+ /*
+ * We cannot unconditionally write the MSR because the current
+ * executing task might have its own closid selected. Just reuse
+ * the context switch code.
+ */
+ intel_rdt_sched_in();
+}
+
+/*
+ * Update the PGR_ASSOC MSR on all cpus in @cpu_mask,
+ *
+ * Per task closids must have been set up before calling this function.
+ *
+ * The per cpu closids are updated with the smp function call, when @closid
+ * is not NULL. If @closid is NULL then all affected percpu closids must
+ * have been set up before calling this function.
+ */
+static void
+rdt_update_closid(const struct cpumask *cpu_mask, int *closid)
+{
+ int cpu = get_cpu();
+
+ if (cpumask_test_cpu(cpu, cpu_mask))
+ rdt_update_cpu_closid(closid);
+ smp_call_function_many(cpu_mask, rdt_update_cpu_closid, closid, 1);
+ put_cpu();
+}
+
+static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ cpumask_var_t tmpmask, newmask;
+ struct rdtgroup *rdtgrp, *r;
+ int ret;
+
+ if (!buf)
+ return -EINVAL;
+
+ if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ return -ENOMEM;
+ if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) {
+ free_cpumask_var(tmpmask);
+ return -ENOMEM;
+ }
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (!rdtgrp) {
+ ret = -ENOENT;
+ goto unlock;
+ }
+
+ ret = cpumask_parse(buf, newmask);
+ if (ret)
+ goto unlock;
+
+ /* check that user didn't specify any offline cpus */
+ cpumask_andnot(tmpmask, newmask, cpu_online_mask);
+ if (cpumask_weight(tmpmask)) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ /* Check whether cpus are dropped from this group */
+ cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
+ if (cpumask_weight(tmpmask)) {
+ /* Can't drop from default group */
+ if (rdtgrp == &rdtgroup_default) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+ /* Give any dropped cpus to rdtgroup_default */
+ cpumask_or(&rdtgroup_default.cpu_mask,
+ &rdtgroup_default.cpu_mask, tmpmask);
+ rdt_update_closid(tmpmask, &rdtgroup_default.closid);
+ }
+
+ /*
+ * If we added cpus, remove them from previous group that owned them
+ * and update per-cpu closid
+ */
+ cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
+ if (cpumask_weight(tmpmask)) {
+ list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
+ if (r == rdtgrp)
+ continue;
+ cpumask_andnot(&r->cpu_mask, &r->cpu_mask, tmpmask);
+ }
+ rdt_update_closid(tmpmask, &rdtgrp->closid);
+ }
+
+ /* Done pushing/pulling - update this group with new mask */
+ cpumask_copy(&rdtgrp->cpu_mask, newmask);
+
+unlock:
+ rdtgroup_kn_unlock(of->kn);
+ free_cpumask_var(tmpmask);
+ free_cpumask_var(newmask);
+
+ return ret ?: nbytes;
+}
+
+struct task_move_callback {
+ struct callback_head work;
+ struct rdtgroup *rdtgrp;
+};
+
+static void move_myself(struct callback_head *head)
+{
+ struct task_move_callback *callback;
+ struct rdtgroup *rdtgrp;
+
+ callback = container_of(head, struct task_move_callback, work);
+ rdtgrp = callback->rdtgrp;
+
+ /*
+ * If resource group was deleted before this task work callback
+ * was invoked, then assign the task to root group and free the
+ * resource group.
+ */
+ if (atomic_dec_and_test(&rdtgrp->waitcount) &&
+ (rdtgrp->flags & RDT_DELETED)) {
+ current->closid = 0;
+ kfree(rdtgrp);
+ }
+
+ preempt_disable();
+ /* update PQR_ASSOC MSR to make resource group go into effect */
+ intel_rdt_sched_in();
+ preempt_enable();
+
+ kfree(callback);
+}
+
+static int __rdtgroup_move_task(struct task_struct *tsk,
+ struct rdtgroup *rdtgrp)
+{
+ struct task_move_callback *callback;
+ int ret;
+
+ callback = kzalloc(sizeof(*callback), GFP_KERNEL);
+ if (!callback)
+ return -ENOMEM;
+ callback->work.func = move_myself;
+ callback->rdtgrp = rdtgrp;
+
+ /*
+ * Take a refcount, so rdtgrp cannot be freed before the
+ * callback has been invoked.
+ */
+ atomic_inc(&rdtgrp->waitcount);
+ ret = task_work_add(tsk, &callback->work, true);
+ if (ret) {
+ /*
+ * Task is exiting. Drop the refcount and free the callback.
+ * No need to check the refcount as the group cannot be
+ * deleted before the write function unlocks rdtgroup_mutex.
+ */
+ atomic_dec(&rdtgrp->waitcount);
+ kfree(callback);
+ } else {
+ tsk->closid = rdtgrp->closid;
+ }
+ return ret;
+}
+
+static int rdtgroup_task_write_permission(struct task_struct *task,
+ struct kernfs_open_file *of)
+{
+ const struct cred *tcred = get_task_cred(task);
+ const struct cred *cred = current_cred();
+ int ret = 0;
+
+ /*
+ * Even if we're attaching all tasks in the thread group, we only
+ * need to check permissions on one of them.
+ */
+ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
+ !uid_eq(cred->euid, tcred->uid) &&
+ !uid_eq(cred->euid, tcred->suid))
+ ret = -EPERM;
+
+ put_cred(tcred);
+ return ret;
+}
+
+static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp,
+ struct kernfs_open_file *of)
+{
+ struct task_struct *tsk;
+ int ret;
+
+ rcu_read_lock();
+ if (pid) {
+ tsk = find_task_by_vpid(pid);
+ if (!tsk) {
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+ } else {
+ tsk = current;
+ }
+
+ get_task_struct(tsk);
+ rcu_read_unlock();
+
+ ret = rdtgroup_task_write_permission(tsk, of);
+ if (!ret)
+ ret = __rdtgroup_move_task(tsk, rdtgrp);
+
+ put_task_struct(tsk);
+ return ret;
+}
+
+static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct rdtgroup *rdtgrp;
+ int ret = 0;
+ pid_t pid;
+
+ if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
+ return -EINVAL;
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+
+ if (rdtgrp)
+ ret = rdtgroup_move_task(pid, rdtgrp, of);
+ else
+ ret = -ENOENT;
+
+ rdtgroup_kn_unlock(of->kn);
+
+ return ret ?: nbytes;
+}
+
+static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s)
+{
+ struct task_struct *p, *t;
+
+ rcu_read_lock();
+ for_each_process_thread(p, t) {
+ if (t->closid == r->closid)
+ seq_printf(s, "%d\n", t->pid);
+ }
+ rcu_read_unlock();
+}
+
+static int rdtgroup_tasks_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct rdtgroup *rdtgrp;
+ int ret = 0;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (rdtgrp)
+ show_rdt_tasks(rdtgrp, s);
+ else
+ ret = -ENOENT;
+ rdtgroup_kn_unlock(of->kn);
+
+ return ret;
+}
+
+/* Files in each rdtgroup */
+static struct rftype rdtgroup_base_files[] = {
+ {
+ .name = "cpus",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_cpus_write,
+ .seq_show = rdtgroup_cpus_show,
+ },
+ {
+ .name = "tasks",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_tasks_write,
+ .seq_show = rdtgroup_tasks_show,
+ },
+ {
+ .name = "schemata",
+ .mode = 0644,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .write = rdtgroup_schemata_write,
+ .seq_show = rdtgroup_schemata_show,
+ },
+};
+
+static int rdt_num_closids_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ seq_printf(seq, "%d\n", r->num_closid);
+
+ return 0;
+}
+
+static int rdt_cbm_mask_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ seq_printf(seq, "%x\n", r->max_cbm);
+
+ return 0;
+}
+
+static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct rdt_resource *r = of->kn->parent->priv;
+
+ seq_printf(seq, "%d\n", r->min_cbm_bits);
+
+ return 0;
+}
+
+/* rdtgroup information files for one cache resource. */
+static struct rftype res_info_files[] = {
+ {
+ .name = "num_closids",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_num_closids_show,
+ },
+ {
+ .name = "cbm_mask",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_cbm_mask_show,
+ },
+ {
+ .name = "min_cbm_bits",
+ .mode = 0444,
+ .kf_ops = &rdtgroup_kf_single_ops,
+ .seq_show = rdt_min_cbm_bits_show,
+ },
+};
+
+static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
+{
+ struct kernfs_node *kn_subdir;
+ struct rdt_resource *r;
+ int ret;
+
+ /* create the directory */
+ kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
+ if (IS_ERR(kn_info))
+ return PTR_ERR(kn_info);
+ kernfs_get(kn_info);
+
+ for_each_enabled_rdt_resource(r) {
+ kn_subdir = kernfs_create_dir(kn_info, r->name,
+ kn_info->mode, r);
+ if (IS_ERR(kn_subdir)) {
+ ret = PTR_ERR(kn_subdir);
+ goto out_destroy;
+ }
+ kernfs_get(kn_subdir);
+ ret = rdtgroup_kn_set_ugid(kn_subdir);
+ if (ret)
+ goto out_destroy;
+ ret = rdtgroup_add_files(kn_subdir, res_info_files,
+ ARRAY_SIZE(res_info_files));
+ if (ret)
+ goto out_destroy;
+ kernfs_activate(kn_subdir);
+ }
+
+ /*
+ * This extra ref will be put in kernfs_remove() and guarantees
+ * that @rdtgrp->kn is always accessible.
+ */
+ kernfs_get(kn_info);
+
+ ret = rdtgroup_kn_set_ugid(kn_info);
+ if (ret)
+ goto out_destroy;
+
+ kernfs_activate(kn_info);
+
+ return 0;
+
+out_destroy:
+ kernfs_remove(kn_info);
+ return ret;
+}
+
+static void l3_qos_cfg_update(void *arg)
+{
+ bool *enable = arg;
+
+ wrmsrl(IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL);
+}
+
+static int set_l3_qos_cfg(struct rdt_resource *r, bool enable)
+{
+ cpumask_var_t cpu_mask;
+ struct rdt_domain *d;
+ int cpu;
+
+ if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ list_for_each_entry(d, &r->domains, list) {
+ /* Pick one CPU from each domain instance to update MSR */
+ cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
+ }
+ cpu = get_cpu();
+ /* Update QOS_CFG MSR on this cpu if it's in cpu_mask. */
+ if (cpumask_test_cpu(cpu, cpu_mask))
+ l3_qos_cfg_update(&enable);
+ /* Update QOS_CFG MSR on all other cpus in cpu_mask. */
+ smp_call_function_many(cpu_mask, l3_qos_cfg_update, &enable, 1);
+ put_cpu();
+
+ free_cpumask_var(cpu_mask);
+
+ return 0;
+}
+
+static int cdp_enable(void)
+{
+ struct rdt_resource *r_l3data = &rdt_resources_all[RDT_RESOURCE_L3DATA];
+ struct rdt_resource *r_l3code = &rdt_resources_all[RDT_RESOURCE_L3CODE];
+ struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3];
+ int ret;
+
+ if (!r_l3->capable || !r_l3data->capable || !r_l3code->capable)
+ return -EINVAL;
+
+ ret = set_l3_qos_cfg(r_l3, true);
+ if (!ret) {
+ r_l3->enabled = false;
+ r_l3data->enabled = true;
+ r_l3code->enabled = true;
+ }
+ return ret;
+}
+
+static void cdp_disable(void)
+{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
+
+ r->enabled = r->capable;
+
+ if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) {
+ rdt_resources_all[RDT_RESOURCE_L3DATA].enabled = false;
+ rdt_resources_all[RDT_RESOURCE_L3CODE].enabled = false;
+ set_l3_qos_cfg(r, false);
+ }
+}
+
+static int parse_rdtgroupfs_options(char *data)
+{
+ char *token, *o = data;
+ int ret = 0;
+
+ while ((token = strsep(&o, ",")) != NULL) {
+ if (!*token)
+ return -EINVAL;
+
+ if (!strcmp(token, "cdp"))
+ ret = cdp_enable();
+ }
+
+ return ret;
+}
+
+/*
+ * We don't allow rdtgroup directories to be created anywhere
+ * except the root directory. Thus when looking for the rdtgroup
+ * structure for a kernfs node we are either looking at a directory,
+ * in which case the rdtgroup structure is pointed at by the "priv"
+ * field, otherwise we have a file, and need only look to the parent
+ * to find the rdtgroup.
+ */
+static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn)
+{
+ if (kernfs_type(kn) == KERNFS_DIR) {
+ /*
+ * All the resource directories use "kn->priv"
+ * to point to the "struct rdtgroup" for the
+ * resource. "info" and its subdirectories don't
+ * have rdtgroup structures, so return NULL here.
+ */
+ if (kn == kn_info || kn->parent == kn_info)
+ return NULL;
+ else
+ return kn->priv;
+ } else {
+ return kn->parent->priv;
+ }
+}
+
+struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn)
+{
+ struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
+
+ if (!rdtgrp)
+ return NULL;
+
+ atomic_inc(&rdtgrp->waitcount);
+ kernfs_break_active_protection(kn);
+
+ mutex_lock(&rdtgroup_mutex);
+
+ /* Was this group deleted while we waited? */
+ if (rdtgrp->flags & RDT_DELETED)
+ return NULL;
+
+ return rdtgrp;
+}
+
+void rdtgroup_kn_unlock(struct kernfs_node *kn)
+{
+ struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn);
+
+ if (!rdtgrp)
+ return;
+
+ mutex_unlock(&rdtgroup_mutex);
+
+ if (atomic_dec_and_test(&rdtgrp->waitcount) &&
+ (rdtgrp->flags & RDT_DELETED)) {
+ kernfs_unbreak_active_protection(kn);
+ kernfs_put(kn);
+ kfree(rdtgrp);
+ } else {
+ kernfs_unbreak_active_protection(kn);
+ }
+}
+
+static struct dentry *rdt_mount(struct file_system_type *fs_type,
+ int flags, const char *unused_dev_name,
+ void *data)
+{
+ struct dentry *dentry;
+ int ret;
+
+ mutex_lock(&rdtgroup_mutex);
+ /*
+ * resctrl file system can only be mounted once.
+ */
+ if (static_branch_unlikely(&rdt_enable_key)) {
+ dentry = ERR_PTR(-EBUSY);
+ goto out;
+ }
+
+ ret = parse_rdtgroupfs_options(data);
+ if (ret) {
+ dentry = ERR_PTR(ret);
+ goto out_cdp;
+ }
+
+ closid_init();
+
+ ret = rdtgroup_create_info_dir(rdtgroup_default.kn);
+ if (ret) {
+ dentry = ERR_PTR(ret);
+ goto out_cdp;
+ }
+
+ dentry = kernfs_mount(fs_type, flags, rdt_root,
+ RDTGROUP_SUPER_MAGIC, NULL);
+ if (IS_ERR(dentry))
+ goto out_cdp;
+
+ static_branch_enable(&rdt_enable_key);
+ goto out;
+
+out_cdp:
+ cdp_disable();
+out:
+ mutex_unlock(&rdtgroup_mutex);
+
+ return dentry;
+}
+
+static int reset_all_cbms(struct rdt_resource *r)
+{
+ struct msr_param msr_param;
+ cpumask_var_t cpu_mask;
+ struct rdt_domain *d;
+ int i, cpu;
+
+ if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ msr_param.res = r;
+ msr_param.low = 0;
+ msr_param.high = r->num_closid;
+
+ /*
+ * Disable resource control for this resource by setting all
+ * CBMs in all domains to the maximum mask value. Pick one CPU
+ * from each domain to update the MSRs below.
+ */
+ list_for_each_entry(d, &r->domains, list) {
+ cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
+
+ for (i = 0; i < r->num_closid; i++)
+ d->cbm[i] = r->max_cbm;
+ }
+ cpu = get_cpu();
+ /* Update CBM on this cpu if it's in cpu_mask. */
+ if (cpumask_test_cpu(cpu, cpu_mask))
+ rdt_cbm_update(&msr_param);
+ /* Update CBM on all other cpus in cpu_mask. */
+ smp_call_function_many(cpu_mask, rdt_cbm_update, &msr_param, 1);
+ put_cpu();
+
+ free_cpumask_var(cpu_mask);
+
+ return 0;
+}
+
+/*
+ * Move tasks from one to the other group. If @from is NULL, then all tasks
+ * in the systems are moved unconditionally (used for teardown).
+ *
+ * If @mask is not NULL the cpus on which moved tasks are running are set
+ * in that mask so the update smp function call is restricted to affected
+ * cpus.
+ */
+static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
+ struct cpumask *mask)
+{
+ struct task_struct *p, *t;
+
+ read_lock(&tasklist_lock);
+ for_each_process_thread(p, t) {
+ if (!from || t->closid == from->closid) {
+ t->closid = to->closid;
+#ifdef CONFIG_SMP
+ /*
+ * This is safe on x86 w/o barriers as the ordering
+ * of writing to task_cpu() and t->on_cpu is
+ * reverse to the reading here. The detection is
+ * inaccurate as tasks might move or schedule
+ * before the smp function call takes place. In
+ * such a case the function call is pointless, but
+ * there is no other side effect.
+ */
+ if (mask && t->on_cpu)
+ cpumask_set_cpu(task_cpu(t), mask);
+#endif
+ }
+ }
+ read_unlock(&tasklist_lock);
+}
+
+/*
+ * Forcibly remove all of subdirectories under root.
+ */
+static void rmdir_all_sub(void)
+{
+ struct rdtgroup *rdtgrp, *tmp;
+
+ /* Move all tasks to the default resource group */
+ rdt_move_group_tasks(NULL, &rdtgroup_default, NULL);
+
+ list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) {
+ /* Remove each rdtgroup other than root */
+ if (rdtgrp == &rdtgroup_default)
+ continue;
+
+ /*
+ * Give any CPUs back to the default group. We cannot copy
+ * cpu_online_mask because a CPU might have executed the
+ * offline callback already, but is still marked online.
+ */
+ cpumask_or(&rdtgroup_default.cpu_mask,
+ &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
+
+ kernfs_remove(rdtgrp->kn);
+ list_del(&rdtgrp->rdtgroup_list);
+ kfree(rdtgrp);
+ }
+ /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
+ get_online_cpus();
+ rdt_update_closid(cpu_online_mask, &rdtgroup_default.closid);
+ put_online_cpus();
+
+ kernfs_remove(kn_info);
+}
+
+static void rdt_kill_sb(struct super_block *sb)
+{
+ struct rdt_resource *r;
+
+ mutex_lock(&rdtgroup_mutex);
+
+ /*Put everything back to default values. */
+ for_each_enabled_rdt_resource(r)
+ reset_all_cbms(r);
+ cdp_disable();
+ rmdir_all_sub();
+ static_branch_disable(&rdt_enable_key);
+ kernfs_kill_sb(sb);
+ mutex_unlock(&rdtgroup_mutex);
+}
+
+static struct file_system_type rdt_fs_type = {
+ .name = "resctrl",
+ .mount = rdt_mount,
+ .kill_sb = rdt_kill_sb,
+};
+
+static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
+ umode_t mode)
+{
+ struct rdtgroup *parent, *rdtgrp;
+ struct kernfs_node *kn;
+ int ret, closid;
+
+ /* Only allow mkdir in the root directory */
+ if (parent_kn != rdtgroup_default.kn)
+ return -EPERM;
+
+ /* Do not accept '\n' to avoid unparsable situation. */
+ if (strchr(name, '\n'))
+ return -EINVAL;
+
+ parent = rdtgroup_kn_lock_live(parent_kn);
+ if (!parent) {
+ ret = -ENODEV;
+ goto out_unlock;
+ }
+
+ ret = closid_alloc();
+ if (ret < 0)
+ goto out_unlock;
+ closid = ret;
+
+ /* allocate the rdtgroup. */
+ rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL);
+ if (!rdtgrp) {
+ ret = -ENOSPC;
+ goto out_closid_free;
+ }
+ rdtgrp->closid = closid;
+ list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
+
+ /* kernfs creates the directory for rdtgrp */
+ kn = kernfs_create_dir(parent->kn, name, mode, rdtgrp);
+ if (IS_ERR(kn)) {
+ ret = PTR_ERR(kn);
+ goto out_cancel_ref;
+ }
+ rdtgrp->kn = kn;
+
+ /*
+ * kernfs_remove() will drop the reference count on "kn" which
+ * will free it. But we still need it to stick around for the
+ * rdtgroup_kn_unlock(kn} call below. Take one extra reference
+ * here, which will be dropped inside rdtgroup_kn_unlock().
+ */
+ kernfs_get(kn);
+
+ ret = rdtgroup_kn_set_ugid(kn);
+ if (ret)
+ goto out_destroy;
+
+ ret = rdtgroup_add_files(kn, rdtgroup_base_files,
+ ARRAY_SIZE(rdtgroup_base_files));
+ if (ret)
+ goto out_destroy;
+
+ kernfs_activate(kn);
+
+ ret = 0;
+ goto out_unlock;
+
+out_destroy:
+ kernfs_remove(rdtgrp->kn);
+out_cancel_ref:
+ list_del(&rdtgrp->rdtgroup_list);
+ kfree(rdtgrp);
+out_closid_free:
+ closid_free(closid);
+out_unlock:
+ rdtgroup_kn_unlock(parent_kn);
+ return ret;
+}
+
+static int rdtgroup_rmdir(struct kernfs_node *kn)
+{
+ int ret, cpu, closid = rdtgroup_default.closid;
+ struct rdtgroup *rdtgrp;
+ cpumask_var_t tmpmask;
+
+ if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ return -ENOMEM;
+
+ rdtgrp = rdtgroup_kn_lock_live(kn);
+ if (!rdtgrp) {
+ ret = -EPERM;
+ goto out;
+ }
+
+ /* Give any tasks back to the default group */
+ rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask);
+
+ /* Give any CPUs back to the default group */
+ cpumask_or(&rdtgroup_default.cpu_mask,
+ &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
+
+ /* Update per cpu closid of the moved CPUs first */
+ for_each_cpu(cpu, &rdtgrp->cpu_mask)
+ per_cpu(cpu_closid, cpu) = closid;
+ /*
+ * Update the MSR on moved CPUs and CPUs which have moved
+ * task running on them.
+ */
+ cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
+ rdt_update_closid(tmpmask, NULL);
+
+ rdtgrp->flags = RDT_DELETED;
+ closid_free(rdtgrp->closid);
+ list_del(&rdtgrp->rdtgroup_list);
+
+ /*
+ * one extra hold on this, will drop when we kfree(rdtgrp)
+ * in rdtgroup_kn_unlock()
+ */
+ kernfs_get(kn);
+ kernfs_remove(rdtgrp->kn);
+ ret = 0;
+out:
+ rdtgroup_kn_unlock(kn);
+ free_cpumask_var(tmpmask);
+ return ret;
+}
+
+static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
+{
+ if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled)
+ seq_puts(seq, ",cdp");
+ return 0;
+}
+
+static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = {
+ .mkdir = rdtgroup_mkdir,
+ .rmdir = rdtgroup_rmdir,
+ .show_options = rdtgroup_show_options,
+};
+
+static int __init rdtgroup_setup_root(void)
+{
+ int ret;
+
+ rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops,
+ KERNFS_ROOT_CREATE_DEACTIVATED,
+ &rdtgroup_default);
+ if (IS_ERR(rdt_root))
+ return PTR_ERR(rdt_root);
+
+ mutex_lock(&rdtgroup_mutex);
+
+ rdtgroup_default.closid = 0;
+ list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups);
+
+ ret = rdtgroup_add_files(rdt_root->kn, rdtgroup_base_files,
+ ARRAY_SIZE(rdtgroup_base_files));
+ if (ret) {
+ kernfs_destroy_root(rdt_root);
+ goto out;
+ }
+
+ rdtgroup_default.kn = rdt_root->kn;
+ kernfs_activate(rdtgroup_default.kn);
+
+out:
+ mutex_unlock(&rdtgroup_mutex);
+
+ return ret;
+}
+
+/*
+ * rdtgroup_init - rdtgroup initialization
+ *
+ * Setup resctrl file system including set up root, create mount point,
+ * register rdtgroup filesystem, and initialize files under root directory.
+ *
+ * Return: 0 on success or -errno
+ */
+int __init rdtgroup_init(void)
+{
+ int ret = 0;
+
+ ret = rdtgroup_setup_root();
+ if (ret)
+ return ret;
+
+ ret = sysfs_create_mount_point(fs_kobj, "resctrl");
+ if (ret)
+ goto cleanup_root;
+
+ ret = register_filesystem(&rdt_fs_type);
+ if (ret)
+ goto cleanup_mountpoint;
+
+ return 0;
+
+cleanup_mountpoint:
+ sysfs_remove_mount_point(fs_kobj, "resctrl");
+cleanup_root:
+ kernfs_destroy_root(rdt_root);
+
+ return ret;
+}
diff --git a/arch/x86/kernel/cpu/intel_rdt_schemata.c b/arch/x86/kernel/cpu/intel_rdt_schemata.c
new file mode 100644
index 000000000000..f369cb8db0d5
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt_schemata.c
@@ -0,0 +1,245 @@
+/*
+ * Resource Director Technology(RDT)
+ * - Cache Allocation code.
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Authors:
+ * Fenghua Yu <fenghua.yu@intel.com>
+ * Tony Luck <tony.luck@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * More information about RDT be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/kernfs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <asm/intel_rdt.h>
+
+/*
+ * Check whether a cache bit mask is valid. The SDM says:
+ * Please note that all (and only) contiguous '1' combinations
+ * are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.).
+ * Additionally Haswell requires at least two bits set.
+ */
+static bool cbm_validate(unsigned long var, struct rdt_resource *r)
+{
+ unsigned long first_bit, zero_bit;
+
+ if (var == 0 || var > r->max_cbm)
+ return false;
+
+ first_bit = find_first_bit(&var, r->cbm_len);
+ zero_bit = find_next_zero_bit(&var, r->cbm_len, first_bit);
+
+ if (find_next_bit(&var, r->cbm_len, zero_bit) < r->cbm_len)
+ return false;
+
+ if ((zero_bit - first_bit) < r->min_cbm_bits)
+ return false;
+ return true;
+}
+
+/*
+ * Read one cache bit mask (hex). Check that it is valid for the current
+ * resource type.
+ */
+static int parse_cbm(char *buf, struct rdt_resource *r)
+{
+ unsigned long data;
+ int ret;
+
+ ret = kstrtoul(buf, 16, &data);
+ if (ret)
+ return ret;
+ if (!cbm_validate(data, r))
+ return -EINVAL;
+ r->tmp_cbms[r->num_tmp_cbms++] = data;
+
+ return 0;
+}
+
+/*
+ * For each domain in this resource we expect to find a series of:
+ * id=mask
+ * separated by ";". The "id" is in decimal, and must appear in the
+ * right order.
+ */
+static int parse_line(char *line, struct rdt_resource *r)
+{
+ char *dom = NULL, *id;
+ struct rdt_domain *d;
+ unsigned long dom_id;
+
+ list_for_each_entry(d, &r->domains, list) {
+ dom = strsep(&line, ";");
+ if (!dom)
+ return -EINVAL;
+ id = strsep(&dom, "=");
+ if (kstrtoul(id, 10, &dom_id) || dom_id != d->id)
+ return -EINVAL;
+ if (parse_cbm(dom, r))
+ return -EINVAL;
+ }
+
+ /* Any garbage at the end of the line? */
+ if (line && line[0])
+ return -EINVAL;
+ return 0;
+}
+
+static int update_domains(struct rdt_resource *r, int closid)
+{
+ struct msr_param msr_param;
+ cpumask_var_t cpu_mask;
+ struct rdt_domain *d;
+ int cpu, idx = 0;
+
+ if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ msr_param.low = closid;
+ msr_param.high = msr_param.low + 1;
+ msr_param.res = r;
+
+ list_for_each_entry(d, &r->domains, list) {
+ cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask);
+ d->cbm[msr_param.low] = r->tmp_cbms[idx++];
+ }
+ cpu = get_cpu();
+ /* Update CBM on this cpu if it's in cpu_mask. */
+ if (cpumask_test_cpu(cpu, cpu_mask))
+ rdt_cbm_update(&msr_param);
+ /* Update CBM on other cpus. */
+ smp_call_function_many(cpu_mask, rdt_cbm_update, &msr_param, 1);
+ put_cpu();
+
+ free_cpumask_var(cpu_mask);
+
+ return 0;
+}
+
+ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct rdtgroup *rdtgrp;
+ struct rdt_resource *r;
+ char *tok, *resname;
+ int closid, ret = 0;
+ u32 *l3_cbms = NULL;
+
+ /* Valid input requires a trailing newline */
+ if (nbytes == 0 || buf[nbytes - 1] != '\n')
+ return -EINVAL;
+ buf[nbytes - 1] = '\0';
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (!rdtgrp) {
+ rdtgroup_kn_unlock(of->kn);
+ return -ENOENT;
+ }
+
+ closid = rdtgrp->closid;
+
+ /* get scratch space to save all the masks while we validate input */
+ for_each_enabled_rdt_resource(r) {
+ r->tmp_cbms = kcalloc(r->num_domains, sizeof(*l3_cbms),
+ GFP_KERNEL);
+ if (!r->tmp_cbms) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ r->num_tmp_cbms = 0;
+ }
+
+ while ((tok = strsep(&buf, "\n")) != NULL) {
+ resname = strsep(&tok, ":");
+ if (!tok) {
+ ret = -EINVAL;
+ goto out;
+ }
+ for_each_enabled_rdt_resource(r) {
+ if (!strcmp(resname, r->name) &&
+ closid < r->num_closid) {
+ ret = parse_line(tok, r);
+ if (ret)
+ goto out;
+ break;
+ }
+ }
+ if (!r->name) {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ /* Did the parser find all the masks we need? */
+ for_each_enabled_rdt_resource(r) {
+ if (r->num_tmp_cbms != r->num_domains) {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ for_each_enabled_rdt_resource(r) {
+ ret = update_domains(r, closid);
+ if (ret)
+ goto out;
+ }
+
+out:
+ rdtgroup_kn_unlock(of->kn);
+ for_each_enabled_rdt_resource(r) {
+ kfree(r->tmp_cbms);
+ r->tmp_cbms = NULL;
+ }
+ return ret ?: nbytes;
+}
+
+static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid)
+{
+ struct rdt_domain *dom;
+ bool sep = false;
+
+ seq_printf(s, "%s:", r->name);
+ list_for_each_entry(dom, &r->domains, list) {
+ if (sep)
+ seq_puts(s, ";");
+ seq_printf(s, "%d=%x", dom->id, dom->cbm[closid]);
+ sep = true;
+ }
+ seq_puts(s, "\n");
+}
+
+int rdtgroup_schemata_show(struct kernfs_open_file *of,
+ struct seq_file *s, void *v)
+{
+ struct rdtgroup *rdtgrp;
+ struct rdt_resource *r;
+ int closid, ret = 0;
+
+ rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (rdtgrp) {
+ closid = rdtgrp->closid;
+ for_each_enabled_rdt_resource(r) {
+ if (closid < r->num_closid)
+ show_doms(s, r, closid);
+ }
+ } else {
+ ret = -ENOENT;
+ }
+ rdtgroup_kn_unlock(of->kn);
+ return ret;
+}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 631356c8cca4..c7efbcfbeda6 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -311,7 +311,7 @@ static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_e
*msg = s->msg;
s->covered = 1;
if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) {
- if (panic_on_oops || tolerant < 1)
+ if (tolerant < 1)
return MCE_PANIC_SEVERITY;
}
return s->sev;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index a7fdf453d895..00ef43233e03 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -43,6 +43,7 @@
#include <linux/export.h>
#include <linux/jump_label.h>
+#include <asm/intel-family.h>
#include <asm/processor.h>
#include <asm/traps.h>
#include <asm/tlbflush.h>
@@ -135,6 +136,9 @@ void mce_setup(struct mce *m)
m->socketid = cpu_data(m->extcpu).phys_proc_id;
m->apicid = cpu_data(m->extcpu).initial_apicid;
rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
+
+ if (this_cpu_has(X86_FEATURE_INTEL_PPIN))
+ rdmsrl(MSR_PPIN, m->ppin);
}
DEFINE_PER_CPU(struct mce, injectm);
@@ -207,8 +211,12 @@ EXPORT_SYMBOL_GPL(mce_inject_log);
static struct notifier_block mce_srao_nb;
+static atomic_t num_notifiers;
+
void mce_register_decode_chain(struct notifier_block *nb)
{
+ atomic_inc(&num_notifiers);
+
/* Ensure SRAO notifier has the highest priority in the decode chain. */
if (nb != &mce_srao_nb && nb->priority == INT_MAX)
nb->priority -= 1;
@@ -219,6 +227,8 @@ EXPORT_SYMBOL_GPL(mce_register_decode_chain);
void mce_unregister_decode_chain(struct notifier_block *nb)
{
+ atomic_dec(&num_notifiers);
+
atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
}
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
@@ -270,17 +280,17 @@ struct mca_msr_regs msr_ops = {
.misc = misc_reg
};
-static void print_mce(struct mce *m)
+static void __print_mce(struct mce *m)
{
- int ret = 0;
-
- pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
- m->extcpu, m->mcgstatus, m->bank, m->status);
+ pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n",
+ m->extcpu,
+ (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""),
+ m->mcgstatus, m->bank, m->status);
if (m->ip) {
pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
- m->cs, m->ip);
+ m->cs, m->ip);
if (m->cs == __KERNEL_CS)
print_symbol("{%s}", m->ip);
@@ -308,6 +318,13 @@ static void print_mce(struct mce *m)
pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
cpu_data(m->extcpu).microcode);
+}
+
+static void print_mce(struct mce *m)
+{
+ int ret = 0;
+
+ __print_mce(m);
/*
* Print out human-readable details about the MCE error,
@@ -499,7 +516,7 @@ int mce_available(struct cpuinfo_x86 *c)
static void mce_schedule_work(void)
{
- if (!mce_gen_pool_empty() && keventd_up())
+ if (!mce_gen_pool_empty())
schedule_work(&mce_work);
}
@@ -569,6 +586,32 @@ static struct notifier_block mce_srao_nb = {
.priority = INT_MAX,
};
+static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct mce *m = (struct mce *)data;
+
+ if (!m)
+ return NOTIFY_DONE;
+
+ /*
+ * Run the default notifier if we have only the SRAO
+ * notifier and us registered.
+ */
+ if (atomic_read(&num_notifiers) > 2)
+ return NOTIFY_DONE;
+
+ __print_mce(m);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block mce_default_nb = {
+ .notifier_call = mce_default_notifier,
+ /* lowest prio, we want it to run last. */
+ .priority = 0,
+};
+
/*
* Read ADDR and MISC registers.
*/
@@ -667,6 +710,15 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
mce_gather_info(&m, NULL);
+ /*
+ * m.tsc was set in mce_setup(). Clear it if not requested.
+ *
+ * FIXME: Propagate @flags to mce_gather_info/mce_setup() to avoid
+ * that dance.
+ */
+ if (!(flags & MCP_TIMESTAMP))
+ m.tsc = 0;
+
for (i = 0; i < mca_cfg.banks; i++) {
if (!mce_banks[i].ctl || !test_bit(i, *b))
continue;
@@ -674,14 +726,12 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
m.misc = 0;
m.addr = 0;
m.bank = i;
- m.tsc = 0;
barrier();
m.status = mce_rdmsrl(msr_ops.status(i));
if (!(m.status & MCI_STATUS_VAL))
continue;
-
/*
* Uncorrected or signalled events are handled by the exception
* handler when it is enabled, so don't process those here.
@@ -696,9 +746,6 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
mce_read_aux(&m, i);
- if (!(flags & MCP_TIMESTAMP))
- m.tsc = 0;
-
severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m))
@@ -1355,7 +1402,7 @@ static void mce_timer_fn(unsigned long data)
iv = __this_cpu_read(mce_next_interval);
if (mce_available(this_cpu_ptr(&cpu_info))) {
- machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks));
+ machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
if (mce_intel_cmci_poll()) {
iv = mce_adjust_timer(iv);
@@ -1745,6 +1792,14 @@ static void mce_start_timer(unsigned int cpu, struct timer_list *t)
add_timer_on(t, cpu);
}
+static void __mcheck_cpu_setup_timer(void)
+{
+ struct timer_list *t = this_cpu_ptr(&mce_timer);
+ unsigned int cpu = smp_processor_id();
+
+ setup_pinned_timer(t, mce_timer_fn, cpu);
+}
+
static void __mcheck_cpu_init_timer(void)
{
struct timer_list *t = this_cpu_ptr(&mce_timer);
@@ -1796,7 +1851,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
__mcheck_cpu_init_generic();
__mcheck_cpu_init_vendor(c);
__mcheck_cpu_init_clear_banks();
- __mcheck_cpu_init_timer();
+ __mcheck_cpu_setup_timer();
}
/*
@@ -2138,6 +2193,7 @@ int __init mcheck_init(void)
{
mcheck_intel_therm_init();
mce_register_decode_chain(&mce_srao_nb);
+ mce_register_decode_chain(&mce_default_nb);
mcheck_vendor_init_severity();
INIT_WORK(&mce_work, mce_process_work);
@@ -2255,8 +2311,6 @@ static struct bus_type mce_subsys = {
DEFINE_PER_CPU(struct device *, mce_device);
-void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
-
static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
{
return container_of(attr, struct mce_bank, attr);
@@ -2409,6 +2463,10 @@ static int mce_device_create(unsigned int cpu)
if (!mce_available(&boot_cpu_data))
return -EIO;
+ dev = per_cpu(mce_device, cpu);
+ if (dev)
+ return 0;
+
dev = kzalloc(sizeof *dev, GFP_KERNEL);
if (!dev)
return -ENOMEM;
@@ -2468,28 +2526,25 @@ static void mce_device_remove(unsigned int cpu)
}
/* Make sure there are no machine checks on offlined CPUs. */
-static void mce_disable_cpu(void *h)
+static void mce_disable_cpu(void)
{
- unsigned long action = *(unsigned long *)h;
-
if (!mce_available(raw_cpu_ptr(&cpu_info)))
return;
- if (!(action & CPU_TASKS_FROZEN))
+ if (!cpuhp_tasks_frozen)
cmci_clear();
vendor_disable_error_reporting();
}
-static void mce_reenable_cpu(void *h)
+static void mce_reenable_cpu(void)
{
- unsigned long action = *(unsigned long *)h;
int i;
if (!mce_available(raw_cpu_ptr(&cpu_info)))
return;
- if (!(action & CPU_TASKS_FROZEN))
+ if (!cpuhp_tasks_frozen)
cmci_reenable();
for (i = 0; i < mca_cfg.banks; i++) {
struct mce_bank *b = &mce_banks[i];
@@ -2499,45 +2554,43 @@ static void mce_reenable_cpu(void *h)
}
}
-/* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static int
-mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
+static int mce_cpu_dead(unsigned int cpu)
+{
+ mce_intel_hcpu_update(cpu);
+
+ /* intentionally ignoring frozen here */
+ if (!cpuhp_tasks_frozen)
+ cmci_rediscover();
+ return 0;
+}
+
+static int mce_cpu_online(unsigned int cpu)
{
- unsigned int cpu = (unsigned long)hcpu;
struct timer_list *t = &per_cpu(mce_timer, cpu);
+ int ret;
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_ONLINE:
- mce_device_create(cpu);
- if (threshold_cpu_callback)
- threshold_cpu_callback(action, cpu);
- break;
- case CPU_DEAD:
- if (threshold_cpu_callback)
- threshold_cpu_callback(action, cpu);
- mce_device_remove(cpu);
- mce_intel_hcpu_update(cpu);
+ mce_device_create(cpu);
- /* intentionally ignoring frozen here */
- if (!(action & CPU_TASKS_FROZEN))
- cmci_rediscover();
- break;
- case CPU_DOWN_PREPARE:
- smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
- del_timer_sync(t);
- break;
- case CPU_DOWN_FAILED:
- smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
- mce_start_timer(cpu, t);
- break;
+ ret = mce_threshold_create_device(cpu);
+ if (ret) {
+ mce_device_remove(cpu);
+ return ret;
}
-
- return NOTIFY_OK;
+ mce_reenable_cpu();
+ mce_start_timer(cpu, t);
+ return 0;
}
-static struct notifier_block mce_cpu_notifier = {
- .notifier_call = mce_cpu_callback,
-};
+static int mce_cpu_pre_down(unsigned int cpu)
+{
+ struct timer_list *t = &per_cpu(mce_timer, cpu);
+
+ mce_disable_cpu();
+ del_timer_sync(t);
+ mce_threshold_remove_device(cpu);
+ mce_device_remove(cpu);
+ return 0;
+}
static __init void mce_init_banks(void)
{
@@ -2559,8 +2612,8 @@ static __init void mce_init_banks(void)
static __init int mcheck_init_device(void)
{
+ enum cpuhp_state hp_online;
int err;
- int i = 0;
if (!mce_available(&boot_cpu_data)) {
err = -EIO;
@@ -2578,23 +2631,16 @@ static __init int mcheck_init_device(void)
if (err)
goto err_out_mem;
- cpu_notifier_register_begin();
- for_each_online_cpu(i) {
- err = mce_device_create(i);
- if (err) {
- /*
- * Register notifier anyway (and do not unreg it) so
- * that we don't leave undeleted timers, see notifier
- * callback above.
- */
- __register_hotcpu_notifier(&mce_cpu_notifier);
- cpu_notifier_register_done();
- goto err_device_create;
- }
- }
+ err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL,
+ mce_cpu_dead);
+ if (err)
+ goto err_out_mem;
- __register_hotcpu_notifier(&mce_cpu_notifier);
- cpu_notifier_register_done();
+ err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online",
+ mce_cpu_online, mce_cpu_pre_down);
+ if (err < 0)
+ goto err_out_online;
+ hp_online = err;
register_syscore_ops(&mce_syscore_ops);
@@ -2607,16 +2653,10 @@ static __init int mcheck_init_device(void)
err_register:
unregister_syscore_ops(&mce_syscore_ops);
+ cpuhp_remove_state(hp_online);
-err_device_create:
- /*
- * We didn't keep track of which devices were created above, but
- * even if we had, the set of online cpus might have changed.
- * Play safe and remove for every possible cpu, since
- * mce_device_remove() will do the right thing.
- */
- for_each_possible_cpu(i)
- mce_device_remove(i);
+err_out_online:
+ cpuhp_remove_state(CPUHP_X86_MCE_DEAD);
err_out_mem:
free_cpumask_var(mce_device_initialized);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 9b5403462936..ffacfdcacb85 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -24,7 +24,6 @@
#include <asm/amd_nb.h>
#include <asm/apic.h>
-#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/trace/irq_vectors.h>
@@ -55,6 +54,8 @@
/* Threshold LVT offset is at MSR0xC0000410[15:12] */
#define SMCA_THR_LVT_OFF 0xF000
+static bool thresholding_en;
+
static const char * const th_names[] = {
"load_store",
"insn_fetch",
@@ -69,7 +70,12 @@ static const char * const smca_umc_block_names[] = {
"misc_umc"
};
-struct smca_bank_name smca_bank_names[] = {
+struct smca_bank_name {
+ const char *name; /* Short name for sysfs */
+ const char *long_name; /* Long name for pretty-printing */
+};
+
+static struct smca_bank_name smca_names[] = {
[SMCA_LS] = { "load_store", "Load Store Unit" },
[SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" },
[SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" },
@@ -84,9 +90,25 @@ struct smca_bank_name smca_bank_names[] = {
[SMCA_PSP] = { "psp", "Platform Security Processor" },
[SMCA_SMU] = { "smu", "System Management Unit" },
};
-EXPORT_SYMBOL_GPL(smca_bank_names);
-static struct smca_hwid_mcatype smca_hwid_mcatypes[] = {
+const char *smca_get_name(enum smca_bank_types t)
+{
+ if (t >= N_SMCA_BANK_TYPES)
+ return NULL;
+
+ return smca_names[t].name;
+}
+
+const char *smca_get_long_name(enum smca_bank_types t)
+{
+ if (t >= N_SMCA_BANK_TYPES)
+ return NULL;
+
+ return smca_names[t].long_name;
+}
+EXPORT_SYMBOL_GPL(smca_get_long_name);
+
+static struct smca_hwid smca_hwid_mcatypes[] = {
/* { bank_type, hwid_mcatype, xec_bitmap } */
/* ZN Core (HWID=0xB0) MCA types */
@@ -116,7 +138,7 @@ static struct smca_hwid_mcatype smca_hwid_mcatypes[] = {
{ SMCA_SMU, HWID_MCATYPE(0x01, 0x0), 0x1 },
};
-struct smca_bank_info smca_banks[MAX_NR_BANKS];
+struct smca_bank smca_banks[MAX_NR_BANKS];
EXPORT_SYMBOL_GPL(smca_banks);
/*
@@ -142,35 +164,34 @@ static void default_deferred_error_interrupt(void)
}
void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;
-/*
- * CPU Initialization
- */
-
static void get_smca_bank_info(unsigned int bank)
{
unsigned int i, hwid_mcatype, cpu = smp_processor_id();
- struct smca_hwid_mcatype *type;
- u32 high, instanceId;
- u16 hwid, mcatype;
+ struct smca_hwid *s_hwid;
+ u32 high, instance_id;
/* Collect bank_info using CPU 0 for now. */
if (cpu)
return;
- if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instanceId, &high)) {
+ if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instance_id, &high)) {
pr_warn("Failed to read MCA_IPID for bank %d\n", bank);
return;
}
- hwid = high & MCI_IPID_HWID;
- mcatype = (high & MCI_IPID_MCATYPE) >> 16;
- hwid_mcatype = HWID_MCATYPE(hwid, mcatype);
+ hwid_mcatype = HWID_MCATYPE(high & MCI_IPID_HWID,
+ (high & MCI_IPID_MCATYPE) >> 16);
for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) {
- type = &smca_hwid_mcatypes[i];
- if (hwid_mcatype == type->hwid_mcatype) {
- smca_banks[bank].type = type;
- smca_banks[bank].type_instance = instanceId;
+ s_hwid = &smca_hwid_mcatypes[i];
+ if (hwid_mcatype == s_hwid->hwid_mcatype) {
+
+ WARN(smca_banks[bank].hwid,
+ "Bank %s already initialized!\n",
+ smca_get_name(s_hwid->bank_type));
+
+ smca_banks[bank].hwid = s_hwid;
+ smca_banks[bank].id = instance_id;
break;
}
}
@@ -533,6 +554,206 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
deferred_error_interrupt_enable(c);
}
+int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr)
+{
+ u64 dram_base_addr, dram_limit_addr, dram_hole_base;
+ /* We start from the normalized address */
+ u64 ret_addr = norm_addr;
+
+ u32 tmp;
+
+ u8 die_id_shift, die_id_mask, socket_id_shift, socket_id_mask;
+ u8 intlv_num_dies, intlv_num_chan, intlv_num_sockets;
+ u8 intlv_addr_sel, intlv_addr_bit;
+ u8 num_intlv_bits, hashed_bit;
+ u8 lgcy_mmio_hole_en, base = 0;
+ u8 cs_mask, cs_id = 0;
+ bool hash_enabled = false;
+
+ /* Read D18F0x1B4 (DramOffset), check if base 1 is used. */
+ if (amd_df_indirect_read(nid, 0, 0x1B4, umc, &tmp))
+ goto out_err;
+
+ /* Remove HiAddrOffset from normalized address, if enabled: */
+ if (tmp & BIT(0)) {
+ u64 hi_addr_offset = (tmp & GENMASK_ULL(31, 20)) << 8;
+
+ if (norm_addr >= hi_addr_offset) {
+ ret_addr -= hi_addr_offset;
+ base = 1;
+ }
+ }
+
+ /* Read D18F0x110 (DramBaseAddress). */
+ if (amd_df_indirect_read(nid, 0, 0x110 + (8 * base), umc, &tmp))
+ goto out_err;
+
+ /* Check if address range is valid. */
+ if (!(tmp & BIT(0))) {
+ pr_err("%s: Invalid DramBaseAddress range: 0x%x.\n",
+ __func__, tmp);
+ goto out_err;
+ }
+
+ lgcy_mmio_hole_en = tmp & BIT(1);
+ intlv_num_chan = (tmp >> 4) & 0xF;
+ intlv_addr_sel = (tmp >> 8) & 0x7;
+ dram_base_addr = (tmp & GENMASK_ULL(31, 12)) << 16;
+
+ /* {0, 1, 2, 3} map to address bits {8, 9, 10, 11} respectively */
+ if (intlv_addr_sel > 3) {
+ pr_err("%s: Invalid interleave address select %d.\n",
+ __func__, intlv_addr_sel);
+ goto out_err;
+ }
+
+ /* Read D18F0x114 (DramLimitAddress). */
+ if (amd_df_indirect_read(nid, 0, 0x114 + (8 * base), umc, &tmp))
+ goto out_err;
+
+ intlv_num_sockets = (tmp >> 8) & 0x1;
+ intlv_num_dies = (tmp >> 10) & 0x3;
+ dram_limit_addr = ((tmp & GENMASK_ULL(31, 12)) << 16) | GENMASK_ULL(27, 0);
+
+ intlv_addr_bit = intlv_addr_sel + 8;
+
+ /* Re-use intlv_num_chan by setting it equal to log2(#channels) */
+ switch (intlv_num_chan) {
+ case 0: intlv_num_chan = 0; break;
+ case 1: intlv_num_chan = 1; break;
+ case 3: intlv_num_chan = 2; break;
+ case 5: intlv_num_chan = 3; break;
+ case 7: intlv_num_chan = 4; break;
+
+ case 8: intlv_num_chan = 1;
+ hash_enabled = true;
+ break;
+ default:
+ pr_err("%s: Invalid number of interleaved channels %d.\n",
+ __func__, intlv_num_chan);
+ goto out_err;
+ }
+
+ num_intlv_bits = intlv_num_chan;
+
+ if (intlv_num_dies > 2) {
+ pr_err("%s: Invalid number of interleaved nodes/dies %d.\n",
+ __func__, intlv_num_dies);
+ goto out_err;
+ }
+
+ num_intlv_bits += intlv_num_dies;
+
+ /* Add a bit if sockets are interleaved. */
+ num_intlv_bits += intlv_num_sockets;
+
+ /* Assert num_intlv_bits <= 4 */
+ if (num_intlv_bits > 4) {
+ pr_err("%s: Invalid interleave bits %d.\n",
+ __func__, num_intlv_bits);
+ goto out_err;
+ }
+
+ if (num_intlv_bits > 0) {
+ u64 temp_addr_x, temp_addr_i, temp_addr_y;
+ u8 die_id_bit, sock_id_bit, cs_fabric_id;
+
+ /*
+ * Read FabricBlockInstanceInformation3_CS[BlockFabricID].
+ * This is the fabric id for this coherent slave. Use
+ * umc/channel# as instance id of the coherent slave
+ * for FICAA.
+ */
+ if (amd_df_indirect_read(nid, 0, 0x50, umc, &tmp))
+ goto out_err;
+
+ cs_fabric_id = (tmp >> 8) & 0xFF;
+ die_id_bit = 0;
+
+ /* If interleaved over more than 1 channel: */
+ if (intlv_num_chan) {
+ die_id_bit = intlv_num_chan;
+ cs_mask = (1 << die_id_bit) - 1;
+ cs_id = cs_fabric_id & cs_mask;
+ }
+
+ sock_id_bit = die_id_bit;
+
+ /* Read D18F1x208 (SystemFabricIdMask). */
+ if (intlv_num_dies || intlv_num_sockets)
+ if (amd_df_indirect_read(nid, 1, 0x208, umc, &tmp))
+ goto out_err;
+
+ /* If interleaved over more than 1 die. */
+ if (intlv_num_dies) {
+ sock_id_bit = die_id_bit + intlv_num_dies;
+ die_id_shift = (tmp >> 24) & 0xF;
+ die_id_mask = (tmp >> 8) & 0xFF;
+
+ cs_id |= ((cs_fabric_id & die_id_mask) >> die_id_shift) << die_id_bit;
+ }
+
+ /* If interleaved over more than 1 socket. */
+ if (intlv_num_sockets) {
+ socket_id_shift = (tmp >> 28) & 0xF;
+ socket_id_mask = (tmp >> 16) & 0xFF;
+
+ cs_id |= ((cs_fabric_id & socket_id_mask) >> socket_id_shift) << sock_id_bit;
+ }
+
+ /*
+ * The pre-interleaved address consists of XXXXXXIIIYYYYY
+ * where III is the ID for this CS, and XXXXXXYYYYY are the
+ * address bits from the post-interleaved address.
+ * "num_intlv_bits" has been calculated to tell us how many "I"
+ * bits there are. "intlv_addr_bit" tells us how many "Y" bits
+ * there are (where "I" starts).
+ */
+ temp_addr_y = ret_addr & GENMASK_ULL(intlv_addr_bit-1, 0);
+ temp_addr_i = (cs_id << intlv_addr_bit);
+ temp_addr_x = (ret_addr & GENMASK_ULL(63, intlv_addr_bit)) << num_intlv_bits;
+ ret_addr = temp_addr_x | temp_addr_i | temp_addr_y;
+ }
+
+ /* Add dram base address */
+ ret_addr += dram_base_addr;
+
+ /* If legacy MMIO hole enabled */
+ if (lgcy_mmio_hole_en) {
+ if (amd_df_indirect_read(nid, 0, 0x104, umc, &tmp))
+ goto out_err;
+
+ dram_hole_base = tmp & GENMASK(31, 24);
+ if (ret_addr >= dram_hole_base)
+ ret_addr += (BIT_ULL(32) - dram_hole_base);
+ }
+
+ if (hash_enabled) {
+ /* Save some parentheses and grab ls-bit at the end. */
+ hashed_bit = (ret_addr >> 12) ^
+ (ret_addr >> 18) ^
+ (ret_addr >> 21) ^
+ (ret_addr >> 30) ^
+ cs_id;
+
+ hashed_bit &= BIT(0);
+
+ if (hashed_bit != ((ret_addr >> intlv_addr_bit) & BIT(0)))
+ ret_addr ^= BIT(intlv_addr_bit);
+ }
+
+ /* Is calculated system address is above DRAM limit address? */
+ if (ret_addr > dram_limit_addr)
+ goto out_err;
+
+ *sys_addr = ret_addr;
+ return 0;
+
+out_err:
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr);
+
static void
__log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc)
{
@@ -645,6 +866,7 @@ static void amd_threshold_interrupt(void)
{
u32 low = 0, high = 0, address = 0;
unsigned int bank, block, cpu = smp_processor_id();
+ struct thresh_restart tr;
/* assume first bank caused it */
for (bank = 0; bank < mca_cfg.banks; ++bank) {
@@ -681,6 +903,11 @@ static void amd_threshold_interrupt(void)
log:
__log_error(bank, false, true, ((u64)high << 32) | low);
+
+ /* Reset threshold block after logging error. */
+ memset(&tr, 0, sizeof(tr));
+ tr.b = &per_cpu(threshold_banks, cpu)[bank]->blocks[block];
+ threshold_restart_bank(&tr);
}
/*
@@ -826,10 +1053,10 @@ static const char *get_name(unsigned int bank, struct threshold_block *b)
return th_names[bank];
}
- if (!smca_banks[bank].type)
+ if (!smca_banks[bank].hwid)
return NULL;
- bank_type = smca_banks[bank].type->bank_type;
+ bank_type = smca_banks[bank].hwid->bank_type;
if (b && bank_type == SMCA_UMC) {
if (b->block < ARRAY_SIZE(smca_umc_block_names))
@@ -838,8 +1065,8 @@ static const char *get_name(unsigned int bank, struct threshold_block *b)
}
snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN,
- "%s_%x", smca_bank_names[bank_type].name,
- smca_banks[bank].type_instance);
+ "%s_%x", smca_get_name(bank_type),
+ smca_banks[bank].id);
return buf_mcatype;
}
@@ -1010,31 +1237,6 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
return err;
}
-/* create dir/files for all valid threshold banks */
-static int threshold_create_device(unsigned int cpu)
-{
- unsigned int bank;
- struct threshold_bank **bp;
- int err = 0;
-
- bp = kzalloc(sizeof(struct threshold_bank *) * mca_cfg.banks,
- GFP_KERNEL);
- if (!bp)
- return -ENOMEM;
-
- per_cpu(threshold_banks, cpu) = bp;
-
- for (bank = 0; bank < mca_cfg.banks; ++bank) {
- if (!(per_cpu(bank_map, cpu) & (1 << bank)))
- continue;
- err = threshold_create_bank(cpu, bank);
- if (err)
- return err;
- }
-
- return err;
-}
-
static void deallocate_threshold_block(unsigned int cpu,
unsigned int bank)
{
@@ -1102,48 +1304,71 @@ free_out:
per_cpu(threshold_banks, cpu)[bank] = NULL;
}
-static void threshold_remove_device(unsigned int cpu)
+int mce_threshold_remove_device(unsigned int cpu)
{
unsigned int bank;
+ if (!thresholding_en)
+ return 0;
+
for (bank = 0; bank < mca_cfg.banks; ++bank) {
if (!(per_cpu(bank_map, cpu) & (1 << bank)))
continue;
threshold_remove_bank(cpu, bank);
}
kfree(per_cpu(threshold_banks, cpu));
+ per_cpu(threshold_banks, cpu) = NULL;
+ return 0;
}
-/* get notified when a cpu comes on/off */
-static void
-amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu)
+/* create dir/files for all valid threshold banks */
+int mce_threshold_create_device(unsigned int cpu)
{
- switch (action) {
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- threshold_create_device(cpu);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- threshold_remove_device(cpu);
- break;
- default:
- break;
+ unsigned int bank;
+ struct threshold_bank **bp;
+ int err = 0;
+
+ if (!thresholding_en)
+ return 0;
+
+ bp = per_cpu(threshold_banks, cpu);
+ if (bp)
+ return 0;
+
+ bp = kzalloc(sizeof(struct threshold_bank *) * mca_cfg.banks,
+ GFP_KERNEL);
+ if (!bp)
+ return -ENOMEM;
+
+ per_cpu(threshold_banks, cpu) = bp;
+
+ for (bank = 0; bank < mca_cfg.banks; ++bank) {
+ if (!(per_cpu(bank_map, cpu) & (1 << bank)))
+ continue;
+ err = threshold_create_bank(cpu, bank);
+ if (err)
+ goto err;
}
+ return err;
+err:
+ mce_threshold_remove_device(cpu);
+ return err;
}
static __init int threshold_init_device(void)
{
unsigned lcpu = 0;
+ if (mce_threshold_vector == amd_threshold_interrupt)
+ thresholding_en = true;
+
/* to hit CPUs online before the notifier is up */
for_each_online_cpu(lcpu) {
- int err = threshold_create_device(lcpu);
+ int err = mce_threshold_create_device(lcpu);
if (err)
return err;
}
- threshold_cpu_callback = amd_64_threshold_cpu_callback;
return 0;
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 1defb8ea882c..190b3e6cef4d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -11,6 +11,8 @@
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <asm/apic.h>
+#include <asm/cpufeature.h>
+#include <asm/intel-family.h>
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/mce.h>
@@ -130,7 +132,7 @@ bool mce_intel_cmci_poll(void)
* Reset the counter if we've logged an error in the last poll
* during the storm.
*/
- if (machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)))
+ if (machine_check_poll(0, this_cpu_ptr(&mce_banks_owned)))
this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
else
this_cpu_dec(cmci_backoff_cnt);
@@ -342,7 +344,7 @@ void cmci_recheck(void)
return;
local_irq_save(flags);
- machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
+ machine_check_poll(0, this_cpu_ptr(&mce_banks_owned));
local_irq_restore(flags);
}
@@ -464,11 +466,46 @@ static void intel_clear_lmce(void)
wrmsrl(MSR_IA32_MCG_EXT_CTL, val);
}
+static void intel_ppin_init(struct cpuinfo_x86 *c)
+{
+ unsigned long long val;
+
+ /*
+ * Even if testing the presence of the MSR would be enough, we don't
+ * want to risk the situation where other models reuse this MSR for
+ * other purposes.
+ */
+ switch (c->x86_model) {
+ case INTEL_FAM6_IVYBRIDGE_X:
+ case INTEL_FAM6_HASWELL_X:
+ case INTEL_FAM6_BROADWELL_XEON_D:
+ case INTEL_FAM6_BROADWELL_X:
+ case INTEL_FAM6_SKYLAKE_X:
+ if (rdmsrl_safe(MSR_PPIN_CTL, &val))
+ return;
+
+ if ((val & 3UL) == 1UL) {
+ /* PPIN available but disabled: */
+ return;
+ }
+
+ /* If PPIN is disabled, but not locked, try to enable: */
+ if (!(val & 3UL)) {
+ wrmsrl_safe(MSR_PPIN_CTL, val | 2UL);
+ rdmsrl_safe(MSR_PPIN_CTL, &val);
+ }
+
+ if ((val & 3UL) == 2UL)
+ set_cpu_cap(c, X86_FEATURE_INTEL_PPIN);
+ }
+}
+
void mce_intel_feature_init(struct cpuinfo_x86 *c)
{
intel_init_thermal(c);
intel_init_cmci();
intel_init_lmce();
+ intel_ppin_init(c);
}
void mce_intel_feature_clear(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 6b9dc4d18ccc..465aca8be009 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -26,7 +26,6 @@
#include <asm/processor.h>
#include <asm/apic.h>
-#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/trace/irq_vectors.h>
@@ -271,58 +270,32 @@ static void thermal_throttle_remove_dev(struct device *dev)
}
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
-static int
-thermal_throttle_cpu_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+static int thermal_throttle_online(unsigned int cpu)
{
- unsigned int cpu = (unsigned long)hcpu;
- struct device *dev;
- int err = 0;
-
- dev = get_cpu_device(cpu);
-
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- err = thermal_throttle_add_dev(dev, cpu);
- WARN_ON(err);
- break;
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- thermal_throttle_remove_dev(dev);
- break;
- }
- return notifier_from_errno(err);
+ struct device *dev = get_cpu_device(cpu);
+
+ return thermal_throttle_add_dev(dev, cpu);
}
-static struct notifier_block thermal_throttle_cpu_notifier =
+static int thermal_throttle_offline(unsigned int cpu)
{
- .notifier_call = thermal_throttle_cpu_callback,
-};
+ struct device *dev = get_cpu_device(cpu);
+
+ thermal_throttle_remove_dev(dev);
+ return 0;
+}
static __init int thermal_throttle_init_device(void)
{
- unsigned int cpu = 0;
- int err;
+ int ret;
if (!atomic_read(&therm_throt_en))
return 0;
- cpu_notifier_register_begin();
-
- /* connect live CPUs to sysfs */
- for_each_online_cpu(cpu) {
- err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu);
- WARN_ON(err);
- }
-
- __register_hotcpu_notifier(&thermal_throttle_cpu_notifier);
- cpu_notifier_register_done();
-
- return 0;
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/therm:online",
+ thermal_throttle_online,
+ thermal_throttle_offline);
+ return ret < 0 ? ret : 0;
}
device_initcall(thermal_throttle_init_device);
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index fcf9ae9384f4..9beb092d68a5 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -6,7 +6,6 @@
#include <asm/irq_vectors.h>
#include <asm/apic.h>
-#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/trace/irq_vectors.h>
diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile
index 220b1a508513..ba12e8aa4a45 100644
--- a/arch/x86/kernel/cpu/microcode/Makefile
+++ b/arch/x86/kernel/cpu/microcode/Makefile
@@ -1,4 +1,4 @@
microcode-y := core.o
obj-$(CONFIG_MICROCODE) += microcode.o
-microcode-$(CONFIG_MICROCODE_INTEL) += intel.o intel_lib.o
+microcode-$(CONFIG_MICROCODE_INTEL) += intel.o
microcode-$(CONFIG_MICROCODE_AMD) += amd.o
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 017bda12caae..6a31e2691f3a 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -5,6 +5,7 @@
* CPUs and later.
*
* Copyright (C) 2008-2011 Advanced Micro Devices Inc.
+ * 2013-2016 Borislav Petkov <bp@alien8.de>
*
* Author: Peter Oruba <peter.oruba@amd.com>
*
@@ -39,64 +40,25 @@
static struct equiv_cpu_entry *equiv_cpu_table;
-struct ucode_patch {
- struct list_head plist;
- void *data;
- u32 patch_id;
- u16 equiv_cpu;
-};
-
-static LIST_HEAD(pcache);
-
/*
* This points to the current valid container of microcode patches which we will
- * save from the initrd before jettisoning its contents.
+ * save from the initrd/builtin before jettisoning its contents.
*/
-static u8 *container;
-static size_t container_size;
-static bool ucode_builtin;
+struct container {
+ u8 *data;
+ size_t size;
+} cont;
static u32 ucode_new_rev;
static u8 amd_ucode_patch[PATCH_MAX_SIZE];
static u16 this_equiv_id;
-static struct cpio_data ucode_cpio;
-
-static struct cpio_data __init find_ucode_in_initrd(void)
-{
-#ifdef CONFIG_BLK_DEV_INITRD
- char *path;
- void *start;
- size_t size;
-
- /*
- * Microcode patch container file is prepended to the initrd in cpio
- * format. See Documentation/x86/early-microcode.txt
- */
- static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin";
-
-#ifdef CONFIG_X86_32
- struct boot_params *p;
-
- /*
- * On 32-bit, early load occurs before paging is turned on so we need
- * to use physical addresses.
- */
- p = (struct boot_params *)__pa_nodebug(&boot_params);
- path = (char *)__pa_nodebug(ucode_path);
- start = (void *)p->hdr.ramdisk_image;
- size = p->hdr.ramdisk_size;
-#else
- path = ucode_path;
- start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET);
- size = boot_params.hdr.ramdisk_size;
-#endif /* !CONFIG_X86_32 */
-
- return find_cpio_data(path, start, size, NULL);
-#else
- return (struct cpio_data){ NULL, 0, "" };
-#endif
-}
+/*
+ * Microcode patch container file is prepended to the initrd in cpio
+ * format. See Documentation/x86/early-microcode.txt
+ */
+static const char
+ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin";
static size_t compute_container_size(u8 *data, u32 total_size)
{
@@ -135,48 +97,49 @@ static size_t compute_container_size(u8 *data, u32 total_size)
return size;
}
+static inline u16 find_equiv_id(struct equiv_cpu_entry *equiv_cpu_table,
+ unsigned int sig)
+{
+ int i = 0;
+
+ if (!equiv_cpu_table)
+ return 0;
+
+ while (equiv_cpu_table[i].installed_cpu != 0) {
+ if (sig == equiv_cpu_table[i].installed_cpu)
+ return equiv_cpu_table[i].equiv_cpu;
+
+ i++;
+ }
+ return 0;
+}
+
/*
- * Early load occurs before we can vmalloc(). So we look for the microcode
- * patch container file in initrd, traverse equivalent cpu table, look for a
- * matching microcode patch, and update, all in initrd memory in place.
- * When vmalloc() is available for use later -- on 64-bit during first AP load,
- * and on 32-bit during save_microcode_in_initrd_amd() -- we can call
- * load_microcode_amd() to save equivalent cpu table and microcode patches in
- * kernel heap memory.
+ * This scans the ucode blob for the proper container as we can have multiple
+ * containers glued together. Returns the equivalence ID from the equivalence
+ * table or 0 if none found.
*/
-static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch)
+static u16
+find_proper_container(u8 *ucode, size_t size, struct container *ret_cont)
{
+ struct container ret = { NULL, 0 };
+ u32 eax, ebx, ecx, edx;
struct equiv_cpu_entry *eq;
- size_t *cont_sz;
- u32 *header;
- u8 *data, **cont;
- u8 (*patch)[PATCH_MAX_SIZE];
- u16 eq_id = 0;
int offset, left;
- u32 rev, eax, ebx, ecx, edx;
- u32 *new_rev;
-
-#ifdef CONFIG_X86_32
- new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
- cont_sz = (size_t *)__pa_nodebug(&container_size);
- cont = (u8 **)__pa_nodebug(&container);
- patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch);
-#else
- new_rev = &ucode_new_rev;
- cont_sz = &container_size;
- cont = &container;
- patch = &amd_ucode_patch;
-#endif
+ u16 eq_id = 0;
+ u32 *header;
+ u8 *data;
data = ucode;
left = size;
header = (u32 *)data;
+
/* find equiv cpu table */
if (header[0] != UCODE_MAGIC ||
header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
header[2] == 0) /* size */
- return;
+ return eq_id;
eax = 0x00000001;
ecx = 0;
@@ -185,7 +148,7 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch)
while (left > 0) {
eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ);
- *cont = data;
+ ret.data = data;
/* Advance past the container header */
offset = header[2] + CONTAINER_HDR_SZ;
@@ -194,15 +157,16 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch)
eq_id = find_equiv_id(eq, eax);
if (eq_id) {
- this_equiv_id = eq_id;
- *cont_sz = compute_container_size(*cont, left + offset);
+ ret.size = compute_container_size(ret.data, left + offset);
/*
* truncate how much we need to iterate over in the
* ucode update loop below
*/
- left = *cont_sz - offset;
- break;
+ left = ret.size - offset;
+
+ *ret_cont = ret;
+ return eq_id;
}
/*
@@ -212,6 +176,7 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch)
*/
while (left > 0) {
header = (u32 *)data;
+
if (header[0] == UCODE_MAGIC &&
header[1] == UCODE_EQUIV_CPU_TABLE_TYPE)
break;
@@ -226,14 +191,65 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch)
ucode = data;
}
- if (!eq_id) {
- *cont = NULL;
- *cont_sz = 0;
- return;
- }
+ return eq_id;
+}
+
+static int __apply_microcode_amd(struct microcode_amd *mc_amd)
+{
+ u32 rev, dummy;
+
+ native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
+
+ /* verify patch application was successful */
+ native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+ if (rev != mc_amd->hdr.patch_id)
+ return -1;
+
+ return 0;
+}
+
+/*
+ * Early load occurs before we can vmalloc(). So we look for the microcode
+ * patch container file in initrd, traverse equivalent cpu table, look for a
+ * matching microcode patch, and update, all in initrd memory in place.
+ * When vmalloc() is available for use later -- on 64-bit during first AP load,
+ * and on 32-bit during save_microcode_in_initrd_amd() -- we can call
+ * load_microcode_amd() to save equivalent cpu table and microcode patches in
+ * kernel heap memory.
+ *
+ * Returns true if container found (sets @ret_cont), false otherwise.
+ */
+static bool apply_microcode_early_amd(void *ucode, size_t size, bool save_patch,
+ struct container *ret_cont)
+{
+ u8 (*patch)[PATCH_MAX_SIZE];
+ u32 rev, *header, *new_rev;
+ struct container ret;
+ int offset, left;
+ u16 eq_id = 0;
+ u8 *data;
+
+#ifdef CONFIG_X86_32
+ new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
+ patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch);
+#else
+ new_rev = &ucode_new_rev;
+ patch = &amd_ucode_patch;
+#endif
if (check_current_patch_level(&rev, true))
- return;
+ return false;
+
+ eq_id = find_proper_container(ucode, size, &ret);
+ if (!eq_id)
+ return false;
+
+ this_equiv_id = eq_id;
+ header = (u32 *)ret.data;
+
+ /* We're pointing to an equiv table, skip over it. */
+ data = ret.data + header[2] + CONTAINER_HDR_SZ;
+ left = ret.size - (header[2] + CONTAINER_HDR_SZ);
while (left > 0) {
struct microcode_amd *mc;
@@ -252,8 +268,7 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch)
*new_rev = rev;
if (save_patch)
- memcpy(patch, mc,
- min_t(u32, header[1], PATCH_MAX_SIZE));
+ memcpy(patch, mc, min_t(u32, header[1], PATCH_MAX_SIZE));
}
}
@@ -261,10 +276,14 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch)
data += offset;
left -= offset;
}
+
+ if (ret_cont)
+ *ret_cont = ret;
+
+ return true;
}
-static bool __init load_builtin_amd_microcode(struct cpio_data *cp,
- unsigned int family)
+static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family)
{
#ifdef CONFIG_X86_64
char fw_name[36] = "amd-ucode/microcode_amd.bin";
@@ -281,47 +300,49 @@ static bool __init load_builtin_amd_microcode(struct cpio_data *cp,
void __init load_ucode_amd_bsp(unsigned int family)
{
+ struct ucode_cpu_info *uci;
+ u32 eax, ebx, ecx, edx;
struct cpio_data cp;
- bool *builtin;
- void **data;
- size_t *size;
+ const char *path;
+ bool use_pa;
-#ifdef CONFIG_X86_32
- data = (void **)__pa_nodebug(&ucode_cpio.data);
- size = (size_t *)__pa_nodebug(&ucode_cpio.size);
- builtin = (bool *)__pa_nodebug(&ucode_builtin);
-#else
- data = &ucode_cpio.data;
- size = &ucode_cpio.size;
- builtin = &ucode_builtin;
-#endif
+ if (IS_ENABLED(CONFIG_X86_32)) {
+ uci = (struct ucode_cpu_info *)__pa_nodebug(ucode_cpu_info);
+ path = (const char *)__pa_nodebug(ucode_path);
+ use_pa = true;
+ } else {
+ uci = ucode_cpu_info;
+ path = ucode_path;
+ use_pa = false;
+ }
- *builtin = load_builtin_amd_microcode(&cp, family);
- if (!*builtin)
- cp = find_ucode_in_initrd();
+ if (!get_builtin_microcode(&cp, family))
+ cp = find_microcode_in_initrd(path, use_pa);
if (!(cp.data && cp.size))
return;
- *data = cp.data;
- *size = cp.size;
+ /* Get BSP's CPUID.EAX(1), needed in load_microcode_amd() */
+ eax = 1;
+ ecx = 0;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
+ uci->cpu_sig.sig = eax;
- apply_ucode_in_initrd(cp.data, cp.size, true);
+ apply_microcode_early_amd(cp.data, cp.size, true, NULL);
}
#ifdef CONFIG_X86_32
/*
* On 32-bit, since AP's early load occurs before paging is turned on, we
- * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during
- * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During
- * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch,
+ * cannot traverse cpu_equiv_table and microcode_cache in kernel heap memory.
+ * So during cold boot, AP will apply_ucode_in_initrd() just like the BSP.
+ * In save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch,
* which is used upon resume from suspend.
*/
-void load_ucode_amd_ap(void)
+void load_ucode_amd_ap(unsigned int family)
{
struct microcode_amd *mc;
- size_t *usize;
- void **ucode;
+ struct cpio_data cp;
mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch);
if (mc->hdr.patch_id && mc->hdr.processor_rev_id) {
@@ -329,60 +350,62 @@ void load_ucode_amd_ap(void)
return;
}
- ucode = (void *)__pa_nodebug(&container);
- usize = (size_t *)__pa_nodebug(&container_size);
+ if (!get_builtin_microcode(&cp, family))
+ cp = find_microcode_in_initrd((const char *)__pa_nodebug(ucode_path), true);
- if (!*ucode || !*usize)
+ if (!(cp.data && cp.size))
return;
- apply_ucode_in_initrd(*ucode, *usize, false);
-}
-
-static void __init collect_cpu_sig_on_bsp(void *arg)
-{
- unsigned int cpu = smp_processor_id();
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
- uci->cpu_sig.sig = cpuid_eax(0x00000001);
-}
-
-static void __init get_bsp_sig(void)
-{
- unsigned int bsp = boot_cpu_data.cpu_index;
- struct ucode_cpu_info *uci = ucode_cpu_info + bsp;
-
- if (!uci->cpu_sig.sig)
- smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1);
+ /*
+ * This would set amd_ucode_patch above so that the following APs can
+ * use it directly instead of going down this path again.
+ */
+ apply_microcode_early_amd(cp.data, cp.size, true, NULL);
}
#else
-void load_ucode_amd_ap(void)
+void load_ucode_amd_ap(unsigned int family)
{
- unsigned int cpu = smp_processor_id();
struct equiv_cpu_entry *eq;
struct microcode_amd *mc;
- u8 *cont = container;
u32 rev, eax;
u16 eq_id;
- /* Exit if called on the BSP. */
- if (!cpu)
+ /* 64-bit runs with paging enabled, thus early==false. */
+ if (check_current_patch_level(&rev, false))
return;
- if (!container)
- return;
+ /* First AP hasn't cached it yet, go through the blob. */
+ if (!cont.data) {
+ struct cpio_data cp = { NULL, 0, "" };
- /*
- * 64-bit runs with paging enabled, thus early==false.
- */
- if (check_current_patch_level(&rev, false))
- return;
+ if (cont.size == -1)
+ return;
- /* Add CONFIG_RANDOMIZE_MEMORY offset. */
- if (!ucode_builtin)
- cont += PAGE_OFFSET - __PAGE_OFFSET_BASE;
+reget:
+ if (!get_builtin_microcode(&cp, family)) {
+#ifdef CONFIG_BLK_DEV_INITRD
+ cp = find_cpio_data(ucode_path, (void *)initrd_start,
+ initrd_end - initrd_start, NULL);
+#endif
+ if (!(cp.data && cp.size)) {
+ /*
+ * Mark it so that other APs do not scan again
+ * for no real reason and slow down boot
+ * needlessly.
+ */
+ cont.size = -1;
+ return;
+ }
+ }
+
+ if (!apply_microcode_early_amd(cp.data, cp.size, false, &cont)) {
+ cont.size = -1;
+ return;
+ }
+ }
eax = cpuid_eax(0x00000001);
- eq = (struct equiv_cpu_entry *)(cont + CONTAINER_HDR_SZ);
+ eq = (struct equiv_cpu_entry *)(cont.data + CONTAINER_HDR_SZ);
eq_id = find_equiv_id(eq, eax);
if (!eq_id)
@@ -397,61 +420,50 @@ void load_ucode_amd_ap(void)
}
} else {
- if (!ucode_cpio.data)
- return;
/*
* AP has a different equivalence ID than BSP, looks like
* mixed-steppings silicon so go through the ucode blob anew.
*/
- apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size, false);
+ goto reget;
}
}
-#endif
+#endif /* CONFIG_X86_32 */
+
+static enum ucode_state
+load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size);
-int __init save_microcode_in_initrd_amd(void)
+int __init save_microcode_in_initrd_amd(unsigned int fam)
{
- unsigned long cont;
- int retval = 0;
enum ucode_state ret;
- u8 *cont_va;
- u32 eax;
+ int retval = 0;
+ u16 eq_id;
- if (!container)
- return -EINVAL;
+ if (!cont.data) {
+ if (IS_ENABLED(CONFIG_X86_32) && (cont.size != -1)) {
+ struct cpio_data cp = { NULL, 0, "" };
-#ifdef CONFIG_X86_32
- get_bsp_sig();
- cont = (unsigned long)container;
- cont_va = __va(container);
-#else
- /*
- * We need the physical address of the container for both bitness since
- * boot_params.hdr.ramdisk_image is a physical address.
- */
- cont = __pa_nodebug(container);
- cont_va = container;
+#ifdef CONFIG_BLK_DEV_INITRD
+ cp = find_cpio_data(ucode_path, (void *)initrd_start,
+ initrd_end - initrd_start, NULL);
#endif
- /*
- * Take into account the fact that the ramdisk might get relocated and
- * therefore we need to recompute the container's position in virtual
- * memory space.
- */
- if (relocated_ramdisk)
- container = (u8 *)(__va(relocated_ramdisk) +
- (cont - boot_params.hdr.ramdisk_image));
- else
- container = cont_va;
+ if (!(cp.data && cp.size)) {
+ cont.size = -1;
+ return -EINVAL;
+ }
- /* Add CONFIG_RANDOMIZE_MEMORY offset. */
- if (!ucode_builtin)
- container += PAGE_OFFSET - __PAGE_OFFSET_BASE;
+ eq_id = find_proper_container(cp.data, cp.size, &cont);
+ if (!eq_id) {
+ cont.size = -1;
+ return -EINVAL;
+ }
- eax = cpuid_eax(0x00000001);
- eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
+ } else
+ return -EINVAL;
+ }
- ret = load_microcode_amd(smp_processor_id(), eax, container, container_size);
+ ret = load_microcode_amd(smp_processor_id(), fam, cont.data, cont.size);
if (ret != UCODE_OK)
retval = -EINVAL;
@@ -459,8 +471,8 @@ int __init save_microcode_in_initrd_amd(void)
* This will be freed any msec now, stash patches for the current
* family and switch to patch cache for cpu hotplug, etc later.
*/
- container = NULL;
- container_size = 0;
+ cont.data = NULL;
+ cont.size = 0;
return retval;
}
@@ -478,8 +490,10 @@ void reload_ucode_amd(void)
return;
mc = (struct microcode_amd *)amd_ucode_patch;
+ if (!mc)
+ return;
- if (mc && rev < mc->hdr.patch_id) {
+ if (rev < mc->hdr.patch_id) {
if (!__apply_microcode_amd(mc)) {
ucode_new_rev = mc->hdr.patch_id;
pr_info("reload patch_level=0x%08x\n", ucode_new_rev);
@@ -513,7 +527,7 @@ static struct ucode_patch *cache_find_patch(u16 equiv_cpu)
{
struct ucode_patch *p;
- list_for_each_entry(p, &pcache, plist)
+ list_for_each_entry(p, &microcode_cache, plist)
if (p->equiv_cpu == equiv_cpu)
return p;
return NULL;
@@ -523,7 +537,7 @@ static void update_cache(struct ucode_patch *new_patch)
{
struct ucode_patch *p;
- list_for_each_entry(p, &pcache, plist) {
+ list_for_each_entry(p, &microcode_cache, plist) {
if (p->equiv_cpu == new_patch->equiv_cpu) {
if (p->patch_id >= new_patch->patch_id)
/* we already have the latest patch */
@@ -536,14 +550,14 @@ static void update_cache(struct ucode_patch *new_patch)
}
}
/* no patch found, add it */
- list_add_tail(&new_patch->plist, &pcache);
+ list_add_tail(&new_patch->plist, &microcode_cache);
}
static void free_cache(void)
{
struct ucode_patch *p, *tmp;
- list_for_each_entry_safe(p, tmp, &pcache, plist) {
+ list_for_each_entry_safe(p, tmp, &microcode_cache, plist) {
__list_del(p->plist.prev, p->plist.next);
kfree(p->data);
kfree(p);
@@ -663,21 +677,7 @@ bool check_current_patch_level(u32 *rev, bool early)
return ret;
}
-int __apply_microcode_amd(struct microcode_amd *mc_amd)
-{
- u32 rev, dummy;
-
- native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
-
- /* verify patch application was successful */
- native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
- if (rev != mc_amd->hdr.patch_id)
- return -1;
-
- return 0;
-}
-
-int apply_microcode_amd(int cpu)
+static int apply_microcode_amd(int cpu)
{
struct cpuinfo_x86 *c = &cpu_data(cpu);
struct microcode_amd *mc_amd;
@@ -860,7 +860,8 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data,
return UCODE_OK;
}
-enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size)
+static enum ucode_state
+load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size)
{
enum ucode_state ret;
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index 5ce5155f0695..2af69d27da62 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -3,7 +3,7 @@
*
* Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
* 2006 Shaohua Li <shaohua.li@intel.com>
- * 2013-2015 Borislav Petkov <bp@alien8.de>
+ * 2013-2016 Borislav Petkov <bp@alien8.de>
*
* X86 CPU microcode early update for Linux:
*
@@ -39,11 +39,14 @@
#include <asm/microcode.h>
#include <asm/processor.h>
#include <asm/cmdline.h>
+#include <asm/setup.h>
-#define MICROCODE_VERSION "2.01"
+#define DRIVER_VERSION "2.2"
static struct microcode_ops *microcode_ops;
-static bool dis_ucode_ldr;
+static bool dis_ucode_ldr = true;
+
+LIST_HEAD(microcode_cache);
/*
* Synchronization.
@@ -73,6 +76,7 @@ struct cpu_info_ctx {
static bool __init check_loader_disabled_bsp(void)
{
static const char *__dis_opt_str = "dis_ucode_ldr";
+ u32 a, b, c, d;
#ifdef CONFIG_X86_32
const char *cmdline = (const char *)__pa_nodebug(boot_command_line);
@@ -85,8 +89,23 @@ static bool __init check_loader_disabled_bsp(void)
bool *res = &dis_ucode_ldr;
#endif
- if (cmdline_find_option_bool(cmdline, option))
- *res = true;
+ if (!have_cpuid_p())
+ return *res;
+
+ a = 1;
+ c = 0;
+ native_cpuid(&a, &b, &c, &d);
+
+ /*
+ * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not
+ * completely accurate as xen pv guests don't see that CPUID bit set but
+ * that's good enough as they don't land on the BSP path anyway.
+ */
+ if (c & BIT(31))
+ return *res;
+
+ if (cmdline_find_option_bool(cmdline, option) <= 0)
+ *res = false;
return *res;
}
@@ -118,9 +137,6 @@ void __init load_ucode_bsp(void)
if (check_loader_disabled_bsp())
return;
- if (!have_cpuid_p())
- return;
-
vendor = x86_cpuid_vendor();
family = x86_cpuid_family();
@@ -154,9 +170,6 @@ void load_ucode_ap(void)
if (check_loader_disabled_ap())
return;
- if (!have_cpuid_p())
- return;
-
vendor = x86_cpuid_vendor();
family = x86_cpuid_family();
@@ -167,7 +180,7 @@ void load_ucode_ap(void)
break;
case X86_VENDOR_AMD:
if (family >= 0x10)
- load_ucode_amd_ap();
+ load_ucode_amd_ap(family);
break;
default:
break;
@@ -185,7 +198,7 @@ static int __init save_microcode_in_initrd(void)
break;
case X86_VENDOR_AMD:
if (c->x86 >= 0x10)
- return save_microcode_in_initrd_amd();
+ return save_microcode_in_initrd_amd(c->x86);
break;
default:
break;
@@ -194,6 +207,56 @@ static int __init save_microcode_in_initrd(void)
return -EINVAL;
}
+struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa)
+{
+#ifdef CONFIG_BLK_DEV_INITRD
+ unsigned long start = 0;
+ size_t size;
+
+#ifdef CONFIG_X86_32
+ struct boot_params *params;
+
+ if (use_pa)
+ params = (struct boot_params *)__pa_nodebug(&boot_params);
+ else
+ params = &boot_params;
+
+ size = params->hdr.ramdisk_size;
+
+ /*
+ * Set start only if we have an initrd image. We cannot use initrd_start
+ * because it is not set that early yet.
+ */
+ if (size)
+ start = params->hdr.ramdisk_image;
+
+# else /* CONFIG_X86_64 */
+ size = (unsigned long)boot_params.ext_ramdisk_size << 32;
+ size |= boot_params.hdr.ramdisk_size;
+
+ if (size) {
+ start = (unsigned long)boot_params.ext_ramdisk_image << 32;
+ start |= boot_params.hdr.ramdisk_image;
+
+ start += PAGE_OFFSET;
+ }
+# endif
+
+ /*
+ * Fixup the start address: after reserve_initrd() runs, initrd_start
+ * has the virtual address of the beginning of the initrd. It also
+ * possibly relocates the ramdisk. In either case, initrd_start contains
+ * the updated address so use that instead.
+ */
+ if (!use_pa && initrd_start)
+ start = initrd_start;
+
+ return find_cpio_data(path, (void *)start, size, NULL);
+#else /* !CONFIG_BLK_DEV_INITRD */
+ return (struct cpio_data){ NULL, 0, "" };
+#endif
+}
+
void reload_early_microcode(void)
{
int vendor, family;
@@ -453,16 +516,17 @@ static struct attribute_group mc_attr_group = {
static void microcode_fini_cpu(int cpu)
{
- microcode_ops->microcode_fini_cpu(cpu);
+ if (microcode_ops->microcode_fini_cpu)
+ microcode_ops->microcode_fini_cpu(cpu);
}
static enum ucode_state microcode_resume_cpu(int cpu)
{
- pr_debug("CPU%d updated upon resume\n", cpu);
-
if (apply_microcode_on_target(cpu))
return UCODE_ERROR;
+ pr_debug("CPU%d updated upon resume\n", cpu);
+
return UCODE_OK;
}
@@ -496,6 +560,9 @@ static enum ucode_state microcode_update_cpu(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+ /* Refresh CPU microcode revision after resume. */
+ collect_cpu_info(cpu);
+
if (uci->valid)
return microcode_resume_cpu(cpu);
@@ -579,12 +646,7 @@ static int mc_cpu_down_prep(unsigned int cpu)
/* Suspend is in progress, only remove the interface */
sysfs_remove_group(&dev->kobj, &mc_attr_group);
pr_debug("CPU%d removed\n", cpu);
- /*
- * When a CPU goes offline, don't free up or invalidate the copy of
- * the microcode in kernel memory, so that we can reuse it when the
- * CPU comes back online without unnecessarily requesting the userspace
- * for it again.
- */
+
return 0;
}
@@ -649,8 +711,7 @@ int __init microcode_init(void)
cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/microcode:online",
mc_cpu_online, mc_cpu_down_prep);
- pr_info("Microcode Update Driver: v" MICROCODE_VERSION
- " <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n");
+ pr_info("Microcode Update Driver: v%s.", DRIVER_VERSION);
return 0;
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index cdc0deab00c9..b624b54912e1 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -39,125 +39,83 @@
#include <asm/setup.h>
#include <asm/msr.h>
-/*
- * Temporary microcode blobs pointers storage. We note here during early load
- * the pointers to microcode blobs we've got from whatever storage (detached
- * initrd, builtin). Later on, we put those into final storage
- * mc_saved_data.mc_saved.
- *
- * Important: those are offsets from the beginning of initrd or absolute
- * addresses within the kernel image when built-in.
- */
-static unsigned long mc_tmp_ptrs[MAX_UCODE_COUNT];
-
-static struct mc_saved_data {
- unsigned int num_saved;
- struct microcode_intel **mc_saved;
-} mc_saved_data;
+static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin";
-/* Microcode blobs within the initrd. 0 if builtin. */
-static struct ucode_blobs {
- unsigned long start;
- bool valid;
-} blobs;
+/* Current microcode patch used in early patching */
+struct microcode_intel *intel_ucode_patch;
-/* Go through saved patches and find the one suitable for the current CPU. */
-static enum ucode_state
-find_microcode_patch(struct microcode_intel **saved,
- unsigned int num_saved, struct ucode_cpu_info *uci)
+static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1,
+ unsigned int s2, unsigned int p2)
{
- struct microcode_intel *ucode_ptr, *new_mc = NULL;
- struct microcode_header_intel *mc_hdr;
- int new_rev, ret, i;
-
- new_rev = uci->cpu_sig.rev;
-
- for (i = 0; i < num_saved; i++) {
- ucode_ptr = saved[i];
- mc_hdr = (struct microcode_header_intel *)ucode_ptr;
-
- ret = has_newer_microcode(ucode_ptr,
- uci->cpu_sig.sig,
- uci->cpu_sig.pf,
- new_rev);
- if (!ret)
- continue;
-
- new_rev = mc_hdr->rev;
- new_mc = ucode_ptr;
- }
+ if (s1 != s2)
+ return false;
- if (!new_mc)
- return UCODE_NFOUND;
+ /* Processor flags are either both 0 ... */
+ if (!p1 && !p2)
+ return true;
- uci->mc = (struct microcode_intel *)new_mc;
- return UCODE_OK;
+ /* ... or they intersect. */
+ return p1 & p2;
}
-static inline void
-copy_ptrs(struct microcode_intel **mc_saved, unsigned long *mc_ptrs,
- unsigned long off, int num_saved)
+/*
+ * Returns 1 if update has been found, 0 otherwise.
+ */
+static int find_matching_signature(void *mc, unsigned int csig, int cpf)
{
+ struct microcode_header_intel *mc_hdr = mc;
+ struct extended_sigtable *ext_hdr;
+ struct extended_signature *ext_sig;
int i;
- for (i = 0; i < num_saved; i++)
- mc_saved[i] = (struct microcode_intel *)(mc_ptrs[i] + off);
-}
-
-#ifdef CONFIG_X86_32
-static void
-microcode_phys(struct microcode_intel **mc_saved_tmp, struct mc_saved_data *mcs)
-{
- int i;
- struct microcode_intel ***mc_saved;
+ if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf))
+ return 1;
- mc_saved = (struct microcode_intel ***)__pa_nodebug(&mcs->mc_saved);
+ /* Look for ext. headers: */
+ if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE)
+ return 0;
- for (i = 0; i < mcs->num_saved; i++) {
- struct microcode_intel *p;
+ ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE;
+ ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE;
- p = *(struct microcode_intel **)__pa_nodebug(mcs->mc_saved + i);
- mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p);
+ for (i = 0; i < ext_hdr->count; i++) {
+ if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf))
+ return 1;
+ ext_sig++;
}
+ return 0;
}
-#endif
-static enum ucode_state
-load_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs,
- unsigned long offset, struct ucode_cpu_info *uci)
+/*
+ * Returns 1 if update has been found, 0 otherwise.
+ */
+static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev)
{
- struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
- unsigned int count = mcs->num_saved;
+ struct microcode_header_intel *mc_hdr = mc;
- if (!mcs->mc_saved) {
- copy_ptrs(mc_saved_tmp, mc_ptrs, offset, count);
+ if (mc_hdr->rev <= new_rev)
+ return 0;
- return find_microcode_patch(mc_saved_tmp, count, uci);
- } else {
-#ifdef CONFIG_X86_32
- microcode_phys(mc_saved_tmp, mcs);
- return find_microcode_patch(mc_saved_tmp, count, uci);
-#else
- return find_microcode_patch(mcs->mc_saved, count, uci);
-#endif
- }
+ return find_matching_signature(mc, csig, cpf);
}
/*
* Given CPU signature and a microcode patch, this function finds if the
* microcode patch has matching family and model with the CPU.
+ *
+ * %true - if there's a match
+ * %false - otherwise
*/
-static enum ucode_state
-matching_model_microcode(struct microcode_header_intel *mc_header,
- unsigned long sig)
+static bool microcode_matches(struct microcode_header_intel *mc_header,
+ unsigned long sig)
{
- unsigned int fam, model;
- unsigned int fam_ucode, model_ucode;
- struct extended_sigtable *ext_header;
unsigned long total_size = get_totalsize(mc_header);
unsigned long data_size = get_datasize(mc_header);
- int ext_sigcount, i;
+ struct extended_sigtable *ext_header;
+ unsigned int fam_ucode, model_ucode;
struct extended_signature *ext_sig;
+ unsigned int fam, model;
+ int ext_sigcount, i;
fam = x86_family(sig);
model = x86_model(sig);
@@ -166,11 +124,11 @@ matching_model_microcode(struct microcode_header_intel *mc_header,
model_ucode = x86_model(mc_header->sig);
if (fam == fam_ucode && model == model_ucode)
- return UCODE_OK;
+ return true;
/* Look for ext. headers: */
if (total_size <= data_size + MC_HEADER_SIZE)
- return UCODE_NFOUND;
+ return false;
ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE;
ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
@@ -181,192 +139,262 @@ matching_model_microcode(struct microcode_header_intel *mc_header,
model_ucode = x86_model(ext_sig->sig);
if (fam == fam_ucode && model == model_ucode)
- return UCODE_OK;
+ return true;
ext_sig++;
}
- return UCODE_NFOUND;
+ return false;
}
-static int
-save_microcode(struct mc_saved_data *mcs,
- struct microcode_intel **mc_saved_src,
- unsigned int num_saved)
+static struct ucode_patch *__alloc_microcode_buf(void *data, unsigned int size)
{
- int i, j;
- struct microcode_intel **saved_ptr;
- int ret;
+ struct ucode_patch *p;
- if (!num_saved)
- return -EINVAL;
+ p = kzalloc(size, GFP_KERNEL);
+ if (!p)
+ return ERR_PTR(-ENOMEM);
- /*
- * Copy new microcode data.
- */
- saved_ptr = kcalloc(num_saved, sizeof(struct microcode_intel *), GFP_KERNEL);
- if (!saved_ptr)
- return -ENOMEM;
-
- for (i = 0; i < num_saved; i++) {
- struct microcode_header_intel *mc_hdr;
- struct microcode_intel *mc;
- unsigned long size;
-
- if (!mc_saved_src[i]) {
- ret = -EINVAL;
- goto err;
- }
+ p->data = kmemdup(data, size, GFP_KERNEL);
+ if (!p->data) {
+ kfree(p);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return p;
+}
+
+static void save_microcode_patch(void *data, unsigned int size)
+{
+ struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
+ struct ucode_patch *iter, *tmp, *p;
+ bool prev_found = false;
+ unsigned int sig, pf;
+
+ mc_hdr = (struct microcode_header_intel *)data;
+
+ list_for_each_entry_safe(iter, tmp, &microcode_cache, plist) {
+ mc_saved_hdr = (struct microcode_header_intel *)iter->data;
+ sig = mc_saved_hdr->sig;
+ pf = mc_saved_hdr->pf;
+
+ if (find_matching_signature(data, sig, pf)) {
+ prev_found = true;
- mc = mc_saved_src[i];
- mc_hdr = &mc->hdr;
- size = get_totalsize(mc_hdr);
+ if (mc_hdr->rev <= mc_saved_hdr->rev)
+ continue;
- saved_ptr[i] = kmemdup(mc, size, GFP_KERNEL);
- if (!saved_ptr[i]) {
- ret = -ENOMEM;
- goto err;
+ p = __alloc_microcode_buf(data, size);
+ if (IS_ERR(p))
+ pr_err("Error allocating buffer %p\n", data);
+ else
+ list_replace(&iter->plist, &p->plist);
}
}
/*
- * Point to newly saved microcode.
+ * There weren't any previous patches found in the list cache; save the
+ * newly found.
*/
- mcs->mc_saved = saved_ptr;
- mcs->num_saved = num_saved;
-
- return 0;
-
-err:
- for (j = 0; j <= i; j++)
- kfree(saved_ptr[j]);
- kfree(saved_ptr);
-
- return ret;
+ if (!prev_found) {
+ p = __alloc_microcode_buf(data, size);
+ if (IS_ERR(p))
+ pr_err("Error allocating buffer for %p\n", data);
+ else
+ list_add_tail(&p->plist, &microcode_cache);
+ }
}
-/*
- * A microcode patch in ucode_ptr is saved into mc_saved
- * - if it has matching signature and newer revision compared to an existing
- * patch mc_saved.
- * - or if it is a newly discovered microcode patch.
- *
- * The microcode patch should have matching model with CPU.
- *
- * Returns: The updated number @num_saved of saved microcode patches.
- */
-static unsigned int _save_mc(struct microcode_intel **mc_saved,
- u8 *ucode_ptr, unsigned int num_saved)
+static int microcode_sanity_check(void *mc, int print_err)
{
- struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
- unsigned int sig, pf;
- int found = 0, i;
+ unsigned long total_size, data_size, ext_table_size;
+ struct microcode_header_intel *mc_header = mc;
+ struct extended_sigtable *ext_header = NULL;
+ u32 sum, orig_sum, ext_sigcount = 0, i;
+ struct extended_signature *ext_sig;
- mc_hdr = (struct microcode_header_intel *)ucode_ptr;
+ total_size = get_totalsize(mc_header);
+ data_size = get_datasize(mc_header);
- for (i = 0; i < num_saved; i++) {
- mc_saved_hdr = (struct microcode_header_intel *)mc_saved[i];
- sig = mc_saved_hdr->sig;
- pf = mc_saved_hdr->pf;
+ if (data_size + MC_HEADER_SIZE > total_size) {
+ if (print_err)
+ pr_err("Error: bad microcode data file size.\n");
+ return -EINVAL;
+ }
- if (!find_matching_signature(ucode_ptr, sig, pf))
- continue;
+ if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
+ if (print_err)
+ pr_err("Error: invalid/unknown microcode update format.\n");
+ return -EINVAL;
+ }
- found = 1;
+ ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
+ if (ext_table_size) {
+ u32 ext_table_sum = 0;
+ u32 *ext_tablep;
- if (mc_hdr->rev <= mc_saved_hdr->rev)
- continue;
+ if ((ext_table_size < EXT_HEADER_SIZE)
+ || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
+ if (print_err)
+ pr_err("Error: truncated extended signature table.\n");
+ return -EINVAL;
+ }
+
+ ext_header = mc + MC_HEADER_SIZE + data_size;
+ if (ext_table_size != exttable_size(ext_header)) {
+ if (print_err)
+ pr_err("Error: extended signature table size mismatch.\n");
+ return -EFAULT;
+ }
+
+ ext_sigcount = ext_header->count;
/*
- * Found an older ucode saved earlier. Replace it with
- * this newer one.
+ * Check extended table checksum: the sum of all dwords that
+ * comprise a valid table must be 0.
*/
- mc_saved[i] = (struct microcode_intel *)ucode_ptr;
- break;
+ ext_tablep = (u32 *)ext_header;
+
+ i = ext_table_size / sizeof(u32);
+ while (i--)
+ ext_table_sum += ext_tablep[i];
+
+ if (ext_table_sum) {
+ if (print_err)
+ pr_warn("Bad extended signature table checksum, aborting.\n");
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * Calculate the checksum of update data and header. The checksum of
+ * valid update data and header including the extended signature table
+ * must be 0.
+ */
+ orig_sum = 0;
+ i = (MC_HEADER_SIZE + data_size) / sizeof(u32);
+ while (i--)
+ orig_sum += ((u32 *)mc)[i];
+
+ if (orig_sum) {
+ if (print_err)
+ pr_err("Bad microcode data checksum, aborting.\n");
+ return -EINVAL;
}
- /* Newly detected microcode, save it to memory. */
- if (i >= num_saved && !found)
- mc_saved[num_saved++] = (struct microcode_intel *)ucode_ptr;
+ if (!ext_table_size)
+ return 0;
- return num_saved;
+ /*
+ * Check extended signature checksum: 0 => valid.
+ */
+ for (i = 0; i < ext_sigcount; i++) {
+ ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
+ EXT_SIGNATURE_SIZE * i;
+
+ sum = (mc_header->sig + mc_header->pf + mc_header->cksum) -
+ (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
+ if (sum) {
+ if (print_err)
+ pr_err("Bad extended signature checksum, aborting.\n");
+ return -EINVAL;
+ }
+ }
+ return 0;
}
/*
* Get microcode matching with BSP's model. Only CPUs with the same model as
* BSP can stay in the platform.
*/
-static enum ucode_state __init
-get_matching_model_microcode(unsigned long start, void *data, size_t size,
- struct mc_saved_data *mcs, unsigned long *mc_ptrs,
- struct ucode_cpu_info *uci)
+static struct microcode_intel *
+scan_microcode(void *data, size_t size, struct ucode_cpu_info *uci, bool save)
{
- struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
struct microcode_header_intel *mc_header;
- unsigned int num_saved = mcs->num_saved;
- enum ucode_state state = UCODE_OK;
- unsigned int leftover = size;
- u8 *ucode_ptr = data;
+ struct microcode_intel *patch = NULL;
unsigned int mc_size;
- int i;
-
- while (leftover && num_saved < ARRAY_SIZE(mc_saved_tmp)) {
- if (leftover < sizeof(mc_header))
+ while (size) {
+ if (size < sizeof(struct microcode_header_intel))
break;
- mc_header = (struct microcode_header_intel *)ucode_ptr;
+ mc_header = (struct microcode_header_intel *)data;
mc_size = get_totalsize(mc_header);
- if (!mc_size || mc_size > leftover ||
- microcode_sanity_check(ucode_ptr, 0) < 0)
+ if (!mc_size ||
+ mc_size > size ||
+ microcode_sanity_check(data, 0) < 0)
break;
- leftover -= mc_size;
+ size -= mc_size;
- /*
- * Since APs with same family and model as the BSP may boot in
- * the platform, we need to find and save microcode patches
- * with the same family and model as the BSP.
- */
- if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != UCODE_OK) {
- ucode_ptr += mc_size;
+ if (!microcode_matches(mc_header, uci->cpu_sig.sig)) {
+ data += mc_size;
continue;
}
- num_saved = _save_mc(mc_saved_tmp, ucode_ptr, num_saved);
+ if (save) {
+ save_microcode_patch(data, mc_size);
+ goto next;
+ }
- ucode_ptr += mc_size;
- }
- if (leftover) {
- state = UCODE_ERROR;
- return state;
- }
+ if (!patch) {
+ if (!has_newer_microcode(data,
+ uci->cpu_sig.sig,
+ uci->cpu_sig.pf,
+ uci->cpu_sig.rev))
+ goto next;
+
+ } else {
+ struct microcode_header_intel *phdr = &patch->hdr;
+
+ if (!has_newer_microcode(data,
+ phdr->sig,
+ phdr->pf,
+ phdr->rev))
+ goto next;
+ }
- if (!num_saved) {
- state = UCODE_NFOUND;
- return state;
+ /* We have a newer patch, save it. */
+ patch = data;
+
+next:
+ data += mc_size;
}
- for (i = 0; i < num_saved; i++)
- mc_ptrs[i] = (unsigned long)mc_saved_tmp[i] - start;
+ if (size)
+ return NULL;
+
+ return patch;
+}
- mcs->num_saved = num_saved;
+static void cpuid_1(void)
+{
+ /*
+ * According to the Intel SDM, Volume 3, 9.11.7:
+ *
+ * CPUID returns a value in a model specific register in
+ * addition to its usual register return values. The
+ * semantics of CPUID cause it to deposit an update ID value
+ * in the 64-bit model-specific register at address 08BH
+ * (IA32_BIOS_SIGN_ID). If no update is present in the
+ * processor, the value in the MSR remains unmodified.
+ *
+ * Use native_cpuid -- this code runs very early and we don't
+ * want to mess with paravirt.
+ */
+ unsigned int eax = 1, ebx, ecx = 0, edx;
- return state;
+ native_cpuid(&eax, &ebx, &ecx, &edx);
}
static int collect_cpu_info_early(struct ucode_cpu_info *uci)
{
unsigned int val[2];
unsigned int family, model;
- struct cpu_signature csig;
+ struct cpu_signature csig = { 0 };
unsigned int eax, ebx, ecx, edx;
- csig.sig = 0;
- csig.pf = 0;
- csig.rev = 0;
-
memset(uci, 0, sizeof(*uci));
eax = 0x00000001;
@@ -374,8 +402,8 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci)
native_cpuid(&eax, &ebx, &ecx, &edx);
csig.sig = eax;
- family = x86_family(csig.sig);
- model = x86_model(csig.sig);
+ family = x86_family(eax);
+ model = x86_model(eax);
if ((model >= 5) || (family > 6)) {
/* get processor flags from MSR 0x17 */
@@ -385,7 +413,7 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci)
native_wrmsrl(MSR_IA32_UCODE_REV, 0);
/* As documented in the SDM: Do a CPUID 1 here */
- sync_core();
+ cpuid_1();
/* get the current revision from MSR 0x8B */
native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
@@ -401,40 +429,41 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci)
static void show_saved_mc(void)
{
#ifdef DEBUG
- int i, j;
+ int i = 0, j;
unsigned int sig, pf, rev, total_size, data_size, date;
struct ucode_cpu_info uci;
+ struct ucode_patch *p;
- if (!mc_saved_data.num_saved) {
+ if (list_empty(&microcode_cache)) {
pr_debug("no microcode data saved.\n");
return;
}
- pr_debug("Total microcode saved: %d\n", mc_saved_data.num_saved);
collect_cpu_info_early(&uci);
- sig = uci.cpu_sig.sig;
- pf = uci.cpu_sig.pf;
- rev = uci.cpu_sig.rev;
+ sig = uci.cpu_sig.sig;
+ pf = uci.cpu_sig.pf;
+ rev = uci.cpu_sig.rev;
pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev);
- for (i = 0; i < mc_saved_data.num_saved; i++) {
+ list_for_each_entry(p, &microcode_cache, plist) {
struct microcode_header_intel *mc_saved_header;
struct extended_sigtable *ext_header;
- int ext_sigcount;
struct extended_signature *ext_sig;
+ int ext_sigcount;
+
+ mc_saved_header = (struct microcode_header_intel *)p->data;
- mc_saved_header = (struct microcode_header_intel *)
- mc_saved_data.mc_saved[i];
- sig = mc_saved_header->sig;
- pf = mc_saved_header->pf;
- rev = mc_saved_header->rev;
- total_size = get_totalsize(mc_saved_header);
- data_size = get_datasize(mc_saved_header);
- date = mc_saved_header->date;
+ sig = mc_saved_header->sig;
+ pf = mc_saved_header->pf;
+ rev = mc_saved_header->rev;
+ date = mc_saved_header->date;
+
+ total_size = get_totalsize(mc_saved_header);
+ data_size = get_datasize(mc_saved_header);
pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, total size=0x%x, date = %04x-%02x-%02x\n",
- i, sig, pf, rev, total_size,
+ i++, sig, pf, rev, total_size,
date & 0xffff,
date >> 24,
(date >> 16) & 0xff);
@@ -443,7 +472,7 @@ static void show_saved_mc(void)
if (total_size <= data_size + MC_HEADER_SIZE)
continue;
- ext_header = (void *) mc_saved_header + data_size + MC_HEADER_SIZE;
+ ext_header = (void *)mc_saved_header + data_size + MC_HEADER_SIZE;
ext_sigcount = ext_header->count;
ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
@@ -456,85 +485,43 @@ static void show_saved_mc(void)
ext_sig++;
}
-
}
#endif
}
/*
- * Save this mc into mc_saved_data. So it will be loaded early when a CPU is
- * hot added or resumes.
- *
- * Please make sure this mc should be a valid microcode patch before calling
- * this function.
+ * Save this microcode patch. It will be loaded early when a CPU is
+ * hot-added or resumes.
*/
-static void save_mc_for_early(u8 *mc)
+static void save_mc_for_early(u8 *mc, unsigned int size)
{
#ifdef CONFIG_HOTPLUG_CPU
/* Synchronization during CPU hotplug. */
static DEFINE_MUTEX(x86_cpu_microcode_mutex);
- struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
- unsigned int mc_saved_count_init;
- unsigned int num_saved;
- struct microcode_intel **mc_saved;
- int ret, i;
-
mutex_lock(&x86_cpu_microcode_mutex);
- mc_saved_count_init = mc_saved_data.num_saved;
- num_saved = mc_saved_data.num_saved;
- mc_saved = mc_saved_data.mc_saved;
-
- if (mc_saved && num_saved)
- memcpy(mc_saved_tmp, mc_saved,
- num_saved * sizeof(struct microcode_intel *));
- /*
- * Save the microcode patch mc in mc_save_tmp structure if it's a newer
- * version.
- */
- num_saved = _save_mc(mc_saved_tmp, mc, num_saved);
-
- /*
- * Save the mc_save_tmp in global mc_saved_data.
- */
- ret = save_microcode(&mc_saved_data, mc_saved_tmp, num_saved);
- if (ret) {
- pr_err("Cannot save microcode patch.\n");
- goto out;
- }
-
+ save_microcode_patch(mc, size);
show_saved_mc();
- /*
- * Free old saved microcode data.
- */
- if (mc_saved) {
- for (i = 0; i < mc_saved_count_init; i++)
- kfree(mc_saved[i]);
- kfree(mc_saved);
- }
-
-out:
mutex_unlock(&x86_cpu_microcode_mutex);
#endif
}
-static bool __init load_builtin_intel_microcode(struct cpio_data *cp)
+static bool load_builtin_intel_microcode(struct cpio_data *cp)
{
-#ifdef CONFIG_X86_64
- unsigned int eax = 0x00000001, ebx, ecx = 0, edx;
+ unsigned int eax = 1, ebx, ecx = 0, edx;
char name[30];
+ if (IS_ENABLED(CONFIG_X86_32))
+ return false;
+
native_cpuid(&eax, &ebx, &ecx, &edx);
sprintf(name, "intel-ucode/%02x-%02x-%02x",
x86_family(eax), x86_model(eax), x86_stepping(eax));
return get_builtin_firmware(cp, name);
-#else
- return false;
-#endif
}
/*
@@ -570,8 +557,7 @@ void show_ucode_info_early(void)
}
/*
- * At this point, we can not call printk() yet. Keep microcode patch number in
- * mc_saved_data.mc_saved and delay printing microcode info in
+ * At this point, we can not call printk() yet. Delay printing microcode info in
* show_ucode_info_early() until printk() works.
*/
static void print_ucode(struct ucode_cpu_info *uci)
@@ -627,7 +613,7 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
native_wrmsrl(MSR_IA32_UCODE_REV, 0);
/* As documented in the SDM: Do a CPUID 1 here */
- sync_core();
+ cpuid_1();
/* get the current revision from MSR 0x8B */
native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
@@ -648,206 +634,140 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
return 0;
}
-/*
- * This function converts microcode patch offsets previously stored in
- * mc_tmp_ptrs to pointers and stores the pointers in mc_saved_data.
- */
int __init save_microcode_in_initrd_intel(void)
{
- struct microcode_intel *mc_saved[MAX_UCODE_COUNT];
- unsigned int count = mc_saved_data.num_saved;
- unsigned long offset = 0;
- int ret;
-
- if (!count)
- return 0;
+ struct ucode_cpu_info uci;
+ struct cpio_data cp;
/*
- * We have found a valid initrd but it might've been relocated in the
- * meantime so get its updated address.
+ * AP loading didn't find any microcode patch, no need to save anything.
*/
- if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && blobs.valid)
- offset = initrd_start;
-
- copy_ptrs(mc_saved, mc_tmp_ptrs, offset, count);
+ if (!intel_ucode_patch || IS_ERR(intel_ucode_patch))
+ return 0;
- ret = save_microcode(&mc_saved_data, mc_saved, count);
- if (ret)
- pr_err("Cannot save microcode patches from initrd.\n");
- else
- show_saved_mc();
+ if (!load_builtin_intel_microcode(&cp))
+ cp = find_microcode_in_initrd(ucode_path, false);
- return ret;
-}
+ if (!(cp.data && cp.size))
+ return 0;
-static __init enum ucode_state
-__scan_microcode_initrd(struct cpio_data *cd, struct ucode_blobs *blbp)
-{
-#ifdef CONFIG_BLK_DEV_INITRD
- static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin";
- char *p = IS_ENABLED(CONFIG_X86_32) ? (char *)__pa_nodebug(ucode_name)
- : ucode_name;
-# ifdef CONFIG_X86_32
- unsigned long start = 0, size;
- struct boot_params *params;
+ collect_cpu_info_early(&uci);
- params = (struct boot_params *)__pa_nodebug(&boot_params);
- size = params->hdr.ramdisk_size;
+ scan_microcode(cp.data, cp.size, &uci, true);
- /*
- * Set start only if we have an initrd image. We cannot use initrd_start
- * because it is not set that early yet.
- */
- start = (size ? params->hdr.ramdisk_image : 0);
+ show_saved_mc();
-# else /* CONFIG_X86_64 */
- unsigned long start = 0, size;
+ return 0;
+}
- size = (u64)boot_params.ext_ramdisk_size << 32;
- size |= boot_params.hdr.ramdisk_size;
- if (size) {
- start = (u64)boot_params.ext_ramdisk_image << 32;
- start |= boot_params.hdr.ramdisk_image;
+/*
+ * @res_patch, output: a pointer to the patch we found.
+ */
+static struct microcode_intel *__load_ucode_intel(struct ucode_cpu_info *uci)
+{
+ static const char *path;
+ struct cpio_data cp;
+ bool use_pa;
- start += PAGE_OFFSET;
+ if (IS_ENABLED(CONFIG_X86_32)) {
+ path = (const char *)__pa_nodebug(ucode_path);
+ use_pa = true;
+ } else {
+ path = ucode_path;
+ use_pa = false;
}
-# endif
-
- *cd = find_cpio_data(p, (void *)start, size, NULL);
- if (cd->data) {
- blbp->start = start;
- blbp->valid = true;
- return UCODE_OK;
- } else
-#endif /* CONFIG_BLK_DEV_INITRD */
- return UCODE_ERROR;
-}
+ /* try built-in microcode first */
+ if (!load_builtin_intel_microcode(&cp))
+ cp = find_microcode_in_initrd(path, use_pa);
-static __init enum ucode_state
-scan_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs,
- struct ucode_cpu_info *uci, struct ucode_blobs *blbp)
-{
- struct cpio_data cd = { NULL, 0, "" };
- enum ucode_state ret;
+ if (!(cp.data && cp.size))
+ return NULL;
- /* try built-in microcode first */
- if (load_builtin_intel_microcode(&cd))
- /*
- * Invalidate blobs as we might've gotten an initrd too,
- * supplied by the boot loader, by mistake or simply forgotten
- * there. That's fine, we ignore it since we've found builtin
- * microcode already.
- */
- blbp->valid = false;
- else {
- ret = __scan_microcode_initrd(&cd, blbp);
- if (ret != UCODE_OK)
- return ret;
- }
+ collect_cpu_info_early(uci);
- return get_matching_model_microcode(blbp->start, cd.data, cd.size,
- mcs, mc_ptrs, uci);
+ return scan_microcode(cp.data, cp.size, uci, false);
}
-static void __init
-_load_ucode_intel_bsp(struct mc_saved_data *mcs, unsigned long *mc_ptrs,
- struct ucode_blobs *blbp)
+void __init load_ucode_intel_bsp(void)
{
+ struct microcode_intel *patch;
struct ucode_cpu_info uci;
- enum ucode_state ret;
- collect_cpu_info_early(&uci);
-
- ret = scan_microcode(mcs, mc_ptrs, &uci, blbp);
- if (ret != UCODE_OK)
+ patch = __load_ucode_intel(&uci);
+ if (!patch)
return;
- ret = load_microcode(mcs, mc_ptrs, blbp->start, &uci);
- if (ret != UCODE_OK)
- return;
+ uci.mc = patch;
apply_microcode_early(&uci, true);
}
-void __init load_ucode_intel_bsp(void)
+void load_ucode_intel_ap(void)
{
- struct ucode_blobs *blobs_p;
- struct mc_saved_data *mcs;
- unsigned long *ptrs;
+ struct microcode_intel *patch, **iup;
+ struct ucode_cpu_info uci;
-#ifdef CONFIG_X86_32
- mcs = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data);
- ptrs = (unsigned long *)__pa_nodebug(&mc_tmp_ptrs);
- blobs_p = (struct ucode_blobs *)__pa_nodebug(&blobs);
-#else
- mcs = &mc_saved_data;
- ptrs = mc_tmp_ptrs;
- blobs_p = &blobs;
-#endif
+ if (IS_ENABLED(CONFIG_X86_32))
+ iup = (struct microcode_intel **) __pa_nodebug(&intel_ucode_patch);
+ else
+ iup = &intel_ucode_patch;
+
+reget:
+ if (!*iup) {
+ patch = __load_ucode_intel(&uci);
+ if (!patch)
+ return;
+
+ *iup = patch;
+ }
+
+ uci.mc = *iup;
- _load_ucode_intel_bsp(mcs, ptrs, blobs_p);
+ if (apply_microcode_early(&uci, true)) {
+ /* Mixed-silicon system? Try to refetch the proper patch: */
+ *iup = NULL;
+
+ goto reget;
+ }
}
-void load_ucode_intel_ap(void)
+static struct microcode_intel *find_patch(struct ucode_cpu_info *uci)
{
- struct ucode_blobs *blobs_p;
- unsigned long *ptrs, start = 0;
- struct mc_saved_data *mcs;
- struct ucode_cpu_info uci;
- enum ucode_state ret;
-
-#ifdef CONFIG_X86_32
- mcs = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data);
- ptrs = (unsigned long *)__pa_nodebug(mc_tmp_ptrs);
- blobs_p = (struct ucode_blobs *)__pa_nodebug(&blobs);
-#else
- mcs = &mc_saved_data;
- ptrs = mc_tmp_ptrs;
- blobs_p = &blobs;
-#endif
+ struct microcode_header_intel *phdr;
+ struct ucode_patch *iter, *tmp;
- /*
- * If there is no valid ucode previously saved in memory, no need to
- * update ucode on this AP.
- */
- if (!mcs->num_saved)
- return;
+ list_for_each_entry_safe(iter, tmp, &microcode_cache, plist) {
- if (blobs_p->valid) {
- start = blobs_p->start;
+ phdr = (struct microcode_header_intel *)iter->data;
- /*
- * Pay attention to CONFIG_RANDOMIZE_MEMORY=y as it shuffles
- * physmem mapping too and there we have the initrd.
- */
- start += PAGE_OFFSET - __PAGE_OFFSET_BASE;
- }
+ if (phdr->rev <= uci->cpu_sig.rev)
+ continue;
- collect_cpu_info_early(&uci);
- ret = load_microcode(mcs, ptrs, start, &uci);
- if (ret != UCODE_OK)
- return;
+ if (!find_matching_signature(phdr,
+ uci->cpu_sig.sig,
+ uci->cpu_sig.pf))
+ continue;
- apply_microcode_early(&uci, true);
+ return iter->data;
+ }
+ return NULL;
}
void reload_ucode_intel(void)
{
+ struct microcode_intel *p;
struct ucode_cpu_info uci;
- enum ucode_state ret;
-
- if (!mc_saved_data.num_saved)
- return;
collect_cpu_info_early(&uci);
- ret = find_microcode_patch(mc_saved_data.mc_saved,
- mc_saved_data.num_saved, &uci);
- if (ret != UCODE_OK)
+ p = find_patch(&uci);
+ if (!p)
return;
+ uci.mc = p;
+
apply_microcode_early(&uci, false);
}
@@ -879,24 +799,6 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
return 0;
}
-/*
- * return 0 - no update found
- * return 1 - found update
- */
-static int get_matching_mc(struct microcode_intel *mc, int cpu)
-{
- struct cpu_signature cpu_sig;
- unsigned int csig, cpf, crev;
-
- collect_cpu_info(cpu, &cpu_sig);
-
- csig = cpu_sig.sig;
- cpf = cpu_sig.pf;
- crev = cpu_sig.rev;
-
- return has_newer_microcode(mc, csig, cpf, crev);
-}
-
static int apply_microcode_intel(int cpu)
{
struct microcode_intel *mc;
@@ -911,23 +813,19 @@ static int apply_microcode_intel(int cpu)
uci = ucode_cpu_info + cpu;
mc = uci->mc;
- if (!mc)
- return 0;
-
- /*
- * Microcode on this CPU could be updated earlier. Only apply the
- * microcode patch in mc when it is newer than the one on this
- * CPU.
- */
- if (!get_matching_mc(mc, cpu))
- return 0;
+ if (!mc) {
+ /* Look for a newer patch in our cache: */
+ mc = find_patch(uci);
+ if (!mc)
+ return 0;
+ }
/* write microcode via MSR 0x79 */
wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
wrmsrl(MSR_IA32_UCODE_REV, 0);
/* As documented in the SDM: Do a CPUID 1 here */
- sync_core();
+ cpuid_1();
/* get the current revision from MSR 0x8B */
rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
@@ -962,7 +860,6 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL;
int new_rev = uci->cpu_sig.rev;
unsigned int leftover = size;
- enum ucode_state state = UCODE_OK;
unsigned int curr_mc_size = 0;
unsigned int csig, cpf;
@@ -1015,14 +912,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
if (leftover) {
vfree(new_mc);
- state = UCODE_ERROR;
- goto out;
+ return UCODE_ERROR;
}
- if (!new_mc) {
- state = UCODE_NFOUND;
- goto out;
- }
+ if (!new_mc)
+ return UCODE_NFOUND;
vfree(uci->mc);
uci->mc = (struct microcode_intel *)new_mc;
@@ -1032,12 +926,12 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
* permanent memory. So it will be loaded early when a CPU is hot added
* or resumes.
*/
- save_mc_for_early(new_mc);
+ save_mc_for_early(new_mc, curr_mc_size);
pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
cpu, new_rev, uci->cpu_sig.rev);
-out:
- return state;
+
+ return UCODE_OK;
}
static int get_ucode_fw(void *to, const void *from, size_t n)
@@ -1081,20 +975,11 @@ request_microcode_user(int cpu, const void __user *buf, size_t size)
return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
}
-static void microcode_fini_cpu(int cpu)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
- vfree(uci->mc);
- uci->mc = NULL;
-}
-
static struct microcode_ops microcode_intel_ops = {
.request_microcode_user = request_microcode_user,
.request_microcode_fw = request_microcode_fw,
.collect_cpu_info = collect_cpu_info,
.apply_microcode = apply_microcode_intel,
- .microcode_fini_cpu = microcode_fini_cpu,
};
struct microcode_ops * __init init_intel_microcode(void)
@@ -1109,4 +994,3 @@ struct microcode_ops * __init init_intel_microcode(void)
return &microcode_intel_ops;
}
-
diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c
deleted file mode 100644
index 406cb6c0d9dd..000000000000
--- a/arch/x86/kernel/cpu/microcode/intel_lib.c
+++ /dev/null
@@ -1,184 +0,0 @@
-/*
- * Intel CPU Microcode Update Driver for Linux
- *
- * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com>
- * H Peter Anvin" <hpa@zytor.com>
- *
- * This driver allows to upgrade microcode on Intel processors
- * belonging to IA-32 family - PentiumPro, Pentium II,
- * Pentium III, Xeon, Pentium 4, etc.
- *
- * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
- * Software Developer's Manual
- * Order Number 253668 or free download from:
- *
- * http://developer.intel.com/Assets/PDF/manual/253668.pdf
- *
- * For more information, go to http://www.urbanmyth.org/microcode
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- */
-#include <linux/firmware.h>
-#include <linux/uaccess.h>
-#include <linux/kernel.h>
-
-#include <asm/microcode_intel.h>
-#include <asm/processor.h>
-#include <asm/msr.h>
-
-static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1,
- unsigned int s2, unsigned int p2)
-{
- if (s1 != s2)
- return false;
-
- /* Processor flags are either both 0 ... */
- if (!p1 && !p2)
- return true;
-
- /* ... or they intersect. */
- return p1 & p2;
-}
-
-int microcode_sanity_check(void *mc, int print_err)
-{
- unsigned long total_size, data_size, ext_table_size;
- struct microcode_header_intel *mc_header = mc;
- struct extended_sigtable *ext_header = NULL;
- u32 sum, orig_sum, ext_sigcount = 0, i;
- struct extended_signature *ext_sig;
-
- total_size = get_totalsize(mc_header);
- data_size = get_datasize(mc_header);
-
- if (data_size + MC_HEADER_SIZE > total_size) {
- if (print_err)
- pr_err("Error: bad microcode data file size.\n");
- return -EINVAL;
- }
-
- if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
- if (print_err)
- pr_err("Error: invalid/unknown microcode update format.\n");
- return -EINVAL;
- }
-
- ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
- if (ext_table_size) {
- u32 ext_table_sum = 0;
- u32 *ext_tablep;
-
- if ((ext_table_size < EXT_HEADER_SIZE)
- || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
- if (print_err)
- pr_err("Error: truncated extended signature table.\n");
- return -EINVAL;
- }
-
- ext_header = mc + MC_HEADER_SIZE + data_size;
- if (ext_table_size != exttable_size(ext_header)) {
- if (print_err)
- pr_err("Error: extended signature table size mismatch.\n");
- return -EFAULT;
- }
-
- ext_sigcount = ext_header->count;
-
- /*
- * Check extended table checksum: the sum of all dwords that
- * comprise a valid table must be 0.
- */
- ext_tablep = (u32 *)ext_header;
-
- i = ext_table_size / sizeof(u32);
- while (i--)
- ext_table_sum += ext_tablep[i];
-
- if (ext_table_sum) {
- if (print_err)
- pr_warn("Bad extended signature table checksum, aborting.\n");
- return -EINVAL;
- }
- }
-
- /*
- * Calculate the checksum of update data and header. The checksum of
- * valid update data and header including the extended signature table
- * must be 0.
- */
- orig_sum = 0;
- i = (MC_HEADER_SIZE + data_size) / sizeof(u32);
- while (i--)
- orig_sum += ((u32 *)mc)[i];
-
- if (orig_sum) {
- if (print_err)
- pr_err("Bad microcode data checksum, aborting.\n");
- return -EINVAL;
- }
-
- if (!ext_table_size)
- return 0;
-
- /*
- * Check extended signature checksum: 0 => valid.
- */
- for (i = 0; i < ext_sigcount; i++) {
- ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
- EXT_SIGNATURE_SIZE * i;
-
- sum = (mc_header->sig + mc_header->pf + mc_header->cksum) -
- (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
- if (sum) {
- if (print_err)
- pr_err("Bad extended signature checksum, aborting.\n");
- return -EINVAL;
- }
- }
- return 0;
-}
-
-/*
- * Returns 1 if update has been found, 0 otherwise.
- */
-int find_matching_signature(void *mc, unsigned int csig, int cpf)
-{
- struct microcode_header_intel *mc_hdr = mc;
- struct extended_sigtable *ext_hdr;
- struct extended_signature *ext_sig;
- int i;
-
- if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf))
- return 1;
-
- /* Look for ext. headers: */
- if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE)
- return 0;
-
- ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE;
- ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE;
-
- for (i = 0; i < ext_hdr->count; i++) {
- if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf))
- return 1;
- ext_sig++;
- }
- return 0;
-}
-
-/*
- * Returns 1 if update has been found, 0 otherwise.
- */
-int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev)
-{
- struct microcode_header_intel *mc_hdr = mc;
-
- if (mc_hdr->rev <= new_rev)
- return 0;
-
- return find_matching_signature(mc, csig, cpf);
-}
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 8f44c5a50ab8..f37e02e41a77 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -25,12 +25,12 @@
#include <asm/hyperv.h>
#include <asm/mshyperv.h>
#include <asm/desc.h>
-#include <asm/idle.h>
#include <asm/irq_regs.h>
#include <asm/i8259.h>
#include <asm/apic.h>
#include <asm/timer.h>
#include <asm/reboot.h>
+#include <asm/nmi.h>
struct ms_hyperv_info ms_hyperv;
EXPORT_SYMBOL_GPL(ms_hyperv);
@@ -158,6 +158,26 @@ static unsigned char hv_get_nmi_reason(void)
return 0;
}
+#ifdef CONFIG_X86_LOCAL_APIC
+/*
+ * Prior to WS2016 Debug-VM sends NMIs to all CPUs which makes
+ * it dificult to process CHANNELMSG_UNLOAD in case of crash. Handle
+ * unknown NMI on the first CPU which gets it.
+ */
+static int hv_nmi_unknown(unsigned int val, struct pt_regs *regs)
+{
+ static atomic_t nmi_cpu = ATOMIC_INIT(-1);
+
+ if (!unknown_nmi_panic)
+ return NMI_DONE;
+
+ if (atomic_cmpxchg(&nmi_cpu, -1, raw_smp_processor_id()) != -1)
+ return NMI_HANDLED;
+
+ return NMI_DONE;
+}
+#endif
+
static void __init ms_hyperv_init_platform(void)
{
/*
@@ -183,6 +203,9 @@ static void __init ms_hyperv_init_platform(void)
pr_info("HyperV: LAPIC Timer Frequency: %#x\n",
lapic_timer_frequency);
}
+
+ register_nmi_handler(NMI_UNKNOWN, hv_nmi_unknown, NMI_FLAG_FIRST,
+ "hv_nmi_unknown");
#endif
if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE)
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 1db8dc490b66..d9794060fe22 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -17,11 +17,20 @@ struct cpuid_bit {
u32 sub_leaf;
};
-enum cpuid_regs {
- CR_EAX = 0,
- CR_ECX,
- CR_EDX,
- CR_EBX
+/* Please keep the leaf sorted by cpuid_bit.level for faster search. */
+static const struct cpuid_bit cpuid_bits[] = {
+ { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 },
+ { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 },
+ { X86_FEATURE_INTEL_PT, CPUID_EBX, 25, 0x00000007, 0 },
+ { X86_FEATURE_AVX512_4VNNIW, CPUID_EDX, 2, 0x00000007, 0 },
+ { X86_FEATURE_AVX512_4FMAPS, CPUID_EDX, 3, 0x00000007, 0 },
+ { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 },
+ { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 },
+ { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 },
+ { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 },
+ { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
+ { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
+ { 0, 0, 0, 0, 0 }
};
void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
@@ -30,18 +39,6 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
u32 regs[4];
const struct cpuid_bit *cb;
- static const struct cpuid_bit cpuid_bits[] = {
- { X86_FEATURE_INTEL_PT, CR_EBX,25, 0x00000007, 0 },
- { X86_FEATURE_AVX512_4VNNIW, CR_EDX, 2, 0x00000007, 0 },
- { X86_FEATURE_AVX512_4FMAPS, CR_EDX, 3, 0x00000007, 0 },
- { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 },
- { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 },
- { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 },
- { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 },
- { X86_FEATURE_PROC_FEEDBACK, CR_EDX,11, 0x80000007, 0 },
- { 0, 0, 0, 0, 0 }
- };
-
for (cb = cpuid_bits; cb->feature; cb++) {
/* Verify that the level is valid */
@@ -50,10 +47,35 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
max_level > (cb->level | 0xffff))
continue;
- cpuid_count(cb->level, cb->sub_leaf, &regs[CR_EAX],
- &regs[CR_EBX], &regs[CR_ECX], &regs[CR_EDX]);
+ cpuid_count(cb->level, cb->sub_leaf, &regs[CPUID_EAX],
+ &regs[CPUID_EBX], &regs[CPUID_ECX],
+ &regs[CPUID_EDX]);
if (regs[cb->reg] & (1 << cb->bit))
set_cpu_cap(c, cb->feature);
}
}
+
+u32 get_scattered_cpuid_leaf(unsigned int level, unsigned int sub_leaf,
+ enum cpuid_regs_idx reg)
+{
+ const struct cpuid_bit *cb;
+ u32 cpuid_val = 0;
+
+ for (cb = cpuid_bits; cb->feature; cb++) {
+
+ if (level > cb->level)
+ continue;
+
+ if (level < cb->level)
+ break;
+
+ if (reg == cb->reg && sub_leaf == cb->sub_leaf) {
+ if (cpu_has(&boot_cpu_data, cb->feature))
+ cpuid_val |= BIT(cb->bit);
+ }
+ }
+
+ return cpuid_val;
+}
+EXPORT_SYMBOL_GPL(get_scattered_cpuid_leaf);
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 5130985b758b..891f4dad7b2c 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -24,11 +24,16 @@
#include <linux/dmi.h>
#include <linux/init.h>
#include <linux/export.h>
+#include <linux/clocksource.h>
#include <asm/div64.h>
#include <asm/x86_init.h>
#include <asm/hypervisor.h>
#include <asm/timer.h>
#include <asm/apic.h>
+#include <asm/timer.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "vmware: " fmt
#define CPUID_VMWARE_INFO_LEAF 0x40000000
#define VMWARE_HYPERVISOR_MAGIC 0x564D5868
@@ -48,6 +53,8 @@
"2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) : \
"memory");
+static unsigned long vmware_tsc_khz __ro_after_init;
+
static inline int __vmware_platform(void)
{
uint32_t eax, ebx, ecx, edx;
@@ -57,35 +64,80 @@ static inline int __vmware_platform(void)
static unsigned long vmware_get_tsc_khz(void)
{
- uint64_t tsc_hz, lpj;
- uint32_t eax, ebx, ecx, edx;
+ return vmware_tsc_khz;
+}
- VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
+#ifdef CONFIG_PARAVIRT
+static struct cyc2ns_data vmware_cyc2ns __ro_after_init;
+static int vmw_sched_clock __initdata = 1;
- tsc_hz = eax | (((uint64_t)ebx) << 32);
- do_div(tsc_hz, 1000);
- BUG_ON(tsc_hz >> 32);
- pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n",
- (unsigned long) tsc_hz / 1000,
- (unsigned long) tsc_hz % 1000);
-
- if (!preset_lpj) {
- lpj = ((u64)tsc_hz * 1000);
- do_div(lpj, HZ);
- preset_lpj = lpj;
- }
+static __init int setup_vmw_sched_clock(char *s)
+{
+ vmw_sched_clock = 0;
+ return 0;
+}
+early_param("no-vmw-sched-clock", setup_vmw_sched_clock);
+
+static unsigned long long vmware_sched_clock(void)
+{
+ unsigned long long ns;
- return tsc_hz;
+ ns = mul_u64_u32_shr(rdtsc(), vmware_cyc2ns.cyc2ns_mul,
+ vmware_cyc2ns.cyc2ns_shift);
+ ns -= vmware_cyc2ns.cyc2ns_offset;
+ return ns;
}
+static void __init vmware_sched_clock_setup(void)
+{
+ struct cyc2ns_data *d = &vmware_cyc2ns;
+ unsigned long long tsc_now = rdtsc();
+
+ clocks_calc_mult_shift(&d->cyc2ns_mul, &d->cyc2ns_shift,
+ vmware_tsc_khz, NSEC_PER_MSEC, 0);
+ d->cyc2ns_offset = mul_u64_u32_shr(tsc_now, d->cyc2ns_mul,
+ d->cyc2ns_shift);
+
+ pv_time_ops.sched_clock = vmware_sched_clock;
+ pr_info("using sched offset of %llu ns\n", d->cyc2ns_offset);
+}
+
+static void __init vmware_paravirt_ops_setup(void)
+{
+ pv_info.name = "VMware hypervisor";
+ pv_cpu_ops.io_delay = paravirt_nop;
+
+ if (vmware_tsc_khz && vmw_sched_clock)
+ vmware_sched_clock_setup();
+}
+#else
+#define vmware_paravirt_ops_setup() do {} while (0)
+#endif
+
static void __init vmware_platform_setup(void)
{
uint32_t eax, ebx, ecx, edx;
+ uint64_t lpj, tsc_khz;
VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
if (ebx != UINT_MAX) {
+ lpj = tsc_khz = eax | (((uint64_t)ebx) << 32);
+ do_div(tsc_khz, 1000);
+ WARN_ON(tsc_khz >> 32);
+ pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n",
+ (unsigned long) tsc_khz / 1000,
+ (unsigned long) tsc_khz % 1000);
+
+ if (!preset_lpj) {
+ do_div(lpj, HZ);
+ preset_lpj = lpj;
+ }
+
+ vmware_tsc_khz = tsc_khz;
x86_platform.calibrate_tsc = vmware_get_tsc_khz;
+ x86_platform.calibrate_cpu = vmware_get_tsc_khz;
+
#ifdef CONFIG_X86_LOCAL_APIC
/* Skip lapic calibration since we know the bus frequency. */
lapic_timer_frequency = ecx / HZ;
@@ -96,6 +148,8 @@ static void __init vmware_platform_setup(void)
pr_warn("Failed to get TSC freq from the hypervisor\n");
}
+ vmware_paravirt_ops_setup();
+
#ifdef CONFIG_X86_IO_APIC
no_timer_check = 1;
#endif
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 2836de390f95..0931a105ffe1 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -45,10 +45,7 @@
#include <asm/msr.h>
static struct class *cpuid_class;
-
-struct cpuid_regs {
- u32 eax, ebx, ecx, edx;
-};
+static enum cpuhp_state cpuhp_cpuid_state;
static void cpuid_smp_cpuid(void *cmd_block)
{
@@ -115,7 +112,7 @@ static const struct file_operations cpuid_fops = {
.open = cpuid_open,
};
-static int cpuid_device_create(int cpu)
+static int cpuid_device_create(unsigned int cpu)
{
struct device *dev;
@@ -124,35 +121,12 @@ static int cpuid_device_create(int cpu)
return PTR_ERR_OR_ZERO(dev);
}
-static void cpuid_device_destroy(int cpu)
+static int cpuid_device_destroy(unsigned int cpu)
{
device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
+ return 0;
}
-static int cpuid_class_cpu_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
-{
- unsigned int cpu = (unsigned long)hcpu;
- int err = 0;
-
- switch (action) {
- case CPU_UP_PREPARE:
- err = cpuid_device_create(cpu);
- break;
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DEAD:
- cpuid_device_destroy(cpu);
- break;
- }
- return notifier_from_errno(err);
-}
-
-static struct notifier_block cpuid_class_cpu_notifier =
-{
- .notifier_call = cpuid_class_cpu_callback,
-};
-
static char *cpuid_devnode(struct device *dev, umode_t *mode)
{
return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
@@ -160,15 +134,13 @@ static char *cpuid_devnode(struct device *dev, umode_t *mode)
static int __init cpuid_init(void)
{
- int i, err = 0;
- i = 0;
+ int err;
if (__register_chrdev(CPUID_MAJOR, 0, NR_CPUS,
"cpu/cpuid", &cpuid_fops)) {
printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n",
CPUID_MAJOR);
- err = -EBUSY;
- goto out;
+ return -EBUSY;
}
cpuid_class = class_create(THIS_MODULE, "cpuid");
if (IS_ERR(cpuid_class)) {
@@ -177,45 +149,28 @@ static int __init cpuid_init(void)
}
cpuid_class->devnode = cpuid_devnode;
- cpu_notifier_register_begin();
- for_each_online_cpu(i) {
- err = cpuid_device_create(i);
- if (err != 0)
- goto out_class;
- }
- __register_hotcpu_notifier(&cpuid_class_cpu_notifier);
- cpu_notifier_register_done();
+ err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/cpuid:online",
+ cpuid_device_create, cpuid_device_destroy);
+ if (err < 0)
+ goto out_class;
- err = 0;
- goto out;
+ cpuhp_cpuid_state = err;
+ return 0;
out_class:
- i = 0;
- for_each_online_cpu(i) {
- cpuid_device_destroy(i);
- }
- cpu_notifier_register_done();
class_destroy(cpuid_class);
out_chrdev:
__unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
-out:
return err;
}
+module_init(cpuid_init);
static void __exit cpuid_exit(void)
{
- int cpu = 0;
-
- cpu_notifier_register_begin();
- for_each_online_cpu(cpu)
- cpuid_device_destroy(cpu);
+ cpuhp_remove_state(cpuhp_cpuid_state);
class_destroy(cpuid_class);
__unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
- __unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
- cpu_notifier_register_done();
}
-
-module_init(cpuid_init);
module_exit(cpuid_exit);
MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 650830e39e3a..3741461c63a0 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -631,9 +631,9 @@ static int determine_backup_region(u64 start, u64 end, void *arg)
int crash_load_segments(struct kimage *image)
{
- unsigned long src_start, src_sz, elf_sz;
- void *elf_addr;
int ret;
+ struct kexec_buf kbuf = { .image = image, .buf_min = 0,
+ .buf_max = ULONG_MAX, .top_down = false };
/*
* Determine and load a segment for backup area. First 640K RAM
@@ -647,43 +647,44 @@ int crash_load_segments(struct kimage *image)
if (ret < 0)
return ret;
- src_start = image->arch.backup_src_start;
- src_sz = image->arch.backup_src_sz;
-
/* Add backup segment. */
- if (src_sz) {
+ if (image->arch.backup_src_sz) {
+ kbuf.buffer = &crash_zero_bytes;
+ kbuf.bufsz = sizeof(crash_zero_bytes);
+ kbuf.memsz = image->arch.backup_src_sz;
+ kbuf.buf_align = PAGE_SIZE;
/*
* Ideally there is no source for backup segment. This is
* copied in purgatory after crash. Just add a zero filled
* segment for now to make sure checksum logic works fine.
*/
- ret = kexec_add_buffer(image, (char *)&crash_zero_bytes,
- sizeof(crash_zero_bytes), src_sz,
- PAGE_SIZE, 0, -1, 0,
- &image->arch.backup_load_addr);
+ ret = kexec_add_buffer(&kbuf);
if (ret)
return ret;
+ image->arch.backup_load_addr = kbuf.mem;
pr_debug("Loaded backup region at 0x%lx backup_start=0x%lx memsz=0x%lx\n",
- image->arch.backup_load_addr, src_start, src_sz);
+ image->arch.backup_load_addr,
+ image->arch.backup_src_start, kbuf.memsz);
}
/* Prepare elf headers and add a segment */
- ret = prepare_elf_headers(image, &elf_addr, &elf_sz);
+ ret = prepare_elf_headers(image, &kbuf.buffer, &kbuf.bufsz);
if (ret)
return ret;
- image->arch.elf_headers = elf_addr;
- image->arch.elf_headers_sz = elf_sz;
+ image->arch.elf_headers = kbuf.buffer;
+ image->arch.elf_headers_sz = kbuf.bufsz;
- ret = kexec_add_buffer(image, (char *)elf_addr, elf_sz, elf_sz,
- ELF_CORE_HEADER_ALIGN, 0, -1, 0,
- &image->arch.elf_load_addr);
+ kbuf.memsz = kbuf.bufsz;
+ kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
+ ret = kexec_add_buffer(&kbuf);
if (ret) {
vfree((void *)image->arch.elf_headers);
return ret;
}
+ image->arch.elf_load_addr = kbuf.mem;
pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
- image->arch.elf_load_addr, elf_sz, elf_sz);
+ image->arch.elf_load_addr, kbuf.bufsz, kbuf.bufsz);
return ret;
}
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 85f854b98a9d..0cfd01d2754c 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -22,7 +22,6 @@
int panic_on_unrecovered_nmi;
int panic_on_io_nmi;
unsigned int code_bytes = 64;
-int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
static int die_counter;
bool in_task_stack(unsigned long *stack, struct task_struct *task,
@@ -46,14 +45,7 @@ static void printk_stack_address(unsigned long address, int reliable,
char *log_lvl)
{
touch_nmi_watchdog();
- printk("%s [<%p>] %s%pB\n",
- log_lvl, (void *)address, reliable ? "" : "? ",
- (void *)address);
-}
-
-void printk_address(unsigned long address)
-{
- pr_cont(" [<%p>] %pS\n", (void *)address, (void *)address);
+ printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address);
}
void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
@@ -67,6 +59,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
printk("%sCall Trace:\n", log_lvl);
unwind_start(&state, task, regs, stack);
+ stack = stack ? : get_stack_pointer(task, regs);
/*
* Iterate through the stacks, starting with the current stack pointer.
@@ -82,8 +75,8 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
* - softirq stack
* - hardirq stack
*/
- for (; stack; stack = stack_info.next_sp) {
- const char *str_begin, *str_end;
+ for (regs = NULL; stack; stack = stack_info.next_sp) {
+ const char *stack_name;
/*
* If we overflowed the task stack into a guard page, jump back
@@ -95,9 +88,9 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
if (get_stack_info(stack, task, &stack_info, &visit_mask))
break;
- stack_type_str(stack_info.type, &str_begin, &str_end);
- if (str_begin)
- printk("%s <%s> ", log_lvl, str_begin);
+ stack_name = stack_type_name(stack_info.type);
+ if (stack_name)
+ printk("%s <%s>\n", log_lvl, stack_name);
/*
* Scan the stack, printing any text addresses we find. At the
@@ -119,6 +112,15 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
if (!__kernel_text_address(addr))
continue;
+ /*
+ * Don't print regs->ip again if it was already printed
+ * by __show_regs() below.
+ */
+ if (regs && stack == &regs->ip) {
+ unwind_next_frame(&state);
+ continue;
+ }
+
if (stack == ret_addr_p)
reliable = 1;
@@ -146,10 +148,15 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
* of the addresses will just be printed as unreliable.
*/
unwind_next_frame(&state);
+
+ /* if the frame has entry regs, print them */
+ regs = unwind_get_entry_regs(&state);
+ if (regs)
+ __show_regs(regs, 0);
}
- if (str_end)
- printk("%s <%s> ", log_lvl, str_end);
+ if (stack_name)
+ printk("%s </%s>\n", log_lvl, stack_name);
}
}
@@ -164,12 +171,12 @@ void show_stack(struct task_struct *task, unsigned long *sp)
if (!sp && task == current)
sp = get_stack_pointer(current, NULL);
- show_stack_log_lvl(task, NULL, sp, "");
+ show_trace_log_lvl(task, NULL, sp, KERN_DEFAULT);
}
void show_stack_regs(struct pt_regs *regs)
{
- show_stack_log_lvl(current, regs, NULL, "");
+ show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT);
}
static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
@@ -261,14 +268,11 @@ int __die(const char *str, struct pt_regs *regs, long err)
sp = kernel_stack_pointer(regs);
savesegment(ss, ss);
}
- printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
- print_symbol("%s", regs->ip);
- printk(" SS:ESP %04x:%08lx\n", ss, sp);
+ printk(KERN_EMERG "EIP: %pS SS:ESP: %04x:%08lx\n",
+ (void *)regs->ip, ss, sp);
#else
/* Executive summary in case the oops scrolled away */
- printk(KERN_ALERT "RIP ");
- printk_address(regs->ip);
- printk(" RSP <%016lx>\n", regs->sp);
+ printk(KERN_ALERT "RIP: %pS RSP: %016lx\n", (void *)regs->ip, regs->sp);
#endif
return 0;
}
@@ -291,22 +295,6 @@ void die(const char *str, struct pt_regs *regs, long err)
oops_end(flags, regs, sig);
}
-static int __init kstack_setup(char *s)
-{
- ssize_t ret;
- unsigned long val;
-
- if (!s)
- return -EINVAL;
-
- ret = kstrtoul(s, 0, &val);
- if (ret)
- return ret;
- kstack_depth_to_print = val;
- return 0;
-}
-early_param("kstack", kstack_setup);
-
static int __init code_bytes_setup(char *s)
{
ssize_t ret;
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 06eb322b5f9f..bb3b5b9a6899 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -16,18 +16,15 @@
#include <asm/stacktrace.h>
-void stack_type_str(enum stack_type type, const char **begin, const char **end)
+const char *stack_type_name(enum stack_type type)
{
- switch (type) {
- case STACK_TYPE_IRQ:
- case STACK_TYPE_SOFTIRQ:
- *begin = "IRQ";
- *end = "EOI";
- break;
- default:
- *begin = NULL;
- *end = NULL;
- }
+ if (type == STACK_TYPE_IRQ)
+ return "IRQ";
+
+ if (type == STACK_TYPE_SOFTIRQ)
+ return "SOFTIRQ";
+
+ return NULL;
}
static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info)
@@ -109,8 +106,10 @@ recursion_check:
* just break out and report an unknown stack type.
*/
if (visit_mask) {
- if (*visit_mask & (1UL << info->type))
+ if (*visit_mask & (1UL << info->type)) {
+ printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type);
goto unknown;
+ }
*visit_mask |= 1UL << info->type;
}
@@ -121,36 +120,6 @@ unknown:
return -EINVAL;
}
-void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, char *log_lvl)
-{
- unsigned long *stack;
- int i;
-
- if (!try_get_task_stack(task))
- return;
-
- sp = sp ? : get_stack_pointer(task, regs);
-
- stack = sp;
- for (i = 0; i < kstack_depth_to_print; i++) {
- if (kstack_end(stack))
- break;
- if ((i % STACKSLOTS_PER_LINE) == 0) {
- if (i != 0)
- pr_cont("\n");
- printk("%s %08lx", log_lvl, *stack++);
- } else
- pr_cont(" %08lx", *stack++);
- touch_nmi_watchdog();
- }
- pr_cont("\n");
- show_trace_log_lvl(task, regs, sp, log_lvl);
-
- put_task_stack(task);
-}
-
-
void show_regs(struct pt_regs *regs)
{
int i;
@@ -168,8 +137,7 @@ void show_regs(struct pt_regs *regs)
unsigned char c;
u8 *ip;
- pr_emerg("Stack:\n");
- show_stack_log_lvl(current, regs, NULL, KERN_EMERG);
+ show_trace_log_lvl(current, regs, NULL, KERN_EMERG);
pr_emerg("Code:");
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 36cf1a498227..fac189efcc34 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -28,23 +28,17 @@ static unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = {
[DEBUG_STACK - 1] = DEBUG_STKSZ
};
-void stack_type_str(enum stack_type type, const char **begin, const char **end)
+const char *stack_type_name(enum stack_type type)
{
BUILD_BUG_ON(N_EXCEPTION_STACKS != 4);
- switch (type) {
- case STACK_TYPE_IRQ:
- *begin = "IRQ";
- *end = "EOI";
- break;
- case STACK_TYPE_EXCEPTION ... STACK_TYPE_EXCEPTION_LAST:
- *begin = exception_stack_names[type - STACK_TYPE_EXCEPTION];
- *end = "EOE";
- break;
- default:
- *begin = NULL;
- *end = NULL;
- }
+ if (type == STACK_TYPE_IRQ)
+ return "IRQ";
+
+ if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST)
+ return exception_stack_names[type - STACK_TYPE_EXCEPTION];
+
+ return NULL;
}
static bool in_exception_stack(unsigned long *stack, struct stack_info *info)
@@ -128,8 +122,10 @@ recursion_check:
* just break out and report an unknown stack type.
*/
if (visit_mask) {
- if (*visit_mask & (1UL << info->type))
+ if (*visit_mask & (1UL << info->type)) {
+ printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type);
goto unknown;
+ }
*visit_mask |= 1UL << info->type;
}
@@ -140,56 +136,6 @@ unknown:
return -EINVAL;
}
-void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, char *log_lvl)
-{
- unsigned long *irq_stack_end;
- unsigned long *irq_stack;
- unsigned long *stack;
- int i;
-
- if (!try_get_task_stack(task))
- return;
-
- irq_stack_end = (unsigned long *)this_cpu_read(irq_stack_ptr);
- irq_stack = irq_stack_end - (IRQ_STACK_SIZE / sizeof(long));
-
- sp = sp ? : get_stack_pointer(task, regs);
-
- stack = sp;
- for (i = 0; i < kstack_depth_to_print; i++) {
- unsigned long word;
-
- if (stack >= irq_stack && stack <= irq_stack_end) {
- if (stack == irq_stack_end) {
- stack = (unsigned long *) (irq_stack_end[-1]);
- pr_cont(" <EOI> ");
- }
- } else {
- if (kstack_end(stack))
- break;
- }
-
- if (probe_kernel_address(stack, word))
- break;
-
- if ((i % STACKSLOTS_PER_LINE) == 0) {
- if (i != 0)
- pr_cont("\n");
- printk("%s %016lx", log_lvl, word);
- } else
- pr_cont(" %016lx", word);
-
- stack++;
- touch_nmi_watchdog();
- }
-
- pr_cont("\n");
- show_trace_log_lvl(task, regs, sp, log_lvl);
-
- put_task_stack(task);
-}
-
void show_regs(struct pt_regs *regs)
{
int i;
@@ -207,8 +153,7 @@ void show_regs(struct pt_regs *regs)
unsigned char c;
u8 *ip;
- printk(KERN_DEFAULT "Stack:\n");
- show_stack_log_lvl(current, regs, NULL, KERN_DEFAULT);
+ show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT);
printk(KERN_DEFAULT "Code: ");
diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c
index aad34aafc0e0..d913047f832c 100644
--- a/arch/x86/kernel/fpu/bugs.c
+++ b/arch/x86/kernel/fpu/bugs.c
@@ -23,17 +23,12 @@ static double __initdata y = 3145727.0;
*/
void __init fpu__init_check_bugs(void)
{
- u32 cr0_saved;
s32 fdiv_bug;
/* kernel_fpu_begin/end() relies on patched alternative instructions. */
if (!boot_cpu_has(X86_FEATURE_FPU))
return;
- /* We might have CR0::TS set already, clear it: */
- cr0_saved = read_cr0();
- write_cr0(cr0_saved & ~X86_CR0_TS);
-
kernel_fpu_begin();
/*
@@ -56,8 +51,6 @@ void __init fpu__init_check_bugs(void)
kernel_fpu_end();
- write_cr0(cr0_saved);
-
if (fdiv_bug) {
set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV);
pr_warn("Hmm, FPU with FDIV bug\n");
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index ebb4e95fbd74..e4e97a5355ce 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -58,27 +58,9 @@ static bool kernel_fpu_disabled(void)
return this_cpu_read(in_kernel_fpu);
}
-/*
- * Were we in an interrupt that interrupted kernel mode?
- *
- * On others, we can do a kernel_fpu_begin/end() pair *ONLY* if that
- * pair does nothing at all: the thread must not have fpu (so
- * that we don't try to save the FPU state), and TS must
- * be set (so that the clts/stts pair does nothing that is
- * visible in the interrupted kernel thread).
- *
- * Except for the eagerfpu case when we return true; in the likely case
- * the thread has FPU but we are not going to set/clear TS.
- */
static bool interrupted_kernel_fpu_idle(void)
{
- if (kernel_fpu_disabled())
- return false;
-
- if (use_eager_fpu())
- return true;
-
- return !current->thread.fpu.fpregs_active && (read_cr0() & X86_CR0_TS);
+ return !kernel_fpu_disabled();
}
/*
@@ -125,8 +107,7 @@ void __kernel_fpu_begin(void)
*/
copy_fpregs_to_fpstate(fpu);
} else {
- this_cpu_write(fpu_fpregs_owner_ctx, NULL);
- __fpregs_activate_hw();
+ __cpu_invalidate_fpregs_state();
}
}
EXPORT_SYMBOL(__kernel_fpu_begin);
@@ -137,8 +118,6 @@ void __kernel_fpu_end(void)
if (fpu->fpregs_active)
copy_kernel_to_fpregs(&fpu->state);
- else
- __fpregs_deactivate_hw();
kernel_fpu_enable();
}
@@ -159,35 +138,6 @@ void kernel_fpu_end(void)
EXPORT_SYMBOL_GPL(kernel_fpu_end);
/*
- * CR0::TS save/restore functions:
- */
-int irq_ts_save(void)
-{
- /*
- * If in process context and not atomic, we can take a spurious DNA fault.
- * Otherwise, doing clts() in process context requires disabling preemption
- * or some heavy lifting like kernel_fpu_begin()
- */
- if (!in_atomic())
- return 0;
-
- if (read_cr0() & X86_CR0_TS) {
- clts();
- return 1;
- }
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(irq_ts_save);
-
-void irq_ts_restore(int TS_state)
-{
- if (TS_state)
- stts();
-}
-EXPORT_SYMBOL_GPL(irq_ts_restore);
-
-/*
* Save the FPU state (mark it for reload if necessary):
*
* This only ever gets called for the current task.
@@ -200,10 +150,7 @@ void fpu__save(struct fpu *fpu)
trace_x86_fpu_before_save(fpu);
if (fpu->fpregs_active) {
if (!copy_fpregs_to_fpstate(fpu)) {
- if (use_eager_fpu())
- copy_kernel_to_fpregs(&fpu->state);
- else
- fpregs_deactivate(fpu);
+ copy_kernel_to_fpregs(&fpu->state);
}
}
trace_x86_fpu_after_save(fpu);
@@ -247,7 +194,6 @@ EXPORT_SYMBOL_GPL(fpstate_init);
int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
{
- dst_fpu->counter = 0;
dst_fpu->fpregs_active = 0;
dst_fpu->last_cpu = -1;
@@ -260,8 +206,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
* Don't let 'init optimized' areas of the XSAVE area
* leak into the child task:
*/
- if (use_eager_fpu())
- memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size);
+ memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size);
/*
* Save current FPU registers directly into the child
@@ -283,10 +228,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
memcpy(&src_fpu->state, &dst_fpu->state,
fpu_kernel_xstate_size);
- if (use_eager_fpu())
- copy_kernel_to_fpregs(&src_fpu->state);
- else
- fpregs_deactivate(src_fpu);
+ copy_kernel_to_fpregs(&src_fpu->state);
}
preempt_enable();
@@ -366,7 +308,7 @@ void fpu__activate_fpstate_write(struct fpu *fpu)
if (fpu->fpstate_active) {
/* Invalidate any lazy state: */
- fpu->last_cpu = -1;
+ __fpu_invalidate_fpregs_state(fpu);
} else {
fpstate_init(&fpu->state);
trace_x86_fpu_init_state(fpu);
@@ -409,7 +351,7 @@ void fpu__current_fpstate_write_begin(void)
* ensures we will not be lazy and skip a XRSTOR in the
* future.
*/
- fpu->last_cpu = -1;
+ __fpu_invalidate_fpregs_state(fpu);
}
/*
@@ -459,7 +401,6 @@ void fpu__restore(struct fpu *fpu)
trace_x86_fpu_before_restore(fpu);
fpregs_activate(fpu);
copy_kernel_to_fpregs(&fpu->state);
- fpu->counter++;
trace_x86_fpu_after_restore(fpu);
kernel_fpu_enable();
}
@@ -477,7 +418,6 @@ EXPORT_SYMBOL_GPL(fpu__restore);
void fpu__drop(struct fpu *fpu)
{
preempt_disable();
- fpu->counter = 0;
if (fpu->fpregs_active) {
/* Ignore delayed exceptions from user space */
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 2f2b8c7ccb85..60dece392b3a 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -10,18 +10,6 @@
#include <linux/init.h>
/*
- * Initialize the TS bit in CR0 according to the style of context-switches
- * we are using:
- */
-static void fpu__init_cpu_ctx_switch(void)
-{
- if (!boot_cpu_has(X86_FEATURE_EAGER_FPU))
- stts();
- else
- clts();
-}
-
-/*
* Initialize the registers found in all CPUs, CR0 and CR4:
*/
static void fpu__init_cpu_generic(void)
@@ -58,7 +46,6 @@ void fpu__init_cpu(void)
{
fpu__init_cpu_generic();
fpu__init_cpu_xstate();
- fpu__init_cpu_ctx_switch();
}
/*
@@ -233,82 +220,16 @@ static void __init fpu__init_system_xstate_size_legacy(void)
}
/*
- * FPU context switching strategies:
- *
- * Against popular belief, we don't do lazy FPU saves, due to the
- * task migration complications it brings on SMP - we only do
- * lazy FPU restores.
- *
- * 'lazy' is the traditional strategy, which is based on setting
- * CR0::TS to 1 during context-switch (instead of doing a full
- * restore of the FPU state), which causes the first FPU instruction
- * after the context switch (whenever it is executed) to fault - at
- * which point we lazily restore the FPU state into FPU registers.
- *
- * Tasks are of course under no obligation to execute FPU instructions,
- * so it can easily happen that another context-switch occurs without
- * a single FPU instruction being executed. If we eventually switch
- * back to the original task (that still owns the FPU) then we have
- * not only saved the restores along the way, but we also have the
- * FPU ready to be used for the original task.
- *
- * 'lazy' is deprecated because it's almost never a performance win
- * and it's much more complicated than 'eager'.
- *
- * 'eager' switching is by default on all CPUs, there we switch the FPU
- * state during every context switch, regardless of whether the task
- * has used FPU instructions in that time slice or not. This is done
- * because modern FPU context saving instructions are able to optimize
- * state saving and restoration in hardware: they can detect both
- * unused and untouched FPU state and optimize accordingly.
- *
- * [ Note that even in 'lazy' mode we might optimize context switches
- * to use 'eager' restores, if we detect that a task is using the FPU
- * frequently. See the fpu->counter logic in fpu/internal.h for that. ]
- */
-static enum { ENABLE, DISABLE } eagerfpu = ENABLE;
-
-/*
* Find supported xfeatures based on cpu features and command-line input.
* This must be called after fpu__init_parse_early_param() is called and
* xfeatures_mask is enumerated.
*/
u64 __init fpu__get_supported_xfeatures_mask(void)
{
- /* Support all xfeatures known to us */
- if (eagerfpu != DISABLE)
- return XCNTXT_MASK;
-
- /* Warning of xfeatures being disabled for no eagerfpu mode */
- if (xfeatures_mask & XFEATURE_MASK_EAGER) {
- pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n",
- xfeatures_mask & XFEATURE_MASK_EAGER);
- }
-
- /* Return a mask that masks out all features requiring eagerfpu mode */
- return ~XFEATURE_MASK_EAGER;
+ return XCNTXT_MASK;
}
-/*
- * Disable features dependent on eagerfpu.
- */
-static void __init fpu__clear_eager_fpu_features(void)
-{
- setup_clear_cpu_cap(X86_FEATURE_MPX);
-}
-
-/*
- * Pick the FPU context switching strategy:
- *
- * When eagerfpu is AUTO or ENABLE, we ensure it is ENABLE if either of
- * the following is true:
- *
- * (1) the cpu has xsaveopt, as it has the optimization and doing eager
- * FPU switching has a relatively low cost compared to a plain xsave;
- * (2) the cpu has xsave features (e.g. MPX) that depend on eager FPU
- * switching. Should the kernel boot with noxsaveopt, we support MPX
- * with eager FPU switching at a higher cost.
- */
+/* Legacy code to initialize eager fpu mode. */
static void __init fpu__init_system_ctx_switch(void)
{
static bool on_boot_cpu __initdata = 1;
@@ -317,17 +238,6 @@ static void __init fpu__init_system_ctx_switch(void)
on_boot_cpu = 0;
WARN_ON_FPU(current->thread.fpu.fpstate_active);
-
- if (boot_cpu_has(X86_FEATURE_XSAVEOPT) && eagerfpu != DISABLE)
- eagerfpu = ENABLE;
-
- if (xfeatures_mask & XFEATURE_MASK_EAGER)
- eagerfpu = ENABLE;
-
- if (eagerfpu == ENABLE)
- setup_force_cpu_cap(X86_FEATURE_EAGER_FPU);
-
- printk(KERN_INFO "x86/fpu: Using '%s' FPU context switches.\n", eagerfpu == ENABLE ? "eager" : "lazy");
}
/*
@@ -336,11 +246,6 @@ static void __init fpu__init_system_ctx_switch(void)
*/
static void __init fpu__init_parse_early_param(void)
{
- if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) {
- eagerfpu = DISABLE;
- fpu__clear_eager_fpu_features();
- }
-
if (cmdline_find_option_bool(boot_command_line, "no387"))
setup_clear_cpu_cap(X86_FEATURE_FPU);
@@ -375,14 +280,6 @@ void __init fpu__init_system(struct cpuinfo_x86 *c)
*/
fpu__init_cpu();
- /*
- * But don't leave CR0::TS set yet, as some of the FPU setup
- * methods depend on being able to execute FPU instructions
- * that will fault on a set TS, such as the FXSAVE in
- * fpu__init_system_mxcsr().
- */
- clts();
-
fpu__init_system_generic();
fpu__init_system_xstate_size_legacy();
fpu__init_system_xstate();
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index a184c210efba..83c23c230b4c 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -340,11 +340,9 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
}
fpu->fpstate_active = 1;
- if (use_eager_fpu()) {
- preempt_disable();
- fpu__restore(fpu);
- preempt_enable();
- }
+ preempt_disable();
+ fpu__restore(fpu);
+ preempt_enable();
return err;
} else {
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 095ef7ddd6ae..1d7770447b3e 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -65,6 +65,7 @@ void fpu__xstate_clear_all_cpu_caps(void)
setup_clear_cpu_cap(X86_FEATURE_AVX);
setup_clear_cpu_cap(X86_FEATURE_AVX2);
setup_clear_cpu_cap(X86_FEATURE_AVX512F);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512IFMA);
setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
@@ -73,6 +74,7 @@ void fpu__xstate_clear_all_cpu_caps(void)
setup_clear_cpu_cap(X86_FEATURE_AVX512VL);
setup_clear_cpu_cap(X86_FEATURE_MPX);
setup_clear_cpu_cap(X86_FEATURE_XGETBV1);
+ setup_clear_cpu_cap(X86_FEATURE_AVX512VBMI);
setup_clear_cpu_cap(X86_FEATURE_PKU);
setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW);
setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS);
@@ -890,15 +892,6 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
*/
if (!boot_cpu_has(X86_FEATURE_OSPKE))
return -EINVAL;
- /*
- * For most XSAVE components, this would be an arduous task:
- * brining fpstate up to date with fpregs, updating fpstate,
- * then re-populating fpregs. But, for components that are
- * never lazily managed, we can just access the fpregs
- * directly. PKRU is never managed lazily, so we can just
- * manipulate it directly. Make sure it stays that way.
- */
- WARN_ON_ONCE(!use_eager_fpu());
/* Set the bits we need in PKRU: */
if (init_val & PKEY_DISABLE_ACCESS)
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 2dabea46f039..4e8577d03372 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -63,6 +63,8 @@
#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
#endif
+#define SIZEOF_PTREGS 17*4
+
/*
* Number of possible pages in the lowmem region.
*
@@ -248,19 +250,19 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
#ifdef CONFIG_PARAVIRT
/* This is can only trip for a broken bootloader... */
cmpw $0x207, pa(boot_params + BP_version)
- jb default_entry
+ jb .Ldefault_entry
/* Paravirt-compatible boot parameters. Look to see what architecture
we're booting under. */
movl pa(boot_params + BP_hardware_subarch), %eax
cmpl $num_subarch_entries, %eax
- jae bad_subarch
+ jae .Lbad_subarch
movl pa(subarch_entries)(,%eax,4), %eax
subl $__PAGE_OFFSET, %eax
jmp *%eax
-bad_subarch:
+.Lbad_subarch:
WEAK(lguest_entry)
WEAK(xen_entry)
/* Unknown implementation; there's really
@@ -270,14 +272,14 @@ WEAK(xen_entry)
__INITDATA
subarch_entries:
- .long default_entry /* normal x86/PC */
+ .long .Ldefault_entry /* normal x86/PC */
.long lguest_entry /* lguest hypervisor */
.long xen_entry /* Xen hypervisor */
- .long default_entry /* Moorestown MID */
+ .long .Ldefault_entry /* Moorestown MID */
num_subarch_entries = (. - subarch_entries) / 4
.previous
#else
- jmp default_entry
+ jmp .Ldefault_entry
#endif /* CONFIG_PARAVIRT */
#ifdef CONFIG_HOTPLUG_CPU
@@ -289,7 +291,8 @@ num_subarch_entries = (. - subarch_entries) / 4
ENTRY(start_cpu0)
movl initial_stack, %ecx
movl %ecx, %esp
- jmp *(initial_code)
+ call *(initial_code)
+1: jmp 1b
ENDPROC(start_cpu0)
#endif
@@ -317,7 +320,7 @@ ENTRY(startup_32_smp)
call load_ucode_ap
#endif
-default_entry:
+.Ldefault_entry:
#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
X86_CR0_PG)
@@ -347,7 +350,7 @@ default_entry:
pushfl
popl %eax # get EFLAGS
testl $X86_EFLAGS_ID,%eax # did EFLAGS.ID remained set?
- jz enable_paging # hw disallowed setting of ID bit
+ jz .Lenable_paging # hw disallowed setting of ID bit
# which means no CPUID and no CR4
xorl %eax,%eax
@@ -357,13 +360,13 @@ default_entry:
movl $1,%eax
cpuid
andl $~1,%edx # Ignore CPUID.FPU
- jz enable_paging # No flags or only CPUID.FPU = no CR4
+ jz .Lenable_paging # No flags or only CPUID.FPU = no CR4
movl pa(mmu_cr4_features),%eax
movl %eax,%cr4
testb $X86_CR4_PAE, %al # check if PAE is enabled
- jz enable_paging
+ jz .Lenable_paging
/* Check if extended functions are implemented */
movl $0x80000000, %eax
@@ -371,7 +374,7 @@ default_entry:
/* Value must be in the range 0x80000001 to 0x8000ffff */
subl $0x80000001, %eax
cmpl $(0x8000ffff-0x80000001), %eax
- ja enable_paging
+ ja .Lenable_paging
/* Clear bogus XD_DISABLE bits */
call verify_cpu
@@ -380,7 +383,7 @@ default_entry:
cpuid
/* Execute Disable bit supported? */
btl $(X86_FEATURE_NX & 31), %edx
- jnc enable_paging
+ jnc .Lenable_paging
/* Setup EFER (Extended Feature Enable Register) */
movl $MSR_EFER, %ecx
@@ -390,7 +393,7 @@ default_entry:
/* Make changes effective */
wrmsr
-enable_paging:
+.Lenable_paging:
/*
* Enable paging
@@ -419,7 +422,7 @@ enable_paging:
*/
movb $4,X86 # at least 486
cmpl $-1,X86_CPUID
- je is486
+ je .Lis486
/* get vendor info */
xorl %eax,%eax # call CPUID with 0 -> return vendor ID
@@ -430,7 +433,7 @@ enable_paging:
movl %ecx,X86_VENDOR_ID+8 # last 4 chars
orl %eax,%eax # do we have processor info as well?
- je is486
+ je .Lis486
movl $1,%eax # Use the CPUID instruction to get CPU type
cpuid
@@ -444,7 +447,7 @@ enable_paging:
movb %cl,X86_MASK
movl %edx,X86_CAPABILITY
-is486:
+.Lis486:
movl $0x50022,%ecx # set AM, WP, NE and MP
movl %cr0,%eax
andl $0x80000011,%eax # Save PG,PE,ET
@@ -470,8 +473,9 @@ is486:
xorl %eax,%eax # Clear LDT
lldt %ax
- pushl $0 # fake return address for unwinder
- jmp *(initial_code)
+ call *(initial_code)
+1: jmp 1b
+ENDPROC(startup_32_smp)
#include "verify_cpu.S"
@@ -709,7 +713,12 @@ ENTRY(initial_page_table)
.data
.balign 4
ENTRY(initial_stack)
- .long init_thread_union+THREAD_SIZE
+ /*
+ * The SIZEOF_PTREGS gap is a convention which helps the in-kernel
+ * unwinder reliably detect the end of the stack.
+ */
+ .long init_thread_union + THREAD_SIZE - SIZEOF_PTREGS - \
+ TOP_OF_KERNEL_STACK_PADDING;
__INITRODATA
int_msg:
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b4421cc191b0..b467b14b03eb 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -66,13 +66,8 @@ startup_64:
* tables and then reload them.
*/
- /*
- * Setup stack for verify_cpu(). "-8" because initial_stack is defined
- * this way, see below. Our best guess is a NULL ptr for stack
- * termination heuristics and we don't want to break anything which
- * might depend on it (kgdb, ...).
- */
- leaq (__end_init_task - 8)(%rip), %rsp
+ /* Set up the stack for verify_cpu(), similar to initial_stack below */
+ leaq (__end_init_task - SIZEOF_PTREGS)(%rip), %rsp
/* Sanitize CPU configuration */
call verify_cpu
@@ -117,20 +112,20 @@ startup_64:
movq %rdi, %rax
shrq $PGDIR_SHIFT, %rax
- leaq (4096 + _KERNPG_TABLE)(%rbx), %rdx
+ leaq (PAGE_SIZE + _KERNPG_TABLE)(%rbx), %rdx
movq %rdx, 0(%rbx,%rax,8)
movq %rdx, 8(%rbx,%rax,8)
- addq $4096, %rdx
+ addq $PAGE_SIZE, %rdx
movq %rdi, %rax
shrq $PUD_SHIFT, %rax
andl $(PTRS_PER_PUD-1), %eax
- movq %rdx, 4096(%rbx,%rax,8)
+ movq %rdx, PAGE_SIZE(%rbx,%rax,8)
incl %eax
andl $(PTRS_PER_PUD-1), %eax
- movq %rdx, 4096(%rbx,%rax,8)
+ movq %rdx, PAGE_SIZE(%rbx,%rax,8)
- addq $8192, %rbx
+ addq $PAGE_SIZE * 2, %rbx
movq %rdi, %rax
shrq $PMD_SHIFT, %rdi
addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax
@@ -147,6 +142,9 @@ startup_64:
decl %ecx
jnz 1b
+ test %rbp, %rbp
+ jz .Lskip_fixup
+
/*
* Fixup the kernel text+data virtual addresses. Note that
* we might write invalid pmds, when the kernel is relocated
@@ -154,9 +152,9 @@ startup_64:
* beyond _end.
*/
leaq level2_kernel_pgt(%rip), %rdi
- leaq 4096(%rdi), %r8
+ leaq PAGE_SIZE(%rdi), %r8
/* See if it is a valid page table entry */
-1: testb $1, 0(%rdi)
+1: testb $_PAGE_PRESENT, 0(%rdi)
jz 2f
addq %rbp, 0(%rdi)
/* Go to the next page */
@@ -167,6 +165,7 @@ startup_64:
/* Fixup phys_base */
addq %rbp, phys_base(%rip)
+.Lskip_fixup:
movq $(early_level4_pgt - __START_KERNEL_map), %rax
jmp 1f
ENTRY(secondary_startup_64)
@@ -265,13 +264,17 @@ ENTRY(secondary_startup_64)
movl $MSR_GS_BASE,%ecx
movl initial_gs(%rip),%eax
movl initial_gs+4(%rip),%edx
- wrmsr
+ wrmsr
/* rsi is pointer to real mode structure with interesting info.
pass it to C */
movq %rsi, %rdi
-
- /* Finally jump to run C code and to be on real kernel address
+ jmp start_cpu
+ENDPROC(secondary_startup_64)
+
+ENTRY(start_cpu)
+ /*
+ * Jump to run C code and to be on a real kernel address.
* Since we are running on identity-mapped space we have to jump
* to the full 64bit address, this is only possible as indirect
* jump. In addition we need to ensure %cs is set so we make this
@@ -295,12 +298,14 @@ ENTRY(secondary_startup_64)
* REX.W + FF /5 JMP m16:64 Jump far, absolute indirect,
* address given in m16:64.
*/
- movq initial_code(%rip),%rax
- pushq $0 # fake return address to stop unwinder
+ pushq $.Lafter_lret # put return address on stack for unwinder
+ xorq %rbp, %rbp # clear frame pointer
+ movq initial_code(%rip), %rax
pushq $__KERNEL_CS # set correct cs
pushq %rax # target address in negative space
lretq
-ENDPROC(secondary_startup_64)
+.Lafter_lret:
+ENDPROC(start_cpu)
#include "verify_cpu.S"
@@ -308,15 +313,11 @@ ENDPROC(secondary_startup_64)
/*
* Boot CPU0 entry point. It's called from play_dead(). Everything has been set
* up already except stack. We just set up stack here. Then call
- * start_secondary().
+ * start_secondary() via start_cpu().
*/
ENTRY(start_cpu0)
- movq initial_stack(%rip),%rsp
- movq initial_code(%rip),%rax
- pushq $0 # fake return address to stop unwinder
- pushq $__KERNEL_CS # set correct cs
- pushq %rax # target address in negative space
- lretq
+ movq initial_stack(%rip), %rsp
+ jmp start_cpu
ENDPROC(start_cpu0)
#endif
@@ -328,7 +329,11 @@ ENDPROC(start_cpu0)
GLOBAL(initial_gs)
.quad INIT_PER_CPU_VAR(irq_stack_union)
GLOBAL(initial_stack)
- .quad init_thread_union+THREAD_SIZE-8
+ /*
+ * The SIZEOF_PTREGS gap is a convention which helps the in-kernel
+ * unwinder reliably detect the end of the stack.
+ */
+ .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS
__FINITDATA
bad_address:
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 9f669fdd2010..7c6e9ffe4424 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -14,7 +14,6 @@
#include <asm/apic.h>
#include <asm/io_apic.h>
#include <asm/irq.h>
-#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/hw_irq.h>
#include <asm/desc.h>
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 9ebd0b0e73d9..6b0678a541e2 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -16,7 +16,6 @@
#include <linux/uaccess.h>
#include <linux/smp.h>
#include <asm/io_apic.h>
-#include <asm/idle.h>
#include <asm/apic.h>
int sysctl_panic_on_stackoverflow;
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
new file mode 100644
index 000000000000..cb9c1ed1d391
--- /dev/null
+++ b/arch/x86/kernel/itmt.c
@@ -0,0 +1,215 @@
+/*
+ * itmt.c: Support Intel Turbo Boost Max Technology 3.0
+ *
+ * (C) Copyright 2016 Intel Corporation
+ * Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * On platforms supporting Intel Turbo Boost Max Technology 3.0, (ITMT),
+ * the maximum turbo frequencies of some cores in a CPU package may be
+ * higher than for the other cores in the same package. In that case,
+ * better performance can be achieved by making the scheduler prefer
+ * to run tasks on the CPUs with higher max turbo frequencies.
+ *
+ * This file provides functions and data structures for enabling the
+ * scheduler to favor scheduling on cores can be boosted to a higher
+ * frequency under ITMT.
+ */
+
+#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <linux/cpuset.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/sysctl.h>
+#include <linux/nodemask.h>
+
+static DEFINE_MUTEX(itmt_update_mutex);
+DEFINE_PER_CPU_READ_MOSTLY(int, sched_core_priority);
+
+/* Boolean to track if system has ITMT capabilities */
+static bool __read_mostly sched_itmt_capable;
+
+/*
+ * Boolean to control whether we want to move processes to cpu capable
+ * of higher turbo frequency for cpus supporting Intel Turbo Boost Max
+ * Technology 3.0.
+ *
+ * It can be set via /proc/sys/kernel/sched_itmt_enabled
+ */
+unsigned int __read_mostly sysctl_sched_itmt_enabled;
+
+static int sched_itmt_update_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ unsigned int old_sysctl;
+ int ret;
+
+ mutex_lock(&itmt_update_mutex);
+
+ if (!sched_itmt_capable) {
+ mutex_unlock(&itmt_update_mutex);
+ return -EINVAL;
+ }
+
+ old_sysctl = sysctl_sched_itmt_enabled;
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+ if (!ret && write && old_sysctl != sysctl_sched_itmt_enabled) {
+ x86_topology_update = true;
+ rebuild_sched_domains();
+ }
+
+ mutex_unlock(&itmt_update_mutex);
+
+ return ret;
+}
+
+static unsigned int zero;
+static unsigned int one = 1;
+static struct ctl_table itmt_kern_table[] = {
+ {
+ .procname = "sched_itmt_enabled",
+ .data = &sysctl_sched_itmt_enabled,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = sched_itmt_update_handler,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {}
+};
+
+static struct ctl_table itmt_root_table[] = {
+ {
+ .procname = "kernel",
+ .mode = 0555,
+ .child = itmt_kern_table,
+ },
+ {}
+};
+
+static struct ctl_table_header *itmt_sysctl_header;
+
+/**
+ * sched_set_itmt_support() - Indicate platform supports ITMT
+ *
+ * This function is used by the OS to indicate to scheduler that the platform
+ * is capable of supporting the ITMT feature.
+ *
+ * The current scheme has the pstate driver detects if the system
+ * is ITMT capable and call sched_set_itmt_support.
+ *
+ * This must be done only after sched_set_itmt_core_prio
+ * has been called to set the cpus' priorities.
+ * It must not be called with cpu hot plug lock
+ * held as we need to acquire the lock to rebuild sched domains
+ * later.
+ *
+ * Return: 0 on success
+ */
+int sched_set_itmt_support(void)
+{
+ mutex_lock(&itmt_update_mutex);
+
+ if (sched_itmt_capable) {
+ mutex_unlock(&itmt_update_mutex);
+ return 0;
+ }
+
+ itmt_sysctl_header = register_sysctl_table(itmt_root_table);
+ if (!itmt_sysctl_header) {
+ mutex_unlock(&itmt_update_mutex);
+ return -ENOMEM;
+ }
+
+ sched_itmt_capable = true;
+
+ sysctl_sched_itmt_enabled = 1;
+
+ if (sysctl_sched_itmt_enabled) {
+ x86_topology_update = true;
+ rebuild_sched_domains();
+ }
+
+ mutex_unlock(&itmt_update_mutex);
+
+ return 0;
+}
+
+/**
+ * sched_clear_itmt_support() - Revoke platform's support of ITMT
+ *
+ * This function is used by the OS to indicate that it has
+ * revoked the platform's support of ITMT feature.
+ *
+ * It must not be called with cpu hot plug lock
+ * held as we need to acquire the lock to rebuild sched domains
+ * later.
+ */
+void sched_clear_itmt_support(void)
+{
+ mutex_lock(&itmt_update_mutex);
+
+ if (!sched_itmt_capable) {
+ mutex_unlock(&itmt_update_mutex);
+ return;
+ }
+ sched_itmt_capable = false;
+
+ if (itmt_sysctl_header) {
+ unregister_sysctl_table(itmt_sysctl_header);
+ itmt_sysctl_header = NULL;
+ }
+
+ if (sysctl_sched_itmt_enabled) {
+ /* disable sched_itmt if we are no longer ITMT capable */
+ sysctl_sched_itmt_enabled = 0;
+ x86_topology_update = true;
+ rebuild_sched_domains();
+ }
+
+ mutex_unlock(&itmt_update_mutex);
+}
+
+int arch_asym_cpu_priority(int cpu)
+{
+ return per_cpu(sched_core_priority, cpu);
+}
+
+/**
+ * sched_set_itmt_core_prio() - Set CPU priority based on ITMT
+ * @prio: Priority of cpu core
+ * @core_cpu: The cpu number associated with the core
+ *
+ * The pstate driver will find out the max boost frequency
+ * and call this function to set a priority proportional
+ * to the max boost frequency. CPU with higher boost
+ * frequency will receive higher priority.
+ *
+ * No need to rebuild sched domain after updating
+ * the CPU priorities. The sched domains have no
+ * dependency on CPU priorities.
+ */
+void sched_set_itmt_core_prio(int prio, int core_cpu)
+{
+ int cpu, i = 1;
+
+ for_each_cpu(cpu, topology_sibling_cpumask(core_cpu)) {
+ int smt_prio;
+
+ /*
+ * Ensure that the siblings are moved to the end
+ * of the priority chain and only used when
+ * all other high priority cpus are out of capacity.
+ */
+ smt_prio = prio * smp_num_siblings / i;
+ per_cpu(sched_core_priority, cpu) = smt_prio;
+ i++;
+ }
+}
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 3407b148c240..d0a814a9d96a 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -331,17 +331,17 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
struct setup_header *header;
int setup_sects, kern16_size, ret = 0;
- unsigned long setup_header_size, params_cmdline_sz, params_misc_sz;
+ unsigned long setup_header_size, params_cmdline_sz;
struct boot_params *params;
unsigned long bootparam_load_addr, kernel_load_addr, initrd_load_addr;
unsigned long purgatory_load_addr;
- unsigned long kernel_bufsz, kernel_memsz, kernel_align;
- char *kernel_buf;
struct bzimage64_data *ldata;
struct kexec_entry64_regs regs64;
void *stack;
unsigned int setup_hdr_offset = offsetof(struct boot_params, hdr);
unsigned int efi_map_offset, efi_map_sz, efi_setup_data_offset;
+ struct kexec_buf kbuf = { .image = image, .buf_max = ULONG_MAX,
+ .top_down = true };
header = (struct setup_header *)(kernel + setup_hdr_offset);
setup_sects = header->setup_sects;
@@ -402,11 +402,11 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
params_cmdline_sz = sizeof(struct boot_params) + cmdline_len +
MAX_ELFCOREHDR_STR_LEN;
params_cmdline_sz = ALIGN(params_cmdline_sz, 16);
- params_misc_sz = params_cmdline_sz + efi_map_sz +
+ kbuf.bufsz = params_cmdline_sz + efi_map_sz +
sizeof(struct setup_data) +
sizeof(struct efi_setup_data);
- params = kzalloc(params_misc_sz, GFP_KERNEL);
+ params = kzalloc(kbuf.bufsz, GFP_KERNEL);
if (!params)
return ERR_PTR(-ENOMEM);
efi_map_offset = params_cmdline_sz;
@@ -418,37 +418,41 @@ static void *bzImage64_load(struct kimage *image, char *kernel,
/* Is there a limit on setup header size? */
memcpy(&params->hdr, (kernel + setup_hdr_offset), setup_header_size);
- ret = kexec_add_buffer(image, (char *)params, params_misc_sz,
- params_misc_sz, 16, MIN_BOOTPARAM_ADDR,
- ULONG_MAX, 1, &bootparam_load_addr);
+ kbuf.buffer = params;
+ kbuf.memsz = kbuf.bufsz;
+ kbuf.buf_align = 16;
+ kbuf.buf_min = MIN_BOOTPARAM_ADDR;
+ ret = kexec_add_buffer(&kbuf);
if (ret)
goto out_free_params;
+ bootparam_load_addr = kbuf.mem;
pr_debug("Loaded boot_param, command line and misc at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
- bootparam_load_addr, params_misc_sz, params_misc_sz);
+ bootparam_load_addr, kbuf.bufsz, kbuf.bufsz);
/* Load kernel */
- kernel_buf = kernel + kern16_size;
- kernel_bufsz = kernel_len - kern16_size;
- kernel_memsz = PAGE_ALIGN(header->init_size);
- kernel_align = header->kernel_alignment;
-
- ret = kexec_add_buffer(image, kernel_buf,
- kernel_bufsz, kernel_memsz, kernel_align,
- MIN_KERNEL_LOAD_ADDR, ULONG_MAX, 1,
- &kernel_load_addr);
+ kbuf.buffer = kernel + kern16_size;
+ kbuf.bufsz = kernel_len - kern16_size;
+ kbuf.memsz = PAGE_ALIGN(header->init_size);
+ kbuf.buf_align = header->kernel_alignment;
+ kbuf.buf_min = MIN_KERNEL_LOAD_ADDR;
+ ret = kexec_add_buffer(&kbuf);
if (ret)
goto out_free_params;
+ kernel_load_addr = kbuf.mem;
pr_debug("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
- kernel_load_addr, kernel_memsz, kernel_memsz);
+ kernel_load_addr, kbuf.bufsz, kbuf.memsz);
/* Load initrd high */
if (initrd) {
- ret = kexec_add_buffer(image, initrd, initrd_len, initrd_len,
- PAGE_SIZE, MIN_INITRD_LOAD_ADDR,
- ULONG_MAX, 1, &initrd_load_addr);
+ kbuf.buffer = initrd;
+ kbuf.bufsz = kbuf.memsz = initrd_len;
+ kbuf.buf_align = PAGE_SIZE;
+ kbuf.buf_min = MIN_INITRD_LOAD_ADDR;
+ ret = kexec_add_buffer(&kbuf);
if (ret)
goto out_free_params;
+ initrd_load_addr = kbuf.mem;
pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
initrd_load_addr, initrd_len, initrd_len);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index edbbfc854e39..36bc66416021 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -42,7 +42,6 @@
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/tlbflush.h>
-#include <asm/idle.h>
#include <asm/apic.h>
#include <asm/apicdef.h>
#include <asm/hypervisor.h>
@@ -267,13 +266,11 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
case KVM_PV_REASON_PAGE_NOT_PRESENT:
/* page is swapped out by the host. */
prev_state = exception_enter();
- exit_idle();
kvm_async_pf_task_wait((u32)read_cr2());
exception_exit(prev_state);
break;
case KVM_PV_REASON_PAGE_READY:
rcu_irq_enter();
- exit_idle();
kvm_async_pf_task_wake((u32)read_cr2());
rcu_irq_exit();
break;
@@ -308,7 +305,7 @@ static void kvm_register_steal_time(void)
static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
-static void kvm_guest_apic_eoi_write(u32 reg, u32 val)
+static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val)
{
/**
* This relies on __test_and_clear_bit to modify the memory
@@ -319,7 +316,7 @@ static void kvm_guest_apic_eoi_write(u32 reg, u32 val)
*/
if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi)))
return;
- apic_write(APIC_EOI, APIC_EOI_ACK);
+ apic->native_eoi_write(APIC_EOI, APIC_EOI_ACK);
}
static void kvm_guest_cpu_init(void)
@@ -592,6 +589,14 @@ out:
local_irq_restore(flags);
}
+__visible bool __kvm_vcpu_is_preempted(int cpu)
+{
+ struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
+
+ return !!src->preempted;
+}
+PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
+
/*
* Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
*/
@@ -608,6 +613,11 @@ void __init kvm_spinlock_init(void)
pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
pv_lock_ops.wait = kvm_wait;
pv_lock_ops.kick = kvm_kick_cpu;
+
+ if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
+ pv_lock_ops.vcpu_is_preempted =
+ PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
+ }
}
static __init int kvm_spinlock_init_jump(void)
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 6707039b9032..d4a15831ac58 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -34,10 +34,10 @@ static void flush_ldt(void *current_mm)
}
/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
-static struct ldt_struct *alloc_ldt_struct(int size)
+static struct ldt_struct *alloc_ldt_struct(unsigned int size)
{
struct ldt_struct *new_ldt;
- int alloc_size;
+ unsigned int alloc_size;
if (size > LDT_ENTRIES)
return NULL;
@@ -93,7 +93,7 @@ static void free_ldt_struct(struct ldt_struct *ldt)
paravirt_free_ldt(ldt->entries, ldt->size);
if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
- vfree(ldt->entries);
+ vfree_atomic(ldt->entries);
else
free_page((unsigned long)ldt->entries);
kfree(ldt);
@@ -207,11 +207,11 @@ static int read_default_ldt(void __user *ptr, unsigned long bytecount)
static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
{
struct mm_struct *mm = current->mm;
+ struct ldt_struct *new_ldt, *old_ldt;
+ unsigned int oldsize, newsize;
+ struct user_desc ldt_info;
struct desc_struct ldt;
int error;
- struct user_desc ldt_info;
- int oldsize, newsize;
- struct ldt_struct *new_ldt, *old_ldt;
error = -EINVAL;
if (bytecount != sizeof(ldt_info))
@@ -249,7 +249,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
old_ldt = mm->context.ldt;
oldsize = old_ldt ? old_ldt->size : 0;
- newsize = max((int)(ldt_info.entry_number + 1), oldsize);
+ newsize = max(ldt_info.entry_number + 1, oldsize);
error = -ENOMEM;
new_ldt = alloc_ldt_struct(newsize);
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 8c1f218926d7..307b1f4543de 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -328,7 +328,7 @@ void machine_kexec(struct kimage *image)
void arch_crash_save_vmcoreinfo(void)
{
- VMCOREINFO_SYMBOL(phys_base);
+ VMCOREINFO_NUMBER(phys_base);
VMCOREINFO_SYMBOL(init_level4_pgt);
#ifdef CONFIG_NUMA
@@ -337,9 +337,7 @@ void arch_crash_save_vmcoreinfo(void)
#endif
vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
kaslr_offset());
- VMCOREINFO_PAGE_OFFSET(PAGE_OFFSET);
- VMCOREINFO_VMALLOC_START(VMALLOC_START);
- VMCOREINFO_VMEMMAP_START(VMEMMAP_START);
+ VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
}
/* arch-dependent functionality related to kexec file-based syscall */
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 7f3550acde1b..f5e3ff835cc8 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -44,6 +44,7 @@
#include <asm/msr.h>
static struct class *msr_class;
+static enum cpuhp_state cpuhp_msr_state;
static ssize_t msr_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
@@ -180,7 +181,7 @@ static const struct file_operations msr_fops = {
.compat_ioctl = msr_ioctl,
};
-static int msr_device_create(int cpu)
+static int msr_device_create(unsigned int cpu)
{
struct device *dev;
@@ -189,34 +190,12 @@ static int msr_device_create(int cpu)
return PTR_ERR_OR_ZERO(dev);
}
-static void msr_device_destroy(int cpu)
+static int msr_device_destroy(unsigned int cpu)
{
device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu));
+ return 0;
}
-static int msr_class_cpu_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
-{
- unsigned int cpu = (unsigned long)hcpu;
- int err = 0;
-
- switch (action) {
- case CPU_UP_PREPARE:
- err = msr_device_create(cpu);
- break;
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DEAD:
- msr_device_destroy(cpu);
- break;
- }
- return notifier_from_errno(err);
-}
-
-static struct notifier_block __refdata msr_class_cpu_notifier = {
- .notifier_call = msr_class_cpu_callback,
-};
-
static char *msr_devnode(struct device *dev, umode_t *mode)
{
return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
@@ -224,13 +203,11 @@ static char *msr_devnode(struct device *dev, umode_t *mode)
static int __init msr_init(void)
{
- int i, err = 0;
- i = 0;
+ int err;
if (__register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", &msr_fops)) {
pr_err("unable to get major %d for msr\n", MSR_MAJOR);
- err = -EBUSY;
- goto out;
+ return -EBUSY;
}
msr_class = class_create(THIS_MODULE, "msr");
if (IS_ERR(msr_class)) {
@@ -239,44 +216,28 @@ static int __init msr_init(void)
}
msr_class->devnode = msr_devnode;
- cpu_notifier_register_begin();
- for_each_online_cpu(i) {
- err = msr_device_create(i);
- if (err != 0)
- goto out_class;
- }
- __register_hotcpu_notifier(&msr_class_cpu_notifier);
- cpu_notifier_register_done();
-
- err = 0;
- goto out;
+ err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/msr:online",
+ msr_device_create, msr_device_destroy);
+ if (err < 0)
+ goto out_class;
+ cpuhp_msr_state = err;
+ return 0;
out_class:
- i = 0;
- for_each_online_cpu(i)
- msr_device_destroy(i);
- cpu_notifier_register_done();
+ cpuhp_remove_state(cpuhp_msr_state);
class_destroy(msr_class);
out_chrdev:
__unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
-out:
return err;
}
+module_init(msr_init);
static void __exit msr_exit(void)
{
- int cpu = 0;
-
- cpu_notifier_register_begin();
- for_each_online_cpu(cpu)
- msr_device_destroy(cpu);
+ cpuhp_remove_state(cpuhp_msr_state);
class_destroy(msr_class);
__unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
- __unregister_hotcpu_notifier(&msr_class_cpu_notifier);
- cpu_notifier_register_done();
}
-
-module_init(msr_init);
module_exit(msr_exit)
MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 2c55a003b793..6d4bf812af45 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -12,7 +12,6 @@ __visible void __native_queued_spin_unlock(struct qspinlock *lock)
{
native_queued_spin_unlock(lock);
}
-
PV_CALLEE_SAVE_REGS_THUNK(__native_queued_spin_unlock);
bool pv_is_native_spin_unlock(void)
@@ -21,12 +20,25 @@ bool pv_is_native_spin_unlock(void)
__raw_callee_save___native_queued_spin_unlock;
}
+__visible bool __native_vcpu_is_preempted(int cpu)
+{
+ return false;
+}
+PV_CALLEE_SAVE_REGS_THUNK(__native_vcpu_is_preempted);
+
+bool pv_is_native_vcpu_is_preempted(void)
+{
+ return pv_lock_ops.vcpu_is_preempted.func ==
+ __raw_callee_save___native_vcpu_is_preempted;
+}
+
struct pv_lock_ops pv_lock_ops = {
#ifdef CONFIG_SMP
.queued_spin_lock_slowpath = native_queued_spin_lock_slowpath,
.queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock),
.wait = paravirt_nop,
.kick = paravirt_nop,
+ .vcpu_is_preempted = PV_CALLEE_SAVE(__native_vcpu_is_preempted),
#endif /* SMP */
};
EXPORT_SYMBOL(pv_lock_ops);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index bbf3d5933eaa..a1bfba0f7234 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -328,7 +328,6 @@ __visible struct pv_cpu_ops pv_cpu_ops = {
.cpuid = native_cpuid,
.get_debugreg = native_get_debugreg,
.set_debugreg = native_set_debugreg,
- .clts = native_clts,
.read_cr0 = native_read_cr0,
.write_cr0 = native_write_cr0,
.read_cr4 = native_read_cr4,
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index 920c6ae08592..553acbbb4d32 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -8,10 +8,10 @@ DEF_NATIVE(pv_cpu_ops, iret, "iret");
DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
-DEF_NATIVE(pv_cpu_ops, clts, "clts");
#if defined(CONFIG_PARAVIRT_SPINLOCKS)
DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)");
+DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %eax, %eax");
#endif
unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
@@ -27,6 +27,7 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
}
extern bool pv_is_native_spin_unlock(void);
+extern bool pv_is_native_vcpu_is_preempted(void);
unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
unsigned long addr, unsigned len)
@@ -48,7 +49,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_mmu_ops, read_cr2);
PATCH_SITE(pv_mmu_ops, read_cr3);
PATCH_SITE(pv_mmu_ops, write_cr3);
- PATCH_SITE(pv_cpu_ops, clts);
#if defined(CONFIG_PARAVIRT_SPINLOCKS)
case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
if (pv_is_native_spin_unlock()) {
@@ -56,9 +56,19 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
end = end_pv_lock_ops_queued_spin_unlock;
goto patch_site;
}
+ goto patch_default;
+
+ case PARAVIRT_PATCH(pv_lock_ops.vcpu_is_preempted):
+ if (pv_is_native_vcpu_is_preempted()) {
+ start = start_pv_lock_ops_vcpu_is_preempted;
+ end = end_pv_lock_ops_vcpu_is_preempted;
+ goto patch_site;
+ }
+ goto patch_default;
#endif
default:
+patch_default: __maybe_unused
ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
break;
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index bb3840cedb4f..11aaf1eaa0e4 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -10,7 +10,6 @@ DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
-DEF_NATIVE(pv_cpu_ops, clts, "clts");
DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq");
@@ -21,6 +20,7 @@ DEF_NATIVE(, mov64, "mov %rdi, %rax");
#if defined(CONFIG_PARAVIRT_SPINLOCKS)
DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)");
+DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %rax, %rax");
#endif
unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
@@ -36,6 +36,7 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
}
extern bool pv_is_native_spin_unlock(void);
+extern bool pv_is_native_vcpu_is_preempted(void);
unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
unsigned long addr, unsigned len)
@@ -58,7 +59,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_mmu_ops, read_cr2);
PATCH_SITE(pv_mmu_ops, read_cr3);
PATCH_SITE(pv_mmu_ops, write_cr3);
- PATCH_SITE(pv_cpu_ops, clts);
PATCH_SITE(pv_mmu_ops, flush_tlb_single);
PATCH_SITE(pv_cpu_ops, wbinvd);
#if defined(CONFIG_PARAVIRT_SPINLOCKS)
@@ -68,9 +68,19 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
end = end_pv_lock_ops_queued_spin_unlock;
goto patch_site;
}
+ goto patch_default;
+
+ case PARAVIRT_PATCH(pv_lock_ops.vcpu_is_preempted):
+ if (pv_is_native_vcpu_is_preempted()) {
+ start = start_pv_lock_ops_vcpu_is_preempted;
+ end = end_pv_lock_ops_vcpu_is_preempted;
+ goto patch_site;
+ }
+ goto patch_default;
#endif
default:
+patch_default: __maybe_unused
ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
break;
diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c
index 24a50301f150..91271122f0df 100644
--- a/arch/x86/kernel/platform-quirks.c
+++ b/arch/x86/kernel/platform-quirks.c
@@ -6,6 +6,7 @@
void __init x86_early_init_platform_quirks(void)
{
+ x86_platform.legacy.i8042 = X86_LEGACY_I8042_EXPECTED_PRESENT;
x86_platform.legacy.rtc = 1;
x86_platform.legacy.reserve_bios_regions = 0;
x86_platform.legacy.devices.pnpbios = 1;
@@ -16,10 +17,14 @@ void __init x86_early_init_platform_quirks(void)
break;
case X86_SUBARCH_XEN:
case X86_SUBARCH_LGUEST:
+ x86_platform.legacy.devices.pnpbios = 0;
+ x86_platform.legacy.rtc = 0;
+ break;
case X86_SUBARCH_INTEL_MID:
case X86_SUBARCH_CE4100:
x86_platform.legacy.devices.pnpbios = 0;
x86_platform.legacy.rtc = 0;
+ x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT;
break;
}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 0888a879120f..37363e46b1f0 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -23,7 +23,6 @@
#include <asm/cpu.h>
#include <asm/apic.h>
#include <asm/syscalls.h>
-#include <asm/idle.h>
#include <asm/uaccess.h>
#include <asm/mwait.h>
#include <asm/fpu/internal.h>
@@ -65,23 +64,6 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
};
EXPORT_PER_CPU_SYMBOL(cpu_tss);
-#ifdef CONFIG_X86_64
-static DEFINE_PER_CPU(unsigned char, is_idle);
-static ATOMIC_NOTIFIER_HEAD(idle_notifier);
-
-void idle_notifier_register(struct notifier_block *n)
-{
- atomic_notifier_chain_register(&idle_notifier, n);
-}
-EXPORT_SYMBOL_GPL(idle_notifier_register);
-
-void idle_notifier_unregister(struct notifier_block *n)
-{
- atomic_notifier_chain_unregister(&idle_notifier, n);
-}
-EXPORT_SYMBOL_GPL(idle_notifier_unregister);
-#endif
-
/*
* this gets called so that we can store lazy state into memory and copy the
* current task into the new thread.
@@ -251,39 +233,10 @@ static inline void play_dead(void)
}
#endif
-#ifdef CONFIG_X86_64
-void enter_idle(void)
-{
- this_cpu_write(is_idle, 1);
- atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
-}
-
-static void __exit_idle(void)
-{
- if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
- return;
- atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
-}
-
-/* Called from interrupts to signify idle end */
-void exit_idle(void)
-{
- /* idle loop has pid 0 */
- if (current->pid)
- return;
- __exit_idle();
-}
-#endif
-
void arch_cpu_idle_enter(void)
{
+ tsc_verify_tsc_adjust(false);
local_touch_nmi();
- enter_idle();
-}
-
-void arch_cpu_idle_exit(void)
-{
- __exit_idle();
}
void arch_cpu_idle_dead(void)
@@ -336,59 +289,33 @@ void stop_this_cpu(void *dummy)
halt();
}
-bool amd_e400_c1e_detected;
-EXPORT_SYMBOL(amd_e400_c1e_detected);
-
-static cpumask_var_t amd_e400_c1e_mask;
-
-void amd_e400_remove_cpu(int cpu)
-{
- if (amd_e400_c1e_mask != NULL)
- cpumask_clear_cpu(cpu, amd_e400_c1e_mask);
-}
-
/*
- * AMD Erratum 400 aware idle routine. We check for C1E active in the interrupt
- * pending message MSR. If we detect C1E, then we handle it the same
- * way as C3 power states (local apic timer and TSC stop)
+ * AMD Erratum 400 aware idle routine. We handle it the same way as C3 power
+ * states (local apic timer and TSC stop).
*/
static void amd_e400_idle(void)
{
- if (!amd_e400_c1e_detected) {
- u32 lo, hi;
-
- rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
-
- if (lo & K8_INTP_C1E_ACTIVE_MASK) {
- amd_e400_c1e_detected = true;
- if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
- mark_tsc_unstable("TSC halt in AMD C1E");
- pr_info("System has AMD C1E enabled\n");
- }
+ /*
+ * We cannot use static_cpu_has_bug() here because X86_BUG_AMD_APIC_C1E
+ * gets set after static_cpu_has() places have been converted via
+ * alternatives.
+ */
+ if (!boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) {
+ default_idle();
+ return;
}
- if (amd_e400_c1e_detected) {
- int cpu = smp_processor_id();
+ tick_broadcast_enter();
- if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) {
- cpumask_set_cpu(cpu, amd_e400_c1e_mask);
- /* Force broadcast so ACPI can not interfere. */
- tick_broadcast_force();
- pr_info("Switch to broadcast mode on CPU%d\n", cpu);
- }
- tick_broadcast_enter();
-
- default_idle();
+ default_idle();
- /*
- * The switch back from broadcast mode needs to be
- * called with interrupts disabled.
- */
- local_irq_disable();
- tick_broadcast_exit();
- local_irq_enable();
- } else
- default_idle();
+ /*
+ * The switch back from broadcast mode needs to be called with
+ * interrupts disabled.
+ */
+ local_irq_disable();
+ tick_broadcast_exit();
+ local_irq_enable();
}
/*
@@ -448,8 +375,7 @@ void select_idle_routine(const struct cpuinfo_x86 *c)
if (x86_idle || boot_option_idle_override == IDLE_POLL)
return;
- if (cpu_has_bug(c, X86_BUG_AMD_APIC_C1E)) {
- /* E400: APIC timer interrupt does not wake up CPU from C1e */
+ if (boot_cpu_has_bug(X86_BUG_AMD_E400)) {
pr_info("using AMD E400 aware idle routine\n");
x86_idle = amd_e400_idle;
} else if (prefer_mwait_c1_over_halt(c)) {
@@ -459,11 +385,37 @@ void select_idle_routine(const struct cpuinfo_x86 *c)
x86_idle = default_idle;
}
-void __init init_amd_e400_c1e_mask(void)
+void amd_e400_c1e_apic_setup(void)
+{
+ if (boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) {
+ pr_info("Switch to broadcast mode on CPU%d\n", smp_processor_id());
+ local_irq_disable();
+ tick_broadcast_force();
+ local_irq_enable();
+ }
+}
+
+void __init arch_post_acpi_subsys_init(void)
{
- /* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */
- if (x86_idle == amd_e400_idle)
- zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL);
+ u32 lo, hi;
+
+ if (!boot_cpu_has_bug(X86_BUG_AMD_E400))
+ return;
+
+ /*
+ * AMD E400 detection needs to happen after ACPI has been enabled. If
+ * the machine is affected K8_INTP_C1E_ACTIVE_MASK bits are set in
+ * MSR_K8_INT_PENDING_MSG.
+ */
+ rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
+ if (!(lo & K8_INTP_C1E_ACTIVE_MASK))
+ return;
+
+ boot_cpu_set_bug(X86_BUG_AMD_APIC_C1E);
+
+ if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
+ mark_tsc_unstable("TSC halt in AMD C1E");
+ pr_info("System has AMD C1E enabled\n");
}
static int __init idle_setup(char *str)
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index bd7be8efdc4c..a0ac3e81518a 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -49,11 +49,11 @@
#include <asm/tlbflush.h>
#include <asm/cpu.h>
-#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>
#include <asm/vm86.h>
+#include <asm/intel_rdt.h>
void __show_regs(struct pt_regs *regs, int all)
{
@@ -72,10 +72,9 @@ void __show_regs(struct pt_regs *regs, int all)
savesegment(gs, gs);
}
- printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
- (u16)regs->cs, regs->ip, regs->flags,
- smp_processor_id());
- print_symbol("EIP is at %s\n", regs->ip);
+ printk(KERN_DEFAULT "EIP: %pS\n", (void *)regs->ip);
+ printk(KERN_DEFAULT "EFLAGS: %08lx CPU: %d\n", regs->flags,
+ smp_processor_id());
printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
regs->ax, regs->bx, regs->cx, regs->dx);
@@ -232,11 +231,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
- fpu_switch_t fpu_switch;
/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
- fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu);
+ switch_fpu_prepare(prev_fpu, cpu);
/*
* Save away %gs. No need to save %fs, as it was saved on the
@@ -295,9 +293,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
if (prev->gs | next->gs)
lazy_load_gs(next->gs);
- switch_fpu_finish(next_fpu, fpu_switch);
+ switch_fpu_finish(next_fpu, cpu);
this_cpu_write(current_task, next_p);
+ /* Load the Intel cache allocation PQR MSR. */
+ intel_rdt_sched_in();
+
return prev_p;
}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b3760b3c1ca0..a61e141b6891 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -44,12 +44,12 @@
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
-#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>
#include <asm/xen/hypervisor.h>
#include <asm/vdso.h>
+#include <asm/intel_rdt.h>
__visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
@@ -61,10 +61,15 @@ void __show_regs(struct pt_regs *regs, int all)
unsigned int fsindex, gsindex;
unsigned int ds, cs, es;
- printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
- printk_address(regs->ip);
- printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
- regs->sp, regs->flags);
+ printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs & 0xffff,
+ (void *)regs->ip);
+ printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss,
+ regs->sp, regs->flags);
+ if (regs->orig_ax != -1)
+ pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
+ else
+ pr_cont("\n");
+
printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
regs->ax, regs->bx, regs->cx);
printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
@@ -265,9 +270,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
unsigned prev_fsindex, prev_gsindex;
- fpu_switch_t fpu_switch;
- fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu);
+ switch_fpu_prepare(prev_fpu, cpu);
/* We must save %fs and %gs before load_TLS() because
* %fs and %gs may be cleared by load_TLS().
@@ -417,7 +421,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
prev->gsbase = 0;
prev->gsindex = prev_gsindex;
- switch_fpu_finish(next_fpu, fpu_switch);
+ switch_fpu_finish(next_fpu, cpu);
/*
* Switch the PDA and FPU contexts.
@@ -473,6 +477,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
loadsegment(ss, __KERNEL_DS);
}
+ /* Load the Intel cache allocation PQR MSR. */
+ intel_rdt_sched_in();
+
return prev_p;
}
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 79c6311cd912..5b21cb7d84d6 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -64,6 +64,15 @@ void mach_get_cmos_time(struct timespec *now)
unsigned int status, year, mon, day, hour, min, sec, century = 0;
unsigned long flags;
+ /*
+ * If pm_trace abused the RTC as storage, set the timespec to 0,
+ * which tells the caller that this RTC value is unusable.
+ */
+ if (!pm_trace_rtc_valid()) {
+ now->tv_sec = now->tv_nsec = 0;
+ return;
+ }
+
spin_lock_irqsave(&rtc_lock, flags);
/*
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 9c337b0e8ba7..4cfba947d774 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -985,6 +985,30 @@ void __init setup_arch(char **cmdline_p)
parse_early_param();
+#ifdef CONFIG_MEMORY_HOTPLUG
+ /*
+ * Memory used by the kernel cannot be hot-removed because Linux
+ * cannot migrate the kernel pages. When memory hotplug is
+ * enabled, we should prevent memblock from allocating memory
+ * for the kernel.
+ *
+ * ACPI SRAT records all hotpluggable memory ranges. But before
+ * SRAT is parsed, we don't know about it.
+ *
+ * The kernel image is loaded into memory at very early time. We
+ * cannot prevent this anyway. So on NUMA system, we set any
+ * node the kernel resides in as un-hotpluggable.
+ *
+ * Since on modern servers, one node could have double-digit
+ * gigabytes memory, we can assume the memory around the kernel
+ * image is also un-hotpluggable. So before SRAT is parsed, just
+ * allocate memory near the kernel image to try the best to keep
+ * the kernel away from hotpluggable memory.
+ */
+ if (movable_node_is_enabled())
+ memblock_set_bottom_up(true);
+#endif
+
x86_report_nx();
/* after early param, so could get panic from serial */
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 2bbd27f89802..9820d6d977c6 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -1,7 +1,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/percpu.h>
@@ -12,6 +12,7 @@
#include <linux/pfn.h>
#include <asm/sections.h>
#include <asm/processor.h>
+#include <asm/desc.h>
#include <asm/setup.h>
#include <asm/mpspec.h>
#include <asm/apicdef.h>
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index c00cb64bc0a1..68f8cc222f25 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -261,10 +261,8 @@ static inline void __smp_reschedule_interrupt(void)
__visible void smp_reschedule_interrupt(struct pt_regs *regs)
{
- irq_enter();
ack_APIC_irq();
__smp_reschedule_interrupt();
- irq_exit();
/*
* KVM uses this interrupt to force a cpu out of guest mode
*/
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 42f5eb7b4f6c..46732dc3b73c 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -58,7 +58,6 @@
#include <asm/desc.h>
#include <asm/nmi.h>
#include <asm/irq.h>
-#include <asm/idle.h>
#include <asm/realmode.h>
#include <asm/cpu.h>
#include <asm/numa.h>
@@ -104,11 +103,21 @@ static unsigned int max_physical_pkg_id __read_mostly;
unsigned int __max_logical_packages __read_mostly;
EXPORT_SYMBOL(__max_logical_packages);
static unsigned int logical_packages __read_mostly;
-static bool logical_packages_frozen __read_mostly;
/* Maximum number of SMT threads on any online core */
int __max_smt_threads __read_mostly;
+/* Flag to indicate if a complete sched domain rebuild is required */
+bool x86_topology_update;
+
+int arch_update_cpu_topology(void)
+{
+ int retval = x86_topology_update;
+
+ x86_topology_update = false;
+ return retval;
+}
+
static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
{
unsigned long flags;
@@ -263,9 +272,14 @@ static void notrace start_secondary(void *unused)
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}
-int topology_update_package_map(unsigned int apicid, unsigned int cpu)
+/**
+ * topology_update_package_map - Update the physical to logical package map
+ * @pkg: The physical package id as retrieved via CPUID
+ * @cpu: The cpu for which this is updated
+ */
+int topology_update_package_map(unsigned int pkg, unsigned int cpu)
{
- unsigned int new, pkg = apicid >> boot_cpu_data.x86_coreid_bits;
+ unsigned int new;
/* Called from early boot ? */
if (!physical_package_map)
@@ -278,16 +292,17 @@ int topology_update_package_map(unsigned int apicid, unsigned int cpu)
if (test_and_set_bit(pkg, physical_package_map))
goto found;
- if (logical_packages_frozen) {
- physical_to_logical_pkg[pkg] = -1;
- pr_warn("APIC(%x) Package %u exceeds logical package max\n",
- apicid, pkg);
+ if (logical_packages >= __max_logical_packages) {
+ pr_warn("Package %u of CPU %u exceeds BIOS package data %u.\n",
+ logical_packages, cpu, __max_logical_packages);
return -ENOSPC;
}
new = logical_packages++;
- pr_info("APIC(%x) Converting physical %u to logical package %u\n",
- apicid, pkg, new);
+ if (new != pkg) {
+ pr_info("CPU %u Converting physical %u to logical package %u\n",
+ cpu, pkg, new);
+ }
physical_to_logical_pkg[pkg] = new;
found:
@@ -308,9 +323,9 @@ int topology_phys_to_logical_pkg(unsigned int phys_pkg)
}
EXPORT_SYMBOL(topology_phys_to_logical_pkg);
-static void __init smp_init_package_map(void)
+static void __init smp_init_package_map(struct cpuinfo_x86 *c, unsigned int cpu)
{
- unsigned int ncpus, cpu;
+ unsigned int ncpus;
size_t size;
/*
@@ -355,27 +370,9 @@ static void __init smp_init_package_map(void)
size = BITS_TO_LONGS(max_physical_pkg_id) * sizeof(unsigned long);
physical_package_map = kzalloc(size, GFP_KERNEL);
- for_each_present_cpu(cpu) {
- unsigned int apicid = apic->cpu_present_to_apicid(cpu);
-
- if (apicid == BAD_APICID || !apic->apic_id_valid(apicid))
- continue;
- if (!topology_update_package_map(apicid, cpu))
- continue;
- pr_warn("CPU %u APICId %x disabled\n", cpu, apicid);
- per_cpu(x86_bios_cpu_apicid, cpu) = BAD_APICID;
- set_cpu_possible(cpu, false);
- set_cpu_present(cpu, false);
- }
-
- if (logical_packages > __max_logical_packages) {
- pr_warn("Detected more packages (%u), then computed by BIOS data (%u).\n",
- logical_packages, __max_logical_packages);
- logical_packages_frozen = true;
- __max_logical_packages = logical_packages;
- }
-
pr_info("Max logical packages: %u\n", __max_logical_packages);
+
+ topology_update_package_map(c->phys_proc_id, cpu);
}
void __init smp_store_boot_cpu_info(void)
@@ -385,7 +382,7 @@ void __init smp_store_boot_cpu_info(void)
*c = boot_cpu_data;
c->cpu_index = id;
- smp_init_package_map();
+ smp_init_package_map(c, id);
}
/*
@@ -471,22 +468,42 @@ static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
return false;
}
+#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
+static inline int x86_sched_itmt_flags(void)
+{
+ return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0;
+}
+
+#ifdef CONFIG_SCHED_MC
+static int x86_core_flags(void)
+{
+ return cpu_core_flags() | x86_sched_itmt_flags();
+}
+#endif
+#ifdef CONFIG_SCHED_SMT
+static int x86_smt_flags(void)
+{
+ return cpu_smt_flags() | x86_sched_itmt_flags();
+}
+#endif
+#endif
+
static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
#ifdef CONFIG_SCHED_SMT
- { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+ { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
- { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+ { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
#endif
{ NULL, },
};
static struct sched_domain_topology_level x86_topology[] = {
#ifdef CONFIG_SCHED_SMT
- { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+ { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
- { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+ { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
#endif
{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
{ NULL, },
@@ -821,14 +838,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
return (send_status | accept_status);
}
-void smp_announce(void)
-{
- int num_nodes = num_online_nodes();
-
- printk(KERN_INFO "x86: Booted up %d node%s, %d CPUs\n",
- num_nodes, (num_nodes > 1 ? "s" : ""), num_online_cpus());
-}
-
/* reduce the number of lines printed when booting a large cpu count system */
static void announce_cpu(int cpu, int apicid)
{
@@ -964,9 +973,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
int cpu0_nmi_registered = 0;
unsigned long timeout;
- idle->thread.sp = (unsigned long) (((struct pt_regs *)
- (THREAD_SIZE + task_stack_page(idle))) - 1);
-
+ idle->thread.sp = (unsigned long)task_pt_regs(idle);
early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
initial_code = (unsigned long)start_secondary;
initial_stack = idle->thread.sp;
@@ -1111,7 +1118,7 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
return err;
/* the FPU context is blank, nobody can own it */
- __cpu_disable_lazy_restore(cpu);
+ per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL;
common_cpu_up(cpu, tidle);
@@ -1331,7 +1338,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
default_setup_apic_routing();
cpu0_logical_apicid = apic_bsp_setup(false);
- pr_info("CPU%d: ", 0);
+ pr_info("CPU0: ");
print_cpu_info(&cpu_data(0));
if (is_uv_system())
@@ -1456,15 +1463,15 @@ __init void prefill_possible_map(void)
possible = i;
}
+ nr_cpu_ids = possible;
+
pr_info("Allowing %d CPUs, %d hotplug CPUs\n",
possible, max_t(int, possible - num_processors, 0));
+ reset_cpu_possible_mask();
+
for (i = 0; i < possible; i++)
set_cpu_possible(i, true);
- for (; i < NR_CPUS; i++)
- set_cpu_possible(i, false);
-
- nr_cpu_ids = possible;
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -1575,7 +1582,6 @@ void play_dead_common(void)
{
idle_task_exit();
reset_lazy_tlbstate();
- amd_e400_remove_cpu(raw_smp_processor_id());
/* Ack it */
(void)cpu_report_death();
diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c
index 1c113db9ed57..15515132bf0d 100644
--- a/arch/x86/kernel/tracepoint.c
+++ b/arch/x86/kernel/tracepoint.c
@@ -34,7 +34,7 @@ static void switch_idt(void *arg)
local_irq_restore(flags);
}
-void trace_irq_vector_regfunc(void)
+int trace_irq_vector_regfunc(void)
{
mutex_lock(&irq_vector_mutex);
if (!trace_irq_vector_refcount) {
@@ -44,6 +44,7 @@ void trace_irq_vector_regfunc(void)
}
trace_irq_vector_refcount++;
mutex_unlock(&irq_vector_mutex);
+ return 0;
}
void trace_irq_vector_unregfunc(void)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index bd4e3d4d3625..bf0c6d049080 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -853,6 +853,8 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
dotraplinkage void
do_device_not_available(struct pt_regs *regs, long error_code)
{
+ unsigned long cr0;
+
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
#ifdef CONFIG_MATH_EMULATION
@@ -866,10 +868,20 @@ do_device_not_available(struct pt_regs *regs, long error_code)
return;
}
#endif
- fpu__restore(&current->thread.fpu); /* interrupts still off */
-#ifdef CONFIG_X86_32
- cond_local_irq_enable(regs);
-#endif
+
+ /* This should not happen. */
+ cr0 = read_cr0();
+ if (WARN(cr0 & X86_CR0_TS, "CR0.TS was set")) {
+ /* Try to fix it up and carry on. */
+ write_cr0(cr0 & ~X86_CR0_TS);
+ } else {
+ /*
+ * Something terrible happened, and we're better off trying
+ * to kill the task than getting stuck in a never-ending
+ * loop of #NM faults.
+ */
+ die("unexpected #NM exception", regs, error_code);
+ }
}
NOKPROBE_SYMBOL(do_device_not_available);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 46b2f41f8b05..0aed75a1e31b 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -702,6 +702,20 @@ unsigned long native_calibrate_tsc(void)
}
}
+ /*
+ * TSC frequency determined by CPUID is a "hardware reported"
+ * frequency and is the most accurate one so far we have. This
+ * is considered a known frequency.
+ */
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+
+ /*
+ * For Atom SoCs TSC is the only reliable clocksource.
+ * Mark TSC reliable so no watchdog on it.
+ */
+ if (boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT)
+ setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+
return crystal_khz * ebx_numerator / eax_denominator;
}
@@ -1043,18 +1057,20 @@ static void detect_art(void)
if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF)
return;
- cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator,
- &art_to_tsc_numerator, unused, unused+1);
-
- /* Don't enable ART in a VM, non-stop TSC required */
+ /* Don't enable ART in a VM, non-stop TSC and TSC_ADJUST required */
if (boot_cpu_has(X86_FEATURE_HYPERVISOR) ||
!boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ||
- art_to_tsc_denominator < ART_MIN_DENOMINATOR)
+ !boot_cpu_has(X86_FEATURE_TSC_ADJUST))
return;
- if (rdmsrl_safe(MSR_IA32_TSC_ADJUST, &art_to_tsc_offset))
+ cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator,
+ &art_to_tsc_numerator, unused, unused+1);
+
+ if (art_to_tsc_denominator < ART_MIN_DENOMINATOR)
return;
+ rdmsrl(MSR_IA32_TSC_ADJUST, art_to_tsc_offset);
+
/* Make this sticky over multiple CPU init calls */
setup_force_cpu_cap(X86_FEATURE_ART);
}
@@ -1064,6 +1080,11 @@ static void detect_art(void)
static struct clocksource clocksource_tsc;
+static void tsc_resume(struct clocksource *cs)
+{
+ tsc_verify_tsc_adjust(true);
+}
+
/*
* We used to compare the TSC to the cycle_last value in the clocksource
* structure to avoid a nasty time-warp. This can be observed in a
@@ -1096,6 +1117,7 @@ static struct clocksource clocksource_tsc = {
.flags = CLOCK_SOURCE_IS_CONTINUOUS |
CLOCK_SOURCE_MUST_VERIFY,
.archdata = { .vclock_mode = VCLOCK_TSC },
+ .resume = tsc_resume,
};
void mark_tsc_unstable(char *reason)
@@ -1283,10 +1305,10 @@ static int __init init_tsc_clocksource(void)
clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;
/*
- * Trust the results of the earlier calibration on systems
- * exporting a reliable TSC.
+ * When TSC frequency is known (retrieved via MSR or CPUID), we skip
+ * the refined calibration and directly register it as a clocksource.
*/
- if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
+ if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) {
clocksource_register_khz(&clocksource_tsc, tsc_khz);
return 0;
}
@@ -1363,6 +1385,8 @@ void __init tsc_init(void)
if (unsynchronized_tsc())
mark_tsc_unstable("TSCs unsynchronized");
+ else
+ tsc_store_and_check_tsc_adjust(true);
check_system_tsc_reliable();
diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
index 0fe720d64fef..19afdbd7d0a7 100644
--- a/arch/x86/kernel/tsc_msr.c
+++ b/arch/x86/kernel/tsc_msr.c
@@ -100,5 +100,24 @@ unsigned long cpu_khz_from_msr(void)
#ifdef CONFIG_X86_LOCAL_APIC
lapic_timer_frequency = (freq * 1000) / HZ;
#endif
+
+ /*
+ * TSC frequency determined by MSR is always considered "known"
+ * because it is reported by HW.
+ * Another fact is that on MSR capable platforms, PIT/HPET is
+ * generally not available so calibration won't work at all.
+ */
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+
+ /*
+ * Unfortunately there is no way for hardware to tell whether the
+ * TSC is reliable. We were told by silicon design team that TSC
+ * on Atom SoCs are always "reliable". TSC is also the only
+ * reliable clocksource on these SoCs (HPET is either not present
+ * or not functional) so mark TSC reliable which removes the
+ * requirement for a watchdog clocksource.
+ */
+ setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+
return res;
}
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 78083bf23ed1..d0db011051a5 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -14,18 +14,166 @@
* ( The serial nature of the boot logic and the CPU hotplug lock
* protects against more than 2 CPUs entering this code. )
*/
+#include <linux/topology.h>
#include <linux/spinlock.h>
#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/nmi.h>
#include <asm/tsc.h>
+struct tsc_adjust {
+ s64 bootval;
+ s64 adjusted;
+ unsigned long nextcheck;
+ bool warned;
+};
+
+static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust);
+
+void tsc_verify_tsc_adjust(bool resume)
+{
+ struct tsc_adjust *adj = this_cpu_ptr(&tsc_adjust);
+ s64 curval;
+
+ if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
+ return;
+
+ /* Rate limit the MSR check */
+ if (!resume && time_before(jiffies, adj->nextcheck))
+ return;
+
+ adj->nextcheck = jiffies + HZ;
+
+ rdmsrl(MSR_IA32_TSC_ADJUST, curval);
+ if (adj->adjusted == curval)
+ return;
+
+ /* Restore the original value */
+ wrmsrl(MSR_IA32_TSC_ADJUST, adj->adjusted);
+
+ if (!adj->warned || resume) {
+ pr_warn(FW_BUG "TSC ADJUST differs: CPU%u %lld --> %lld. Restoring\n",
+ smp_processor_id(), adj->adjusted, curval);
+ adj->warned = true;
+ }
+}
+
+static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval,
+ unsigned int cpu, bool bootcpu)
+{
+ /*
+ * First online CPU in a package stores the boot value in the
+ * adjustment value. This value might change later via the sync
+ * mechanism. If that fails we still can yell about boot values not
+ * being consistent.
+ *
+ * On the boot cpu we just force set the ADJUST value to 0 if it's
+ * non zero. We don't do that on non boot cpus because physical
+ * hotplug should have set the ADJUST register to a value > 0 so
+ * the TSC is in sync with the already running cpus.
+ *
+ * But we always force positive ADJUST values. Otherwise the TSC
+ * deadline timer creates an interrupt storm. We also have to
+ * prevent values > 0x7FFFFFFF as those wreckage the timer as well.
+ */
+ if ((bootcpu && bootval != 0) || (!bootcpu && bootval < 0) ||
+ (bootval > 0x7FFFFFFF)) {
+ pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n", cpu,
+ bootval);
+ wrmsrl(MSR_IA32_TSC_ADJUST, 0);
+ bootval = 0;
+ }
+ cur->adjusted = bootval;
+}
+
+#ifndef CONFIG_SMP
+bool __init tsc_store_and_check_tsc_adjust(bool bootcpu)
+{
+ struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust);
+ s64 bootval;
+
+ if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
+ return false;
+
+ rdmsrl(MSR_IA32_TSC_ADJUST, bootval);
+ cur->bootval = bootval;
+ cur->nextcheck = jiffies + HZ;
+ tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(), bootcpu);
+ return false;
+}
+
+#else /* !CONFIG_SMP */
+
+/*
+ * Store and check the TSC ADJUST MSR if available
+ */
+bool tsc_store_and_check_tsc_adjust(bool bootcpu)
+{
+ struct tsc_adjust *ref, *cur = this_cpu_ptr(&tsc_adjust);
+ unsigned int refcpu, cpu = smp_processor_id();
+ struct cpumask *mask;
+ s64 bootval;
+
+ if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
+ return false;
+
+ rdmsrl(MSR_IA32_TSC_ADJUST, bootval);
+ cur->bootval = bootval;
+ cur->nextcheck = jiffies + HZ;
+ cur->warned = false;
+
+ /*
+ * Check whether this CPU is the first in a package to come up. In
+ * this case do not check the boot value against another package
+ * because the new package might have been physically hotplugged,
+ * where TSC_ADJUST is expected to be different. When called on the
+ * boot CPU topology_core_cpumask() might not be available yet.
+ */
+ mask = topology_core_cpumask(cpu);
+ refcpu = mask ? cpumask_any_but(mask, cpu) : nr_cpu_ids;
+
+ if (refcpu >= nr_cpu_ids) {
+ tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(),
+ bootcpu);
+ return false;
+ }
+
+ ref = per_cpu_ptr(&tsc_adjust, refcpu);
+ /*
+ * Compare the boot value and complain if it differs in the
+ * package.
+ */
+ if (bootval != ref->bootval) {
+ pr_warn(FW_BUG "TSC ADJUST differs: Reference CPU%u: %lld CPU%u: %lld\n",
+ refcpu, ref->bootval, cpu, bootval);
+ }
+ /*
+ * The TSC_ADJUST values in a package must be the same. If the boot
+ * value on this newly upcoming CPU differs from the adjustment
+ * value of the already online CPU in this package, set it to that
+ * adjusted value.
+ */
+ if (bootval != ref->adjusted) {
+ pr_warn("TSC ADJUST synchronize: Reference CPU%u: %lld CPU%u: %lld\n",
+ refcpu, ref->adjusted, cpu, bootval);
+ cur->adjusted = ref->adjusted;
+ wrmsrl(MSR_IA32_TSC_ADJUST, ref->adjusted);
+ }
+ /*
+ * We have the TSCs forced to be in sync on this package. Skip sync
+ * test:
+ */
+ return true;
+}
+
/*
* Entry/exit counters that make sure that both CPUs
* run the measurement code at once:
*/
static atomic_t start_count;
static atomic_t stop_count;
+static atomic_t skip_test;
+static atomic_t test_runs;
/*
* We use a raw spinlock in this exceptional case, because
@@ -37,15 +185,16 @@ static arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED;
static cycles_t last_tsc;
static cycles_t max_warp;
static int nr_warps;
+static int random_warps;
/*
* TSC-warp measurement loop running on both CPUs. This is not called
* if there is no TSC.
*/
-static void check_tsc_warp(unsigned int timeout)
+static cycles_t check_tsc_warp(unsigned int timeout)
{
- cycles_t start, now, prev, end;
- int i;
+ cycles_t start, now, prev, end, cur_max_warp = 0;
+ int i, cur_warps = 0;
start = rdtsc_ordered();
/*
@@ -85,13 +234,22 @@ static void check_tsc_warp(unsigned int timeout)
if (unlikely(prev > now)) {
arch_spin_lock(&sync_lock);
max_warp = max(max_warp, prev - now);
+ cur_max_warp = max_warp;
+ /*
+ * Check whether this bounces back and forth. Only
+ * one CPU should observe time going backwards.
+ */
+ if (cur_warps != nr_warps)
+ random_warps++;
nr_warps++;
+ cur_warps = nr_warps;
arch_spin_unlock(&sync_lock);
}
}
WARN(!(now-start),
"Warning: zero tsc calibration delta: %Ld [max: %Ld]\n",
now-start, end-start);
+ return cur_max_warp;
}
/*
@@ -136,15 +294,26 @@ void check_tsc_sync_source(int cpu)
}
/*
- * Reset it - in case this is a second bootup:
+ * Set the maximum number of test runs to
+ * 1 if the CPU does not provide the TSC_ADJUST MSR
+ * 3 if the MSR is available, so the target can try to adjust
*/
- atomic_set(&stop_count, 0);
-
+ if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
+ atomic_set(&test_runs, 1);
+ else
+ atomic_set(&test_runs, 3);
+retry:
/*
- * Wait for the target to arrive:
+ * Wait for the target to start or to skip the test:
*/
- while (atomic_read(&start_count) != cpus-1)
+ while (atomic_read(&start_count) != cpus - 1) {
+ if (atomic_read(&skip_test) > 0) {
+ atomic_set(&skip_test, 0);
+ return;
+ }
cpu_relax();
+ }
+
/*
* Trigger the target to continue into the measurement too:
*/
@@ -155,21 +324,35 @@ void check_tsc_sync_source(int cpu)
while (atomic_read(&stop_count) != cpus-1)
cpu_relax();
- if (nr_warps) {
+ /*
+ * If the test was successful set the number of runs to zero and
+ * stop. If not, decrement the number of runs an check if we can
+ * retry. In case of random warps no retry is attempted.
+ */
+ if (!nr_warps) {
+ atomic_set(&test_runs, 0);
+
+ pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n",
+ smp_processor_id(), cpu);
+
+ } else if (atomic_dec_and_test(&test_runs) || random_warps) {
+ /* Force it to 0 if random warps brought us here */
+ atomic_set(&test_runs, 0);
+
pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n",
smp_processor_id(), cpu);
pr_warning("Measured %Ld cycles TSC warp between CPUs, "
"turning off TSC clock.\n", max_warp);
+ if (random_warps)
+ pr_warning("TSC warped randomly between CPUs\n");
mark_tsc_unstable("check_tsc_sync_source failed");
- } else {
- pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n",
- smp_processor_id(), cpu);
}
/*
* Reset it - just in case we boot another CPU later:
*/
atomic_set(&start_count, 0);
+ random_warps = 0;
nr_warps = 0;
max_warp = 0;
last_tsc = 0;
@@ -178,6 +361,12 @@ void check_tsc_sync_source(int cpu)
* Let the target continue with the bootup:
*/
atomic_inc(&stop_count);
+
+ /*
+ * Retry, if there is a chance to do so.
+ */
+ if (atomic_read(&test_runs) > 0)
+ goto retry;
}
/*
@@ -185,6 +374,9 @@ void check_tsc_sync_source(int cpu)
*/
void check_tsc_sync_target(void)
{
+ struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust);
+ unsigned int cpu = smp_processor_id();
+ cycles_t cur_max_warp, gbl_max_warp;
int cpus = 2;
/* Also aborts if there is no TSC. */
@@ -192,6 +384,16 @@ void check_tsc_sync_target(void)
return;
/*
+ * Store, verify and sanitize the TSC adjust register. If
+ * successful skip the test.
+ */
+ if (tsc_store_and_check_tsc_adjust(false)) {
+ atomic_inc(&skip_test);
+ return;
+ }
+
+retry:
+ /*
* Register this CPU's participation and wait for the
* source CPU to start the measurement:
*/
@@ -199,7 +401,12 @@ void check_tsc_sync_target(void)
while (atomic_read(&start_count) != cpus)
cpu_relax();
- check_tsc_warp(loop_timeout(smp_processor_id()));
+ cur_max_warp = check_tsc_warp(loop_timeout(cpu));
+
+ /*
+ * Store the maximum observed warp value for a potential retry:
+ */
+ gbl_max_warp = max_warp;
/*
* Ok, we are done:
@@ -211,4 +418,61 @@ void check_tsc_sync_target(void)
*/
while (atomic_read(&stop_count) != cpus)
cpu_relax();
+
+ /*
+ * Reset it for the next sync test:
+ */
+ atomic_set(&stop_count, 0);
+
+ /*
+ * Check the number of remaining test runs. If not zero, the test
+ * failed and a retry with adjusted TSC is possible. If zero the
+ * test was either successful or failed terminally.
+ */
+ if (!atomic_read(&test_runs))
+ return;
+
+ /*
+ * If the warp value of this CPU is 0, then the other CPU
+ * observed time going backwards so this TSC was ahead and
+ * needs to move backwards.
+ */
+ if (!cur_max_warp)
+ cur_max_warp = -gbl_max_warp;
+
+ /*
+ * Add the result to the previous adjustment value.
+ *
+ * The adjustement value is slightly off by the overhead of the
+ * sync mechanism (observed values are ~200 TSC cycles), but this
+ * really depends on CPU, node distance and frequency. So
+ * compensating for this is hard to get right. Experiments show
+ * that the warp is not longer detectable when the observed warp
+ * value is used. In the worst case the adjustment needs to go
+ * through a 3rd run for fine tuning.
+ */
+ cur->adjusted += cur_max_warp;
+
+ /*
+ * TSC deadline timer stops working or creates an interrupt storm
+ * with adjust values < 0 and > x07ffffff.
+ *
+ * To allow adjust values > 0x7FFFFFFF we need to disable the
+ * deadline timer and use the local APIC timer, but that requires
+ * more intrusive changes and we do not have any useful information
+ * from Intel about the underlying HW wreckage yet.
+ */
+ if (cur->adjusted < 0)
+ cur->adjusted = 0;
+ if (cur->adjusted > 0x7FFFFFFF)
+ cur->adjusted = 0x7FFFFFFF;
+
+ pr_warn("TSC ADJUST compensate: CPU%u observed %lld warp. Adjust: %lld\n",
+ cpu, cur_max_warp, cur->adjusted);
+
+ wrmsrl(MSR_IA32_TSC_ADJUST, cur->adjusted);
+ goto retry;
+
}
+
+#endif /* CONFIG_SMP */
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index a2456d4d286a..4443e499f279 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -6,6 +6,37 @@
#define FRAME_HEADER_SIZE (sizeof(long) * 2)
+static void unwind_dump(struct unwind_state *state, unsigned long *sp)
+{
+ static bool dumped_before = false;
+ bool prev_zero, zero = false;
+ unsigned long word;
+
+ if (dumped_before)
+ return;
+
+ dumped_before = true;
+
+ printk_deferred("unwind stack type:%d next_sp:%p mask:%lx graph_idx:%d\n",
+ state->stack_info.type, state->stack_info.next_sp,
+ state->stack_mask, state->graph_idx);
+
+ for (sp = state->orig_sp; sp < state->stack_info.end; sp++) {
+ word = READ_ONCE_NOCHECK(*sp);
+
+ prev_zero = zero;
+ zero = word == 0;
+
+ if (zero) {
+ if (!prev_zero)
+ printk_deferred("%p: %016x ...\n", sp, 0);
+ continue;
+ }
+
+ printk_deferred("%p: %016lx (%pB)\n", sp, word, (void *)word);
+ }
+}
+
unsigned long unwind_get_return_address(struct unwind_state *state)
{
unsigned long addr;
@@ -14,6 +45,9 @@ unsigned long unwind_get_return_address(struct unwind_state *state)
if (unwind_done(state))
return 0;
+ if (state->regs && user_mode(state->regs))
+ return 0;
+
addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, *addr_p,
addr_p);
@@ -21,10 +55,49 @@ unsigned long unwind_get_return_address(struct unwind_state *state)
}
EXPORT_SYMBOL_GPL(unwind_get_return_address);
+static size_t regs_size(struct pt_regs *regs)
+{
+ /* x86_32 regs from kernel mode are two words shorter: */
+ if (IS_ENABLED(CONFIG_X86_32) && !user_mode(regs))
+ return sizeof(*regs) - 2*sizeof(long);
+
+ return sizeof(*regs);
+}
+
+static bool is_last_task_frame(struct unwind_state *state)
+{
+ unsigned long bp = (unsigned long)state->bp;
+ unsigned long regs = (unsigned long)task_pt_regs(state->task);
+
+ /*
+ * We have to check for the last task frame at two different locations
+ * because gcc can occasionally decide to realign the stack pointer and
+ * change the offset of the stack frame by a word in the prologue of a
+ * function called by head/entry code.
+ */
+ return bp == regs - FRAME_HEADER_SIZE ||
+ bp == regs - FRAME_HEADER_SIZE - sizeof(long);
+}
+
+/*
+ * This determines if the frame pointer actually contains an encoded pointer to
+ * pt_regs on the stack. See ENCODE_FRAME_POINTER.
+ */
+static struct pt_regs *decode_frame_pointer(unsigned long *bp)
+{
+ unsigned long regs = (unsigned long)bp;
+
+ if (!(regs & 0x1))
+ return NULL;
+
+ return (struct pt_regs *)(regs & ~0x1);
+}
+
static bool update_stack_state(struct unwind_state *state, void *addr,
size_t len)
{
struct stack_info *info = &state->stack_info;
+ enum stack_type orig_type = info->type;
/*
* If addr isn't on the current stack, switch to the next one.
@@ -38,31 +111,127 @@ static bool update_stack_state(struct unwind_state *state, void *addr,
&state->stack_mask))
return false;
+ if (!state->orig_sp || info->type != orig_type)
+ state->orig_sp = addr;
+
return true;
}
bool unwind_next_frame(struct unwind_state *state)
{
- unsigned long *next_bp;
+ struct pt_regs *regs;
+ unsigned long *next_bp, *next_frame;
+ size_t next_len;
+ enum stack_type prev_type = state->stack_info.type;
if (unwind_done(state))
return false;
- next_bp = (unsigned long *)*state->bp;
+ /* have we reached the end? */
+ if (state->regs && user_mode(state->regs))
+ goto the_end;
+
+ if (is_last_task_frame(state)) {
+ regs = task_pt_regs(state->task);
+
+ /*
+ * kthreads (other than the boot CPU's idle thread) have some
+ * partial regs at the end of their stack which were placed
+ * there by copy_thread_tls(). But the regs don't have any
+ * useful information, so we can skip them.
+ *
+ * This user_mode() check is slightly broader than a PF_KTHREAD
+ * check because it also catches the awkward situation where a
+ * newly forked kthread transitions into a user task by calling
+ * do_execve(), which eventually clears PF_KTHREAD.
+ */
+ if (!user_mode(regs))
+ goto the_end;
+
+ /*
+ * We're almost at the end, but not quite: there's still the
+ * syscall regs frame. Entry code doesn't encode the regs
+ * pointer for syscalls, so we have to set it manually.
+ */
+ state->regs = regs;
+ state->bp = NULL;
+ return true;
+ }
+
+ /* get the next frame pointer */
+ if (state->regs)
+ next_bp = (unsigned long *)state->regs->bp;
+ else
+ next_bp = (unsigned long *)*state->bp;
+
+ /* is the next frame pointer an encoded pointer to pt_regs? */
+ regs = decode_frame_pointer(next_bp);
+ if (regs) {
+ next_frame = (unsigned long *)regs;
+ next_len = sizeof(*regs);
+ } else {
+ next_frame = next_bp;
+ next_len = FRAME_HEADER_SIZE;
+ }
/* make sure the next frame's data is accessible */
- if (!update_stack_state(state, next_bp, FRAME_HEADER_SIZE))
- return false;
+ if (!update_stack_state(state, next_frame, next_len)) {
+ /*
+ * Don't warn on bad regs->bp. An interrupt in entry code
+ * might cause a false positive warning.
+ */
+ if (state->regs)
+ goto the_end;
+
+ goto bad_address;
+ }
+
+ /* Make sure it only unwinds up and doesn't overlap the last frame: */
+ if (state->stack_info.type == prev_type) {
+ if (state->regs && (void *)next_frame < (void *)state->regs + regs_size(state->regs))
+ goto bad_address;
+
+ if (state->bp && (void *)next_frame < (void *)state->bp + FRAME_HEADER_SIZE)
+ goto bad_address;
+ }
/* move to the next frame */
- state->bp = next_bp;
+ if (regs) {
+ state->regs = regs;
+ state->bp = NULL;
+ } else {
+ state->bp = next_bp;
+ state->regs = NULL;
+ }
+
return true;
+
+bad_address:
+ if (state->regs) {
+ printk_deferred_once(KERN_WARNING
+ "WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n",
+ state->regs, state->task->comm,
+ state->task->pid, next_frame);
+ unwind_dump(state, (unsigned long *)state->regs);
+ } else {
+ printk_deferred_once(KERN_WARNING
+ "WARNING: kernel stack frame pointer at %p in %s:%d has bad value %p\n",
+ state->bp, state->task->comm,
+ state->task->pid, next_frame);
+ unwind_dump(state, state->bp);
+ }
+the_end:
+ state->stack_info.type = STACK_TYPE_UNKNOWN;
+ return false;
}
EXPORT_SYMBOL_GPL(unwind_next_frame);
void __unwind_start(struct unwind_state *state, struct task_struct *task,
struct pt_regs *regs, unsigned long *first_frame)
{
+ unsigned long *bp, *frame;
+ size_t len;
+
memset(state, 0, sizeof(*state));
state->task = task;
@@ -73,12 +242,22 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
}
/* set up the starting stack frame */
- state->bp = get_frame_pointer(task, regs);
+ bp = get_frame_pointer(task, regs);
+ regs = decode_frame_pointer(bp);
+ if (regs) {
+ state->regs = regs;
+ frame = (unsigned long *)regs;
+ len = sizeof(*regs);
+ } else {
+ state->bp = bp;
+ frame = bp;
+ len = FRAME_HEADER_SIZE;
+ }
/* initialize stack info and make sure the frame data is accessible */
- get_stack_info(state->bp, state->task, &state->stack_info,
+ get_stack_info(frame, state->task, &state->stack_info,
&state->stack_mask);
- update_stack_state(state, state->bp, FRAME_HEADER_SIZE);
+ update_stack_state(state, frame, len);
/*
* The caller can provide the address of the first frame directly
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index dbf67f64d5ec..e79f15f108a8 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -91,10 +91,10 @@ SECTIONS
/* Text and read-only data */
.text : AT(ADDR(.text) - LOAD_OFFSET) {
_text = .;
+ _stext = .;
/* bootstrapping code */
HEAD_TEXT
. = ALIGN(8);
- _stext = .;
TEXT_TEXT
SCHED_TEXT
CPUIDLE_TEXT
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 0bd9f1287f39..11a93f005268 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -89,7 +89,6 @@ struct x86_cpuinit_ops x86_cpuinit = {
};
static void default_nmi_init(void) { };
-static int default_i8042_detect(void) { return 1; };
struct x86_platform_ops x86_platform __ro_after_init = {
.calibrate_cpu = native_calibrate_cpu,
@@ -100,7 +99,6 @@ struct x86_platform_ops x86_platform __ro_after_init = {
.is_untracked_pat_range = is_ISA_range,
.nmi_init = default_nmi_init,
.get_nmi_reason = default_get_nmi_reason,
- .i8042_detect = default_i8042_detect,
.save_sched_clock_state = tsc_save_sched_clock_state,
.restore_sched_clock_state = tsc_restore_sched_clock_state,
};
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index afa7bbb596cd..e85f6bd7b9d5 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -16,7 +16,7 @@
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
-#include <asm/fpu/internal.h> /* For use_eager_fpu. Ugh! */
+#include <asm/processor.h>
#include <asm/user.h>
#include <asm/fpu/xstate.h>
#include "cpuid.h"
@@ -65,6 +65,11 @@ u64 kvm_supported_xcr0(void)
#define F(x) bit(X86_FEATURE_##x)
+/* These are scattered features in cpufeatures.h. */
+#define KVM_CPUID_BIT_AVX512_4VNNIW 2
+#define KVM_CPUID_BIT_AVX512_4FMAPS 3
+#define KF(x) bit(KVM_CPUID_BIT_##x)
+
int kvm_update_cpuid(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
@@ -81,6 +86,10 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
best->ecx |= F(OSXSAVE);
}
+ best->edx &= ~F(APIC);
+ if (vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE)
+ best->edx |= F(APIC);
+
if (apic) {
if (best->ecx & F(TSC_DEADLINE_TIMER))
apic->lapic_timer.timer_mode_mask = 3 << 17;
@@ -114,8 +123,7 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
if (best && (best->eax & (F(XSAVES) | F(XSAVEC))))
best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
- if (use_eager_fpu())
- kvm_x86_ops->fpu_activate(vcpu);
+ kvm_x86_ops->fpu_activate(vcpu);
/*
* The existing code assumes virtual address is 48-bit in the canonical
@@ -365,16 +373,21 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
const u32 kvm_cpuid_7_0_ebx_x86_features =
F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
- F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
- F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
- F(AVX512BW) | F(AVX512VL);
+ F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
+ F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) |
+ F(SHA_NI) | F(AVX512BW) | F(AVX512VL);
/* cpuid 0xD.1.eax */
const u32 kvm_cpuid_D_1_eax_x86_features =
F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves;
/* cpuid 7.0.ecx*/
- const u32 kvm_cpuid_7_0_ecx_x86_features = F(PKU) | 0 /*OSPKE*/;
+ const u32 kvm_cpuid_7_0_ecx_x86_features =
+ F(AVX512VBMI) | F(PKU) | 0 /*OSPKE*/;
+
+ /* cpuid 7.0.edx*/
+ const u32 kvm_cpuid_7_0_edx_x86_features =
+ KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS);
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
@@ -458,12 +471,14 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
/* PKU is not yet implemented for shadow paging. */
if (!tdp_enabled)
entry->ecx &= ~F(PKU);
+ entry->edx &= kvm_cpuid_7_0_edx_x86_features;
+ entry->edx &= get_scattered_cpuid_leaf(7, 0, CPUID_EDX);
} else {
entry->ebx = 0;
entry->ecx = 0;
+ entry->edx = 0;
}
entry->eax = 0;
- entry->edx = 0;
break;
}
case 9:
@@ -863,17 +878,17 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
}
EXPORT_SYMBOL_GPL(kvm_cpuid);
-void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
+int kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
{
- u32 function, eax, ebx, ecx, edx;
+ u32 eax, ebx, ecx, edx;
- function = eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+ eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx);
kvm_register_write(vcpu, VCPU_REGS_RAX, eax);
kvm_register_write(vcpu, VCPU_REGS_RBX, ebx);
kvm_register_write(vcpu, VCPU_REGS_RCX, ecx);
kvm_register_write(vcpu, VCPU_REGS_RDX, edx);
- kvm_x86_ops->skip_emulated_instruction(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a3ce9d260d68..56628a44668b 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -158,9 +158,11 @@
#define Src2GS (OpGS << Src2Shift)
#define Src2Mask (OpMask << Src2Shift)
#define Mmx ((u64)1 << 40) /* MMX Vector instruction */
+#define AlignMask ((u64)7 << 41)
#define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */
-#define Unaligned ((u64)1 << 42) /* Explicitly unaligned (e.g. MOVDQU) */
-#define Avx ((u64)1 << 43) /* Advanced Vector Extensions */
+#define Unaligned ((u64)2 << 41) /* Explicitly unaligned (e.g. MOVDQU) */
+#define Avx ((u64)3 << 41) /* Advanced Vector Extensions */
+#define Aligned16 ((u64)4 << 41) /* Aligned to 16 byte boundary (e.g. FXSAVE) */
#define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */
#define NoWrite ((u64)1 << 45) /* No writeback */
#define SrcWrite ((u64)1 << 46) /* Write back src operand */
@@ -446,6 +448,26 @@ FOP_END;
FOP_START(salc) "pushf; sbb %al, %al; popf \n\t" FOP_RET
FOP_END;
+/*
+ * XXX: inoutclob user must know where the argument is being expanded.
+ * Relying on CC_HAVE_ASM_GOTO would allow us to remove _fault.
+ */
+#define asm_safe(insn, inoutclob...) \
+({ \
+ int _fault = 0; \
+ \
+ asm volatile("1:" insn "\n" \
+ "2:\n" \
+ ".pushsection .fixup, \"ax\"\n" \
+ "3: movl $1, %[_fault]\n" \
+ " jmp 2b\n" \
+ ".popsection\n" \
+ _ASM_EXTABLE(1b, 3b) \
+ : [_fault] "+qm"(_fault) inoutclob ); \
+ \
+ _fault ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; \
+})
+
static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
enum x86_intercept intercept,
enum x86_intercept_stage stage)
@@ -632,21 +654,26 @@ static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector,
* depending on whether they're AVX encoded or not.
*
* Also included is CMPXCHG16B which is not a vector instruction, yet it is
- * subject to the same check.
+ * subject to the same check. FXSAVE and FXRSTOR are checked here too as their
+ * 512 bytes of data must be aligned to a 16 byte boundary.
*/
-static bool insn_aligned(struct x86_emulate_ctxt *ctxt, unsigned size)
+static unsigned insn_alignment(struct x86_emulate_ctxt *ctxt, unsigned size)
{
- if (likely(size < 16))
- return false;
+ u64 alignment = ctxt->d & AlignMask;
- if (ctxt->d & Aligned)
- return true;
- else if (ctxt->d & Unaligned)
- return false;
- else if (ctxt->d & Avx)
- return false;
- else
- return true;
+ if (likely(size < 16))
+ return 1;
+
+ switch (alignment) {
+ case Unaligned:
+ case Avx:
+ return 1;
+ case Aligned16:
+ return 16;
+ case Aligned:
+ default:
+ return size;
+ }
}
static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
@@ -704,7 +731,7 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
}
break;
}
- if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0))
+ if (la & (insn_alignment(ctxt, size) - 1))
return emulate_gp(ctxt, 0);
return X86EMUL_CONTINUE;
bad:
@@ -3842,6 +3869,131 @@ static int em_movsxd(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
+static int check_fxsr(struct x86_emulate_ctxt *ctxt)
+{
+ u32 eax = 1, ebx, ecx = 0, edx;
+
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+ if (!(edx & FFL(FXSR)))
+ return emulate_ud(ctxt);
+
+ if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
+ return emulate_nm(ctxt);
+
+ /*
+ * Don't emulate a case that should never be hit, instead of working
+ * around a lack of fxsave64/fxrstor64 on old compilers.
+ */
+ if (ctxt->mode >= X86EMUL_MODE_PROT64)
+ return X86EMUL_UNHANDLEABLE;
+
+ return X86EMUL_CONTINUE;
+}
+
+/*
+ * FXSAVE and FXRSTOR have 4 different formats depending on execution mode,
+ * 1) 16 bit mode
+ * 2) 32 bit mode
+ * - like (1), but FIP and FDP (foo) are only 16 bit. At least Intel CPUs
+ * preserve whole 32 bit values, though, so (1) and (2) are the same wrt.
+ * save and restore
+ * 3) 64-bit mode with REX.W prefix
+ * - like (2), but XMM 8-15 are being saved and restored
+ * 4) 64-bit mode without REX.W prefix
+ * - like (3), but FIP and FDP are 64 bit
+ *
+ * Emulation uses (3) for (1) and (2) and preserves XMM 8-15 to reach the
+ * desired result. (4) is not emulated.
+ *
+ * Note: Guest and host CPUID.(EAX=07H,ECX=0H):EBX[bit 13] (deprecate FPU CS
+ * and FPU DS) should match.
+ */
+static int em_fxsave(struct x86_emulate_ctxt *ctxt)
+{
+ struct fxregs_state fx_state;
+ size_t size;
+ int rc;
+
+ rc = check_fxsr(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ ctxt->ops->get_fpu(ctxt);
+
+ rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state));
+
+ ctxt->ops->put_fpu(ctxt);
+
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR)
+ size = offsetof(struct fxregs_state, xmm_space[8 * 16/4]);
+ else
+ size = offsetof(struct fxregs_state, xmm_space[0]);
+
+ return segmented_write(ctxt, ctxt->memop.addr.mem, &fx_state, size);
+}
+
+static int fxrstor_fixup(struct x86_emulate_ctxt *ctxt,
+ struct fxregs_state *new)
+{
+ int rc = X86EMUL_CONTINUE;
+ struct fxregs_state old;
+
+ rc = asm_safe("fxsave %[fx]", , [fx] "+m"(old));
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ /*
+ * 64 bit host will restore XMM 8-15, which is not correct on non-64
+ * bit guests. Load the current values in order to preserve 64 bit
+ * XMMs after fxrstor.
+ */
+#ifdef CONFIG_X86_64
+ /* XXX: accessing XMM 8-15 very awkwardly */
+ memcpy(&new->xmm_space[8 * 16/4], &old.xmm_space[8 * 16/4], 8 * 16);
+#endif
+
+ /*
+ * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but
+ * does save and restore MXCSR.
+ */
+ if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))
+ memcpy(new->xmm_space, old.xmm_space, 8 * 16);
+
+ return rc;
+}
+
+static int em_fxrstor(struct x86_emulate_ctxt *ctxt)
+{
+ struct fxregs_state fx_state;
+ int rc;
+
+ rc = check_fxsr(ctxt);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ rc = segmented_read(ctxt, ctxt->memop.addr.mem, &fx_state, 512);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ if (fx_state.mxcsr >> 16)
+ return emulate_gp(ctxt, 0);
+
+ ctxt->ops->get_fpu(ctxt);
+
+ if (ctxt->mode < X86EMUL_MODE_PROT64)
+ rc = fxrstor_fixup(ctxt, &fx_state);
+
+ if (rc == X86EMUL_CONTINUE)
+ rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state));
+
+ ctxt->ops->put_fpu(ctxt);
+
+ return rc;
+}
+
static bool valid_cr(int nr)
{
switch (nr) {
@@ -4194,7 +4346,9 @@ static const struct gprefix pfx_0f_ae_7 = {
};
static const struct group_dual group15 = { {
- N, N, N, N, N, N, N, GP(0, &pfx_0f_ae_7),
+ I(ModRM | Aligned16, em_fxsave),
+ I(ModRM | Aligned16, em_fxrstor),
+ N, N, N, N, N, GP(0, &pfx_0f_ae_7),
}, {
N, N, N, N, N, N, N, N,
} };
@@ -5066,21 +5220,13 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt)
{
- bool fault = false;
+ int rc;
ctxt->ops->get_fpu(ctxt);
- asm volatile("1: fwait \n\t"
- "2: \n\t"
- ".pushsection .fixup,\"ax\" \n\t"
- "3: \n\t"
- "movb $1, %[fault] \n\t"
- "jmp 2b \n\t"
- ".popsection \n\t"
- _ASM_EXTABLE(1b, 3b)
- : [fault]"+qm"(fault));
+ rc = asm_safe("fwait");
ctxt->ops->put_fpu(ctxt);
- if (unlikely(fault))
+ if (unlikely(rc != X86EMUL_CONTINUE))
return emulate_exception(ctxt, MF_VECTOR, 0, false);
return X86EMUL_CONTINUE;
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 42b1c83741c8..1572c35b4f1a 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -291,7 +291,7 @@ static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata)
return ret;
}
-int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint)
+static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint)
{
struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
struct kvm_lapic_irq irq;
@@ -852,6 +852,10 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
return;
+ mutex_lock(&kvm->arch.hyperv.hv_lock);
+ if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
+ goto out_unlock;
+
gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
/*
* Because the TSC parameters only vary when there is a
@@ -859,7 +863,7 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
*/
if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn),
&tsc_seq, sizeof(tsc_seq))))
- return;
+ goto out_unlock;
/*
* While we're computing and writing the parameters, force the
@@ -868,15 +872,15 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
hv->tsc_ref.tsc_sequence = 0;
if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
&hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
- return;
+ goto out_unlock;
if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref))
- return;
+ goto out_unlock;
/* Ensure sequence is zero before writing the rest of the struct. */
smp_wmb();
if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref)))
- return;
+ goto out_unlock;
/*
* Now switch to the TSC page mechanism by writing the sequence.
@@ -891,6 +895,8 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm,
hv->tsc_ref.tsc_sequence = tsc_seq;
kvm_write_guest(kvm, gfn_to_gpa(gfn),
&hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence));
+out_unlock:
+ mutex_unlock(&kvm->arch.hyperv.hv_lock);
}
static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
@@ -1142,9 +1148,9 @@ int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
if (kvm_hv_msr_partition_wide(msr)) {
int r;
- mutex_lock(&vcpu->kvm->lock);
+ mutex_lock(&vcpu->kvm->arch.hyperv.hv_lock);
r = kvm_hv_set_msr_pw(vcpu, msr, data, host);
- mutex_unlock(&vcpu->kvm->lock);
+ mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock);
return r;
} else
return kvm_hv_set_msr(vcpu, msr, data, host);
@@ -1155,9 +1161,9 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
if (kvm_hv_msr_partition_wide(msr)) {
int r;
- mutex_lock(&vcpu->kvm->lock);
+ mutex_lock(&vcpu->kvm->arch.hyperv.hv_lock);
r = kvm_hv_get_msr_pw(vcpu, msr, pdata);
- mutex_unlock(&vcpu->kvm->lock);
+ mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock);
return r;
} else
return kvm_hv_get_msr(vcpu, msr, pdata);
@@ -1165,7 +1171,7 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
bool kvm_hv_hypercall_enabled(struct kvm *kvm)
{
- return kvm->arch.hyperv.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE;
+ return READ_ONCE(kvm->arch.hyperv.hv_hypercall) & HV_X64_MSR_HYPERCALL_ENABLE;
}
static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 16a7134eedac..a78b445ce411 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -212,7 +212,7 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
*/
smp_mb();
if (atomic_dec_if_positive(&ps->pending) > 0)
- kthread_queue_work(&pit->worker, &pit->expired);
+ kthread_queue_work(pit->worker, &pit->expired);
}
void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
@@ -272,7 +272,7 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
if (atomic_read(&ps->reinject))
atomic_inc(&ps->pending);
- kthread_queue_work(&pt->worker, &pt->expired);
+ kthread_queue_work(pt->worker, &pt->expired);
if (ps->is_periodic) {
hrtimer_add_expires_ns(&ps->timer, ps->period);
@@ -667,10 +667,8 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
pid_nr = pid_vnr(pid);
put_pid(pid);
- kthread_init_worker(&pit->worker);
- pit->worker_task = kthread_run(kthread_worker_fn, &pit->worker,
- "kvm-pit/%d", pid_nr);
- if (IS_ERR(pit->worker_task))
+ pit->worker = kthread_create_worker(0, "kvm-pit/%d", pid_nr);
+ if (IS_ERR(pit->worker))
goto fail_kthread;
kthread_init_work(&pit->expired, pit_do_work);
@@ -713,7 +711,7 @@ fail_register_speaker:
fail_register_pit:
mutex_unlock(&kvm->slots_lock);
kvm_pit_set_reinject(pit, false);
- kthread_stop(pit->worker_task);
+ kthread_destroy_worker(pit->worker);
fail_kthread:
kvm_free_irq_source_id(kvm, pit->irq_source_id);
fail_request:
@@ -730,8 +728,7 @@ void kvm_free_pit(struct kvm *kvm)
kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev);
kvm_pit_set_reinject(pit, false);
hrtimer_cancel(&pit->pit_state.timer);
- kthread_flush_work(&pit->expired);
- kthread_stop(pit->worker_task);
+ kthread_destroy_worker(pit->worker);
kvm_free_irq_source_id(kvm, pit->irq_source_id);
kfree(pit);
}
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 2f5af0798326..600bee9dcbbd 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -44,8 +44,7 @@ struct kvm_pit {
struct kvm_kpit_state pit_state;
int irq_source_id;
struct kvm_irq_mask_notifier mask_notifier;
- struct kthread_worker worker;
- struct task_struct *worker_task;
+ struct kthread_worker *worker;
struct kthread_work expired;
};
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 6f69340f9fa3..34a66b2d47e6 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -342,9 +342,11 @@ void __kvm_apic_update_irr(u32 *pir, void *regs)
u32 i, pir_val;
for (i = 0; i <= 7; i++) {
- pir_val = xchg(&pir[i], 0);
- if (pir_val)
+ pir_val = READ_ONCE(pir[i]);
+ if (pir_val) {
+ pir_val = xchg(&pir[i], 0);
*((u32 *)(regs + APIC_IRR + i * 0x10)) |= pir_val;
+ }
}
}
EXPORT_SYMBOL_GPL(__kvm_apic_update_irr);
@@ -1090,7 +1092,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
static u32 apic_get_tmcct(struct kvm_lapic *apic)
{
- ktime_t remaining;
+ ktime_t remaining, now;
s64 ns;
u32 tmcct;
@@ -1101,7 +1103,8 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
apic->lapic_timer.period == 0)
return 0;
- remaining = hrtimer_get_remaining(&apic->lapic_timer.timer);
+ now = ktime_get();
+ remaining = ktime_sub(apic->lapic_timer.target_expiration, now);
if (ktime_to_ns(remaining) < 0)
remaining = ktime_set(0, 0);
@@ -1332,7 +1335,7 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
local_irq_save(flags);
- now = apic->lapic_timer.timer.base->get_time();
+ now = ktime_get();
guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
if (likely(tscdeadline > guest_tsc)) {
ns = (tscdeadline - guest_tsc) * 1000000ULL;
@@ -1347,6 +1350,79 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic)
local_irq_restore(flags);
}
+static void start_sw_period(struct kvm_lapic *apic)
+{
+ if (!apic->lapic_timer.period)
+ return;
+
+ if (apic_lvtt_oneshot(apic) &&
+ ktime_after(ktime_get(),
+ apic->lapic_timer.target_expiration)) {
+ apic_timer_expired(apic);
+ return;
+ }
+
+ hrtimer_start(&apic->lapic_timer.timer,
+ apic->lapic_timer.target_expiration,
+ HRTIMER_MODE_ABS_PINNED);
+}
+
+static bool set_target_expiration(struct kvm_lapic *apic)
+{
+ ktime_t now;
+ u64 tscl = rdtsc();
+
+ now = ktime_get();
+ apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
+ * APIC_BUS_CYCLE_NS * apic->divide_count;
+
+ if (!apic->lapic_timer.period)
+ return false;
+
+ /*
+ * Do not allow the guest to program periodic timers with small
+ * interval, since the hrtimers are not throttled by the host
+ * scheduler.
+ */
+ if (apic_lvtt_period(apic)) {
+ s64 min_period = min_timer_period_us * 1000LL;
+
+ if (apic->lapic_timer.period < min_period) {
+ pr_info_ratelimited(
+ "kvm: vcpu %i: requested %lld ns "
+ "lapic timer period limited to %lld ns\n",
+ apic->vcpu->vcpu_id,
+ apic->lapic_timer.period, min_period);
+ apic->lapic_timer.period = min_period;
+ }
+ }
+
+ apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
+ PRIx64 ", "
+ "timer initial count 0x%x, period %lldns, "
+ "expire @ 0x%016" PRIx64 ".\n", __func__,
+ APIC_BUS_CYCLE_NS, ktime_to_ns(now),
+ kvm_lapic_get_reg(apic, APIC_TMICT),
+ apic->lapic_timer.period,
+ ktime_to_ns(ktime_add_ns(now,
+ apic->lapic_timer.period)));
+
+ apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) +
+ nsec_to_cycles(apic->vcpu, apic->lapic_timer.period);
+ apic->lapic_timer.target_expiration = ktime_add_ns(now, apic->lapic_timer.period);
+
+ return true;
+}
+
+static void advance_periodic_target_expiration(struct kvm_lapic *apic)
+{
+ apic->lapic_timer.tscdeadline +=
+ nsec_to_cycles(apic->vcpu, apic->lapic_timer.period);
+ apic->lapic_timer.target_expiration =
+ ktime_add_ns(apic->lapic_timer.target_expiration,
+ apic->lapic_timer.period);
+}
+
bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu)
{
if (!lapic_in_kernel(vcpu))
@@ -1356,52 +1432,59 @@ bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use);
-static void cancel_hv_tscdeadline(struct kvm_lapic *apic)
+static void cancel_hv_timer(struct kvm_lapic *apic)
{
kvm_x86_ops->cancel_hv_timer(apic->vcpu);
apic->lapic_timer.hv_timer_in_use = false;
}
-void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
-{
- struct kvm_lapic *apic = vcpu->arch.apic;
-
- WARN_ON(!apic->lapic_timer.hv_timer_in_use);
- WARN_ON(swait_active(&vcpu->wq));
- cancel_hv_tscdeadline(apic);
- apic_timer_expired(apic);
-}
-EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
-
-static bool start_hv_tscdeadline(struct kvm_lapic *apic)
+static bool start_hv_timer(struct kvm_lapic *apic)
{
u64 tscdeadline = apic->lapic_timer.tscdeadline;
- if (atomic_read(&apic->lapic_timer.pending) ||
+ if ((atomic_read(&apic->lapic_timer.pending) &&
+ !apic_lvtt_period(apic)) ||
kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) {
if (apic->lapic_timer.hv_timer_in_use)
- cancel_hv_tscdeadline(apic);
+ cancel_hv_timer(apic);
} else {
apic->lapic_timer.hv_timer_in_use = true;
hrtimer_cancel(&apic->lapic_timer.timer);
/* In case the sw timer triggered in the window */
- if (atomic_read(&apic->lapic_timer.pending))
- cancel_hv_tscdeadline(apic);
+ if (atomic_read(&apic->lapic_timer.pending) &&
+ !apic_lvtt_period(apic))
+ cancel_hv_timer(apic);
}
trace_kvm_hv_timer_state(apic->vcpu->vcpu_id,
apic->lapic_timer.hv_timer_in_use);
return apic->lapic_timer.hv_timer_in_use;
}
+void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ WARN_ON(!apic->lapic_timer.hv_timer_in_use);
+ WARN_ON(swait_active(&vcpu->wq));
+ cancel_hv_timer(apic);
+ apic_timer_expired(apic);
+
+ if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
+ advance_periodic_target_expiration(apic);
+ if (!start_hv_timer(apic))
+ start_sw_period(apic);
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
+
void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
WARN_ON(apic->lapic_timer.hv_timer_in_use);
- if (apic_lvtt_tscdeadline(apic))
- start_hv_tscdeadline(apic);
+ start_hv_timer(apic);
}
EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer);
@@ -1413,62 +1496,28 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
if (!apic->lapic_timer.hv_timer_in_use)
return;
- cancel_hv_tscdeadline(apic);
+ cancel_hv_timer(apic);
if (atomic_read(&apic->lapic_timer.pending))
return;
- start_sw_tscdeadline(apic);
+ if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic))
+ start_sw_period(apic);
+ else if (apic_lvtt_tscdeadline(apic))
+ start_sw_tscdeadline(apic);
}
EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
static void start_apic_timer(struct kvm_lapic *apic)
{
- ktime_t now;
-
atomic_set(&apic->lapic_timer.pending, 0);
if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
- /* lapic timer in oneshot or periodic mode */
- now = apic->lapic_timer.timer.base->get_time();
- apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
- * APIC_BUS_CYCLE_NS * apic->divide_count;
-
- if (!apic->lapic_timer.period)
- return;
- /*
- * Do not allow the guest to program periodic timers with small
- * interval, since the hrtimers are not throttled by the host
- * scheduler.
- */
- if (apic_lvtt_period(apic)) {
- s64 min_period = min_timer_period_us * 1000LL;
-
- if (apic->lapic_timer.period < min_period) {
- pr_info_ratelimited(
- "kvm: vcpu %i: requested %lld ns "
- "lapic timer period limited to %lld ns\n",
- apic->vcpu->vcpu_id,
- apic->lapic_timer.period, min_period);
- apic->lapic_timer.period = min_period;
- }
- }
-
- hrtimer_start(&apic->lapic_timer.timer,
- ktime_add_ns(now, apic->lapic_timer.period),
- HRTIMER_MODE_ABS_PINNED);
-
- apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
- PRIx64 ", "
- "timer initial count 0x%x, period %lldns, "
- "expire @ 0x%016" PRIx64 ".\n", __func__,
- APIC_BUS_CYCLE_NS, ktime_to_ns(now),
- kvm_lapic_get_reg(apic, APIC_TMICT),
- apic->lapic_timer.period,
- ktime_to_ns(ktime_add_ns(now,
- apic->lapic_timer.period)));
+ if (set_target_expiration(apic) &&
+ !(kvm_x86_ops->set_hv_timer && start_hv_timer(apic)))
+ start_sw_period(apic);
} else if (apic_lvtt_tscdeadline(apic)) {
- if (!(kvm_x86_ops->set_hv_timer && start_hv_tscdeadline(apic)))
+ if (!(kvm_x86_ops->set_hv_timer && start_hv_timer(apic)))
start_sw_tscdeadline(apic);
}
}
@@ -1701,13 +1750,22 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
* LAPIC interface
*----------------------------------------------------------------------
*/
+u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (!lapic_in_kernel(vcpu))
+ return 0;
+
+ return apic->lapic_timer.tscdeadline;
+}
u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) ||
- apic_lvtt_period(apic))
+ if (!lapic_in_kernel(vcpu) ||
+ !apic_lvtt_tscdeadline(apic))
return 0;
return apic->lapic_timer.tscdeadline;
@@ -1748,14 +1806,17 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
u64 old_value = vcpu->arch.apic_base;
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!apic) {
+ if (!apic)
value |= MSR_IA32_APICBASE_BSP;
- vcpu->arch.apic_base = value;
- return;
- }
vcpu->arch.apic_base = value;
+ if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE)
+ kvm_update_cpuid(vcpu);
+
+ if (!apic)
+ return;
+
/* update jump label if enable bit changes */
if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) {
if (value & MSR_IA32_APICBASE_ENABLE) {
@@ -1909,6 +1970,7 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
apic_timer_expired(apic);
if (lapic_is_periodic(apic)) {
+ advance_periodic_target_expiration(apic);
hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
return HRTIMER_RESTART;
} else
@@ -1993,6 +2055,10 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
kvm_apic_local_deliver(apic, APIC_LVTT);
if (apic_lvtt_tscdeadline(apic))
apic->lapic_timer.tscdeadline = 0;
+ if (apic_lvtt_oneshot(apic)) {
+ apic->lapic_timer.tscdeadline = 0;
+ apic->lapic_timer.target_expiration = ktime_set(0, 0);
+ }
atomic_set(&apic->lapic_timer.pending, 0);
}
}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index f60d01c29d51..e0c80233b3e1 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -15,6 +15,7 @@
struct kvm_timer {
struct hrtimer timer;
s64 period; /* unit: ns */
+ ktime_t target_expiration;
u32 timer_mode;
u32 timer_mode_mask;
u64 tscdeadline;
@@ -85,6 +86,7 @@ int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
+u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu);
u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d9c7e986b4e4..7012de4a1fed 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1660,17 +1660,9 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
* This has some overhead, but not as much as the cost of swapping
* out actively used pages or breaking up actively used hugepages.
*/
- if (!shadow_accessed_mask) {
- /*
- * We are holding the kvm->mmu_lock, and we are blowing up
- * shadow PTEs. MMU notifier consumers need to be kept at bay.
- * This is correct as long as we don't decouple the mmu_lock
- * protected regions (like invalidate_range_start|end does).
- */
- kvm->mmu_notifier_seq++;
+ if (!shadow_accessed_mask)
return kvm_handle_hva_range(kvm, start, end, 0,
kvm_unmap_rmapp);
- }
return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
}
@@ -4405,7 +4397,8 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
}
static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- const u8 *new, int bytes)
+ const u8 *new, int bytes,
+ struct kvm_page_track_notifier_node *node)
{
gfn_t gfn = gpa >> PAGE_SHIFT;
struct kvm_mmu_page *sp;
@@ -4508,7 +4501,7 @@ static void make_mmu_pages_available(struct kvm_vcpu *vcpu)
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
}
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
void *insn, int insn_len)
{
int r, emulation_type = EMULTYPE_RETRY;
@@ -4527,12 +4520,28 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
return r;
}
- r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
+ r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code),
+ false);
if (r < 0)
return r;
if (!r)
return 1;
+ /*
+ * Before emulating the instruction, check if the error code
+ * was due to a RO violation while translating the guest page.
+ * This can occur when using nested virtualization with nested
+ * paging in both guests. If true, we simply unprotect the page
+ * and resume the guest.
+ *
+ * Note: AMD only (since it supports the PFERR_GUEST_PAGE_MASK used
+ * in PFERR_NEXT_GUEST_PAGE)
+ */
+ if (error_code == PFERR_NESTED_GUEST_PAGE) {
+ kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
+ return 1;
+ }
+
if (mmio_info_in_cache(vcpu, cr2, direct))
emulation_type = 0;
emulate:
@@ -4617,11 +4626,19 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu)
init_kvm_mmu(vcpu);
}
+static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ struct kvm_page_track_notifier_node *node)
+{
+ kvm_mmu_invalidate_zap_all_pages(kvm);
+}
+
void kvm_mmu_init_vm(struct kvm *kvm)
{
struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
node->track_write = kvm_mmu_pte_write;
+ node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
kvm_page_track_register_notifier(kvm, node);
}
@@ -4958,7 +4975,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots)
* zap all shadow pages.
*/
if (unlikely((slots->generation & MMIO_GEN_MASK) == 0)) {
- printk_ratelimited(KERN_DEBUG "kvm: zapping shadow pages for mmio generation wraparound\n");
+ kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
kvm_mmu_invalidate_zap_all_pages(kvm);
}
}
diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c
index b431539c3714..4a1c13eaa518 100644
--- a/arch/x86/kvm/page_track.c
+++ b/arch/x86/kvm/page_track.c
@@ -106,6 +106,7 @@ void kvm_slot_page_track_add_page(struct kvm *kvm,
if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn))
kvm_flush_remote_tlbs(kvm);
}
+EXPORT_SYMBOL_GPL(kvm_slot_page_track_add_page);
/*
* remove the guest page from the tracking pool which stops the interception
@@ -135,6 +136,7 @@ void kvm_slot_page_track_remove_page(struct kvm *kvm,
*/
kvm_mmu_gfn_allow_lpage(slot, gfn);
}
+EXPORT_SYMBOL_GPL(kvm_slot_page_track_remove_page);
/*
* check if the corresponding access on the specified guest page is tracked.
@@ -181,6 +183,7 @@ kvm_page_track_register_notifier(struct kvm *kvm,
hlist_add_head_rcu(&n->node, &head->track_notifier_list);
spin_unlock(&kvm->mmu_lock);
}
+EXPORT_SYMBOL_GPL(kvm_page_track_register_notifier);
/*
* stop receiving the event interception. It is the opposed operation of
@@ -199,6 +202,7 @@ kvm_page_track_unregister_notifier(struct kvm *kvm,
spin_unlock(&kvm->mmu_lock);
synchronize_srcu(&head->track_srcu);
}
+EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier);
/*
* Notify the node that write access is intercepted and write emulation is
@@ -222,6 +226,31 @@ void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
idx = srcu_read_lock(&head->track_srcu);
hlist_for_each_entry_rcu(n, &head->track_notifier_list, node)
if (n->track_write)
- n->track_write(vcpu, gpa, new, bytes);
+ n->track_write(vcpu, gpa, new, bytes, n);
+ srcu_read_unlock(&head->track_srcu, idx);
+}
+
+/*
+ * Notify the node that memory slot is being removed or moved so that it can
+ * drop write-protection for the pages in the memory slot.
+ *
+ * The node should figure out it has any write-protected pages in this slot
+ * by itself.
+ */
+void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+ struct kvm_page_track_notifier_head *head;
+ struct kvm_page_track_notifier_node *n;
+ int idx;
+
+ head = &kvm->arch.track_notifier_head;
+
+ if (hlist_empty(&head->track_notifier_list))
+ return;
+
+ idx = srcu_read_lock(&head->track_srcu);
+ hlist_for_each_entry_rcu(n, &head->track_notifier_list, node)
+ if (n->track_flush_slot)
+ n->track_flush_slot(kvm, slot, n);
srcu_read_unlock(&head->track_srcu, idx);
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8ca1eca5038d..08a4d3ab3455 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2074,7 +2074,7 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
static int pf_interception(struct vcpu_svm *svm)
{
u64 fault_address = svm->vmcb->control.exit_info_2;
- u32 error_code;
+ u64 error_code;
int r = 1;
switch (svm->apf_reason) {
@@ -2270,7 +2270,7 @@ static int io_interception(struct vcpu_svm *svm)
++svm->vcpu.stat.io_exits;
string = (io_info & SVM_IOIO_STR_MASK) != 0;
in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
- if (string || in)
+ if (string)
return emulate_instruction(vcpu, 0) == EMULATE_DONE;
port = io_info >> 16;
@@ -2278,7 +2278,8 @@ static int io_interception(struct vcpu_svm *svm)
svm->next_rip = svm->vmcb->control.exit_info_2;
skip_emulated_instruction(&svm->vcpu);
- return kvm_fast_pio_out(vcpu, size, port);
+ return in ? kvm_fast_pio_in(vcpu, size, port)
+ : kvm_fast_pio_out(vcpu, size, port);
}
static int nmi_interception(struct vcpu_svm *svm)
@@ -3150,8 +3151,7 @@ static int skinit_interception(struct vcpu_svm *svm)
static int wbinvd_interception(struct vcpu_svm *svm)
{
- kvm_emulate_wbinvd(&svm->vcpu);
- return 1;
+ return kvm_emulate_wbinvd(&svm->vcpu);
}
static int xsetbv_interception(struct vcpu_svm *svm)
@@ -3238,8 +3238,7 @@ static int task_switch_interception(struct vcpu_svm *svm)
static int cpuid_interception(struct vcpu_svm *svm)
{
svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
- kvm_emulate_cpuid(&svm->vcpu);
- return 1;
+ return kvm_emulate_cpuid(&svm->vcpu);
}
static int iret_interception(struct vcpu_svm *svm)
@@ -3275,9 +3274,7 @@ static int rdpmc_interception(struct vcpu_svm *svm)
return emulate_on_interception(svm);
err = kvm_rdpmc(&svm->vcpu);
- kvm_complete_insn_gp(&svm->vcpu, err);
-
- return 1;
+ return kvm_complete_insn_gp(&svm->vcpu, err);
}
static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
@@ -3374,9 +3371,7 @@ static int cr_interception(struct vcpu_svm *svm)
}
kvm_register_write(&svm->vcpu, reg, val);
}
- kvm_complete_insn_gp(&svm->vcpu, err);
-
- return 1;
+ return kvm_complete_insn_gp(&svm->vcpu, err);
}
static int dr_interception(struct vcpu_svm *svm)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5382b82462fc..24db5fb6f575 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -132,6 +132,22 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
+#define VMX_VPID_EXTENT_SUPPORTED_MASK \
+ (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \
+ VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \
+ VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \
+ VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)
+
+/*
+ * Hyper-V requires all of these, so mark them as supported even though
+ * they are just treated the same as all-context.
+ */
+#define VMX_VPID_EXTENT_SUPPORTED_MASK \
+ (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \
+ VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \
+ VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \
+ VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)
+
/*
* These 2 parameters are used to config the controls for Pause-Loop Exiting:
* ple_gap: upper bound on the amount of time between two successive
@@ -446,23 +462,31 @@ struct nested_vmx {
u16 vpid02;
u16 last_vpid;
+ /*
+ * We only store the "true" versions of the VMX capability MSRs. We
+ * generate the "non-true" versions by setting the must-be-1 bits
+ * according to the SDM.
+ */
u32 nested_vmx_procbased_ctls_low;
u32 nested_vmx_procbased_ctls_high;
- u32 nested_vmx_true_procbased_ctls_low;
u32 nested_vmx_secondary_ctls_low;
u32 nested_vmx_secondary_ctls_high;
u32 nested_vmx_pinbased_ctls_low;
u32 nested_vmx_pinbased_ctls_high;
u32 nested_vmx_exit_ctls_low;
u32 nested_vmx_exit_ctls_high;
- u32 nested_vmx_true_exit_ctls_low;
u32 nested_vmx_entry_ctls_low;
u32 nested_vmx_entry_ctls_high;
- u32 nested_vmx_true_entry_ctls_low;
u32 nested_vmx_misc_low;
u32 nested_vmx_misc_high;
u32 nested_vmx_ept_caps;
u32 nested_vmx_vpid_caps;
+ u64 nested_vmx_basic;
+ u64 nested_vmx_cr0_fixed0;
+ u64 nested_vmx_cr0_fixed1;
+ u64 nested_vmx_cr4_fixed0;
+ u64 nested_vmx_cr4_fixed1;
+ u64 nested_vmx_vmcs_enum;
};
#define POSTED_INTR_ON 0
@@ -520,6 +544,12 @@ static inline void pi_set_sn(struct pi_desc *pi_desc)
(unsigned long *)&pi_desc->control);
}
+static inline void pi_clear_on(struct pi_desc *pi_desc)
+{
+ clear_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
+}
+
static inline int pi_test_on(struct pi_desc *pi_desc)
{
return test_bit(POSTED_INTR_ON,
@@ -920,16 +950,32 @@ static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
-static unsigned long *vmx_io_bitmap_a;
-static unsigned long *vmx_io_bitmap_b;
-static unsigned long *vmx_msr_bitmap_legacy;
-static unsigned long *vmx_msr_bitmap_longmode;
-static unsigned long *vmx_msr_bitmap_legacy_x2apic;
-static unsigned long *vmx_msr_bitmap_longmode_x2apic;
-static unsigned long *vmx_msr_bitmap_legacy_x2apic_apicv_inactive;
-static unsigned long *vmx_msr_bitmap_longmode_x2apic_apicv_inactive;
-static unsigned long *vmx_vmread_bitmap;
-static unsigned long *vmx_vmwrite_bitmap;
+enum {
+ VMX_IO_BITMAP_A,
+ VMX_IO_BITMAP_B,
+ VMX_MSR_BITMAP_LEGACY,
+ VMX_MSR_BITMAP_LONGMODE,
+ VMX_MSR_BITMAP_LEGACY_X2APIC_APICV,
+ VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV,
+ VMX_MSR_BITMAP_LEGACY_X2APIC,
+ VMX_MSR_BITMAP_LONGMODE_X2APIC,
+ VMX_VMREAD_BITMAP,
+ VMX_VMWRITE_BITMAP,
+ VMX_BITMAP_NR
+};
+
+static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
+
+#define vmx_io_bitmap_a (vmx_bitmap[VMX_IO_BITMAP_A])
+#define vmx_io_bitmap_b (vmx_bitmap[VMX_IO_BITMAP_B])
+#define vmx_msr_bitmap_legacy (vmx_bitmap[VMX_MSR_BITMAP_LEGACY])
+#define vmx_msr_bitmap_longmode (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE])
+#define vmx_msr_bitmap_legacy_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC_APICV])
+#define vmx_msr_bitmap_longmode_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV])
+#define vmx_msr_bitmap_legacy_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC])
+#define vmx_msr_bitmap_longmode_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC])
+#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP])
+#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP])
static bool cpu_has_load_ia32_efer;
static bool cpu_has_load_perf_global_ctrl;
@@ -1343,10 +1389,10 @@ static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
}
-static inline bool is_exception(u32 intr_info)
+static inline bool is_nmi(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
- == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
+ == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
}
static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
@@ -2145,12 +2191,6 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
#endif
if (vmx->host_state.msr_host_bndcfgs)
wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
- /*
- * If the FPU is not active (through the host task or
- * the guest vcpu), then restore the cr0.TS bit.
- */
- if (!fpregs_active() && !vmx->vcpu.guest_fpu_loaded)
- stts();
load_gdt(this_cpu_ptr(&host_gdt));
}
@@ -2529,14 +2569,14 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) {
if (is_long_mode(vcpu))
- msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
+ msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv;
else
- msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
+ msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv;
} else {
if (is_long_mode(vcpu))
- msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv_inactive;
+ msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
else
- msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv_inactive;
+ msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
}
} else {
if (is_long_mode(vcpu))
@@ -2712,9 +2752,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
/* We support free control of debug control saving. */
- vmx->nested.nested_vmx_true_exit_ctls_low =
- vmx->nested.nested_vmx_exit_ctls_low &
- ~VM_EXIT_SAVE_DEBUG_CONTROLS;
+ vmx->nested.nested_vmx_exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
/* entry controls */
rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
@@ -2733,9 +2771,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
/* We support free control of debug control loading. */
- vmx->nested.nested_vmx_true_entry_ctls_low =
- vmx->nested.nested_vmx_entry_ctls_low &
- ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
+ vmx->nested.nested_vmx_entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
/* cpu-based controls */
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
@@ -2768,8 +2804,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
CPU_BASED_USE_MSR_BITMAPS;
/* We support free control of CR3 access interception. */
- vmx->nested.nested_vmx_true_procbased_ctls_low =
- vmx->nested.nested_vmx_procbased_ctls_low &
+ vmx->nested.nested_vmx_procbased_ctls_low &=
~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
/* secondary cpu-based controls */
@@ -2780,6 +2815,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
vmx->nested.nested_vmx_secondary_ctls_high &=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_RDTSCP |
+ SECONDARY_EXEC_DESC |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_ENABLE_VPID |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
@@ -2811,8 +2847,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
*/
if (enable_vpid)
vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
- VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |
- VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
+ VMX_VPID_EXTENT_SUPPORTED_MASK;
else
vmx->nested.nested_vmx_vpid_caps = 0;
@@ -2829,14 +2864,52 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
VMX_MISC_ACTIVITY_HLT;
vmx->nested.nested_vmx_misc_high = 0;
+
+ /*
+ * This MSR reports some information about VMX support. We
+ * should return information about the VMX we emulate for the
+ * guest, and the VMCS structure we give it - not about the
+ * VMX support of the underlying hardware.
+ */
+ vmx->nested.nested_vmx_basic =
+ VMCS12_REVISION |
+ VMX_BASIC_TRUE_CTLS |
+ ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
+ (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
+
+ if (cpu_has_vmx_basic_inout())
+ vmx->nested.nested_vmx_basic |= VMX_BASIC_INOUT;
+
+ /*
+ * These MSRs specify bits which the guest must keep fixed on
+ * while L1 is in VMXON mode (in L1's root mode, or running an L2).
+ * We picked the standard core2 setting.
+ */
+#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
+#define VMXON_CR4_ALWAYSON X86_CR4_VMXE
+ vmx->nested.nested_vmx_cr0_fixed0 = VMXON_CR0_ALWAYSON;
+ vmx->nested.nested_vmx_cr4_fixed0 = VMXON_CR4_ALWAYSON;
+
+ /* These MSRs specify bits which the guest must keep fixed off. */
+ rdmsrl(MSR_IA32_VMX_CR0_FIXED1, vmx->nested.nested_vmx_cr0_fixed1);
+ rdmsrl(MSR_IA32_VMX_CR4_FIXED1, vmx->nested.nested_vmx_cr4_fixed1);
+
+ /* highest index: VMX_PREEMPTION_TIMER_VALUE */
+ vmx->nested.nested_vmx_vmcs_enum = 0x2e;
+}
+
+/*
+ * if fixed0[i] == 1: val[i] must be 1
+ * if fixed1[i] == 0: val[i] must be 0
+ */
+static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1)
+{
+ return ((val & fixed1) | fixed0) == val;
}
static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
{
- /*
- * Bits 0 in high must be 0, and bits 1 in low must be 1.
- */
- return ((control & high) | low) == control;
+ return fixed_bits_valid(control, low, high);
}
static inline u64 vmx_control_msr(u32 low, u32 high)
@@ -2844,87 +2917,285 @@ static inline u64 vmx_control_msr(u32 low, u32 high)
return low | ((u64)high << 32);
}
-/* Returns 0 on success, non-0 otherwise. */
-static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
+{
+ superset &= mask;
+ subset &= mask;
+
+ return (superset | subset) == superset;
+}
+
+static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
+{
+ const u64 feature_and_reserved =
+ /* feature (except bit 48; see below) */
+ BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
+ /* reserved */
+ BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
+ u64 vmx_basic = vmx->nested.nested_vmx_basic;
+
+ if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
+ return -EINVAL;
+
+ /*
+ * KVM does not emulate a version of VMX that constrains physical
+ * addresses of VMX structures (e.g. VMCS) to 32-bits.
+ */
+ if (data & BIT_ULL(48))
+ return -EINVAL;
+
+ if (vmx_basic_vmcs_revision_id(vmx_basic) !=
+ vmx_basic_vmcs_revision_id(data))
+ return -EINVAL;
+
+ if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
+ return -EINVAL;
+
+ vmx->nested.nested_vmx_basic = data;
+ return 0;
+}
+
+static int
+vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
+{
+ u64 supported;
+ u32 *lowp, *highp;
+
+ switch (msr_index) {
+ case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
+ lowp = &vmx->nested.nested_vmx_pinbased_ctls_low;
+ highp = &vmx->nested.nested_vmx_pinbased_ctls_high;
+ break;
+ case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+ lowp = &vmx->nested.nested_vmx_procbased_ctls_low;
+ highp = &vmx->nested.nested_vmx_procbased_ctls_high;
+ break;
+ case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+ lowp = &vmx->nested.nested_vmx_exit_ctls_low;
+ highp = &vmx->nested.nested_vmx_exit_ctls_high;
+ break;
+ case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+ lowp = &vmx->nested.nested_vmx_entry_ctls_low;
+ highp = &vmx->nested.nested_vmx_entry_ctls_high;
+ break;
+ case MSR_IA32_VMX_PROCBASED_CTLS2:
+ lowp = &vmx->nested.nested_vmx_secondary_ctls_low;
+ highp = &vmx->nested.nested_vmx_secondary_ctls_high;
+ break;
+ default:
+ BUG();
+ }
+
+ supported = vmx_control_msr(*lowp, *highp);
+
+ /* Check must-be-1 bits are still 1. */
+ if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
+ return -EINVAL;
+
+ /* Check must-be-0 bits are still 0. */
+ if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
+ return -EINVAL;
+
+ *lowp = data;
+ *highp = data >> 32;
+ return 0;
+}
+
+static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
+{
+ const u64 feature_and_reserved_bits =
+ /* feature */
+ BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
+ BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
+ /* reserved */
+ GENMASK_ULL(13, 9) | BIT_ULL(31);
+ u64 vmx_misc;
+
+ vmx_misc = vmx_control_msr(vmx->nested.nested_vmx_misc_low,
+ vmx->nested.nested_vmx_misc_high);
+
+ if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
+ return -EINVAL;
+
+ if ((vmx->nested.nested_vmx_pinbased_ctls_high &
+ PIN_BASED_VMX_PREEMPTION_TIMER) &&
+ vmx_misc_preemption_timer_rate(data) !=
+ vmx_misc_preemption_timer_rate(vmx_misc))
+ return -EINVAL;
+
+ if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
+ return -EINVAL;
+
+ if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
+ return -EINVAL;
+
+ if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
+ return -EINVAL;
+
+ vmx->nested.nested_vmx_misc_low = data;
+ vmx->nested.nested_vmx_misc_high = data >> 32;
+ return 0;
+}
+
+static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
+{
+ u64 vmx_ept_vpid_cap;
+
+ vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.nested_vmx_ept_caps,
+ vmx->nested.nested_vmx_vpid_caps);
+
+ /* Every bit is either reserved or a feature bit. */
+ if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
+ return -EINVAL;
+
+ vmx->nested.nested_vmx_ept_caps = data;
+ vmx->nested.nested_vmx_vpid_caps = data >> 32;
+ return 0;
+}
+
+static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
+{
+ u64 *msr;
+
+ switch (msr_index) {
+ case MSR_IA32_VMX_CR0_FIXED0:
+ msr = &vmx->nested.nested_vmx_cr0_fixed0;
+ break;
+ case MSR_IA32_VMX_CR4_FIXED0:
+ msr = &vmx->nested.nested_vmx_cr4_fixed0;
+ break;
+ default:
+ BUG();
+ }
+
+ /*
+ * 1 bits (which indicates bits which "must-be-1" during VMX operation)
+ * must be 1 in the restored value.
+ */
+ if (!is_bitwise_subset(data, *msr, -1ULL))
+ return -EINVAL;
+
+ *msr = data;
+ return 0;
+}
+
+/*
+ * Called when userspace is restoring VMX MSRs.
+ *
+ * Returns 0 on success, non-0 otherwise.
+ */
+static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
switch (msr_index) {
case MSR_IA32_VMX_BASIC:
+ return vmx_restore_vmx_basic(vmx, data);
+ case MSR_IA32_VMX_PINBASED_CTLS:
+ case MSR_IA32_VMX_PROCBASED_CTLS:
+ case MSR_IA32_VMX_EXIT_CTLS:
+ case MSR_IA32_VMX_ENTRY_CTLS:
+ /*
+ * The "non-true" VMX capability MSRs are generated from the
+ * "true" MSRs, so we do not support restoring them directly.
+ *
+ * If userspace wants to emulate VMX_BASIC[55]=0, userspace
+ * should restore the "true" MSRs with the must-be-1 bits
+ * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
+ * DEFAULT SETTINGS".
+ */
+ return -EINVAL;
+ case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
+ case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+ case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+ case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+ case MSR_IA32_VMX_PROCBASED_CTLS2:
+ return vmx_restore_control_msr(vmx, msr_index, data);
+ case MSR_IA32_VMX_MISC:
+ return vmx_restore_vmx_misc(vmx, data);
+ case MSR_IA32_VMX_CR0_FIXED0:
+ case MSR_IA32_VMX_CR4_FIXED0:
+ return vmx_restore_fixed0_msr(vmx, msr_index, data);
+ case MSR_IA32_VMX_CR0_FIXED1:
+ case MSR_IA32_VMX_CR4_FIXED1:
+ /*
+ * These MSRs are generated based on the vCPU's CPUID, so we
+ * do not support restoring them directly.
+ */
+ return -EINVAL;
+ case MSR_IA32_VMX_EPT_VPID_CAP:
+ return vmx_restore_vmx_ept_vpid_cap(vmx, data);
+ case MSR_IA32_VMX_VMCS_ENUM:
+ vmx->nested.nested_vmx_vmcs_enum = data;
+ return 0;
+ default:
/*
- * This MSR reports some information about VMX support. We
- * should return information about the VMX we emulate for the
- * guest, and the VMCS structure we give it - not about the
- * VMX support of the underlying hardware.
+ * The rest of the VMX capability MSRs do not support restore.
*/
- *pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS |
- ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
- (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
- if (cpu_has_vmx_basic_inout())
- *pdata |= VMX_BASIC_INOUT;
+ return -EINVAL;
+ }
+}
+
+/* Returns 0 on success, non-0 otherwise. */
+static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ switch (msr_index) {
+ case MSR_IA32_VMX_BASIC:
+ *pdata = vmx->nested.nested_vmx_basic;
break;
case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
case MSR_IA32_VMX_PINBASED_CTLS:
*pdata = vmx_control_msr(
vmx->nested.nested_vmx_pinbased_ctls_low,
vmx->nested.nested_vmx_pinbased_ctls_high);
+ if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
+ *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
break;
case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
- *pdata = vmx_control_msr(
- vmx->nested.nested_vmx_true_procbased_ctls_low,
- vmx->nested.nested_vmx_procbased_ctls_high);
- break;
case MSR_IA32_VMX_PROCBASED_CTLS:
*pdata = vmx_control_msr(
vmx->nested.nested_vmx_procbased_ctls_low,
vmx->nested.nested_vmx_procbased_ctls_high);
+ if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
+ *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
break;
case MSR_IA32_VMX_TRUE_EXIT_CTLS:
- *pdata = vmx_control_msr(
- vmx->nested.nested_vmx_true_exit_ctls_low,
- vmx->nested.nested_vmx_exit_ctls_high);
- break;
case MSR_IA32_VMX_EXIT_CTLS:
*pdata = vmx_control_msr(
vmx->nested.nested_vmx_exit_ctls_low,
vmx->nested.nested_vmx_exit_ctls_high);
+ if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
+ *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
break;
case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
- *pdata = vmx_control_msr(
- vmx->nested.nested_vmx_true_entry_ctls_low,
- vmx->nested.nested_vmx_entry_ctls_high);
- break;
case MSR_IA32_VMX_ENTRY_CTLS:
*pdata = vmx_control_msr(
vmx->nested.nested_vmx_entry_ctls_low,
vmx->nested.nested_vmx_entry_ctls_high);
+ if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
+ *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
break;
case MSR_IA32_VMX_MISC:
*pdata = vmx_control_msr(
vmx->nested.nested_vmx_misc_low,
vmx->nested.nested_vmx_misc_high);
break;
- /*
- * These MSRs specify bits which the guest must keep fixed (on or off)
- * while L1 is in VMXON mode (in L1's root mode, or running an L2).
- * We picked the standard core2 setting.
- */
-#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
-#define VMXON_CR4_ALWAYSON X86_CR4_VMXE
case MSR_IA32_VMX_CR0_FIXED0:
- *pdata = VMXON_CR0_ALWAYSON;
+ *pdata = vmx->nested.nested_vmx_cr0_fixed0;
break;
case MSR_IA32_VMX_CR0_FIXED1:
- *pdata = -1ULL;
+ *pdata = vmx->nested.nested_vmx_cr0_fixed1;
break;
case MSR_IA32_VMX_CR4_FIXED0:
- *pdata = VMXON_CR4_ALWAYSON;
+ *pdata = vmx->nested.nested_vmx_cr4_fixed0;
break;
case MSR_IA32_VMX_CR4_FIXED1:
- *pdata = -1ULL;
+ *pdata = vmx->nested.nested_vmx_cr4_fixed1;
break;
case MSR_IA32_VMX_VMCS_ENUM:
- *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */
+ *pdata = vmx->nested.nested_vmx_vmcs_enum;
break;
case MSR_IA32_VMX_PROCBASED_CTLS2:
*pdata = vmx_control_msr(
@@ -3107,7 +3378,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
vmx_leave_nested(vcpu);
break;
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
- return 1; /* they are read-only */
+ if (!msr_info->host_initiated)
+ return 1; /* they are read-only */
+ if (!nested_vmx_allowed(vcpu))
+ return 1;
+ return vmx_set_vmx_msr(vcpu, msr_index, data);
case MSR_IA32_XSS:
if (!vmx_xsaves_supported())
return 1;
@@ -3869,6 +4144,40 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
(unsigned long *)&vcpu->arch.regs_dirty);
}
+static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed0;
+ u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed1;
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high &
+ SECONDARY_EXEC_UNRESTRICTED_GUEST &&
+ nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
+ fixed0 &= ~(X86_CR0_PE | X86_CR0_PG);
+
+ return fixed_bits_valid(val, fixed0, fixed1);
+}
+
+static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed0;
+ u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed1;
+
+ return fixed_bits_valid(val, fixed0, fixed1);
+}
+
+static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr4_fixed0;
+ u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr4_fixed1;
+
+ return fixed_bits_valid(val, fixed0, fixed1);
+}
+
+/* No difference in the restrictions on guest and host CR4 in VMX operation. */
+#define nested_guest_cr4_valid nested_cr4_valid
+#define nested_host_cr4_valid nested_cr4_valid
+
static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
@@ -3997,8 +4306,8 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
if (!nested_vmx_allowed(vcpu))
return 1;
}
- if (to_vmx(vcpu)->nested.vmxon &&
- ((cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON))
+
+ if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
return 1;
vcpu->arch.cr4 = cr4;
@@ -4575,41 +4884,6 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
}
}
-static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
- u32 msr, int type)
-{
- int f = sizeof(unsigned long);
-
- if (!cpu_has_vmx_msr_bitmap())
- return;
-
- /*
- * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
- * have the write-low and read-high bitmap offsets the wrong way round.
- * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
- */
- if (msr <= 0x1fff) {
- if (type & MSR_TYPE_R)
- /* read-low */
- __set_bit(msr, msr_bitmap + 0x000 / f);
-
- if (type & MSR_TYPE_W)
- /* write-low */
- __set_bit(msr, msr_bitmap + 0x800 / f);
-
- } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
- msr &= 0x1fff;
- if (type & MSR_TYPE_R)
- /* read-high */
- __set_bit(msr, msr_bitmap + 0x400 / f);
-
- if (type & MSR_TYPE_W)
- /* write-high */
- __set_bit(msr, msr_bitmap + 0xc00 / f);
-
- }
-}
-
/*
* If a msr is allowed by L0, we should check whether it is allowed by L1.
* The corresponding bit will be cleared unless both of L0 and L1 allow it.
@@ -4665,48 +4939,18 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
msr, MSR_TYPE_R | MSR_TYPE_W);
}
-static void vmx_enable_intercept_msr_read_x2apic(u32 msr, bool apicv_active)
+static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_active)
{
if (apicv_active) {
- __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
- msr, MSR_TYPE_R);
- __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
- msr, MSR_TYPE_R);
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv,
+ msr, type);
+ __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv,
+ msr, type);
} else {
- __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
- msr, MSR_TYPE_R);
- __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
- msr, MSR_TYPE_R);
- }
-}
-
-static void vmx_disable_intercept_msr_read_x2apic(u32 msr, bool apicv_active)
-{
- if (apicv_active) {
__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
- msr, MSR_TYPE_R);
+ msr, type);
__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
- msr, MSR_TYPE_R);
- } else {
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
- msr, MSR_TYPE_R);
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
- msr, MSR_TYPE_R);
- }
-}
-
-static void vmx_disable_intercept_msr_write_x2apic(u32 msr, bool apicv_active)
-{
- if (apicv_active) {
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
- msr, MSR_TYPE_W);
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
- msr, MSR_TYPE_W);
- } else {
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
- msr, MSR_TYPE_W);
- __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
- msr, MSR_TYPE_W);
+ msr, type);
}
}
@@ -4828,9 +5072,15 @@ static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (!pi_test_and_clear_on(&vmx->pi_desc))
+ if (!pi_test_on(&vmx->pi_desc))
return;
+ pi_clear_on(&vmx->pi_desc);
+ /*
+ * IOMMU can write to PIR.ON, so the barrier matters even on UP.
+ * But on x86 this is just a compiler barrier anyway.
+ */
+ smp_mb__after_atomic();
kvm_apic_update_irr(vcpu, vmx->pi_desc.pir);
}
@@ -4845,9 +5095,11 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
u32 low32, high32;
unsigned long tmpl;
struct desc_ptr dt;
- unsigned long cr4;
+ unsigned long cr0, cr4;
- vmcs_writel(HOST_CR0, read_cr0() & ~X86_CR0_TS); /* 22.2.3 */
+ cr0 = read_cr0();
+ WARN_ON(cr0 & X86_CR0_TS);
+ vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */
vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
/* Save the most likely value for this task's CR4 in the VMCS. */
@@ -5476,7 +5728,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
if (is_machine_check(intr_info))
return handle_machine_check(vcpu);
- if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
+ if (is_nmi(intr_info))
return 1; /* already handled by vmx_vcpu_run() */
if (is_no_device(intr_info)) {
@@ -5587,7 +5839,7 @@ static int handle_triple_fault(struct kvm_vcpu *vcpu)
static int handle_io(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification;
- int size, in, string;
+ int size, in, string, ret;
unsigned port;
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -5601,9 +5853,14 @@ static int handle_io(struct kvm_vcpu *vcpu)
port = exit_qualification >> 16;
size = (exit_qualification & 7) + 1;
- skip_emulated_instruction(vcpu);
- return kvm_fast_pio_out(vcpu, size, port);
+ ret = kvm_skip_emulated_instruction(vcpu);
+
+ /*
+ * TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered
+ * KVM_EXIT_DEBUG here.
+ */
+ return kvm_fast_pio_out(vcpu, size, port) && ret;
}
static void
@@ -5617,18 +5874,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
hypercall[2] = 0xc1;
}
-static bool nested_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
-{
- unsigned long always_on = VMXON_CR0_ALWAYSON;
- struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-
- if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high &
- SECONDARY_EXEC_UNRESTRICTED_GUEST &&
- nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
- always_on &= ~(X86_CR0_PE | X86_CR0_PG);
- return (val & always_on) == always_on;
-}
-
/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
{
@@ -5647,7 +5892,7 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
val = (val & ~vmcs12->cr0_guest_host_mask) |
(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
- if (!nested_cr0_valid(vcpu, val))
+ if (!nested_guest_cr0_valid(vcpu, val))
return 1;
if (kvm_set_cr0(vcpu, val))
@@ -5656,8 +5901,9 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
return 0;
} else {
if (to_vmx(vcpu)->nested.vmxon &&
- ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON))
+ !nested_host_cr0_valid(vcpu, val))
return 1;
+
return kvm_set_cr0(vcpu, val);
}
}
@@ -5701,6 +5947,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
int cr;
int reg;
int err;
+ int ret;
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
cr = exit_qualification & 15;
@@ -5712,25 +5959,27 @@ static int handle_cr(struct kvm_vcpu *vcpu)
switch (cr) {
case 0:
err = handle_set_cr0(vcpu, val);
- kvm_complete_insn_gp(vcpu, err);
- return 1;
+ return kvm_complete_insn_gp(vcpu, err);
case 3:
err = kvm_set_cr3(vcpu, val);
- kvm_complete_insn_gp(vcpu, err);
- return 1;
+ return kvm_complete_insn_gp(vcpu, err);
case 4:
err = handle_set_cr4(vcpu, val);
- kvm_complete_insn_gp(vcpu, err);
- return 1;
+ return kvm_complete_insn_gp(vcpu, err);
case 8: {
u8 cr8_prev = kvm_get_cr8(vcpu);
u8 cr8 = (u8)val;
err = kvm_set_cr8(vcpu, cr8);
- kvm_complete_insn_gp(vcpu, err);
+ ret = kvm_complete_insn_gp(vcpu, err);
if (lapic_in_kernel(vcpu))
- return 1;
+ return ret;
if (cr8_prev <= cr8)
- return 1;
+ return ret;
+ /*
+ * TODO: we might be squashing a
+ * KVM_GUESTDBG_SINGLESTEP-triggered
+ * KVM_EXIT_DEBUG here.
+ */
vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
return 0;
}
@@ -5739,23 +5988,20 @@ static int handle_cr(struct kvm_vcpu *vcpu)
case 2: /* clts */
handle_clts(vcpu);
trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
- skip_emulated_instruction(vcpu);
vmx_fpu_activate(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
case 1: /*mov from cr*/
switch (cr) {
case 3:
val = kvm_read_cr3(vcpu);
kvm_register_write(vcpu, reg, val);
trace_kvm_cr_read(cr, val);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
case 8:
val = kvm_get_cr8(vcpu);
kvm_register_write(vcpu, reg, val);
trace_kvm_cr_read(cr, val);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
break;
case 3: /* lmsw */
@@ -5763,8 +6009,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
kvm_lmsw(vcpu, val);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
default:
break;
}
@@ -5835,8 +6080,7 @@ static int handle_dr(struct kvm_vcpu *vcpu)
if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg)))
return 1;
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
static u64 vmx_get_dr6(struct kvm_vcpu *vcpu)
@@ -5868,8 +6112,7 @@ static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
static int handle_cpuid(struct kvm_vcpu *vcpu)
{
- kvm_emulate_cpuid(vcpu);
- return 1;
+ return kvm_emulate_cpuid(vcpu);
}
static int handle_rdmsr(struct kvm_vcpu *vcpu)
@@ -5890,8 +6133,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu)
/* FIXME: handling of bits 32:63 of rax, rdx */
vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u;
vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u;
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
static int handle_wrmsr(struct kvm_vcpu *vcpu)
@@ -5911,8 +6153,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu)
}
trace_kvm_msr_write(ecx, data);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
@@ -5956,8 +6197,7 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)
unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
kvm_mmu_invlpg(vcpu, exit_qualification);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
static int handle_rdpmc(struct kvm_vcpu *vcpu)
@@ -5965,15 +6205,12 @@ static int handle_rdpmc(struct kvm_vcpu *vcpu)
int err;
err = kvm_rdpmc(vcpu);
- kvm_complete_insn_gp(vcpu, err);
-
- return 1;
+ return kvm_complete_insn_gp(vcpu, err);
}
static int handle_wbinvd(struct kvm_vcpu *vcpu)
{
- kvm_emulate_wbinvd(vcpu);
- return 1;
+ return kvm_emulate_wbinvd(vcpu);
}
static int handle_xsetbv(struct kvm_vcpu *vcpu)
@@ -5982,20 +6219,20 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
if (kvm_set_xcr(vcpu, index, new_bv) == 0)
- skip_emulated_instruction(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
return 1;
}
static int handle_xsaves(struct kvm_vcpu *vcpu)
{
- skip_emulated_instruction(vcpu);
+ kvm_skip_emulated_instruction(vcpu);
WARN(1, "this should never happen\n");
return 1;
}
static int handle_xrstors(struct kvm_vcpu *vcpu)
{
- skip_emulated_instruction(vcpu);
+ kvm_skip_emulated_instruction(vcpu);
WARN(1, "this should never happen\n");
return 1;
}
@@ -6016,8 +6253,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu)
if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
(offset == APIC_EOI)) {
kvm_lapic_set_eoi(vcpu);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
}
return emulate_instruction(vcpu, 0) == EMULATE_DONE;
@@ -6165,9 +6401,8 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
- skip_emulated_instruction(vcpu);
trace_kvm_fast_mmio(gpa);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
ret = handle_mmio_page_fault(vcpu, gpa, true);
@@ -6352,50 +6587,13 @@ static __init int hardware_setup(void)
for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
kvm_define_shared_msr(i, vmx_msr_index[i]);
- vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_io_bitmap_a)
- return r;
+ for (i = 0; i < VMX_BITMAP_NR; i++) {
+ vmx_bitmap[i] = (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_bitmap[i])
+ goto out;
+ }
vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_io_bitmap_b)
- goto out;
-
- vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_msr_bitmap_legacy)
- goto out1;
-
- vmx_msr_bitmap_legacy_x2apic =
- (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_msr_bitmap_legacy_x2apic)
- goto out2;
-
- vmx_msr_bitmap_legacy_x2apic_apicv_inactive =
- (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_msr_bitmap_legacy_x2apic_apicv_inactive)
- goto out3;
-
- vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_msr_bitmap_longmode)
- goto out4;
-
- vmx_msr_bitmap_longmode_x2apic =
- (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_msr_bitmap_longmode_x2apic)
- goto out5;
-
- vmx_msr_bitmap_longmode_x2apic_apicv_inactive =
- (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_msr_bitmap_longmode_x2apic_apicv_inactive)
- goto out6;
-
- vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_vmread_bitmap)
- goto out7;
-
- vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_vmwrite_bitmap)
- goto out8;
-
memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
@@ -6413,7 +6611,7 @@ static __init int hardware_setup(void)
if (setup_vmcs_config(&vmcs_config) < 0) {
r = -EIO;
- goto out9;
+ goto out;
}
if (boot_cpu_has(X86_FEATURE_NX))
@@ -6476,39 +6674,34 @@ static __init int hardware_setup(void)
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true);
- memcpy(vmx_msr_bitmap_legacy_x2apic,
+ memcpy(vmx_msr_bitmap_legacy_x2apic_apicv,
vmx_msr_bitmap_legacy, PAGE_SIZE);
- memcpy(vmx_msr_bitmap_longmode_x2apic,
+ memcpy(vmx_msr_bitmap_longmode_x2apic_apicv,
vmx_msr_bitmap_longmode, PAGE_SIZE);
- memcpy(vmx_msr_bitmap_legacy_x2apic_apicv_inactive,
+ memcpy(vmx_msr_bitmap_legacy_x2apic,
vmx_msr_bitmap_legacy, PAGE_SIZE);
- memcpy(vmx_msr_bitmap_longmode_x2apic_apicv_inactive,
+ memcpy(vmx_msr_bitmap_longmode_x2apic,
vmx_msr_bitmap_longmode, PAGE_SIZE);
set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
+ for (msr = 0x800; msr <= 0x8ff; msr++) {
+ if (msr == 0x839 /* TMCCT */)
+ continue;
+ vmx_disable_intercept_msr_x2apic(msr, MSR_TYPE_R, true);
+ }
+
/*
- * enable_apicv && kvm_vcpu_apicv_active()
+ * TPR reads and writes can be virtualized even if virtual interrupt
+ * delivery is not in use.
*/
- for (msr = 0x800; msr <= 0x8ff; msr++)
- vmx_disable_intercept_msr_read_x2apic(msr, true);
+ vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_W, true);
+ vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_R | MSR_TYPE_W, false);
- /* TMCCT */
- vmx_enable_intercept_msr_read_x2apic(0x839, true);
- /* TPR */
- vmx_disable_intercept_msr_write_x2apic(0x808, true);
/* EOI */
- vmx_disable_intercept_msr_write_x2apic(0x80b, true);
+ vmx_disable_intercept_msr_x2apic(0x80b, MSR_TYPE_W, true);
/* SELF-IPI */
- vmx_disable_intercept_msr_write_x2apic(0x83f, true);
-
- /*
- * (enable_apicv && !kvm_vcpu_apicv_active()) ||
- * !enable_apicv
- */
- /* TPR */
- vmx_disable_intercept_msr_read_x2apic(0x808, false);
- vmx_disable_intercept_msr_write_x2apic(0x808, false);
+ vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true);
if (enable_ept) {
kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
@@ -6555,42 +6748,19 @@ static __init int hardware_setup(void)
return alloc_kvm_area();
-out9:
- free_page((unsigned long)vmx_vmwrite_bitmap);
-out8:
- free_page((unsigned long)vmx_vmread_bitmap);
-out7:
- free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive);
-out6:
- free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
-out5:
- free_page((unsigned long)vmx_msr_bitmap_longmode);
-out4:
- free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive);
-out3:
- free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
-out2:
- free_page((unsigned long)vmx_msr_bitmap_legacy);
-out1:
- free_page((unsigned long)vmx_io_bitmap_b);
out:
- free_page((unsigned long)vmx_io_bitmap_a);
+ for (i = 0; i < VMX_BITMAP_NR; i++)
+ free_page((unsigned long)vmx_bitmap[i]);
return r;
}
static __exit void hardware_unsetup(void)
{
- free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
- free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive);
- free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
- free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive);
- free_page((unsigned long)vmx_msr_bitmap_legacy);
- free_page((unsigned long)vmx_msr_bitmap_longmode);
- free_page((unsigned long)vmx_io_bitmap_b);
- free_page((unsigned long)vmx_io_bitmap_a);
- free_page((unsigned long)vmx_vmwrite_bitmap);
- free_page((unsigned long)vmx_vmread_bitmap);
+ int i;
+
+ for (i = 0; i < VMX_BITMAP_NR; i++)
+ free_page((unsigned long)vmx_bitmap[i]);
free_kvm_area();
}
@@ -6604,16 +6774,13 @@ static int handle_pause(struct kvm_vcpu *vcpu)
if (ple_gap)
grow_ple_window(vcpu);
- skip_emulated_instruction(vcpu);
kvm_vcpu_on_spin(vcpu);
-
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
static int handle_nop(struct kvm_vcpu *vcpu)
{
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
static int handle_mwait(struct kvm_vcpu *vcpu)
@@ -6920,8 +7087,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
*/
if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
nested_vmx_failInvalid(vcpu);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
page = nested_get_page(vcpu, vmptr);
@@ -6929,8 +7095,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
*(u32 *)kmap(page) != VMCS12_REVISION) {
nested_vmx_failInvalid(vcpu);
kunmap(page);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
kunmap(page);
vmx->nested.vmxon_ptr = vmptr;
@@ -6939,30 +7104,26 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
nested_vmx_failValid(vcpu,
VMXERR_VMCLEAR_INVALID_ADDRESS);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
if (vmptr == vmx->nested.vmxon_ptr) {
nested_vmx_failValid(vcpu,
VMXERR_VMCLEAR_VMXON_POINTER);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
break;
case EXIT_REASON_VMPTRLD:
if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
nested_vmx_failValid(vcpu,
VMXERR_VMPTRLD_INVALID_ADDRESS);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
if (vmptr == vmx->nested.vmxon_ptr) {
nested_vmx_failValid(vcpu,
- VMXERR_VMCLEAR_VMXON_POINTER);
- skip_emulated_instruction(vcpu);
- return 1;
+ VMXERR_VMPTRLD_VMXON_POINTER);
+ return kvm_skip_emulated_instruction(vcpu);
}
break;
default:
@@ -7018,8 +7179,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
if (vmx->nested.vmxon) {
nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
@@ -7059,9 +7219,8 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
vmx->nested.vmxon = true;
- skip_emulated_instruction(vcpu);
nested_vmx_succeed(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
out_shadow_vmcs:
kfree(vmx->nested.cached_vmcs12);
@@ -7180,9 +7339,8 @@ static int handle_vmoff(struct kvm_vcpu *vcpu)
if (!nested_vmx_check_permission(vcpu))
return 1;
free_nested(to_vmx(vcpu));
- skip_emulated_instruction(vcpu);
nested_vmx_succeed(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
/* Emulate the VMCLEAR instruction */
@@ -7221,9 +7379,8 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
nested_free_vmcs02(vmx, vmptr);
- skip_emulated_instruction(vcpu);
nested_vmx_succeed(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
@@ -7421,7 +7578,6 @@ static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
if (vmx->nested.current_vmptr == -1ull) {
nested_vmx_failInvalid(vcpu);
- skip_emulated_instruction(vcpu);
return 0;
}
return 1;
@@ -7435,17 +7591,18 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
gva_t gva = 0;
- if (!nested_vmx_check_permission(vcpu) ||
- !nested_vmx_check_vmcs12(vcpu))
+ if (!nested_vmx_check_permission(vcpu))
return 1;
+ if (!nested_vmx_check_vmcs12(vcpu))
+ return kvm_skip_emulated_instruction(vcpu);
+
/* Decode instruction info and find the field to read */
field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
/* Read the field, zero-extended to a u64 field_value */
if (vmcs12_read_any(vcpu, field, &field_value) < 0) {
nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
/*
* Now copy part of this value to register or memory, as requested.
@@ -7465,8 +7622,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
}
nested_vmx_succeed(vcpu);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
@@ -7485,10 +7641,12 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
u64 field_value = 0;
struct x86_exception e;
- if (!nested_vmx_check_permission(vcpu) ||
- !nested_vmx_check_vmcs12(vcpu))
+ if (!nested_vmx_check_permission(vcpu))
return 1;
+ if (!nested_vmx_check_vmcs12(vcpu))
+ return kvm_skip_emulated_instruction(vcpu);
+
if (vmx_instruction_info & (1u << 10))
field_value = kvm_register_readl(vcpu,
(((vmx_instruction_info) >> 3) & 0xf));
@@ -7508,19 +7666,16 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
if (vmcs_field_readonly(field)) {
nested_vmx_failValid(vcpu,
VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
if (vmcs12_write_any(vcpu, field, field_value) < 0) {
nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
nested_vmx_succeed(vcpu);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
/* Emulate the VMPTRLD instruction */
@@ -7541,8 +7696,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
page = nested_get_page(vcpu, vmptr);
if (page == NULL) {
nested_vmx_failInvalid(vcpu);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
new_vmcs12 = kmap(page);
if (new_vmcs12->revision_id != VMCS12_REVISION) {
@@ -7550,8 +7704,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
nested_release_page_clean(page);
nested_vmx_failValid(vcpu,
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
nested_release_vmcs12(vmx);
@@ -7575,8 +7728,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
}
nested_vmx_succeed(vcpu);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
/* Emulate the VMPTRST instruction */
@@ -7601,8 +7753,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
return 1;
}
nested_vmx_succeed(vcpu);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
/* Emulate the INVEPT instruction */
@@ -7640,8 +7791,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
if (type >= 32 || !(types & (1 << type))) {
nested_vmx_failValid(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
/* According to the Intel VMX instruction reference, the memory
@@ -7672,8 +7822,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
break;
}
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
static int handle_invvpid(struct kvm_vcpu *vcpu)
@@ -7698,13 +7847,13 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
- types = (vmx->nested.nested_vmx_vpid_caps >> 8) & 0x7;
+ types = (vmx->nested.nested_vmx_vpid_caps &
+ VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
if (type >= 32 || !(types & (1 << type))) {
nested_vmx_failValid(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
- skip_emulated_instruction(vcpu);
- return 1;
+ return kvm_skip_emulated_instruction(vcpu);
}
/* according to the intel vmx instruction reference, the memory
@@ -7720,23 +7869,26 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
}
switch (type) {
+ case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
case VMX_VPID_EXTENT_SINGLE_CONTEXT:
- /*
- * Old versions of KVM use the single-context version so we
- * have to support it; just treat it the same as all-context.
- */
+ case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
+ if (!vpid) {
+ nested_vmx_failValid(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ break;
case VMX_VPID_EXTENT_ALL_CONTEXT:
- __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02);
- nested_vmx_succeed(vcpu);
break;
default:
- /* Trap individual address invalidation invvpid calls */
- BUG_ON(1);
- break;
+ WARN_ON_ONCE(1);
+ return kvm_skip_emulated_instruction(vcpu);
}
- skip_emulated_instruction(vcpu);
- return 1;
+ __vmx_flush_tlb(vcpu, vmx->nested.vpid02);
+ nested_vmx_succeed(vcpu);
+
+ return kvm_skip_emulated_instruction(vcpu);
}
static int handle_pml_full(struct kvm_vcpu *vcpu)
@@ -8018,7 +8170,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
switch (exit_reason) {
case EXIT_REASON_EXCEPTION_NMI:
- if (!is_exception(intr_info))
+ if (is_nmi(intr_info))
return false;
else if (is_page_fault(intr_info))
return enable_ept;
@@ -8075,6 +8227,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
case EXIT_REASON_IO_INSTRUCTION:
return nested_vmx_exit_handled_io(vcpu, vmcs12);
+ case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
case EXIT_REASON_MSR_READ:
case EXIT_REASON_MSR_WRITE:
return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
@@ -8611,8 +8765,7 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
kvm_machine_check();
/* We need to handle NMIs before interrupts are enabled */
- if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
- (exit_intr_info & INTR_INFO_VALID_MASK)) {
+ if (is_nmi(exit_intr_info)) {
kvm_before_handle_nmi(&vmx->vcpu);
asm("int $2");
kvm_after_handle_nmi(&vmx->vcpu);
@@ -8624,11 +8777,6 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
register void *__sp asm(_ASM_SP);
- /*
- * If external interrupt exists, IF bit is set in rflags/eflags on the
- * interrupt stack frame, and interrupt will be enabled on a return
- * from interrupt handler.
- */
if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
== (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) {
unsigned int vector;
@@ -8813,7 +8961,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
msrs[i].host);
}
-void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
+static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u64 tscl;
@@ -9283,6 +9431,50 @@ static void vmcs_set_secondary_exec_control(u32 new_ctl)
(new_ctl & ~mask) | (cur_ctl & mask));
}
+/*
+ * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
+ * (indicating "allowed-1") if they are supported in the guest's CPUID.
+ */
+static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct kvm_cpuid_entry2 *entry;
+
+ vmx->nested.nested_vmx_cr0_fixed1 = 0xffffffff;
+ vmx->nested.nested_vmx_cr4_fixed1 = X86_CR4_PCE;
+
+#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \
+ if (entry && (entry->_reg & (_cpuid_mask))) \
+ vmx->nested.nested_vmx_cr4_fixed1 |= (_cr4_mask); \
+} while (0)
+
+ entry = kvm_find_cpuid_entry(vcpu, 0x1, 0);
+ cr4_fixed1_update(X86_CR4_VME, edx, bit(X86_FEATURE_VME));
+ cr4_fixed1_update(X86_CR4_PVI, edx, bit(X86_FEATURE_VME));
+ cr4_fixed1_update(X86_CR4_TSD, edx, bit(X86_FEATURE_TSC));
+ cr4_fixed1_update(X86_CR4_DE, edx, bit(X86_FEATURE_DE));
+ cr4_fixed1_update(X86_CR4_PSE, edx, bit(X86_FEATURE_PSE));
+ cr4_fixed1_update(X86_CR4_PAE, edx, bit(X86_FEATURE_PAE));
+ cr4_fixed1_update(X86_CR4_MCE, edx, bit(X86_FEATURE_MCE));
+ cr4_fixed1_update(X86_CR4_PGE, edx, bit(X86_FEATURE_PGE));
+ cr4_fixed1_update(X86_CR4_OSFXSR, edx, bit(X86_FEATURE_FXSR));
+ cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM));
+ cr4_fixed1_update(X86_CR4_VMXE, ecx, bit(X86_FEATURE_VMX));
+ cr4_fixed1_update(X86_CR4_SMXE, ecx, bit(X86_FEATURE_SMX));
+ cr4_fixed1_update(X86_CR4_PCIDE, ecx, bit(X86_FEATURE_PCID));
+ cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, bit(X86_FEATURE_XSAVE));
+
+ entry = kvm_find_cpuid_entry(vcpu, 0x7, 0);
+ cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, bit(X86_FEATURE_FSGSBASE));
+ cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP));
+ cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP));
+ cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU));
+ /* TODO: Use X86_CR4_UMIP and X86_FEATURE_UMIP macros */
+ cr4_fixed1_update(bit(11), ecx, bit(2));
+
+#undef cr4_fixed1_update
+}
+
static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
@@ -9324,6 +9516,9 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
else
to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+
+ if (nested_vmx_allowed(vcpu))
+ nested_vmx_cr_fixed1_bits_update(vcpu);
}
static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -9778,6 +9973,49 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
return 0;
}
+static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
+{
+ unsigned long invalid_mask;
+
+ invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
+ return (val & invalid_mask) == 0;
+}
+
+/*
+ * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are
+ * emulating VM entry into a guest with EPT enabled.
+ * Returns 0 on success, 1 on failure. Invalid state exit qualification code
+ * is assigned to entry_failure_code on failure.
+ */
+static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
+ unsigned long *entry_failure_code)
+{
+ if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
+ if (!nested_cr3_valid(vcpu, cr3)) {
+ *entry_failure_code = ENTRY_FAIL_DEFAULT;
+ return 1;
+ }
+
+ /*
+ * If PAE paging and EPT are both on, CR3 is not used by the CPU and
+ * must not be dereferenced.
+ */
+ if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu) &&
+ !nested_ept) {
+ if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) {
+ *entry_failure_code = ENTRY_FAIL_PDPTE;
+ return 1;
+ }
+ }
+
+ vcpu->arch.cr3 = cr3;
+ __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+ }
+
+ kvm_mmu_reset_context(vcpu);
+ return 0;
+}
+
/*
* prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
* L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
@@ -9786,11 +10024,15 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
* needs. In addition to modifying the active vmcs (which is vmcs02), this
* function also has additional necessary side-effects, like setting various
* vcpu->arch fields.
+ * Returns 0 on success, 1 on failure. Invalid state exit qualification code
+ * is assigned to entry_failure_code on failure.
*/
-static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+ unsigned long *entry_failure_code)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 exec_control;
+ bool nested_ept_enabled = false;
vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
@@ -9955,6 +10197,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs12->guest_intr_status);
}
+ nested_ept_enabled = (exec_control & SECONDARY_EXEC_ENABLE_EPT) != 0;
vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
}
@@ -9968,6 +10211,15 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmx_set_constant_host_state(vmx);
/*
+ * Set the MSR load/store lists to match L0's settings.
+ */
+ vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
+ vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
+ vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
+
+ /*
* HOST_RSP is normally set correctly in vmx_vcpu_run() just before
* entry, but only if the current (host) sp changed from the value
* we wrote last (vmx->host_rsp). This cache is no longer relevant
@@ -10073,15 +10325,6 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
nested_ept_init_mmu_context(vcpu);
}
- if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
- vcpu->arch.efer = vmcs12->guest_ia32_efer;
- else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
- vcpu->arch.efer |= (EFER_LMA | EFER_LME);
- else
- vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
- /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
- vmx_set_efer(vcpu, vcpu->arch.efer);
-
/*
* This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified
* TS bit (for lazy fpu) and bits which we consider mandatory enabled.
@@ -10096,8 +10339,20 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmx_set_cr4(vcpu, vmcs12->guest_cr4);
vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
- /* shadow page tables on either EPT or shadow page tables */
- kvm_set_cr3(vcpu, vmcs12->guest_cr3);
+ if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
+ vcpu->arch.efer = vmcs12->guest_ia32_efer;
+ else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
+ vcpu->arch.efer |= (EFER_LMA | EFER_LME);
+ else
+ vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
+ /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
+ vmx_set_efer(vcpu, vcpu->arch.efer);
+
+ /* Shadow page tables on either EPT or shadow page tables. */
+ if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_ept_enabled,
+ entry_failure_code))
+ return 1;
+
kvm_mmu_reset_context(vcpu);
if (!enable_ept)
@@ -10115,6 +10370,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
+ return 0;
}
/*
@@ -10129,12 +10385,14 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
struct loaded_vmcs *vmcs02;
bool ia32e;
u32 msr_entry_idx;
+ unsigned long exit_qualification;
- if (!nested_vmx_check_permission(vcpu) ||
- !nested_vmx_check_vmcs12(vcpu))
+ if (!nested_vmx_check_permission(vcpu))
return 1;
- skip_emulated_instruction(vcpu);
+ if (!nested_vmx_check_vmcs12(vcpu))
+ goto out;
+
vmcs12 = get_vmcs12(vcpu);
if (enable_shadow_vmcs)
@@ -10154,37 +10412,37 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
nested_vmx_failValid(vcpu,
launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
: VMXERR_VMRESUME_NONLAUNCHED_VMCS);
- return 1;
+ goto out;
}
if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) {
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- return 1;
+ goto out;
}
if (!nested_get_vmcs12_pages(vcpu, vmcs12)) {
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- return 1;
+ goto out;
}
if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) {
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- return 1;
+ goto out;
}
if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) {
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- return 1;
+ goto out;
}
if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) {
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- return 1;
+ goto out;
}
if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
- vmx->nested.nested_vmx_true_procbased_ctls_low,
+ vmx->nested.nested_vmx_procbased_ctls_low,
vmx->nested.nested_vmx_procbased_ctls_high) ||
!vmx_control_verify(vmcs12->secondary_vm_exec_control,
vmx->nested.nested_vmx_secondary_ctls_low,
@@ -10193,33 +10451,34 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
vmx->nested.nested_vmx_pinbased_ctls_low,
vmx->nested.nested_vmx_pinbased_ctls_high) ||
!vmx_control_verify(vmcs12->vm_exit_controls,
- vmx->nested.nested_vmx_true_exit_ctls_low,
+ vmx->nested.nested_vmx_exit_ctls_low,
vmx->nested.nested_vmx_exit_ctls_high) ||
!vmx_control_verify(vmcs12->vm_entry_controls,
- vmx->nested.nested_vmx_true_entry_ctls_low,
+ vmx->nested.nested_vmx_entry_ctls_low,
vmx->nested.nested_vmx_entry_ctls_high))
{
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- return 1;
+ goto out;
}
- if (((vmcs12->host_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) ||
- ((vmcs12->host_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
+ if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) ||
+ !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) ||
+ !nested_cr3_valid(vcpu, vmcs12->host_cr3)) {
nested_vmx_failValid(vcpu,
VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
- return 1;
+ goto out;
}
- if (!nested_cr0_valid(vcpu, vmcs12->guest_cr0) ||
- ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
+ if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) ||
+ !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) {
nested_vmx_entry_failure(vcpu, vmcs12,
EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
- return 1;
+ goto out;
}
if (vmcs12->vmcs_link_pointer != -1ull) {
nested_vmx_entry_failure(vcpu, vmcs12,
EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR);
- return 1;
+ goto out;
}
/*
@@ -10239,7 +10498,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) {
nested_vmx_entry_failure(vcpu, vmcs12,
EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
- return 1;
+ goto out;
}
}
@@ -10257,7 +10516,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) {
nested_vmx_entry_failure(vcpu, vmcs12,
EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
- return 1;
+ goto out;
}
}
@@ -10270,6 +10529,12 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
if (!vmcs02)
return -ENOMEM;
+ /*
+ * After this point, the trap flag no longer triggers a singlestep trap
+ * on the vm entry instructions. Don't call
+ * kvm_skip_emulated_instruction.
+ */
+ skip_emulated_instruction(vcpu);
enter_guest_mode(vcpu);
if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
@@ -10284,7 +10549,13 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
vmx_segment_cache_clear(vmx);
- prepare_vmcs02(vcpu, vmcs12);
+ if (prepare_vmcs02(vcpu, vmcs12, &exit_qualification)) {
+ leave_guest_mode(vcpu);
+ vmx_load_vmcs01(vcpu);
+ nested_vmx_entry_failure(vcpu, vmcs12,
+ EXIT_REASON_INVALID_STATE, exit_qualification);
+ return 1;
+ }
msr_entry_idx = nested_vmx_load_msr(vcpu,
vmcs12->vm_entry_msr_load_addr,
@@ -10311,6 +10582,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
* the success flag) when L2 exits (see nested_vmx_vmexit()).
*/
return 1;
+
+out:
+ return kvm_skip_emulated_instruction(vcpu);
}
/*
@@ -10616,6 +10890,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
struct kvm_segment seg;
+ unsigned long entry_failure_code;
if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
vcpu->arch.efer = vmcs12->host_ia32_efer;
@@ -10653,8 +10928,12 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
nested_ept_uninit_mmu_context(vcpu);
- kvm_set_cr3(vcpu, vmcs12->host_cr3);
- kvm_mmu_reset_context(vcpu);
+ /*
+ * Only PDPTE load can fail as the value of cr3 was checked on entry and
+ * couldn't have changed.
+ */
+ if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
+ nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
if (!enable_ept)
vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
@@ -10755,6 +11034,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ u32 vm_inst_error = 0;
/* trying to cancel vmlaunch/vmresume is a bug */
WARN_ON_ONCE(vmx->nested.nested_run_pending);
@@ -10767,6 +11047,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
vmcs12->vm_exit_msr_store_count))
nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
+ if (unlikely(vmx->fail))
+ vm_inst_error = vmcs_read32(VM_INSTRUCTION_ERROR);
+
vmx_load_vmcs01(vcpu);
if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
@@ -10795,6 +11078,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
load_vmcs12_host_state(vcpu, vmcs12);
/* Update any VMCS fields that might have changed while L2 ran */
+ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
+ vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
if (vmx->hv_deadline_tsc == -1)
vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
@@ -10843,7 +11128,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
*/
if (unlikely(vmx->fail)) {
vmx->fail = 0;
- nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR));
+ nested_vmx_failValid(vcpu, vm_inst_error);
} else
nested_vmx_succeed(vcpu);
if (enable_shadow_vmcs)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 04c5d96b1d67..445c51b6cf6d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -434,12 +434,14 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
}
EXPORT_SYMBOL_GPL(kvm_requeue_exception);
-void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
+int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
{
if (err)
kvm_inject_gp(vcpu, 0);
else
- kvm_x86_ops->skip_emulated_instruction(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+
+ return 1;
}
EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
@@ -573,7 +575,7 @@ out:
}
EXPORT_SYMBOL_GPL(load_pdptrs);
-static bool pdptrs_changed(struct kvm_vcpu *vcpu)
+bool pdptrs_changed(struct kvm_vcpu *vcpu)
{
u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
bool changed = true;
@@ -599,6 +601,7 @@ out:
return changed;
}
+EXPORT_SYMBOL_GPL(pdptrs_changed);
int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
@@ -2071,6 +2074,8 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
&vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
return;
+ vcpu->arch.st.steal.preempted = 0;
+
if (vcpu->arch.st.steal.version & 1)
vcpu->arch.st.steal.version += 1; /* first time write, random junk */
@@ -2176,7 +2181,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_KVM_SYSTEM_TIME_NEW:
case MSR_KVM_SYSTEM_TIME: {
- u64 gpa_offset;
struct kvm_arch *ka = &vcpu->kvm->arch;
kvmclock_reset(vcpu);
@@ -2198,8 +2202,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!(data & 1))
break;
- gpa_offset = data & ~(PAGE_MASK | 1);
-
if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
&vcpu->arch.pv_time, data & ~1ULL,
sizeof(struct pvclock_vcpu_time_info)))
@@ -2294,7 +2296,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (kvm_pmu_is_valid_msr(vcpu, msr))
return kvm_pmu_set_msr(vcpu, msr_info);
if (!ignore_msrs) {
- vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n",
+ vcpu_debug_ratelimited(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n",
msr, data);
return 1;
} else {
@@ -2506,7 +2508,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
if (!ignore_msrs) {
- vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr_info->index);
+ vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n",
+ msr_info->index);
return 1;
} else {
vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr_info->index);
@@ -2810,7 +2813,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
}
if (kvm_lapic_hv_timer_in_use(vcpu) &&
kvm_x86_ops->set_hv_timer(vcpu,
- kvm_get_lapic_tscdeadline_msr(vcpu)))
+ kvm_get_lapic_target_expiration_tsc(vcpu)))
kvm_lapic_switch_to_sw_timer(vcpu);
/*
* On a host with synchronized TSC, there is no need to update
@@ -2826,8 +2829,39 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
}
+static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
+{
+ if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+ return;
+
+ vcpu->arch.st.steal.preempted = 1;
+
+ kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime,
+ &vcpu->arch.st.steal.preempted,
+ offsetof(struct kvm_steal_time, preempted),
+ sizeof(vcpu->arch.st.steal.preempted));
+}
+
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
+ int idx;
+ /*
+ * Disable page faults because we're in atomic context here.
+ * kvm_write_guest_offset_cached() would call might_fault()
+ * that relies on pagefault_disable() to tell if there's a
+ * bug. NOTE: the write to guest memory may not go through if
+ * during postcopy live migration or if there's heavy guest
+ * paging.
+ */
+ pagefault_disable();
+ /*
+ * kvm_memslots() will be called by
+ * kvm_write_guest_offset_cached() so take the srcu lock.
+ */
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_steal_time_set_preempted(vcpu);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ pagefault_enable();
kvm_x86_ops->vcpu_put(vcpu);
kvm_put_guest_fpu(vcpu);
vcpu->arch.last_host_tsc = rdtsc();
@@ -4816,7 +4850,7 @@ static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
}
-int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
+static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
{
if (!need_emulate_wbinvd(vcpu))
return X86EMUL_CONTINUE;
@@ -4836,8 +4870,8 @@ int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
{
- kvm_x86_ops->skip_emulated_instruction(vcpu);
- return kvm_emulate_wbinvd_noskip(vcpu);
+ kvm_emulate_wbinvd_noskip(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
@@ -5081,11 +5115,6 @@ static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt)
{
preempt_disable();
kvm_load_guest_fpu(emul_to_vcpu(ctxt));
- /*
- * CR0.TS may reference the host fpu state, not the guest fpu state,
- * so it may be clear at this point.
- */
- clts();
}
static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt)
@@ -5440,7 +5469,6 @@ static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflag
kvm_run->exit_reason = KVM_EXIT_DEBUG;
*r = EMULATE_USER_EXIT;
} else {
- vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF;
/*
* "Certain debug exceptions may clear bit 0-3. The
* remaining contents of the DR6 register are never
@@ -5453,6 +5481,17 @@ static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflag
}
}
+int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+ unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
+ int r = EMULATE_DONE;
+
+ kvm_x86_ops->skip_emulated_instruction(vcpu);
+ kvm_vcpu_check_singlestep(vcpu, rflags, &r);
+ return r == EMULATE_DONE;
+}
+EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
+
static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
{
if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
@@ -5638,6 +5677,49 @@ int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
}
EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
+static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
+{
+ unsigned long val;
+
+ /* We should only ever be called with arch.pio.count equal to 1 */
+ BUG_ON(vcpu->arch.pio.count != 1);
+
+ /* For size less than 4 we merge, else we zero extend */
+ val = (vcpu->arch.pio.size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX)
+ : 0;
+
+ /*
+ * Since vcpu->arch.pio.count == 1 let emulator_pio_in_emulated perform
+ * the copy and tracing
+ */
+ emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, vcpu->arch.pio.size,
+ vcpu->arch.pio.port, &val, 1);
+ kvm_register_write(vcpu, VCPU_REGS_RAX, val);
+
+ return 1;
+}
+
+int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, unsigned short port)
+{
+ unsigned long val;
+ int ret;
+
+ /* For size less than 4 we merge, else we zero extend */
+ val = (size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX) : 0;
+
+ ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size, port,
+ &val, 1);
+ if (ret) {
+ kvm_register_write(vcpu, VCPU_REGS_RAX, val);
+ return ret;
+ }
+
+ vcpu->arch.complete_userspace_io = complete_fast_pio_in;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_fast_pio_in);
+
static int kvmclock_cpu_down_prep(unsigned int cpu)
{
__this_cpu_write(cpu_tsc_khz, 0);
@@ -5987,8 +6069,12 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
int kvm_emulate_halt(struct kvm_vcpu *vcpu)
{
- kvm_x86_ops->skip_emulated_instruction(vcpu);
- return kvm_vcpu_halt(vcpu);
+ int ret = kvm_skip_emulated_instruction(vcpu);
+ /*
+ * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered
+ * KVM_EXIT_DEBUG here.
+ */
+ return kvm_vcpu_halt(vcpu) && ret;
}
EXPORT_SYMBOL_GPL(kvm_emulate_halt);
@@ -6019,9 +6105,9 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
{
unsigned long nr, a0, a1, a2, a3, ret;
- int op_64_bit, r = 1;
+ int op_64_bit, r;
- kvm_x86_ops->skip_emulated_instruction(vcpu);
+ r = kvm_skip_emulated_instruction(vcpu);
if (kvm_hv_hypercall_enabled(vcpu->kvm))
return kvm_hv_hypercall(vcpu);
@@ -7407,25 +7493,13 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
- if (!vcpu->guest_fpu_loaded) {
- vcpu->fpu_counter = 0;
+ if (!vcpu->guest_fpu_loaded)
return;
- }
vcpu->guest_fpu_loaded = 0;
copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu);
__kernel_fpu_end();
++vcpu->stat.fpu_reload;
- /*
- * If using eager FPU mode, or if the guest is a frequent user
- * of the FPU, just leave the FPU active for next time.
- * Every 255 times fpu_counter rolls over to 0; a guest that uses
- * the FPU in bursts will revert to loading it on demand.
- */
- if (!use_eager_fpu()) {
- if (++vcpu->fpu_counter < 5)
- kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
- }
trace_kvm_fpu(0);
}
@@ -7824,6 +7898,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
raw_spin_lock_init(&kvm->arch.tsc_write_lock);
mutex_init(&kvm->arch.apic_map_lock);
+ mutex_init(&kvm->arch.hyperv.hv_lock);
spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
@@ -8176,7 +8251,7 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm)
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot)
{
- kvm_mmu_invalidate_zap_all_pages(kvm);
+ kvm_page_track_flush_slot(kvm, slot);
}
static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 25da5bc8d83d..4ca0d78adcf0 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -497,38 +497,24 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
* a whole series of functions like read_cr0() and write_cr0().
*
* We start with cr0. cr0 allows you to turn on and off all kinds of basic
- * features, but Linux only really cares about one: the horrifically-named Task
- * Switched (TS) bit at bit 3 (ie. 8)
+ * features, but the only cr0 bit that Linux ever used at runtime was the
+ * horrifically-named Task Switched (TS) bit at bit 3 (ie. 8)
*
* What does the TS bit do? Well, it causes the CPU to trap (interrupt 7) if
* the floating point unit is used. Which allows us to restore FPU state
- * lazily after a task switch, and Linux uses that gratefully, but wouldn't a
- * name like "FPUTRAP bit" be a little less cryptic?
+ * lazily after a task switch if we wanted to, but wouldn't a name like
+ * "FPUTRAP bit" be a little less cryptic?
*
- * We store cr0 locally because the Host never changes it. The Guest sometimes
- * wants to read it and we'd prefer not to bother the Host unnecessarily.
+ * Fortunately, Linux keeps it simple and doesn't use TS, so we can ignore
+ * cr0.
*/
-static unsigned long current_cr0;
static void lguest_write_cr0(unsigned long val)
{
- lazy_hcall1(LHCALL_TS, val & X86_CR0_TS);
- current_cr0 = val;
}
static unsigned long lguest_read_cr0(void)
{
- return current_cr0;
-}
-
-/*
- * Intel provided a special instruction to clear the TS bit for people too cool
- * to use write_cr0() to do it. This "clts" instruction is faster, because all
- * the vowels have been optimized out.
- */
-static void lguest_clts(void)
-{
- lazy_hcall1(LHCALL_TS, 0);
- current_cr0 &= ~X86_CR0_TS;
+ return 0;
}
/*
@@ -1432,7 +1418,6 @@ __init void lguest_init(void)
pv_cpu_ops.load_tls = lguest_load_tls;
pv_cpu_ops.get_debugreg = lguest_get_debugreg;
pv_cpu_ops.set_debugreg = lguest_set_debugreg;
- pv_cpu_ops.clts = lguest_clts;
pv_cpu_ops.read_cr0 = lguest_read_cr0;
pv_cpu_ops.write_cr0 = lguest_write_cr0;
pv_cpu_ops.read_cr4 = lguest_read_cr4;
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index d376e4b48f88..c5959576c315 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -16,53 +16,6 @@
#include <asm/smap.h>
#include <asm/export.h>
-/* Standard copy_to_user with segment limit checking */
-ENTRY(_copy_to_user)
- mov PER_CPU_VAR(current_task), %rax
- movq %rdi,%rcx
- addq %rdx,%rcx
- jc bad_to_user
- cmpq TASK_addr_limit(%rax),%rcx
- ja bad_to_user
- ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \
- "jmp copy_user_generic_string", \
- X86_FEATURE_REP_GOOD, \
- "jmp copy_user_enhanced_fast_string", \
- X86_FEATURE_ERMS
-ENDPROC(_copy_to_user)
-EXPORT_SYMBOL(_copy_to_user)
-
-/* Standard copy_from_user with segment limit checking */
-ENTRY(_copy_from_user)
- mov PER_CPU_VAR(current_task), %rax
- movq %rsi,%rcx
- addq %rdx,%rcx
- jc bad_from_user
- cmpq TASK_addr_limit(%rax),%rcx
- ja bad_from_user
- ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \
- "jmp copy_user_generic_string", \
- X86_FEATURE_REP_GOOD, \
- "jmp copy_user_enhanced_fast_string", \
- X86_FEATURE_ERMS
-ENDPROC(_copy_from_user)
-EXPORT_SYMBOL(_copy_from_user)
-
-
- .section .fixup,"ax"
- /* must zero dest */
-ENTRY(bad_from_user)
-bad_from_user:
- movl %edx,%ecx
- xorl %eax,%eax
- rep
- stosb
-bad_to_user:
- movl %edx,%eax
- ret
-ENDPROC(bad_from_user)
- .previous
-
/*
* copy_user_generic_unrolled - memory copy with exception handling.
* This version is for CPUs like P4 that don't have efficient micro
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index d1dee753b949..07764255b611 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -113,14 +113,14 @@ int msr_clear_bit(u32 msr, u8 bit)
}
#ifdef CONFIG_TRACEPOINTS
-void do_trace_write_msr(unsigned msr, u64 val, int failed)
+void do_trace_write_msr(unsigned int msr, u64 val, int failed)
{
trace_write_msr(msr, val, failed);
}
EXPORT_SYMBOL(do_trace_write_msr);
EXPORT_TRACEPOINT_SYMBOL(write_msr);
-void do_trace_read_msr(unsigned msr, u64 val, int failed)
+void do_trace_read_msr(unsigned int msr, u64 val, int failed)
{
trace_read_msr(msr, val, failed);
}
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
index b4908789484e..c074799bddae 100644
--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -34,3 +34,52 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
return ret;
}
EXPORT_SYMBOL_GPL(copy_from_user_nmi);
+
+/**
+ * copy_to_user: - Copy a block of data into user space.
+ * @to: Destination address, in user space.
+ * @from: Source address, in kernel space.
+ * @n: Number of bytes to copy.
+ *
+ * Context: User context only. This function may sleep if pagefaults are
+ * enabled.
+ *
+ * Copy data from kernel space to user space.
+ *
+ * Returns number of bytes that could not be copied.
+ * On success, this will be zero.
+ */
+unsigned long _copy_to_user(void __user *to, const void *from, unsigned n)
+{
+ if (access_ok(VERIFY_WRITE, to, n))
+ n = __copy_to_user(to, from, n);
+ return n;
+}
+EXPORT_SYMBOL(_copy_to_user);
+
+/**
+ * copy_from_user: - Copy a block of data from user space.
+ * @to: Destination address, in kernel space.
+ * @from: Source address, in user space.
+ * @n: Number of bytes to copy.
+ *
+ * Context: User context only. This function may sleep if pagefaults are
+ * enabled.
+ *
+ * Copy data from user space to kernel space.
+ *
+ * Returns number of bytes that could not be copied.
+ * On success, this will be zero.
+ *
+ * If some data could not be copied, this function will pad the copied
+ * data to the requested size using zero bytes.
+ */
+unsigned long _copy_from_user(void *to, const void __user *from, unsigned n)
+{
+ if (access_ok(VERIFY_READ, from, n))
+ n = __copy_from_user(to, from, n);
+ else
+ memset(to, 0, n);
+ return n;
+}
+EXPORT_SYMBOL(_copy_from_user);
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 3bc7baf2a711..0b281217c890 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -640,52 +640,3 @@ unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *fr
return n;
}
EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero);
-
-/**
- * copy_to_user: - Copy a block of data into user space.
- * @to: Destination address, in user space.
- * @from: Source address, in kernel space.
- * @n: Number of bytes to copy.
- *
- * Context: User context only. This function may sleep if pagefaults are
- * enabled.
- *
- * Copy data from kernel space to user space.
- *
- * Returns number of bytes that could not be copied.
- * On success, this will be zero.
- */
-unsigned long _copy_to_user(void __user *to, const void *from, unsigned n)
-{
- if (access_ok(VERIFY_WRITE, to, n))
- n = __copy_to_user(to, from, n);
- return n;
-}
-EXPORT_SYMBOL(_copy_to_user);
-
-/**
- * copy_from_user: - Copy a block of data from user space.
- * @to: Destination address, in kernel space.
- * @from: Source address, in user space.
- * @n: Number of bytes to copy.
- *
- * Context: User context only. This function may sleep if pagefaults are
- * enabled.
- *
- * Copy data from user space to kernel space.
- *
- * Returns number of bytes that could not be copied.
- * On success, this will be zero.
- *
- * If some data could not be copied, this function will pad the copied
- * data to the requested size using zero bytes.
- */
-unsigned long _copy_from_user(void *to, const void __user *from, unsigned n)
-{
- if (access_ok(VERIFY_READ, from, n))
- n = __copy_from_user(to, from, n);
- else
- memset(to, 0, n);
- return n;
-}
-EXPORT_SYMBOL(_copy_from_user);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 9f72ca3b2669..e3254ca0eec4 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -413,7 +413,7 @@ out:
void vmalloc_sync_all(void)
{
- sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END, 0);
+ sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
}
/*
@@ -679,8 +679,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
printk(KERN_CONT "paging request");
printk(KERN_CONT " at %p\n", (void *) address);
- printk(KERN_ALERT "IP:");
- printk_address(regs->ip);
+ printk(KERN_ALERT "IP: %pS\n", (void *)regs->ip);
dump_pagetable(address);
}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 14b9dd71d9e8..963895f9af7f 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -89,10 +89,10 @@ static int __init nonx32_setup(char *str)
__setup("noexec32=", nonx32_setup);
/*
- * When memory was added/removed make sure all the processes MM have
+ * When memory was added make sure all the processes MM have
* suitable PGD entries in the local PGD level page.
*/
-void sync_global_pgds(unsigned long start, unsigned long end, int removed)
+void sync_global_pgds(unsigned long start, unsigned long end)
{
unsigned long address;
@@ -100,12 +100,7 @@ void sync_global_pgds(unsigned long start, unsigned long end, int removed)
const pgd_t *pgd_ref = pgd_offset_k(address);
struct page *page;
- /*
- * When it is called after memory hot remove, pgd_none()
- * returns true. In this case (removed == 1), we must clear
- * the PGD entries in the local PGD level page.
- */
- if (pgd_none(*pgd_ref) && !removed)
+ if (pgd_none(*pgd_ref))
continue;
spin_lock(&pgd_lock);
@@ -122,13 +117,8 @@ void sync_global_pgds(unsigned long start, unsigned long end, int removed)
BUG_ON(pgd_page_vaddr(*pgd)
!= pgd_page_vaddr(*pgd_ref));
- if (removed) {
- if (pgd_none(*pgd_ref) && !pgd_none(*pgd))
- pgd_clear(pgd);
- } else {
- if (pgd_none(*pgd))
- set_pgd(pgd, *pgd_ref);
- }
+ if (pgd_none(*pgd))
+ set_pgd(pgd, *pgd_ref);
spin_unlock(pgt_lock);
}
@@ -596,7 +586,7 @@ kernel_physical_mapping_init(unsigned long paddr_start,
}
if (pgd_changed)
- sync_global_pgds(vaddr_start, vaddr_end - 1, 0);
+ sync_global_pgds(vaddr_start, vaddr_end - 1);
__flush_tlb_all();
@@ -1239,7 +1229,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
} else
err = vmemmap_populate_basepages(start, end, node);
if (!err)
- sync_global_pgds(start, end - 1, 0);
+ sync_global_pgds(start, end - 1);
return err;
}
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index e4f800999b32..324e5713d386 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -350,12 +350,12 @@ int mpx_enable_management(void)
* The copy_xregs_to_kernel() beneath get_xsave_field_ptr() is
* expected to be relatively expensive. Storing the bounds
* directory here means that we do not have to do xsave in the
- * unmap path; we can just use mm->bd_addr instead.
+ * unmap path; we can just use mm->context.bd_addr instead.
*/
bd_base = mpx_get_bounds_dir();
down_write(&mm->mmap_sem);
- mm->bd_addr = bd_base;
- if (mm->bd_addr == MPX_INVALID_BOUNDS_DIR)
+ mm->context.bd_addr = bd_base;
+ if (mm->context.bd_addr == MPX_INVALID_BOUNDS_DIR)
ret = -ENXIO;
up_write(&mm->mmap_sem);
@@ -370,7 +370,7 @@ int mpx_disable_management(void)
return -ENXIO;
down_write(&mm->mmap_sem);
- mm->bd_addr = MPX_INVALID_BOUNDS_DIR;
+ mm->context.bd_addr = MPX_INVALID_BOUNDS_DIR;
up_write(&mm->mmap_sem);
return 0;
}
@@ -947,7 +947,7 @@ static int try_unmap_single_bt(struct mm_struct *mm,
end = bta_end_vaddr;
}
- bde_vaddr = mm->bd_addr + mpx_get_bd_entry_offset(mm, start);
+ bde_vaddr = mm->context.bd_addr + mpx_get_bd_entry_offset(mm, start);
ret = get_bt_addr(mm, bde_vaddr, &bt_addr);
/*
* No bounds table there, so nothing to unmap.
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 3f35b48d1d9d..12dcad7297a5 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -19,7 +19,7 @@
#include "numa_internal.h"
-int __initdata numa_off;
+int numa_off;
nodemask_t numa_nodes_parsed __initdata;
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 83e701f160a9..efc32bc6862b 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -986,20 +986,17 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
return 0;
}
-int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
- pfn_t pfn)
+void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn)
{
enum page_cache_mode pcm;
if (!pat_enabled())
- return 0;
+ return;
/* Set prot based on lookup */
pcm = lookup_memtype(pfn_t_to_phys(pfn));
*prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) |
cachemode2protval(pcm));
-
- return 0;
}
/*
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index f88ce0e5efd9..2dab69a706ec 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -141,8 +141,7 @@ u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) |
* Called from the FPU code when creating a fresh set of FPU
* registers. This is called from a very specific context where
* we know the FPU regstiers are safe for use and we can use PKRU
- * directly. The fact that PKRU is only available when we are
- * using eagerfpu mode makes this possible.
+ * directly.
*/
void copy_init_pkru_to_fpregs(void)
{
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index fe04a04dab8e..e76d1af60f7a 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -853,7 +853,7 @@ xadd: if (is_imm8(insn->off))
func = (u8 *) __bpf_call_base + imm32;
jmp_offset = func - (image + addrs[i]);
if (seen_ld_abs) {
- reload_skb_data = bpf_helper_changes_skb_data(func);
+ reload_skb_data = bpf_helper_changes_pkt_data(func);
if (reload_skb_data) {
EMIT1(0x57); /* push %rdi */
jmp_offset += 22; /* pop, mov, sub, mov */
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 28c04123b6dd..ffdbc4836b4f 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -339,10 +339,11 @@ fail:
return 0;
}
-static void nmi_cpu_setup(void *dummy)
+static void nmi_cpu_setup(void)
{
int cpu = smp_processor_id();
struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu);
+
nmi_cpu_save_registers(msrs);
raw_spin_lock(&oprofilefs_lock);
model->setup_ctrs(model, msrs);
@@ -369,7 +370,7 @@ static void nmi_cpu_restore_registers(struct op_msrs *msrs)
}
}
-static void nmi_cpu_shutdown(void *dummy)
+static void nmi_cpu_shutdown(void)
{
unsigned int v;
int cpu = smp_processor_id();
@@ -387,20 +388,26 @@ static void nmi_cpu_shutdown(void *dummy)
nmi_cpu_restore_registers(msrs);
}
-static void nmi_cpu_up(void *dummy)
+static int nmi_cpu_online(unsigned int cpu)
{
+ local_irq_disable();
if (nmi_enabled)
- nmi_cpu_setup(dummy);
+ nmi_cpu_setup();
if (ctr_running)
- nmi_cpu_start(dummy);
+ nmi_cpu_start(NULL);
+ local_irq_enable();
+ return 0;
}
-static void nmi_cpu_down(void *dummy)
+static int nmi_cpu_down_prep(unsigned int cpu)
{
+ local_irq_disable();
if (ctr_running)
- nmi_cpu_stop(dummy);
+ nmi_cpu_stop(NULL);
if (nmi_enabled)
- nmi_cpu_shutdown(dummy);
+ nmi_cpu_shutdown();
+ local_irq_enable();
+ return 0;
}
static int nmi_create_files(struct dentry *root)
@@ -433,26 +440,7 @@ static int nmi_create_files(struct dentry *root)
return 0;
}
-static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action,
- void *data)
-{
- int cpu = (unsigned long)data;
-
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_DOWN_FAILED:
- case CPU_ONLINE:
- smp_call_function_single(cpu, nmi_cpu_up, NULL, 0);
- break;
- case CPU_DOWN_PREPARE:
- smp_call_function_single(cpu, nmi_cpu_down, NULL, 1);
- break;
- }
- return NOTIFY_DONE;
-}
-
-static struct notifier_block oprofile_cpu_nb = {
- .notifier_call = oprofile_cpu_notifier
-};
+static enum cpuhp_state cpuhp_nmi_online;
static int nmi_setup(void)
{
@@ -495,20 +483,17 @@ static int nmi_setup(void)
if (err)
goto fail;
- cpu_notifier_register_begin();
-
- /* Use get/put_online_cpus() to protect 'nmi_enabled' */
- get_online_cpus();
nmi_enabled = 1;
/* make nmi_enabled visible to the nmi handler: */
smp_mb();
- on_each_cpu(nmi_cpu_setup, NULL, 1);
- __register_cpu_notifier(&oprofile_cpu_nb);
- put_online_cpus();
-
- cpu_notifier_register_done();
-
+ err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/oprofile:online",
+ nmi_cpu_online, nmi_cpu_down_prep);
+ if (err < 0)
+ goto fail_nmi;
+ cpuhp_nmi_online = err;
return 0;
+fail_nmi:
+ unregister_nmi_handler(NMI_LOCAL, "oprofile");
fail:
free_msrs();
return err;
@@ -518,17 +503,9 @@ static void nmi_shutdown(void)
{
struct op_msrs *msrs;
- cpu_notifier_register_begin();
-
- /* Use get/put_online_cpus() to protect 'nmi_enabled' & 'ctr_running' */
- get_online_cpus();
- on_each_cpu(nmi_cpu_shutdown, NULL, 1);
+ cpuhp_remove_state(cpuhp_nmi_online);
nmi_enabled = 0;
ctr_running = 0;
- __unregister_cpu_notifier(&oprofile_cpu_nb);
- put_online_cpus();
-
- cpu_notifier_register_done();
/* make variables visible to the nmi handler: */
smp_mb();
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index c20d2cc7ef64..ae387e5ee6f7 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -327,35 +327,18 @@ static int __init early_root_info_init(void)
#define ENABLE_CF8_EXT_CFG (1ULL << 46)
-static void enable_pci_io_ecs(void *unused)
+static int amd_bus_cpu_online(unsigned int cpu)
{
u64 reg;
+
rdmsrl(MSR_AMD64_NB_CFG, reg);
if (!(reg & ENABLE_CF8_EXT_CFG)) {
reg |= ENABLE_CF8_EXT_CFG;
wrmsrl(MSR_AMD64_NB_CFG, reg);
}
+ return 0;
}
-static int amd_cpu_notify(struct notifier_block *self, unsigned long action,
- void *hcpu)
-{
- int cpu = (long)hcpu;
- switch (action) {
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- smp_call_function_single(cpu, enable_pci_io_ecs, NULL, 0);
- break;
- default:
- break;
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block amd_cpu_notifier = {
- .notifier_call = amd_cpu_notify,
-};
-
static void __init pci_enable_pci_io_ecs(void)
{
#ifdef CONFIG_AMD_NB
@@ -385,7 +368,7 @@ static void __init pci_enable_pci_io_ecs(void)
static int __init pci_io_ecs_init(void)
{
- int cpu;
+ int ret;
/* assume all cpus from fam10h have IO ECS */
if (boot_cpu_data.x86 < 0x10)
@@ -395,12 +378,9 @@ static int __init pci_io_ecs_init(void)
if (early_pci_allowed())
pci_enable_pci_io_ecs();
- cpu_notifier_register_begin();
- for_each_online_cpu(cpu)
- amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE,
- (void *)(long)cpu);
- __register_cpu_notifier(&amd_cpu_notifier);
- cpu_notifier_register_done();
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "pci/amd_bus:online",
+ amd_bus_cpu_online, NULL);
+ WARN_ON(ret < 0);
pci_probe |= PCI_HAS_IO_ECS;
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index bedfab98077a..e1fb269c87af 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -264,8 +264,8 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
return 0;
error:
- dev_err(&dev->dev,
- "Xen PCI frontend has not registered MSI/MSI-X support!\n");
+ dev_err(&dev->dev, "Failed to create MSI%s! ret=%d!\n",
+ type == PCI_CAP_ID_MSI ? "" : "-X", irq);
return irq;
}
diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile
index 3c3c19ea94df..184842ef332e 100644
--- a/arch/x86/platform/Makefile
+++ b/arch/x86/platform/Makefile
@@ -8,7 +8,6 @@ obj-y += iris/
obj-y += intel/
obj-y += intel-mid/
obj-y += intel-quark/
-obj-y += mellanox/
obj-y += olpc/
obj-y += scx200/
obj-y += sfi/
diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c
index b27bccd4390f..ce4b06733c09 100644
--- a/arch/x86/platform/ce4100/ce4100.c
+++ b/arch/x86/platform/ce4100/ce4100.c
@@ -23,11 +23,6 @@
#include <asm/io_apic.h>
#include <asm/emergency-restart.h>
-static int ce4100_i8042_detect(void)
-{
- return 0;
-}
-
/*
* The CE4100 platform has an internal 8051 Microcontroller which is
* responsible for signaling to the external Power Management Unit the
@@ -89,7 +84,7 @@ static void ce4100_mem_serial_out(struct uart_port *p, int offset, int value)
}
static void ce4100_serial_fixup(int port, struct uart_port *up,
- unsigned short *capabilites)
+ u32 *capabilites)
{
#ifdef CONFIG_EARLY_PRINTK
/*
@@ -145,7 +140,6 @@ static void sdv_pci_init(void)
void __init x86_ce4100_early_setup(void)
{
x86_init.oem.arch_setup = sdv_arch_setup;
- x86_platform.i8042_detect = ce4100_i8042_detect;
x86_init.resources.probe_roms = x86_init_noop;
x86_init.mpparse.get_smp_config = x86_init_uint_noop;
x86_init.mpparse.find_smp_config = x86_init_noop;
diff --git a/arch/x86/platform/intel-mid/device_libs/Makefile b/arch/x86/platform/intel-mid/device_libs/Makefile
index dd6cfa4ad3ac..61b5ed2b7d40 100644
--- a/arch/x86/platform/intel-mid/device_libs/Makefile
+++ b/arch/x86/platform/intel-mid/device_libs/Makefile
@@ -19,7 +19,7 @@ obj-$(subst m,y,$(CONFIG_SPI_SPIDEV)) += platform_spidev.o
# I2C Devices
obj-$(subst m,y,$(CONFIG_SENSORS_EMC1403)) += platform_emc1403.o
obj-$(subst m,y,$(CONFIG_SENSORS_LIS3LV02D)) += platform_lis331.o
-obj-$(subst m,y,$(CONFIG_INPUT_MPU3050)) += platform_mpu3050.o
+obj-$(subst m,y,$(CONFIG_MPU3050_I2C)) += platform_mpu3050.o
obj-$(subst m,y,$(CONFIG_INPUT_BMA150)) += platform_bma023.o
obj-$(subst m,y,$(CONFIG_DRM_MEDFIELD)) += platform_tc35876x.o
# I2C GPIO Expanders
diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c
index 7850128f0026..12a272582cdc 100644
--- a/arch/x86/platform/intel-mid/intel-mid.c
+++ b/arch/x86/platform/intel-mid/intel-mid.c
@@ -161,12 +161,6 @@ out:
regulator_has_full_constraints();
}
-/* MID systems don't have i8042 controller */
-static int intel_mid_i8042_detect(void)
-{
- return 0;
-}
-
/*
* Moorestown does not have external NMI source nor port 0x61 to report
* NMI status. The possible NMI sources are from pmu as a result of NMI
@@ -197,7 +191,6 @@ void __init x86_intel_mid_early_setup(void)
x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock;
x86_platform.calibrate_tsc = intel_mid_calibrate_tsc;
- x86_platform.i8042_detect = intel_mid_i8042_detect;
x86_init.timers.wallclock_init = intel_mid_rtc_init;
x86_platform.get_nmi_reason = intel_mid_get_nmi_reason;
diff --git a/arch/x86/platform/intel-mid/mfld.c b/arch/x86/platform/intel-mid/mfld.c
index 1eb47b6298c2..e793fe509971 100644
--- a/arch/x86/platform/intel-mid/mfld.c
+++ b/arch/x86/platform/intel-mid/mfld.c
@@ -49,8 +49,13 @@ static unsigned long __init mfld_calibrate_tsc(void)
fast_calibrate = ratio * fsb;
pr_debug("read penwell tsc %lu khz\n", fast_calibrate);
lapic_timer_frequency = fsb * 1000 / HZ;
- /* mark tsc clocksource as reliable */
- set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE);
+
+ /*
+ * TSC on Intel Atom SoCs is reliable and of known frequency.
+ * See tsc_msr.c for details.
+ */
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+ setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
return fast_calibrate;
}
diff --git a/arch/x86/platform/intel-mid/mrfld.c b/arch/x86/platform/intel-mid/mrfld.c
index 59253db41bbc..e0607c77a1bd 100644
--- a/arch/x86/platform/intel-mid/mrfld.c
+++ b/arch/x86/platform/intel-mid/mrfld.c
@@ -78,8 +78,12 @@ static unsigned long __init tangier_calibrate_tsc(void)
pr_debug("Setting lapic_timer_frequency = %d\n",
lapic_timer_frequency);
- /* mark tsc clocksource as reliable */
- set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE);
+ /*
+ * TSC on Intel Atom SoCs is reliable and of known frequency.
+ * See tsc_msr.c for details.
+ */
+ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
+ setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE);
return fast_calibrate;
}
diff --git a/arch/x86/platform/intel-mid/pwr.c b/arch/x86/platform/intel-mid/pwr.c
index 67375dda451c..ef03852ea6e8 100644
--- a/arch/x86/platform/intel-mid/pwr.c
+++ b/arch/x86/platform/intel-mid/pwr.c
@@ -270,7 +270,6 @@ int intel_mid_pci_set_power_state(struct pci_dev *pdev, pci_power_t state)
return 0;
}
-EXPORT_SYMBOL_GPL(intel_mid_pci_set_power_state);
pci_power_t intel_mid_pci_get_power_state(struct pci_dev *pdev)
{
diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c
index f5bad40936ac..b8f562049cad 100644
--- a/arch/x86/platform/intel-quark/imr_selftest.c
+++ b/arch/x86/platform/intel-quark/imr_selftest.c
@@ -25,7 +25,8 @@
* @fmt: format string.
* ... variadic argument list.
*/
-static void __init imr_self_test_result(int res, const char *fmt, ...)
+static __printf(2, 3)
+void __init imr_self_test_result(int res, const char *fmt, ...)
{
va_list vlist;
diff --git a/arch/x86/platform/mellanox/Makefile b/arch/x86/platform/mellanox/Makefile
deleted file mode 100644
index f43c93188a1d..000000000000
--- a/arch/x86/platform/mellanox/Makefile
+++ /dev/null
@@ -1 +0,0 @@
-obj-$(CONFIG_MLX_PLATFORM) += mlx-platform.o
diff --git a/arch/x86/platform/mellanox/mlx-platform.c b/arch/x86/platform/mellanox/mlx-platform.c
deleted file mode 100644
index 7dcfcca97399..000000000000
--- a/arch/x86/platform/mellanox/mlx-platform.c
+++ /dev/null
@@ -1,266 +0,0 @@
-/*
- * arch/x86/platform/mellanox/mlx-platform.c
- * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
- * Copyright (c) 2016 Vadim Pasternak <vadimp@mellanox.com>
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. Neither the names of the copyright holders nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * Alternatively, this software may be distributed under the terms of the
- * GNU General Public License ("GPL") version 2 as published by the Free
- * Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include <linux/device.h>
-#include <linux/dmi.h>
-#include <linux/i2c.h>
-#include <linux/i2c-mux.h>
-#include <linux/module.h>
-#include <linux/platform_device.h>
-#include <linux/platform_data/i2c-mux-reg.h>
-
-#define MLX_PLAT_DEVICE_NAME "mlxplat"
-
-/* LPC bus IO offsets */
-#define MLXPLAT_CPLD_LPC_I2C_BASE_ADRR 0x2000
-#define MLXPLAT_CPLD_LPC_REG_BASE_ADRR 0x2500
-#define MLXPLAT_CPLD_LPC_IO_RANGE 0x100
-#define MLXPLAT_CPLD_LPC_I2C_CH1_OFF 0xdb
-#define MLXPLAT_CPLD_LPC_I2C_CH2_OFF 0xda
-#define MLXPLAT_CPLD_LPC_PIO_OFFSET 0x10000UL
-#define MLXPLAT_CPLD_LPC_REG1 ((MLXPLAT_CPLD_LPC_REG_BASE_ADRR + \
- MLXPLAT_CPLD_LPC_I2C_CH1_OFF) | \
- MLXPLAT_CPLD_LPC_PIO_OFFSET)
-#define MLXPLAT_CPLD_LPC_REG2 ((MLXPLAT_CPLD_LPC_REG_BASE_ADRR + \
- MLXPLAT_CPLD_LPC_I2C_CH2_OFF) | \
- MLXPLAT_CPLD_LPC_PIO_OFFSET)
-
-/* Start channel numbers */
-#define MLXPLAT_CPLD_CH1 2
-#define MLXPLAT_CPLD_CH2 10
-
-/* Number of LPC attached MUX platform devices */
-#define MLXPLAT_CPLD_LPC_MUX_DEVS 2
-
-/* mlxplat_priv - platform private data
- * @pdev_i2c - i2c controller platform device
- * @pdev_mux - array of mux platform devices
- */
-struct mlxplat_priv {
- struct platform_device *pdev_i2c;
- struct platform_device *pdev_mux[MLXPLAT_CPLD_LPC_MUX_DEVS];
-};
-
-/* Regions for LPC I2C controller and LPC base register space */
-static const struct resource mlxplat_lpc_resources[] = {
- [0] = DEFINE_RES_NAMED(MLXPLAT_CPLD_LPC_I2C_BASE_ADRR,
- MLXPLAT_CPLD_LPC_IO_RANGE,
- "mlxplat_cpld_lpc_i2c_ctrl", IORESOURCE_IO),
- [1] = DEFINE_RES_NAMED(MLXPLAT_CPLD_LPC_REG_BASE_ADRR,
- MLXPLAT_CPLD_LPC_IO_RANGE,
- "mlxplat_cpld_lpc_regs",
- IORESOURCE_IO),
-};
-
-/* Platform default channels */
-static const int mlxplat_default_channels[][8] = {
- {
- MLXPLAT_CPLD_CH1, MLXPLAT_CPLD_CH1 + 1, MLXPLAT_CPLD_CH1 + 2,
- MLXPLAT_CPLD_CH1 + 3, MLXPLAT_CPLD_CH1 + 4, MLXPLAT_CPLD_CH1 +
- 5, MLXPLAT_CPLD_CH1 + 6, MLXPLAT_CPLD_CH1 + 7
- },
- {
- MLXPLAT_CPLD_CH2, MLXPLAT_CPLD_CH2 + 1, MLXPLAT_CPLD_CH2 + 2,
- MLXPLAT_CPLD_CH2 + 3, MLXPLAT_CPLD_CH2 + 4, MLXPLAT_CPLD_CH2 +
- 5, MLXPLAT_CPLD_CH2 + 6, MLXPLAT_CPLD_CH2 + 7
- },
-};
-
-/* Platform channels for MSN21xx system family */
-static const int mlxplat_msn21xx_channels[] = { 1, 2, 3, 4, 5, 6, 7, 8 };
-
-/* Platform mux data */
-static struct i2c_mux_reg_platform_data mlxplat_mux_data[] = {
- {
- .parent = 1,
- .base_nr = MLXPLAT_CPLD_CH1,
- .write_only = 1,
- .reg = (void __iomem *)MLXPLAT_CPLD_LPC_REG1,
- .reg_size = 1,
- .idle_in_use = 1,
- },
- {
- .parent = 1,
- .base_nr = MLXPLAT_CPLD_CH2,
- .write_only = 1,
- .reg = (void __iomem *)MLXPLAT_CPLD_LPC_REG2,
- .reg_size = 1,
- .idle_in_use = 1,
- },
-
-};
-
-static struct platform_device *mlxplat_dev;
-
-static int __init mlxplat_dmi_default_matched(const struct dmi_system_id *dmi)
-{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(mlxplat_mux_data); i++) {
- mlxplat_mux_data[i].values = mlxplat_default_channels[i];
- mlxplat_mux_data[i].n_values =
- ARRAY_SIZE(mlxplat_default_channels[i]);
- }
-
- return 1;
-};
-
-static int __init mlxplat_dmi_msn21xx_matched(const struct dmi_system_id *dmi)
-{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(mlxplat_mux_data); i++) {
- mlxplat_mux_data[i].values = mlxplat_msn21xx_channels;
- mlxplat_mux_data[i].n_values =
- ARRAY_SIZE(mlxplat_msn21xx_channels);
- }
-
- return 1;
-};
-
-static struct dmi_system_id mlxplat_dmi_table[] __initdata = {
- {
- .callback = mlxplat_dmi_default_matched,
- .matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"),
- DMI_MATCH(DMI_PRODUCT_NAME, "MSN24"),
- },
- },
- {
- .callback = mlxplat_dmi_default_matched,
- .matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"),
- DMI_MATCH(DMI_PRODUCT_NAME, "MSN27"),
- },
- },
- {
- .callback = mlxplat_dmi_default_matched,
- .matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"),
- DMI_MATCH(DMI_PRODUCT_NAME, "MSB"),
- },
- },
- {
- .callback = mlxplat_dmi_default_matched,
- .matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"),
- DMI_MATCH(DMI_PRODUCT_NAME, "MSX"),
- },
- },
- {
- .callback = mlxplat_dmi_msn21xx_matched,
- .matches = {
- DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"),
- DMI_MATCH(DMI_PRODUCT_NAME, "MSN21"),
- },
- },
- { }
-};
-
-static int __init mlxplat_init(void)
-{
- struct mlxplat_priv *priv;
- int i, err;
-
- if (!dmi_check_system(mlxplat_dmi_table))
- return -ENODEV;
-
- mlxplat_dev = platform_device_register_simple(MLX_PLAT_DEVICE_NAME, -1,
- mlxplat_lpc_resources,
- ARRAY_SIZE(mlxplat_lpc_resources));
-
- if (IS_ERR(mlxplat_dev))
- return PTR_ERR(mlxplat_dev);
-
- priv = devm_kzalloc(&mlxplat_dev->dev, sizeof(struct mlxplat_priv),
- GFP_KERNEL);
- if (!priv) {
- err = -ENOMEM;
- goto fail_alloc;
- }
- platform_set_drvdata(mlxplat_dev, priv);
-
- priv->pdev_i2c = platform_device_register_simple("i2c_mlxcpld", -1,
- NULL, 0);
- if (IS_ERR(priv->pdev_i2c)) {
- err = PTR_ERR(priv->pdev_i2c);
- goto fail_alloc;
- };
-
- for (i = 0; i < ARRAY_SIZE(mlxplat_mux_data); i++) {
- priv->pdev_mux[i] = platform_device_register_resndata(
- &mlxplat_dev->dev,
- "i2c-mux-reg", i, NULL,
- 0, &mlxplat_mux_data[i],
- sizeof(mlxplat_mux_data[i]));
- if (IS_ERR(priv->pdev_mux[i])) {
- err = PTR_ERR(priv->pdev_mux[i]);
- goto fail_platform_mux_register;
- }
- }
-
- return 0;
-
-fail_platform_mux_register:
- for (i--; i > 0 ; i--)
- platform_device_unregister(priv->pdev_mux[i]);
- platform_device_unregister(priv->pdev_i2c);
-fail_alloc:
- platform_device_unregister(mlxplat_dev);
-
- return err;
-}
-module_init(mlxplat_init);
-
-static void __exit mlxplat_exit(void)
-{
- struct mlxplat_priv *priv = platform_get_drvdata(mlxplat_dev);
- int i;
-
- for (i = ARRAY_SIZE(mlxplat_mux_data) - 1; i >= 0 ; i--)
- platform_device_unregister(priv->pdev_mux[i]);
-
- platform_device_unregister(priv->pdev_i2c);
- platform_device_unregister(mlxplat_dev);
-}
-module_exit(mlxplat_exit);
-
-MODULE_AUTHOR("Vadim Pasternak (vadimp@mellanox.com)");
-MODULE_DESCRIPTION("Mellanox platform driver");
-MODULE_LICENSE("Dual BSD/GPL");
-MODULE_ALIAS("dmi:*:*Mellanox*:MSN24*:");
-MODULE_ALIAS("dmi:*:*Mellanox*:MSN27*:");
-MODULE_ALIAS("dmi:*:*Mellanox*:MSB*:");
-MODULE_ALIAS("dmi:*:*Mellanox*:MSX*:");
-MODULE_ALIAS("dmi:*:*Mellanox*:MSN21*:");
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 9e42842e924a..766d4d3529a1 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -19,7 +19,6 @@
#include <asm/uv/uv_hub.h>
#include <asm/uv/uv_bau.h>
#include <asm/apic.h>
-#include <asm/idle.h>
#include <asm/tsc.h>
#include <asm/irq_vectors.h>
#include <asm/timer.h>
diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c
index cd5173a2733f..8410e7d0a5b5 100644
--- a/arch/x86/platform/uv/uv_nmi.c
+++ b/arch/x86/platform/uv/uv_nmi.c
@@ -387,8 +387,8 @@ static void uv_nmi_dump_cpu_ip_hdr(void)
/* Dump Instruction Pointer info */
static void uv_nmi_dump_cpu_ip(int cpu, struct pt_regs *regs)
{
- pr_info("UV: %4d %6d %-32.32s ", cpu, current->pid, current->comm);
- printk_address(regs->ip);
+ pr_info("UV: %4d %6d %-32.32s %pS",
+ cpu, current->pid, current->comm, (void *)regs->ip);
}
/*
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 53cace2ec0e2..66ade16c7693 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -252,6 +252,7 @@ static void notrace __restore_processor_state(struct saved_context *ctxt)
fix_processor_context();
do_fpu_end();
+ tsc_verify_tsc_adjust(true);
x86_platform.restore_sched_clock_state();
mtrr_bp_restore();
perf_restore_debug_store();
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index 9634557a5444..ded2e8272382 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -11,6 +11,10 @@
#include <linux/gfp.h>
#include <linux/smp.h>
#include <linux/suspend.h>
+#include <linux/scatterlist.h>
+#include <linux/kdebug.h>
+
+#include <crypto/hash.h>
#include <asm/init.h>
#include <asm/proto.h>
@@ -177,14 +181,86 @@ int pfn_is_nosave(unsigned long pfn)
return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
}
+#define MD5_DIGEST_SIZE 16
+
struct restore_data_record {
unsigned long jump_address;
unsigned long jump_address_phys;
unsigned long cr3;
unsigned long magic;
+ u8 e820_digest[MD5_DIGEST_SIZE];
};
-#define RESTORE_MAGIC 0x123456789ABCDEF0UL
+#define RESTORE_MAGIC 0x23456789ABCDEF01UL
+
+#if IS_BUILTIN(CONFIG_CRYPTO_MD5)
+/**
+ * get_e820_md5 - calculate md5 according to given e820 map
+ *
+ * @map: the e820 map to be calculated
+ * @buf: the md5 result to be stored to
+ */
+static int get_e820_md5(struct e820map *map, void *buf)
+{
+ struct scatterlist sg;
+ struct crypto_ahash *tfm;
+ int size;
+ int ret = 0;
+
+ tfm = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(tfm))
+ return -ENOMEM;
+
+ {
+ AHASH_REQUEST_ON_STACK(req, tfm);
+ size = offsetof(struct e820map, map)
+ + sizeof(struct e820entry) * map->nr_map;
+ ahash_request_set_tfm(req, tfm);
+ sg_init_one(&sg, (u8 *)map, size);
+ ahash_request_set_callback(req, 0, NULL, NULL);
+ ahash_request_set_crypt(req, &sg, buf, size);
+
+ if (crypto_ahash_digest(req))
+ ret = -EINVAL;
+ ahash_request_zero(req);
+ }
+ crypto_free_ahash(tfm);
+
+ return ret;
+}
+
+static void hibernation_e820_save(void *buf)
+{
+ get_e820_md5(e820_saved, buf);
+}
+
+static bool hibernation_e820_mismatch(void *buf)
+{
+ int ret;
+ u8 result[MD5_DIGEST_SIZE];
+
+ memset(result, 0, MD5_DIGEST_SIZE);
+ /* If there is no digest in suspend kernel, let it go. */
+ if (!memcmp(result, buf, MD5_DIGEST_SIZE))
+ return false;
+
+ ret = get_e820_md5(e820_saved, result);
+ if (ret)
+ return true;
+
+ return memcmp(result, buf, MD5_DIGEST_SIZE) ? true : false;
+}
+#else
+static void hibernation_e820_save(void *buf)
+{
+}
+
+static bool hibernation_e820_mismatch(void *buf)
+{
+ /* If md5 is not builtin for restore kernel, let it go. */
+ return false;
+}
+#endif
/**
* arch_hibernation_header_save - populate the architecture specific part
@@ -201,6 +277,9 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size)
rdr->jump_address_phys = __pa_symbol(&restore_registers);
rdr->cr3 = restore_cr3;
rdr->magic = RESTORE_MAGIC;
+
+ hibernation_e820_save(rdr->e820_digest);
+
return 0;
}
@@ -216,5 +295,16 @@ int arch_hibernation_header_restore(void *addr)
restore_jump_address = rdr->jump_address;
jump_address_phys = rdr->jump_address_phys;
restore_cr3 = rdr->cr3;
- return (rdr->magic == RESTORE_MAGIC) ? 0 : -EINVAL;
+
+ if (rdr->magic != RESTORE_MAGIC) {
+ pr_crit("Unrecognized hibernate image header format!\n");
+ return -EINVAL;
+ }
+
+ if (hibernation_e820_mismatch(rdr->e820_digest)) {
+ pr_crit("Hibernate inconsistent memory map detected!\n");
+ return -ENODEV;
+ }
+
+ return 0;
}
diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c
index 1ac76479c266..8730c2882fff 100644
--- a/arch/x86/ras/mce_amd_inj.c
+++ b/arch/x86/ras/mce_amd_inj.c
@@ -275,6 +275,8 @@ static void do_inject(void)
unsigned int cpu = i_mce.extcpu;
u8 b = i_mce.bank;
+ rdtscll(i_mce.tsc);
+
if (i_mce.misc)
i_mce.status |= MCI_STATUS_MISCV;
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index 25012abc3409..4463fa72db94 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -69,7 +69,7 @@ $(obj)/realmode.relocs: $(obj)/realmode.elf FORCE
# ---------------------------------------------------------------------------
-KBUILD_CFLAGS := $(LINUXINCLUDE) $(REALMODE_CFLAGS) -D_SETUP -D_WAKEUP \
+KBUILD_CFLAGS := $(REALMODE_CFLAGS) -D_SETUP -D_WAKEUP \
-I$(srctree)/arch/x86/boot
KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
GCOV_PROFILE := n
diff --git a/arch/x86/tools/insn_sanity.c b/arch/x86/tools/insn_sanity.c
index ba70ff232917..1972565ab106 100644
--- a/arch/x86/tools/insn_sanity.c
+++ b/arch/x86/tools/insn_sanity.c
@@ -269,7 +269,8 @@ int main(int argc, char **argv)
insns++;
}
- fprintf(stdout, "%s: %s: decoded and checked %d %s instructions with %d errors (seed:0x%x)\n",
+ fprintf((errors) ? stderr : stdout,
+ "%s: %s: decoded and checked %d %s instructions with %d errors (seed:0x%x)\n",
prog,
(errors) ? "Failure" : "Success",
insns,
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index 0c2fae8d929d..73eb7fd4aec4 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -992,11 +992,12 @@ static void emit_relocs(int as_text, int use_real_mode)
die("Segment relocations found but --realmode not specified\n");
/* Order the relocations for more efficient processing */
- sort_relocs(&relocs16);
sort_relocs(&relocs32);
#if ELF_BITS == 64
sort_relocs(&relocs32neg);
sort_relocs(&relocs64);
+#else
+ sort_relocs(&relocs16);
#endif
/* Print the relocations */
diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c
index 56f04db0c9c0..ecf31e0358c8 100644
--- a/arch/x86/tools/test_get_len.c
+++ b/arch/x86/tools/test_get_len.c
@@ -167,7 +167,7 @@ int main(int argc, char **argv)
fprintf(stderr, "Warning: decoded and checked %d"
" instructions with %d warnings\n", insns, warnings);
else
- fprintf(stderr, "Succeed: decoded and checked %d"
+ fprintf(stdout, "Success: decoded and checked %d"
" instructions\n", insns);
return 0;
}
diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h
index 233ee09c1ce8..c77db2288982 100644
--- a/arch/x86/um/asm/processor.h
+++ b/arch/x86/um/asm/processor.h
@@ -26,7 +26,6 @@ static inline void rep_nop(void)
}
#define cpu_relax() rep_nop()
-#define cpu_relax_lowlatency() cpu_relax()
#define task_pt_regs(t) (&(t)->thread.regs)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index bdd855685403..ced7027b3fbc 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -980,17 +980,6 @@ static void xen_io_delay(void)
{
}
-static void xen_clts(void)
-{
- struct multicall_space mcs;
-
- mcs = xen_mc_entry(0);
-
- MULTI_fpu_taskswitch(mcs.mc, 0);
-
- xen_mc_issue(PARAVIRT_LAZY_CPU);
-}
-
static DEFINE_PER_CPU(unsigned long, xen_cr0_value);
static unsigned long xen_read_cr0(void)
@@ -1233,8 +1222,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.set_debugreg = xen_set_debugreg,
.get_debugreg = xen_get_debugreg,
- .clts = xen_clts,
-
.read_cr0 = xen_read_cr0,
.write_cr0 = xen_write_cr0,
diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c
index 0e98e5d241d0..a9fafb5c8738 100644
--- a/arch/x86/xen/pci-swiotlb-xen.c
+++ b/arch/x86/xen/pci-swiotlb-xen.c
@@ -19,7 +19,6 @@
int xen_swiotlb __read_mostly;
static struct dma_map_ops xen_swiotlb_dma_ops = {
- .mapping_error = xen_swiotlb_dma_mapping_error,
.alloc = xen_swiotlb_alloc_coherent,
.free = xen_swiotlb_free_coherent,
.sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu,
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index f8960fca0827..8c394e30e5fe 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -41,7 +41,7 @@ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
unsigned long xen_released_pages;
/* E820 map used during setting up memory. */
-static struct e820entry xen_e820_map[E820MAX] __initdata;
+static struct e820entry xen_e820_map[E820_X_MAX] __initdata;
static u32 xen_e820_map_entries __initdata;
/*
@@ -750,7 +750,7 @@ char * __init xen_memory_setup(void)
max_pfn = min(max_pfn, xen_start_info->nr_pages);
mem_end = PFN_PHYS(max_pfn);
- memmap.nr_entries = E820MAX;
+ memmap.nr_entries = ARRAY_SIZE(xen_e820_map);
set_xen_guest_handle(memmap.buffer, xen_e820_map);
op = xen_initial_domain() ?
@@ -923,7 +923,7 @@ char * __init xen_auto_xlated_memory_setup(void)
int i;
int rc;
- memmap.nr_entries = E820MAX;
+ memmap.nr_entries = ARRAY_SIZE(xen_e820_map);
set_xen_guest_handle(memmap.buffer, xen_e820_map);
rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 9fa27ceeecfd..311acad7dad2 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -87,12 +87,6 @@ static void cpu_bringup(void)
cpu_data(cpu).x86_max_cores = 1;
set_cpu_sibling_map(cpu);
- /*
- * identify_cpu() may have set logical_pkg_id to -1 due
- * to incorrect phys_proc_id. Let's re-comupte it.
- */
- topology_update_package_map(apic->cpu_present_to_apicid(cpu), cpu);
-
xen_setup_cpu_clockevents();
notify_cpu_starting(cpu);
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 3d6e0064cbfc..e8a9ea7d7a21 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -114,6 +114,7 @@ void xen_uninit_lock_cpu(int cpu)
per_cpu(irq_name, cpu) = NULL;
}
+PV_CALLEE_SAVE_REGS_THUNK(xen_vcpu_stolen);
/*
* Our init of PV spinlocks is split in two init functions due to us
@@ -137,6 +138,7 @@ void __init xen_init_spinlocks(void)
pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
pv_lock_ops.wait = xen_qlock_wait;
pv_lock_ops.kick = xen_qlock_kick;
+ pv_lock_ops.vcpu_is_preempted = PV_CALLEE_SAVE(xen_vcpu_stolen);
}
/*