aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig116
-rw-r--r--arch/x86/Kconfig.cpu8
-rw-r--r--arch/x86/Kconfig.debug20
-rw-r--r--arch/x86/Makefile5
-rw-r--r--arch/x86/boot/Makefile6
-rw-r--r--arch/x86/boot/compressed/Makefile4
-rw-r--r--arch/x86/boot/compressed/acpi.c6
-rw-r--r--arch/x86/boot/compressed/eboot.c278
-rw-r--r--arch/x86/boot/compressed/eboot.h30
-rw-r--r--arch/x86/boot/compressed/efi_stub_32.S87
-rw-r--r--arch/x86/boot/compressed/efi_stub_64.S5
-rw-r--r--arch/x86/boot/compressed/efi_thunk_64.S65
-rw-r--r--arch/x86/boot/compressed/head_32.S72
-rw-r--r--arch/x86/boot/compressed/head_64.S97
-rw-r--r--arch/x86/boot/compressed/kaslr_64.c3
-rw-r--r--arch/x86/boot/mkcpustr.c1
-rw-r--r--arch/x86/boot/setup.ld5
-rw-r--r--arch/x86/crypto/.gitignore1
-rw-r--r--arch/x86/crypto/Makefile18
-rw-r--r--arch/x86/crypto/aegis128-aesni-glue.c4
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S8
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c55
-rw-r--r--arch/x86/crypto/blake2s-glue.c4
-rw-r--r--arch/x86/crypto/camellia_aesni_avx2_glue.c77
-rw-r--r--arch/x86/crypto/camellia_aesni_avx_glue.c81
-rw-r--r--arch/x86/crypto/camellia_glue.c54
-rw-r--r--arch/x86/crypto/cast6_avx_glue.c74
-rw-r--r--arch/x86/crypto/crc32-pclmul_glue.c4
-rw-r--r--arch/x86/crypto/crc32c-intel_glue.c4
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_glue.c11
-rw-r--r--arch/x86/crypto/glue_helper.c23
-rw-r--r--arch/x86/crypto/poly1305-avx2-x86_64.S390
-rw-r--r--arch/x86/crypto/poly1305-sse2-x86_64.S590
-rw-r--r--arch/x86/crypto/poly1305-x86_64-cryptogams.pl4265
-rw-r--r--arch/x86/crypto/poly1305_glue.c304
-rw-r--r--arch/x86/crypto/serpent_avx2_glue.c65
-rw-r--r--arch/x86/crypto/serpent_avx_glue.c63
-rw-r--r--arch/x86/crypto/serpent_sse2_glue.c30
-rw-r--r--arch/x86/crypto/sha1_avx2_x86_64_asm.S6
-rw-r--r--arch/x86/crypto/sha1_ssse3_asm.S14
-rw-r--r--arch/x86/crypto/sha1_ssse3_glue.c70
-rw-r--r--arch/x86/crypto/sha256-avx-asm.S4
-rw-r--r--arch/x86/crypto/sha256-avx2-asm.S4
-rw-r--r--arch/x86/crypto/sha256-ssse3-asm.S6
-rw-r--r--arch/x86/crypto/sha256_ssse3_glue.c34
-rw-r--r--arch/x86/crypto/sha512-avx-asm.S11
-rw-r--r--arch/x86/crypto/sha512-avx2-asm.S11
-rw-r--r--arch/x86/crypto/sha512-ssse3-asm.S13
-rw-r--r--arch/x86/crypto/sha512_ssse3_glue.c31
-rw-r--r--arch/x86/crypto/twofish_avx_glue.c81
-rw-r--r--arch/x86/crypto/twofish_glue_3way.c37
-rw-r--r--arch/x86/entry/entry_64.S2
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl2
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl2
-rw-r--r--arch/x86/entry/vdso/Makefile2
-rw-r--r--arch/x86/entry/vdso/vdso-layout.lds.S13
-rw-r--r--arch/x86/entry/vdso/vdso2c.c3
-rw-r--r--arch/x86/entry/vdso/vdso32-setup.c1
-rw-r--r--arch/x86/entry/vdso/vma.c120
-rw-r--r--arch/x86/events/amd/core.c110
-rw-r--r--arch/x86/events/amd/uncore.c17
-rw-r--r--arch/x86/events/core.c74
-rw-r--r--arch/x86/events/intel/core.c1
-rw-r--r--arch/x86/events/intel/cstate.c22
-rw-r--r--arch/x86/events/intel/ds.c3
-rw-r--r--arch/x86/events/intel/rapl.c2
-rw-r--r--arch/x86/events/msr.c3
-rw-r--r--arch/x86/events/perf_event.h20
-rw-r--r--arch/x86/hyperv/hv_init.c50
-rw-r--r--arch/x86/ia32/ia32_signal.c2
-rw-r--r--arch/x86/include/asm/Kbuild1
-rw-r--r--arch/x86/include/asm/acpi.h3
-rw-r--r--arch/x86/include/asm/apic.h10
-rw-r--r--arch/x86/include/asm/archrandom.h28
-rw-r--r--arch/x86/include/asm/bugs.h6
-rw-r--r--arch/x86/include/asm/compat.h17
-rw-r--r--arch/x86/include/asm/cpu_entry_area.h10
-rw-r--r--arch/x86/include/asm/cpufeatures.h2
-rw-r--r--arch/x86/include/asm/crypto/camellia.h65
-rw-r--r--arch/x86/include/asm/crypto/glue_helper.h18
-rw-r--r--arch/x86/include/asm/crypto/serpent-avx.h20
-rw-r--r--arch/x86/include/asm/crypto/serpent-sse2.h28
-rw-r--r--arch/x86/include/asm/crypto/twofish.h19
-rw-r--r--arch/x86/include/asm/device.h10
-rw-r--r--arch/x86/include/asm/disabled-features.h8
-rw-r--r--arch/x86/include/asm/efi.h244
-rw-r--r--arch/x86/include/asm/ftrace.h2
-rw-r--r--arch/x86/include/asm/hyperv-tlfs.h3
-rw-r--r--arch/x86/include/asm/intel-family.h1
-rw-r--r--arch/x86/include/asm/intel_pmc_ipc.h32
-rw-r--r--arch/x86/include/asm/intel_scu_ipc.h20
-rw-r--r--arch/x86/include/asm/intel_telemetry.h3
-rw-r--r--arch/x86/include/asm/io.h36
-rw-r--r--arch/x86/include/asm/io_bitmap.h9
-rw-r--r--arch/x86/include/asm/kdebug.h1
-rw-r--r--arch/x86/include/asm/kprobes.h14
-rw-r--r--arch/x86/include/asm/kvm_emulate.h18
-rw-r--r--arch/x86/include/asm/kvm_host.h71
-rw-r--r--arch/x86/include/asm/mce.h3
-rw-r--r--arch/x86/include/asm/memtype.h27
-rw-r--r--arch/x86/include/asm/microcode_amd.h2
-rw-r--r--arch/x86/include/asm/mmu.h4
-rw-r--r--arch/x86/include/asm/mmu_context.h112
-rw-r--r--arch/x86/include/asm/mpx.h116
-rw-r--r--arch/x86/include/asm/msr-index.h16
-rw-r--r--arch/x86/include/asm/mtrr.h4
-rw-r--r--arch/x86/include/asm/nmi.h1
-rw-r--r--arch/x86/include/asm/nospec-branch.h1
-rw-r--r--arch/x86/include/asm/paravirt.h7
-rw-r--r--arch/x86/include/asm/paravirt_types.h4
-rw-r--r--arch/x86/include/asm/pat.h27
-rw-r--r--arch/x86/include/asm/pci.h33
-rw-r--r--arch/x86/include/asm/perf_event.h22
-rw-r--r--arch/x86/include/asm/pgtable.h10
-rw-r--r--arch/x86/include/asm/pgtable_32_areas.h53
-rw-r--r--arch/x86/include/asm/pgtable_32_types.h57
-rw-r--r--arch/x86/include/asm/pgtable_areas.h16
-rw-r--r--arch/x86/include/asm/pgtable_types.h147
-rw-r--r--arch/x86/include/asm/processor.h29
-rw-r--r--arch/x86/include/asm/ptrace.h29
-rw-r--r--arch/x86/include/asm/realmode.h4
-rw-r--r--arch/x86/include/asm/set_memory.h2
-rw-r--r--arch/x86/include/asm/text-patching.h86
-rw-r--r--arch/x86/include/asm/thread_info.h9
-rw-r--r--arch/x86/include/asm/tlb.h4
-rw-r--r--arch/x86/include/asm/trace/mpx.h134
-rw-r--r--arch/x86/include/asm/vdso.h1
-rw-r--r--arch/x86/include/asm/vdso/gettimeofday.h10
-rw-r--r--arch/x86/include/asm/vmalloc.h6
-rw-r--r--arch/x86/include/asm/vmx.h109
-rw-r--r--arch/x86/include/asm/vmxfeatures.h87
-rw-r--r--arch/x86/include/asm/vvar.h13
-rw-r--r--arch/x86/include/asm/x86_init.h2
-rw-r--r--arch/x86/include/uapi/asm/kvm.h1
-rw-r--r--arch/x86/include/uapi/asm/vmx.h4
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/acpi/sleep.c11
-rw-r--r--arch/x86/kernel/acpi/sleep.h2
-rw-r--r--arch/x86/kernel/alternative.c199
-rw-r--r--arch/x86/kernel/amd_nb.c3
-rw-r--r--arch/x86/kernel/apb_timer.c2
-rw-r--r--arch/x86/kernel/apic/apic.c30
-rw-r--r--arch/x86/kernel/apic/msi.c128
-rw-r--r--arch/x86/kernel/apic/vector.c14
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c43
-rw-r--r--arch/x86/kernel/cpu/Makefile6
-rw-r--r--arch/x86/kernel/cpu/amd.c24
-rw-r--r--arch/x86/kernel/cpu/bugs.c7
-rw-r--r--arch/x86/kernel/cpu/centaur.c37
-rw-r--r--arch/x86/kernel/cpu/common.c35
-rw-r--r--arch/x86/kernel/cpu/cpu.h4
-rw-r--r--arch/x86/kernel/cpu/feat_ctl.c145
-rw-r--r--arch/x86/kernel/cpu/intel.c85
-rw-r--r--arch/x86/kernel/cpu/mce/amd.c52
-rw-r--r--arch/x86/kernel/cpu/mce/core.c70
-rw-r--r--arch/x86/kernel/cpu/mce/inject.c2
-rw-r--r--arch/x86/kernel/cpu/mce/intel.c24
-rw-r--r--arch/x86/kernel/cpu/mce/internal.h2
-rw-r--r--arch/x86/kernel/cpu/mce/therm_throt.c11
-rw-r--r--arch/x86/kernel/cpu/mkcapflags.sh15
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c66
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.c2
-rw-r--r--arch/x86/kernel/cpu/proc.c15
-rw-r--r--arch/x86/kernel/cpu/resctrl/internal.h1
-rw-r--r--arch/x86/kernel/cpu/resctrl/monitor.c4
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c149
-rw-r--r--arch/x86/kernel/cpu/scattered.c2
-rw-r--r--arch/x86/kernel/cpu/topology.c2
-rw-r--r--arch/x86/kernel/cpu/tsx.c18
-rw-r--r--arch/x86/kernel/cpu/zhaoxin.c37
-rw-r--r--arch/x86/kernel/crash.c2
-rw-r--r--arch/x86/kernel/crash_core_32.c17
-rw-r--r--arch/x86/kernel/crash_core_64.c24
-rw-r--r--arch/x86/kernel/dumpstack.c26
-rw-r--r--arch/x86/kernel/fpu/signal.c3
-rw-r--r--arch/x86/kernel/fpu/xstate.c18
-rw-r--r--arch/x86/kernel/ftrace.c689
-rw-r--r--arch/x86/kernel/hpet.c2
-rw-r--r--arch/x86/kernel/ima_arch.c6
-rw-r--r--arch/x86/kernel/jump_label.c116
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c2
-rw-r--r--arch/x86/kernel/kprobes/core.c21
-rw-r--r--arch/x86/kernel/kprobes/opt.c67
-rw-r--r--arch/x86/kernel/kvm.c72
-rw-r--r--arch/x86/kernel/ldt.c83
-rw-r--r--arch/x86/kernel/machine_kexec_32.c12
-rw-r--r--arch/x86/kernel/machine_kexec_64.c19
-rw-r--r--arch/x86/kernel/nmi.c20
-rw-r--r--arch/x86/kernel/paravirt.c5
-rw-r--r--arch/x86/kernel/process.c6
-rw-r--r--arch/x86/kernel/process_32.c1
-rw-r--r--arch/x86/kernel/process_64.c1
-rw-r--r--arch/x86/kernel/quirks.c2
-rw-r--r--arch/x86/kernel/setup.c169
-rw-r--r--arch/x86/kernel/signal.c2
-rw-r--r--arch/x86/kernel/sys_x86_64.c9
-rw-r--r--arch/x86/kernel/sysfb_simplefb.c2
-rw-r--r--arch/x86/kernel/tboot.c2
-rw-r--r--arch/x86/kernel/time.c12
-rw-r--r--arch/x86/kernel/traps.c177
-rw-r--r--arch/x86/kernel/tsc_sync.c1
-rw-r--r--arch/x86/kernel/unwind_orc.c11
-rw-r--r--arch/x86/kernel/vm86_32.c1
-rw-r--r--arch/x86/kernel/vmlinux.lds.S4
-rw-r--r--arch/x86/kernel/x86_init.c3
-rw-r--r--arch/x86/kvm/Kconfig23
-rw-r--r--arch/x86/kvm/Makefile1
-rw-r--r--arch/x86/kvm/cpuid.c9
-rw-r--r--arch/x86/kvm/cpuid.h45
-rw-r--r--arch/x86/kvm/emulate.c164
-rw-r--r--arch/x86/kvm/hyperv.c22
-rw-r--r--arch/x86/kvm/i8254.c12
-rw-r--r--arch/x86/kvm/i8259.c6
-rw-r--r--arch/x86/kvm/ioapic.c193
-rw-r--r--arch/x86/kvm/ioapic.h6
-rw-r--r--arch/x86/kvm/irq.h3
-rw-r--r--arch/x86/kvm/irq_comm.c18
-rw-r--r--arch/x86/kvm/lapic.c71
-rw-r--r--arch/x86/kvm/lapic.h10
-rw-r--r--arch/x86/kvm/mmu.h13
-rw-r--r--arch/x86/kvm/mmu/mmu.c651
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h90
-rw-r--r--arch/x86/kvm/mmutrace.h14
-rw-r--r--arch/x86/kvm/mtrr.c8
-rw-r--r--arch/x86/kvm/pmu.h18
-rw-r--r--arch/x86/kvm/svm.c373
-rw-r--r--arch/x86/kvm/trace.h19
-rw-r--r--arch/x86/kvm/vmx/capabilities.h6
-rw-r--r--arch/x86/kvm/vmx/evmcs.c90
-rw-r--r--arch/x86/kvm/vmx/evmcs.h3
-rw-r--r--arch/x86/kvm/vmx/nested.c333
-rw-r--r--arch/x86/kvm/vmx/nested.h10
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c33
-rw-r--r--arch/x86/kvm/vmx/vmcs_shadow_fields.h4
-rw-r--r--arch/x86/kvm/vmx/vmx.c561
-rw-r--r--arch/x86/kvm/vmx/vmx.h5
-rw-r--r--arch/x86/kvm/x86.c764
-rw-r--r--arch/x86/kvm/x86.h23
-rw-r--r--arch/x86/lib/insn-eval.c26
-rw-r--r--arch/x86/lib/memmove_64.S7
-rw-r--r--arch/x86/lib/x86-opcode-map.txt2
-rw-r--r--arch/x86/mm/Makefile13
-rw-r--r--arch/x86/mm/debug_pagetables.c18
-rw-r--r--arch/x86/mm/dump_pagetables.c319
-rw-r--r--arch/x86/mm/fault.c66
-rw-r--r--arch/x86/mm/hugetlbpage.c5
-rw-r--r--arch/x86/mm/init_32.c29
-rw-r--r--arch/x86/mm/init_64.c36
-rw-r--r--arch/x86/mm/iomap_32.c6
-rw-r--r--arch/x86/mm/ioremap.c30
-rw-r--r--arch/x86/mm/kasan_init_64.c21
-rw-r--r--arch/x86/mm/mmap.c2
-rw-r--r--arch/x86/mm/mpx.c938
-rw-r--r--arch/x86/mm/pat/Makefile5
-rw-r--r--arch/x86/mm/pat/cpa-test.c (renamed from arch/x86/mm/pageattr-test.c)0
-rw-r--r--arch/x86/mm/pat/memtype.c (renamed from arch/x86/mm/pat.c)203
-rw-r--r--arch/x86/mm/pat/memtype.h (renamed from arch/x86/mm/pat_internal.h)12
-rw-r--r--arch/x86/mm/pat/memtype_interval.c194
-rw-r--r--arch/x86/mm/pat/set_memory.c (renamed from arch/x86/mm/pageattr.c)43
-rw-r--r--arch/x86/mm/pat_interval.c185
-rw-r--r--arch/x86/mm/pgtable_32.c1
-rw-r--r--arch/x86/mm/physaddr.c1
-rw-r--r--arch/x86/mm/testmmiotrace.c4
-rw-r--r--arch/x86/mm/tlb.c2
-rw-r--r--arch/x86/net/bpf_jit_comp.c166
-rw-r--r--arch/x86/pci/common.c48
-rw-r--r--arch/x86/pci/i386.c2
-rw-r--r--arch/x86/pci/mmconfig_64.c2
-rw-r--r--arch/x86/platform/efi/Makefile3
-rw-r--r--arch/x86/platform/efi/efi.c398
-rw-r--r--arch/x86/platform/efi/efi_32.c24
-rw-r--r--arch/x86/platform/efi/efi_64.c472
-rw-r--r--arch/x86/platform/efi/efi_stub_32.S109
-rw-r--r--arch/x86/platform/efi/efi_stub_64.S43
-rw-r--r--arch/x86/platform/efi/efi_thunk_64.S121
-rw-r--r--arch/x86/platform/efi/quirks.c47
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c26
-rw-r--r--arch/x86/platform/intel-quark/imr.c2
-rw-r--r--arch/x86/platform/intel-quark/imr_selftest.c2
-rw-r--r--arch/x86/platform/uv/bios_uv.c169
-rw-r--r--arch/x86/platform/uv/tlb_uv.c14
-rw-r--r--arch/x86/realmode/rm/Makefile2
-rw-r--r--arch/x86/tools/Makefile4
-rw-r--r--arch/x86/xen/Kconfig8
-rw-r--r--arch/x86/xen/efi.c2
-rw-r--r--arch/x86/xen/enlighten_pv.c33
-rw-r--r--arch/x86/xen/mmu_pv.c2
288 files changed, 11224 insertions, 8696 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5e8949953660..beea77046f9b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -93,10 +93,11 @@ config X86
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+ select ARCH_WANT_DEFAULT_BPF_JIT if X86_64
select ARCH_WANTS_DYNAMIC_TASK_STRUCT
select ARCH_WANT_HUGE_PMD_SHARE
select ARCH_WANTS_THP_SWAP if X86_64
- select BUILDTIME_EXTABLE_SORT
+ select BUILDTIME_TABLE_SORT
select CLKEVT_I8253
select CLOCKSOURCE_VALIDATE_LAST_CYCLE
select CLOCKSOURCE_WATCHDOG
@@ -119,11 +120,13 @@ config X86
select GENERIC_IRQ_RESERVATION_MODE
select GENERIC_IRQ_SHOW
select GENERIC_PENDING_IRQ if SMP
+ select GENERIC_PTDUMP
select GENERIC_SMP_IDLE_THREAD
select GENERIC_STRNCPY_FROM_USER
select GENERIC_STRNLEN_USER
select GENERIC_TIME_VSYSCALL
select GENERIC_GETTIMEOFDAY
+ select GENERIC_VDSO_TIME_NS
select GUP_GET_PTE_LOW_HIGH if X86_PAE
select HARDLOCKUP_CHECK_TIMESTAMP if X86_64
select HAVE_ACPI_APEI if ACPI
@@ -200,7 +203,7 @@ config X86
select HAVE_PCI
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
- select HAVE_RCU_TABLE_FREE if PARAVIRT
+ select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION
select HAVE_FUNCTION_ARG_ACCESS_API
@@ -439,8 +442,8 @@ config X86_MPPARSE
(esp with 64bit cpus) with acpi support, MADT and DSDT will override it
config GOLDFISH
- def_bool y
- depends on X86_GOLDFISH
+ def_bool y
+ depends on X86_GOLDFISH
config RETPOLINE
bool "Avoid speculative indirect branches in kernel"
@@ -456,6 +459,7 @@ config X86_CPU_RESCTRL
bool "x86 CPU resource control support"
depends on X86 && (CPU_SUP_INTEL || CPU_SUP_AMD)
select KERNFS
+ select PROC_CPU_RESCTRL if PROC_FS
help
Enable x86 CPU resource control support.
@@ -477,7 +481,7 @@ config X86_BIGSMP
bool "Support for big SMP systems with more than 8 CPUs"
depends on SMP
---help---
- This option is needed for the systems that have more than 8 CPUs
+ This option is needed for the systems that have more than 8 CPUs.
config X86_EXTENDED_PLATFORM
bool "Support for extended (non-PC) x86 platforms"
@@ -561,9 +565,9 @@ config X86_UV
# Please maintain the alphabetic order if and when there are additions
config X86_GOLDFISH
- bool "Goldfish (Virtual Platform)"
- depends on X86_EXTENDED_PLATFORM
- ---help---
+ bool "Goldfish (Virtual Platform)"
+ depends on X86_EXTENDED_PLATFORM
+ ---help---
Enable support for the Goldfish virtual platform used primarily
for Android development. Unless you are building for the Android
Goldfish emulator say N here.
@@ -806,9 +810,9 @@ config KVM_GUEST
timing infrastructure such as time of day, and system time
config ARCH_CPUIDLE_HALTPOLL
- def_bool n
- prompt "Disable host haltpoll when loading haltpoll driver"
- help
+ def_bool n
+ prompt "Disable host haltpoll when loading haltpoll driver"
+ help
If virtualized under KVM, disable host haltpoll.
config PVH
@@ -887,16 +891,16 @@ config HPET_EMULATE_RTC
depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y)
config APB_TIMER
- def_bool y if X86_INTEL_MID
- prompt "Intel MID APB Timer Support" if X86_INTEL_MID
- select DW_APB_TIMER
- depends on X86_INTEL_MID && SFI
- help
- APB timer is the replacement for 8254, HPET on X86 MID platforms.
- The APBT provides a stable time base on SMP
- systems, unlike the TSC, but it is more expensive to access,
- as it is off-chip. APB timers are always running regardless of CPU
- C states, they are used as per CPU clockevent device when possible.
+ def_bool y if X86_INTEL_MID
+ prompt "Intel MID APB Timer Support" if X86_INTEL_MID
+ select DW_APB_TIMER
+ depends on X86_INTEL_MID && SFI
+ help
+ APB timer is the replacement for 8254, HPET on X86 MID platforms.
+ The APBT provides a stable time base on SMP
+ systems, unlike the TSC, but it is more expensive to access,
+ as it is off-chip. APB timers are always running regardless of CPU
+ C states, they are used as per CPU clockevent device when possible.
# Mark as expert because too many people got it wrong.
# The code disables itself when not needed.
@@ -1035,8 +1039,8 @@ config SCHED_MC_PRIO
If unsure say Y here.
config UP_LATE_INIT
- def_bool y
- depends on !SMP && X86_LOCAL_APIC
+ def_bool y
+ depends on !SMP && X86_LOCAL_APIC
config X86_UP_APIC
bool "Local APIC support on uniprocessors" if !PCI_MSI
@@ -1185,8 +1189,8 @@ config X86_LEGACY_VM86
If unsure, say N here.
config VM86
- bool
- default X86_LEGACY_VM86
+ bool
+ default X86_LEGACY_VM86
config X86_16BIT
bool "Enable support for 16-bit segments" if EXPERT
@@ -1207,10 +1211,10 @@ config X86_ESPFIX64
depends on X86_16BIT && X86_64
config X86_VSYSCALL_EMULATION
- bool "Enable vsyscall emulation" if EXPERT
- default y
- depends on X86_64
- ---help---
+ bool "Enable vsyscall emulation" if EXPERT
+ default y
+ depends on X86_64
+ ---help---
This enables emulation of the legacy vsyscall page. Disabling
it is roughly equivalent to booting with vsyscall=none, except
that it will also disable the helpful warning if a program
@@ -1512,7 +1516,7 @@ config X86_CPA_STATISTICS
bool "Enable statistic for Change Page Attribute"
depends on DEBUG_FS
---help---
- Expose statistics about the Change Page Attribute mechanims, which
+ Expose statistics about the Change Page Attribute mechanism, which
helps to determine the effectiveness of preserving large and huge
page mappings when mapping protections are changed.
@@ -1543,12 +1547,12 @@ config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT
# Common NUMA Features
config NUMA
- bool "Numa Memory Allocation and Scheduler Support"
+ bool "NUMA Memory Allocation and Scheduler Support"
depends on SMP
depends on X86_64 || (X86_32 && HIGHMEM64G && X86_BIGSMP)
default y if X86_BIGSMP
---help---
- Enable NUMA (Non Uniform Memory Access) support.
+ Enable NUMA (Non-Uniform Memory Access) support.
The kernel will try to allocate memory used by a CPU on the
local memory controller of the CPU and add some more
@@ -1648,9 +1652,9 @@ config ARCH_PROC_KCORE_TEXT
depends on X86_64 && PROC_KCORE
config ILLEGAL_POINTER_VALUE
- hex
- default 0 if X86_32
- default 0xdead000000000000 if X86_64
+ hex
+ default 0 if X86_32
+ default 0xdead000000000000 if X86_64
config X86_PMEM_LEGACY_DEVICE
bool
@@ -1885,34 +1889,6 @@ config X86_UMIP
specific cases in protected and virtual-8086 modes. Emulated
results are dummy.
-config X86_INTEL_MPX
- prompt "Intel MPX (Memory Protection Extensions)"
- def_bool n
- # Note: only available in 64-bit mode due to VMA flags shortage
- depends on CPU_SUP_INTEL && X86_64
- select ARCH_USES_HIGH_VMA_FLAGS
- ---help---
- MPX provides hardware features that can be used in
- conjunction with compiler-instrumented code to check
- memory references. It is designed to detect buffer
- overflow or underflow bugs.
-
- This option enables running applications which are
- instrumented or otherwise use MPX. It does not use MPX
- itself inside the kernel or to protect the kernel
- against bad memory references.
-
- Enabling this option will make the kernel larger:
- ~8k of kernel text and 36 bytes of data on a 64-bit
- defconfig. It adds a long to the 'mm_struct' which
- will increase the kernel memory overhead of each
- process and adds some branches to paths used during
- exec() and munmap().
-
- For details, see Documentation/x86/intel_mpx.rst
-
- If unsure, say N.
-
config X86_INTEL_MEMORY_PROTECTION_KEYS
prompt "Intel Memory Protection Keys"
def_bool y
@@ -1991,11 +1967,12 @@ config EFI
platforms.
config EFI_STUB
- bool "EFI stub support"
- depends on EFI && !X86_USE_3DNOW
- select RELOCATABLE
- ---help---
- This kernel feature allows a bzImage to be loaded directly
+ bool "EFI stub support"
+ depends on EFI && !X86_USE_3DNOW
+ depends on $(cc-option,-mabi=ms) || X86_32
+ select RELOCATABLE
+ ---help---
+ This kernel feature allows a bzImage to be loaded directly
by EFI firmware without the use of a bootloader.
See Documentation/admin-guide/efi-stub.rst for more information.
@@ -2955,9 +2932,6 @@ config HAVE_ATOMIC_IOMAP
def_bool y
depends on X86_32
-config X86_DEV_DMA_OPS
- bool
-
source "drivers/firmware/Kconfig"
source "arch/x86/kvm/Kconfig"
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index af9c967782f6..bc3a497c029c 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -387,6 +387,14 @@ config X86_DEBUGCTLMSR
def_bool y
depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486SX || M486) && !UML
+config IA32_FEAT_CTL
+ def_bool y
+ depends on CPU_SUP_INTEL || CPU_SUP_CENTAUR || CPU_SUP_ZHAOXIN
+
+config X86_VMX_FEATURE_NAMES
+ def_bool y
+ depends on IA32_FEAT_CTL && X86_FEATURE_NAMES
+
menuconfig PROCESSOR_SELECT
bool "Supported processor vendors" if EXPERT
---help---
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index c4eab8ed33a3..2e74690b028a 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -62,26 +62,10 @@ config EARLY_PRINTK_USB_XDBC
config MCSAFE_TEST
def_bool n
-config X86_PTDUMP_CORE
- def_bool n
-
-config X86_PTDUMP
- tristate "Export kernel pagetable layout to userspace via debugfs"
- depends on DEBUG_KERNEL
- select DEBUG_FS
- select X86_PTDUMP_CORE
- ---help---
- Say Y here if you want to show the kernel pagetable layout in a
- debugfs file. This information is only useful for kernel developers
- who are working in architecture specific areas of the kernel.
- It is probably not a good idea to enable this feature in a production
- kernel.
- If in doubt, say "N"
-
config EFI_PGT_DUMP
bool "Dump the EFI pagetable"
depends on EFI
- select X86_PTDUMP_CORE
+ select PTDUMP_CORE
---help---
Enable this if you want to dump the EFI page table before
enabling virtual mode. This can be used to debug miscellaneous
@@ -90,7 +74,7 @@ config EFI_PGT_DUMP
config DEBUG_WX
bool "Warn on W+X mappings at boot"
- select X86_PTDUMP_CORE
+ select PTDUMP_CORE
---help---
Generate a warning if any W+X mappings are found at boot.
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 94df0868804b..513a55562d75 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -194,9 +194,10 @@ avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
avx512_instr :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,-DCONFIG_AS_AVX512=1)
sha1_ni_instr :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA1_NI=1)
sha256_ni_instr :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,-DCONFIG_AS_SHA256_NI=1)
+adx_instr := $(call as-instr,adox %r10$(comma)%r10,-DCONFIG_AS_ADX=1)
-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr)
-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr)
+KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr) $(adx_instr)
+KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) $(avx512_instr) $(sha1_ni_instr) $(sha256_ni_instr) $(adx_instr)
KBUILD_LDFLAGS := -m elf_$(UTS_MACHINE)
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 95410d6ee2ff..012b82fc8617 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -45,8 +45,8 @@ setup-y += video-vesa.o
setup-y += video-bios.o
targets += $(setup-y)
-hostprogs-y := tools/build
-hostprogs-$(CONFIG_X86_FEATURE_NAMES) += mkcpustr
+hostprogs := tools/build
+hostprogs += mkcpustr
HOST_EXTRACFLAGS += -I$(srctree)/tools/include \
-include include/generated/autoconf.h \
@@ -88,7 +88,7 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
SETUP_OBJS = $(addprefix $(obj)/,$(setup-y))
-sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|kernel_info\|_end\|_ehead\|_text\|z_.*\)$$/\#define ZO_\2 0x\1/p'
+sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [a-zA-Z] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|kernel_info\|_end\|_ehead\|_text\|z_.*\)$$/\#define ZO_\2 0x\1/p'
quiet_cmd_zoffset = ZOFFSET $@
cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 1dac210f7d44..26050ae0b27e 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -58,7 +58,7 @@ KBUILD_LDFLAGS += $(shell $(LD) --help 2>&1 | grep -q "\-z noreloc-overflow" \
endif
LDFLAGS_vmlinux := -T
-hostprogs-y := mkpiggy
+hostprogs := mkpiggy
HOST_EXTRACFLAGS += -I$(srctree)/tools/include
sed-voffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(_text\|__bss_start\|_end\)$$/\#define VO_\2 _AC(0x\1,UL)/p'
@@ -89,7 +89,7 @@ vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o
$(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone
-vmlinux-objs-$(CONFIG_EFI_STUB) += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o \
+vmlinux-objs-$(CONFIG_EFI_STUB) += $(obj)/eboot.o \
$(objtree)/drivers/firmware/efi/libstub/lib.a
vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o
diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c
index 25019d42ae93..ef2ad7253cd5 100644
--- a/arch/x86/boot/compressed/acpi.c
+++ b/arch/x86/boot/compressed/acpi.c
@@ -393,7 +393,13 @@ int count_immovable_mem_regions(void)
table = table_addr + sizeof(struct acpi_table_srat);
while (table + sizeof(struct acpi_subtable_header) < table_end) {
+
sub_table = (struct acpi_subtable_header *)table;
+ if (!sub_table->length) {
+ debug_putstr("Invalid zero length SRAT subtable.\n");
+ return 0;
+ }
+
if (sub_table->type == ACPI_SRAT_TYPE_MEMORY_AFFINITY) {
struct acpi_srat_mem_affinity *ma;
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index 72b08fde6de6..287393d725f0 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -6,6 +6,8 @@
*
* ----------------------------------------------------------------------- */
+#pragma GCC visibility push(hidden)
+
#include <linux/efi.h>
#include <linux/pci.h>
@@ -19,32 +21,18 @@
#include "eboot.h"
static efi_system_table_t *sys_table;
+extern const bool efi_is64;
-static struct efi_config *efi_early;
-
-__pure const struct efi_config *__efi_early(void)
+__pure efi_system_table_t *efi_system_table(void)
{
- return efi_early;
+ return sys_table;
}
-#define BOOT_SERVICES(bits) \
-static void setup_boot_services##bits(struct efi_config *c) \
-{ \
- efi_system_table_##bits##_t *table; \
- \
- table = (typeof(table))sys_table; \
- \
- c->runtime_services = table->runtime; \
- c->boot_services = table->boottime; \
- c->text_output = table->con_out; \
-}
-BOOT_SERVICES(32);
-BOOT_SERVICES(64);
-
-void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str)
+__attribute_const__ bool efi_is_64bit(void)
{
- efi_call_proto(efi_simple_text_output_protocol, output_string,
- efi_early->text_output, str);
+ if (IS_ENABLED(CONFIG_EFI_MIXED))
+ return efi_is64;
+ return IS_ENABLED(CONFIG_X86_64);
}
static efi_status_t
@@ -63,17 +51,17 @@ preserve_pci_rom_image(efi_pci_io_protocol_t *pci, struct pci_setup_rom **__rom)
* large romsize. The UEFI spec limits the size of option ROMs to 16
* MiB so we reject any ROMs over 16 MiB in size to catch this.
*/
- romimage = (void *)(unsigned long)efi_table_attr(efi_pci_io_protocol,
- romimage, pci);
- romsize = efi_table_attr(efi_pci_io_protocol, romsize, pci);
+ romimage = efi_table_attr(pci, romimage);
+ romsize = efi_table_attr(pci, romsize);
if (!romimage || !romsize || romsize > SZ_16M)
return EFI_INVALID_PARAMETER;
size = romsize + sizeof(*rom);
- status = efi_call_early(allocate_pool, EFI_LOADER_DATA, size, &rom);
+ status = efi_bs_call(allocate_pool, EFI_LOADER_DATA, size,
+ (void **)&rom);
if (status != EFI_SUCCESS) {
- efi_printk(sys_table, "Failed to allocate memory for 'rom'\n");
+ efi_printk("Failed to allocate memory for 'rom'\n");
return status;
}
@@ -85,27 +73,24 @@ preserve_pci_rom_image(efi_pci_io_protocol_t *pci, struct pci_setup_rom **__rom)
rom->pcilen = pci->romsize;
*__rom = rom;
- status = efi_call_proto(efi_pci_io_protocol, pci.read, pci,
- EfiPciIoWidthUint16, PCI_VENDOR_ID, 1,
- &rom->vendor);
+ status = efi_call_proto(pci, pci.read, EfiPciIoWidthUint16,
+ PCI_VENDOR_ID, 1, &rom->vendor);
if (status != EFI_SUCCESS) {
- efi_printk(sys_table, "Failed to read rom->vendor\n");
+ efi_printk("Failed to read rom->vendor\n");
goto free_struct;
}
- status = efi_call_proto(efi_pci_io_protocol, pci.read, pci,
- EfiPciIoWidthUint16, PCI_DEVICE_ID, 1,
- &rom->devid);
+ status = efi_call_proto(pci, pci.read, EfiPciIoWidthUint16,
+ PCI_DEVICE_ID, 1, &rom->devid);
if (status != EFI_SUCCESS) {
- efi_printk(sys_table, "Failed to read rom->devid\n");
+ efi_printk("Failed to read rom->devid\n");
goto free_struct;
}
- status = efi_call_proto(efi_pci_io_protocol, get_location, pci,
- &rom->segment, &rom->bus, &rom->device,
- &rom->function);
+ status = efi_call_proto(pci, get_location, &rom->segment, &rom->bus,
+ &rom->device, &rom->function);
if (status != EFI_SUCCESS)
goto free_struct;
@@ -114,7 +99,7 @@ preserve_pci_rom_image(efi_pci_io_protocol_t *pci, struct pci_setup_rom **__rom)
return status;
free_struct:
- efi_call_early(free_pool, rom);
+ efi_bs_call(free_pool, rom);
return status;
}
@@ -133,27 +118,24 @@ static void setup_efi_pci(struct boot_params *params)
void **pci_handle = NULL;
efi_guid_t pci_proto = EFI_PCI_IO_PROTOCOL_GUID;
unsigned long size = 0;
- unsigned long nr_pci;
struct setup_data *data;
+ efi_handle_t h;
int i;
- status = efi_call_early(locate_handle,
- EFI_LOCATE_BY_PROTOCOL,
- &pci_proto, NULL, &size, pci_handle);
+ status = efi_bs_call(locate_handle, EFI_LOCATE_BY_PROTOCOL,
+ &pci_proto, NULL, &size, pci_handle);
if (status == EFI_BUFFER_TOO_SMALL) {
- status = efi_call_early(allocate_pool,
- EFI_LOADER_DATA,
- size, (void **)&pci_handle);
+ status = efi_bs_call(allocate_pool, EFI_LOADER_DATA, size,
+ (void **)&pci_handle);
if (status != EFI_SUCCESS) {
- efi_printk(sys_table, "Failed to allocate memory for 'pci_handle'\n");
+ efi_printk("Failed to allocate memory for 'pci_handle'\n");
return;
}
- status = efi_call_early(locate_handle,
- EFI_LOCATE_BY_PROTOCOL, &pci_proto,
- NULL, &size, pci_handle);
+ status = efi_bs_call(locate_handle, EFI_LOCATE_BY_PROTOCOL,
+ &pci_proto, NULL, &size, pci_handle);
}
if (status != EFI_SUCCESS)
@@ -164,15 +146,12 @@ static void setup_efi_pci(struct boot_params *params)
while (data && data->next)
data = (struct setup_data *)(unsigned long)data->next;
- nr_pci = size / (efi_is_64bit() ? sizeof(u64) : sizeof(u32));
- for (i = 0; i < nr_pci; i++) {
+ for_each_efi_handle(h, pci_handle, size, i) {
efi_pci_io_protocol_t *pci = NULL;
struct pci_setup_rom *rom;
- status = efi_call_early(handle_protocol,
- efi_is_64bit() ? ((u64 *)pci_handle)[i]
- : ((u32 *)pci_handle)[i],
- &pci_proto, (void **)&pci);
+ status = efi_bs_call(handle_protocol, h, &pci_proto,
+ (void **)&pci);
if (status != EFI_SUCCESS || !pci)
continue;
@@ -189,7 +168,7 @@ static void setup_efi_pci(struct boot_params *params)
}
free_handle:
- efi_call_early(free_pool, pci_handle);
+ efi_bs_call(free_pool, pci_handle);
}
static void retrieve_apple_device_properties(struct boot_params *boot_params)
@@ -198,34 +177,34 @@ static void retrieve_apple_device_properties(struct boot_params *boot_params)
struct setup_data *data, *new;
efi_status_t status;
u32 size = 0;
- void *p;
+ apple_properties_protocol_t *p;
- status = efi_call_early(locate_protocol, &guid, NULL, &p);
+ status = efi_bs_call(locate_protocol, &guid, NULL, (void **)&p);
if (status != EFI_SUCCESS)
return;
- if (efi_table_attr(apple_properties_protocol, version, p) != 0x10000) {
- efi_printk(sys_table, "Unsupported properties proto version\n");
+ if (efi_table_attr(p, version) != 0x10000) {
+ efi_printk("Unsupported properties proto version\n");
return;
}
- efi_call_proto(apple_properties_protocol, get_all, p, NULL, &size);
+ efi_call_proto(p, get_all, NULL, &size);
if (!size)
return;
do {
- status = efi_call_early(allocate_pool, EFI_LOADER_DATA,
- size + sizeof(struct setup_data), &new);
+ status = efi_bs_call(allocate_pool, EFI_LOADER_DATA,
+ size + sizeof(struct setup_data),
+ (void **)&new);
if (status != EFI_SUCCESS) {
- efi_printk(sys_table, "Failed to allocate memory for 'properties'\n");
+ efi_printk("Failed to allocate memory for 'properties'\n");
return;
}
- status = efi_call_proto(apple_properties_protocol, get_all, p,
- new->data, &size);
+ status = efi_call_proto(p, get_all, new->data, &size);
if (status == EFI_BUFFER_TOO_SMALL)
- efi_call_early(free_pool, new);
+ efi_bs_call(free_pool, new);
} while (status == EFI_BUFFER_TOO_SMALL);
new->type = SETUP_APPLE_PROPERTIES;
@@ -247,7 +226,7 @@ static const efi_char16_t apple[] = L"Apple";
static void setup_quirks(struct boot_params *boot_params)
{
efi_char16_t *fw_vendor = (efi_char16_t *)(unsigned long)
- efi_table_attr(efi_system_table, fw_vendor, sys_table);
+ efi_table_attr(efi_system_table(), fw_vendor);
if (!memcmp(fw_vendor, apple, sizeof(apple))) {
if (IS_ENABLED(CONFIG_APPLE_PROPERTIES))
@@ -265,17 +244,16 @@ setup_uga(struct screen_info *si, efi_guid_t *uga_proto, unsigned long size)
u32 width, height;
void **uga_handle = NULL;
efi_uga_draw_protocol_t *uga = NULL, *first_uga;
- unsigned long nr_ugas;
+ efi_handle_t handle;
int i;
- status = efi_call_early(allocate_pool, EFI_LOADER_DATA,
- size, (void **)&uga_handle);
+ status = efi_bs_call(allocate_pool, EFI_LOADER_DATA, size,
+ (void **)&uga_handle);
if (status != EFI_SUCCESS)
return status;
- status = efi_call_early(locate_handle,
- EFI_LOCATE_BY_PROTOCOL,
- uga_proto, NULL, &size, uga_handle);
+ status = efi_bs_call(locate_handle, EFI_LOCATE_BY_PROTOCOL,
+ uga_proto, NULL, &size, uga_handle);
if (status != EFI_SUCCESS)
goto free_handle;
@@ -283,24 +261,20 @@ setup_uga(struct screen_info *si, efi_guid_t *uga_proto, unsigned long size)
width = 0;
first_uga = NULL;
- nr_ugas = size / (efi_is_64bit() ? sizeof(u64) : sizeof(u32));
- for (i = 0; i < nr_ugas; i++) {
+ for_each_efi_handle(handle, uga_handle, size, i) {
efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID;
u32 w, h, depth, refresh;
void *pciio;
- unsigned long handle = efi_is_64bit() ? ((u64 *)uga_handle)[i]
- : ((u32 *)uga_handle)[i];
- status = efi_call_early(handle_protocol, handle,
- uga_proto, (void **)&uga);
+ status = efi_bs_call(handle_protocol, handle, uga_proto,
+ (void **)&uga);
if (status != EFI_SUCCESS)
continue;
pciio = NULL;
- efi_call_early(handle_protocol, handle, &pciio_proto, &pciio);
+ efi_bs_call(handle_protocol, handle, &pciio_proto, &pciio);
- status = efi_call_proto(efi_uga_draw_protocol, get_mode, uga,
- &w, &h, &depth, &refresh);
+ status = efi_call_proto(uga, get_mode, &w, &h, &depth, &refresh);
if (status == EFI_SUCCESS && (!first_uga || pciio)) {
width = w;
height = h;
@@ -336,7 +310,7 @@ setup_uga(struct screen_info *si, efi_guid_t *uga_proto, unsigned long size)
si->rsvd_pos = 24;
free_handle:
- efi_call_early(free_pool, uga_handle);
+ efi_bs_call(free_pool, uga_handle);
return status;
}
@@ -355,37 +329,38 @@ void setup_graphics(struct boot_params *boot_params)
memset(si, 0, sizeof(*si));
size = 0;
- status = efi_call_early(locate_handle,
- EFI_LOCATE_BY_PROTOCOL,
- &graphics_proto, NULL, &size, gop_handle);
+ status = efi_bs_call(locate_handle, EFI_LOCATE_BY_PROTOCOL,
+ &graphics_proto, NULL, &size, gop_handle);
if (status == EFI_BUFFER_TOO_SMALL)
- status = efi_setup_gop(NULL, si, &graphics_proto, size);
+ status = efi_setup_gop(si, &graphics_proto, size);
if (status != EFI_SUCCESS) {
size = 0;
- status = efi_call_early(locate_handle,
- EFI_LOCATE_BY_PROTOCOL,
- &uga_proto, NULL, &size, uga_handle);
+ status = efi_bs_call(locate_handle, EFI_LOCATE_BY_PROTOCOL,
+ &uga_proto, NULL, &size, uga_handle);
if (status == EFI_BUFFER_TOO_SMALL)
setup_uga(si, &uga_proto, size);
}
}
+void startup_32(struct boot_params *boot_params);
+
+void __noreturn efi_stub_entry(efi_handle_t handle,
+ efi_system_table_t *sys_table_arg,
+ struct boot_params *boot_params);
+
/*
* Because the x86 boot code expects to be passed a boot_params we
* need to create one ourselves (usually the bootloader would create
* one for us).
- *
- * The caller is responsible for filling out ->code32_start in the
- * returned boot_params.
*/
-struct boot_params *make_boot_params(struct efi_config *c)
+efi_status_t __efiapi efi_pe_entry(efi_handle_t handle,
+ efi_system_table_t *sys_table_arg)
{
struct boot_params *boot_params;
struct apm_bios_info *bi;
struct setup_header *hdr;
efi_loaded_image_t *image;
- void *handle;
efi_guid_t proto = LOADED_IMAGE_PROTOCOL_GUID;
int options_size = 0;
efi_status_t status;
@@ -393,31 +368,22 @@ struct boot_params *make_boot_params(struct efi_config *c)
unsigned long ramdisk_addr;
unsigned long ramdisk_size;
- efi_early = c;
- sys_table = (efi_system_table_t *)(unsigned long)efi_early->table;
- handle = (void *)(unsigned long)efi_early->image_handle;
+ sys_table = sys_table_arg;
/* Check if we were booted by the EFI firmware */
if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
- return NULL;
-
- if (efi_is_64bit())
- setup_boot_services64(efi_early);
- else
- setup_boot_services32(efi_early);
+ return EFI_INVALID_PARAMETER;
- status = efi_call_early(handle_protocol, handle,
- &proto, (void *)&image);
+ status = efi_bs_call(handle_protocol, handle, &proto, (void *)&image);
if (status != EFI_SUCCESS) {
- efi_printk(sys_table, "Failed to get handle for LOADED_IMAGE_PROTOCOL\n");
- return NULL;
+ efi_printk("Failed to get handle for LOADED_IMAGE_PROTOCOL\n");
+ return status;
}
- status = efi_low_alloc(sys_table, 0x4000, 1,
- (unsigned long *)&boot_params);
+ status = efi_low_alloc(0x4000, 1, (unsigned long *)&boot_params);
if (status != EFI_SUCCESS) {
- efi_printk(sys_table, "Failed to allocate lowmem for boot params\n");
- return NULL;
+ efi_printk("Failed to allocate lowmem for boot params\n");
+ return status;
}
memset(boot_params, 0x0, 0x4000);
@@ -439,7 +405,7 @@ struct boot_params *make_boot_params(struct efi_config *c)
hdr->type_of_loader = 0x21;
/* Convert unicode cmdline to ascii */
- cmdline_ptr = efi_convert_cmdline(sys_table, image, &options_size);
+ cmdline_ptr = efi_convert_cmdline(image, &options_size);
if (!cmdline_ptr)
goto fail;
@@ -457,15 +423,15 @@ struct boot_params *make_boot_params(struct efi_config *c)
if (status != EFI_SUCCESS)
goto fail2;
- status = handle_cmdline_files(sys_table, image,
+ status = handle_cmdline_files(image,
(char *)(unsigned long)hdr->cmd_line_ptr,
"initrd=", hdr->initrd_addr_max,
&ramdisk_addr, &ramdisk_size);
if (status != EFI_SUCCESS &&
hdr->xloadflags & XLF_CAN_BE_LOADED_ABOVE_4G) {
- efi_printk(sys_table, "Trying to load files to higher address\n");
- status = handle_cmdline_files(sys_table, image,
+ efi_printk("Trying to load files to higher address\n");
+ status = handle_cmdline_files(image,
(char *)(unsigned long)hdr->cmd_line_ptr,
"initrd=", -1UL,
&ramdisk_addr, &ramdisk_size);
@@ -478,14 +444,17 @@ struct boot_params *make_boot_params(struct efi_config *c)
boot_params->ext_ramdisk_image = (u64)ramdisk_addr >> 32;
boot_params->ext_ramdisk_size = (u64)ramdisk_size >> 32;
- return boot_params;
+ hdr->code32_start = (u32)(unsigned long)startup_32;
+
+ efi_stub_entry(handle, sys_table, boot_params);
+ /* not reached */
fail2:
- efi_free(sys_table, options_size, hdr->cmd_line_ptr);
+ efi_free(options_size, hdr->cmd_line_ptr);
fail:
- efi_free(sys_table, 0x4000, (unsigned long)boot_params);
+ efi_free(0x4000, (unsigned long)boot_params);
- return NULL;
+ return status;
}
static void add_e820ext(struct boot_params *params,
@@ -620,13 +589,13 @@ static efi_status_t alloc_e820ext(u32 nr_desc, struct setup_data **e820ext,
sizeof(struct e820_entry) * nr_desc;
if (*e820ext) {
- efi_call_early(free_pool, *e820ext);
+ efi_bs_call(free_pool, *e820ext);
*e820ext = NULL;
*e820ext_size = 0;
}
- status = efi_call_early(allocate_pool, EFI_LOADER_DATA,
- size, (void **)e820ext);
+ status = efi_bs_call(allocate_pool, EFI_LOADER_DATA, size,
+ (void **)e820ext);
if (status == EFI_SUCCESS)
*e820ext_size = size;
@@ -650,7 +619,7 @@ static efi_status_t allocate_e820(struct boot_params *params,
boot_map.key_ptr = NULL;
boot_map.buff_size = &buff_size;
- status = efi_get_memory_map(sys_table, &boot_map);
+ status = efi_get_memory_map(&boot_map);
if (status != EFI_SUCCESS)
return status;
@@ -672,8 +641,7 @@ struct exit_boot_struct {
struct efi_info *efi;
};
-static efi_status_t exit_boot_func(efi_system_table_t *sys_table_arg,
- struct efi_boot_memmap *map,
+static efi_status_t exit_boot_func(struct efi_boot_memmap *map,
void *priv)
{
const char *signature;
@@ -683,14 +651,14 @@ static efi_status_t exit_boot_func(efi_system_table_t *sys_table_arg,
: EFI32_LOADER_SIGNATURE;
memcpy(&p->efi->efi_loader_signature, signature, sizeof(__u32));
- p->efi->efi_systab = (unsigned long)sys_table_arg;
+ p->efi->efi_systab = (unsigned long)efi_system_table();
p->efi->efi_memdesc_size = *map->desc_size;
p->efi->efi_memdesc_version = *map->desc_ver;
p->efi->efi_memmap = (unsigned long)*map->map;
p->efi->efi_memmap_size = *map->map_size;
#ifdef CONFIG_X86_64
- p->efi->efi_systab_hi = (unsigned long)sys_table_arg >> 32;
+ p->efi->efi_systab_hi = (unsigned long)efi_system_table() >> 32;
p->efi->efi_memmap_hi = (unsigned long)*map->map >> 32;
#endif
@@ -722,8 +690,7 @@ static efi_status_t exit_boot(struct boot_params *boot_params, void *handle)
return status;
/* Might as well exit boot services now */
- status = efi_exit_boot_services(sys_table, handle, &map, &priv,
- exit_boot_func);
+ status = efi_exit_boot_services(handle, &map, &priv, exit_boot_func);
if (status != EFI_SUCCESS)
return status;
@@ -741,33 +708,22 @@ static efi_status_t exit_boot(struct boot_params *boot_params, void *handle)
* On success we return a pointer to a boot_params structure, and NULL
* on failure.
*/
-struct boot_params *
-efi_main(struct efi_config *c, struct boot_params *boot_params)
+struct boot_params *efi_main(efi_handle_t handle,
+ efi_system_table_t *sys_table_arg,
+ struct boot_params *boot_params)
{
struct desc_ptr *gdt = NULL;
struct setup_header *hdr = &boot_params->hdr;
efi_status_t status;
struct desc_struct *desc;
- void *handle;
- efi_system_table_t *_table;
unsigned long cmdline_paddr;
- efi_early = c;
-
- _table = (efi_system_table_t *)(unsigned long)efi_early->table;
- handle = (void *)(unsigned long)efi_early->image_handle;
-
- sys_table = _table;
+ sys_table = sys_table_arg;
/* Check if we were booted by the EFI firmware */
if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
goto fail;
- if (efi_is_64bit())
- setup_boot_services64(efi_early);
- else
- setup_boot_services32(efi_early);
-
/*
* make_boot_params() may have been called before efi_main(), in which
* case this is the second time we parse the cmdline. This is ok,
@@ -782,14 +738,14 @@ efi_main(struct efi_config *c, struct boot_params *boot_params)
* otherwise we ask the BIOS.
*/
if (boot_params->secure_boot == efi_secureboot_mode_unset)
- boot_params->secure_boot = efi_get_secureboot(sys_table);
+ boot_params->secure_boot = efi_get_secureboot();
/* Ask the firmware to clear memory on unclean shutdown */
- efi_enable_reset_attack_mitigation(sys_table);
+ efi_enable_reset_attack_mitigation();
- efi_random_get_seed(sys_table);
+ efi_random_get_seed();
- efi_retrieve_tpm2_eventlog(sys_table);
+ efi_retrieve_tpm2_eventlog();
setup_graphics(boot_params);
@@ -797,18 +753,17 @@ efi_main(struct efi_config *c, struct boot_params *boot_params)
setup_quirks(boot_params);
- status = efi_call_early(allocate_pool, EFI_LOADER_DATA,
- sizeof(*gdt), (void **)&gdt);
+ status = efi_bs_call(allocate_pool, EFI_LOADER_DATA, sizeof(*gdt),
+ (void **)&gdt);
if (status != EFI_SUCCESS) {
- efi_printk(sys_table, "Failed to allocate memory for 'gdt' structure\n");
+ efi_printk("Failed to allocate memory for 'gdt' structure\n");
goto fail;
}
gdt->size = 0x800;
- status = efi_low_alloc(sys_table, gdt->size, 8,
- (unsigned long *)&gdt->address);
+ status = efi_low_alloc(gdt->size, 8, (unsigned long *)&gdt->address);
if (status != EFI_SUCCESS) {
- efi_printk(sys_table, "Failed to allocate memory for 'gdt'\n");
+ efi_printk("Failed to allocate memory for 'gdt'\n");
goto fail;
}
@@ -818,13 +773,13 @@ efi_main(struct efi_config *c, struct boot_params *boot_params)
*/
if (hdr->pref_address != hdr->code32_start) {
unsigned long bzimage_addr = hdr->code32_start;
- status = efi_relocate_kernel(sys_table, &bzimage_addr,
+ status = efi_relocate_kernel(&bzimage_addr,
hdr->init_size, hdr->init_size,
hdr->pref_address,
hdr->kernel_alignment,
LOAD_PHYSICAL_ADDR);
if (status != EFI_SUCCESS) {
- efi_printk(sys_table, "efi_relocate_kernel() failed!\n");
+ efi_printk("efi_relocate_kernel() failed!\n");
goto fail;
}
@@ -834,7 +789,7 @@ efi_main(struct efi_config *c, struct boot_params *boot_params)
status = exit_boot(boot_params, handle);
if (status != EFI_SUCCESS) {
- efi_printk(sys_table, "exit_boot() failed!\n");
+ efi_printk("exit_boot() failed!\n");
goto fail;
}
@@ -927,7 +882,8 @@ efi_main(struct efi_config *c, struct boot_params *boot_params)
return boot_params;
fail:
- efi_printk(sys_table, "efi_main() failed!\n");
+ efi_printk("efi_main() failed!\n");
- return NULL;
+ for (;;)
+ asm("hlt");
}
diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h
index 8297387c4676..99f35343d443 100644
--- a/arch/x86/boot/compressed/eboot.h
+++ b/arch/x86/boot/compressed/eboot.h
@@ -12,22 +12,20 @@
#define DESC_TYPE_CODE_DATA (1 << 0)
-typedef struct {
- u32 get_mode;
- u32 set_mode;
- u32 blt;
-} efi_uga_draw_protocol_32_t;
+typedef union efi_uga_draw_protocol efi_uga_draw_protocol_t;
-typedef struct {
- u64 get_mode;
- u64 set_mode;
- u64 blt;
-} efi_uga_draw_protocol_64_t;
-
-typedef struct {
- void *get_mode;
- void *set_mode;
- void *blt;
-} efi_uga_draw_protocol_t;
+union efi_uga_draw_protocol {
+ struct {
+ efi_status_t (__efiapi *get_mode)(efi_uga_draw_protocol_t *,
+ u32*, u32*, u32*, u32*);
+ void *set_mode;
+ void *blt;
+ };
+ struct {
+ u32 get_mode;
+ u32 set_mode;
+ u32 blt;
+ } mixed_mode;
+};
#endif /* BOOT_COMPRESSED_EBOOT_H */
diff --git a/arch/x86/boot/compressed/efi_stub_32.S b/arch/x86/boot/compressed/efi_stub_32.S
deleted file mode 100644
index ed6c351d34ed..000000000000
--- a/arch/x86/boot/compressed/efi_stub_32.S
+++ /dev/null
@@ -1,87 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * EFI call stub for IA32.
- *
- * This stub allows us to make EFI calls in physical mode with interrupts
- * turned off. Note that this implementation is different from the one in
- * arch/x86/platform/efi/efi_stub_32.S because we're _already_ in physical
- * mode at this point.
- */
-
-#include <linux/linkage.h>
-#include <asm/page_types.h>
-
-/*
- * efi_call_phys(void *, ...) is a function with variable parameters.
- * All the callers of this function assure that all the parameters are 4-bytes.
- */
-
-/*
- * In gcc calling convention, EBX, ESP, EBP, ESI and EDI are all callee save.
- * So we'd better save all of them at the beginning of this function and restore
- * at the end no matter how many we use, because we can not assure EFI runtime
- * service functions will comply with gcc calling convention, too.
- */
-
-.text
-SYM_FUNC_START(efi_call_phys)
- /*
- * 0. The function can only be called in Linux kernel. So CS has been
- * set to 0x0010, DS and SS have been set to 0x0018. In EFI, I found
- * the values of these registers are the same. And, the corresponding
- * GDT entries are identical. So I will do nothing about segment reg
- * and GDT, but change GDT base register in prelog and epilog.
- */
-
- /*
- * 1. Because we haven't been relocated by this point we need to
- * use relative addressing.
- */
- call 1f
-1: popl %edx
- subl $1b, %edx
-
- /*
- * 2. Now on the top of stack is the return
- * address in the caller of efi_call_phys(), then parameter 1,
- * parameter 2, ..., param n. To make things easy, we save the return
- * address of efi_call_phys in a global variable.
- */
- popl %ecx
- movl %ecx, saved_return_addr(%edx)
- /* get the function pointer into ECX*/
- popl %ecx
- movl %ecx, efi_rt_function_ptr(%edx)
-
- /*
- * 3. Call the physical function.
- */
- call *%ecx
-
- /*
- * 4. Balance the stack. And because EAX contain the return value,
- * we'd better not clobber it. We need to calculate our address
- * again because %ecx and %edx are not preserved across EFI function
- * calls.
- */
- call 1f
-1: popl %edx
- subl $1b, %edx
-
- movl efi_rt_function_ptr(%edx), %ecx
- pushl %ecx
-
- /*
- * 10. Push the saved return address onto the stack and return.
- */
- movl saved_return_addr(%edx), %ecx
- pushl %ecx
- ret
-SYM_FUNC_END(efi_call_phys)
-.previous
-
-.data
-saved_return_addr:
- .long 0
-efi_rt_function_ptr:
- .long 0
diff --git a/arch/x86/boot/compressed/efi_stub_64.S b/arch/x86/boot/compressed/efi_stub_64.S
deleted file mode 100644
index 99494dff2113..000000000000
--- a/arch/x86/boot/compressed/efi_stub_64.S
+++ /dev/null
@@ -1,5 +0,0 @@
-#include <asm/segment.h>
-#include <asm/msr.h>
-#include <asm/processor-flags.h>
-
-#include "../../platform/efi/efi_stub_64.S"
diff --git a/arch/x86/boot/compressed/efi_thunk_64.S b/arch/x86/boot/compressed/efi_thunk_64.S
index 593913692d16..8fb7f6799c52 100644
--- a/arch/x86/boot/compressed/efi_thunk_64.S
+++ b/arch/x86/boot/compressed/efi_thunk_64.S
@@ -10,7 +10,7 @@
* needs to be able to service interrupts.
*
* On the plus side, we don't have to worry about mangling 64-bit
- * addresses into 32-bits because we're executing with an identify
+ * addresses into 32-bits because we're executing with an identity
* mapped pagetable and haven't transitioned to 64-bit virtual addresses
* yet.
*/
@@ -23,16 +23,13 @@
.code64
.text
-SYM_FUNC_START(efi64_thunk)
+SYM_FUNC_START(__efi64_thunk)
push %rbp
push %rbx
- subq $8, %rsp
- leaq efi_exit32(%rip), %rax
- movl %eax, 4(%rsp)
- leaq efi_gdt64(%rip), %rax
- movl %eax, (%rsp)
- movl %eax, 2(%rax) /* Fixup the gdt base address */
+ leaq 1f(%rip), %rbp
+ leaq efi_gdt64(%rip), %rbx
+ movl %ebx, 2(%rbx) /* Fixup the gdt base address */
movl %ds, %eax
push %rax
@@ -48,15 +45,10 @@ SYM_FUNC_START(efi64_thunk)
movl %esi, 0x0(%rsp)
movl %edx, 0x4(%rsp)
movl %ecx, 0x8(%rsp)
- movq %r8, %rsi
- movl %esi, 0xc(%rsp)
- movq %r9, %rsi
- movl %esi, 0x10(%rsp)
+ movl %r8d, 0xc(%rsp)
+ movl %r9d, 0x10(%rsp)
- sgdt save_gdt(%rip)
-
- leaq 1f(%rip), %rbx
- movq %rbx, func_rt_ptr(%rip)
+ sgdt 0x14(%rsp)
/*
* Switch to gdt with 32-bit segments. This is the firmware GDT
@@ -71,9 +63,9 @@ SYM_FUNC_START(efi64_thunk)
pushq %rax
lretq
-1: addq $32, %rsp
-
- lgdt save_gdt(%rip)
+1: lgdt 0x14(%rsp)
+ addq $32, %rsp
+ movq %rdi, %rax
pop %rbx
movl %ebx, %ss
@@ -85,26 +77,13 @@ SYM_FUNC_START(efi64_thunk)
/*
* Convert 32-bit status code into 64-bit.
*/
- test %rax, %rax
- jz 1f
- movl %eax, %ecx
- andl $0x0fffffff, %ecx
- andl $0xf0000000, %eax
- shl $32, %rax
- or %rcx, %rax
-1:
- addq $8, %rsp
+ roll $1, %eax
+ rorq $1, %rax
+
pop %rbx
pop %rbp
ret
-SYM_FUNC_END(efi64_thunk)
-
-SYM_FUNC_START_LOCAL(efi_exit32)
- movq func_rt_ptr(%rip), %rax
- push %rax
- mov %rdi, %rax
- ret
-SYM_FUNC_END(efi_exit32)
+SYM_FUNC_END(__efi64_thunk)
.code32
/*
@@ -144,9 +123,7 @@ SYM_FUNC_START_LOCAL(efi_enter32)
*/
cli
- movl 56(%esp), %eax
- movl %eax, 2(%eax)
- lgdtl (%eax)
+ lgdtl (%ebx)
movl %cr4, %eax
btsl $(X86_CR4_PAE_BIT), %eax
@@ -163,9 +140,8 @@ SYM_FUNC_START_LOCAL(efi_enter32)
xorl %eax, %eax
lldt %ax
- movl 60(%esp), %eax
pushl $__KERNEL_CS
- pushl %eax
+ pushl %ebp
/* Enable paging */
movl %cr0, %eax
@@ -181,13 +157,6 @@ SYM_DATA_START(efi32_boot_gdt)
.quad 0
SYM_DATA_END(efi32_boot_gdt)
-SYM_DATA_START_LOCAL(save_gdt)
- .word 0
- .quad 0
-SYM_DATA_END(save_gdt)
-
-SYM_DATA_LOCAL(func_rt_ptr, .quad 0)
-
SYM_DATA_START(efi_gdt64)
.word efi_gdt64_end - efi_gdt64
.long 0 /* Filled out by user */
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index f2dfd6d083ef..73f17d0544dd 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -145,67 +145,16 @@ SYM_FUNC_START(startup_32)
SYM_FUNC_END(startup_32)
#ifdef CONFIG_EFI_STUB
-/*
- * We don't need the return address, so set up the stack so efi_main() can find
- * its arguments.
- */
-SYM_FUNC_START(efi_pe_entry)
- add $0x4, %esp
-
- call 1f
-1: popl %esi
- subl $1b, %esi
-
- popl %ecx
- movl %ecx, efi32_config(%esi) /* Handle */
- popl %ecx
- movl %ecx, efi32_config+8(%esi) /* EFI System table pointer */
-
- /* Relocate efi_config->call() */
- leal efi32_config(%esi), %eax
- add %esi, 40(%eax)
- pushl %eax
-
- call make_boot_params
- cmpl $0, %eax
- je fail
- movl %esi, BP_code32_start(%eax)
- popl %ecx
- pushl %eax
- pushl %ecx
- jmp 2f /* Skip efi_config initialization */
-SYM_FUNC_END(efi_pe_entry)
-
SYM_FUNC_START(efi32_stub_entry)
+SYM_FUNC_START_ALIAS(efi_stub_entry)
add $0x4, %esp
- popl %ecx
- popl %edx
-
- call 1f
-1: popl %esi
- subl $1b, %esi
-
- movl %ecx, efi32_config(%esi) /* Handle */
- movl %edx, efi32_config+8(%esi) /* EFI System table pointer */
-
- /* Relocate efi_config->call() */
- leal efi32_config(%esi), %eax
- add %esi, 40(%eax)
- pushl %eax
-2:
call efi_main
- cmpl $0, %eax
movl %eax, %esi
- jne 2f
-fail:
- /* EFI init failed, so hang. */
- hlt
- jmp fail
-2:
movl BP_code32_start(%esi), %eax
leal startup_32(%eax), %eax
jmp *%eax
SYM_FUNC_END(efi32_stub_entry)
+SYM_FUNC_END_ALIAS(efi_stub_entry)
#endif
.text
@@ -240,11 +189,9 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)
/* push arguments for extract_kernel: */
pushl $z_output_len /* decompressed length, end of relocs */
- movl BP_init_size(%esi), %eax
- subl $_end, %eax
- movl %ebx, %ebp
- subl %eax, %ebp
- pushl %ebp /* output address */
+ leal _end(%ebx), %eax
+ subl BP_init_size(%esi), %eax
+ pushl %eax /* output address */
pushl $z_input_len /* input_len */
leal input_data(%ebx), %eax
@@ -262,15 +209,6 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)
jmp *%eax
SYM_FUNC_END(.Lrelocated)
-#ifdef CONFIG_EFI_STUB
- .data
-efi32_config:
- .fill 5,8,0
- .long efi_call_phys
- .long 0
- .byte 0
-#endif
-
/*
* Stack and heap for uncompression
*/
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index ee60b81944a7..1f1f6c8139b3 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -208,10 +208,12 @@ SYM_FUNC_START(startup_32)
pushl $__KERNEL_CS
leal startup_64(%ebp), %eax
#ifdef CONFIG_EFI_MIXED
- movl efi32_config(%ebp), %ebx
- cmp $0, %ebx
+ movl efi32_boot_args(%ebp), %edi
+ cmp $0, %edi
jz 1f
- leal handover_entry(%ebp), %eax
+ leal efi64_stub_entry(%ebp), %eax
+ movl %esi, %edx
+ movl efi32_boot_args+4(%ebp), %esi
1:
#endif
pushl %eax
@@ -232,17 +234,14 @@ SYM_FUNC_START(efi32_stub_entry)
popl %edx
popl %esi
- leal (BP_scratch+4)(%esi), %esp
call 1f
1: pop %ebp
subl $1b, %ebp
- movl %ecx, efi32_config(%ebp)
- movl %edx, efi32_config+8(%ebp)
+ movl %ecx, efi32_boot_args(%ebp)
+ movl %edx, efi32_boot_args+4(%ebp)
sgdtl efi32_boot_gdt(%ebp)
-
- leal efi32_config(%ebp), %eax
- movl %eax, efi_config(%ebp)
+ movb $0, efi_is64(%ebp)
/* Disable paging */
movl %cr0, %eax
@@ -450,70 +449,17 @@ trampoline_return:
SYM_CODE_END(startup_64)
#ifdef CONFIG_EFI_STUB
-
-/* The entry point for the PE/COFF executable is efi_pe_entry. */
-SYM_FUNC_START(efi_pe_entry)
- movq %rcx, efi64_config(%rip) /* Handle */
- movq %rdx, efi64_config+8(%rip) /* EFI System table pointer */
-
- leaq efi64_config(%rip), %rax
- movq %rax, efi_config(%rip)
-
- call 1f
-1: popq %rbp
- subq $1b, %rbp
-
- /*
- * Relocate efi_config->call().
- */
- addq %rbp, efi64_config+40(%rip)
-
- movq %rax, %rdi
- call make_boot_params
- cmpq $0,%rax
- je fail
- mov %rax, %rsi
- leaq startup_32(%rip), %rax
- movl %eax, BP_code32_start(%rsi)
- jmp 2f /* Skip the relocation */
-
-handover_entry:
- call 1f
-1: popq %rbp
- subq $1b, %rbp
-
- /*
- * Relocate efi_config->call().
- */
- movq efi_config(%rip), %rax
- addq %rbp, 40(%rax)
-2:
- movq efi_config(%rip), %rdi
+ .org 0x390
+SYM_FUNC_START(efi64_stub_entry)
+SYM_FUNC_START_ALIAS(efi_stub_entry)
+ and $~0xf, %rsp /* realign the stack */
call efi_main
movq %rax,%rsi
- cmpq $0,%rax
- jne 2f
-fail:
- /* EFI init failed, so hang. */
- hlt
- jmp fail
-2:
movl BP_code32_start(%esi), %eax
leaq startup_64(%rax), %rax
jmp *%rax
-SYM_FUNC_END(efi_pe_entry)
-
- .org 0x390
-SYM_FUNC_START(efi64_stub_entry)
- movq %rdi, efi64_config(%rip) /* Handle */
- movq %rsi, efi64_config+8(%rip) /* EFI System table pointer */
-
- leaq efi64_config(%rip), %rax
- movq %rax, efi_config(%rip)
-
- movq %rdx, %rsi
- jmp handover_entry
SYM_FUNC_END(efi64_stub_entry)
+SYM_FUNC_END_ALIAS(efi_stub_entry)
#endif
.text
@@ -682,24 +628,11 @@ SYM_DATA_START_LOCAL(gdt)
.quad 0x0000000000000000 /* TS continued */
SYM_DATA_END_LABEL(gdt, SYM_L_LOCAL, gdt_end)
-#ifdef CONFIG_EFI_STUB
-SYM_DATA_LOCAL(efi_config, .quad 0)
-
#ifdef CONFIG_EFI_MIXED
-SYM_DATA_START(efi32_config)
- .fill 5,8,0
- .quad efi64_thunk
- .byte 0
-SYM_DATA_END(efi32_config)
+SYM_DATA_LOCAL(efi32_boot_args, .long 0, 0)
+SYM_DATA(efi_is64, .byte 1)
#endif
-SYM_DATA_START(efi64_config)
- .fill 5,8,0
- .quad efi_call
- .byte 1
-SYM_DATA_END(efi64_config)
-#endif /* CONFIG_EFI_STUB */
-
/*
* Stack and heap for uncompression
*/
diff --git a/arch/x86/boot/compressed/kaslr_64.c b/arch/x86/boot/compressed/kaslr_64.c
index 748456c365f4..9557c5a15b91 100644
--- a/arch/x86/boot/compressed/kaslr_64.c
+++ b/arch/x86/boot/compressed/kaslr_64.c
@@ -29,9 +29,6 @@
#define __PAGE_OFFSET __PAGE_OFFSET_BASE
#include "../../mm/ident_map.c"
-/* Used by pgtable.h asm code to force instruction serialization. */
-unsigned long __force_order;
-
/* Used to track our page table allocation area. */
struct alloc_pgt_data {
unsigned char *pgt_buf;
diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c
index 9caa10e82217..da0ccc5de538 100644
--- a/arch/x86/boot/mkcpustr.c
+++ b/arch/x86/boot/mkcpustr.c
@@ -15,6 +15,7 @@
#include "../include/asm/required-features.h"
#include "../include/asm/disabled-features.h"
#include "../include/asm/cpufeatures.h"
+#include "../include/asm/vmxfeatures.h"
#include "../kernel/cpu/capflags.c"
int main(void)
diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld
index 0149e41d42c2..3da1c37c6dd5 100644
--- a/arch/x86/boot/setup.ld
+++ b/arch/x86/boot/setup.ld
@@ -51,7 +51,10 @@ SECTIONS
. = ALIGN(16);
_end = .;
- /DISCARD/ : { *(.note*) }
+ /DISCARD/ : {
+ *(.eh_frame)
+ *(.note*)
+ }
/*
* The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
diff --git a/arch/x86/crypto/.gitignore b/arch/x86/crypto/.gitignore
new file mode 100644
index 000000000000..30be0400a439
--- /dev/null
+++ b/arch/x86/crypto/.gitignore
@@ -0,0 +1 @@
+poly1305-x86_64-cryptogams.S
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 958440eae27e..8c2e9eadee8a 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -11,6 +11,7 @@ avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
avx512_supported :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,yes,no)
sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no)
sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no)
+adx_supported := $(call as-instr,adox %r10$(comma)%r10,yes,no)
obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
@@ -39,7 +40,11 @@ obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2) += aegis128-aesni.o
obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o
obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o
-obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o
+
+# These modules require the assembler to support ADX.
+ifeq ($(adx_supported),yes)
+ obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o
+endif
# These modules require assembler to support AVX.
ifeq ($(avx_supported),yes)
@@ -73,6 +78,10 @@ aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
blake2s-x86_64-y := blake2s-core.o blake2s-glue.o
+poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o
+ifneq ($(CONFIG_CRYPTO_POLY1305_X86_64),)
+targets += poly1305-x86_64-cryptogams.S
+endif
ifeq ($(avx_supported),yes)
camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
@@ -101,10 +110,8 @@ aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
-poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o
ifeq ($(avx2_supported),yes)
sha1-ssse3-y += sha1_avx2_x86_64_asm.o
-poly1305-x86_64-y += poly1305-avx2-x86_64.o
endif
ifeq ($(sha1_ni_supported),yes)
sha1-ssse3-y += sha1_ni_asm.o
@@ -118,3 +125,8 @@ sha256-ssse3-y += sha256_ni_asm.o
endif
sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
+
+quiet_cmd_perlasm = PERLASM $@
+ cmd_perlasm = $(PERL) $< > $@
+$(obj)/%.S: $(src)/%.pl FORCE
+ $(call if_changed,perlasm)
diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c
index 46d227122643..4623189000d8 100644
--- a/arch/x86/crypto/aegis128-aesni-glue.c
+++ b/arch/x86/crypto/aegis128-aesni-glue.c
@@ -144,10 +144,8 @@ static int crypto_aegis128_aesni_setkey(struct crypto_aead *aead, const u8 *key,
{
struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(aead);
- if (keylen != AEGIS128_KEY_SIZE) {
- crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ if (keylen != AEGIS128_KEY_SIZE)
return -EINVAL;
- }
memcpy(ctx->key.bytes, key, AEGIS128_KEY_SIZE);
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index d28503f99f58..cad6e1bfa7d5 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -1942,7 +1942,7 @@ SYM_FUNC_START(aesni_set_key)
SYM_FUNC_END(aesni_set_key)
/*
- * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
+ * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
*/
SYM_FUNC_START(aesni_enc)
FRAME_BEGIN
@@ -2131,7 +2131,7 @@ SYM_FUNC_START_LOCAL(_aesni_enc4)
SYM_FUNC_END(_aesni_enc4)
/*
- * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
+ * void aesni_dec (const void *ctx, u8 *dst, const u8 *src)
*/
SYM_FUNC_START(aesni_dec)
FRAME_BEGIN
@@ -2716,8 +2716,8 @@ SYM_FUNC_END(aesni_ctr_enc)
pxor CTR, IV;
/*
- * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
- * bool enc, u8 *iv)
+ * void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *dst,
+ * const u8 *src, bool enc, le128 *iv)
*/
SYM_FUNC_START(aesni_xts_crypt8)
FRAME_BEGIN
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 3e707e81afdb..bbbebbd35b5d 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -83,10 +83,8 @@ struct gcm_context_data {
asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
unsigned int key_len);
-asmlinkage void aesni_enc(struct crypto_aes_ctx *ctx, u8 *out,
- const u8 *in);
-asmlinkage void aesni_dec(struct crypto_aes_ctx *ctx, u8 *out,
- const u8 *in);
+asmlinkage void aesni_enc(const void *ctx, u8 *out, const u8 *in);
+asmlinkage void aesni_dec(const void *ctx, u8 *out, const u8 *in);
asmlinkage void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len);
asmlinkage void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *out,
@@ -106,8 +104,8 @@ static void (*aesni_ctr_enc_tfm)(struct crypto_aes_ctx *ctx, u8 *out,
asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
-asmlinkage void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *out,
- const u8 *in, bool enc, u8 *iv);
+asmlinkage void aesni_xts_crypt8(const struct crypto_aes_ctx *ctx, u8 *out,
+ const u8 *in, bool enc, le128 *iv);
/* asmlinkage void aesni_gcm_enc()
* void *ctx, AES Key schedule. Starts on a 16 byte boundary.
@@ -318,14 +316,11 @@ static int aes_set_key_common(struct crypto_tfm *tfm, void *raw_ctx,
const u8 *in_key, unsigned int key_len)
{
struct crypto_aes_ctx *ctx = aes_ctx(raw_ctx);
- u32 *flags = &tfm->crt_flags;
int err;
if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 &&
- key_len != AES_KEYSIZE_256) {
- *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+ key_len != AES_KEYSIZE_256)
return -EINVAL;
- }
if (!crypto_simd_usable())
err = aes_expandkey(ctx, in_key, key_len);
@@ -550,29 +545,24 @@ static int xts_aesni_setkey(struct crypto_skcipher *tfm, const u8 *key,
}
-static void aesni_xts_tweak(void *ctx, u8 *out, const u8 *in)
-{
- aesni_enc(ctx, out, in);
-}
-
-static void aesni_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+static void aesni_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
{
- glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_enc));
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv, aesni_enc);
}
-static void aesni_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+static void aesni_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
{
- glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_dec));
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv, aesni_dec);
}
-static void aesni_xts_enc8(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+static void aesni_xts_enc8(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
{
- aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, true, (u8 *)iv);
+ aesni_xts_crypt8(ctx, dst, src, true, iv);
}
-static void aesni_xts_dec8(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+static void aesni_xts_dec8(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
{
- aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, false, (u8 *)iv);
+ aesni_xts_crypt8(ctx, dst, src, false, iv);
}
static const struct common_glue_ctx aesni_enc_xts = {
@@ -581,10 +571,10 @@ static const struct common_glue_ctx aesni_enc_xts = {
.funcs = { {
.num_blocks = 8,
- .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc8) }
+ .fn_u = { .xts = aesni_xts_enc8 }
}, {
.num_blocks = 1,
- .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc) }
+ .fn_u = { .xts = aesni_xts_enc }
} }
};
@@ -594,10 +584,10 @@ static const struct common_glue_ctx aesni_dec_xts = {
.funcs = { {
.num_blocks = 8,
- .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec8) }
+ .fn_u = { .xts = aesni_xts_dec8 }
}, {
.num_blocks = 1,
- .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec) }
+ .fn_u = { .xts = aesni_xts_dec }
} }
};
@@ -606,8 +596,7 @@ static int xts_encrypt(struct skcipher_request *req)
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
- return glue_xts_req_128bit(&aesni_enc_xts, req,
- XTS_TWEAK_CAST(aesni_xts_tweak),
+ return glue_xts_req_128bit(&aesni_enc_xts, req, aesni_enc,
aes_ctx(ctx->raw_tweak_ctx),
aes_ctx(ctx->raw_crypt_ctx),
false);
@@ -618,8 +607,7 @@ static int xts_decrypt(struct skcipher_request *req)
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct aesni_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
- return glue_xts_req_128bit(&aesni_dec_xts, req,
- XTS_TWEAK_CAST(aesni_xts_tweak),
+ return glue_xts_req_128bit(&aesni_dec_xts, req, aesni_enc,
aes_ctx(ctx->raw_tweak_ctx),
aes_ctx(ctx->raw_crypt_ctx),
true);
@@ -650,10 +638,9 @@ static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key,
{
struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(aead);
- if (key_len < 4) {
- crypto_aead_set_flags(aead, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ if (key_len < 4)
return -EINVAL;
- }
+
/*Account for 4 byte nonce at the end.*/
key_len -= 4;
diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c
index 1d9ff8a45e1f..06ef2d4a4701 100644
--- a/arch/x86/crypto/blake2s-glue.c
+++ b/arch/x86/crypto/blake2s-glue.c
@@ -64,10 +64,8 @@ static int crypto_blake2s_setkey(struct crypto_shash *tfm, const u8 *key,
{
struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm);
- if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE) {
- crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE)
return -EINVAL;
- }
memcpy(tctx->key, key, keylen);
tctx->keylen = keylen;
diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c
index a4f00128ea55..ccda647422d6 100644
--- a/arch/x86/crypto/camellia_aesni_avx2_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
@@ -19,20 +19,17 @@
#define CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS 32
/* 32-way AVX2/AES-NI parallel cipher functions */
-asmlinkage void camellia_ecb_enc_32way(struct camellia_ctx *ctx, u8 *dst,
- const u8 *src);
-asmlinkage void camellia_ecb_dec_32way(struct camellia_ctx *ctx, u8 *dst,
- const u8 *src);
+asmlinkage void camellia_ecb_enc_32way(const void *ctx, u8 *dst, const u8 *src);
+asmlinkage void camellia_ecb_dec_32way(const void *ctx, u8 *dst, const u8 *src);
-asmlinkage void camellia_cbc_dec_32way(struct camellia_ctx *ctx, u8 *dst,
- const u8 *src);
-asmlinkage void camellia_ctr_32way(struct camellia_ctx *ctx, u8 *dst,
- const u8 *src, le128 *iv);
+asmlinkage void camellia_cbc_dec_32way(const void *ctx, u8 *dst, const u8 *src);
+asmlinkage void camellia_ctr_32way(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
-asmlinkage void camellia_xts_enc_32way(struct camellia_ctx *ctx, u8 *dst,
- const u8 *src, le128 *iv);
-asmlinkage void camellia_xts_dec_32way(struct camellia_ctx *ctx, u8 *dst,
- const u8 *src, le128 *iv);
+asmlinkage void camellia_xts_enc_32way(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
+asmlinkage void camellia_xts_dec_32way(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
static const struct common_glue_ctx camellia_enc = {
.num_funcs = 4,
@@ -40,16 +37,16 @@ static const struct common_glue_ctx camellia_enc = {
.funcs = { {
.num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
- .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_32way) }
+ .fn_u = { .ecb = camellia_ecb_enc_32way }
}, {
.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_16way) }
+ .fn_u = { .ecb = camellia_ecb_enc_16way }
}, {
.num_blocks = 2,
- .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) }
+ .fn_u = { .ecb = camellia_enc_blk_2way }
}, {
.num_blocks = 1,
- .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) }
+ .fn_u = { .ecb = camellia_enc_blk }
} }
};
@@ -59,16 +56,16 @@ static const struct common_glue_ctx camellia_ctr = {
.funcs = { {
.num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
- .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_32way) }
+ .fn_u = { .ctr = camellia_ctr_32way }
}, {
.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_16way) }
+ .fn_u = { .ctr = camellia_ctr_16way }
}, {
.num_blocks = 2,
- .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) }
+ .fn_u = { .ctr = camellia_crypt_ctr_2way }
}, {
.num_blocks = 1,
- .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) }
+ .fn_u = { .ctr = camellia_crypt_ctr }
} }
};
@@ -78,13 +75,13 @@ static const struct common_glue_ctx camellia_enc_xts = {
.funcs = { {
.num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
- .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_32way) }
+ .fn_u = { .xts = camellia_xts_enc_32way }
}, {
.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_16way) }
+ .fn_u = { .xts = camellia_xts_enc_16way }
}, {
.num_blocks = 1,
- .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc) }
+ .fn_u = { .xts = camellia_xts_enc }
} }
};
@@ -94,16 +91,16 @@ static const struct common_glue_ctx camellia_dec = {
.funcs = { {
.num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
- .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_32way) }
+ .fn_u = { .ecb = camellia_ecb_dec_32way }
}, {
.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_16way) }
+ .fn_u = { .ecb = camellia_ecb_dec_16way }
}, {
.num_blocks = 2,
- .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) }
+ .fn_u = { .ecb = camellia_dec_blk_2way }
}, {
.num_blocks = 1,
- .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) }
+ .fn_u = { .ecb = camellia_dec_blk }
} }
};
@@ -113,16 +110,16 @@ static const struct common_glue_ctx camellia_dec_cbc = {
.funcs = { {
.num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
- .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_32way) }
+ .fn_u = { .cbc = camellia_cbc_dec_32way }
}, {
.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_16way) }
+ .fn_u = { .cbc = camellia_cbc_dec_16way }
}, {
.num_blocks = 2,
- .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) }
+ .fn_u = { .cbc = camellia_decrypt_cbc_2way }
}, {
.num_blocks = 1,
- .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) }
+ .fn_u = { .cbc = camellia_dec_blk }
} }
};
@@ -132,21 +129,20 @@ static const struct common_glue_ctx camellia_dec_xts = {
.funcs = { {
.num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS,
- .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_32way) }
+ .fn_u = { .xts = camellia_xts_dec_32way }
}, {
.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_16way) }
+ .fn_u = { .xts = camellia_xts_dec_16way }
}, {
.num_blocks = 1,
- .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec) }
+ .fn_u = { .xts = camellia_xts_dec }
} }
};
static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int keylen)
{
- return __camellia_setkey(crypto_skcipher_ctx(tfm), key, keylen,
- &tfm->base.crt_flags);
+ return __camellia_setkey(crypto_skcipher_ctx(tfm), key, keylen);
}
static int ecb_encrypt(struct skcipher_request *req)
@@ -161,8 +157,7 @@ static int ecb_decrypt(struct skcipher_request *req)
static int cbc_encrypt(struct skcipher_request *req)
{
- return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(camellia_enc_blk),
- req);
+ return glue_cbc_encrypt_req_128bit(camellia_enc_blk, req);
}
static int cbc_decrypt(struct skcipher_request *req)
@@ -180,8 +175,7 @@ static int xts_encrypt(struct skcipher_request *req)
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
- return glue_xts_req_128bit(&camellia_enc_xts, req,
- XTS_TWEAK_CAST(camellia_enc_blk),
+ return glue_xts_req_128bit(&camellia_enc_xts, req, camellia_enc_blk,
&ctx->tweak_ctx, &ctx->crypt_ctx, false);
}
@@ -190,8 +184,7 @@ static int xts_decrypt(struct skcipher_request *req)
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
- return glue_xts_req_128bit(&camellia_dec_xts, req,
- XTS_TWEAK_CAST(camellia_enc_blk),
+ return glue_xts_req_128bit(&camellia_dec_xts, req, camellia_enc_blk,
&ctx->tweak_ctx, &ctx->crypt_ctx, true);
}
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
index f28d282779b8..4e5de6ef206e 100644
--- a/arch/x86/crypto/camellia_aesni_avx_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -18,41 +18,36 @@
#define CAMELLIA_AESNI_PARALLEL_BLOCKS 16
/* 16-way parallel cipher functions (avx/aes-ni) */
-asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst,
- const u8 *src);
+asmlinkage void camellia_ecb_enc_16way(const void *ctx, u8 *dst, const u8 *src);
EXPORT_SYMBOL_GPL(camellia_ecb_enc_16way);
-asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst,
- const u8 *src);
+asmlinkage void camellia_ecb_dec_16way(const void *ctx, u8 *dst, const u8 *src);
EXPORT_SYMBOL_GPL(camellia_ecb_dec_16way);
-asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst,
- const u8 *src);
+asmlinkage void camellia_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src);
EXPORT_SYMBOL_GPL(camellia_cbc_dec_16way);
-asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst,
- const u8 *src, le128 *iv);
+asmlinkage void camellia_ctr_16way(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
EXPORT_SYMBOL_GPL(camellia_ctr_16way);
-asmlinkage void camellia_xts_enc_16way(struct camellia_ctx *ctx, u8 *dst,
- const u8 *src, le128 *iv);
+asmlinkage void camellia_xts_enc_16way(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
EXPORT_SYMBOL_GPL(camellia_xts_enc_16way);
-asmlinkage void camellia_xts_dec_16way(struct camellia_ctx *ctx, u8 *dst,
- const u8 *src, le128 *iv);
+asmlinkage void camellia_xts_dec_16way(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
EXPORT_SYMBOL_GPL(camellia_xts_dec_16way);
-void camellia_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+void camellia_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
{
- glue_xts_crypt_128bit_one(ctx, dst, src, iv,
- GLUE_FUNC_CAST(camellia_enc_blk));
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv, camellia_enc_blk);
}
EXPORT_SYMBOL_GPL(camellia_xts_enc);
-void camellia_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+void camellia_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
{
- glue_xts_crypt_128bit_one(ctx, dst, src, iv,
- GLUE_FUNC_CAST(camellia_dec_blk));
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv, camellia_dec_blk);
}
EXPORT_SYMBOL_GPL(camellia_xts_dec);
@@ -62,13 +57,13 @@ static const struct common_glue_ctx camellia_enc = {
.funcs = { {
.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_16way) }
+ .fn_u = { .ecb = camellia_ecb_enc_16way }
}, {
.num_blocks = 2,
- .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) }
+ .fn_u = { .ecb = camellia_enc_blk_2way }
}, {
.num_blocks = 1,
- .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) }
+ .fn_u = { .ecb = camellia_enc_blk }
} }
};
@@ -78,13 +73,13 @@ static const struct common_glue_ctx camellia_ctr = {
.funcs = { {
.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_16way) }
+ .fn_u = { .ctr = camellia_ctr_16way }
}, {
.num_blocks = 2,
- .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) }
+ .fn_u = { .ctr = camellia_crypt_ctr_2way }
}, {
.num_blocks = 1,
- .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) }
+ .fn_u = { .ctr = camellia_crypt_ctr }
} }
};
@@ -94,10 +89,10 @@ static const struct common_glue_ctx camellia_enc_xts = {
.funcs = { {
.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_16way) }
+ .fn_u = { .xts = camellia_xts_enc_16way }
}, {
.num_blocks = 1,
- .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc) }
+ .fn_u = { .xts = camellia_xts_enc }
} }
};
@@ -107,13 +102,13 @@ static const struct common_glue_ctx camellia_dec = {
.funcs = { {
.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_16way) }
+ .fn_u = { .ecb = camellia_ecb_dec_16way }
}, {
.num_blocks = 2,
- .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) }
+ .fn_u = { .ecb = camellia_dec_blk_2way }
}, {
.num_blocks = 1,
- .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) }
+ .fn_u = { .ecb = camellia_dec_blk }
} }
};
@@ -123,13 +118,13 @@ static const struct common_glue_ctx camellia_dec_cbc = {
.funcs = { {
.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_16way) }
+ .fn_u = { .cbc = camellia_cbc_dec_16way }
}, {
.num_blocks = 2,
- .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) }
+ .fn_u = { .cbc = camellia_decrypt_cbc_2way }
}, {
.num_blocks = 1,
- .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) }
+ .fn_u = { .cbc = camellia_dec_blk }
} }
};
@@ -139,18 +134,17 @@ static const struct common_glue_ctx camellia_dec_xts = {
.funcs = { {
.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
- .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_16way) }
+ .fn_u = { .xts = camellia_xts_dec_16way }
}, {
.num_blocks = 1,
- .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec) }
+ .fn_u = { .xts = camellia_xts_dec }
} }
};
static int camellia_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int keylen)
{
- return __camellia_setkey(crypto_skcipher_ctx(tfm), key, keylen,
- &tfm->base.crt_flags);
+ return __camellia_setkey(crypto_skcipher_ctx(tfm), key, keylen);
}
static int ecb_encrypt(struct skcipher_request *req)
@@ -165,8 +159,7 @@ static int ecb_decrypt(struct skcipher_request *req)
static int cbc_encrypt(struct skcipher_request *req)
{
- return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(camellia_enc_blk),
- req);
+ return glue_cbc_encrypt_req_128bit(camellia_enc_blk, req);
}
static int cbc_decrypt(struct skcipher_request *req)
@@ -183,7 +176,6 @@ int xts_camellia_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int keylen)
{
struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
- u32 *flags = &tfm->base.crt_flags;
int err;
err = xts_verify_key(tfm, key, keylen);
@@ -191,13 +183,12 @@ int xts_camellia_setkey(struct crypto_skcipher *tfm, const u8 *key,
return err;
/* first half of xts-key is for crypt */
- err = __camellia_setkey(&ctx->crypt_ctx, key, keylen / 2, flags);
+ err = __camellia_setkey(&ctx->crypt_ctx, key, keylen / 2);
if (err)
return err;
/* second half of xts-key is for tweak */
- return __camellia_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2,
- flags);
+ return __camellia_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2);
}
EXPORT_SYMBOL_GPL(xts_camellia_setkey);
@@ -206,8 +197,7 @@ static int xts_encrypt(struct skcipher_request *req)
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
- return glue_xts_req_128bit(&camellia_enc_xts, req,
- XTS_TWEAK_CAST(camellia_enc_blk),
+ return glue_xts_req_128bit(&camellia_enc_xts, req, camellia_enc_blk,
&ctx->tweak_ctx, &ctx->crypt_ctx, false);
}
@@ -216,8 +206,7 @@ static int xts_decrypt(struct skcipher_request *req)
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct camellia_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
- return glue_xts_req_128bit(&camellia_dec_xts, req,
- XTS_TWEAK_CAST(camellia_enc_blk),
+ return glue_xts_req_128bit(&camellia_dec_xts, req, camellia_enc_blk,
&ctx->tweak_ctx, &ctx->crypt_ctx, true);
}
diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
index 7c62db56ffe1..242c056e5fa8 100644
--- a/arch/x86/crypto/camellia_glue.c
+++ b/arch/x86/crypto/camellia_glue.c
@@ -18,19 +18,17 @@
#include <asm/crypto/glue_helper.h>
/* regular block cipher functions */
-asmlinkage void __camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
- const u8 *src, bool xor);
+asmlinkage void __camellia_enc_blk(const void *ctx, u8 *dst, const u8 *src,
+ bool xor);
EXPORT_SYMBOL_GPL(__camellia_enc_blk);
-asmlinkage void camellia_dec_blk(struct camellia_ctx *ctx, u8 *dst,
- const u8 *src);
+asmlinkage void camellia_dec_blk(const void *ctx, u8 *dst, const u8 *src);
EXPORT_SYMBOL_GPL(camellia_dec_blk);
/* 2-way parallel cipher functions */
-asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
- const u8 *src, bool xor);
+asmlinkage void __camellia_enc_blk_2way(const void *ctx, u8 *dst, const u8 *src,
+ bool xor);
EXPORT_SYMBOL_GPL(__camellia_enc_blk_2way);
-asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst,
- const u8 *src);
+asmlinkage void camellia_dec_blk_2way(const void *ctx, u8 *dst, const u8 *src);
EXPORT_SYMBOL_GPL(camellia_dec_blk_2way);
static void camellia_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
@@ -1229,12 +1227,10 @@ static void camellia_setup192(const unsigned char *key, u64 *subkey)
}
int __camellia_setkey(struct camellia_ctx *cctx, const unsigned char *key,
- unsigned int key_len, u32 *flags)
+ unsigned int key_len)
{
- if (key_len != 16 && key_len != 24 && key_len != 32) {
- *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
+ if (key_len != 16 && key_len != 24 && key_len != 32)
return -EINVAL;
- }
cctx->key_length = key_len;
@@ -1257,8 +1253,7 @@ EXPORT_SYMBOL_GPL(__camellia_setkey);
static int camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
unsigned int key_len)
{
- return __camellia_setkey(crypto_tfm_ctx(tfm), key, key_len,
- &tfm->crt_flags);
+ return __camellia_setkey(crypto_tfm_ctx(tfm), key, key_len);
}
static int camellia_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key,
@@ -1267,8 +1262,10 @@ static int camellia_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key,
return camellia_setkey(&tfm->base, key, key_len);
}
-void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src)
+void camellia_decrypt_cbc_2way(const void *ctx, u8 *d, const u8 *s)
{
+ u128 *dst = (u128 *)d;
+ const u128 *src = (const u128 *)s;
u128 iv = *src;
camellia_dec_blk_2way(ctx, (u8 *)dst, (u8 *)src);
@@ -1277,9 +1274,11 @@ void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src)
}
EXPORT_SYMBOL_GPL(camellia_decrypt_cbc_2way);
-void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+void camellia_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv)
{
be128 ctrblk;
+ u128 *dst = (u128 *)d;
+ const u128 *src = (const u128 *)s;
if (dst != src)
*dst = *src;
@@ -1291,9 +1290,11 @@ void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
}
EXPORT_SYMBOL_GPL(camellia_crypt_ctr);
-void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+void camellia_crypt_ctr_2way(const void *ctx, u8 *d, const u8 *s, le128 *iv)
{
be128 ctrblks[2];
+ u128 *dst = (u128 *)d;
+ const u128 *src = (const u128 *)s;
if (dst != src) {
dst[0] = src[0];
@@ -1315,10 +1316,10 @@ static const struct common_glue_ctx camellia_enc = {
.funcs = { {
.num_blocks = 2,
- .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) }
+ .fn_u = { .ecb = camellia_enc_blk_2way }
}, {
.num_blocks = 1,
- .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) }
+ .fn_u = { .ecb = camellia_enc_blk }
} }
};
@@ -1328,10 +1329,10 @@ static const struct common_glue_ctx camellia_ctr = {
.funcs = { {
.num_blocks = 2,
- .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) }
+ .fn_u = { .ctr = camellia_crypt_ctr_2way }
}, {
.num_blocks = 1,
- .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) }
+ .fn_u = { .ctr = camellia_crypt_ctr }
} }
};
@@ -1341,10 +1342,10 @@ static const struct common_glue_ctx camellia_dec = {
.funcs = { {
.num_blocks = 2,
- .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) }
+ .fn_u = { .ecb = camellia_dec_blk_2way }
}, {
.num_blocks = 1,
- .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) }
+ .fn_u = { .ecb = camellia_dec_blk }
} }
};
@@ -1354,10 +1355,10 @@ static const struct common_glue_ctx camellia_dec_cbc = {
.funcs = { {
.num_blocks = 2,
- .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) }
+ .fn_u = { .cbc = camellia_decrypt_cbc_2way }
}, {
.num_blocks = 1,
- .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) }
+ .fn_u = { .cbc = camellia_dec_blk }
} }
};
@@ -1373,8 +1374,7 @@ static int ecb_decrypt(struct skcipher_request *req)
static int cbc_encrypt(struct skcipher_request *req)
{
- return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(camellia_enc_blk),
- req);
+ return glue_cbc_encrypt_req_128bit(camellia_enc_blk, req);
}
static int cbc_decrypt(struct skcipher_request *req)
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
index a8a38fffb4a9..48e0f37796fa 100644
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -20,20 +20,17 @@
#define CAST6_PARALLEL_BLOCKS 8
-asmlinkage void cast6_ecb_enc_8way(struct cast6_ctx *ctx, u8 *dst,
- const u8 *src);
-asmlinkage void cast6_ecb_dec_8way(struct cast6_ctx *ctx, u8 *dst,
- const u8 *src);
-
-asmlinkage void cast6_cbc_dec_8way(struct cast6_ctx *ctx, u8 *dst,
- const u8 *src);
-asmlinkage void cast6_ctr_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src,
+asmlinkage void cast6_ecb_enc_8way(const void *ctx, u8 *dst, const u8 *src);
+asmlinkage void cast6_ecb_dec_8way(const void *ctx, u8 *dst, const u8 *src);
+
+asmlinkage void cast6_cbc_dec_8way(const void *ctx, u8 *dst, const u8 *src);
+asmlinkage void cast6_ctr_8way(const void *ctx, u8 *dst, const u8 *src,
le128 *iv);
-asmlinkage void cast6_xts_enc_8way(struct cast6_ctx *ctx, u8 *dst,
- const u8 *src, le128 *iv);
-asmlinkage void cast6_xts_dec_8way(struct cast6_ctx *ctx, u8 *dst,
- const u8 *src, le128 *iv);
+asmlinkage void cast6_xts_enc_8way(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
+asmlinkage void cast6_xts_dec_8way(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
static int cast6_setkey_skcipher(struct crypto_skcipher *tfm,
const u8 *key, unsigned int keylen)
@@ -41,21 +38,21 @@ static int cast6_setkey_skcipher(struct crypto_skcipher *tfm,
return cast6_setkey(&tfm->base, key, keylen);
}
-static void cast6_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+static void cast6_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
{
- glue_xts_crypt_128bit_one(ctx, dst, src, iv,
- GLUE_FUNC_CAST(__cast6_encrypt));
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv, __cast6_encrypt);
}
-static void cast6_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+static void cast6_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
{
- glue_xts_crypt_128bit_one(ctx, dst, src, iv,
- GLUE_FUNC_CAST(__cast6_decrypt));
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv, __cast6_decrypt);
}
-static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+static void cast6_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv)
{
be128 ctrblk;
+ u128 *dst = (u128 *)d;
+ const u128 *src = (const u128 *)s;
le128_to_be128(&ctrblk, iv);
le128_inc(iv);
@@ -70,10 +67,10 @@ static const struct common_glue_ctx cast6_enc = {
.funcs = { {
.num_blocks = CAST6_PARALLEL_BLOCKS,
- .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_enc_8way) }
+ .fn_u = { .ecb = cast6_ecb_enc_8way }
}, {
.num_blocks = 1,
- .fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_encrypt) }
+ .fn_u = { .ecb = __cast6_encrypt }
} }
};
@@ -83,10 +80,10 @@ static const struct common_glue_ctx cast6_ctr = {
.funcs = { {
.num_blocks = CAST6_PARALLEL_BLOCKS,
- .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_ctr_8way) }
+ .fn_u = { .ctr = cast6_ctr_8way }
}, {
.num_blocks = 1,
- .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr) }
+ .fn_u = { .ctr = cast6_crypt_ctr }
} }
};
@@ -96,10 +93,10 @@ static const struct common_glue_ctx cast6_enc_xts = {
.funcs = { {
.num_blocks = CAST6_PARALLEL_BLOCKS,
- .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_enc_8way) }
+ .fn_u = { .xts = cast6_xts_enc_8way }
}, {
.num_blocks = 1,
- .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_enc) }
+ .fn_u = { .xts = cast6_xts_enc }
} }
};
@@ -109,10 +106,10 @@ static const struct common_glue_ctx cast6_dec = {
.funcs = { {
.num_blocks = CAST6_PARALLEL_BLOCKS,
- .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_dec_8way) }
+ .fn_u = { .ecb = cast6_ecb_dec_8way }
}, {
.num_blocks = 1,
- .fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_decrypt) }
+ .fn_u = { .ecb = __cast6_decrypt }
} }
};
@@ -122,10 +119,10 @@ static const struct common_glue_ctx cast6_dec_cbc = {
.funcs = { {
.num_blocks = CAST6_PARALLEL_BLOCKS,
- .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(cast6_cbc_dec_8way) }
+ .fn_u = { .cbc = cast6_cbc_dec_8way }
}, {
.num_blocks = 1,
- .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__cast6_decrypt) }
+ .fn_u = { .cbc = __cast6_decrypt }
} }
};
@@ -135,10 +132,10 @@ static const struct common_glue_ctx cast6_dec_xts = {
.funcs = { {
.num_blocks = CAST6_PARALLEL_BLOCKS,
- .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_dec_8way) }
+ .fn_u = { .xts = cast6_xts_dec_8way }
}, {
.num_blocks = 1,
- .fn_u = { .xts = GLUE_XTS_FUNC_CAST(cast6_xts_dec) }
+ .fn_u = { .xts = cast6_xts_dec }
} }
};
@@ -154,8 +151,7 @@ static int ecb_decrypt(struct skcipher_request *req)
static int cbc_encrypt(struct skcipher_request *req)
{
- return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(__cast6_encrypt),
- req);
+ return glue_cbc_encrypt_req_128bit(__cast6_encrypt, req);
}
static int cbc_decrypt(struct skcipher_request *req)
@@ -177,7 +173,6 @@ static int xts_cast6_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int keylen)
{
struct cast6_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
- u32 *flags = &tfm->base.crt_flags;
int err;
err = xts_verify_key(tfm, key, keylen);
@@ -185,13 +180,12 @@ static int xts_cast6_setkey(struct crypto_skcipher *tfm, const u8 *key,
return err;
/* first half of xts-key is for crypt */
- err = __cast6_setkey(&ctx->crypt_ctx, key, keylen / 2, flags);
+ err = __cast6_setkey(&ctx->crypt_ctx, key, keylen / 2);
if (err)
return err;
/* second half of xts-key is for tweak */
- return __cast6_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2,
- flags);
+ return __cast6_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2);
}
static int xts_encrypt(struct skcipher_request *req)
@@ -199,8 +193,7 @@ static int xts_encrypt(struct skcipher_request *req)
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct cast6_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
- return glue_xts_req_128bit(&cast6_enc_xts, req,
- XTS_TWEAK_CAST(__cast6_encrypt),
+ return glue_xts_req_128bit(&cast6_enc_xts, req, __cast6_encrypt,
&ctx->tweak_ctx, &ctx->crypt_ctx, false);
}
@@ -209,8 +202,7 @@ static int xts_decrypt(struct skcipher_request *req)
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct cast6_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
- return glue_xts_req_128bit(&cast6_dec_xts, req,
- XTS_TWEAK_CAST(__cast6_encrypt),
+ return glue_xts_req_128bit(&cast6_dec_xts, req, __cast6_encrypt,
&ctx->tweak_ctx, &ctx->crypt_ctx, true);
}
diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c
index cb4ab6645106..418bd88acac8 100644
--- a/arch/x86/crypto/crc32-pclmul_glue.c
+++ b/arch/x86/crypto/crc32-pclmul_glue.c
@@ -94,10 +94,8 @@ static int crc32_pclmul_setkey(struct crypto_shash *hash, const u8 *key,
{
u32 *mctx = crypto_shash_ctx(hash);
- if (keylen != sizeof(u32)) {
- crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ if (keylen != sizeof(u32))
return -EINVAL;
- }
*mctx = le32_to_cpup((__le32 *)key);
return 0;
}
diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c
index eefa0862f309..c20d1b8a82c3 100644
--- a/arch/x86/crypto/crc32c-intel_glue.c
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -91,10 +91,8 @@ static int crc32c_intel_setkey(struct crypto_shash *hash, const u8 *key,
{
u32 *mctx = crypto_shash_ctx(hash);
- if (keylen != sizeof(u32)) {
- crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ if (keylen != sizeof(u32))
return -EINVAL;
- }
*mctx = le32_to_cpup((__le32 *)key);
return 0;
}
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index 04d72a5a8ce9..a4b728518e28 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -57,10 +57,8 @@ static int ghash_setkey(struct crypto_shash *tfm,
be128 *x = (be128 *)key;
u64 a, b;
- if (keylen != GHASH_BLOCK_SIZE) {
- crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ if (keylen != GHASH_BLOCK_SIZE)
return -EINVAL;
- }
/* perform multiplication by 'x' in GF(2^128) */
a = be64_to_cpu(x->a);
@@ -257,16 +255,11 @@ static int ghash_async_setkey(struct crypto_ahash *tfm, const u8 *key,
{
struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
struct crypto_ahash *child = &ctx->cryptd_tfm->base;
- int err;
crypto_ahash_clear_flags(child, CRYPTO_TFM_REQ_MASK);
crypto_ahash_set_flags(child, crypto_ahash_get_flags(tfm)
& CRYPTO_TFM_REQ_MASK);
- err = crypto_ahash_setkey(child, key, keylen);
- crypto_ahash_set_flags(tfm, crypto_ahash_get_flags(child)
- & CRYPTO_TFM_RES_MASK);
-
- return err;
+ return crypto_ahash_setkey(child, key, keylen);
}
static int ghash_async_init_tfm(struct crypto_tfm *tfm)
diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
index d15b99397480..d3d91a0abf88 100644
--- a/arch/x86/crypto/glue_helper.c
+++ b/arch/x86/crypto/glue_helper.c
@@ -134,7 +134,8 @@ int glue_cbc_decrypt_req_128bit(const struct common_glue_ctx *gctx,
src -= num_blocks - 1;
dst -= num_blocks - 1;
- gctx->funcs[i].fn_u.cbc(ctx, dst, src);
+ gctx->funcs[i].fn_u.cbc(ctx, (u8 *)dst,
+ (const u8 *)src);
nbytes -= func_bytes;
if (nbytes < bsize)
@@ -188,7 +189,9 @@ int glue_ctr_req_128bit(const struct common_glue_ctx *gctx,
/* Process multi-block batch */
do {
- gctx->funcs[i].fn_u.ctr(ctx, dst, src, &ctrblk);
+ gctx->funcs[i].fn_u.ctr(ctx, (u8 *)dst,
+ (const u8 *)src,
+ &ctrblk);
src += num_blocks;
dst += num_blocks;
nbytes -= func_bytes;
@@ -210,7 +213,8 @@ int glue_ctr_req_128bit(const struct common_glue_ctx *gctx,
be128_to_le128(&ctrblk, (be128 *)walk.iv);
memcpy(&tmp, walk.src.virt.addr, nbytes);
- gctx->funcs[gctx->num_funcs - 1].fn_u.ctr(ctx, &tmp, &tmp,
+ gctx->funcs[gctx->num_funcs - 1].fn_u.ctr(ctx, (u8 *)&tmp,
+ (const u8 *)&tmp,
&ctrblk);
memcpy(walk.dst.virt.addr, &tmp, nbytes);
le128_to_be128((be128 *)walk.iv, &ctrblk);
@@ -240,7 +244,8 @@ static unsigned int __glue_xts_req_128bit(const struct common_glue_ctx *gctx,
if (nbytes >= func_bytes) {
do {
- gctx->funcs[i].fn_u.xts(ctx, dst, src,
+ gctx->funcs[i].fn_u.xts(ctx, (u8 *)dst,
+ (const u8 *)src,
walk->iv);
src += num_blocks;
@@ -354,8 +359,8 @@ out:
}
EXPORT_SYMBOL_GPL(glue_xts_req_128bit);
-void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src, le128 *iv,
- common_glue_func_t fn)
+void glue_xts_crypt_128bit_one(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv, common_glue_func_t fn)
{
le128 ivblk = *iv;
@@ -363,13 +368,13 @@ void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src, le128 *iv,
gf128mul_x_ble(iv, &ivblk);
/* CC <- T xor C */
- u128_xor(dst, src, (u128 *)&ivblk);
+ u128_xor((u128 *)dst, (const u128 *)src, (u128 *)&ivblk);
/* PP <- D(Key2,CC) */
- fn(ctx, (u8 *)dst, (u8 *)dst);
+ fn(ctx, dst, dst);
/* P <- T xor PP */
- u128_xor(dst, dst, (u128 *)&ivblk);
+ u128_xor((u128 *)dst, (u128 *)dst, (u128 *)&ivblk);
}
EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit_one);
diff --git a/arch/x86/crypto/poly1305-avx2-x86_64.S b/arch/x86/crypto/poly1305-avx2-x86_64.S
deleted file mode 100644
index d6063feda9da..000000000000
--- a/arch/x86/crypto/poly1305-avx2-x86_64.S
+++ /dev/null
@@ -1,390 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Poly1305 authenticator algorithm, RFC7539, x64 AVX2 functions
- *
- * Copyright (C) 2015 Martin Willi
- */
-
-#include <linux/linkage.h>
-
-.section .rodata.cst32.ANMASK, "aM", @progbits, 32
-.align 32
-ANMASK: .octa 0x0000000003ffffff0000000003ffffff
- .octa 0x0000000003ffffff0000000003ffffff
-
-.section .rodata.cst32.ORMASK, "aM", @progbits, 32
-.align 32
-ORMASK: .octa 0x00000000010000000000000001000000
- .octa 0x00000000010000000000000001000000
-
-.text
-
-#define h0 0x00(%rdi)
-#define h1 0x04(%rdi)
-#define h2 0x08(%rdi)
-#define h3 0x0c(%rdi)
-#define h4 0x10(%rdi)
-#define r0 0x00(%rdx)
-#define r1 0x04(%rdx)
-#define r2 0x08(%rdx)
-#define r3 0x0c(%rdx)
-#define r4 0x10(%rdx)
-#define u0 0x00(%r8)
-#define u1 0x04(%r8)
-#define u2 0x08(%r8)
-#define u3 0x0c(%r8)
-#define u4 0x10(%r8)
-#define w0 0x14(%r8)
-#define w1 0x18(%r8)
-#define w2 0x1c(%r8)
-#define w3 0x20(%r8)
-#define w4 0x24(%r8)
-#define y0 0x28(%r8)
-#define y1 0x2c(%r8)
-#define y2 0x30(%r8)
-#define y3 0x34(%r8)
-#define y4 0x38(%r8)
-#define m %rsi
-#define hc0 %ymm0
-#define hc1 %ymm1
-#define hc2 %ymm2
-#define hc3 %ymm3
-#define hc4 %ymm4
-#define hc0x %xmm0
-#define hc1x %xmm1
-#define hc2x %xmm2
-#define hc3x %xmm3
-#define hc4x %xmm4
-#define t1 %ymm5
-#define t2 %ymm6
-#define t1x %xmm5
-#define t2x %xmm6
-#define ruwy0 %ymm7
-#define ruwy1 %ymm8
-#define ruwy2 %ymm9
-#define ruwy3 %ymm10
-#define ruwy4 %ymm11
-#define ruwy0x %xmm7
-#define ruwy1x %xmm8
-#define ruwy2x %xmm9
-#define ruwy3x %xmm10
-#define ruwy4x %xmm11
-#define svxz1 %ymm12
-#define svxz2 %ymm13
-#define svxz3 %ymm14
-#define svxz4 %ymm15
-#define d0 %r9
-#define d1 %r10
-#define d2 %r11
-#define d3 %r12
-#define d4 %r13
-
-SYM_FUNC_START(poly1305_4block_avx2)
- # %rdi: Accumulator h[5]
- # %rsi: 64 byte input block m
- # %rdx: Poly1305 key r[5]
- # %rcx: Quadblock count
- # %r8: Poly1305 derived key r^2 u[5], r^3 w[5], r^4 y[5],
-
- # This four-block variant uses loop unrolled block processing. It
- # requires 4 Poly1305 keys: r, r^2, r^3 and r^4:
- # h = (h + m) * r => h = (h + m1) * r^4 + m2 * r^3 + m3 * r^2 + m4 * r
-
- vzeroupper
- push %rbx
- push %r12
- push %r13
-
- # combine r0,u0,w0,y0
- vmovd y0,ruwy0x
- vmovd w0,t1x
- vpunpcklqdq t1,ruwy0,ruwy0
- vmovd u0,t1x
- vmovd r0,t2x
- vpunpcklqdq t2,t1,t1
- vperm2i128 $0x20,t1,ruwy0,ruwy0
-
- # combine r1,u1,w1,y1 and s1=r1*5,v1=u1*5,x1=w1*5,z1=y1*5
- vmovd y1,ruwy1x
- vmovd w1,t1x
- vpunpcklqdq t1,ruwy1,ruwy1
- vmovd u1,t1x
- vmovd r1,t2x
- vpunpcklqdq t2,t1,t1
- vperm2i128 $0x20,t1,ruwy1,ruwy1
- vpslld $2,ruwy1,svxz1
- vpaddd ruwy1,svxz1,svxz1
-
- # combine r2,u2,w2,y2 and s2=r2*5,v2=u2*5,x2=w2*5,z2=y2*5
- vmovd y2,ruwy2x
- vmovd w2,t1x
- vpunpcklqdq t1,ruwy2,ruwy2
- vmovd u2,t1x
- vmovd r2,t2x
- vpunpcklqdq t2,t1,t1
- vperm2i128 $0x20,t1,ruwy2,ruwy2
- vpslld $2,ruwy2,svxz2
- vpaddd ruwy2,svxz2,svxz2
-
- # combine r3,u3,w3,y3 and s3=r3*5,v3=u3*5,x3=w3*5,z3=y3*5
- vmovd y3,ruwy3x
- vmovd w3,t1x
- vpunpcklqdq t1,ruwy3,ruwy3
- vmovd u3,t1x
- vmovd r3,t2x
- vpunpcklqdq t2,t1,t1
- vperm2i128 $0x20,t1,ruwy3,ruwy3
- vpslld $2,ruwy3,svxz3
- vpaddd ruwy3,svxz3,svxz3
-
- # combine r4,u4,w4,y4 and s4=r4*5,v4=u4*5,x4=w4*5,z4=y4*5
- vmovd y4,ruwy4x
- vmovd w4,t1x
- vpunpcklqdq t1,ruwy4,ruwy4
- vmovd u4,t1x
- vmovd r4,t2x
- vpunpcklqdq t2,t1,t1
- vperm2i128 $0x20,t1,ruwy4,ruwy4
- vpslld $2,ruwy4,svxz4
- vpaddd ruwy4,svxz4,svxz4
-
-.Ldoblock4:
- # hc0 = [m[48-51] & 0x3ffffff, m[32-35] & 0x3ffffff,
- # m[16-19] & 0x3ffffff, m[ 0- 3] & 0x3ffffff + h0]
- vmovd 0x00(m),hc0x
- vmovd 0x10(m),t1x
- vpunpcklqdq t1,hc0,hc0
- vmovd 0x20(m),t1x
- vmovd 0x30(m),t2x
- vpunpcklqdq t2,t1,t1
- vperm2i128 $0x20,t1,hc0,hc0
- vpand ANMASK(%rip),hc0,hc0
- vmovd h0,t1x
- vpaddd t1,hc0,hc0
- # hc1 = [(m[51-54] >> 2) & 0x3ffffff, (m[35-38] >> 2) & 0x3ffffff,
- # (m[19-22] >> 2) & 0x3ffffff, (m[ 3- 6] >> 2) & 0x3ffffff + h1]
- vmovd 0x03(m),hc1x
- vmovd 0x13(m),t1x
- vpunpcklqdq t1,hc1,hc1
- vmovd 0x23(m),t1x
- vmovd 0x33(m),t2x
- vpunpcklqdq t2,t1,t1
- vperm2i128 $0x20,t1,hc1,hc1
- vpsrld $2,hc1,hc1
- vpand ANMASK(%rip),hc1,hc1
- vmovd h1,t1x
- vpaddd t1,hc1,hc1
- # hc2 = [(m[54-57] >> 4) & 0x3ffffff, (m[38-41] >> 4) & 0x3ffffff,
- # (m[22-25] >> 4) & 0x3ffffff, (m[ 6- 9] >> 4) & 0x3ffffff + h2]
- vmovd 0x06(m),hc2x
- vmovd 0x16(m),t1x
- vpunpcklqdq t1,hc2,hc2
- vmovd 0x26(m),t1x
- vmovd 0x36(m),t2x
- vpunpcklqdq t2,t1,t1
- vperm2i128 $0x20,t1,hc2,hc2
- vpsrld $4,hc2,hc2
- vpand ANMASK(%rip),hc2,hc2
- vmovd h2,t1x
- vpaddd t1,hc2,hc2
- # hc3 = [(m[57-60] >> 6) & 0x3ffffff, (m[41-44] >> 6) & 0x3ffffff,
- # (m[25-28] >> 6) & 0x3ffffff, (m[ 9-12] >> 6) & 0x3ffffff + h3]
- vmovd 0x09(m),hc3x
- vmovd 0x19(m),t1x
- vpunpcklqdq t1,hc3,hc3
- vmovd 0x29(m),t1x
- vmovd 0x39(m),t2x
- vpunpcklqdq t2,t1,t1
- vperm2i128 $0x20,t1,hc3,hc3
- vpsrld $6,hc3,hc3
- vpand ANMASK(%rip),hc3,hc3
- vmovd h3,t1x
- vpaddd t1,hc3,hc3
- # hc4 = [(m[60-63] >> 8) | (1<<24), (m[44-47] >> 8) | (1<<24),
- # (m[28-31] >> 8) | (1<<24), (m[12-15] >> 8) | (1<<24) + h4]
- vmovd 0x0c(m),hc4x
- vmovd 0x1c(m),t1x
- vpunpcklqdq t1,hc4,hc4
- vmovd 0x2c(m),t1x
- vmovd 0x3c(m),t2x
- vpunpcklqdq t2,t1,t1
- vperm2i128 $0x20,t1,hc4,hc4
- vpsrld $8,hc4,hc4
- vpor ORMASK(%rip),hc4,hc4
- vmovd h4,t1x
- vpaddd t1,hc4,hc4
-
- # t1 = [ hc0[3] * r0, hc0[2] * u0, hc0[1] * w0, hc0[0] * y0 ]
- vpmuludq hc0,ruwy0,t1
- # t1 += [ hc1[3] * s4, hc1[2] * v4, hc1[1] * x4, hc1[0] * z4 ]
- vpmuludq hc1,svxz4,t2
- vpaddq t2,t1,t1
- # t1 += [ hc2[3] * s3, hc2[2] * v3, hc2[1] * x3, hc2[0] * z3 ]
- vpmuludq hc2,svxz3,t2
- vpaddq t2,t1,t1
- # t1 += [ hc3[3] * s2, hc3[2] * v2, hc3[1] * x2, hc3[0] * z2 ]
- vpmuludq hc3,svxz2,t2
- vpaddq t2,t1,t1
- # t1 += [ hc4[3] * s1, hc4[2] * v1, hc4[1] * x1, hc4[0] * z1 ]
- vpmuludq hc4,svxz1,t2
- vpaddq t2,t1,t1
- # d0 = t1[0] + t1[1] + t[2] + t[3]
- vpermq $0xee,t1,t2
- vpaddq t2,t1,t1
- vpsrldq $8,t1,t2
- vpaddq t2,t1,t1
- vmovq t1x,d0
-
- # t1 = [ hc0[3] * r1, hc0[2] * u1,hc0[1] * w1, hc0[0] * y1 ]
- vpmuludq hc0,ruwy1,t1
- # t1 += [ hc1[3] * r0, hc1[2] * u0, hc1[1] * w0, hc1[0] * y0 ]
- vpmuludq hc1,ruwy0,t2
- vpaddq t2,t1,t1
- # t1 += [ hc2[3] * s4, hc2[2] * v4, hc2[1] * x4, hc2[0] * z4 ]
- vpmuludq hc2,svxz4,t2
- vpaddq t2,t1,t1
- # t1 += [ hc3[3] * s3, hc3[2] * v3, hc3[1] * x3, hc3[0] * z3 ]
- vpmuludq hc3,svxz3,t2
- vpaddq t2,t1,t1
- # t1 += [ hc4[3] * s2, hc4[2] * v2, hc4[1] * x2, hc4[0] * z2 ]
- vpmuludq hc4,svxz2,t2
- vpaddq t2,t1,t1
- # d1 = t1[0] + t1[1] + t1[3] + t1[4]
- vpermq $0xee,t1,t2
- vpaddq t2,t1,t1
- vpsrldq $8,t1,t2
- vpaddq t2,t1,t1
- vmovq t1x,d1
-
- # t1 = [ hc0[3] * r2, hc0[2] * u2, hc0[1] * w2, hc0[0] * y2 ]
- vpmuludq hc0,ruwy2,t1
- # t1 += [ hc1[3] * r1, hc1[2] * u1, hc1[1] * w1, hc1[0] * y1 ]
- vpmuludq hc1,ruwy1,t2
- vpaddq t2,t1,t1
- # t1 += [ hc2[3] * r0, hc2[2] * u0, hc2[1] * w0, hc2[0] * y0 ]
- vpmuludq hc2,ruwy0,t2
- vpaddq t2,t1,t1
- # t1 += [ hc3[3] * s4, hc3[2] * v4, hc3[1] * x4, hc3[0] * z4 ]
- vpmuludq hc3,svxz4,t2
- vpaddq t2,t1,t1
- # t1 += [ hc4[3] * s3, hc4[2] * v3, hc4[1] * x3, hc4[0] * z3 ]
- vpmuludq hc4,svxz3,t2
- vpaddq t2,t1,t1
- # d2 = t1[0] + t1[1] + t1[2] + t1[3]
- vpermq $0xee,t1,t2
- vpaddq t2,t1,t1
- vpsrldq $8,t1,t2
- vpaddq t2,t1,t1
- vmovq t1x,d2
-
- # t1 = [ hc0[3] * r3, hc0[2] * u3, hc0[1] * w3, hc0[0] * y3 ]
- vpmuludq hc0,ruwy3,t1
- # t1 += [ hc1[3] * r2, hc1[2] * u2, hc1[1] * w2, hc1[0] * y2 ]
- vpmuludq hc1,ruwy2,t2
- vpaddq t2,t1,t1
- # t1 += [ hc2[3] * r1, hc2[2] * u1, hc2[1] * w1, hc2[0] * y1 ]
- vpmuludq hc2,ruwy1,t2
- vpaddq t2,t1,t1
- # t1 += [ hc3[3] * r0, hc3[2] * u0, hc3[1] * w0, hc3[0] * y0 ]
- vpmuludq hc3,ruwy0,t2
- vpaddq t2,t1,t1
- # t1 += [ hc4[3] * s4, hc4[2] * v4, hc4[1] * x4, hc4[0] * z4 ]
- vpmuludq hc4,svxz4,t2
- vpaddq t2,t1,t1
- # d3 = t1[0] + t1[1] + t1[2] + t1[3]
- vpermq $0xee,t1,t2
- vpaddq t2,t1,t1
- vpsrldq $8,t1,t2
- vpaddq t2,t1,t1
- vmovq t1x,d3
-
- # t1 = [ hc0[3] * r4, hc0[2] * u4, hc0[1] * w4, hc0[0] * y4 ]
- vpmuludq hc0,ruwy4,t1
- # t1 += [ hc1[3] * r3, hc1[2] * u3, hc1[1] * w3, hc1[0] * y3 ]
- vpmuludq hc1,ruwy3,t2
- vpaddq t2,t1,t1
- # t1 += [ hc2[3] * r2, hc2[2] * u2, hc2[1] * w2, hc2[0] * y2 ]
- vpmuludq hc2,ruwy2,t2
- vpaddq t2,t1,t1
- # t1 += [ hc3[3] * r1, hc3[2] * u1, hc3[1] * w1, hc3[0] * y1 ]
- vpmuludq hc3,ruwy1,t2
- vpaddq t2,t1,t1
- # t1 += [ hc4[3] * r0, hc4[2] * u0, hc4[1] * w0, hc4[0] * y0 ]
- vpmuludq hc4,ruwy0,t2
- vpaddq t2,t1,t1
- # d4 = t1[0] + t1[1] + t1[2] + t1[3]
- vpermq $0xee,t1,t2
- vpaddq t2,t1,t1
- vpsrldq $8,t1,t2
- vpaddq t2,t1,t1
- vmovq t1x,d4
-
- # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
- # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
- # amount. Careful: we must not assume the carry bits 'd0 >> 26',
- # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
- # integers. It's true in a single-block implementation, but not here.
-
- # d1 += d0 >> 26
- mov d0,%rax
- shr $26,%rax
- add %rax,d1
- # h0 = d0 & 0x3ffffff
- mov d0,%rbx
- and $0x3ffffff,%ebx
-
- # d2 += d1 >> 26
- mov d1,%rax
- shr $26,%rax
- add %rax,d2
- # h1 = d1 & 0x3ffffff
- mov d1,%rax
- and $0x3ffffff,%eax
- mov %eax,h1
-
- # d3 += d2 >> 26
- mov d2,%rax
- shr $26,%rax
- add %rax,d3
- # h2 = d2 & 0x3ffffff
- mov d2,%rax
- and $0x3ffffff,%eax
- mov %eax,h2
-
- # d4 += d3 >> 26
- mov d3,%rax
- shr $26,%rax
- add %rax,d4
- # h3 = d3 & 0x3ffffff
- mov d3,%rax
- and $0x3ffffff,%eax
- mov %eax,h3
-
- # h0 += (d4 >> 26) * 5
- mov d4,%rax
- shr $26,%rax
- lea (%rax,%rax,4),%rax
- add %rax,%rbx
- # h4 = d4 & 0x3ffffff
- mov d4,%rax
- and $0x3ffffff,%eax
- mov %eax,h4
-
- # h1 += h0 >> 26
- mov %rbx,%rax
- shr $26,%rax
- add %eax,h1
- # h0 = h0 & 0x3ffffff
- andl $0x3ffffff,%ebx
- mov %ebx,h0
-
- add $0x40,m
- dec %rcx
- jnz .Ldoblock4
-
- vzeroupper
- pop %r13
- pop %r12
- pop %rbx
- ret
-SYM_FUNC_END(poly1305_4block_avx2)
diff --git a/arch/x86/crypto/poly1305-sse2-x86_64.S b/arch/x86/crypto/poly1305-sse2-x86_64.S
deleted file mode 100644
index d8ea29b96640..000000000000
--- a/arch/x86/crypto/poly1305-sse2-x86_64.S
+++ /dev/null
@@ -1,590 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions
- *
- * Copyright (C) 2015 Martin Willi
- */
-
-#include <linux/linkage.h>
-
-.section .rodata.cst16.ANMASK, "aM", @progbits, 16
-.align 16
-ANMASK: .octa 0x0000000003ffffff0000000003ffffff
-
-.section .rodata.cst16.ORMASK, "aM", @progbits, 16
-.align 16
-ORMASK: .octa 0x00000000010000000000000001000000
-
-.text
-
-#define h0 0x00(%rdi)
-#define h1 0x04(%rdi)
-#define h2 0x08(%rdi)
-#define h3 0x0c(%rdi)
-#define h4 0x10(%rdi)
-#define r0 0x00(%rdx)
-#define r1 0x04(%rdx)
-#define r2 0x08(%rdx)
-#define r3 0x0c(%rdx)
-#define r4 0x10(%rdx)
-#define s1 0x00(%rsp)
-#define s2 0x04(%rsp)
-#define s3 0x08(%rsp)
-#define s4 0x0c(%rsp)
-#define m %rsi
-#define h01 %xmm0
-#define h23 %xmm1
-#define h44 %xmm2
-#define t1 %xmm3
-#define t2 %xmm4
-#define t3 %xmm5
-#define t4 %xmm6
-#define mask %xmm7
-#define d0 %r8
-#define d1 %r9
-#define d2 %r10
-#define d3 %r11
-#define d4 %r12
-
-SYM_FUNC_START(poly1305_block_sse2)
- # %rdi: Accumulator h[5]
- # %rsi: 16 byte input block m
- # %rdx: Poly1305 key r[5]
- # %rcx: Block count
-
- # This single block variant tries to improve performance by doing two
- # multiplications in parallel using SSE instructions. There is quite
- # some quardword packing involved, hence the speedup is marginal.
-
- push %rbx
- push %r12
- sub $0x10,%rsp
-
- # s1..s4 = r1..r4 * 5
- mov r1,%eax
- lea (%eax,%eax,4),%eax
- mov %eax,s1
- mov r2,%eax
- lea (%eax,%eax,4),%eax
- mov %eax,s2
- mov r3,%eax
- lea (%eax,%eax,4),%eax
- mov %eax,s3
- mov r4,%eax
- lea (%eax,%eax,4),%eax
- mov %eax,s4
-
- movdqa ANMASK(%rip),mask
-
-.Ldoblock:
- # h01 = [0, h1, 0, h0]
- # h23 = [0, h3, 0, h2]
- # h44 = [0, h4, 0, h4]
- movd h0,h01
- movd h1,t1
- movd h2,h23
- movd h3,t2
- movd h4,h44
- punpcklqdq t1,h01
- punpcklqdq t2,h23
- punpcklqdq h44,h44
-
- # h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ]
- movd 0x00(m),t1
- movd 0x03(m),t2
- psrld $2,t2
- punpcklqdq t2,t1
- pand mask,t1
- paddd t1,h01
- # h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ]
- movd 0x06(m),t1
- movd 0x09(m),t2
- psrld $4,t1
- psrld $6,t2
- punpcklqdq t2,t1
- pand mask,t1
- paddd t1,h23
- # h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ]
- mov 0x0c(m),%eax
- shr $8,%eax
- or $0x01000000,%eax
- movd %eax,t1
- pshufd $0xc4,t1,t1
- paddd t1,h44
-
- # t1[0] = h0 * r0 + h2 * s3
- # t1[1] = h1 * s4 + h3 * s2
- movd r0,t1
- movd s4,t2
- punpcklqdq t2,t1
- pmuludq h01,t1
- movd s3,t2
- movd s2,t3
- punpcklqdq t3,t2
- pmuludq h23,t2
- paddq t2,t1
- # t2[0] = h0 * r1 + h2 * s4
- # t2[1] = h1 * r0 + h3 * s3
- movd r1,t2
- movd r0,t3
- punpcklqdq t3,t2
- pmuludq h01,t2
- movd s4,t3
- movd s3,t4
- punpcklqdq t4,t3
- pmuludq h23,t3
- paddq t3,t2
- # t3[0] = h4 * s1
- # t3[1] = h4 * s2
- movd s1,t3
- movd s2,t4
- punpcklqdq t4,t3
- pmuludq h44,t3
- # d0 = t1[0] + t1[1] + t3[0]
- # d1 = t2[0] + t2[1] + t3[1]
- movdqa t1,t4
- punpcklqdq t2,t4
- punpckhqdq t2,t1
- paddq t4,t1
- paddq t3,t1
- movq t1,d0
- psrldq $8,t1
- movq t1,d1
-
- # t1[0] = h0 * r2 + h2 * r0
- # t1[1] = h1 * r1 + h3 * s4
- movd r2,t1
- movd r1,t2
- punpcklqdq t2,t1
- pmuludq h01,t1
- movd r0,t2
- movd s4,t3
- punpcklqdq t3,t2
- pmuludq h23,t2
- paddq t2,t1
- # t2[0] = h0 * r3 + h2 * r1
- # t2[1] = h1 * r2 + h3 * r0
- movd r3,t2
- movd r2,t3
- punpcklqdq t3,t2
- pmuludq h01,t2
- movd r1,t3
- movd r0,t4
- punpcklqdq t4,t3
- pmuludq h23,t3
- paddq t3,t2
- # t3[0] = h4 * s3
- # t3[1] = h4 * s4
- movd s3,t3
- movd s4,t4
- punpcklqdq t4,t3
- pmuludq h44,t3
- # d2 = t1[0] + t1[1] + t3[0]
- # d3 = t2[0] + t2[1] + t3[1]
- movdqa t1,t4
- punpcklqdq t2,t4
- punpckhqdq t2,t1
- paddq t4,t1
- paddq t3,t1
- movq t1,d2
- psrldq $8,t1
- movq t1,d3
-
- # t1[0] = h0 * r4 + h2 * r2
- # t1[1] = h1 * r3 + h3 * r1
- movd r4,t1
- movd r3,t2
- punpcklqdq t2,t1
- pmuludq h01,t1
- movd r2,t2
- movd r1,t3
- punpcklqdq t3,t2
- pmuludq h23,t2
- paddq t2,t1
- # t3[0] = h4 * r0
- movd r0,t3
- pmuludq h44,t3
- # d4 = t1[0] + t1[1] + t3[0]
- movdqa t1,t4
- psrldq $8,t4
- paddq t4,t1
- paddq t3,t1
- movq t1,d4
-
- # d1 += d0 >> 26
- mov d0,%rax
- shr $26,%rax
- add %rax,d1
- # h0 = d0 & 0x3ffffff
- mov d0,%rbx
- and $0x3ffffff,%ebx
-
- # d2 += d1 >> 26
- mov d1,%rax
- shr $26,%rax
- add %rax,d2
- # h1 = d1 & 0x3ffffff
- mov d1,%rax
- and $0x3ffffff,%eax
- mov %eax,h1
-
- # d3 += d2 >> 26
- mov d2,%rax
- shr $26,%rax
- add %rax,d3
- # h2 = d2 & 0x3ffffff
- mov d2,%rax
- and $0x3ffffff,%eax
- mov %eax,h2
-
- # d4 += d3 >> 26
- mov d3,%rax
- shr $26,%rax
- add %rax,d4
- # h3 = d3 & 0x3ffffff
- mov d3,%rax
- and $0x3ffffff,%eax
- mov %eax,h3
-
- # h0 += (d4 >> 26) * 5
- mov d4,%rax
- shr $26,%rax
- lea (%rax,%rax,4),%rax
- add %rax,%rbx
- # h4 = d4 & 0x3ffffff
- mov d4,%rax
- and $0x3ffffff,%eax
- mov %eax,h4
-
- # h1 += h0 >> 26
- mov %rbx,%rax
- shr $26,%rax
- add %eax,h1
- # h0 = h0 & 0x3ffffff
- andl $0x3ffffff,%ebx
- mov %ebx,h0
-
- add $0x10,m
- dec %rcx
- jnz .Ldoblock
-
- # Zeroing of key material
- mov %rcx,0x00(%rsp)
- mov %rcx,0x08(%rsp)
-
- add $0x10,%rsp
- pop %r12
- pop %rbx
- ret
-SYM_FUNC_END(poly1305_block_sse2)
-
-
-#define u0 0x00(%r8)
-#define u1 0x04(%r8)
-#define u2 0x08(%r8)
-#define u3 0x0c(%r8)
-#define u4 0x10(%r8)
-#define hc0 %xmm0
-#define hc1 %xmm1
-#define hc2 %xmm2
-#define hc3 %xmm5
-#define hc4 %xmm6
-#define ru0 %xmm7
-#define ru1 %xmm8
-#define ru2 %xmm9
-#define ru3 %xmm10
-#define ru4 %xmm11
-#define sv1 %xmm12
-#define sv2 %xmm13
-#define sv3 %xmm14
-#define sv4 %xmm15
-#undef d0
-#define d0 %r13
-
-SYM_FUNC_START(poly1305_2block_sse2)
- # %rdi: Accumulator h[5]
- # %rsi: 16 byte input block m
- # %rdx: Poly1305 key r[5]
- # %rcx: Doubleblock count
- # %r8: Poly1305 derived key r^2 u[5]
-
- # This two-block variant further improves performance by using loop
- # unrolled block processing. This is more straight forward and does
- # less byte shuffling, but requires a second Poly1305 key r^2:
- # h = (h + m) * r => h = (h + m1) * r^2 + m2 * r
-
- push %rbx
- push %r12
- push %r13
-
- # combine r0,u0
- movd u0,ru0
- movd r0,t1
- punpcklqdq t1,ru0
-
- # combine r1,u1 and s1=r1*5,v1=u1*5
- movd u1,ru1
- movd r1,t1
- punpcklqdq t1,ru1
- movdqa ru1,sv1
- pslld $2,sv1
- paddd ru1,sv1
-
- # combine r2,u2 and s2=r2*5,v2=u2*5
- movd u2,ru2
- movd r2,t1
- punpcklqdq t1,ru2
- movdqa ru2,sv2
- pslld $2,sv2
- paddd ru2,sv2
-
- # combine r3,u3 and s3=r3*5,v3=u3*5
- movd u3,ru3
- movd r3,t1
- punpcklqdq t1,ru3
- movdqa ru3,sv3
- pslld $2,sv3
- paddd ru3,sv3
-
- # combine r4,u4 and s4=r4*5,v4=u4*5
- movd u4,ru4
- movd r4,t1
- punpcklqdq t1,ru4
- movdqa ru4,sv4
- pslld $2,sv4
- paddd ru4,sv4
-
-.Ldoblock2:
- # hc0 = [ m[16-19] & 0x3ffffff, h0 + m[0-3] & 0x3ffffff ]
- movd 0x00(m),hc0
- movd 0x10(m),t1
- punpcklqdq t1,hc0
- pand ANMASK(%rip),hc0
- movd h0,t1
- paddd t1,hc0
- # hc1 = [ (m[19-22] >> 2) & 0x3ffffff, h1 + (m[3-6] >> 2) & 0x3ffffff ]
- movd 0x03(m),hc1
- movd 0x13(m),t1
- punpcklqdq t1,hc1
- psrld $2,hc1
- pand ANMASK(%rip),hc1
- movd h1,t1
- paddd t1,hc1
- # hc2 = [ (m[22-25] >> 4) & 0x3ffffff, h2 + (m[6-9] >> 4) & 0x3ffffff ]
- movd 0x06(m),hc2
- movd 0x16(m),t1
- punpcklqdq t1,hc2
- psrld $4,hc2
- pand ANMASK(%rip),hc2
- movd h2,t1
- paddd t1,hc2
- # hc3 = [ (m[25-28] >> 6) & 0x3ffffff, h3 + (m[9-12] >> 6) & 0x3ffffff ]
- movd 0x09(m),hc3
- movd 0x19(m),t1
- punpcklqdq t1,hc3
- psrld $6,hc3
- pand ANMASK(%rip),hc3
- movd h3,t1
- paddd t1,hc3
- # hc4 = [ (m[28-31] >> 8) | (1<<24), h4 + (m[12-15] >> 8) | (1<<24) ]
- movd 0x0c(m),hc4
- movd 0x1c(m),t1
- punpcklqdq t1,hc4
- psrld $8,hc4
- por ORMASK(%rip),hc4
- movd h4,t1
- paddd t1,hc4
-
- # t1 = [ hc0[1] * r0, hc0[0] * u0 ]
- movdqa ru0,t1
- pmuludq hc0,t1
- # t1 += [ hc1[1] * s4, hc1[0] * v4 ]
- movdqa sv4,t2
- pmuludq hc1,t2
- paddq t2,t1
- # t1 += [ hc2[1] * s3, hc2[0] * v3 ]
- movdqa sv3,t2
- pmuludq hc2,t2
- paddq t2,t1
- # t1 += [ hc3[1] * s2, hc3[0] * v2 ]
- movdqa sv2,t2
- pmuludq hc3,t2
- paddq t2,t1
- # t1 += [ hc4[1] * s1, hc4[0] * v1 ]
- movdqa sv1,t2
- pmuludq hc4,t2
- paddq t2,t1
- # d0 = t1[0] + t1[1]
- movdqa t1,t2
- psrldq $8,t2
- paddq t2,t1
- movq t1,d0
-
- # t1 = [ hc0[1] * r1, hc0[0] * u1 ]
- movdqa ru1,t1
- pmuludq hc0,t1
- # t1 += [ hc1[1] * r0, hc1[0] * u0 ]
- movdqa ru0,t2
- pmuludq hc1,t2
- paddq t2,t1
- # t1 += [ hc2[1] * s4, hc2[0] * v4 ]
- movdqa sv4,t2
- pmuludq hc2,t2
- paddq t2,t1
- # t1 += [ hc3[1] * s3, hc3[0] * v3 ]
- movdqa sv3,t2
- pmuludq hc3,t2
- paddq t2,t1
- # t1 += [ hc4[1] * s2, hc4[0] * v2 ]
- movdqa sv2,t2
- pmuludq hc4,t2
- paddq t2,t1
- # d1 = t1[0] + t1[1]
- movdqa t1,t2
- psrldq $8,t2
- paddq t2,t1
- movq t1,d1
-
- # t1 = [ hc0[1] * r2, hc0[0] * u2 ]
- movdqa ru2,t1
- pmuludq hc0,t1
- # t1 += [ hc1[1] * r1, hc1[0] * u1 ]
- movdqa ru1,t2
- pmuludq hc1,t2
- paddq t2,t1
- # t1 += [ hc2[1] * r0, hc2[0] * u0 ]
- movdqa ru0,t2
- pmuludq hc2,t2
- paddq t2,t1
- # t1 += [ hc3[1] * s4, hc3[0] * v4 ]
- movdqa sv4,t2
- pmuludq hc3,t2
- paddq t2,t1
- # t1 += [ hc4[1] * s3, hc4[0] * v3 ]
- movdqa sv3,t2
- pmuludq hc4,t2
- paddq t2,t1
- # d2 = t1[0] + t1[1]
- movdqa t1,t2
- psrldq $8,t2
- paddq t2,t1
- movq t1,d2
-
- # t1 = [ hc0[1] * r3, hc0[0] * u3 ]
- movdqa ru3,t1
- pmuludq hc0,t1
- # t1 += [ hc1[1] * r2, hc1[0] * u2 ]
- movdqa ru2,t2
- pmuludq hc1,t2
- paddq t2,t1
- # t1 += [ hc2[1] * r1, hc2[0] * u1 ]
- movdqa ru1,t2
- pmuludq hc2,t2
- paddq t2,t1
- # t1 += [ hc3[1] * r0, hc3[0] * u0 ]
- movdqa ru0,t2
- pmuludq hc3,t2
- paddq t2,t1
- # t1 += [ hc4[1] * s4, hc4[0] * v4 ]
- movdqa sv4,t2
- pmuludq hc4,t2
- paddq t2,t1
- # d3 = t1[0] + t1[1]
- movdqa t1,t2
- psrldq $8,t2
- paddq t2,t1
- movq t1,d3
-
- # t1 = [ hc0[1] * r4, hc0[0] * u4 ]
- movdqa ru4,t1
- pmuludq hc0,t1
- # t1 += [ hc1[1] * r3, hc1[0] * u3 ]
- movdqa ru3,t2
- pmuludq hc1,t2
- paddq t2,t1
- # t1 += [ hc2[1] * r2, hc2[0] * u2 ]
- movdqa ru2,t2
- pmuludq hc2,t2
- paddq t2,t1
- # t1 += [ hc3[1] * r1, hc3[0] * u1 ]
- movdqa ru1,t2
- pmuludq hc3,t2
- paddq t2,t1
- # t1 += [ hc4[1] * r0, hc4[0] * u0 ]
- movdqa ru0,t2
- pmuludq hc4,t2
- paddq t2,t1
- # d4 = t1[0] + t1[1]
- movdqa t1,t2
- psrldq $8,t2
- paddq t2,t1
- movq t1,d4
-
- # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
- # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
- # amount. Careful: we must not assume the carry bits 'd0 >> 26',
- # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
- # integers. It's true in a single-block implementation, but not here.
-
- # d1 += d0 >> 26
- mov d0,%rax
- shr $26,%rax
- add %rax,d1
- # h0 = d0 & 0x3ffffff
- mov d0,%rbx
- and $0x3ffffff,%ebx
-
- # d2 += d1 >> 26
- mov d1,%rax
- shr $26,%rax
- add %rax,d2
- # h1 = d1 & 0x3ffffff
- mov d1,%rax
- and $0x3ffffff,%eax
- mov %eax,h1
-
- # d3 += d2 >> 26
- mov d2,%rax
- shr $26,%rax
- add %rax,d3
- # h2 = d2 & 0x3ffffff
- mov d2,%rax
- and $0x3ffffff,%eax
- mov %eax,h2
-
- # d4 += d3 >> 26
- mov d3,%rax
- shr $26,%rax
- add %rax,d4
- # h3 = d3 & 0x3ffffff
- mov d3,%rax
- and $0x3ffffff,%eax
- mov %eax,h3
-
- # h0 += (d4 >> 26) * 5
- mov d4,%rax
- shr $26,%rax
- lea (%rax,%rax,4),%rax
- add %rax,%rbx
- # h4 = d4 & 0x3ffffff
- mov d4,%rax
- and $0x3ffffff,%eax
- mov %eax,h4
-
- # h1 += h0 >> 26
- mov %rbx,%rax
- shr $26,%rax
- add %eax,h1
- # h0 = h0 & 0x3ffffff
- andl $0x3ffffff,%ebx
- mov %ebx,h0
-
- add $0x20,m
- dec %rcx
- jnz .Ldoblock2
-
- pop %r13
- pop %r12
- pop %rbx
- ret
-SYM_FUNC_END(poly1305_2block_sse2)
diff --git a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
new file mode 100644
index 000000000000..7a6b5380a46f
--- /dev/null
+++ b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
@@ -0,0 +1,4265 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+#
+# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
+# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+#
+# This code is taken from the OpenSSL project but the author, Andy Polyakov,
+# has relicensed it under the licenses specified in the SPDX header above.
+# The original headers, including the original license headers, are
+# included below for completeness.
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements Poly1305 hash for x86_64.
+#
+# March 2015
+#
+# Initial release.
+#
+# December 2016
+#
+# Add AVX512F+VL+BW code path.
+#
+# November 2017
+#
+# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be
+# executed even on Knights Landing. Trigger for modification was
+# observation that AVX512 code paths can negatively affect overall
+# Skylake-X system performance. Since we are likely to suppress
+# AVX512F capability flag [at least on Skylake-X], conversion serves
+# as kind of "investment protection". Note that next *lake processor,
+# Cannonlake, has AVX512IFMA code path to execute...
+#
+# Numbers are cycles per processed byte with poly1305_blocks alone,
+# measured with rdtsc at fixed clock frequency.
+#
+# IALU/gcc-4.8(*) AVX(**) AVX2 AVX-512
+# P4 4.46/+120% -
+# Core 2 2.41/+90% -
+# Westmere 1.88/+120% -
+# Sandy Bridge 1.39/+140% 1.10
+# Haswell 1.14/+175% 1.11 0.65
+# Skylake[-X] 1.13/+120% 0.96 0.51 [0.35]
+# Silvermont 2.83/+95% -
+# Knights L 3.60/? 1.65 1.10 0.41(***)
+# Goldmont 1.70/+180% -
+# VIA Nano 1.82/+150% -
+# Sledgehammer 1.38/+160% -
+# Bulldozer 2.30/+130% 0.97
+# Ryzen 1.15/+200% 1.08 1.18
+#
+# (*) improvement coefficients relative to clang are more modest and
+# are ~50% on most processors, in both cases we are comparing to
+# __int128 code;
+# (**) SSE2 implementation was attempted, but among non-AVX processors
+# it was faster than integer-only code only on older Intel P4 and
+# Core processors, 50-30%, less newer processor is, but slower on
+# contemporary ones, for example almost 2x slower on Atom, and as
+# former are naturally disappearing, SSE2 is deemed unnecessary;
+# (***) strangely enough performance seems to vary from core to core,
+# listed result is best case;
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+$kernel=0; $kernel=1 if (!$flavour && !$output);
+
+if (!$kernel) {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+ die "can't locate x86_64-xlate.pl";
+
+ open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+ *STDOUT=*OUT;
+
+ if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+ =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+ $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
+ }
+
+ if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+ `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
+ $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
+ $avx += 1 if ($1==2.11 && $2>=8);
+ }
+
+ if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+ `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+ $avx = ($1>=10) + ($1>=11);
+ }
+
+ if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
+ $avx = ($2>=3.0) + ($2>3.0);
+ }
+} else {
+ $avx = 4; # The kernel uses ifdefs for this.
+}
+
+sub declare_function() {
+ my ($name, $align, $nargs) = @_;
+ if($kernel) {
+ $code .= ".align $align\n";
+ $code .= "SYM_FUNC_START($name)\n";
+ $code .= ".L$name:\n";
+ } else {
+ $code .= ".globl $name\n";
+ $code .= ".type $name,\@function,$nargs\n";
+ $code .= ".align $align\n";
+ $code .= "$name:\n";
+ }
+}
+
+sub end_function() {
+ my ($name) = @_;
+ if($kernel) {
+ $code .= "SYM_FUNC_END($name)\n";
+ } else {
+ $code .= ".size $name,.-$name\n";
+ }
+}
+
+$code.=<<___ if $kernel;
+#include <linux/linkage.h>
+___
+
+if ($avx) {
+$code.=<<___ if $kernel;
+.section .rodata
+___
+$code.=<<___;
+.align 64
+.Lconst:
+.Lmask24:
+.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
+.L129:
+.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
+.Lmask26:
+.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
+.Lpermd_avx2:
+.long 2,2,2,3,2,0,2,1
+.Lpermd_avx512:
+.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
+
+.L2_44_inp_permd:
+.long 0,1,1,2,2,3,7,7
+.L2_44_inp_shift:
+.quad 0,12,24,64
+.L2_44_mask:
+.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
+.L2_44_shift_rgt:
+.quad 44,44,42,64
+.L2_44_shift_lft:
+.quad 8,8,10,64
+
+.align 64
+.Lx_mask44:
+.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
+.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
+.Lx_mask42:
+.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
+.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
+___
+}
+$code.=<<___ if (!$kernel);
+.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 16
+___
+
+my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
+my ($mac,$nonce)=($inp,$len); # *_emit arguments
+my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
+my ($h0,$h1,$h2)=("%r14","%rbx","%r10");
+
+sub poly1305_iteration {
+# input: copy of $r1 in %rax, $h0-$h2, $r0-$r1
+# output: $h0-$h2 *= $r0-$r1
+$code.=<<___;
+ mulq $h0 # h0*r1
+ mov %rax,$d2
+ mov $r0,%rax
+ mov %rdx,$d3
+
+ mulq $h0 # h0*r0
+ mov %rax,$h0 # future $h0
+ mov $r0,%rax
+ mov %rdx,$d1
+
+ mulq $h1 # h1*r0
+ add %rax,$d2
+ mov $s1,%rax
+ adc %rdx,$d3
+
+ mulq $h1 # h1*s1
+ mov $h2,$h1 # borrow $h1
+ add %rax,$h0
+ adc %rdx,$d1
+
+ imulq $s1,$h1 # h2*s1
+ add $h1,$d2
+ mov $d1,$h1
+ adc \$0,$d3
+
+ imulq $r0,$h2 # h2*r0
+ add $d2,$h1
+ mov \$-4,%rax # mask value
+ adc $h2,$d3
+
+ and $d3,%rax # last reduction step
+ mov $d3,$h2
+ shr \$2,$d3
+ and \$3,$h2
+ add $d3,%rax
+ add %rax,$h0
+ adc \$0,$h1
+ adc \$0,$h2
+___
+}
+
+########################################################################
+# Layout of opaque area is following.
+#
+# unsigned __int64 h[3]; # current hash value base 2^64
+# unsigned __int64 r[2]; # key value base 2^64
+
+$code.=<<___;
+.text
+___
+$code.=<<___ if (!$kernel);
+.extern OPENSSL_ia32cap_P
+
+.globl poly1305_init_x86_64
+.hidden poly1305_init_x86_64
+.globl poly1305_blocks_x86_64
+.hidden poly1305_blocks_x86_64
+.globl poly1305_emit_x86_64
+.hidden poly1305_emit_x86_64
+___
+&declare_function("poly1305_init_x86_64", 32, 3);
+$code.=<<___;
+ xor %rax,%rax
+ mov %rax,0($ctx) # initialize hash value
+ mov %rax,8($ctx)
+ mov %rax,16($ctx)
+
+ cmp \$0,$inp
+ je .Lno_key
+___
+$code.=<<___ if (!$kernel);
+ lea poly1305_blocks_x86_64(%rip),%r10
+ lea poly1305_emit_x86_64(%rip),%r11
+___
+$code.=<<___ if (!$kernel && $avx);
+ mov OPENSSL_ia32cap_P+4(%rip),%r9
+ lea poly1305_blocks_avx(%rip),%rax
+ lea poly1305_emit_avx(%rip),%rcx
+ bt \$`60-32`,%r9 # AVX?
+ cmovc %rax,%r10
+ cmovc %rcx,%r11
+___
+$code.=<<___ if (!$kernel && $avx>1);
+ lea poly1305_blocks_avx2(%rip),%rax
+ bt \$`5+32`,%r9 # AVX2?
+ cmovc %rax,%r10
+___
+$code.=<<___ if (!$kernel && $avx>3);
+ mov \$`(1<<31|1<<21|1<<16)`,%rax
+ shr \$32,%r9
+ and %rax,%r9
+ cmp %rax,%r9
+ je .Linit_base2_44
+___
+$code.=<<___;
+ mov \$0x0ffffffc0fffffff,%rax
+ mov \$0x0ffffffc0ffffffc,%rcx
+ and 0($inp),%rax
+ and 8($inp),%rcx
+ mov %rax,24($ctx)
+ mov %rcx,32($ctx)
+___
+$code.=<<___ if (!$kernel && $flavour !~ /elf32/);
+ mov %r10,0(%rdx)
+ mov %r11,8(%rdx)
+___
+$code.=<<___ if (!$kernel && $flavour =~ /elf32/);
+ mov %r10d,0(%rdx)
+ mov %r11d,4(%rdx)
+___
+$code.=<<___;
+ mov \$1,%eax
+.Lno_key:
+ ret
+___
+&end_function("poly1305_init_x86_64");
+
+&declare_function("poly1305_blocks_x86_64", 32, 4);
+$code.=<<___;
+.cfi_startproc
+.Lblocks:
+ shr \$4,$len
+ jz .Lno_data # too short
+
+ push %rbx
+.cfi_push %rbx
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+ push $ctx
+.cfi_push $ctx
+.Lblocks_body:
+
+ mov $len,%r15 # reassign $len
+
+ mov 24($ctx),$r0 # load r
+ mov 32($ctx),$s1
+
+ mov 0($ctx),$h0 # load hash value
+ mov 8($ctx),$h1
+ mov 16($ctx),$h2
+
+ mov $s1,$r1
+ shr \$2,$s1
+ mov $r1,%rax
+ add $r1,$s1 # s1 = r1 + (r1 >> 2)
+ jmp .Loop
+
+.align 32
+.Loop:
+ add 0($inp),$h0 # accumulate input
+ adc 8($inp),$h1
+ lea 16($inp),$inp
+ adc $padbit,$h2
+___
+
+ &poly1305_iteration();
+
+$code.=<<___;
+ mov $r1,%rax
+ dec %r15 # len-=16
+ jnz .Loop
+
+ mov 0(%rsp),$ctx
+.cfi_restore $ctx
+
+ mov $h0,0($ctx) # store hash value
+ mov $h1,8($ctx)
+ mov $h2,16($ctx)
+
+ mov 8(%rsp),%r15
+.cfi_restore %r15
+ mov 16(%rsp),%r14
+.cfi_restore %r14
+ mov 24(%rsp),%r13
+.cfi_restore %r13
+ mov 32(%rsp),%r12
+.cfi_restore %r12
+ mov 40(%rsp),%rbx
+.cfi_restore %rbx
+ lea 48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
+.Lno_data:
+.Lblocks_epilogue:
+ ret
+.cfi_endproc
+___
+&end_function("poly1305_blocks_x86_64");
+
+&declare_function("poly1305_emit_x86_64", 32, 3);
+$code.=<<___;
+.Lemit:
+ mov 0($ctx),%r8 # load hash value
+ mov 8($ctx),%r9
+ mov 16($ctx),%r10
+
+ mov %r8,%rax
+ add \$5,%r8 # compare to modulus
+ mov %r9,%rcx
+ adc \$0,%r9
+ adc \$0,%r10
+ shr \$2,%r10 # did 130-bit value overflow?
+ cmovnz %r8,%rax
+ cmovnz %r9,%rcx
+
+ add 0($nonce),%rax # accumulate nonce
+ adc 8($nonce),%rcx
+ mov %rax,0($mac) # write result
+ mov %rcx,8($mac)
+
+ ret
+___
+&end_function("poly1305_emit_x86_64");
+if ($avx) {
+
+if($kernel) {
+ $code .= "#ifdef CONFIG_AS_AVX\n";
+}
+
+########################################################################
+# Layout of opaque area is following.
+#
+# unsigned __int32 h[5]; # current hash value base 2^26
+# unsigned __int32 is_base2_26;
+# unsigned __int64 r[2]; # key value base 2^64
+# unsigned __int64 pad;
+# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
+#
+# where r^n are base 2^26 digits of degrees of multiplier key. There are
+# 5 digits, but last four are interleaved with multiples of 5, totalling
+# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
+
+my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
+ map("%xmm$_",(0..15));
+
+$code.=<<___;
+.type __poly1305_block,\@abi-omnipotent
+.align 32
+__poly1305_block:
+ push $ctx
+___
+ &poly1305_iteration();
+$code.=<<___;
+ pop $ctx
+ ret
+.size __poly1305_block,.-__poly1305_block
+
+.type __poly1305_init_avx,\@abi-omnipotent
+.align 32
+__poly1305_init_avx:
+ push %rbp
+ mov %rsp,%rbp
+ mov $r0,$h0
+ mov $r1,$h1
+ xor $h2,$h2
+
+ lea 48+64($ctx),$ctx # size optimization
+
+ mov $r1,%rax
+ call __poly1305_block # r^2
+
+ mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26
+ mov \$0x3ffffff,%edx
+ mov $h0,$d1
+ and $h0#d,%eax
+ mov $r0,$d2
+ and $r0#d,%edx
+ mov %eax,`16*0+0-64`($ctx)
+ shr \$26,$d1
+ mov %edx,`16*0+4-64`($ctx)
+ shr \$26,$d2
+
+ mov \$0x3ffffff,%eax
+ mov \$0x3ffffff,%edx
+ and $d1#d,%eax
+ and $d2#d,%edx
+ mov %eax,`16*1+0-64`($ctx)
+ lea (%rax,%rax,4),%eax # *5
+ mov %edx,`16*1+4-64`($ctx)
+ lea (%rdx,%rdx,4),%edx # *5
+ mov %eax,`16*2+0-64`($ctx)
+ shr \$26,$d1
+ mov %edx,`16*2+4-64`($ctx)
+ shr \$26,$d2
+
+ mov $h1,%rax
+ mov $r1,%rdx
+ shl \$12,%rax
+ shl \$12,%rdx
+ or $d1,%rax
+ or $d2,%rdx
+ and \$0x3ffffff,%eax
+ and \$0x3ffffff,%edx
+ mov %eax,`16*3+0-64`($ctx)
+ lea (%rax,%rax,4),%eax # *5
+ mov %edx,`16*3+4-64`($ctx)
+ lea (%rdx,%rdx,4),%edx # *5
+ mov %eax,`16*4+0-64`($ctx)
+ mov $h1,$d1
+ mov %edx,`16*4+4-64`($ctx)
+ mov $r1,$d2
+
+ mov \$0x3ffffff,%eax
+ mov \$0x3ffffff,%edx
+ shr \$14,$d1
+ shr \$14,$d2
+ and $d1#d,%eax
+ and $d2#d,%edx
+ mov %eax,`16*5+0-64`($ctx)
+ lea (%rax,%rax,4),%eax # *5
+ mov %edx,`16*5+4-64`($ctx)
+ lea (%rdx,%rdx,4),%edx # *5
+ mov %eax,`16*6+0-64`($ctx)
+ shr \$26,$d1
+ mov %edx,`16*6+4-64`($ctx)
+ shr \$26,$d2
+
+ mov $h2,%rax
+ shl \$24,%rax
+ or %rax,$d1
+ mov $d1#d,`16*7+0-64`($ctx)
+ lea ($d1,$d1,4),$d1 # *5
+ mov $d2#d,`16*7+4-64`($ctx)
+ lea ($d2,$d2,4),$d2 # *5
+ mov $d1#d,`16*8+0-64`($ctx)
+ mov $d2#d,`16*8+4-64`($ctx)
+
+ mov $r1,%rax
+ call __poly1305_block # r^3
+
+ mov \$0x3ffffff,%eax # save r^3 base 2^26
+ mov $h0,$d1
+ and $h0#d,%eax
+ shr \$26,$d1
+ mov %eax,`16*0+12-64`($ctx)
+
+ mov \$0x3ffffff,%edx
+ and $d1#d,%edx
+ mov %edx,`16*1+12-64`($ctx)
+ lea (%rdx,%rdx,4),%edx # *5
+ shr \$26,$d1
+ mov %edx,`16*2+12-64`($ctx)
+
+ mov $h1,%rax
+ shl \$12,%rax
+ or $d1,%rax
+ and \$0x3ffffff,%eax
+ mov %eax,`16*3+12-64`($ctx)
+ lea (%rax,%rax,4),%eax # *5
+ mov $h1,$d1
+ mov %eax,`16*4+12-64`($ctx)
+
+ mov \$0x3ffffff,%edx
+ shr \$14,$d1
+ and $d1#d,%edx
+ mov %edx,`16*5+12-64`($ctx)
+ lea (%rdx,%rdx,4),%edx # *5
+ shr \$26,$d1
+ mov %edx,`16*6+12-64`($ctx)
+
+ mov $h2,%rax
+ shl \$24,%rax
+ or %rax,$d1
+ mov $d1#d,`16*7+12-64`($ctx)
+ lea ($d1,$d1,4),$d1 # *5
+ mov $d1#d,`16*8+12-64`($ctx)
+
+ mov $r1,%rax
+ call __poly1305_block # r^4
+
+ mov \$0x3ffffff,%eax # save r^4 base 2^26
+ mov $h0,$d1
+ and $h0#d,%eax
+ shr \$26,$d1
+ mov %eax,`16*0+8-64`($ctx)
+
+ mov \$0x3ffffff,%edx
+ and $d1#d,%edx
+ mov %edx,`16*1+8-64`($ctx)
+ lea (%rdx,%rdx,4),%edx # *5
+ shr \$26,$d1
+ mov %edx,`16*2+8-64`($ctx)
+
+ mov $h1,%rax
+ shl \$12,%rax
+ or $d1,%rax
+ and \$0x3ffffff,%eax
+ mov %eax,`16*3+8-64`($ctx)
+ lea (%rax,%rax,4),%eax # *5
+ mov $h1,$d1
+ mov %eax,`16*4+8-64`($ctx)
+
+ mov \$0x3ffffff,%edx
+ shr \$14,$d1
+ and $d1#d,%edx
+ mov %edx,`16*5+8-64`($ctx)
+ lea (%rdx,%rdx,4),%edx # *5
+ shr \$26,$d1
+ mov %edx,`16*6+8-64`($ctx)
+
+ mov $h2,%rax
+ shl \$24,%rax
+ or %rax,$d1
+ mov $d1#d,`16*7+8-64`($ctx)
+ lea ($d1,$d1,4),$d1 # *5
+ mov $d1#d,`16*8+8-64`($ctx)
+
+ lea -48-64($ctx),$ctx # size [de-]optimization
+ pop %rbp
+ ret
+.size __poly1305_init_avx,.-__poly1305_init_avx
+___
+
+&declare_function("poly1305_blocks_avx", 32, 4);
+$code.=<<___;
+.cfi_startproc
+ mov 20($ctx),%r8d # is_base2_26
+ cmp \$128,$len
+ jae .Lblocks_avx
+ test %r8d,%r8d
+ jz .Lblocks
+
+.Lblocks_avx:
+ and \$-16,$len
+ jz .Lno_data_avx
+
+ vzeroupper
+
+ test %r8d,%r8d
+ jz .Lbase2_64_avx
+
+ test \$31,$len
+ jz .Leven_avx
+
+ push %rbp
+.cfi_push %rbp
+ mov %rsp,%rbp
+ push %rbx
+.cfi_push %rbx
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+.Lblocks_avx_body:
+
+ mov $len,%r15 # reassign $len
+
+ mov 0($ctx),$d1 # load hash value
+ mov 8($ctx),$d2
+ mov 16($ctx),$h2#d
+
+ mov 24($ctx),$r0 # load r
+ mov 32($ctx),$s1
+
+ ################################# base 2^26 -> base 2^64
+ mov $d1#d,$h0#d
+ and \$`-1*(1<<31)`,$d1
+ mov $d2,$r1 # borrow $r1
+ mov $d2#d,$h1#d
+ and \$`-1*(1<<31)`,$d2
+
+ shr \$6,$d1
+ shl \$52,$r1
+ add $d1,$h0
+ shr \$12,$h1
+ shr \$18,$d2
+ add $r1,$h0
+ adc $d2,$h1
+
+ mov $h2,$d1
+ shl \$40,$d1
+ shr \$24,$h2
+ add $d1,$h1
+ adc \$0,$h2 # can be partially reduced...
+
+ mov \$-4,$d2 # ... so reduce
+ mov $h2,$d1
+ and $h2,$d2
+ shr \$2,$d1
+ and \$3,$h2
+ add $d2,$d1 # =*5
+ add $d1,$h0
+ adc \$0,$h1
+ adc \$0,$h2
+
+ mov $s1,$r1
+ mov $s1,%rax
+ shr \$2,$s1
+ add $r1,$s1 # s1 = r1 + (r1 >> 2)
+
+ add 0($inp),$h0 # accumulate input
+ adc 8($inp),$h1
+ lea 16($inp),$inp
+ adc $padbit,$h2
+
+ call __poly1305_block
+
+ test $padbit,$padbit # if $padbit is zero,
+ jz .Lstore_base2_64_avx # store hash in base 2^64 format
+
+ ################################# base 2^64 -> base 2^26
+ mov $h0,%rax
+ mov $h0,%rdx
+ shr \$52,$h0
+ mov $h1,$r0
+ mov $h1,$r1
+ shr \$26,%rdx
+ and \$0x3ffffff,%rax # h[0]
+ shl \$12,$r0
+ and \$0x3ffffff,%rdx # h[1]
+ shr \$14,$h1
+ or $r0,$h0
+ shl \$24,$h2
+ and \$0x3ffffff,$h0 # h[2]
+ shr \$40,$r1
+ and \$0x3ffffff,$h1 # h[3]
+ or $r1,$h2 # h[4]
+
+ sub \$16,%r15
+ jz .Lstore_base2_26_avx
+
+ vmovd %rax#d,$H0
+ vmovd %rdx#d,$H1
+ vmovd $h0#d,$H2
+ vmovd $h1#d,$H3
+ vmovd $h2#d,$H4
+ jmp .Lproceed_avx
+
+.align 32
+.Lstore_base2_64_avx:
+ mov $h0,0($ctx)
+ mov $h1,8($ctx)
+ mov $h2,16($ctx) # note that is_base2_26 is zeroed
+ jmp .Ldone_avx
+
+.align 16
+.Lstore_base2_26_avx:
+ mov %rax#d,0($ctx) # store hash value base 2^26
+ mov %rdx#d,4($ctx)
+ mov $h0#d,8($ctx)
+ mov $h1#d,12($ctx)
+ mov $h2#d,16($ctx)
+.align 16
+.Ldone_avx:
+ pop %r15
+.cfi_restore %r15
+ pop %r14
+.cfi_restore %r14
+ pop %r13
+.cfi_restore %r13
+ pop %r12
+.cfi_restore %r12
+ pop %rbx
+.cfi_restore %rbx
+ pop %rbp
+.cfi_restore %rbp
+.Lno_data_avx:
+.Lblocks_avx_epilogue:
+ ret
+.cfi_endproc
+
+.align 32
+.Lbase2_64_avx:
+.cfi_startproc
+ push %rbp
+.cfi_push %rbp
+ mov %rsp,%rbp
+ push %rbx
+.cfi_push %rbx
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+.Lbase2_64_avx_body:
+
+ mov $len,%r15 # reassign $len
+
+ mov 24($ctx),$r0 # load r
+ mov 32($ctx),$s1
+
+ mov 0($ctx),$h0 # load hash value
+ mov 8($ctx),$h1
+ mov 16($ctx),$h2#d
+
+ mov $s1,$r1
+ mov $s1,%rax
+ shr \$2,$s1
+ add $r1,$s1 # s1 = r1 + (r1 >> 2)
+
+ test \$31,$len
+ jz .Linit_avx
+
+ add 0($inp),$h0 # accumulate input
+ adc 8($inp),$h1
+ lea 16($inp),$inp
+ adc $padbit,$h2
+ sub \$16,%r15
+
+ call __poly1305_block
+
+.Linit_avx:
+ ################################# base 2^64 -> base 2^26
+ mov $h0,%rax
+ mov $h0,%rdx
+ shr \$52,$h0
+ mov $h1,$d1
+ mov $h1,$d2
+ shr \$26,%rdx
+ and \$0x3ffffff,%rax # h[0]
+ shl \$12,$d1
+ and \$0x3ffffff,%rdx # h[1]
+ shr \$14,$h1
+ or $d1,$h0
+ shl \$24,$h2
+ and \$0x3ffffff,$h0 # h[2]
+ shr \$40,$d2
+ and \$0x3ffffff,$h1 # h[3]
+ or $d2,$h2 # h[4]
+
+ vmovd %rax#d,$H0
+ vmovd %rdx#d,$H1
+ vmovd $h0#d,$H2
+ vmovd $h1#d,$H3
+ vmovd $h2#d,$H4
+ movl \$1,20($ctx) # set is_base2_26
+
+ call __poly1305_init_avx
+
+.Lproceed_avx:
+ mov %r15,$len
+ pop %r15
+.cfi_restore %r15
+ pop %r14
+.cfi_restore %r14
+ pop %r13
+.cfi_restore %r13
+ pop %r12
+.cfi_restore %r12
+ pop %rbx
+.cfi_restore %rbx
+ pop %rbp
+.cfi_restore %rbp
+.Lbase2_64_avx_epilogue:
+ jmp .Ldo_avx
+.cfi_endproc
+
+.align 32
+.Leven_avx:
+.cfi_startproc
+ vmovd 4*0($ctx),$H0 # load hash value
+ vmovd 4*1($ctx),$H1
+ vmovd 4*2($ctx),$H2
+ vmovd 4*3($ctx),$H3
+ vmovd 4*4($ctx),$H4
+
+.Ldo_avx:
+___
+$code.=<<___ if (!$win64);
+ lea 8(%rsp),%r10
+.cfi_def_cfa_register %r10
+ and \$-32,%rsp
+ sub \$-8,%rsp
+ lea -0x58(%rsp),%r11
+ sub \$0x178,%rsp
+___
+$code.=<<___ if ($win64);
+ lea -0xf8(%rsp),%r11
+ sub \$0x218,%rsp
+ vmovdqa %xmm6,0x50(%r11)
+ vmovdqa %xmm7,0x60(%r11)
+ vmovdqa %xmm8,0x70(%r11)
+ vmovdqa %xmm9,0x80(%r11)
+ vmovdqa %xmm10,0x90(%r11)
+ vmovdqa %xmm11,0xa0(%r11)
+ vmovdqa %xmm12,0xb0(%r11)
+ vmovdqa %xmm13,0xc0(%r11)
+ vmovdqa %xmm14,0xd0(%r11)
+ vmovdqa %xmm15,0xe0(%r11)
+.Ldo_avx_body:
+___
+$code.=<<___;
+ sub \$64,$len
+ lea -32($inp),%rax
+ cmovc %rax,$inp
+
+ vmovdqu `16*3`($ctx),$D4 # preload r0^2
+ lea `16*3+64`($ctx),$ctx # size optimization
+ lea .Lconst(%rip),%rcx
+
+ ################################################################
+ # load input
+ vmovdqu 16*2($inp),$T0
+ vmovdqu 16*3($inp),$T1
+ vmovdqa 64(%rcx),$MASK # .Lmask26
+
+ vpsrldq \$6,$T0,$T2 # splat input
+ vpsrldq \$6,$T1,$T3
+ vpunpckhqdq $T1,$T0,$T4 # 4
+ vpunpcklqdq $T1,$T0,$T0 # 0:1
+ vpunpcklqdq $T3,$T2,$T3 # 2:3
+
+ vpsrlq \$40,$T4,$T4 # 4
+ vpsrlq \$26,$T0,$T1
+ vpand $MASK,$T0,$T0 # 0
+ vpsrlq \$4,$T3,$T2
+ vpand $MASK,$T1,$T1 # 1
+ vpsrlq \$30,$T3,$T3
+ vpand $MASK,$T2,$T2 # 2
+ vpand $MASK,$T3,$T3 # 3
+ vpor 32(%rcx),$T4,$T4 # padbit, yes, always
+
+ jbe .Lskip_loop_avx
+
+ # expand and copy pre-calculated table to stack
+ vmovdqu `16*1-64`($ctx),$D1
+ vmovdqu `16*2-64`($ctx),$D2
+ vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434
+ vpshufd \$0x44,$D4,$D0 # xx12 -> 1212
+ vmovdqa $D3,-0x90(%r11)
+ vmovdqa $D0,0x00(%rsp)
+ vpshufd \$0xEE,$D1,$D4
+ vmovdqu `16*3-64`($ctx),$D0
+ vpshufd \$0x44,$D1,$D1
+ vmovdqa $D4,-0x80(%r11)
+ vmovdqa $D1,0x10(%rsp)
+ vpshufd \$0xEE,$D2,$D3
+ vmovdqu `16*4-64`($ctx),$D1
+ vpshufd \$0x44,$D2,$D2
+ vmovdqa $D3,-0x70(%r11)
+ vmovdqa $D2,0x20(%rsp)
+ vpshufd \$0xEE,$D0,$D4
+ vmovdqu `16*5-64`($ctx),$D2
+ vpshufd \$0x44,$D0,$D0
+ vmovdqa $D4,-0x60(%r11)
+ vmovdqa $D0,0x30(%rsp)
+ vpshufd \$0xEE,$D1,$D3
+ vmovdqu `16*6-64`($ctx),$D0
+ vpshufd \$0x44,$D1,$D1
+ vmovdqa $D3,-0x50(%r11)
+ vmovdqa $D1,0x40(%rsp)
+ vpshufd \$0xEE,$D2,$D4
+ vmovdqu `16*7-64`($ctx),$D1
+ vpshufd \$0x44,$D2,$D2
+ vmovdqa $D4,-0x40(%r11)
+ vmovdqa $D2,0x50(%rsp)
+ vpshufd \$0xEE,$D0,$D3
+ vmovdqu `16*8-64`($ctx),$D2
+ vpshufd \$0x44,$D0,$D0
+ vmovdqa $D3,-0x30(%r11)
+ vmovdqa $D0,0x60(%rsp)
+ vpshufd \$0xEE,$D1,$D4
+ vpshufd \$0x44,$D1,$D1
+ vmovdqa $D4,-0x20(%r11)
+ vmovdqa $D1,0x70(%rsp)
+ vpshufd \$0xEE,$D2,$D3
+ vmovdqa 0x00(%rsp),$D4 # preload r0^2
+ vpshufd \$0x44,$D2,$D2
+ vmovdqa $D3,-0x10(%r11)
+ vmovdqa $D2,0x80(%rsp)
+
+ jmp .Loop_avx
+
+.align 32
+.Loop_avx:
+ ################################################################
+ # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
+ # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
+ # \___________________/
+ # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
+ # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
+ # \___________________/ \____________________/
+ #
+ # Note that we start with inp[2:3]*r^2. This is because it
+ # doesn't depend on reduction in previous iteration.
+ ################################################################
+ # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+ # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+ #
+ # though note that $Tx and $Hx are "reversed" in this section,
+ # and $D4 is preloaded with r0^2...
+
+ vpmuludq $T0,$D4,$D0 # d0 = h0*r0
+ vpmuludq $T1,$D4,$D1 # d1 = h1*r0
+ vmovdqa $H2,0x20(%r11) # offload hash
+ vpmuludq $T2,$D4,$D2 # d3 = h2*r0
+ vmovdqa 0x10(%rsp),$H2 # r1^2
+ vpmuludq $T3,$D4,$D3 # d3 = h3*r0
+ vpmuludq $T4,$D4,$D4 # d4 = h4*r0
+
+ vmovdqa $H0,0x00(%r11) #
+ vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1
+ vmovdqa $H1,0x10(%r11) #
+ vpmuludq $T3,$H2,$H1 # h3*r1
+ vpaddq $H0,$D0,$D0 # d0 += h4*s1
+ vpaddq $H1,$D4,$D4 # d4 += h3*r1
+ vmovdqa $H3,0x30(%r11) #
+ vpmuludq $T2,$H2,$H0 # h2*r1
+ vpmuludq $T1,$H2,$H1 # h1*r1
+ vpaddq $H0,$D3,$D3 # d3 += h2*r1
+ vmovdqa 0x30(%rsp),$H3 # r2^2
+ vpaddq $H1,$D2,$D2 # d2 += h1*r1
+ vmovdqa $H4,0x40(%r11) #
+ vpmuludq $T0,$H2,$H2 # h0*r1
+ vpmuludq $T2,$H3,$H0 # h2*r2
+ vpaddq $H2,$D1,$D1 # d1 += h0*r1
+
+ vmovdqa 0x40(%rsp),$H4 # s2^2
+ vpaddq $H0,$D4,$D4 # d4 += h2*r2
+ vpmuludq $T1,$H3,$H1 # h1*r2
+ vpmuludq $T0,$H3,$H3 # h0*r2
+ vpaddq $H1,$D3,$D3 # d3 += h1*r2
+ vmovdqa 0x50(%rsp),$H2 # r3^2
+ vpaddq $H3,$D2,$D2 # d2 += h0*r2
+ vpmuludq $T4,$H4,$H0 # h4*s2
+ vpmuludq $T3,$H4,$H4 # h3*s2
+ vpaddq $H0,$D1,$D1 # d1 += h4*s2
+ vmovdqa 0x60(%rsp),$H3 # s3^2
+ vpaddq $H4,$D0,$D0 # d0 += h3*s2
+
+ vmovdqa 0x80(%rsp),$H4 # s4^2
+ vpmuludq $T1,$H2,$H1 # h1*r3
+ vpmuludq $T0,$H2,$H2 # h0*r3
+ vpaddq $H1,$D4,$D4 # d4 += h1*r3
+ vpaddq $H2,$D3,$D3 # d3 += h0*r3
+ vpmuludq $T4,$H3,$H0 # h4*s3
+ vpmuludq $T3,$H3,$H1 # h3*s3
+ vpaddq $H0,$D2,$D2 # d2 += h4*s3
+ vmovdqu 16*0($inp),$H0 # load input
+ vpaddq $H1,$D1,$D1 # d1 += h3*s3
+ vpmuludq $T2,$H3,$H3 # h2*s3
+ vpmuludq $T2,$H4,$T2 # h2*s4
+ vpaddq $H3,$D0,$D0 # d0 += h2*s3
+
+ vmovdqu 16*1($inp),$H1 #
+ vpaddq $T2,$D1,$D1 # d1 += h2*s4
+ vpmuludq $T3,$H4,$T3 # h3*s4
+ vpmuludq $T4,$H4,$T4 # h4*s4
+ vpsrldq \$6,$H0,$H2 # splat input
+ vpaddq $T3,$D2,$D2 # d2 += h3*s4
+ vpaddq $T4,$D3,$D3 # d3 += h4*s4
+ vpsrldq \$6,$H1,$H3 #
+ vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4
+ vpmuludq $T1,$H4,$T0 # h1*s4
+ vpunpckhqdq $H1,$H0,$H4 # 4
+ vpaddq $T4,$D4,$D4 # d4 += h0*r4
+ vmovdqa -0x90(%r11),$T4 # r0^4
+ vpaddq $T0,$D0,$D0 # d0 += h1*s4
+
+ vpunpcklqdq $H1,$H0,$H0 # 0:1
+ vpunpcklqdq $H3,$H2,$H3 # 2:3
+
+ #vpsrlq \$40,$H4,$H4 # 4
+ vpsrldq \$`40/8`,$H4,$H4 # 4
+ vpsrlq \$26,$H0,$H1
+ vpand $MASK,$H0,$H0 # 0
+ vpsrlq \$4,$H3,$H2
+ vpand $MASK,$H1,$H1 # 1
+ vpand 0(%rcx),$H4,$H4 # .Lmask24
+ vpsrlq \$30,$H3,$H3
+ vpand $MASK,$H2,$H2 # 2
+ vpand $MASK,$H3,$H3 # 3
+ vpor 32(%rcx),$H4,$H4 # padbit, yes, always
+
+ vpaddq 0x00(%r11),$H0,$H0 # add hash value
+ vpaddq 0x10(%r11),$H1,$H1
+ vpaddq 0x20(%r11),$H2,$H2
+ vpaddq 0x30(%r11),$H3,$H3
+ vpaddq 0x40(%r11),$H4,$H4
+
+ lea 16*2($inp),%rax
+ lea 16*4($inp),$inp
+ sub \$64,$len
+ cmovc %rax,$inp
+
+ ################################################################
+ # Now we accumulate (inp[0:1]+hash)*r^4
+ ################################################################
+ # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+ # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+
+ vpmuludq $H0,$T4,$T0 # h0*r0
+ vpmuludq $H1,$T4,$T1 # h1*r0
+ vpaddq $T0,$D0,$D0
+ vpaddq $T1,$D1,$D1
+ vmovdqa -0x80(%r11),$T2 # r1^4
+ vpmuludq $H2,$T4,$T0 # h2*r0
+ vpmuludq $H3,$T4,$T1 # h3*r0
+ vpaddq $T0,$D2,$D2
+ vpaddq $T1,$D3,$D3
+ vpmuludq $H4,$T4,$T4 # h4*r0
+ vpmuludq -0x70(%r11),$H4,$T0 # h4*s1
+ vpaddq $T4,$D4,$D4
+
+ vpaddq $T0,$D0,$D0 # d0 += h4*s1
+ vpmuludq $H2,$T2,$T1 # h2*r1
+ vpmuludq $H3,$T2,$T0 # h3*r1
+ vpaddq $T1,$D3,$D3 # d3 += h2*r1
+ vmovdqa -0x60(%r11),$T3 # r2^4
+ vpaddq $T0,$D4,$D4 # d4 += h3*r1
+ vpmuludq $H1,$T2,$T1 # h1*r1
+ vpmuludq $H0,$T2,$T2 # h0*r1
+ vpaddq $T1,$D2,$D2 # d2 += h1*r1
+ vpaddq $T2,$D1,$D1 # d1 += h0*r1
+
+ vmovdqa -0x50(%r11),$T4 # s2^4
+ vpmuludq $H2,$T3,$T0 # h2*r2
+ vpmuludq $H1,$T3,$T1 # h1*r2
+ vpaddq $T0,$D4,$D4 # d4 += h2*r2
+ vpaddq $T1,$D3,$D3 # d3 += h1*r2
+ vmovdqa -0x40(%r11),$T2 # r3^4
+ vpmuludq $H0,$T3,$T3 # h0*r2
+ vpmuludq $H4,$T4,$T0 # h4*s2
+ vpaddq $T3,$D2,$D2 # d2 += h0*r2
+ vpaddq $T0,$D1,$D1 # d1 += h4*s2
+ vmovdqa -0x30(%r11),$T3 # s3^4
+ vpmuludq $H3,$T4,$T4 # h3*s2
+ vpmuludq $H1,$T2,$T1 # h1*r3
+ vpaddq $T4,$D0,$D0 # d0 += h3*s2
+
+ vmovdqa -0x10(%r11),$T4 # s4^4
+ vpaddq $T1,$D4,$D4 # d4 += h1*r3
+ vpmuludq $H0,$T2,$T2 # h0*r3
+ vpmuludq $H4,$T3,$T0 # h4*s3
+ vpaddq $T2,$D3,$D3 # d3 += h0*r3
+ vpaddq $T0,$D2,$D2 # d2 += h4*s3
+ vmovdqu 16*2($inp),$T0 # load input
+ vpmuludq $H3,$T3,$T2 # h3*s3
+ vpmuludq $H2,$T3,$T3 # h2*s3
+ vpaddq $T2,$D1,$D1 # d1 += h3*s3
+ vmovdqu 16*3($inp),$T1 #
+ vpaddq $T3,$D0,$D0 # d0 += h2*s3
+
+ vpmuludq $H2,$T4,$H2 # h2*s4
+ vpmuludq $H3,$T4,$H3 # h3*s4
+ vpsrldq \$6,$T0,$T2 # splat input
+ vpaddq $H2,$D1,$D1 # d1 += h2*s4
+ vpmuludq $H4,$T4,$H4 # h4*s4
+ vpsrldq \$6,$T1,$T3 #
+ vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
+ vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
+ vpmuludq -0x20(%r11),$H0,$H4 # h0*r4
+ vpmuludq $H1,$T4,$H0
+ vpunpckhqdq $T1,$T0,$T4 # 4
+ vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
+ vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
+
+ vpunpcklqdq $T1,$T0,$T0 # 0:1
+ vpunpcklqdq $T3,$T2,$T3 # 2:3
+
+ #vpsrlq \$40,$T4,$T4 # 4
+ vpsrldq \$`40/8`,$T4,$T4 # 4
+ vpsrlq \$26,$T0,$T1
+ vmovdqa 0x00(%rsp),$D4 # preload r0^2
+ vpand $MASK,$T0,$T0 # 0
+ vpsrlq \$4,$T3,$T2
+ vpand $MASK,$T1,$T1 # 1
+ vpand 0(%rcx),$T4,$T4 # .Lmask24
+ vpsrlq \$30,$T3,$T3
+ vpand $MASK,$T2,$T2 # 2
+ vpand $MASK,$T3,$T3 # 3
+ vpor 32(%rcx),$T4,$T4 # padbit, yes, always
+
+ ################################################################
+ # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+ # and P. Schwabe
+
+ vpsrlq \$26,$H3,$D3
+ vpand $MASK,$H3,$H3
+ vpaddq $D3,$H4,$H4 # h3 -> h4
+
+ vpsrlq \$26,$H0,$D0
+ vpand $MASK,$H0,$H0
+ vpaddq $D0,$D1,$H1 # h0 -> h1
+
+ vpsrlq \$26,$H4,$D0
+ vpand $MASK,$H4,$H4
+
+ vpsrlq \$26,$H1,$D1
+ vpand $MASK,$H1,$H1
+ vpaddq $D1,$H2,$H2 # h1 -> h2
+
+ vpaddq $D0,$H0,$H0
+ vpsllq \$2,$D0,$D0
+ vpaddq $D0,$H0,$H0 # h4 -> h0
+
+ vpsrlq \$26,$H2,$D2
+ vpand $MASK,$H2,$H2
+ vpaddq $D2,$H3,$H3 # h2 -> h3
+
+ vpsrlq \$26,$H0,$D0
+ vpand $MASK,$H0,$H0
+ vpaddq $D0,$H1,$H1 # h0 -> h1
+
+ vpsrlq \$26,$H3,$D3
+ vpand $MASK,$H3,$H3
+ vpaddq $D3,$H4,$H4 # h3 -> h4
+
+ ja .Loop_avx
+
+.Lskip_loop_avx:
+ ################################################################
+ # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
+
+ vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2
+ add \$32,$len
+ jnz .Long_tail_avx
+
+ vpaddq $H2,$T2,$T2
+ vpaddq $H0,$T0,$T0
+ vpaddq $H1,$T1,$T1
+ vpaddq $H3,$T3,$T3
+ vpaddq $H4,$T4,$T4
+
+.Long_tail_avx:
+ vmovdqa $H2,0x20(%r11)
+ vmovdqa $H0,0x00(%r11)
+ vmovdqa $H1,0x10(%r11)
+ vmovdqa $H3,0x30(%r11)
+ vmovdqa $H4,0x40(%r11)
+
+ # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+ # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+
+ vpmuludq $T2,$D4,$D2 # d2 = h2*r0
+ vpmuludq $T0,$D4,$D0 # d0 = h0*r0
+ vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n
+ vpmuludq $T1,$D4,$D1 # d1 = h1*r0
+ vpmuludq $T3,$D4,$D3 # d3 = h3*r0
+ vpmuludq $T4,$D4,$D4 # d4 = h4*r0
+
+ vpmuludq $T3,$H2,$H0 # h3*r1
+ vpaddq $H0,$D4,$D4 # d4 += h3*r1
+ vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n
+ vpmuludq $T2,$H2,$H1 # h2*r1
+ vpaddq $H1,$D3,$D3 # d3 += h2*r1
+ vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n
+ vpmuludq $T1,$H2,$H0 # h1*r1
+ vpaddq $H0,$D2,$D2 # d2 += h1*r1
+ vpmuludq $T0,$H2,$H2 # h0*r1
+ vpaddq $H2,$D1,$D1 # d1 += h0*r1
+ vpmuludq $T4,$H3,$H3 # h4*s1
+ vpaddq $H3,$D0,$D0 # d0 += h4*s1
+
+ vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n
+ vpmuludq $T2,$H4,$H1 # h2*r2
+ vpaddq $H1,$D4,$D4 # d4 += h2*r2
+ vpmuludq $T1,$H4,$H0 # h1*r2
+ vpaddq $H0,$D3,$D3 # d3 += h1*r2
+ vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n
+ vpmuludq $T0,$H4,$H4 # h0*r2
+ vpaddq $H4,$D2,$D2 # d2 += h0*r2
+ vpmuludq $T4,$H2,$H1 # h4*s2
+ vpaddq $H1,$D1,$D1 # d1 += h4*s2
+ vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n
+ vpmuludq $T3,$H2,$H2 # h3*s2
+ vpaddq $H2,$D0,$D0 # d0 += h3*s2
+
+ vpmuludq $T1,$H3,$H0 # h1*r3
+ vpaddq $H0,$D4,$D4 # d4 += h1*r3
+ vpmuludq $T0,$H3,$H3 # h0*r3
+ vpaddq $H3,$D3,$D3 # d3 += h0*r3
+ vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n
+ vpmuludq $T4,$H4,$H1 # h4*s3
+ vpaddq $H1,$D2,$D2 # d2 += h4*s3
+ vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n
+ vpmuludq $T3,$H4,$H0 # h3*s3
+ vpaddq $H0,$D1,$D1 # d1 += h3*s3
+ vpmuludq $T2,$H4,$H4 # h2*s3
+ vpaddq $H4,$D0,$D0 # d0 += h2*s3
+
+ vpmuludq $T0,$H2,$H2 # h0*r4
+ vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4
+ vpmuludq $T4,$H3,$H1 # h4*s4
+ vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4
+ vpmuludq $T3,$H3,$H0 # h3*s4
+ vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4
+ vpmuludq $T2,$H3,$H1 # h2*s4
+ vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4
+ vpmuludq $T1,$H3,$H3 # h1*s4
+ vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4
+
+ jz .Lshort_tail_avx
+
+ vmovdqu 16*0($inp),$H0 # load input
+ vmovdqu 16*1($inp),$H1
+
+ vpsrldq \$6,$H0,$H2 # splat input
+ vpsrldq \$6,$H1,$H3
+ vpunpckhqdq $H1,$H0,$H4 # 4
+ vpunpcklqdq $H1,$H0,$H0 # 0:1
+ vpunpcklqdq $H3,$H2,$H3 # 2:3
+
+ vpsrlq \$40,$H4,$H4 # 4
+ vpsrlq \$26,$H0,$H1
+ vpand $MASK,$H0,$H0 # 0
+ vpsrlq \$4,$H3,$H2
+ vpand $MASK,$H1,$H1 # 1
+ vpsrlq \$30,$H3,$H3
+ vpand $MASK,$H2,$H2 # 2
+ vpand $MASK,$H3,$H3 # 3
+ vpor 32(%rcx),$H4,$H4 # padbit, yes, always
+
+ vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4
+ vpaddq 0x00(%r11),$H0,$H0
+ vpaddq 0x10(%r11),$H1,$H1
+ vpaddq 0x20(%r11),$H2,$H2
+ vpaddq 0x30(%r11),$H3,$H3
+ vpaddq 0x40(%r11),$H4,$H4
+
+ ################################################################
+ # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
+
+ vpmuludq $H0,$T4,$T0 # h0*r0
+ vpaddq $T0,$D0,$D0 # d0 += h0*r0
+ vpmuludq $H1,$T4,$T1 # h1*r0
+ vpaddq $T1,$D1,$D1 # d1 += h1*r0
+ vpmuludq $H2,$T4,$T0 # h2*r0
+ vpaddq $T0,$D2,$D2 # d2 += h2*r0
+ vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n
+ vpmuludq $H3,$T4,$T1 # h3*r0
+ vpaddq $T1,$D3,$D3 # d3 += h3*r0
+ vpmuludq $H4,$T4,$T4 # h4*r0
+ vpaddq $T4,$D4,$D4 # d4 += h4*r0
+
+ vpmuludq $H3,$T2,$T0 # h3*r1
+ vpaddq $T0,$D4,$D4 # d4 += h3*r1
+ vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1
+ vpmuludq $H2,$T2,$T1 # h2*r1
+ vpaddq $T1,$D3,$D3 # d3 += h2*r1
+ vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2
+ vpmuludq $H1,$T2,$T0 # h1*r1
+ vpaddq $T0,$D2,$D2 # d2 += h1*r1
+ vpmuludq $H0,$T2,$T2 # h0*r1
+ vpaddq $T2,$D1,$D1 # d1 += h0*r1
+ vpmuludq $H4,$T3,$T3 # h4*s1
+ vpaddq $T3,$D0,$D0 # d0 += h4*s1
+
+ vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2
+ vpmuludq $H2,$T4,$T1 # h2*r2
+ vpaddq $T1,$D4,$D4 # d4 += h2*r2
+ vpmuludq $H1,$T4,$T0 # h1*r2
+ vpaddq $T0,$D3,$D3 # d3 += h1*r2
+ vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3
+ vpmuludq $H0,$T4,$T4 # h0*r2
+ vpaddq $T4,$D2,$D2 # d2 += h0*r2
+ vpmuludq $H4,$T2,$T1 # h4*s2
+ vpaddq $T1,$D1,$D1 # d1 += h4*s2
+ vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3
+ vpmuludq $H3,$T2,$T2 # h3*s2
+ vpaddq $T2,$D0,$D0 # d0 += h3*s2
+
+ vpmuludq $H1,$T3,$T0 # h1*r3
+ vpaddq $T0,$D4,$D4 # d4 += h1*r3
+ vpmuludq $H0,$T3,$T3 # h0*r3
+ vpaddq $T3,$D3,$D3 # d3 += h0*r3
+ vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4
+ vpmuludq $H4,$T4,$T1 # h4*s3
+ vpaddq $T1,$D2,$D2 # d2 += h4*s3
+ vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4
+ vpmuludq $H3,$T4,$T0 # h3*s3
+ vpaddq $T0,$D1,$D1 # d1 += h3*s3
+ vpmuludq $H2,$T4,$T4 # h2*s3
+ vpaddq $T4,$D0,$D0 # d0 += h2*s3
+
+ vpmuludq $H0,$T2,$T2 # h0*r4
+ vpaddq $T2,$D4,$D4 # d4 += h0*r4
+ vpmuludq $H4,$T3,$T1 # h4*s4
+ vpaddq $T1,$D3,$D3 # d3 += h4*s4
+ vpmuludq $H3,$T3,$T0 # h3*s4
+ vpaddq $T0,$D2,$D2 # d2 += h3*s4
+ vpmuludq $H2,$T3,$T1 # h2*s4
+ vpaddq $T1,$D1,$D1 # d1 += h2*s4
+ vpmuludq $H1,$T3,$T3 # h1*s4
+ vpaddq $T3,$D0,$D0 # d0 += h1*s4
+
+.Lshort_tail_avx:
+ ################################################################
+ # horizontal addition
+
+ vpsrldq \$8,$D4,$T4
+ vpsrldq \$8,$D3,$T3
+ vpsrldq \$8,$D1,$T1
+ vpsrldq \$8,$D0,$T0
+ vpsrldq \$8,$D2,$T2
+ vpaddq $T3,$D3,$D3
+ vpaddq $T4,$D4,$D4
+ vpaddq $T0,$D0,$D0
+ vpaddq $T1,$D1,$D1
+ vpaddq $T2,$D2,$D2
+
+ ################################################################
+ # lazy reduction
+
+ vpsrlq \$26,$D3,$H3
+ vpand $MASK,$D3,$D3
+ vpaddq $H3,$D4,$D4 # h3 -> h4
+
+ vpsrlq \$26,$D0,$H0
+ vpand $MASK,$D0,$D0
+ vpaddq $H0,$D1,$D1 # h0 -> h1
+
+ vpsrlq \$26,$D4,$H4
+ vpand $MASK,$D4,$D4
+
+ vpsrlq \$26,$D1,$H1
+ vpand $MASK,$D1,$D1
+ vpaddq $H1,$D2,$D2 # h1 -> h2
+
+ vpaddq $H4,$D0,$D0
+ vpsllq \$2,$H4,$H4
+ vpaddq $H4,$D0,$D0 # h4 -> h0
+
+ vpsrlq \$26,$D2,$H2
+ vpand $MASK,$D2,$D2
+ vpaddq $H2,$D3,$D3 # h2 -> h3
+
+ vpsrlq \$26,$D0,$H0
+ vpand $MASK,$D0,$D0
+ vpaddq $H0,$D1,$D1 # h0 -> h1
+
+ vpsrlq \$26,$D3,$H3
+ vpand $MASK,$D3,$D3
+ vpaddq $H3,$D4,$D4 # h3 -> h4
+
+ vmovd $D0,`4*0-48-64`($ctx) # save partially reduced
+ vmovd $D1,`4*1-48-64`($ctx)
+ vmovd $D2,`4*2-48-64`($ctx)
+ vmovd $D3,`4*3-48-64`($ctx)
+ vmovd $D4,`4*4-48-64`($ctx)
+___
+$code.=<<___ if ($win64);
+ vmovdqa 0x50(%r11),%xmm6
+ vmovdqa 0x60(%r11),%xmm7
+ vmovdqa 0x70(%r11),%xmm8
+ vmovdqa 0x80(%r11),%xmm9
+ vmovdqa 0x90(%r11),%xmm10
+ vmovdqa 0xa0(%r11),%xmm11
+ vmovdqa 0xb0(%r11),%xmm12
+ vmovdqa 0xc0(%r11),%xmm13
+ vmovdqa 0xd0(%r11),%xmm14
+ vmovdqa 0xe0(%r11),%xmm15
+ lea 0xf8(%r11),%rsp
+.Ldo_avx_epilogue:
+___
+$code.=<<___ if (!$win64);
+ lea -8(%r10),%rsp
+.cfi_def_cfa_register %rsp
+___
+$code.=<<___;
+ vzeroupper
+ ret
+.cfi_endproc
+___
+&end_function("poly1305_blocks_avx");
+
+&declare_function("poly1305_emit_avx", 32, 3);
+$code.=<<___;
+ cmpl \$0,20($ctx) # is_base2_26?
+ je .Lemit
+
+ mov 0($ctx),%eax # load hash value base 2^26
+ mov 4($ctx),%ecx
+ mov 8($ctx),%r8d
+ mov 12($ctx),%r11d
+ mov 16($ctx),%r10d
+
+ shl \$26,%rcx # base 2^26 -> base 2^64
+ mov %r8,%r9
+ shl \$52,%r8
+ add %rcx,%rax
+ shr \$12,%r9
+ add %rax,%r8 # h0
+ adc \$0,%r9
+
+ shl \$14,%r11
+ mov %r10,%rax
+ shr \$24,%r10
+ add %r11,%r9
+ shl \$40,%rax
+ add %rax,%r9 # h1
+ adc \$0,%r10 # h2
+
+ mov %r10,%rax # could be partially reduced, so reduce
+ mov %r10,%rcx
+ and \$3,%r10
+ shr \$2,%rax
+ and \$-4,%rcx
+ add %rcx,%rax
+ add %rax,%r8
+ adc \$0,%r9
+ adc \$0,%r10
+
+ mov %r8,%rax
+ add \$5,%r8 # compare to modulus
+ mov %r9,%rcx
+ adc \$0,%r9
+ adc \$0,%r10
+ shr \$2,%r10 # did 130-bit value overflow?
+ cmovnz %r8,%rax
+ cmovnz %r9,%rcx
+
+ add 0($nonce),%rax # accumulate nonce
+ adc 8($nonce),%rcx
+ mov %rax,0($mac) # write result
+ mov %rcx,8($mac)
+
+ ret
+___
+&end_function("poly1305_emit_avx");
+
+if ($kernel) {
+ $code .= "#endif\n";
+}
+
+if ($avx>1) {
+
+if ($kernel) {
+ $code .= "#ifdef CONFIG_AS_AVX2\n";
+}
+
+my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
+ map("%ymm$_",(0..15));
+my $S4=$MASK;
+
+sub poly1305_blocks_avxN {
+ my ($avx512) = @_;
+ my $suffix = $avx512 ? "_avx512" : "";
+$code.=<<___;
+.cfi_startproc
+ mov 20($ctx),%r8d # is_base2_26
+ cmp \$128,$len
+ jae .Lblocks_avx2$suffix
+ test %r8d,%r8d
+ jz .Lblocks
+
+.Lblocks_avx2$suffix:
+ and \$-16,$len
+ jz .Lno_data_avx2$suffix
+
+ vzeroupper
+
+ test %r8d,%r8d
+ jz .Lbase2_64_avx2$suffix
+
+ test \$63,$len
+ jz .Leven_avx2$suffix
+
+ push %rbp
+.cfi_push %rbp
+ mov %rsp,%rbp
+ push %rbx
+.cfi_push %rbx
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+.Lblocks_avx2_body$suffix:
+
+ mov $len,%r15 # reassign $len
+
+ mov 0($ctx),$d1 # load hash value
+ mov 8($ctx),$d2
+ mov 16($ctx),$h2#d
+
+ mov 24($ctx),$r0 # load r
+ mov 32($ctx),$s1
+
+ ################################# base 2^26 -> base 2^64
+ mov $d1#d,$h0#d
+ and \$`-1*(1<<31)`,$d1
+ mov $d2,$r1 # borrow $r1
+ mov $d2#d,$h1#d
+ and \$`-1*(1<<31)`,$d2
+
+ shr \$6,$d1
+ shl \$52,$r1
+ add $d1,$h0
+ shr \$12,$h1
+ shr \$18,$d2
+ add $r1,$h0
+ adc $d2,$h1
+
+ mov $h2,$d1
+ shl \$40,$d1
+ shr \$24,$h2
+ add $d1,$h1
+ adc \$0,$h2 # can be partially reduced...
+
+ mov \$-4,$d2 # ... so reduce
+ mov $h2,$d1
+ and $h2,$d2
+ shr \$2,$d1
+ and \$3,$h2
+ add $d2,$d1 # =*5
+ add $d1,$h0
+ adc \$0,$h1
+ adc \$0,$h2
+
+ mov $s1,$r1
+ mov $s1,%rax
+ shr \$2,$s1
+ add $r1,$s1 # s1 = r1 + (r1 >> 2)
+
+.Lbase2_26_pre_avx2$suffix:
+ add 0($inp),$h0 # accumulate input
+ adc 8($inp),$h1
+ lea 16($inp),$inp
+ adc $padbit,$h2
+ sub \$16,%r15
+
+ call __poly1305_block
+ mov $r1,%rax
+
+ test \$63,%r15
+ jnz .Lbase2_26_pre_avx2$suffix
+
+ test $padbit,$padbit # if $padbit is zero,
+ jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format
+
+ ################################# base 2^64 -> base 2^26
+ mov $h0,%rax
+ mov $h0,%rdx
+ shr \$52,$h0
+ mov $h1,$r0
+ mov $h1,$r1
+ shr \$26,%rdx
+ and \$0x3ffffff,%rax # h[0]
+ shl \$12,$r0
+ and \$0x3ffffff,%rdx # h[1]
+ shr \$14,$h1
+ or $r0,$h0
+ shl \$24,$h2
+ and \$0x3ffffff,$h0 # h[2]
+ shr \$40,$r1
+ and \$0x3ffffff,$h1 # h[3]
+ or $r1,$h2 # h[4]
+
+ test %r15,%r15
+ jz .Lstore_base2_26_avx2$suffix
+
+ vmovd %rax#d,%x#$H0
+ vmovd %rdx#d,%x#$H1
+ vmovd $h0#d,%x#$H2
+ vmovd $h1#d,%x#$H3
+ vmovd $h2#d,%x#$H4
+ jmp .Lproceed_avx2$suffix
+
+.align 32
+.Lstore_base2_64_avx2$suffix:
+ mov $h0,0($ctx)
+ mov $h1,8($ctx)
+ mov $h2,16($ctx) # note that is_base2_26 is zeroed
+ jmp .Ldone_avx2$suffix
+
+.align 16
+.Lstore_base2_26_avx2$suffix:
+ mov %rax#d,0($ctx) # store hash value base 2^26
+ mov %rdx#d,4($ctx)
+ mov $h0#d,8($ctx)
+ mov $h1#d,12($ctx)
+ mov $h2#d,16($ctx)
+.align 16
+.Ldone_avx2$suffix:
+ pop %r15
+.cfi_restore %r15
+ pop %r14
+.cfi_restore %r14
+ pop %r13
+.cfi_restore %r13
+ pop %r12
+.cfi_restore %r12
+ pop %rbx
+.cfi_restore %rbx
+ pop %rbp
+.cfi_restore %rbp
+.Lno_data_avx2$suffix:
+.Lblocks_avx2_epilogue$suffix:
+ ret
+.cfi_endproc
+
+.align 32
+.Lbase2_64_avx2$suffix:
+.cfi_startproc
+ push %rbp
+.cfi_push %rbp
+ mov %rsp,%rbp
+ push %rbx
+.cfi_push %rbx
+ push %r12
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+.Lbase2_64_avx2_body$suffix:
+
+ mov $len,%r15 # reassign $len
+
+ mov 24($ctx),$r0 # load r
+ mov 32($ctx),$s1
+
+ mov 0($ctx),$h0 # load hash value
+ mov 8($ctx),$h1
+ mov 16($ctx),$h2#d
+
+ mov $s1,$r1
+ mov $s1,%rax
+ shr \$2,$s1
+ add $r1,$s1 # s1 = r1 + (r1 >> 2)
+
+ test \$63,$len
+ jz .Linit_avx2$suffix
+
+.Lbase2_64_pre_avx2$suffix:
+ add 0($inp),$h0 # accumulate input
+ adc 8($inp),$h1
+ lea 16($inp),$inp
+ adc $padbit,$h2
+ sub \$16,%r15
+
+ call __poly1305_block
+ mov $r1,%rax
+
+ test \$63,%r15
+ jnz .Lbase2_64_pre_avx2$suffix
+
+.Linit_avx2$suffix:
+ ################################# base 2^64 -> base 2^26
+ mov $h0,%rax
+ mov $h0,%rdx
+ shr \$52,$h0
+ mov $h1,$d1
+ mov $h1,$d2
+ shr \$26,%rdx
+ and \$0x3ffffff,%rax # h[0]
+ shl \$12,$d1
+ and \$0x3ffffff,%rdx # h[1]
+ shr \$14,$h1
+ or $d1,$h0
+ shl \$24,$h2
+ and \$0x3ffffff,$h0 # h[2]
+ shr \$40,$d2
+ and \$0x3ffffff,$h1 # h[3]
+ or $d2,$h2 # h[4]
+
+ vmovd %rax#d,%x#$H0
+ vmovd %rdx#d,%x#$H1
+ vmovd $h0#d,%x#$H2
+ vmovd $h1#d,%x#$H3
+ vmovd $h2#d,%x#$H4
+ movl \$1,20($ctx) # set is_base2_26
+
+ call __poly1305_init_avx
+
+.Lproceed_avx2$suffix:
+ mov %r15,$len # restore $len
+___
+$code.=<<___ if (!$kernel);
+ mov OPENSSL_ia32cap_P+8(%rip),%r9d
+ mov \$`(1<<31|1<<30|1<<16)`,%r11d
+___
+$code.=<<___;
+ pop %r15
+.cfi_restore %r15
+ pop %r14
+.cfi_restore %r14
+ pop %r13
+.cfi_restore %r13
+ pop %r12
+.cfi_restore %r12
+ pop %rbx
+.cfi_restore %rbx
+ pop %rbp
+.cfi_restore %rbp
+.Lbase2_64_avx2_epilogue$suffix:
+ jmp .Ldo_avx2$suffix
+.cfi_endproc
+
+.align 32
+.Leven_avx2$suffix:
+.cfi_startproc
+___
+$code.=<<___ if (!$kernel);
+ mov OPENSSL_ia32cap_P+8(%rip),%r9d
+___
+$code.=<<___;
+ vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26
+ vmovd 4*1($ctx),%x#$H1
+ vmovd 4*2($ctx),%x#$H2
+ vmovd 4*3($ctx),%x#$H3
+ vmovd 4*4($ctx),%x#$H4
+
+.Ldo_avx2$suffix:
+___
+$code.=<<___ if (!$kernel && $avx>2);
+ cmp \$512,$len
+ jb .Lskip_avx512
+ and %r11d,%r9d
+ test \$`1<<16`,%r9d # check for AVX512F
+ jnz .Lblocks_avx512
+.Lskip_avx512$suffix:
+___
+$code.=<<___ if ($avx > 2 && $avx512 && $kernel);
+ cmp \$512,$len
+ jae .Lblocks_avx512
+___
+$code.=<<___ if (!$win64);
+ lea 8(%rsp),%r10
+.cfi_def_cfa_register %r10
+ sub \$0x128,%rsp
+___
+$code.=<<___ if ($win64);
+ lea 8(%rsp),%r10
+ sub \$0x1c8,%rsp
+ vmovdqa %xmm6,-0xb0(%r10)
+ vmovdqa %xmm7,-0xa0(%r10)
+ vmovdqa %xmm8,-0x90(%r10)
+ vmovdqa %xmm9,-0x80(%r10)
+ vmovdqa %xmm10,-0x70(%r10)
+ vmovdqa %xmm11,-0x60(%r10)
+ vmovdqa %xmm12,-0x50(%r10)
+ vmovdqa %xmm13,-0x40(%r10)
+ vmovdqa %xmm14,-0x30(%r10)
+ vmovdqa %xmm15,-0x20(%r10)
+.Ldo_avx2_body$suffix:
+___
+$code.=<<___;
+ lea .Lconst(%rip),%rcx
+ lea 48+64($ctx),$ctx # size optimization
+ vmovdqa 96(%rcx),$T0 # .Lpermd_avx2
+
+ # expand and copy pre-calculated table to stack
+ vmovdqu `16*0-64`($ctx),%x#$T2
+ and \$-512,%rsp
+ vmovdqu `16*1-64`($ctx),%x#$T3
+ vmovdqu `16*2-64`($ctx),%x#$T4
+ vmovdqu `16*3-64`($ctx),%x#$D0
+ vmovdqu `16*4-64`($ctx),%x#$D1
+ vmovdqu `16*5-64`($ctx),%x#$D2
+ lea 0x90(%rsp),%rax # size optimization
+ vmovdqu `16*6-64`($ctx),%x#$D3
+ vpermd $T2,$T0,$T2 # 00003412 -> 14243444
+ vmovdqu `16*7-64`($ctx),%x#$D4
+ vpermd $T3,$T0,$T3
+ vmovdqu `16*8-64`($ctx),%x#$MASK
+ vpermd $T4,$T0,$T4
+ vmovdqa $T2,0x00(%rsp)
+ vpermd $D0,$T0,$D0
+ vmovdqa $T3,0x20-0x90(%rax)
+ vpermd $D1,$T0,$D1
+ vmovdqa $T4,0x40-0x90(%rax)
+ vpermd $D2,$T0,$D2
+ vmovdqa $D0,0x60-0x90(%rax)
+ vpermd $D3,$T0,$D3
+ vmovdqa $D1,0x80-0x90(%rax)
+ vpermd $D4,$T0,$D4
+ vmovdqa $D2,0xa0-0x90(%rax)
+ vpermd $MASK,$T0,$MASK
+ vmovdqa $D3,0xc0-0x90(%rax)
+ vmovdqa $D4,0xe0-0x90(%rax)
+ vmovdqa $MASK,0x100-0x90(%rax)
+ vmovdqa 64(%rcx),$MASK # .Lmask26
+
+ ################################################################
+ # load input
+ vmovdqu 16*0($inp),%x#$T0
+ vmovdqu 16*1($inp),%x#$T1
+ vinserti128 \$1,16*2($inp),$T0,$T0
+ vinserti128 \$1,16*3($inp),$T1,$T1
+ lea 16*4($inp),$inp
+
+ vpsrldq \$6,$T0,$T2 # splat input
+ vpsrldq \$6,$T1,$T3
+ vpunpckhqdq $T1,$T0,$T4 # 4
+ vpunpcklqdq $T3,$T2,$T2 # 2:3
+ vpunpcklqdq $T1,$T0,$T0 # 0:1
+
+ vpsrlq \$30,$T2,$T3
+ vpsrlq \$4,$T2,$T2
+ vpsrlq \$26,$T0,$T1
+ vpsrlq \$40,$T4,$T4 # 4
+ vpand $MASK,$T2,$T2 # 2
+ vpand $MASK,$T0,$T0 # 0
+ vpand $MASK,$T1,$T1 # 1
+ vpand $MASK,$T3,$T3 # 3
+ vpor 32(%rcx),$T4,$T4 # padbit, yes, always
+
+ vpaddq $H2,$T2,$H2 # accumulate input
+ sub \$64,$len
+ jz .Ltail_avx2$suffix
+ jmp .Loop_avx2$suffix
+
+.align 32
+.Loop_avx2$suffix:
+ ################################################################
+ # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
+ # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
+ # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
+ # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
+ # \________/\__________/
+ ################################################################
+ #vpaddq $H2,$T2,$H2 # accumulate input
+ vpaddq $H0,$T0,$H0
+ vmovdqa `32*0`(%rsp),$T0 # r0^4
+ vpaddq $H1,$T1,$H1
+ vmovdqa `32*1`(%rsp),$T1 # r1^4
+ vpaddq $H3,$T3,$H3
+ vmovdqa `32*3`(%rsp),$T2 # r2^4
+ vpaddq $H4,$T4,$H4
+ vmovdqa `32*6-0x90`(%rax),$T3 # s3^4
+ vmovdqa `32*8-0x90`(%rax),$S4 # s4^4
+
+ # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+ # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+ #
+ # however, as h2 is "chronologically" first one available pull
+ # corresponding operations up, so it's
+ #
+ # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4
+ # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4
+ # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3
+ # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4
+
+ vpmuludq $H2,$T0,$D2 # d2 = h2*r0
+ vpmuludq $H2,$T1,$D3 # d3 = h2*r1
+ vpmuludq $H2,$T2,$D4 # d4 = h2*r2
+ vpmuludq $H2,$T3,$D0 # d0 = h2*s3
+ vpmuludq $H2,$S4,$D1 # d1 = h2*s4
+
+ vpmuludq $H0,$T1,$T4 # h0*r1
+ vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp
+ vpaddq $T4,$D1,$D1 # d1 += h0*r1
+ vpaddq $H2,$D2,$D2 # d2 += h1*r1
+ vpmuludq $H3,$T1,$T4 # h3*r1
+ vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1
+ vpaddq $T4,$D4,$D4 # d4 += h3*r1
+ vpaddq $H2,$D0,$D0 # d0 += h4*s1
+ vmovdqa `32*4-0x90`(%rax),$T1 # s2
+
+ vpmuludq $H0,$T0,$T4 # h0*r0
+ vpmuludq $H1,$T0,$H2 # h1*r0
+ vpaddq $T4,$D0,$D0 # d0 += h0*r0
+ vpaddq $H2,$D1,$D1 # d1 += h1*r0
+ vpmuludq $H3,$T0,$T4 # h3*r0
+ vpmuludq $H4,$T0,$H2 # h4*r0
+ vmovdqu 16*0($inp),%x#$T0 # load input
+ vpaddq $T4,$D3,$D3 # d3 += h3*r0
+ vpaddq $H2,$D4,$D4 # d4 += h4*r0
+ vinserti128 \$1,16*2($inp),$T0,$T0
+
+ vpmuludq $H3,$T1,$T4 # h3*s2
+ vpmuludq $H4,$T1,$H2 # h4*s2
+ vmovdqu 16*1($inp),%x#$T1
+ vpaddq $T4,$D0,$D0 # d0 += h3*s2
+ vpaddq $H2,$D1,$D1 # d1 += h4*s2
+ vmovdqa `32*5-0x90`(%rax),$H2 # r3
+ vpmuludq $H1,$T2,$T4 # h1*r2
+ vpmuludq $H0,$T2,$T2 # h0*r2
+ vpaddq $T4,$D3,$D3 # d3 += h1*r2
+ vpaddq $T2,$D2,$D2 # d2 += h0*r2
+ vinserti128 \$1,16*3($inp),$T1,$T1
+ lea 16*4($inp),$inp
+
+ vpmuludq $H1,$H2,$T4 # h1*r3
+ vpmuludq $H0,$H2,$H2 # h0*r3
+ vpsrldq \$6,$T0,$T2 # splat input
+ vpaddq $T4,$D4,$D4 # d4 += h1*r3
+ vpaddq $H2,$D3,$D3 # d3 += h0*r3
+ vpmuludq $H3,$T3,$T4 # h3*s3
+ vpmuludq $H4,$T3,$H2 # h4*s3
+ vpsrldq \$6,$T1,$T3
+ vpaddq $T4,$D1,$D1 # d1 += h3*s3
+ vpaddq $H2,$D2,$D2 # d2 += h4*s3
+ vpunpckhqdq $T1,$T0,$T4 # 4
+
+ vpmuludq $H3,$S4,$H3 # h3*s4
+ vpmuludq $H4,$S4,$H4 # h4*s4
+ vpunpcklqdq $T1,$T0,$T0 # 0:1
+ vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
+ vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
+ vpunpcklqdq $T3,$T2,$T3 # 2:3
+ vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4
+ vpmuludq $H1,$S4,$H0 # h1*s4
+ vmovdqa 64(%rcx),$MASK # .Lmask26
+ vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
+ vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
+
+ ################################################################
+ # lazy reduction (interleaved with tail of input splat)
+
+ vpsrlq \$26,$H3,$D3
+ vpand $MASK,$H3,$H3
+ vpaddq $D3,$H4,$H4 # h3 -> h4
+
+ vpsrlq \$26,$H0,$D0
+ vpand $MASK,$H0,$H0
+ vpaddq $D0,$D1,$H1 # h0 -> h1
+
+ vpsrlq \$26,$H4,$D4
+ vpand $MASK,$H4,$H4
+
+ vpsrlq \$4,$T3,$T2
+
+ vpsrlq \$26,$H1,$D1
+ vpand $MASK,$H1,$H1
+ vpaddq $D1,$H2,$H2 # h1 -> h2
+
+ vpaddq $D4,$H0,$H0
+ vpsllq \$2,$D4,$D4
+ vpaddq $D4,$H0,$H0 # h4 -> h0
+
+ vpand $MASK,$T2,$T2 # 2
+ vpsrlq \$26,$T0,$T1
+
+ vpsrlq \$26,$H2,$D2
+ vpand $MASK,$H2,$H2
+ vpaddq $D2,$H3,$H3 # h2 -> h3
+
+ vpaddq $T2,$H2,$H2 # modulo-scheduled
+ vpsrlq \$30,$T3,$T3
+
+ vpsrlq \$26,$H0,$D0
+ vpand $MASK,$H0,$H0
+ vpaddq $D0,$H1,$H1 # h0 -> h1
+
+ vpsrlq \$40,$T4,$T4 # 4
+
+ vpsrlq \$26,$H3,$D3
+ vpand $MASK,$H3,$H3
+ vpaddq $D3,$H4,$H4 # h3 -> h4
+
+ vpand $MASK,$T0,$T0 # 0
+ vpand $MASK,$T1,$T1 # 1
+ vpand $MASK,$T3,$T3 # 3
+ vpor 32(%rcx),$T4,$T4 # padbit, yes, always
+
+ sub \$64,$len
+ jnz .Loop_avx2$suffix
+
+ .byte 0x66,0x90
+.Ltail_avx2$suffix:
+ ################################################################
+ # while above multiplications were by r^4 in all lanes, in last
+ # iteration we multiply least significant lane by r^4 and most
+ # significant one by r, so copy of above except that references
+ # to the precomputed table are displaced by 4...
+
+ #vpaddq $H2,$T2,$H2 # accumulate input
+ vpaddq $H0,$T0,$H0
+ vmovdqu `32*0+4`(%rsp),$T0 # r0^4
+ vpaddq $H1,$T1,$H1
+ vmovdqu `32*1+4`(%rsp),$T1 # r1^4
+ vpaddq $H3,$T3,$H3
+ vmovdqu `32*3+4`(%rsp),$T2 # r2^4
+ vpaddq $H4,$T4,$H4
+ vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4
+ vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4
+
+ vpmuludq $H2,$T0,$D2 # d2 = h2*r0
+ vpmuludq $H2,$T1,$D3 # d3 = h2*r1
+ vpmuludq $H2,$T2,$D4 # d4 = h2*r2
+ vpmuludq $H2,$T3,$D0 # d0 = h2*s3
+ vpmuludq $H2,$S4,$D1 # d1 = h2*s4
+
+ vpmuludq $H0,$T1,$T4 # h0*r1
+ vpmuludq $H1,$T1,$H2 # h1*r1
+ vpaddq $T4,$D1,$D1 # d1 += h0*r1
+ vpaddq $H2,$D2,$D2 # d2 += h1*r1
+ vpmuludq $H3,$T1,$T4 # h3*r1
+ vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1
+ vpaddq $T4,$D4,$D4 # d4 += h3*r1
+ vpaddq $H2,$D0,$D0 # d0 += h4*s1
+
+ vpmuludq $H0,$T0,$T4 # h0*r0
+ vpmuludq $H1,$T0,$H2 # h1*r0
+ vpaddq $T4,$D0,$D0 # d0 += h0*r0
+ vmovdqu `32*4+4-0x90`(%rax),$T1 # s2
+ vpaddq $H2,$D1,$D1 # d1 += h1*r0
+ vpmuludq $H3,$T0,$T4 # h3*r0
+ vpmuludq $H4,$T0,$H2 # h4*r0
+ vpaddq $T4,$D3,$D3 # d3 += h3*r0
+ vpaddq $H2,$D4,$D4 # d4 += h4*r0
+
+ vpmuludq $H3,$T1,$T4 # h3*s2
+ vpmuludq $H4,$T1,$H2 # h4*s2
+ vpaddq $T4,$D0,$D0 # d0 += h3*s2
+ vpaddq $H2,$D1,$D1 # d1 += h4*s2
+ vmovdqu `32*5+4-0x90`(%rax),$H2 # r3
+ vpmuludq $H1,$T2,$T4 # h1*r2
+ vpmuludq $H0,$T2,$T2 # h0*r2
+ vpaddq $T4,$D3,$D3 # d3 += h1*r2
+ vpaddq $T2,$D2,$D2 # d2 += h0*r2
+
+ vpmuludq $H1,$H2,$T4 # h1*r3
+ vpmuludq $H0,$H2,$H2 # h0*r3
+ vpaddq $T4,$D4,$D4 # d4 += h1*r3
+ vpaddq $H2,$D3,$D3 # d3 += h0*r3
+ vpmuludq $H3,$T3,$T4 # h3*s3
+ vpmuludq $H4,$T3,$H2 # h4*s3
+ vpaddq $T4,$D1,$D1 # d1 += h3*s3
+ vpaddq $H2,$D2,$D2 # d2 += h4*s3
+
+ vpmuludq $H3,$S4,$H3 # h3*s4
+ vpmuludq $H4,$S4,$H4 # h4*s4
+ vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
+ vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
+ vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4
+ vpmuludq $H1,$S4,$H0 # h1*s4
+ vmovdqa 64(%rcx),$MASK # .Lmask26
+ vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
+ vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
+
+ ################################################################
+ # horizontal addition
+
+ vpsrldq \$8,$D1,$T1
+ vpsrldq \$8,$H2,$T2
+ vpsrldq \$8,$H3,$T3
+ vpsrldq \$8,$H4,$T4
+ vpsrldq \$8,$H0,$T0
+ vpaddq $T1,$D1,$D1
+ vpaddq $T2,$H2,$H2
+ vpaddq $T3,$H3,$H3
+ vpaddq $T4,$H4,$H4
+ vpaddq $T0,$H0,$H0
+
+ vpermq \$0x2,$H3,$T3
+ vpermq \$0x2,$H4,$T4
+ vpermq \$0x2,$H0,$T0
+ vpermq \$0x2,$D1,$T1
+ vpermq \$0x2,$H2,$T2
+ vpaddq $T3,$H3,$H3
+ vpaddq $T4,$H4,$H4
+ vpaddq $T0,$H0,$H0
+ vpaddq $T1,$D1,$D1
+ vpaddq $T2,$H2,$H2
+
+ ################################################################
+ # lazy reduction
+
+ vpsrlq \$26,$H3,$D3
+ vpand $MASK,$H3,$H3
+ vpaddq $D3,$H4,$H4 # h3 -> h4
+
+ vpsrlq \$26,$H0,$D0
+ vpand $MASK,$H0,$H0
+ vpaddq $D0,$D1,$H1 # h0 -> h1
+
+ vpsrlq \$26,$H4,$D4
+ vpand $MASK,$H4,$H4
+
+ vpsrlq \$26,$H1,$D1
+ vpand $MASK,$H1,$H1
+ vpaddq $D1,$H2,$H2 # h1 -> h2
+
+ vpaddq $D4,$H0,$H0
+ vpsllq \$2,$D4,$D4
+ vpaddq $D4,$H0,$H0 # h4 -> h0
+
+ vpsrlq \$26,$H2,$D2
+ vpand $MASK,$H2,$H2
+ vpaddq $D2,$H3,$H3 # h2 -> h3
+
+ vpsrlq \$26,$H0,$D0
+ vpand $MASK,$H0,$H0
+ vpaddq $D0,$H1,$H1 # h0 -> h1
+
+ vpsrlq \$26,$H3,$D3
+ vpand $MASK,$H3,$H3
+ vpaddq $D3,$H4,$H4 # h3 -> h4
+
+ vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
+ vmovd %x#$H1,`4*1-48-64`($ctx)
+ vmovd %x#$H2,`4*2-48-64`($ctx)
+ vmovd %x#$H3,`4*3-48-64`($ctx)
+ vmovd %x#$H4,`4*4-48-64`($ctx)
+___
+$code.=<<___ if ($win64);
+ vmovdqa -0xb0(%r10),%xmm6
+ vmovdqa -0xa0(%r10),%xmm7
+ vmovdqa -0x90(%r10),%xmm8
+ vmovdqa -0x80(%r10),%xmm9
+ vmovdqa -0x70(%r10),%xmm10
+ vmovdqa -0x60(%r10),%xmm11
+ vmovdqa -0x50(%r10),%xmm12
+ vmovdqa -0x40(%r10),%xmm13
+ vmovdqa -0x30(%r10),%xmm14
+ vmovdqa -0x20(%r10),%xmm15
+ lea -8(%r10),%rsp
+.Ldo_avx2_epilogue$suffix:
+___
+$code.=<<___ if (!$win64);
+ lea -8(%r10),%rsp
+.cfi_def_cfa_register %rsp
+___
+$code.=<<___;
+ vzeroupper
+ ret
+.cfi_endproc
+___
+if($avx > 2 && $avx512) {
+my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
+my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
+my $PADBIT="%zmm30";
+
+map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain
+map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
+map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
+map(s/%y/%z/,($MASK));
+
+$code.=<<___;
+.cfi_startproc
+.Lblocks_avx512:
+ mov \$15,%eax
+ kmovw %eax,%k2
+___
+$code.=<<___ if (!$win64);
+ lea 8(%rsp),%r10
+.cfi_def_cfa_register %r10
+ sub \$0x128,%rsp
+___
+$code.=<<___ if ($win64);
+ lea 8(%rsp),%r10
+ sub \$0x1c8,%rsp
+ vmovdqa %xmm6,-0xb0(%r10)
+ vmovdqa %xmm7,-0xa0(%r10)
+ vmovdqa %xmm8,-0x90(%r10)
+ vmovdqa %xmm9,-0x80(%r10)
+ vmovdqa %xmm10,-0x70(%r10)
+ vmovdqa %xmm11,-0x60(%r10)
+ vmovdqa %xmm12,-0x50(%r10)
+ vmovdqa %xmm13,-0x40(%r10)
+ vmovdqa %xmm14,-0x30(%r10)
+ vmovdqa %xmm15,-0x20(%r10)
+.Ldo_avx512_body:
+___
+$code.=<<___;
+ lea .Lconst(%rip),%rcx
+ lea 48+64($ctx),$ctx # size optimization
+ vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2
+
+ # expand pre-calculated table
+ vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0}
+ and \$-512,%rsp
+ vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1}
+ mov \$0x20,%rax
+ vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1}
+ vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2}
+ vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2}
+ vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3}
+ vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3}
+ vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4}
+ vmovdqu `16*8-64`($ctx),%x#$T4 # ... ${S4}
+ vpermd $D0,$T2,$R0 # 00003412 -> 14243444
+ vpbroadcastq 64(%rcx),$MASK # .Lmask26
+ vpermd $D1,$T2,$R1
+ vpermd $T0,$T2,$S1
+ vpermd $D2,$T2,$R2
+ vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0
+ vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304
+ vpermd $T1,$T2,$S2
+ vmovdqu64 $R1,0x00(%rsp,%rax){%k2}
+ vpsrlq \$32,$R1,$T1
+ vpermd $D3,$T2,$R3
+ vmovdqa64 $S1,0x40(%rsp){%k2}
+ vpermd $T3,$T2,$S3
+ vpermd $D4,$T2,$R4
+ vmovdqu64 $R2,0x40(%rsp,%rax){%k2}
+ vpermd $T4,$T2,$S4
+ vmovdqa64 $S2,0x80(%rsp){%k2}
+ vmovdqu64 $R3,0x80(%rsp,%rax){%k2}
+ vmovdqa64 $S3,0xc0(%rsp){%k2}
+ vmovdqu64 $R4,0xc0(%rsp,%rax){%k2}
+ vmovdqa64 $S4,0x100(%rsp){%k2}
+
+ ################################################################
+ # calculate 5th through 8th powers of the key
+ #
+ # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1
+ # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
+ # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3
+ # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4
+ # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0
+
+ vpmuludq $T0,$R0,$D0 # d0 = r0'*r0
+ vpmuludq $T0,$R1,$D1 # d1 = r0'*r1
+ vpmuludq $T0,$R2,$D2 # d2 = r0'*r2
+ vpmuludq $T0,$R3,$D3 # d3 = r0'*r3
+ vpmuludq $T0,$R4,$D4 # d4 = r0'*r4
+ vpsrlq \$32,$R2,$T2
+
+ vpmuludq $T1,$S4,$M0
+ vpmuludq $T1,$R0,$M1
+ vpmuludq $T1,$R1,$M2
+ vpmuludq $T1,$R2,$M3
+ vpmuludq $T1,$R3,$M4
+ vpsrlq \$32,$R3,$T3
+ vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4
+ vpaddq $M1,$D1,$D1 # d1 += r1'*r0
+ vpaddq $M2,$D2,$D2 # d2 += r1'*r1
+ vpaddq $M3,$D3,$D3 # d3 += r1'*r2
+ vpaddq $M4,$D4,$D4 # d4 += r1'*r3
+
+ vpmuludq $T2,$S3,$M0
+ vpmuludq $T2,$S4,$M1
+ vpmuludq $T2,$R1,$M3
+ vpmuludq $T2,$R2,$M4
+ vpmuludq $T2,$R0,$M2
+ vpsrlq \$32,$R4,$T4
+ vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3
+ vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4
+ vpaddq $M3,$D3,$D3 # d3 += r2'*r1
+ vpaddq $M4,$D4,$D4 # d4 += r2'*r2
+ vpaddq $M2,$D2,$D2 # d2 += r2'*r0
+
+ vpmuludq $T3,$S2,$M0
+ vpmuludq $T3,$R0,$M3
+ vpmuludq $T3,$R1,$M4
+ vpmuludq $T3,$S3,$M1
+ vpmuludq $T3,$S4,$M2
+ vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2
+ vpaddq $M3,$D3,$D3 # d3 += r3'*r0
+ vpaddq $M4,$D4,$D4 # d4 += r3'*r1
+ vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3
+ vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4
+
+ vpmuludq $T4,$S4,$M3
+ vpmuludq $T4,$R0,$M4
+ vpmuludq $T4,$S1,$M0
+ vpmuludq $T4,$S2,$M1
+ vpmuludq $T4,$S3,$M2
+ vpaddq $M3,$D3,$D3 # d3 += r2'*5*r4
+ vpaddq $M4,$D4,$D4 # d4 += r2'*r0
+ vpaddq $M0,$D0,$D0 # d0 += r2'*5*r1
+ vpaddq $M1,$D1,$D1 # d1 += r2'*5*r2
+ vpaddq $M2,$D2,$D2 # d2 += r2'*5*r3
+
+ ################################################################
+ # load input
+ vmovdqu64 16*0($inp),%z#$T3
+ vmovdqu64 16*4($inp),%z#$T4
+ lea 16*8($inp),$inp
+
+ ################################################################
+ # lazy reduction
+
+ vpsrlq \$26,$D3,$M3
+ vpandq $MASK,$D3,$D3
+ vpaddq $M3,$D4,$D4 # d3 -> d4
+
+ vpsrlq \$26,$D0,$M0
+ vpandq $MASK,$D0,$D0
+ vpaddq $M0,$D1,$D1 # d0 -> d1
+
+ vpsrlq \$26,$D4,$M4
+ vpandq $MASK,$D4,$D4
+
+ vpsrlq \$26,$D1,$M1
+ vpandq $MASK,$D1,$D1
+ vpaddq $M1,$D2,$D2 # d1 -> d2
+
+ vpaddq $M4,$D0,$D0
+ vpsllq \$2,$M4,$M4
+ vpaddq $M4,$D0,$D0 # d4 -> d0
+
+ vpsrlq \$26,$D2,$M2
+ vpandq $MASK,$D2,$D2
+ vpaddq $M2,$D3,$D3 # d2 -> d3
+
+ vpsrlq \$26,$D0,$M0
+ vpandq $MASK,$D0,$D0
+ vpaddq $M0,$D1,$D1 # d0 -> d1
+
+ vpsrlq \$26,$D3,$M3
+ vpandq $MASK,$D3,$D3
+ vpaddq $M3,$D4,$D4 # d3 -> d4
+
+ ################################################################
+ # at this point we have 14243444 in $R0-$S4 and 05060708 in
+ # $D0-$D4, ...
+
+ vpunpcklqdq $T4,$T3,$T0 # transpose input
+ vpunpckhqdq $T4,$T3,$T4
+
+ # ... since input 64-bit lanes are ordered as 73625140, we could
+ # "vperm" it to 76543210 (here and in each loop iteration), *or*
+ # we could just flow along, hence the goal for $R0-$S4 is
+ # 1858286838784888 ...
+
+ vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512:
+ mov \$0x7777,%eax
+ kmovw %eax,%k1
+
+ vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4---
+ vpermd $R1,$M0,$R1
+ vpermd $R2,$M0,$R2
+ vpermd $R3,$M0,$R3
+ vpermd $R4,$M0,$R4
+
+ vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888
+ vpermd $D1,$M0,${R1}{%k1}
+ vpermd $D2,$M0,${R2}{%k1}
+ vpermd $D3,$M0,${R3}{%k1}
+ vpermd $D4,$M0,${R4}{%k1}
+
+ vpslld \$2,$R1,$S1 # *5
+ vpslld \$2,$R2,$S2
+ vpslld \$2,$R3,$S3
+ vpslld \$2,$R4,$S4
+ vpaddd $R1,$S1,$S1
+ vpaddd $R2,$S2,$S2
+ vpaddd $R3,$S3,$S3
+ vpaddd $R4,$S4,$S4
+
+ vpbroadcastq 32(%rcx),$PADBIT # .L129
+
+ vpsrlq \$52,$T0,$T2 # splat input
+ vpsllq \$12,$T4,$T3
+ vporq $T3,$T2,$T2
+ vpsrlq \$26,$T0,$T1
+ vpsrlq \$14,$T4,$T3
+ vpsrlq \$40,$T4,$T4 # 4
+ vpandq $MASK,$T2,$T2 # 2
+ vpandq $MASK,$T0,$T0 # 0
+ #vpandq $MASK,$T1,$T1 # 1
+ #vpandq $MASK,$T3,$T3 # 3
+ #vporq $PADBIT,$T4,$T4 # padbit, yes, always
+
+ vpaddq $H2,$T2,$H2 # accumulate input
+ sub \$192,$len
+ jbe .Ltail_avx512
+ jmp .Loop_avx512
+
+.align 32
+.Loop_avx512:
+ ################################################################
+ # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8
+ # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7
+ # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6
+ # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
+ # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4
+ # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
+ # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2
+ # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1
+ # \________/\___________/
+ ################################################################
+ #vpaddq $H2,$T2,$H2 # accumulate input
+
+ # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
+ # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
+ # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
+ # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
+ # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
+ #
+ # however, as h2 is "chronologically" first one available pull
+ # corresponding operations up, so it's
+ #
+ # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4
+ # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0
+ # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1
+ # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2
+ # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3
+
+ vpmuludq $H2,$R1,$D3 # d3 = h2*r1
+ vpaddq $H0,$T0,$H0
+ vpmuludq $H2,$R2,$D4 # d4 = h2*r2
+ vpandq $MASK,$T1,$T1 # 1
+ vpmuludq $H2,$S3,$D0 # d0 = h2*s3
+ vpandq $MASK,$T3,$T3 # 3
+ vpmuludq $H2,$S4,$D1 # d1 = h2*s4
+ vporq $PADBIT,$T4,$T4 # padbit, yes, always
+ vpmuludq $H2,$R0,$D2 # d2 = h2*r0
+ vpaddq $H1,$T1,$H1 # accumulate input
+ vpaddq $H3,$T3,$H3
+ vpaddq $H4,$T4,$H4
+
+ vmovdqu64 16*0($inp),$T3 # load input
+ vmovdqu64 16*4($inp),$T4
+ lea 16*8($inp),$inp
+ vpmuludq $H0,$R3,$M3
+ vpmuludq $H0,$R4,$M4
+ vpmuludq $H0,$R0,$M0
+ vpmuludq $H0,$R1,$M1
+ vpaddq $M3,$D3,$D3 # d3 += h0*r3
+ vpaddq $M4,$D4,$D4 # d4 += h0*r4
+ vpaddq $M0,$D0,$D0 # d0 += h0*r0
+ vpaddq $M1,$D1,$D1 # d1 += h0*r1
+
+ vpmuludq $H1,$R2,$M3
+ vpmuludq $H1,$R3,$M4
+ vpmuludq $H1,$S4,$M0
+ vpmuludq $H0,$R2,$M2
+ vpaddq $M3,$D3,$D3 # d3 += h1*r2
+ vpaddq $M4,$D4,$D4 # d4 += h1*r3
+ vpaddq $M0,$D0,$D0 # d0 += h1*s4
+ vpaddq $M2,$D2,$D2 # d2 += h0*r2
+
+ vpunpcklqdq $T4,$T3,$T0 # transpose input
+ vpunpckhqdq $T4,$T3,$T4
+
+ vpmuludq $H3,$R0,$M3
+ vpmuludq $H3,$R1,$M4
+ vpmuludq $H1,$R0,$M1
+ vpmuludq $H1,$R1,$M2
+ vpaddq $M3,$D3,$D3 # d3 += h3*r0
+ vpaddq $M4,$D4,$D4 # d4 += h3*r1
+ vpaddq $M1,$D1,$D1 # d1 += h1*r0
+ vpaddq $M2,$D2,$D2 # d2 += h1*r1
+
+ vpmuludq $H4,$S4,$M3
+ vpmuludq $H4,$R0,$M4
+ vpmuludq $H3,$S2,$M0
+ vpmuludq $H3,$S3,$M1
+ vpaddq $M3,$D3,$D3 # d3 += h4*s4
+ vpmuludq $H3,$S4,$M2
+ vpaddq $M4,$D4,$D4 # d4 += h4*r0
+ vpaddq $M0,$D0,$D0 # d0 += h3*s2
+ vpaddq $M1,$D1,$D1 # d1 += h3*s3
+ vpaddq $M2,$D2,$D2 # d2 += h3*s4
+
+ vpmuludq $H4,$S1,$M0
+ vpmuludq $H4,$S2,$M1
+ vpmuludq $H4,$S3,$M2
+ vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
+ vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2
+ vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3
+
+ ################################################################
+ # lazy reduction (interleaved with input splat)
+
+ vpsrlq \$52,$T0,$T2 # splat input
+ vpsllq \$12,$T4,$T3
+
+ vpsrlq \$26,$D3,$H3
+ vpandq $MASK,$D3,$D3
+ vpaddq $H3,$D4,$H4 # h3 -> h4
+
+ vporq $T3,$T2,$T2
+
+ vpsrlq \$26,$H0,$D0
+ vpandq $MASK,$H0,$H0
+ vpaddq $D0,$H1,$H1 # h0 -> h1
+
+ vpandq $MASK,$T2,$T2 # 2
+
+ vpsrlq \$26,$H4,$D4
+ vpandq $MASK,$H4,$H4
+
+ vpsrlq \$26,$H1,$D1
+ vpandq $MASK,$H1,$H1
+ vpaddq $D1,$H2,$H2 # h1 -> h2
+
+ vpaddq $D4,$H0,$H0
+ vpsllq \$2,$D4,$D4
+ vpaddq $D4,$H0,$H0 # h4 -> h0
+
+ vpaddq $T2,$H2,$H2 # modulo-scheduled
+ vpsrlq \$26,$T0,$T1
+
+ vpsrlq \$26,$H2,$D2
+ vpandq $MASK,$H2,$H2
+ vpaddq $D2,$D3,$H3 # h2 -> h3
+
+ vpsrlq \$14,$T4,$T3
+
+ vpsrlq \$26,$H0,$D0
+ vpandq $MASK,$H0,$H0
+ vpaddq $D0,$H1,$H1 # h0 -> h1
+
+ vpsrlq \$40,$T4,$T4 # 4
+
+ vpsrlq \$26,$H3,$D3
+ vpandq $MASK,$H3,$H3
+ vpaddq $D3,$H4,$H4 # h3 -> h4
+
+ vpandq $MASK,$T0,$T0 # 0
+ #vpandq $MASK,$T1,$T1 # 1
+ #vpandq $MASK,$T3,$T3 # 3
+ #vporq $PADBIT,$T4,$T4 # padbit, yes, always
+
+ sub \$128,$len
+ ja .Loop_avx512
+
+.Ltail_avx512:
+ ################################################################
+ # while above multiplications were by r^8 in all lanes, in last
+ # iteration we multiply least significant lane by r^8 and most
+ # significant one by r, that's why table gets shifted...
+
+ vpsrlq \$32,$R0,$R0 # 0105020603070408
+ vpsrlq \$32,$R1,$R1
+ vpsrlq \$32,$R2,$R2
+ vpsrlq \$32,$S3,$S3
+ vpsrlq \$32,$S4,$S4
+ vpsrlq \$32,$R3,$R3
+ vpsrlq \$32,$R4,$R4
+ vpsrlq \$32,$S1,$S1
+ vpsrlq \$32,$S2,$S2
+
+ ################################################################
+ # load either next or last 64 byte of input
+ lea ($inp,$len),$inp
+
+ #vpaddq $H2,$T2,$H2 # accumulate input
+ vpaddq $H0,$T0,$H0
+
+ vpmuludq $H2,$R1,$D3 # d3 = h2*r1
+ vpmuludq $H2,$R2,$D4 # d4 = h2*r2
+ vpmuludq $H2,$S3,$D0 # d0 = h2*s3
+ vpandq $MASK,$T1,$T1 # 1
+ vpmuludq $H2,$S4,$D1 # d1 = h2*s4
+ vpandq $MASK,$T3,$T3 # 3
+ vpmuludq $H2,$R0,$D2 # d2 = h2*r0
+ vporq $PADBIT,$T4,$T4 # padbit, yes, always
+ vpaddq $H1,$T1,$H1 # accumulate input
+ vpaddq $H3,$T3,$H3
+ vpaddq $H4,$T4,$H4
+
+ vmovdqu 16*0($inp),%x#$T0
+ vpmuludq $H0,$R3,$M3
+ vpmuludq $H0,$R4,$M4
+ vpmuludq $H0,$R0,$M0
+ vpmuludq $H0,$R1,$M1
+ vpaddq $M3,$D3,$D3 # d3 += h0*r3
+ vpaddq $M4,$D4,$D4 # d4 += h0*r4
+ vpaddq $M0,$D0,$D0 # d0 += h0*r0
+ vpaddq $M1,$D1,$D1 # d1 += h0*r1
+
+ vmovdqu 16*1($inp),%x#$T1
+ vpmuludq $H1,$R2,$M3
+ vpmuludq $H1,$R3,$M4
+ vpmuludq $H1,$S4,$M0
+ vpmuludq $H0,$R2,$M2
+ vpaddq $M3,$D3,$D3 # d3 += h1*r2
+ vpaddq $M4,$D4,$D4 # d4 += h1*r3
+ vpaddq $M0,$D0,$D0 # d0 += h1*s4
+ vpaddq $M2,$D2,$D2 # d2 += h0*r2
+
+ vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0
+ vpmuludq $H3,$R0,$M3
+ vpmuludq $H3,$R1,$M4
+ vpmuludq $H1,$R0,$M1
+ vpmuludq $H1,$R1,$M2
+ vpaddq $M3,$D3,$D3 # d3 += h3*r0
+ vpaddq $M4,$D4,$D4 # d4 += h3*r1
+ vpaddq $M1,$D1,$D1 # d1 += h1*r0
+ vpaddq $M2,$D2,$D2 # d2 += h1*r1
+
+ vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1
+ vpmuludq $H4,$S4,$M3
+ vpmuludq $H4,$R0,$M4
+ vpmuludq $H3,$S2,$M0
+ vpmuludq $H3,$S3,$M1
+ vpmuludq $H3,$S4,$M2
+ vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4
+ vpaddq $M4,$D4,$D4 # d4 += h4*r0
+ vpaddq $M0,$D0,$D0 # d0 += h3*s2
+ vpaddq $M1,$D1,$D1 # d1 += h3*s3
+ vpaddq $M2,$D2,$D2 # d2 += h3*s4
+
+ vpmuludq $H4,$S1,$M0
+ vpmuludq $H4,$S2,$M1
+ vpmuludq $H4,$S3,$M2
+ vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1
+ vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2
+ vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3
+
+ ################################################################
+ # horizontal addition
+
+ mov \$1,%eax
+ vpermq \$0xb1,$H3,$D3
+ vpermq \$0xb1,$D4,$H4
+ vpermq \$0xb1,$H0,$D0
+ vpermq \$0xb1,$H1,$D1
+ vpermq \$0xb1,$H2,$D2
+ vpaddq $D3,$H3,$H3
+ vpaddq $D4,$H4,$H4
+ vpaddq $D0,$H0,$H0
+ vpaddq $D1,$H1,$H1
+ vpaddq $D2,$H2,$H2
+
+ kmovw %eax,%k3
+ vpermq \$0x2,$H3,$D3
+ vpermq \$0x2,$H4,$D4
+ vpermq \$0x2,$H0,$D0
+ vpermq \$0x2,$H1,$D1
+ vpermq \$0x2,$H2,$D2
+ vpaddq $D3,$H3,$H3
+ vpaddq $D4,$H4,$H4
+ vpaddq $D0,$H0,$H0
+ vpaddq $D1,$H1,$H1
+ vpaddq $D2,$H2,$H2
+
+ vextracti64x4 \$0x1,$H3,%y#$D3
+ vextracti64x4 \$0x1,$H4,%y#$D4
+ vextracti64x4 \$0x1,$H0,%y#$D0
+ vextracti64x4 \$0x1,$H1,%y#$D1
+ vextracti64x4 \$0x1,$H2,%y#$D2
+ vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case
+ vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2
+ vpaddq $D0,$H0,${H0}{%k3}{z}
+ vpaddq $D1,$H1,${H1}{%k3}{z}
+ vpaddq $D2,$H2,${H2}{%k3}{z}
+___
+map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT));
+map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
+$code.=<<___;
+ ################################################################
+ # lazy reduction (interleaved with input splat)
+
+ vpsrlq \$26,$H3,$D3
+ vpand $MASK,$H3,$H3
+ vpsrldq \$6,$T0,$T2 # splat input
+ vpsrldq \$6,$T1,$T3
+ vpunpckhqdq $T1,$T0,$T4 # 4
+ vpaddq $D3,$H4,$H4 # h3 -> h4
+
+ vpsrlq \$26,$H0,$D0
+ vpand $MASK,$H0,$H0
+ vpunpcklqdq $T3,$T2,$T2 # 2:3
+ vpunpcklqdq $T1,$T0,$T0 # 0:1
+ vpaddq $D0,$H1,$H1 # h0 -> h1
+
+ vpsrlq \$26,$H4,$D4
+ vpand $MASK,$H4,$H4
+
+ vpsrlq \$26,$H1,$D1
+ vpand $MASK,$H1,$H1
+ vpsrlq \$30,$T2,$T3
+ vpsrlq \$4,$T2,$T2
+ vpaddq $D1,$H2,$H2 # h1 -> h2
+
+ vpaddq $D4,$H0,$H0
+ vpsllq \$2,$D4,$D4
+ vpsrlq \$26,$T0,$T1
+ vpsrlq \$40,$T4,$T4 # 4
+ vpaddq $D4,$H0,$H0 # h4 -> h0
+
+ vpsrlq \$26,$H2,$D2
+ vpand $MASK,$H2,$H2
+ vpand $MASK,$T2,$T2 # 2
+ vpand $MASK,$T0,$T0 # 0
+ vpaddq $D2,$H3,$H3 # h2 -> h3
+
+ vpsrlq \$26,$H0,$D0
+ vpand $MASK,$H0,$H0
+ vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2
+ vpand $MASK,$T1,$T1 # 1
+ vpaddq $D0,$H1,$H1 # h0 -> h1
+
+ vpsrlq \$26,$H3,$D3
+ vpand $MASK,$H3,$H3
+ vpand $MASK,$T3,$T3 # 3
+ vpor 32(%rcx),$T4,$T4 # padbit, yes, always
+ vpaddq $D3,$H4,$H4 # h3 -> h4
+
+ lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2
+ add \$64,$len
+ jnz .Ltail_avx2$suffix
+
+ vpsubq $T2,$H2,$H2 # undo input accumulation
+ vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
+ vmovd %x#$H1,`4*1-48-64`($ctx)
+ vmovd %x#$H2,`4*2-48-64`($ctx)
+ vmovd %x#$H3,`4*3-48-64`($ctx)
+ vmovd %x#$H4,`4*4-48-64`($ctx)
+ vzeroall
+___
+$code.=<<___ if ($win64);
+ movdqa -0xb0(%r10),%xmm6
+ movdqa -0xa0(%r10),%xmm7
+ movdqa -0x90(%r10),%xmm8
+ movdqa -0x80(%r10),%xmm9
+ movdqa -0x70(%r10),%xmm10
+ movdqa -0x60(%r10),%xmm11
+ movdqa -0x50(%r10),%xmm12
+ movdqa -0x40(%r10),%xmm13
+ movdqa -0x30(%r10),%xmm14
+ movdqa -0x20(%r10),%xmm15
+ lea -8(%r10),%rsp
+.Ldo_avx512_epilogue:
+___
+$code.=<<___ if (!$win64);
+ lea -8(%r10),%rsp
+.cfi_def_cfa_register %rsp
+___
+$code.=<<___;
+ ret
+.cfi_endproc
+___
+
+}
+
+}
+
+&declare_function("poly1305_blocks_avx2", 32, 4);
+poly1305_blocks_avxN(0);
+&end_function("poly1305_blocks_avx2");
+
+if($kernel) {
+ $code .= "#endif\n";
+}
+
+#######################################################################
+if ($avx>2) {
+# On entry we have input length divisible by 64. But since inner loop
+# processes 128 bytes per iteration, cases when length is not divisible
+# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this
+# reason stack layout is kept identical to poly1305_blocks_avx2. If not
+# for this tail, we wouldn't have to even allocate stack frame...
+
+if($kernel) {
+ $code .= "#ifdef CONFIG_AS_AVX512\n";
+}
+
+&declare_function("poly1305_blocks_avx512", 32, 4);
+poly1305_blocks_avxN(1);
+&end_function("poly1305_blocks_avx512");
+
+if ($kernel) {
+ $code .= "#endif\n";
+}
+
+if (!$kernel && $avx>3) {
+########################################################################
+# VPMADD52 version using 2^44 radix.
+#
+# One can argue that base 2^52 would be more natural. Well, even though
+# some operations would be more natural, one has to recognize couple of
+# things. Base 2^52 doesn't provide advantage over base 2^44 if you look
+# at amount of multiply-n-accumulate operations. Secondly, it makes it
+# impossible to pre-compute multiples of 5 [referred to as s[]/sN in
+# reference implementations], which means that more such operations
+# would have to be performed in inner loop, which in turn makes critical
+# path longer. In other words, even though base 2^44 reduction might
+# look less elegant, overall critical path is actually shorter...
+
+########################################################################
+# Layout of opaque area is following.
+#
+# unsigned __int64 h[3]; # current hash value base 2^44
+# unsigned __int64 s[2]; # key value*20 base 2^44
+# unsigned __int64 r[3]; # key value base 2^44
+# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
+# # r^n positions reflect
+# # placement in register, not
+# # memory, R[3] is R[1]*20
+
+$code.=<<___;
+.type poly1305_init_base2_44,\@function,3
+.align 32
+poly1305_init_base2_44:
+ xor %rax,%rax
+ mov %rax,0($ctx) # initialize hash value
+ mov %rax,8($ctx)
+ mov %rax,16($ctx)
+
+.Linit_base2_44:
+ lea poly1305_blocks_vpmadd52(%rip),%r10
+ lea poly1305_emit_base2_44(%rip),%r11
+
+ mov \$0x0ffffffc0fffffff,%rax
+ mov \$0x0ffffffc0ffffffc,%rcx
+ and 0($inp),%rax
+ mov \$0x00000fffffffffff,%r8
+ and 8($inp),%rcx
+ mov \$0x00000fffffffffff,%r9
+ and %rax,%r8
+ shrd \$44,%rcx,%rax
+ mov %r8,40($ctx) # r0
+ and %r9,%rax
+ shr \$24,%rcx
+ mov %rax,48($ctx) # r1
+ lea (%rax,%rax,4),%rax # *5
+ mov %rcx,56($ctx) # r2
+ shl \$2,%rax # magic <<2
+ lea (%rcx,%rcx,4),%rcx # *5
+ shl \$2,%rcx # magic <<2
+ mov %rax,24($ctx) # s1
+ mov %rcx,32($ctx) # s2
+ movq \$-1,64($ctx) # write impossible value
+___
+$code.=<<___ if ($flavour !~ /elf32/);
+ mov %r10,0(%rdx)
+ mov %r11,8(%rdx)
+___
+$code.=<<___ if ($flavour =~ /elf32/);
+ mov %r10d,0(%rdx)
+ mov %r11d,4(%rdx)
+___
+$code.=<<___;
+ mov \$1,%eax
+ ret
+.size poly1305_init_base2_44,.-poly1305_init_base2_44
+___
+{
+my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17));
+my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21));
+my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25));
+
+$code.=<<___;
+.type poly1305_blocks_vpmadd52,\@function,4
+.align 32
+poly1305_blocks_vpmadd52:
+ shr \$4,$len
+ jz .Lno_data_vpmadd52 # too short
+
+ shl \$40,$padbit
+ mov 64($ctx),%r8 # peek on power of the key
+
+ # if powers of the key are not calculated yet, process up to 3
+ # blocks with this single-block subroutine, otherwise ensure that
+ # length is divisible by 2 blocks and pass the rest down to next
+ # subroutine...
+
+ mov \$3,%rax
+ mov \$1,%r10
+ cmp \$4,$len # is input long
+ cmovae %r10,%rax
+ test %r8,%r8 # is power value impossible?
+ cmovns %r10,%rax
+
+ and $len,%rax # is input of favourable length?
+ jz .Lblocks_vpmadd52_4x
+
+ sub %rax,$len
+ mov \$7,%r10d
+ mov \$1,%r11d
+ kmovw %r10d,%k7
+ lea .L2_44_inp_permd(%rip),%r10
+ kmovw %r11d,%k1
+
+ vmovq $padbit,%x#$PAD
+ vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd
+ vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift
+ vpermq \$0xcf,$PAD,$PAD
+ vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask
+
+ vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value
+ vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys
+ vmovdqu64 32($ctx),${r1r0s2}{%k7}{z}
+ vmovdqu64 24($ctx),${r0s2s1}{%k7}{z}
+
+ vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt
+ vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft
+
+ jmp .Loop_vpmadd52
+
+.align 32
+.Loop_vpmadd52:
+ vmovdqu32 0($inp),%x#$T0 # load input as ----3210
+ lea 16($inp),$inp
+
+ vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110
+ vpsrlvq $inp_shift,$T0,$T0
+ vpandq $reduc_mask,$T0,$T0
+ vporq $PAD,$T0,$T0
+
+ vpaddq $T0,$Dlo,$Dlo # accumulate input
+
+ vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value
+ vpermq \$0b01010101,$Dlo,${H1}{%k7}{z}
+ vpermq \$0b10101010,$Dlo,${H2}{%k7}{z}
+
+ vpxord $Dlo,$Dlo,$Dlo
+ vpxord $Dhi,$Dhi,$Dhi
+
+ vpmadd52luq $r2r1r0,$H0,$Dlo
+ vpmadd52huq $r2r1r0,$H0,$Dhi
+
+ vpmadd52luq $r1r0s2,$H1,$Dlo
+ vpmadd52huq $r1r0s2,$H1,$Dhi
+
+ vpmadd52luq $r0s2s1,$H2,$Dlo
+ vpmadd52huq $r0s2s1,$H2,$Dhi
+
+ vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword
+ vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword
+ vpandq $reduc_mask,$Dlo,$Dlo
+
+ vpaddq $T0,$Dhi,$Dhi
+
+ vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword
+
+ vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-)
+
+ vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word
+ vpandq $reduc_mask,$Dlo,$Dlo
+
+ vpermq \$0b10010011,$T0,$T0
+
+ vpaddq $T0,$Dlo,$Dlo
+
+ vpermq \$0b10010011,$Dlo,${T0}{%k1}{z}
+
+ vpaddq $T0,$Dlo,$Dlo
+ vpsllq \$2,$T0,$T0
+
+ vpaddq $T0,$Dlo,$Dlo
+
+ dec %rax # len-=16
+ jnz .Loop_vpmadd52
+
+ vmovdqu64 $Dlo,0($ctx){%k7} # store hash value
+
+ test $len,$len
+ jnz .Lblocks_vpmadd52_4x
+
+.Lno_data_vpmadd52:
+ ret
+.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
+___
+}
+{
+########################################################################
+# As implied by its name 4x subroutine processes 4 blocks in parallel
+# (but handles even 4*n+2 blocks lengths). It takes up to 4th key power
+# and is handled in 256-bit %ymm registers.
+
+my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
+my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
+my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
+
+$code.=<<___;
+.type poly1305_blocks_vpmadd52_4x,\@function,4
+.align 32
+poly1305_blocks_vpmadd52_4x:
+ shr \$4,$len
+ jz .Lno_data_vpmadd52_4x # too short
+
+ shl \$40,$padbit
+ mov 64($ctx),%r8 # peek on power of the key
+
+.Lblocks_vpmadd52_4x:
+ vpbroadcastq $padbit,$PAD
+
+ vmovdqa64 .Lx_mask44(%rip),$mask44
+ mov \$5,%eax
+ vmovdqa64 .Lx_mask42(%rip),$mask42
+ kmovw %eax,%k1 # used in 2x path
+
+ test %r8,%r8 # is power value impossible?
+ js .Linit_vpmadd52 # if it is, then init R[4]
+
+ vmovq 0($ctx),%x#$H0 # load current hash value
+ vmovq 8($ctx),%x#$H1
+ vmovq 16($ctx),%x#$H2
+
+ test \$3,$len # is length 4*n+2?
+ jnz .Lblocks_vpmadd52_2x_do
+
+.Lblocks_vpmadd52_4x_do:
+ vpbroadcastq 64($ctx),$R0 # load 4th power of the key
+ vpbroadcastq 96($ctx),$R1
+ vpbroadcastq 128($ctx),$R2
+ vpbroadcastq 160($ctx),$S1
+
+.Lblocks_vpmadd52_4x_key_loaded:
+ vpsllq \$2,$R2,$S2 # S2 = R2*5*4
+ vpaddq $R2,$S2,$S2
+ vpsllq \$2,$S2,$S2
+
+ test \$7,$len # is len 8*n?
+ jz .Lblocks_vpmadd52_8x
+
+ vmovdqu64 16*0($inp),$T2 # load data
+ vmovdqu64 16*2($inp),$T3
+ lea 16*4($inp),$inp
+
+ vpunpcklqdq $T3,$T2,$T1 # transpose data
+ vpunpckhqdq $T3,$T2,$T3
+
+ # at this point 64-bit lanes are ordered as 3-1-2-0
+
+ vpsrlq \$24,$T3,$T2 # splat the data
+ vporq $PAD,$T2,$T2
+ vpaddq $T2,$H2,$H2 # accumulate input
+ vpandq $mask44,$T1,$T0
+ vpsrlq \$44,$T1,$T1
+ vpsllq \$20,$T3,$T3
+ vporq $T3,$T1,$T1
+ vpandq $mask44,$T1,$T1
+
+ sub \$4,$len
+ jz .Ltail_vpmadd52_4x
+ jmp .Loop_vpmadd52_4x
+ ud2
+
+.align 32
+.Linit_vpmadd52:
+ vmovq 24($ctx),%x#$S1 # load key
+ vmovq 56($ctx),%x#$H2
+ vmovq 32($ctx),%x#$S2
+ vmovq 40($ctx),%x#$R0
+ vmovq 48($ctx),%x#$R1
+
+ vmovdqa $R0,$H0
+ vmovdqa $R1,$H1
+ vmovdqa $H2,$R2
+
+ mov \$2,%eax
+
+.Lmul_init_vpmadd52:
+ vpxorq $D0lo,$D0lo,$D0lo
+ vpmadd52luq $H2,$S1,$D0lo
+ vpxorq $D0hi,$D0hi,$D0hi
+ vpmadd52huq $H2,$S1,$D0hi
+ vpxorq $D1lo,$D1lo,$D1lo
+ vpmadd52luq $H2,$S2,$D1lo
+ vpxorq $D1hi,$D1hi,$D1hi
+ vpmadd52huq $H2,$S2,$D1hi
+ vpxorq $D2lo,$D2lo,$D2lo
+ vpmadd52luq $H2,$R0,$D2lo
+ vpxorq $D2hi,$D2hi,$D2hi
+ vpmadd52huq $H2,$R0,$D2hi
+
+ vpmadd52luq $H0,$R0,$D0lo
+ vpmadd52huq $H0,$R0,$D0hi
+ vpmadd52luq $H0,$R1,$D1lo
+ vpmadd52huq $H0,$R1,$D1hi
+ vpmadd52luq $H0,$R2,$D2lo
+ vpmadd52huq $H0,$R2,$D2hi
+
+ vpmadd52luq $H1,$S2,$D0lo
+ vpmadd52huq $H1,$S2,$D0hi
+ vpmadd52luq $H1,$R0,$D1lo
+ vpmadd52huq $H1,$R0,$D1hi
+ vpmadd52luq $H1,$R1,$D2lo
+ vpmadd52huq $H1,$R1,$D2hi
+
+ ################################################################
+ # partial reduction
+ vpsrlq \$44,$D0lo,$tmp
+ vpsllq \$8,$D0hi,$D0hi
+ vpandq $mask44,$D0lo,$H0
+ vpaddq $tmp,$D0hi,$D0hi
+
+ vpaddq $D0hi,$D1lo,$D1lo
+
+ vpsrlq \$44,$D1lo,$tmp
+ vpsllq \$8,$D1hi,$D1hi
+ vpandq $mask44,$D1lo,$H1
+ vpaddq $tmp,$D1hi,$D1hi
+
+ vpaddq $D1hi,$D2lo,$D2lo
+
+ vpsrlq \$42,$D2lo,$tmp
+ vpsllq \$10,$D2hi,$D2hi
+ vpandq $mask42,$D2lo,$H2
+ vpaddq $tmp,$D2hi,$D2hi
+
+ vpaddq $D2hi,$H0,$H0
+ vpsllq \$2,$D2hi,$D2hi
+
+ vpaddq $D2hi,$H0,$H0
+
+ vpsrlq \$44,$H0,$tmp # additional step
+ vpandq $mask44,$H0,$H0
+
+ vpaddq $tmp,$H1,$H1
+
+ dec %eax
+ jz .Ldone_init_vpmadd52
+
+ vpunpcklqdq $R1,$H1,$R1 # 1,2
+ vpbroadcastq %x#$H1,%x#$H1 # 2,2
+ vpunpcklqdq $R2,$H2,$R2
+ vpbroadcastq %x#$H2,%x#$H2
+ vpunpcklqdq $R0,$H0,$R0
+ vpbroadcastq %x#$H0,%x#$H0
+
+ vpsllq \$2,$R1,$S1 # S1 = R1*5*4
+ vpsllq \$2,$R2,$S2 # S2 = R2*5*4
+ vpaddq $R1,$S1,$S1
+ vpaddq $R2,$S2,$S2
+ vpsllq \$2,$S1,$S1
+ vpsllq \$2,$S2,$S2
+
+ jmp .Lmul_init_vpmadd52
+ ud2
+
+.align 32
+.Ldone_init_vpmadd52:
+ vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4
+ vinserti128 \$1,%x#$R2,$H2,$R2
+ vinserti128 \$1,%x#$R0,$H0,$R0
+
+ vpermq \$0b11011000,$R1,$R1 # 1,3,2,4
+ vpermq \$0b11011000,$R2,$R2
+ vpermq \$0b11011000,$R0,$R0
+
+ vpsllq \$2,$R1,$S1 # S1 = R1*5*4
+ vpaddq $R1,$S1,$S1
+ vpsllq \$2,$S1,$S1
+
+ vmovq 0($ctx),%x#$H0 # load current hash value
+ vmovq 8($ctx),%x#$H1
+ vmovq 16($ctx),%x#$H2
+
+ test \$3,$len # is length 4*n+2?
+ jnz .Ldone_init_vpmadd52_2x
+
+ vmovdqu64 $R0,64($ctx) # save key powers
+ vpbroadcastq %x#$R0,$R0 # broadcast 4th power
+ vmovdqu64 $R1,96($ctx)
+ vpbroadcastq %x#$R1,$R1
+ vmovdqu64 $R2,128($ctx)
+ vpbroadcastq %x#$R2,$R2
+ vmovdqu64 $S1,160($ctx)
+ vpbroadcastq %x#$S1,$S1
+
+ jmp .Lblocks_vpmadd52_4x_key_loaded
+ ud2
+
+.align 32
+.Ldone_init_vpmadd52_2x:
+ vmovdqu64 $R0,64($ctx) # save key powers
+ vpsrldq \$8,$R0,$R0 # 0-1-0-2
+ vmovdqu64 $R1,96($ctx)
+ vpsrldq \$8,$R1,$R1
+ vmovdqu64 $R2,128($ctx)
+ vpsrldq \$8,$R2,$R2
+ vmovdqu64 $S1,160($ctx)
+ vpsrldq \$8,$S1,$S1
+ jmp .Lblocks_vpmadd52_2x_key_loaded
+ ud2
+
+.align 32
+.Lblocks_vpmadd52_2x_do:
+ vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers
+ vmovdqu64 160+8($ctx),${S1}{%k1}{z}
+ vmovdqu64 64+8($ctx),${R0}{%k1}{z}
+ vmovdqu64 96+8($ctx),${R1}{%k1}{z}
+
+.Lblocks_vpmadd52_2x_key_loaded:
+ vmovdqu64 16*0($inp),$T2 # load data
+ vpxorq $T3,$T3,$T3
+ lea 16*2($inp),$inp
+
+ vpunpcklqdq $T3,$T2,$T1 # transpose data
+ vpunpckhqdq $T3,$T2,$T3
+
+ # at this point 64-bit lanes are ordered as x-1-x-0
+
+ vpsrlq \$24,$T3,$T2 # splat the data
+ vporq $PAD,$T2,$T2
+ vpaddq $T2,$H2,$H2 # accumulate input
+ vpandq $mask44,$T1,$T0
+ vpsrlq \$44,$T1,$T1
+ vpsllq \$20,$T3,$T3
+ vporq $T3,$T1,$T1
+ vpandq $mask44,$T1,$T1
+
+ jmp .Ltail_vpmadd52_2x
+ ud2
+
+.align 32
+.Loop_vpmadd52_4x:
+ #vpaddq $T2,$H2,$H2 # accumulate input
+ vpaddq $T0,$H0,$H0
+ vpaddq $T1,$H1,$H1
+
+ vpxorq $D0lo,$D0lo,$D0lo
+ vpmadd52luq $H2,$S1,$D0lo
+ vpxorq $D0hi,$D0hi,$D0hi
+ vpmadd52huq $H2,$S1,$D0hi
+ vpxorq $D1lo,$D1lo,$D1lo
+ vpmadd52luq $H2,$S2,$D1lo
+ vpxorq $D1hi,$D1hi,$D1hi
+ vpmadd52huq $H2,$S2,$D1hi
+ vpxorq $D2lo,$D2lo,$D2lo
+ vpmadd52luq $H2,$R0,$D2lo
+ vpxorq $D2hi,$D2hi,$D2hi
+ vpmadd52huq $H2,$R0,$D2hi
+
+ vmovdqu64 16*0($inp),$T2 # load data
+ vmovdqu64 16*2($inp),$T3
+ lea 16*4($inp),$inp
+ vpmadd52luq $H0,$R0,$D0lo
+ vpmadd52huq $H0,$R0,$D0hi
+ vpmadd52luq $H0,$R1,$D1lo
+ vpmadd52huq $H0,$R1,$D1hi
+ vpmadd52luq $H0,$R2,$D2lo
+ vpmadd52huq $H0,$R2,$D2hi
+
+ vpunpcklqdq $T3,$T2,$T1 # transpose data
+ vpunpckhqdq $T3,$T2,$T3
+ vpmadd52luq $H1,$S2,$D0lo
+ vpmadd52huq $H1,$S2,$D0hi
+ vpmadd52luq $H1,$R0,$D1lo
+ vpmadd52huq $H1,$R0,$D1hi
+ vpmadd52luq $H1,$R1,$D2lo
+ vpmadd52huq $H1,$R1,$D2hi
+
+ ################################################################
+ # partial reduction (interleaved with data splat)
+ vpsrlq \$44,$D0lo,$tmp
+ vpsllq \$8,$D0hi,$D0hi
+ vpandq $mask44,$D0lo,$H0
+ vpaddq $tmp,$D0hi,$D0hi
+
+ vpsrlq \$24,$T3,$T2
+ vporq $PAD,$T2,$T2
+ vpaddq $D0hi,$D1lo,$D1lo
+
+ vpsrlq \$44,$D1lo,$tmp
+ vpsllq \$8,$D1hi,$D1hi
+ vpandq $mask44,$D1lo,$H1
+ vpaddq $tmp,$D1hi,$D1hi
+
+ vpandq $mask44,$T1,$T0
+ vpsrlq \$44,$T1,$T1
+ vpsllq \$20,$T3,$T3
+ vpaddq $D1hi,$D2lo,$D2lo
+
+ vpsrlq \$42,$D2lo,$tmp
+ vpsllq \$10,$D2hi,$D2hi
+ vpandq $mask42,$D2lo,$H2
+ vpaddq $tmp,$D2hi,$D2hi
+
+ vpaddq $T2,$H2,$H2 # accumulate input
+ vpaddq $D2hi,$H0,$H0
+ vpsllq \$2,$D2hi,$D2hi
+
+ vpaddq $D2hi,$H0,$H0
+ vporq $T3,$T1,$T1
+ vpandq $mask44,$T1,$T1
+
+ vpsrlq \$44,$H0,$tmp # additional step
+ vpandq $mask44,$H0,$H0
+
+ vpaddq $tmp,$H1,$H1
+
+ sub \$4,$len # len-=64
+ jnz .Loop_vpmadd52_4x
+
+.Ltail_vpmadd52_4x:
+ vmovdqu64 128($ctx),$R2 # load all key powers
+ vmovdqu64 160($ctx),$S1
+ vmovdqu64 64($ctx),$R0
+ vmovdqu64 96($ctx),$R1
+
+.Ltail_vpmadd52_2x:
+ vpsllq \$2,$R2,$S2 # S2 = R2*5*4
+ vpaddq $R2,$S2,$S2
+ vpsllq \$2,$S2,$S2
+
+ #vpaddq $T2,$H2,$H2 # accumulate input
+ vpaddq $T0,$H0,$H0
+ vpaddq $T1,$H1,$H1
+
+ vpxorq $D0lo,$D0lo,$D0lo
+ vpmadd52luq $H2,$S1,$D0lo
+ vpxorq $D0hi,$D0hi,$D0hi
+ vpmadd52huq $H2,$S1,$D0hi
+ vpxorq $D1lo,$D1lo,$D1lo
+ vpmadd52luq $H2,$S2,$D1lo
+ vpxorq $D1hi,$D1hi,$D1hi
+ vpmadd52huq $H2,$S2,$D1hi
+ vpxorq $D2lo,$D2lo,$D2lo
+ vpmadd52luq $H2,$R0,$D2lo
+ vpxorq $D2hi,$D2hi,$D2hi
+ vpmadd52huq $H2,$R0,$D2hi
+
+ vpmadd52luq $H0,$R0,$D0lo
+ vpmadd52huq $H0,$R0,$D0hi
+ vpmadd52luq $H0,$R1,$D1lo
+ vpmadd52huq $H0,$R1,$D1hi
+ vpmadd52luq $H0,$R2,$D2lo
+ vpmadd52huq $H0,$R2,$D2hi
+
+ vpmadd52luq $H1,$S2,$D0lo
+ vpmadd52huq $H1,$S2,$D0hi
+ vpmadd52luq $H1,$R0,$D1lo
+ vpmadd52huq $H1,$R0,$D1hi
+ vpmadd52luq $H1,$R1,$D2lo
+ vpmadd52huq $H1,$R1,$D2hi
+
+ ################################################################
+ # horizontal addition
+
+ mov \$1,%eax
+ kmovw %eax,%k1
+ vpsrldq \$8,$D0lo,$T0
+ vpsrldq \$8,$D0hi,$H0
+ vpsrldq \$8,$D1lo,$T1
+ vpsrldq \$8,$D1hi,$H1
+ vpaddq $T0,$D0lo,$D0lo
+ vpaddq $H0,$D0hi,$D0hi
+ vpsrldq \$8,$D2lo,$T2
+ vpsrldq \$8,$D2hi,$H2
+ vpaddq $T1,$D1lo,$D1lo
+ vpaddq $H1,$D1hi,$D1hi
+ vpermq \$0x2,$D0lo,$T0
+ vpermq \$0x2,$D0hi,$H0
+ vpaddq $T2,$D2lo,$D2lo
+ vpaddq $H2,$D2hi,$D2hi
+
+ vpermq \$0x2,$D1lo,$T1
+ vpermq \$0x2,$D1hi,$H1
+ vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
+ vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
+ vpermq \$0x2,$D2lo,$T2
+ vpermq \$0x2,$D2hi,$H2
+ vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
+ vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
+ vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
+ vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
+
+ ################################################################
+ # partial reduction
+ vpsrlq \$44,$D0lo,$tmp
+ vpsllq \$8,$D0hi,$D0hi
+ vpandq $mask44,$D0lo,$H0
+ vpaddq $tmp,$D0hi,$D0hi
+
+ vpaddq $D0hi,$D1lo,$D1lo
+
+ vpsrlq \$44,$D1lo,$tmp
+ vpsllq \$8,$D1hi,$D1hi
+ vpandq $mask44,$D1lo,$H1
+ vpaddq $tmp,$D1hi,$D1hi
+
+ vpaddq $D1hi,$D2lo,$D2lo
+
+ vpsrlq \$42,$D2lo,$tmp
+ vpsllq \$10,$D2hi,$D2hi
+ vpandq $mask42,$D2lo,$H2
+ vpaddq $tmp,$D2hi,$D2hi
+
+ vpaddq $D2hi,$H0,$H0
+ vpsllq \$2,$D2hi,$D2hi
+
+ vpaddq $D2hi,$H0,$H0
+
+ vpsrlq \$44,$H0,$tmp # additional step
+ vpandq $mask44,$H0,$H0
+
+ vpaddq $tmp,$H1,$H1
+ # at this point $len is
+ # either 4*n+2 or 0...
+ sub \$2,$len # len-=32
+ ja .Lblocks_vpmadd52_4x_do
+
+ vmovq %x#$H0,0($ctx)
+ vmovq %x#$H1,8($ctx)
+ vmovq %x#$H2,16($ctx)
+ vzeroall
+
+.Lno_data_vpmadd52_4x:
+ ret
+.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
+___
+}
+{
+########################################################################
+# As implied by its name 8x subroutine processes 8 blocks in parallel...
+# This is intermediate version, as it's used only in cases when input
+# length is either 8*n, 8*n+1 or 8*n+2...
+
+my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17));
+my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23));
+my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31));
+my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10));
+
+$code.=<<___;
+.type poly1305_blocks_vpmadd52_8x,\@function,4
+.align 32
+poly1305_blocks_vpmadd52_8x:
+ shr \$4,$len
+ jz .Lno_data_vpmadd52_8x # too short
+
+ shl \$40,$padbit
+ mov 64($ctx),%r8 # peek on power of the key
+
+ vmovdqa64 .Lx_mask44(%rip),$mask44
+ vmovdqa64 .Lx_mask42(%rip),$mask42
+
+ test %r8,%r8 # is power value impossible?
+ js .Linit_vpmadd52 # if it is, then init R[4]
+
+ vmovq 0($ctx),%x#$H0 # load current hash value
+ vmovq 8($ctx),%x#$H1
+ vmovq 16($ctx),%x#$H2
+
+.Lblocks_vpmadd52_8x:
+ ################################################################
+ # fist we calculate more key powers
+
+ vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers
+ vmovdqu64 160($ctx),$S1
+ vmovdqu64 64($ctx),$R0
+ vmovdqu64 96($ctx),$R1
+
+ vpsllq \$2,$R2,$S2 # S2 = R2*5*4
+ vpaddq $R2,$S2,$S2
+ vpsllq \$2,$S2,$S2
+
+ vpbroadcastq %x#$R2,$RR2 # broadcast 4th power
+ vpbroadcastq %x#$R0,$RR0
+ vpbroadcastq %x#$R1,$RR1
+
+ vpxorq $D0lo,$D0lo,$D0lo
+ vpmadd52luq $RR2,$S1,$D0lo
+ vpxorq $D0hi,$D0hi,$D0hi
+ vpmadd52huq $RR2,$S1,$D0hi
+ vpxorq $D1lo,$D1lo,$D1lo
+ vpmadd52luq $RR2,$S2,$D1lo
+ vpxorq $D1hi,$D1hi,$D1hi
+ vpmadd52huq $RR2,$S2,$D1hi
+ vpxorq $D2lo,$D2lo,$D2lo
+ vpmadd52luq $RR2,$R0,$D2lo
+ vpxorq $D2hi,$D2hi,$D2hi
+ vpmadd52huq $RR2,$R0,$D2hi
+
+ vpmadd52luq $RR0,$R0,$D0lo
+ vpmadd52huq $RR0,$R0,$D0hi
+ vpmadd52luq $RR0,$R1,$D1lo
+ vpmadd52huq $RR0,$R1,$D1hi
+ vpmadd52luq $RR0,$R2,$D2lo
+ vpmadd52huq $RR0,$R2,$D2hi
+
+ vpmadd52luq $RR1,$S2,$D0lo
+ vpmadd52huq $RR1,$S2,$D0hi
+ vpmadd52luq $RR1,$R0,$D1lo
+ vpmadd52huq $RR1,$R0,$D1hi
+ vpmadd52luq $RR1,$R1,$D2lo
+ vpmadd52huq $RR1,$R1,$D2hi
+
+ ################################################################
+ # partial reduction
+ vpsrlq \$44,$D0lo,$tmp
+ vpsllq \$8,$D0hi,$D0hi
+ vpandq $mask44,$D0lo,$RR0
+ vpaddq $tmp,$D0hi,$D0hi
+
+ vpaddq $D0hi,$D1lo,$D1lo
+
+ vpsrlq \$44,$D1lo,$tmp
+ vpsllq \$8,$D1hi,$D1hi
+ vpandq $mask44,$D1lo,$RR1
+ vpaddq $tmp,$D1hi,$D1hi
+
+ vpaddq $D1hi,$D2lo,$D2lo
+
+ vpsrlq \$42,$D2lo,$tmp
+ vpsllq \$10,$D2hi,$D2hi
+ vpandq $mask42,$D2lo,$RR2
+ vpaddq $tmp,$D2hi,$D2hi
+
+ vpaddq $D2hi,$RR0,$RR0
+ vpsllq \$2,$D2hi,$D2hi
+
+ vpaddq $D2hi,$RR0,$RR0
+
+ vpsrlq \$44,$RR0,$tmp # additional step
+ vpandq $mask44,$RR0,$RR0
+
+ vpaddq $tmp,$RR1,$RR1
+
+ ################################################################
+ # At this point Rx holds 1324 powers, RRx - 5768, and the goal
+ # is 15263748, which reflects how data is loaded...
+
+ vpunpcklqdq $R2,$RR2,$T2 # 3748
+ vpunpckhqdq $R2,$RR2,$R2 # 1526
+ vpunpcklqdq $R0,$RR0,$T0
+ vpunpckhqdq $R0,$RR0,$R0
+ vpunpcklqdq $R1,$RR1,$T1
+ vpunpckhqdq $R1,$RR1,$R1
+___
+######## switch to %zmm
+map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
+map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
+map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
+map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2);
+
+$code.=<<___;
+ vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748
+ vshufi64x2 \$0x44,$R0,$T0,$RR0
+ vshufi64x2 \$0x44,$R1,$T1,$RR1
+
+ vmovdqu64 16*0($inp),$T2 # load data
+ vmovdqu64 16*4($inp),$T3
+ lea 16*8($inp),$inp
+
+ vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4
+ vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4
+ vpaddq $RR2,$SS2,$SS2
+ vpaddq $RR1,$SS1,$SS1
+ vpsllq \$2,$SS2,$SS2
+ vpsllq \$2,$SS1,$SS1
+
+ vpbroadcastq $padbit,$PAD
+ vpbroadcastq %x#$mask44,$mask44
+ vpbroadcastq %x#$mask42,$mask42
+
+ vpbroadcastq %x#$SS1,$S1 # broadcast 8th power
+ vpbroadcastq %x#$SS2,$S2
+ vpbroadcastq %x#$RR0,$R0
+ vpbroadcastq %x#$RR1,$R1
+ vpbroadcastq %x#$RR2,$R2
+
+ vpunpcklqdq $T3,$T2,$T1 # transpose data
+ vpunpckhqdq $T3,$T2,$T3
+
+ # at this point 64-bit lanes are ordered as 73625140
+
+ vpsrlq \$24,$T3,$T2 # splat the data
+ vporq $PAD,$T2,$T2
+ vpaddq $T2,$H2,$H2 # accumulate input
+ vpandq $mask44,$T1,$T0
+ vpsrlq \$44,$T1,$T1
+ vpsllq \$20,$T3,$T3
+ vporq $T3,$T1,$T1
+ vpandq $mask44,$T1,$T1
+
+ sub \$8,$len
+ jz .Ltail_vpmadd52_8x
+ jmp .Loop_vpmadd52_8x
+
+.align 32
+.Loop_vpmadd52_8x:
+ #vpaddq $T2,$H2,$H2 # accumulate input
+ vpaddq $T0,$H0,$H0
+ vpaddq $T1,$H1,$H1
+
+ vpxorq $D0lo,$D0lo,$D0lo
+ vpmadd52luq $H2,$S1,$D0lo
+ vpxorq $D0hi,$D0hi,$D0hi
+ vpmadd52huq $H2,$S1,$D0hi
+ vpxorq $D1lo,$D1lo,$D1lo
+ vpmadd52luq $H2,$S2,$D1lo
+ vpxorq $D1hi,$D1hi,$D1hi
+ vpmadd52huq $H2,$S2,$D1hi
+ vpxorq $D2lo,$D2lo,$D2lo
+ vpmadd52luq $H2,$R0,$D2lo
+ vpxorq $D2hi,$D2hi,$D2hi
+ vpmadd52huq $H2,$R0,$D2hi
+
+ vmovdqu64 16*0($inp),$T2 # load data
+ vmovdqu64 16*4($inp),$T3
+ lea 16*8($inp),$inp
+ vpmadd52luq $H0,$R0,$D0lo
+ vpmadd52huq $H0,$R0,$D0hi
+ vpmadd52luq $H0,$R1,$D1lo
+ vpmadd52huq $H0,$R1,$D1hi
+ vpmadd52luq $H0,$R2,$D2lo
+ vpmadd52huq $H0,$R2,$D2hi
+
+ vpunpcklqdq $T3,$T2,$T1 # transpose data
+ vpunpckhqdq $T3,$T2,$T3
+ vpmadd52luq $H1,$S2,$D0lo
+ vpmadd52huq $H1,$S2,$D0hi
+ vpmadd52luq $H1,$R0,$D1lo
+ vpmadd52huq $H1,$R0,$D1hi
+ vpmadd52luq $H1,$R1,$D2lo
+ vpmadd52huq $H1,$R1,$D2hi
+
+ ################################################################
+ # partial reduction (interleaved with data splat)
+ vpsrlq \$44,$D0lo,$tmp
+ vpsllq \$8,$D0hi,$D0hi
+ vpandq $mask44,$D0lo,$H0
+ vpaddq $tmp,$D0hi,$D0hi
+
+ vpsrlq \$24,$T3,$T2
+ vporq $PAD,$T2,$T2
+ vpaddq $D0hi,$D1lo,$D1lo
+
+ vpsrlq \$44,$D1lo,$tmp
+ vpsllq \$8,$D1hi,$D1hi
+ vpandq $mask44,$D1lo,$H1
+ vpaddq $tmp,$D1hi,$D1hi
+
+ vpandq $mask44,$T1,$T0
+ vpsrlq \$44,$T1,$T1
+ vpsllq \$20,$T3,$T3
+ vpaddq $D1hi,$D2lo,$D2lo
+
+ vpsrlq \$42,$D2lo,$tmp
+ vpsllq \$10,$D2hi,$D2hi
+ vpandq $mask42,$D2lo,$H2
+ vpaddq $tmp,$D2hi,$D2hi
+
+ vpaddq $T2,$H2,$H2 # accumulate input
+ vpaddq $D2hi,$H0,$H0
+ vpsllq \$2,$D2hi,$D2hi
+
+ vpaddq $D2hi,$H0,$H0
+ vporq $T3,$T1,$T1
+ vpandq $mask44,$T1,$T1
+
+ vpsrlq \$44,$H0,$tmp # additional step
+ vpandq $mask44,$H0,$H0
+
+ vpaddq $tmp,$H1,$H1
+
+ sub \$8,$len # len-=128
+ jnz .Loop_vpmadd52_8x
+
+.Ltail_vpmadd52_8x:
+ #vpaddq $T2,$H2,$H2 # accumulate input
+ vpaddq $T0,$H0,$H0
+ vpaddq $T1,$H1,$H1
+
+ vpxorq $D0lo,$D0lo,$D0lo
+ vpmadd52luq $H2,$SS1,$D0lo
+ vpxorq $D0hi,$D0hi,$D0hi
+ vpmadd52huq $H2,$SS1,$D0hi
+ vpxorq $D1lo,$D1lo,$D1lo
+ vpmadd52luq $H2,$SS2,$D1lo
+ vpxorq $D1hi,$D1hi,$D1hi
+ vpmadd52huq $H2,$SS2,$D1hi
+ vpxorq $D2lo,$D2lo,$D2lo
+ vpmadd52luq $H2,$RR0,$D2lo
+ vpxorq $D2hi,$D2hi,$D2hi
+ vpmadd52huq $H2,$RR0,$D2hi
+
+ vpmadd52luq $H0,$RR0,$D0lo
+ vpmadd52huq $H0,$RR0,$D0hi
+ vpmadd52luq $H0,$RR1,$D1lo
+ vpmadd52huq $H0,$RR1,$D1hi
+ vpmadd52luq $H0,$RR2,$D2lo
+ vpmadd52huq $H0,$RR2,$D2hi
+
+ vpmadd52luq $H1,$SS2,$D0lo
+ vpmadd52huq $H1,$SS2,$D0hi
+ vpmadd52luq $H1,$RR0,$D1lo
+ vpmadd52huq $H1,$RR0,$D1hi
+ vpmadd52luq $H1,$RR1,$D2lo
+ vpmadd52huq $H1,$RR1,$D2hi
+
+ ################################################################
+ # horizontal addition
+
+ mov \$1,%eax
+ kmovw %eax,%k1
+ vpsrldq \$8,$D0lo,$T0
+ vpsrldq \$8,$D0hi,$H0
+ vpsrldq \$8,$D1lo,$T1
+ vpsrldq \$8,$D1hi,$H1
+ vpaddq $T0,$D0lo,$D0lo
+ vpaddq $H0,$D0hi,$D0hi
+ vpsrldq \$8,$D2lo,$T2
+ vpsrldq \$8,$D2hi,$H2
+ vpaddq $T1,$D1lo,$D1lo
+ vpaddq $H1,$D1hi,$D1hi
+ vpermq \$0x2,$D0lo,$T0
+ vpermq \$0x2,$D0hi,$H0
+ vpaddq $T2,$D2lo,$D2lo
+ vpaddq $H2,$D2hi,$D2hi
+
+ vpermq \$0x2,$D1lo,$T1
+ vpermq \$0x2,$D1hi,$H1
+ vpaddq $T0,$D0lo,$D0lo
+ vpaddq $H0,$D0hi,$D0hi
+ vpermq \$0x2,$D2lo,$T2
+ vpermq \$0x2,$D2hi,$H2
+ vpaddq $T1,$D1lo,$D1lo
+ vpaddq $H1,$D1hi,$D1hi
+ vextracti64x4 \$1,$D0lo,%y#$T0
+ vextracti64x4 \$1,$D0hi,%y#$H0
+ vpaddq $T2,$D2lo,$D2lo
+ vpaddq $H2,$D2hi,$D2hi
+
+ vextracti64x4 \$1,$D1lo,%y#$T1
+ vextracti64x4 \$1,$D1hi,%y#$H1
+ vextracti64x4 \$1,$D2lo,%y#$T2
+ vextracti64x4 \$1,$D2hi,%y#$H2
+___
+######## switch back to %ymm
+map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2);
+map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi);
+map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD);
+
+$code.=<<___;
+ vpaddq $T0,$D0lo,${D0lo}{%k1}{z}
+ vpaddq $H0,$D0hi,${D0hi}{%k1}{z}
+ vpaddq $T1,$D1lo,${D1lo}{%k1}{z}
+ vpaddq $H1,$D1hi,${D1hi}{%k1}{z}
+ vpaddq $T2,$D2lo,${D2lo}{%k1}{z}
+ vpaddq $H2,$D2hi,${D2hi}{%k1}{z}
+
+ ################################################################
+ # partial reduction
+ vpsrlq \$44,$D0lo,$tmp
+ vpsllq \$8,$D0hi,$D0hi
+ vpandq $mask44,$D0lo,$H0
+ vpaddq $tmp,$D0hi,$D0hi
+
+ vpaddq $D0hi,$D1lo,$D1lo
+
+ vpsrlq \$44,$D1lo,$tmp
+ vpsllq \$8,$D1hi,$D1hi
+ vpandq $mask44,$D1lo,$H1
+ vpaddq $tmp,$D1hi,$D1hi
+
+ vpaddq $D1hi,$D2lo,$D2lo
+
+ vpsrlq \$42,$D2lo,$tmp
+ vpsllq \$10,$D2hi,$D2hi
+ vpandq $mask42,$D2lo,$H2
+ vpaddq $tmp,$D2hi,$D2hi
+
+ vpaddq $D2hi,$H0,$H0
+ vpsllq \$2,$D2hi,$D2hi
+
+ vpaddq $D2hi,$H0,$H0
+
+ vpsrlq \$44,$H0,$tmp # additional step
+ vpandq $mask44,$H0,$H0
+
+ vpaddq $tmp,$H1,$H1
+
+ ################################################################
+
+ vmovq %x#$H0,0($ctx)
+ vmovq %x#$H1,8($ctx)
+ vmovq %x#$H2,16($ctx)
+ vzeroall
+
+.Lno_data_vpmadd52_8x:
+ ret
+.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
+___
+}
+$code.=<<___;
+.type poly1305_emit_base2_44,\@function,3
+.align 32
+poly1305_emit_base2_44:
+ mov 0($ctx),%r8 # load hash value
+ mov 8($ctx),%r9
+ mov 16($ctx),%r10
+
+ mov %r9,%rax
+ shr \$20,%r9
+ shl \$44,%rax
+ mov %r10,%rcx
+ shr \$40,%r10
+ shl \$24,%rcx
+
+ add %rax,%r8
+ adc %rcx,%r9
+ adc \$0,%r10
+
+ mov %r8,%rax
+ add \$5,%r8 # compare to modulus
+ mov %r9,%rcx
+ adc \$0,%r9
+ adc \$0,%r10
+ shr \$2,%r10 # did 130-bit value overflow?
+ cmovnz %r8,%rax
+ cmovnz %r9,%rcx
+
+ add 0($nonce),%rax # accumulate nonce
+ adc 8($nonce),%rcx
+ mov %rax,0($mac) # write result
+ mov %rcx,8($mac)
+
+ ret
+.size poly1305_emit_base2_44,.-poly1305_emit_base2_44
+___
+} } }
+}
+
+if (!$kernel)
+{ # chacha20-poly1305 helpers
+my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
+ ("%rdi","%rsi","%rdx","%rcx"); # Unix order
+$code.=<<___;
+.globl xor128_encrypt_n_pad
+.type xor128_encrypt_n_pad,\@abi-omnipotent
+.align 16
+xor128_encrypt_n_pad:
+ sub $otp,$inp
+ sub $otp,$out
+ mov $len,%r10 # put len aside
+ shr \$4,$len # len / 16
+ jz .Ltail_enc
+ nop
+.Loop_enc_xmm:
+ movdqu ($inp,$otp),%xmm0
+ pxor ($otp),%xmm0
+ movdqu %xmm0,($out,$otp)
+ movdqa %xmm0,($otp)
+ lea 16($otp),$otp
+ dec $len
+ jnz .Loop_enc_xmm
+
+ and \$15,%r10 # len % 16
+ jz .Ldone_enc
+
+.Ltail_enc:
+ mov \$16,$len
+ sub %r10,$len
+ xor %eax,%eax
+.Loop_enc_byte:
+ mov ($inp,$otp),%al
+ xor ($otp),%al
+ mov %al,($out,$otp)
+ mov %al,($otp)
+ lea 1($otp),$otp
+ dec %r10
+ jnz .Loop_enc_byte
+
+ xor %eax,%eax
+.Loop_enc_pad:
+ mov %al,($otp)
+ lea 1($otp),$otp
+ dec $len
+ jnz .Loop_enc_pad
+
+.Ldone_enc:
+ mov $otp,%rax
+ ret
+.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
+
+.globl xor128_decrypt_n_pad
+.type xor128_decrypt_n_pad,\@abi-omnipotent
+.align 16
+xor128_decrypt_n_pad:
+ sub $otp,$inp
+ sub $otp,$out
+ mov $len,%r10 # put len aside
+ shr \$4,$len # len / 16
+ jz .Ltail_dec
+ nop
+.Loop_dec_xmm:
+ movdqu ($inp,$otp),%xmm0
+ movdqa ($otp),%xmm1
+ pxor %xmm0,%xmm1
+ movdqu %xmm1,($out,$otp)
+ movdqa %xmm0,($otp)
+ lea 16($otp),$otp
+ dec $len
+ jnz .Loop_dec_xmm
+
+ pxor %xmm1,%xmm1
+ and \$15,%r10 # len % 16
+ jz .Ldone_dec
+
+.Ltail_dec:
+ mov \$16,$len
+ sub %r10,$len
+ xor %eax,%eax
+ xor %r11,%r11
+.Loop_dec_byte:
+ mov ($inp,$otp),%r11b
+ mov ($otp),%al
+ xor %r11b,%al
+ mov %al,($out,$otp)
+ mov %r11b,($otp)
+ lea 1($otp),$otp
+ dec %r10
+ jnz .Loop_dec_byte
+
+ xor %eax,%eax
+.Loop_dec_pad:
+ mov %al,($otp)
+ lea 1($otp),$otp
+ dec $len
+ jnz .Loop_dec_pad
+
+.Ldone_dec:
+ mov $otp,%rax
+ ret
+.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
+___
+}
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+# CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+.type se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<.Lprologue
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=.Lepilogue
+ jae .Lcommon_seh_tail
+
+ lea 48(%rax),%rax
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R14
+
+ jmp .Lcommon_seh_tail
+.size se_handler,.-se_handler
+
+.type avx_handler,\@abi-omnipotent
+.align 16
+avx_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ mov 208($context),%rax # pull context->R11
+
+ lea 0x50(%rax),%rsi
+ lea 0xf8(%rax),%rax
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+
+.Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+.size avx_handler,.-avx_handler
+
+.section .pdata
+.align 4
+ .rva .LSEH_begin_poly1305_init_x86_64
+ .rva .LSEH_end_poly1305_init_x86_64
+ .rva .LSEH_info_poly1305_init_x86_64
+
+ .rva .LSEH_begin_poly1305_blocks_x86_64
+ .rva .LSEH_end_poly1305_blocks_x86_64
+ .rva .LSEH_info_poly1305_blocks_x86_64
+
+ .rva .LSEH_begin_poly1305_emit_x86_64
+ .rva .LSEH_end_poly1305_emit_x86_64
+ .rva .LSEH_info_poly1305_emit_x86_64
+___
+$code.=<<___ if ($avx);
+ .rva .LSEH_begin_poly1305_blocks_avx
+ .rva .Lbase2_64_avx
+ .rva .LSEH_info_poly1305_blocks_avx_1
+
+ .rva .Lbase2_64_avx
+ .rva .Leven_avx
+ .rva .LSEH_info_poly1305_blocks_avx_2
+
+ .rva .Leven_avx
+ .rva .LSEH_end_poly1305_blocks_avx
+ .rva .LSEH_info_poly1305_blocks_avx_3
+
+ .rva .LSEH_begin_poly1305_emit_avx
+ .rva .LSEH_end_poly1305_emit_avx
+ .rva .LSEH_info_poly1305_emit_avx
+___
+$code.=<<___ if ($avx>1);
+ .rva .LSEH_begin_poly1305_blocks_avx2
+ .rva .Lbase2_64_avx2
+ .rva .LSEH_info_poly1305_blocks_avx2_1
+
+ .rva .Lbase2_64_avx2
+ .rva .Leven_avx2
+ .rva .LSEH_info_poly1305_blocks_avx2_2
+
+ .rva .Leven_avx2
+ .rva .LSEH_end_poly1305_blocks_avx2
+ .rva .LSEH_info_poly1305_blocks_avx2_3
+___
+$code.=<<___ if ($avx>2);
+ .rva .LSEH_begin_poly1305_blocks_avx512
+ .rva .LSEH_end_poly1305_blocks_avx512
+ .rva .LSEH_info_poly1305_blocks_avx512
+___
+$code.=<<___;
+.section .xdata
+.align 8
+.LSEH_info_poly1305_init_x86_64:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64
+
+.LSEH_info_poly1305_blocks_x86_64:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lblocks_body,.Lblocks_epilogue
+
+.LSEH_info_poly1305_emit_x86_64:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64
+___
+$code.=<<___ if ($avx);
+.LSEH_info_poly1305_blocks_avx_1:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[]
+
+.LSEH_info_poly1305_blocks_avx_2:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[]
+
+.LSEH_info_poly1305_blocks_avx_3:
+ .byte 9,0,0,0
+ .rva avx_handler
+ .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[]
+
+.LSEH_info_poly1305_emit_avx:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
+___
+$code.=<<___ if ($avx>1);
+.LSEH_info_poly1305_blocks_avx2_1:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[]
+
+.LSEH_info_poly1305_blocks_avx2_2:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[]
+
+.LSEH_info_poly1305_blocks_avx2_3:
+ .byte 9,0,0,0
+ .rva avx_handler
+ .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[]
+___
+$code.=<<___ if ($avx>2);
+.LSEH_info_poly1305_blocks_avx512:
+ .byte 9,0,0,0
+ .rva avx_handler
+ .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[]
+___
+}
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/\/\// and !/^$/);
+ print;
+}
+close SELF;
+
+foreach (split('\n',$code)) {
+ s/\`([^\`]*)\`/eval($1)/ge;
+ s/%r([a-z]+)#d/%e$1/g;
+ s/%r([0-9]+)#d/%r$1d/g;
+ s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;
+
+ if ($kernel) {
+ s/(^\.type.*),[0-9]+$/\1/;
+ s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/;
+ next if /^\.cfi.*/;
+ }
+
+ print $_,"\n";
+}
+close STDOUT;
diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c
index 0cc4537e6617..79bb58737d52 100644
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -1,8 +1,6 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
+// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
- * Poly1305 authenticator algorithm, RFC7539, SIMD glue code
- *
- * Copyright (C) 2015 Martin Willi
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*/
#include <crypto/algapi.h>
@@ -13,108 +11,166 @@
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/module.h>
+#include <asm/intel-family.h>
#include <asm/simd.h>
-asmlinkage void poly1305_block_sse2(u32 *h, const u8 *src,
- const u32 *r, unsigned int blocks);
-asmlinkage void poly1305_2block_sse2(u32 *h, const u8 *src, const u32 *r,
- unsigned int blocks, const u32 *u);
-asmlinkage void poly1305_4block_avx2(u32 *h, const u8 *src, const u32 *r,
- unsigned int blocks, const u32 *u);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_simd);
+asmlinkage void poly1305_init_x86_64(void *ctx,
+ const u8 key[POLY1305_KEY_SIZE]);
+asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp,
+ const size_t len, const u32 padbit);
+asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
+ const u32 nonce[4]);
+asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
+ const u32 nonce[4]);
+asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len,
+ const u32 padbit);
+asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len,
+ const u32 padbit);
+asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp,
+ const size_t len, const u32 padbit);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512);
+
+struct poly1305_arch_internal {
+ union {
+ struct {
+ u32 h[5];
+ u32 is_base2_26;
+ };
+ u64 hs[3];
+ };
+ u64 r[2];
+ u64 pad;
+ struct { u32 r2, r1, r4, r3; } rn[9];
+};
-static void poly1305_simd_mult(u32 *a, const u32 *b)
+/* The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit
+ * the unfortunate situation of using AVX and then having to go back to scalar
+ * -- because the user is silly and has called the update function from two
+ * separate contexts -- then we need to convert back to the original base before
+ * proceeding. It is possible to reason that the initial reduction below is
+ * sufficient given the implementation invariants. However, for an avoidance of
+ * doubt and because this is not performance critical, we do the full reduction
+ * anyway. Z3 proof of below function: https://xn--4db.cc/ltPtHCKN/py
+ */
+static void convert_to_base2_64(void *ctx)
{
- u8 m[POLY1305_BLOCK_SIZE];
-
- memset(m, 0, sizeof(m));
- /* The poly1305 block function adds a hi-bit to the accumulator which
- * we don't need for key multiplication; compensate for it. */
- a[4] -= 1 << 24;
- poly1305_block_sse2(a, m, b, 1);
+ struct poly1305_arch_internal *state = ctx;
+ u32 cy;
+
+ if (!state->is_base2_26)
+ return;
+
+ cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy;
+ cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy;
+ cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy;
+ cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy;
+ state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0];
+ state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12);
+ state->hs[2] = state->h[4] >> 24;
+#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1))
+ cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL);
+ state->hs[2] &= 3;
+ state->hs[0] += cy;
+ state->hs[1] += (cy = ULT(state->hs[0], cy));
+ state->hs[2] += ULT(state->hs[1], cy);
+#undef ULT
+ state->is_base2_26 = 0;
}
-static unsigned int poly1305_scalar_blocks(struct poly1305_desc_ctx *dctx,
- const u8 *src, unsigned int srclen)
+static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_KEY_SIZE])
{
- unsigned int datalen;
-
- if (unlikely(!dctx->sset)) {
- datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
- src += srclen - datalen;
- srclen = datalen;
- }
- if (srclen >= POLY1305_BLOCK_SIZE) {
- poly1305_core_blocks(&dctx->h, dctx->r, src,
- srclen / POLY1305_BLOCK_SIZE, 1);
- srclen %= POLY1305_BLOCK_SIZE;
- }
- return srclen;
+ poly1305_init_x86_64(ctx, key);
}
-static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx,
- const u8 *src, unsigned int srclen)
+static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
+ const u32 padbit)
{
- unsigned int blocks, datalen;
-
- if (unlikely(!dctx->sset)) {
- datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
- src += srclen - datalen;
- srclen = datalen;
+ struct poly1305_arch_internal *state = ctx;
+
+ /* SIMD disables preemption, so relax after processing each page. */
+ BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE ||
+ PAGE_SIZE % POLY1305_BLOCK_SIZE);
+
+ if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) ||
+ (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) ||
+ !crypto_simd_usable()) {
+ convert_to_base2_64(ctx);
+ poly1305_blocks_x86_64(ctx, inp, len, padbit);
+ return;
}
- if (IS_ENABLED(CONFIG_AS_AVX2) &&
- static_branch_likely(&poly1305_use_avx2) &&
- srclen >= POLY1305_BLOCK_SIZE * 4) {
- if (unlikely(dctx->rset < 4)) {
- if (dctx->rset < 2) {
- dctx->r[1] = dctx->r[0];
- poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r);
- }
- dctx->r[2] = dctx->r[1];
- poly1305_simd_mult(dctx->r[2].r, dctx->r[0].r);
- dctx->r[3] = dctx->r[2];
- poly1305_simd_mult(dctx->r[3].r, dctx->r[0].r);
- dctx->rset = 4;
- }
- blocks = srclen / (POLY1305_BLOCK_SIZE * 4);
- poly1305_4block_avx2(dctx->h.h, src, dctx->r[0].r, blocks,
- dctx->r[1].r);
- src += POLY1305_BLOCK_SIZE * 4 * blocks;
- srclen -= POLY1305_BLOCK_SIZE * 4 * blocks;
+ for (;;) {
+ const size_t bytes = min_t(size_t, len, PAGE_SIZE);
+
+ kernel_fpu_begin();
+ if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512))
+ poly1305_blocks_avx512(ctx, inp, bytes, padbit);
+ else if (IS_ENABLED(CONFIG_AS_AVX2) && static_branch_likely(&poly1305_use_avx2))
+ poly1305_blocks_avx2(ctx, inp, bytes, padbit);
+ else
+ poly1305_blocks_avx(ctx, inp, bytes, padbit);
+ kernel_fpu_end();
+ len -= bytes;
+ if (!len)
+ break;
+ inp += bytes;
}
+}
- if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) {
- if (unlikely(dctx->rset < 2)) {
- dctx->r[1] = dctx->r[0];
- poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r);
- dctx->rset = 2;
- }
- blocks = srclen / (POLY1305_BLOCK_SIZE * 2);
- poly1305_2block_sse2(dctx->h.h, src, dctx->r[0].r,
- blocks, dctx->r[1].r);
- src += POLY1305_BLOCK_SIZE * 2 * blocks;
- srclen -= POLY1305_BLOCK_SIZE * 2 * blocks;
- }
- if (srclen >= POLY1305_BLOCK_SIZE) {
- poly1305_block_sse2(dctx->h.h, src, dctx->r[0].r, 1);
- srclen -= POLY1305_BLOCK_SIZE;
- }
- return srclen;
+static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
+ const u32 nonce[4])
+{
+ if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx))
+ poly1305_emit_x86_64(ctx, mac, nonce);
+ else
+ poly1305_emit_avx(ctx, mac, nonce);
}
-void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key)
+void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
{
- poly1305_init_generic(desc, key);
+ poly1305_simd_init(&dctx->h, key);
+ dctx->s[0] = get_unaligned_le32(&key[16]);
+ dctx->s[1] = get_unaligned_le32(&key[20]);
+ dctx->s[2] = get_unaligned_le32(&key[24]);
+ dctx->s[3] = get_unaligned_le32(&key[28]);
+ dctx->buflen = 0;
+ dctx->sset = true;
}
EXPORT_SYMBOL(poly1305_init_arch);
+static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx,
+ const u8 *inp, unsigned int len)
+{
+ unsigned int acc = 0;
+ if (unlikely(!dctx->sset)) {
+ if (!dctx->rset && len >= POLY1305_BLOCK_SIZE) {
+ poly1305_simd_init(&dctx->h, inp);
+ inp += POLY1305_BLOCK_SIZE;
+ len -= POLY1305_BLOCK_SIZE;
+ acc += POLY1305_BLOCK_SIZE;
+ dctx->rset = 1;
+ }
+ if (len >= POLY1305_BLOCK_SIZE) {
+ dctx->s[0] = get_unaligned_le32(&inp[0]);
+ dctx->s[1] = get_unaligned_le32(&inp[4]);
+ dctx->s[2] = get_unaligned_le32(&inp[8]);
+ dctx->s[3] = get_unaligned_le32(&inp[12]);
+ inp += POLY1305_BLOCK_SIZE;
+ len -= POLY1305_BLOCK_SIZE;
+ acc += POLY1305_BLOCK_SIZE;
+ dctx->sset = true;
+ }
+ }
+ return acc;
+}
+
void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
unsigned int srclen)
{
- unsigned int bytes;
+ unsigned int bytes, used;
if (unlikely(dctx->buflen)) {
bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen);
@@ -124,31 +180,19 @@ void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
dctx->buflen += bytes;
if (dctx->buflen == POLY1305_BLOCK_SIZE) {
- if (static_branch_likely(&poly1305_use_simd) &&
- likely(crypto_simd_usable())) {
- kernel_fpu_begin();
- poly1305_simd_blocks(dctx, dctx->buf,
- POLY1305_BLOCK_SIZE);
- kernel_fpu_end();
- } else {
- poly1305_scalar_blocks(dctx, dctx->buf,
- POLY1305_BLOCK_SIZE);
- }
+ if (likely(!crypto_poly1305_setdctxkey(dctx, dctx->buf, POLY1305_BLOCK_SIZE)))
+ poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1);
dctx->buflen = 0;
}
}
if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
- if (static_branch_likely(&poly1305_use_simd) &&
- likely(crypto_simd_usable())) {
- kernel_fpu_begin();
- bytes = poly1305_simd_blocks(dctx, src, srclen);
- kernel_fpu_end();
- } else {
- bytes = poly1305_scalar_blocks(dctx, src, srclen);
- }
- src += srclen - bytes;
- srclen = bytes;
+ bytes = round_down(srclen, POLY1305_BLOCK_SIZE);
+ srclen -= bytes;
+ used = crypto_poly1305_setdctxkey(dctx, src, bytes);
+ if (likely(bytes - used))
+ poly1305_simd_blocks(&dctx->h, src + used, bytes - used, 1);
+ src += bytes;
}
if (unlikely(srclen)) {
@@ -158,9 +202,17 @@ void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
}
EXPORT_SYMBOL(poly1305_update_arch);
-void poly1305_final_arch(struct poly1305_desc_ctx *desc, u8 *digest)
+void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
{
- poly1305_final_generic(desc, digest);
+ if (unlikely(dctx->buflen)) {
+ dctx->buf[dctx->buflen++] = 1;
+ memset(dctx->buf + dctx->buflen, 0,
+ POLY1305_BLOCK_SIZE - dctx->buflen);
+ poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
+ }
+
+ poly1305_simd_emit(&dctx->h, dst, dctx->s);
+ *dctx = (struct poly1305_desc_ctx){};
}
EXPORT_SYMBOL(poly1305_final_arch);
@@ -168,38 +220,34 @@ static int crypto_poly1305_init(struct shash_desc *desc)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
- poly1305_core_init(&dctx->h);
- dctx->buflen = 0;
- dctx->rset = 0;
- dctx->sset = false;
-
+ *dctx = (struct poly1305_desc_ctx){};
return 0;
}
-static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
+static int crypto_poly1305_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
- if (unlikely(!dctx->sset))
- return -ENOKEY;
-
- poly1305_final_generic(dctx, dst);
+ poly1305_update_arch(dctx, src, srclen);
return 0;
}
-static int poly1305_simd_update(struct shash_desc *desc,
- const u8 *src, unsigned int srclen)
+static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
{
struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
- poly1305_update_arch(dctx, src, srclen);
+ if (unlikely(!dctx->sset))
+ return -ENOKEY;
+
+ poly1305_final_arch(dctx, dst);
return 0;
}
static struct shash_alg alg = {
.digestsize = POLY1305_DIGEST_SIZE,
.init = crypto_poly1305_init,
- .update = poly1305_simd_update,
+ .update = crypto_poly1305_update,
.final = crypto_poly1305_final,
.descsize = sizeof(struct poly1305_desc_ctx),
.base = {
@@ -213,17 +261,19 @@ static struct shash_alg alg = {
static int __init poly1305_simd_mod_init(void)
{
- if (!boot_cpu_has(X86_FEATURE_XMM2))
- return 0;
-
- static_branch_enable(&poly1305_use_simd);
-
- if (IS_ENABLED(CONFIG_AS_AVX2) &&
- boot_cpu_has(X86_FEATURE_AVX) &&
+ if (IS_ENABLED(CONFIG_AS_AVX) && boot_cpu_has(X86_FEATURE_AVX) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
+ static_branch_enable(&poly1305_use_avx);
+ if (IS_ENABLED(CONFIG_AS_AVX2) && boot_cpu_has(X86_FEATURE_AVX) &&
boot_cpu_has(X86_FEATURE_AVX2) &&
cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
static_branch_enable(&poly1305_use_avx2);
-
+ if (IS_ENABLED(CONFIG_AS_AVX512) && boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) &&
+ /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */
+ boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X)
+ static_branch_enable(&poly1305_use_avx512);
return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? crypto_register_shash(&alg) : 0;
}
@@ -237,7 +287,7 @@ module_init(poly1305_simd_mod_init);
module_exit(poly1305_simd_mod_exit);
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
+MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
MODULE_DESCRIPTION("Poly1305 authenticator");
MODULE_ALIAS_CRYPTO("poly1305");
MODULE_ALIAS_CRYPTO("poly1305-simd");
diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c
index 13fd8d3d2da0..f973ace44ad3 100644
--- a/arch/x86/crypto/serpent_avx2_glue.c
+++ b/arch/x86/crypto/serpent_avx2_glue.c
@@ -19,18 +19,16 @@
#define SERPENT_AVX2_PARALLEL_BLOCKS 16
/* 16-way AVX2 parallel cipher functions */
-asmlinkage void serpent_ecb_enc_16way(struct serpent_ctx *ctx, u8 *dst,
- const u8 *src);
-asmlinkage void serpent_ecb_dec_16way(struct serpent_ctx *ctx, u8 *dst,
- const u8 *src);
-asmlinkage void serpent_cbc_dec_16way(void *ctx, u128 *dst, const u128 *src);
+asmlinkage void serpent_ecb_enc_16way(const void *ctx, u8 *dst, const u8 *src);
+asmlinkage void serpent_ecb_dec_16way(const void *ctx, u8 *dst, const u8 *src);
+asmlinkage void serpent_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src);
-asmlinkage void serpent_ctr_16way(void *ctx, u128 *dst, const u128 *src,
+asmlinkage void serpent_ctr_16way(const void *ctx, u8 *dst, const u8 *src,
le128 *iv);
-asmlinkage void serpent_xts_enc_16way(struct serpent_ctx *ctx, u8 *dst,
- const u8 *src, le128 *iv);
-asmlinkage void serpent_xts_dec_16way(struct serpent_ctx *ctx, u8 *dst,
- const u8 *src, le128 *iv);
+asmlinkage void serpent_xts_enc_16way(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
+asmlinkage void serpent_xts_dec_16way(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
static int serpent_setkey_skcipher(struct crypto_skcipher *tfm,
const u8 *key, unsigned int keylen)
@@ -44,13 +42,13 @@ static const struct common_glue_ctx serpent_enc = {
.funcs = { {
.num_blocks = 16,
- .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_16way) }
+ .fn_u = { .ecb = serpent_ecb_enc_16way }
}, {
.num_blocks = 8,
- .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_8way_avx) }
+ .fn_u = { .ecb = serpent_ecb_enc_8way_avx }
}, {
.num_blocks = 1,
- .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) }
+ .fn_u = { .ecb = __serpent_encrypt }
} }
};
@@ -60,13 +58,13 @@ static const struct common_glue_ctx serpent_ctr = {
.funcs = { {
.num_blocks = 16,
- .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_16way) }
+ .fn_u = { .ctr = serpent_ctr_16way }
}, {
.num_blocks = 8,
- .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) }
+ .fn_u = { .ctr = serpent_ctr_8way_avx }
}, {
.num_blocks = 1,
- .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(__serpent_crypt_ctr) }
+ .fn_u = { .ctr = __serpent_crypt_ctr }
} }
};
@@ -76,13 +74,13 @@ static const struct common_glue_ctx serpent_enc_xts = {
.funcs = { {
.num_blocks = 16,
- .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_16way) }
+ .fn_u = { .xts = serpent_xts_enc_16way }
}, {
.num_blocks = 8,
- .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_8way_avx) }
+ .fn_u = { .xts = serpent_xts_enc_8way_avx }
}, {
.num_blocks = 1,
- .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc) }
+ .fn_u = { .xts = serpent_xts_enc }
} }
};
@@ -92,13 +90,13 @@ static const struct common_glue_ctx serpent_dec = {
.funcs = { {
.num_blocks = 16,
- .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_16way) }
+ .fn_u = { .ecb = serpent_ecb_dec_16way }
}, {
.num_blocks = 8,
- .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_8way_avx) }
+ .fn_u = { .ecb = serpent_ecb_dec_8way_avx }
}, {
.num_blocks = 1,
- .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) }
+ .fn_u = { .ecb = __serpent_decrypt }
} }
};
@@ -108,13 +106,13 @@ static const struct common_glue_ctx serpent_dec_cbc = {
.funcs = { {
.num_blocks = 16,
- .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_16way) }
+ .fn_u = { .cbc = serpent_cbc_dec_16way }
}, {
.num_blocks = 8,
- .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_8way_avx) }
+ .fn_u = { .cbc = serpent_cbc_dec_8way_avx }
}, {
.num_blocks = 1,
- .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) }
+ .fn_u = { .cbc = __serpent_decrypt }
} }
};
@@ -124,13 +122,13 @@ static const struct common_glue_ctx serpent_dec_xts = {
.funcs = { {
.num_blocks = 16,
- .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_16way) }
+ .fn_u = { .xts = serpent_xts_dec_16way }
}, {
.num_blocks = 8,
- .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_8way_avx) }
+ .fn_u = { .xts = serpent_xts_dec_8way_avx }
}, {
.num_blocks = 1,
- .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec) }
+ .fn_u = { .xts = serpent_xts_dec }
} }
};
@@ -146,8 +144,7 @@ static int ecb_decrypt(struct skcipher_request *req)
static int cbc_encrypt(struct skcipher_request *req)
{
- return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(__serpent_encrypt),
- req);
+ return glue_cbc_encrypt_req_128bit(__serpent_encrypt, req);
}
static int cbc_decrypt(struct skcipher_request *req)
@@ -166,8 +163,8 @@ static int xts_encrypt(struct skcipher_request *req)
struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
return glue_xts_req_128bit(&serpent_enc_xts, req,
- XTS_TWEAK_CAST(__serpent_encrypt),
- &ctx->tweak_ctx, &ctx->crypt_ctx, false);
+ __serpent_encrypt, &ctx->tweak_ctx,
+ &ctx->crypt_ctx, false);
}
static int xts_decrypt(struct skcipher_request *req)
@@ -176,8 +173,8 @@ static int xts_decrypt(struct skcipher_request *req)
struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
return glue_xts_req_128bit(&serpent_dec_xts, req,
- XTS_TWEAK_CAST(__serpent_encrypt),
- &ctx->tweak_ctx, &ctx->crypt_ctx, true);
+ __serpent_encrypt, &ctx->tweak_ctx,
+ &ctx->crypt_ctx, true);
}
static struct skcipher_alg serpent_algs[] = {
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
index 7d3dca38a5a2..7806d1cbe854 100644
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -20,33 +20,35 @@
#include <asm/crypto/serpent-avx.h>
/* 8-way parallel cipher functions */
-asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+asmlinkage void serpent_ecb_enc_8way_avx(const void *ctx, u8 *dst,
const u8 *src);
EXPORT_SYMBOL_GPL(serpent_ecb_enc_8way_avx);
-asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+asmlinkage void serpent_ecb_dec_8way_avx(const void *ctx, u8 *dst,
const u8 *src);
EXPORT_SYMBOL_GPL(serpent_ecb_dec_8way_avx);
-asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+asmlinkage void serpent_cbc_dec_8way_avx(const void *ctx, u8 *dst,
const u8 *src);
EXPORT_SYMBOL_GPL(serpent_cbc_dec_8way_avx);
-asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst,
- const u8 *src, le128 *iv);
+asmlinkage void serpent_ctr_8way_avx(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
EXPORT_SYMBOL_GPL(serpent_ctr_8way_avx);
-asmlinkage void serpent_xts_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+asmlinkage void serpent_xts_enc_8way_avx(const void *ctx, u8 *dst,
const u8 *src, le128 *iv);
EXPORT_SYMBOL_GPL(serpent_xts_enc_8way_avx);
-asmlinkage void serpent_xts_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+asmlinkage void serpent_xts_dec_8way_avx(const void *ctx, u8 *dst,
const u8 *src, le128 *iv);
EXPORT_SYMBOL_GPL(serpent_xts_dec_8way_avx);
-void __serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+void __serpent_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv)
{
be128 ctrblk;
+ u128 *dst = (u128 *)d;
+ const u128 *src = (const u128 *)s;
le128_to_be128(&ctrblk, iv);
le128_inc(iv);
@@ -56,17 +58,15 @@ void __serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
}
EXPORT_SYMBOL_GPL(__serpent_crypt_ctr);
-void serpent_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+void serpent_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
{
- glue_xts_crypt_128bit_one(ctx, dst, src, iv,
- GLUE_FUNC_CAST(__serpent_encrypt));
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv, __serpent_encrypt);
}
EXPORT_SYMBOL_GPL(serpent_xts_enc);
-void serpent_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+void serpent_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
{
- glue_xts_crypt_128bit_one(ctx, dst, src, iv,
- GLUE_FUNC_CAST(__serpent_decrypt));
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv, __serpent_decrypt);
}
EXPORT_SYMBOL_GPL(serpent_xts_dec);
@@ -102,10 +102,10 @@ static const struct common_glue_ctx serpent_enc = {
.funcs = { {
.num_blocks = SERPENT_PARALLEL_BLOCKS,
- .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_8way_avx) }
+ .fn_u = { .ecb = serpent_ecb_enc_8way_avx }
}, {
.num_blocks = 1,
- .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) }
+ .fn_u = { .ecb = __serpent_encrypt }
} }
};
@@ -115,10 +115,10 @@ static const struct common_glue_ctx serpent_ctr = {
.funcs = { {
.num_blocks = SERPENT_PARALLEL_BLOCKS,
- .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) }
+ .fn_u = { .ctr = serpent_ctr_8way_avx }
}, {
.num_blocks = 1,
- .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(__serpent_crypt_ctr) }
+ .fn_u = { .ctr = __serpent_crypt_ctr }
} }
};
@@ -128,10 +128,10 @@ static const struct common_glue_ctx serpent_enc_xts = {
.funcs = { {
.num_blocks = SERPENT_PARALLEL_BLOCKS,
- .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc_8way_avx) }
+ .fn_u = { .xts = serpent_xts_enc_8way_avx }
}, {
.num_blocks = 1,
- .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_enc) }
+ .fn_u = { .xts = serpent_xts_enc }
} }
};
@@ -141,10 +141,10 @@ static const struct common_glue_ctx serpent_dec = {
.funcs = { {
.num_blocks = SERPENT_PARALLEL_BLOCKS,
- .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_8way_avx) }
+ .fn_u = { .ecb = serpent_ecb_dec_8way_avx }
}, {
.num_blocks = 1,
- .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) }
+ .fn_u = { .ecb = __serpent_decrypt }
} }
};
@@ -154,10 +154,10 @@ static const struct common_glue_ctx serpent_dec_cbc = {
.funcs = { {
.num_blocks = SERPENT_PARALLEL_BLOCKS,
- .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_8way_avx) }
+ .fn_u = { .cbc = serpent_cbc_dec_8way_avx }
}, {
.num_blocks = 1,
- .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) }
+ .fn_u = { .cbc = __serpent_decrypt }
} }
};
@@ -167,10 +167,10 @@ static const struct common_glue_ctx serpent_dec_xts = {
.funcs = { {
.num_blocks = SERPENT_PARALLEL_BLOCKS,
- .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec_8way_avx) }
+ .fn_u = { .xts = serpent_xts_dec_8way_avx }
}, {
.num_blocks = 1,
- .fn_u = { .xts = GLUE_XTS_FUNC_CAST(serpent_xts_dec) }
+ .fn_u = { .xts = serpent_xts_dec }
} }
};
@@ -186,8 +186,7 @@ static int ecb_decrypt(struct skcipher_request *req)
static int cbc_encrypt(struct skcipher_request *req)
{
- return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(__serpent_encrypt),
- req);
+ return glue_cbc_encrypt_req_128bit(__serpent_encrypt, req);
}
static int cbc_decrypt(struct skcipher_request *req)
@@ -206,8 +205,8 @@ static int xts_encrypt(struct skcipher_request *req)
struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
return glue_xts_req_128bit(&serpent_enc_xts, req,
- XTS_TWEAK_CAST(__serpent_encrypt),
- &ctx->tweak_ctx, &ctx->crypt_ctx, false);
+ __serpent_encrypt, &ctx->tweak_ctx,
+ &ctx->crypt_ctx, false);
}
static int xts_decrypt(struct skcipher_request *req)
@@ -216,8 +215,8 @@ static int xts_decrypt(struct skcipher_request *req)
struct serpent_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
return glue_xts_req_128bit(&serpent_dec_xts, req,
- XTS_TWEAK_CAST(__serpent_encrypt),
- &ctx->tweak_ctx, &ctx->crypt_ctx, true);
+ __serpent_encrypt, &ctx->tweak_ctx,
+ &ctx->crypt_ctx, true);
}
static struct skcipher_alg serpent_algs[] = {
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index 5fdf1931d069..4fed8d26b91a 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -31,9 +31,11 @@ static int serpent_setkey_skcipher(struct crypto_skcipher *tfm,
return __serpent_setkey(crypto_skcipher_ctx(tfm), key, keylen);
}
-static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
+static void serpent_decrypt_cbc_xway(const void *ctx, u8 *d, const u8 *s)
{
u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
+ u128 *dst = (u128 *)d;
+ const u128 *src = (const u128 *)s;
unsigned int j;
for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
@@ -45,9 +47,11 @@ static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
}
-static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+static void serpent_crypt_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv)
{
be128 ctrblk;
+ u128 *dst = (u128 *)d;
+ const u128 *src = (const u128 *)s;
le128_to_be128(&ctrblk, iv);
le128_inc(iv);
@@ -56,10 +60,12 @@ static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
u128_xor(dst, src, (u128 *)&ctrblk);
}
-static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
+static void serpent_crypt_ctr_xway(const void *ctx, u8 *d, const u8 *s,
le128 *iv)
{
be128 ctrblks[SERPENT_PARALLEL_BLOCKS];
+ u128 *dst = (u128 *)d;
+ const u128 *src = (const u128 *)s;
unsigned int i;
for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
@@ -79,10 +85,10 @@ static const struct common_glue_ctx serpent_enc = {
.funcs = { {
.num_blocks = SERPENT_PARALLEL_BLOCKS,
- .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_enc_blk_xway) }
+ .fn_u = { .ecb = serpent_enc_blk_xway }
}, {
.num_blocks = 1,
- .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) }
+ .fn_u = { .ecb = __serpent_encrypt }
} }
};
@@ -92,10 +98,10 @@ static const struct common_glue_ctx serpent_ctr = {
.funcs = { {
.num_blocks = SERPENT_PARALLEL_BLOCKS,
- .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr_xway) }
+ .fn_u = { .ctr = serpent_crypt_ctr_xway }
}, {
.num_blocks = 1,
- .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) }
+ .fn_u = { .ctr = serpent_crypt_ctr }
} }
};
@@ -105,10 +111,10 @@ static const struct common_glue_ctx serpent_dec = {
.funcs = { {
.num_blocks = SERPENT_PARALLEL_BLOCKS,
- .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_dec_blk_xway) }
+ .fn_u = { .ecb = serpent_dec_blk_xway }
}, {
.num_blocks = 1,
- .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) }
+ .fn_u = { .ecb = __serpent_decrypt }
} }
};
@@ -118,10 +124,10 @@ static const struct common_glue_ctx serpent_dec_cbc = {
.funcs = { {
.num_blocks = SERPENT_PARALLEL_BLOCKS,
- .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_decrypt_cbc_xway) }
+ .fn_u = { .cbc = serpent_decrypt_cbc_xway }
}, {
.num_blocks = 1,
- .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) }
+ .fn_u = { .cbc = __serpent_decrypt }
} }
};
@@ -137,7 +143,7 @@ static int ecb_decrypt(struct skcipher_request *req)
static int cbc_encrypt(struct skcipher_request *req)
{
- return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(__serpent_encrypt),
+ return glue_cbc_encrypt_req_128bit(__serpent_encrypt,
req);
}
diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
index 6decc85ef7b7..1e594d60afa5 100644
--- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S
+++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
@@ -62,11 +62,11 @@
*Visit http://software.intel.com/en-us/articles/
*and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
*
- *Updates 20-byte SHA-1 record in 'hash' for even number of
- *'num_blocks' consecutive 64-byte blocks
+ *Updates 20-byte SHA-1 record at start of 'state', from 'input', for
+ *even number of 'blocks' consecutive 64-byte blocks.
*
*extern "C" void sha1_transform_avx2(
- * int *hash, const char* input, size_t num_blocks );
+ * struct sha1_state *state, const u8* input, int blocks );
*/
#include <linux/linkage.h>
diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
index 5d03c1173690..12e2d19d7402 100644
--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ b/arch/x86/crypto/sha1_ssse3_asm.S
@@ -457,9 +457,13 @@ W_PRECALC_SSSE3
movdqu \a,\b
.endm
-/* SSSE3 optimized implementation:
- * extern "C" void sha1_transform_ssse3(u32 *digest, const char *data, u32 *ws,
- * unsigned int rounds);
+/*
+ * SSSE3 optimized implementation:
+ *
+ * extern "C" void sha1_transform_ssse3(struct sha1_state *state,
+ * const u8 *data, int blocks);
+ *
+ * Note that struct sha1_state is assumed to begin with u32 state[5].
*/
SHA1_VECTOR_ASM sha1_transform_ssse3
@@ -545,8 +549,8 @@ W_PRECALC_AVX
/* AVX optimized implementation:
- * extern "C" void sha1_transform_avx(u32 *digest, const char *data, u32 *ws,
- * unsigned int rounds);
+ * extern "C" void sha1_transform_avx(struct sha1_state *state,
+ * const u8 *data, int blocks);
*/
SHA1_VECTOR_ASM sha1_transform_avx
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index 639d4c2fd6a8..d70b40ad594c 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -27,11 +27,8 @@
#include <crypto/sha1_base.h>
#include <asm/simd.h>
-typedef void (sha1_transform_fn)(u32 *digest, const char *data,
- unsigned int rounds);
-
static int sha1_update(struct shash_desc *desc, const u8 *data,
- unsigned int len, sha1_transform_fn *sha1_xform)
+ unsigned int len, sha1_block_fn *sha1_xform)
{
struct sha1_state *sctx = shash_desc_ctx(desc);
@@ -39,48 +36,47 @@ static int sha1_update(struct shash_desc *desc, const u8 *data,
(sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE)
return crypto_sha1_update(desc, data, len);
- /* make sure casting to sha1_block_fn() is safe */
+ /*
+ * Make sure struct sha1_state begins directly with the SHA1
+ * 160-bit internal state, as this is what the asm functions expect.
+ */
BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0);
kernel_fpu_begin();
- sha1_base_do_update(desc, data, len,
- (sha1_block_fn *)sha1_xform);
+ sha1_base_do_update(desc, data, len, sha1_xform);
kernel_fpu_end();
return 0;
}
static int sha1_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out, sha1_transform_fn *sha1_xform)
+ unsigned int len, u8 *out, sha1_block_fn *sha1_xform)
{
if (!crypto_simd_usable())
return crypto_sha1_finup(desc, data, len, out);
kernel_fpu_begin();
if (len)
- sha1_base_do_update(desc, data, len,
- (sha1_block_fn *)sha1_xform);
- sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_xform);
+ sha1_base_do_update(desc, data, len, sha1_xform);
+ sha1_base_do_finalize(desc, sha1_xform);
kernel_fpu_end();
return sha1_base_finish(desc, out);
}
-asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data,
- unsigned int rounds);
+asmlinkage void sha1_transform_ssse3(struct sha1_state *state,
+ const u8 *data, int blocks);
static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
- return sha1_update(desc, data, len,
- (sha1_transform_fn *) sha1_transform_ssse3);
+ return sha1_update(desc, data, len, sha1_transform_ssse3);
}
static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
- return sha1_finup(desc, data, len, out,
- (sha1_transform_fn *) sha1_transform_ssse3);
+ return sha1_finup(desc, data, len, out, sha1_transform_ssse3);
}
/* Add padding and return the message digest. */
@@ -119,21 +115,19 @@ static void unregister_sha1_ssse3(void)
}
#ifdef CONFIG_AS_AVX
-asmlinkage void sha1_transform_avx(u32 *digest, const char *data,
- unsigned int rounds);
+asmlinkage void sha1_transform_avx(struct sha1_state *state,
+ const u8 *data, int blocks);
static int sha1_avx_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
- return sha1_update(desc, data, len,
- (sha1_transform_fn *) sha1_transform_avx);
+ return sha1_update(desc, data, len, sha1_transform_avx);
}
static int sha1_avx_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
- return sha1_finup(desc, data, len, out,
- (sha1_transform_fn *) sha1_transform_avx);
+ return sha1_finup(desc, data, len, out, sha1_transform_avx);
}
static int sha1_avx_final(struct shash_desc *desc, u8 *out)
@@ -190,8 +184,8 @@ static inline void unregister_sha1_avx(void) { }
#if defined(CONFIG_AS_AVX2) && (CONFIG_AS_AVX)
#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */
-asmlinkage void sha1_transform_avx2(u32 *digest, const char *data,
- unsigned int rounds);
+asmlinkage void sha1_transform_avx2(struct sha1_state *state,
+ const u8 *data, int blocks);
static bool avx2_usable(void)
{
@@ -203,28 +197,26 @@ static bool avx2_usable(void)
return false;
}
-static void sha1_apply_transform_avx2(u32 *digest, const char *data,
- unsigned int rounds)
+static void sha1_apply_transform_avx2(struct sha1_state *state,
+ const u8 *data, int blocks)
{
/* Select the optimal transform based on data block size */
- if (rounds >= SHA1_AVX2_BLOCK_OPTSIZE)
- sha1_transform_avx2(digest, data, rounds);
+ if (blocks >= SHA1_AVX2_BLOCK_OPTSIZE)
+ sha1_transform_avx2(state, data, blocks);
else
- sha1_transform_avx(digest, data, rounds);
+ sha1_transform_avx(state, data, blocks);
}
static int sha1_avx2_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
- return sha1_update(desc, data, len,
- (sha1_transform_fn *) sha1_apply_transform_avx2);
+ return sha1_update(desc, data, len, sha1_apply_transform_avx2);
}
static int sha1_avx2_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
- return sha1_finup(desc, data, len, out,
- (sha1_transform_fn *) sha1_apply_transform_avx2);
+ return sha1_finup(desc, data, len, out, sha1_apply_transform_avx2);
}
static int sha1_avx2_final(struct shash_desc *desc, u8 *out)
@@ -267,21 +259,19 @@ static inline void unregister_sha1_avx2(void) { }
#endif
#ifdef CONFIG_AS_SHA1_NI
-asmlinkage void sha1_ni_transform(u32 *digest, const char *data,
- unsigned int rounds);
+asmlinkage void sha1_ni_transform(struct sha1_state *digest, const u8 *data,
+ int rounds);
static int sha1_ni_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
- return sha1_update(desc, data, len,
- (sha1_transform_fn *) sha1_ni_transform);
+ return sha1_update(desc, data, len, sha1_ni_transform);
}
static int sha1_ni_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
- return sha1_finup(desc, data, len, out,
- (sha1_transform_fn *) sha1_ni_transform);
+ return sha1_finup(desc, data, len, out, sha1_ni_transform);
}
static int sha1_ni_final(struct shash_desc *desc, u8 *out)
diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S
index 22e14c8dd2e4..fcbc30f58c38 100644
--- a/arch/x86/crypto/sha256-avx-asm.S
+++ b/arch/x86/crypto/sha256-avx-asm.S
@@ -341,8 +341,8 @@ a = TMP_
.endm
########################################################################
-## void sha256_transform_avx(void *input_data, UINT32 digest[8], UINT64 num_blks)
-## arg 1 : pointer to digest
+## void sha256_transform_avx(state sha256_state *state, const u8 *data, int blocks)
+## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S
index 519b551ad576..499d9ec129de 100644
--- a/arch/x86/crypto/sha256-avx2-asm.S
+++ b/arch/x86/crypto/sha256-avx2-asm.S
@@ -520,8 +520,8 @@ STACK_SIZE = _RSP + _RSP_SIZE
.endm
########################################################################
-## void sha256_transform_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks)
-## arg 1 : pointer to digest
+## void sha256_transform_rorx(struct sha256_state *state, const u8 *data, int blocks)
+## arg 1 : pointer to state
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S
index 69cc2f91dc4c..ddfa863b4ee3 100644
--- a/arch/x86/crypto/sha256-ssse3-asm.S
+++ b/arch/x86/crypto/sha256-ssse3-asm.S
@@ -347,8 +347,10 @@ a = TMP_
.endm
########################################################################
-## void sha256_transform_ssse3(void *input_data, UINT32 digest[8], UINT64 num_blks)
-## arg 1 : pointer to digest
+## void sha256_transform_ssse3(struct sha256_state *state, const u8 *data,
+## int blocks);
+## arg 1 : pointer to state
+## (struct sha256_state is assumed to begin with u32 state[8])
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
index f9aff31fe59e..03ad657c04bd 100644
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@@ -41,12 +41,11 @@
#include <linux/string.h>
#include <asm/simd.h>
-asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data,
- u64 rounds);
-typedef void (sha256_transform_fn)(u32 *digest, const char *data, u64 rounds);
+asmlinkage void sha256_transform_ssse3(struct sha256_state *state,
+ const u8 *data, int blocks);
static int _sha256_update(struct shash_desc *desc, const u8 *data,
- unsigned int len, sha256_transform_fn *sha256_xform)
+ unsigned int len, sha256_block_fn *sha256_xform)
{
struct sha256_state *sctx = shash_desc_ctx(desc);
@@ -54,28 +53,29 @@ static int _sha256_update(struct shash_desc *desc, const u8 *data,
(sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE)
return crypto_sha256_update(desc, data, len);
- /* make sure casting to sha256_block_fn() is safe */
+ /*
+ * Make sure struct sha256_state begins directly with the SHA256
+ * 256-bit internal state, as this is what the asm functions expect.
+ */
BUILD_BUG_ON(offsetof(struct sha256_state, state) != 0);
kernel_fpu_begin();
- sha256_base_do_update(desc, data, len,
- (sha256_block_fn *)sha256_xform);
+ sha256_base_do_update(desc, data, len, sha256_xform);
kernel_fpu_end();
return 0;
}
static int sha256_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out, sha256_transform_fn *sha256_xform)
+ unsigned int len, u8 *out, sha256_block_fn *sha256_xform)
{
if (!crypto_simd_usable())
return crypto_sha256_finup(desc, data, len, out);
kernel_fpu_begin();
if (len)
- sha256_base_do_update(desc, data, len,
- (sha256_block_fn *)sha256_xform);
- sha256_base_do_finalize(desc, (sha256_block_fn *)sha256_xform);
+ sha256_base_do_update(desc, data, len, sha256_xform);
+ sha256_base_do_finalize(desc, sha256_xform);
kernel_fpu_end();
return sha256_base_finish(desc, out);
@@ -145,8 +145,8 @@ static void unregister_sha256_ssse3(void)
}
#ifdef CONFIG_AS_AVX
-asmlinkage void sha256_transform_avx(u32 *digest, const char *data,
- u64 rounds);
+asmlinkage void sha256_transform_avx(struct sha256_state *state,
+ const u8 *data, int blocks);
static int sha256_avx_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
@@ -227,8 +227,8 @@ static inline void unregister_sha256_avx(void) { }
#endif
#if defined(CONFIG_AS_AVX2) && defined(CONFIG_AS_AVX)
-asmlinkage void sha256_transform_rorx(u32 *digest, const char *data,
- u64 rounds);
+asmlinkage void sha256_transform_rorx(struct sha256_state *state,
+ const u8 *data, int blocks);
static int sha256_avx2_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
@@ -307,8 +307,8 @@ static inline void unregister_sha256_avx2(void) { }
#endif
#ifdef CONFIG_AS_SHA256_NI
-asmlinkage void sha256_ni_transform(u32 *digest, const char *data,
- u64 rounds); /*unsigned int rounds);*/
+asmlinkage void sha256_ni_transform(struct sha256_state *digest,
+ const u8 *data, int rounds);
static int sha256_ni_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S
index 3704ddd7e5d5..90ea945ba5e6 100644
--- a/arch/x86/crypto/sha512-avx-asm.S
+++ b/arch/x86/crypto/sha512-avx-asm.S
@@ -271,11 +271,12 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE
.endm
########################################################################
-# void sha512_transform_avx(void* D, const void* M, u64 L)
-# Purpose: Updates the SHA512 digest stored at D with the message stored in M.
-# The size of the message pointed to by M must be an integer multiple of SHA512
-# message blocks.
-# L is the message length in SHA512 blocks
+# void sha512_transform_avx(sha512_state *state, const u8 *data, int blocks)
+# Purpose: Updates the SHA512 digest stored at "state" with the message
+# stored in "data".
+# The size of the message pointed to by "data" must be an integer multiple
+# of SHA512 message blocks.
+# "blocks" is the message length in SHA512 blocks
########################################################################
SYM_FUNC_START(sha512_transform_avx)
cmp $0, msglen
diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S
index 80d830e7ee09..3dd886b14e7d 100644
--- a/arch/x86/crypto/sha512-avx2-asm.S
+++ b/arch/x86/crypto/sha512-avx2-asm.S
@@ -563,11 +563,12 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE
.endm
########################################################################
-# void sha512_transform_rorx(void* D, const void* M, uint64_t L)#
-# Purpose: Updates the SHA512 digest stored at D with the message stored in M.
-# The size of the message pointed to by M must be an integer multiple of SHA512
-# message blocks.
-# L is the message length in SHA512 blocks
+# void sha512_transform_rorx(sha512_state *state, const u8 *data, int blocks)
+# Purpose: Updates the SHA512 digest stored at "state" with the message
+# stored in "data".
+# The size of the message pointed to by "data" must be an integer multiple
+# of SHA512 message blocks.
+# "blocks" is the message length in SHA512 blocks
########################################################################
SYM_FUNC_START(sha512_transform_rorx)
# Allocate Stack Space
diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S
index 838f984e95d9..7946a1bee85b 100644
--- a/arch/x86/crypto/sha512-ssse3-asm.S
+++ b/arch/x86/crypto/sha512-ssse3-asm.S
@@ -269,11 +269,14 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE
.endm
########################################################################
-# void sha512_transform_ssse3(void* D, const void* M, u64 L)#
-# Purpose: Updates the SHA512 digest stored at D with the message stored in M.
-# The size of the message pointed to by M must be an integer multiple of SHA512
-# message blocks.
-# L is the message length in SHA512 blocks.
+## void sha512_transform_ssse3(struct sha512_state *state, const u8 *data,
+## int blocks);
+# (struct sha512_state is assumed to begin with u64 state[8])
+# Purpose: Updates the SHA512 digest stored at "state" with the message
+# stored in "data".
+# The size of the message pointed to by "data" must be an integer multiple
+# of SHA512 message blocks.
+# "blocks" is the message length in SHA512 blocks.
########################################################################
SYM_FUNC_START(sha512_transform_ssse3)
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
index 458356a3f124..1c444f41037c 100644
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -39,13 +39,11 @@
#include <crypto/sha512_base.h>
#include <asm/simd.h>
-asmlinkage void sha512_transform_ssse3(u64 *digest, const char *data,
- u64 rounds);
-
-typedef void (sha512_transform_fn)(u64 *digest, const char *data, u64 rounds);
+asmlinkage void sha512_transform_ssse3(struct sha512_state *state,
+ const u8 *data, int blocks);
static int sha512_update(struct shash_desc *desc, const u8 *data,
- unsigned int len, sha512_transform_fn *sha512_xform)
+ unsigned int len, sha512_block_fn *sha512_xform)
{
struct sha512_state *sctx = shash_desc_ctx(desc);
@@ -53,28 +51,29 @@ static int sha512_update(struct shash_desc *desc, const u8 *data,
(sctx->count[0] % SHA512_BLOCK_SIZE) + len < SHA512_BLOCK_SIZE)
return crypto_sha512_update(desc, data, len);
- /* make sure casting to sha512_block_fn() is safe */
+ /*
+ * Make sure struct sha512_state begins directly with the SHA512
+ * 512-bit internal state, as this is what the asm functions expect.
+ */
BUILD_BUG_ON(offsetof(struct sha512_state, state) != 0);
kernel_fpu_begin();
- sha512_base_do_update(desc, data, len,
- (sha512_block_fn *)sha512_xform);
+ sha512_base_do_update(desc, data, len, sha512_xform);
kernel_fpu_end();
return 0;
}
static int sha512_finup(struct shash_desc *desc, const u8 *data,
- unsigned int len, u8 *out, sha512_transform_fn *sha512_xform)
+ unsigned int len, u8 *out, sha512_block_fn *sha512_xform)
{
if (!crypto_simd_usable())
return crypto_sha512_finup(desc, data, len, out);
kernel_fpu_begin();
if (len)
- sha512_base_do_update(desc, data, len,
- (sha512_block_fn *)sha512_xform);
- sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_xform);
+ sha512_base_do_update(desc, data, len, sha512_xform);
+ sha512_base_do_finalize(desc, sha512_xform);
kernel_fpu_end();
return sha512_base_finish(desc, out);
@@ -144,8 +143,8 @@ static void unregister_sha512_ssse3(void)
}
#ifdef CONFIG_AS_AVX
-asmlinkage void sha512_transform_avx(u64 *digest, const char *data,
- u64 rounds);
+asmlinkage void sha512_transform_avx(struct sha512_state *state,
+ const u8 *data, int blocks);
static bool avx_usable(void)
{
if (!cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
@@ -225,8 +224,8 @@ static inline void unregister_sha512_avx(void) { }
#endif
#if defined(CONFIG_AS_AVX2) && defined(CONFIG_AS_AVX)
-asmlinkage void sha512_transform_rorx(u64 *digest, const char *data,
- u64 rounds);
+asmlinkage void sha512_transform_rorx(struct sha512_state *state,
+ const u8 *data, int blocks);
static int sha512_avx2_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
index d561c821788b..2dbc8ce3730e 100644
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -22,20 +22,17 @@
#define TWOFISH_PARALLEL_BLOCKS 8
/* 8-way parallel cipher functions */
-asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst,
- const u8 *src);
-asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst,
- const u8 *src);
+asmlinkage void twofish_ecb_enc_8way(const void *ctx, u8 *dst, const u8 *src);
+asmlinkage void twofish_ecb_dec_8way(const void *ctx, u8 *dst, const u8 *src);
-asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst,
- const u8 *src);
-asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst,
- const u8 *src, le128 *iv);
+asmlinkage void twofish_cbc_dec_8way(const void *ctx, u8 *dst, const u8 *src);
+asmlinkage void twofish_ctr_8way(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
-asmlinkage void twofish_xts_enc_8way(struct twofish_ctx *ctx, u8 *dst,
- const u8 *src, le128 *iv);
-asmlinkage void twofish_xts_dec_8way(struct twofish_ctx *ctx, u8 *dst,
- const u8 *src, le128 *iv);
+asmlinkage void twofish_xts_enc_8way(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
+asmlinkage void twofish_xts_dec_8way(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
static int twofish_setkey_skcipher(struct crypto_skcipher *tfm,
const u8 *key, unsigned int keylen)
@@ -43,22 +40,19 @@ static int twofish_setkey_skcipher(struct crypto_skcipher *tfm,
return twofish_setkey(&tfm->base, key, keylen);
}
-static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
- const u8 *src)
+static inline void twofish_enc_blk_3way(const void *ctx, u8 *dst, const u8 *src)
{
__twofish_enc_blk_3way(ctx, dst, src, false);
}
-static void twofish_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+static void twofish_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
{
- glue_xts_crypt_128bit_one(ctx, dst, src, iv,
- GLUE_FUNC_CAST(twofish_enc_blk));
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv, twofish_enc_blk);
}
-static void twofish_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+static void twofish_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv)
{
- glue_xts_crypt_128bit_one(ctx, dst, src, iv,
- GLUE_FUNC_CAST(twofish_dec_blk));
+ glue_xts_crypt_128bit_one(ctx, dst, src, iv, twofish_dec_blk);
}
struct twofish_xts_ctx {
@@ -70,7 +64,6 @@ static int xts_twofish_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int keylen)
{
struct twofish_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
- u32 *flags = &tfm->base.crt_flags;
int err;
err = xts_verify_key(tfm, key, keylen);
@@ -78,13 +71,12 @@ static int xts_twofish_setkey(struct crypto_skcipher *tfm, const u8 *key,
return err;
/* first half of xts-key is for crypt */
- err = __twofish_setkey(&ctx->crypt_ctx, key, keylen / 2, flags);
+ err = __twofish_setkey(&ctx->crypt_ctx, key, keylen / 2);
if (err)
return err;
/* second half of xts-key is for tweak */
- return __twofish_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2,
- flags);
+ return __twofish_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2);
}
static const struct common_glue_ctx twofish_enc = {
@@ -93,13 +85,13 @@ static const struct common_glue_ctx twofish_enc = {
.funcs = { {
.num_blocks = TWOFISH_PARALLEL_BLOCKS,
- .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_8way) }
+ .fn_u = { .ecb = twofish_ecb_enc_8way }
}, {
.num_blocks = 3,
- .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) }
+ .fn_u = { .ecb = twofish_enc_blk_3way }
}, {
.num_blocks = 1,
- .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) }
+ .fn_u = { .ecb = twofish_enc_blk }
} }
};
@@ -109,13 +101,13 @@ static const struct common_glue_ctx twofish_ctr = {
.funcs = { {
.num_blocks = TWOFISH_PARALLEL_BLOCKS,
- .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_8way) }
+ .fn_u = { .ctr = twofish_ctr_8way }
}, {
.num_blocks = 3,
- .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) }
+ .fn_u = { .ctr = twofish_enc_blk_ctr_3way }
}, {
.num_blocks = 1,
- .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr) }
+ .fn_u = { .ctr = twofish_enc_blk_ctr }
} }
};
@@ -125,10 +117,10 @@ static const struct common_glue_ctx twofish_enc_xts = {
.funcs = { {
.num_blocks = TWOFISH_PARALLEL_BLOCKS,
- .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc_8way) }
+ .fn_u = { .xts = twofish_xts_enc_8way }
}, {
.num_blocks = 1,
- .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_enc) }
+ .fn_u = { .xts = twofish_xts_enc }
} }
};
@@ -138,13 +130,13 @@ static const struct common_glue_ctx twofish_dec = {
.funcs = { {
.num_blocks = TWOFISH_PARALLEL_BLOCKS,
- .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_8way) }
+ .fn_u = { .ecb = twofish_ecb_dec_8way }
}, {
.num_blocks = 3,
- .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) }
+ .fn_u = { .ecb = twofish_dec_blk_3way }
}, {
.num_blocks = 1,
- .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) }
+ .fn_u = { .ecb = twofish_dec_blk }
} }
};
@@ -154,13 +146,13 @@ static const struct common_glue_ctx twofish_dec_cbc = {
.funcs = { {
.num_blocks = TWOFISH_PARALLEL_BLOCKS,
- .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_8way) }
+ .fn_u = { .cbc = twofish_cbc_dec_8way }
}, {
.num_blocks = 3,
- .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) }
+ .fn_u = { .cbc = twofish_dec_blk_cbc_3way }
}, {
.num_blocks = 1,
- .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) }
+ .fn_u = { .cbc = twofish_dec_blk }
} }
};
@@ -170,10 +162,10 @@ static const struct common_glue_ctx twofish_dec_xts = {
.funcs = { {
.num_blocks = TWOFISH_PARALLEL_BLOCKS,
- .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec_8way) }
+ .fn_u = { .xts = twofish_xts_dec_8way }
}, {
.num_blocks = 1,
- .fn_u = { .xts = GLUE_XTS_FUNC_CAST(twofish_xts_dec) }
+ .fn_u = { .xts = twofish_xts_dec }
} }
};
@@ -189,8 +181,7 @@ static int ecb_decrypt(struct skcipher_request *req)
static int cbc_encrypt(struct skcipher_request *req)
{
- return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(twofish_enc_blk),
- req);
+ return glue_cbc_encrypt_req_128bit(twofish_enc_blk, req);
}
static int cbc_decrypt(struct skcipher_request *req)
@@ -208,8 +199,7 @@ static int xts_encrypt(struct skcipher_request *req)
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct twofish_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
- return glue_xts_req_128bit(&twofish_enc_xts, req,
- XTS_TWEAK_CAST(twofish_enc_blk),
+ return glue_xts_req_128bit(&twofish_enc_xts, req, twofish_enc_blk,
&ctx->tweak_ctx, &ctx->crypt_ctx, false);
}
@@ -218,8 +208,7 @@ static int xts_decrypt(struct skcipher_request *req)
struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
struct twofish_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
- return glue_xts_req_128bit(&twofish_dec_xts, req,
- XTS_TWEAK_CAST(twofish_enc_blk),
+ return glue_xts_req_128bit(&twofish_dec_xts, req, twofish_enc_blk,
&ctx->tweak_ctx, &ctx->crypt_ctx, true);
}
diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index 1dc9e29f221e..768af6075479 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -25,21 +25,22 @@ static int twofish_setkey_skcipher(struct crypto_skcipher *tfm,
return twofish_setkey(&tfm->base, key, keylen);
}
-static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
- const u8 *src)
+static inline void twofish_enc_blk_3way(const void *ctx, u8 *dst, const u8 *src)
{
__twofish_enc_blk_3way(ctx, dst, src, false);
}
-static inline void twofish_enc_blk_xor_3way(struct twofish_ctx *ctx, u8 *dst,
+static inline void twofish_enc_blk_xor_3way(const void *ctx, u8 *dst,
const u8 *src)
{
__twofish_enc_blk_3way(ctx, dst, src, true);
}
-void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src)
+void twofish_dec_blk_cbc_3way(const void *ctx, u8 *d, const u8 *s)
{
u128 ivs[2];
+ u128 *dst = (u128 *)d;
+ const u128 *src = (const u128 *)s;
ivs[0] = src[0];
ivs[1] = src[1];
@@ -51,9 +52,11 @@ void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src)
}
EXPORT_SYMBOL_GPL(twofish_dec_blk_cbc_3way);
-void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
+void twofish_enc_blk_ctr(const void *ctx, u8 *d, const u8 *s, le128 *iv)
{
be128 ctrblk;
+ u128 *dst = (u128 *)d;
+ const u128 *src = (const u128 *)s;
if (dst != src)
*dst = *src;
@@ -66,10 +69,11 @@ void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
}
EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr);
-void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src,
- le128 *iv)
+void twofish_enc_blk_ctr_3way(const void *ctx, u8 *d, const u8 *s, le128 *iv)
{
be128 ctrblks[3];
+ u128 *dst = (u128 *)d;
+ const u128 *src = (const u128 *)s;
if (dst != src) {
dst[0] = src[0];
@@ -94,10 +98,10 @@ static const struct common_glue_ctx twofish_enc = {
.funcs = { {
.num_blocks = 3,
- .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) }
+ .fn_u = { .ecb = twofish_enc_blk_3way }
}, {
.num_blocks = 1,
- .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk) }
+ .fn_u = { .ecb = twofish_enc_blk }
} }
};
@@ -107,10 +111,10 @@ static const struct common_glue_ctx twofish_ctr = {
.funcs = { {
.num_blocks = 3,
- .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_ctr_3way) }
+ .fn_u = { .ctr = twofish_enc_blk_ctr_3way }
}, {
.num_blocks = 1,
- .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_ctr) }
+ .fn_u = { .ctr = twofish_enc_blk_ctr }
} }
};
@@ -120,10 +124,10 @@ static const struct common_glue_ctx twofish_dec = {
.funcs = { {
.num_blocks = 3,
- .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) }
+ .fn_u = { .ecb = twofish_dec_blk_3way }
}, {
.num_blocks = 1,
- .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk) }
+ .fn_u = { .ecb = twofish_dec_blk }
} }
};
@@ -133,10 +137,10 @@ static const struct common_glue_ctx twofish_dec_cbc = {
.funcs = { {
.num_blocks = 3,
- .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) }
+ .fn_u = { .cbc = twofish_dec_blk_cbc_3way }
}, {
.num_blocks = 1,
- .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk) }
+ .fn_u = { .cbc = twofish_dec_blk }
} }
};
@@ -152,8 +156,7 @@ static int ecb_decrypt(struct skcipher_request *req)
static int cbc_encrypt(struct skcipher_request *req)
{
- return glue_cbc_encrypt_req_128bit(GLUE_FUNC_CAST(twofish_enc_blk),
- req);
+ return glue_cbc_encrypt_req_128bit(twofish_enc_blk, req);
}
static int cbc_decrypt(struct skcipher_request *req)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 76942cbd95a1..f2bb91e87877 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1728,7 +1728,7 @@ SYM_CODE_END(nmi)
SYM_CODE_START(ignore_sysret)
UNWIND_HINT_EMPTY
mov $-ENOSYS, %eax
- sysret
+ sysretl
SYM_CODE_END(ignore_sysret)
#endif
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 15908eb9b17e..c17cb77eb150 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -440,3 +440,5 @@
433 i386 fspick sys_fspick __ia32_sys_fspick
434 i386 pidfd_open sys_pidfd_open __ia32_sys_pidfd_open
435 i386 clone3 sys_clone3 __ia32_sys_clone3
+437 i386 openat2 sys_openat2 __ia32_sys_openat2
+438 i386 pidfd_getfd sys_pidfd_getfd __ia32_sys_pidfd_getfd
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index c29976eca4a8..44d510bc9b78 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -357,6 +357,8 @@
433 common fspick __x64_sys_fspick
434 common pidfd_open __x64_sys_pidfd_open
435 common clone3 __x64_sys_clone3/ptregs
+437 common openat2 __x64_sys_openat2
+438 common pidfd_getfd __x64_sys_pidfd_getfd
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 2b75e80f6b41..433a1259f61d 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -59,7 +59,7 @@ $(obj)/vdso64.so.dbg: $(obj)/vdso.lds $(vobjs) FORCE
$(call if_changed,vdso_and_check)
HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi -I$(srctree)/arch/$(SUBARCH)/include/uapi
-hostprogs-y += vdso2c
+hostprogs += vdso2c
quiet_cmd_vdso2c = VDSO2C $@
cmd_vdso2c = $(obj)/vdso2c $< $(<:%.dbg=%) $@
diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S
index 93c6dc7812d0..ea7e0155c604 100644
--- a/arch/x86/entry/vdso/vdso-layout.lds.S
+++ b/arch/x86/entry/vdso/vdso-layout.lds.S
@@ -16,18 +16,23 @@ SECTIONS
* segment.
*/
- vvar_start = . - 3 * PAGE_SIZE;
- vvar_page = vvar_start;
+ vvar_start = . - 4 * PAGE_SIZE;
+ vvar_page = vvar_start;
/* Place all vvars at the offsets in asm/vvar.h. */
#define EMIT_VVAR(name, offset) vvar_ ## name = vvar_page + offset;
-#define __VVAR_KERNEL_LDS
#include <asm/vvar.h>
-#undef __VVAR_KERNEL_LDS
#undef EMIT_VVAR
pvclock_page = vvar_start + PAGE_SIZE;
hvclock_page = vvar_start + 2 * PAGE_SIZE;
+ timens_page = vvar_start + 3 * PAGE_SIZE;
+
+#undef _ASM_X86_VVAR_H
+ /* Place all vvars in timens too at the offsets in asm/vvar.h. */
+#define EMIT_VVAR(name, offset) timens_ ## name = timens_page + offset;
+#include <asm/vvar.h>
+#undef EMIT_VVAR
. = SIZEOF_HEADERS;
diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c
index 3a4d8d4d39f8..3842873b3ae3 100644
--- a/arch/x86/entry/vdso/vdso2c.c
+++ b/arch/x86/entry/vdso/vdso2c.c
@@ -75,12 +75,14 @@ enum {
sym_vvar_page,
sym_pvclock_page,
sym_hvclock_page,
+ sym_timens_page,
};
const int special_pages[] = {
sym_vvar_page,
sym_pvclock_page,
sym_hvclock_page,
+ sym_timens_page,
};
struct vdso_sym {
@@ -93,6 +95,7 @@ struct vdso_sym required_syms[] = {
[sym_vvar_page] = {"vvar_page", true},
[sym_pvclock_page] = {"pvclock_page", true},
[sym_hvclock_page] = {"hvclock_page", true},
+ [sym_timens_page] = {"timens_page", true},
{"VDSO32_NOTE_MASK", true},
{"__kernel_vsyscall", true},
{"__kernel_sigreturn", true},
diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c
index 240626e7f55a..43842fade8fa 100644
--- a/arch/x86/entry/vdso/vdso32-setup.c
+++ b/arch/x86/entry/vdso/vdso32-setup.c
@@ -11,6 +11,7 @@
#include <linux/smp.h>
#include <linux/kernel.h>
#include <linux/mm_types.h>
+#include <linux/elf.h>
#include <asm/processor.h>
#include <asm/vdso.h>
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index f5937742b290..c1b8496b5606 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -14,16 +14,30 @@
#include <linux/elf.h>
#include <linux/cpu.h>
#include <linux/ptrace.h>
+#include <linux/time_namespace.h>
+
#include <asm/pvclock.h>
#include <asm/vgtod.h>
#include <asm/proto.h>
#include <asm/vdso.h>
#include <asm/vvar.h>
+#include <asm/tlb.h>
#include <asm/page.h>
#include <asm/desc.h>
#include <asm/cpufeature.h>
#include <clocksource/hyperv_timer.h>
+#undef _ASM_X86_VVAR_H
+#define EMIT_VVAR(name, offset) \
+ const size_t name ## _offset = offset;
+#include <asm/vvar.h>
+
+struct vdso_data *arch_get_vdso_data(void *vvar_page)
+{
+ return (struct vdso_data *)(vvar_page + _vdso_data_offset);
+}
+#undef EMIT_VVAR
+
#if defined(CONFIG_X86_64)
unsigned int __read_mostly vdso64_enabled = 1;
#endif
@@ -37,6 +51,7 @@ void __init init_vdso_image(const struct vdso_image *image)
image->alt_len));
}
+static const struct vm_special_mapping vvar_mapping;
struct linux_binprm;
static vm_fault_t vdso_fault(const struct vm_special_mapping *sm,
@@ -84,10 +99,74 @@ static int vdso_mremap(const struct vm_special_mapping *sm,
return 0;
}
+static int vvar_mremap(const struct vm_special_mapping *sm,
+ struct vm_area_struct *new_vma)
+{
+ const struct vdso_image *image = new_vma->vm_mm->context.vdso_image;
+ unsigned long new_size = new_vma->vm_end - new_vma->vm_start;
+
+ if (new_size != -image->sym_vvar_start)
+ return -EINVAL;
+
+ return 0;
+}
+
+#ifdef CONFIG_TIME_NS
+static struct page *find_timens_vvar_page(struct vm_area_struct *vma)
+{
+ if (likely(vma->vm_mm == current->mm))
+ return current->nsproxy->time_ns->vvar_page;
+
+ /*
+ * VM_PFNMAP | VM_IO protect .fault() handler from being called
+ * through interfaces like /proc/$pid/mem or
+ * process_vm_{readv,writev}() as long as there's no .access()
+ * in special_mapping_vmops().
+ * For more details check_vma_flags() and __access_remote_vm()
+ */
+
+ WARN(1, "vvar_page accessed remotely");
+
+ return NULL;
+}
+
+/*
+ * The vvar page layout depends on whether a task belongs to the root or
+ * non-root time namespace. Whenever a task changes its namespace, the VVAR
+ * page tables are cleared and then they will re-faulted with a
+ * corresponding layout.
+ * See also the comment near timens_setup_vdso_data() for details.
+ */
+int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
+{
+ struct mm_struct *mm = task->mm;
+ struct vm_area_struct *vma;
+
+ if (down_write_killable(&mm->mmap_sem))
+ return -EINTR;
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ unsigned long size = vma->vm_end - vma->vm_start;
+
+ if (vma_is_special_mapping(vma, &vvar_mapping))
+ zap_page_range(vma, vma->vm_start, size);
+ }
+
+ up_write(&mm->mmap_sem);
+ return 0;
+}
+#else
+static inline struct page *find_timens_vvar_page(struct vm_area_struct *vma)
+{
+ return NULL;
+}
+#endif
+
static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
struct vm_area_struct *vma, struct vm_fault *vmf)
{
const struct vdso_image *image = vma->vm_mm->context.vdso_image;
+ unsigned long pfn;
long sym_offset;
if (!image)
@@ -107,8 +186,36 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
return VM_FAULT_SIGBUS;
if (sym_offset == image->sym_vvar_page) {
- return vmf_insert_pfn(vma, vmf->address,
- __pa_symbol(&__vvar_page) >> PAGE_SHIFT);
+ struct page *timens_page = find_timens_vvar_page(vma);
+
+ pfn = __pa_symbol(&__vvar_page) >> PAGE_SHIFT;
+
+ /*
+ * If a task belongs to a time namespace then a namespace
+ * specific VVAR is mapped with the sym_vvar_page offset and
+ * the real VVAR page is mapped with the sym_timens_page
+ * offset.
+ * See also the comment near timens_setup_vdso_data().
+ */
+ if (timens_page) {
+ unsigned long addr;
+ vm_fault_t err;
+
+ /*
+ * Optimization: inside time namespace pre-fault
+ * VVAR page too. As on timens page there are only
+ * offsets for clocks on VVAR, it'll be faulted
+ * shortly by VDSO code.
+ */
+ addr = vmf->address + (image->sym_timens_page - sym_offset);
+ err = vmf_insert_pfn(vma, addr, pfn);
+ if (unlikely(err & VM_FAULT_ERROR))
+ return err;
+
+ pfn = page_to_pfn(timens_page);
+ }
+
+ return vmf_insert_pfn(vma, vmf->address, pfn);
} else if (sym_offset == image->sym_pvclock_page) {
struct pvclock_vsyscall_time_info *pvti =
pvclock_get_pvti_cpu0_va();
@@ -123,6 +230,14 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
if (tsc_pg && vclock_was_used(VCLOCK_HVCLOCK))
return vmf_insert_pfn(vma, vmf->address,
virt_to_phys(tsc_pg) >> PAGE_SHIFT);
+ } else if (sym_offset == image->sym_timens_page) {
+ struct page *timens_page = find_timens_vvar_page(vma);
+
+ if (!timens_page)
+ return VM_FAULT_SIGBUS;
+
+ pfn = __pa_symbol(&__vvar_page) >> PAGE_SHIFT;
+ return vmf_insert_pfn(vma, vmf->address, pfn);
}
return VM_FAULT_SIGBUS;
@@ -136,6 +251,7 @@ static const struct vm_special_mapping vdso_mapping = {
static const struct vm_special_mapping vvar_mapping = {
.name = "[vvar]",
.fault = vvar_fault,
+ .mremap = vvar_mremap,
};
/*
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index a7752cd78b89..39eb276d0277 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -14,6 +14,10 @@
static DEFINE_PER_CPU(unsigned long, perf_nmi_tstamp);
static unsigned long perf_nmi_window;
+/* AMD Event 0xFFF: Merge. Used with Large Increment per Cycle events */
+#define AMD_MERGE_EVENT ((0xFULL << 32) | 0xFFULL)
+#define AMD_MERGE_EVENT_ENABLE (AMD_MERGE_EVENT | ARCH_PERFMON_EVENTSEL_ENABLE)
+
static __initconst const u64 amd_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -246,6 +250,7 @@ static const u64 amd_f17h_perfmon_event_map[PERF_COUNT_HW_MAX] =
[PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
[PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
[PERF_COUNT_HW_CACHE_REFERENCES] = 0xff60,
+ [PERF_COUNT_HW_CACHE_MISSES] = 0x0964,
[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
[PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x0287,
@@ -301,6 +306,25 @@ static inline int amd_pmu_addr_offset(int index, bool eventsel)
return offset;
}
+/*
+ * AMD64 events are detected based on their event codes.
+ */
+static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc)
+{
+ return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff);
+}
+
+static inline bool amd_is_pair_event_code(struct hw_perf_event *hwc)
+{
+ if (!(x86_pmu.flags & PMU_FL_PAIR))
+ return false;
+
+ switch (amd_get_event_code(hwc)) {
+ case 0x003: return true; /* Retired SSE/AVX FLOPs */
+ default: return false;
+ }
+}
+
static int amd_core_hw_config(struct perf_event *event)
{
if (event->attr.exclude_host && event->attr.exclude_guest)
@@ -316,15 +340,10 @@ static int amd_core_hw_config(struct perf_event *event)
else if (event->attr.exclude_guest)
event->hw.config |= AMD64_EVENTSEL_HOSTONLY;
- return 0;
-}
+ if ((x86_pmu.flags & PMU_FL_PAIR) && amd_is_pair_event_code(&event->hw))
+ event->hw.flags |= PERF_X86_EVENT_PAIR;
-/*
- * AMD64 events are detected based on their event codes.
- */
-static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc)
-{
- return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff);
+ return 0;
}
static inline int amd_is_nb_event(struct hw_perf_event *hwc)
@@ -855,6 +874,29 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, int idx,
}
}
+static struct event_constraint pair_constraint;
+
+static struct event_constraint *
+amd_get_event_constraints_f17h(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (amd_is_pair_event_code(hwc))
+ return &pair_constraint;
+
+ return &unconstrained;
+}
+
+static void amd_put_event_constraints_f17h(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (is_counter_pair(hwc))
+ --cpuc->n_pair;
+}
+
static ssize_t amd_event_sysfs_show(char *page, u64 config)
{
u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) |
@@ -898,33 +940,15 @@ static __initconst const struct x86_pmu amd_pmu = {
static int __init amd_core_pmu_init(void)
{
+ u64 even_ctr_mask = 0ULL;
+ int i;
+
if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
return 0;
- /* Avoid calulating the value each time in the NMI handler */
+ /* Avoid calculating the value each time in the NMI handler */
perf_nmi_window = msecs_to_jiffies(100);
- switch (boot_cpu_data.x86) {
- case 0x15:
- pr_cont("Fam15h ");
- x86_pmu.get_event_constraints = amd_get_event_constraints_f15h;
- break;
- case 0x17:
- pr_cont("Fam17h ");
- /*
- * In family 17h, there are no event constraints in the PMC hardware.
- * We fallback to using default amd_get_event_constraints.
- */
- break;
- case 0x18:
- pr_cont("Fam18h ");
- /* Using default amd_get_event_constraints. */
- break;
- default:
- pr_err("core perfctr but no constraints; unknown hardware!\n");
- return -ENODEV;
- }
-
/*
* If core performance counter extensions exists, we must use
* MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also
@@ -939,6 +963,32 @@ static int __init amd_core_pmu_init(void)
*/
x86_pmu.amd_nb_constraints = 0;
+ if (boot_cpu_data.x86 == 0x15) {
+ pr_cont("Fam15h ");
+ x86_pmu.get_event_constraints = amd_get_event_constraints_f15h;
+ }
+ if (boot_cpu_data.x86 >= 0x17) {
+ pr_cont("Fam17h+ ");
+ /*
+ * Family 17h and compatibles have constraints for Large
+ * Increment per Cycle events: they may only be assigned an
+ * even numbered counter that has a consecutive adjacent odd
+ * numbered counter following it.
+ */
+ for (i = 0; i < x86_pmu.num_counters - 1; i += 2)
+ even_ctr_mask |= 1 << i;
+
+ pair_constraint = (struct event_constraint)
+ __EVENT_CONSTRAINT(0, even_ctr_mask, 0,
+ x86_pmu.num_counters / 2, 0,
+ PERF_X86_EVENT_PAIR);
+
+ x86_pmu.get_event_constraints = amd_get_event_constraints_f17h;
+ x86_pmu.put_event_constraints = amd_put_event_constraints_f17h;
+ x86_pmu.perf_ctr_pair_en = AMD_MERGE_EVENT_ENABLE;
+ x86_pmu.flags |= PMU_FL_PAIR;
+ }
+
pr_cont("core perfctr, ");
return 0;
}
diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c
index a6ea07f2aa84..4d867a752f0e 100644
--- a/arch/x86/events/amd/uncore.c
+++ b/arch/x86/events/amd/uncore.c
@@ -190,15 +190,12 @@ static int amd_uncore_event_init(struct perf_event *event)
/*
* NB and Last level cache counters (MSRs) are shared across all cores
- * that share the same NB / Last level cache. Interrupts can be directed
- * to a single target core, however, event counts generated by processes
- * running on other cores cannot be masked out. So we do not support
- * sampling and per-thread events.
+ * that share the same NB / Last level cache. On family 16h and below,
+ * Interrupts can be directed to a single target core, however, event
+ * counts generated by processes running on other cores cannot be masked
+ * out. So we do not support sampling and per-thread events via
+ * CAP_NO_INTERRUPT, and we do not enable counter overflow interrupts:
*/
- if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
- return -EINVAL;
-
- /* and we do not enable counter overflow interrupts */
hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB;
hwc->idx = -1;
@@ -306,7 +303,7 @@ static struct pmu amd_nb_pmu = {
.start = amd_uncore_start,
.stop = amd_uncore_stop,
.read = amd_uncore_read,
- .capabilities = PERF_PMU_CAP_NO_EXCLUDE,
+ .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT,
};
static struct pmu amd_llc_pmu = {
@@ -317,7 +314,7 @@ static struct pmu amd_llc_pmu = {
.start = amd_uncore_start,
.stop = amd_uncore_stop,
.read = amd_uncore_read,
- .capabilities = PERF_PMU_CAP_NO_EXCLUDE,
+ .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT,
};
static struct amd_uncore *amd_uncore_alloc(unsigned int cpu)
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index f118af9f0718..3bb738f5a472 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -618,6 +618,7 @@ void x86_pmu_disable_all(void)
int idx;
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
u64 val;
if (!test_bit(idx, cpuc->active_mask))
@@ -627,6 +628,8 @@ void x86_pmu_disable_all(void)
continue;
val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
wrmsrl(x86_pmu_config_addr(idx), val);
+ if (is_counter_pair(hwc))
+ wrmsrl(x86_pmu_config_addr(idx + 1), 0);
}
}
@@ -699,7 +702,7 @@ struct sched_state {
int counter; /* counter index */
int unassigned; /* number of events to be assigned left */
int nr_gp; /* number of GP counters used */
- unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ u64 used;
};
/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
@@ -756,8 +759,12 @@ static bool perf_sched_restore_state(struct perf_sched *sched)
sched->saved_states--;
sched->state = sched->saved[sched->saved_states];
- /* continue with next counter: */
- clear_bit(sched->state.counter++, sched->state.used);
+ /* this assignment didn't work out */
+ /* XXX broken vs EVENT_PAIR */
+ sched->state.used &= ~BIT_ULL(sched->state.counter);
+
+ /* try the next one */
+ sched->state.counter++;
return true;
}
@@ -782,20 +789,32 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)
if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
idx = INTEL_PMC_IDX_FIXED;
for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
- if (!__test_and_set_bit(idx, sched->state.used))
- goto done;
+ u64 mask = BIT_ULL(idx);
+
+ if (sched->state.used & mask)
+ continue;
+
+ sched->state.used |= mask;
+ goto done;
}
}
/* Grab the first unused counter starting with idx */
idx = sched->state.counter;
for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
- if (!__test_and_set_bit(idx, sched->state.used)) {
- if (sched->state.nr_gp++ >= sched->max_gp)
- return false;
+ u64 mask = BIT_ULL(idx);
- goto done;
- }
+ if (c->flags & PERF_X86_EVENT_PAIR)
+ mask |= mask << 1;
+
+ if (sched->state.used & mask)
+ continue;
+
+ if (sched->state.nr_gp++ >= sched->max_gp)
+ return false;
+
+ sched->state.used |= mask;
+ goto done;
}
return false;
@@ -872,12 +891,10 @@ EXPORT_SYMBOL_GPL(perf_assign_events);
int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
struct event_constraint *c;
- unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
struct perf_event *e;
int n0, i, wmin, wmax, unsched = 0;
struct hw_perf_event *hwc;
-
- bitmap_zero(used_mask, X86_PMC_IDX_MAX);
+ u64 used_mask = 0;
/*
* Compute the number of events already present; see x86_pmu_add(),
@@ -920,6 +937,8 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
* fastpath, try to reuse previous register
*/
for (i = 0; i < n; i++) {
+ u64 mask;
+
hwc = &cpuc->event_list[i]->hw;
c = cpuc->event_constraint[i];
@@ -931,11 +950,16 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
if (!test_bit(hwc->idx, c->idxmsk))
break;
+ mask = BIT_ULL(hwc->idx);
+ if (is_counter_pair(hwc))
+ mask |= mask << 1;
+
/* not already used */
- if (test_bit(hwc->idx, used_mask))
+ if (used_mask & mask)
break;
- __set_bit(hwc->idx, used_mask);
+ used_mask |= mask;
+
if (assign)
assign[i] = hwc->idx;
}
@@ -958,6 +982,15 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
READ_ONCE(cpuc->excl_cntrs->exclusive_present))
gpmax /= 2;
+ /*
+ * Reduce the amount of available counters to allow fitting
+ * the extra Merge events needed by large increment events.
+ */
+ if (x86_pmu.flags & PMU_FL_PAIR) {
+ gpmax = x86_pmu.num_counters - cpuc->n_pair;
+ WARN_ON(gpmax <= 0);
+ }
+
unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
wmax, gpmax, assign);
}
@@ -1038,6 +1071,8 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,
return -EINVAL;
cpuc->event_list[n] = leader;
n++;
+ if (is_counter_pair(&leader->hw))
+ cpuc->n_pair++;
}
if (!dogrp)
return n;
@@ -1052,6 +1087,8 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,
cpuc->event_list[n] = event;
n++;
+ if (is_counter_pair(&event->hw))
+ cpuc->n_pair++;
}
return n;
}
@@ -1238,6 +1275,13 @@ int x86_perf_event_set_period(struct perf_event *event)
wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
/*
+ * Clear the Merge event counter's upper 16 bits since
+ * we currently declare a 48-bit counter width
+ */
+ if (is_counter_pair(hwc))
+ wrmsrl(x86_pmu_event_addr(idx + 1), 0);
+
+ /*
* Due to erratum on certan cpu we need
* a second write to be sure the register
* is updated properly
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 3be51aa06e67..dff6623804c2 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4765,6 +4765,7 @@ __init int intel_pmu_init(void)
break;
case INTEL_FAM6_ATOM_TREMONT_D:
+ case INTEL_FAM6_ATOM_TREMONT:
x86_pmu.late_ack = true;
memcpy(hw_cache_event_ids, glp_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index e1daf4151e11..4814c964692c 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -40,17 +40,18 @@
* Model specific counters:
* MSR_CORE_C1_RES: CORE C1 Residency Counter
* perf code: 0x00
- * Available model: SLM,AMT,GLM,CNL
+ * Available model: SLM,AMT,GLM,CNL,TNT
* Scope: Core (each processor core has a MSR)
* MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter
* perf code: 0x01
* Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,GLM,
- * CNL,KBL,CML
+ * CNL,KBL,CML,TNT
* Scope: Core
* MSR_CORE_C6_RESIDENCY: CORE C6 Residency Counter
* perf code: 0x02
* Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
- * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL
+ * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL,
+ * TNT
* Scope: Core
* MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter
* perf code: 0x03
@@ -60,17 +61,18 @@
* MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter.
* perf code: 0x00
* Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL,
- * KBL,CML,ICL,TGL
+ * KBL,CML,ICL,TGL,TNT
* Scope: Package (physical package)
* MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter.
* perf code: 0x01
* Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL,
- * GLM,CNL,KBL,CML,ICL,TGL
+ * GLM,CNL,KBL,CML,ICL,TGL,TNT
* Scope: Package (physical package)
* MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter.
* perf code: 0x02
- * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW
- * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL
+ * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
+ * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL,
+ * TNT
* Scope: Package (physical package)
* MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter.
* perf code: 0x03
@@ -87,7 +89,8 @@
* Scope: Package (physical package)
* MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter.
* perf code: 0x06
- * Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL
+ * Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL,
+ * TNT
* Scope: Package (physical package)
*
*/
@@ -640,8 +643,9 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT, glm_cstates),
X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_D, glm_cstates),
-
X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_PLUS, glm_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_ATOM_TREMONT_D, glm_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_ATOM_TREMONT, glm_cstates),
X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE_L, icl_cstates),
X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE, icl_cstates),
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index ce83950036c5..dc43cc124e09 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -7,6 +7,7 @@
#include <asm/perf_event.h>
#include <asm/tlbflush.h>
#include <asm/insn.h>
+#include <asm/io.h>
#include "../perf_event.h"
@@ -1713,6 +1714,8 @@ intel_pmu_save_and_restart_reload(struct perf_event *event, int count)
old = ((s64)(prev_raw_count << shift) >> shift);
local64_add(new - old + count * period, &event->count);
+ local64_set(&hwc->period_left, -new);
+
perf_event_update_userpage(event);
return 0;
diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c
index 5053a403e4ae..09913121e726 100644
--- a/arch/x86/events/intel/rapl.c
+++ b/arch/x86/events/intel/rapl.c
@@ -741,6 +741,8 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = {
X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_PLUS, model_hsw),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE_L, model_skl),
X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE, model_skl),
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_COMETLAKE_L, model_skl),
+ X86_RAPL_MODEL_MATCH(INTEL_FAM6_COMETLAKE, model_skl),
{},
};
diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
index 6f86650b3f77..a949f6f55991 100644
--- a/arch/x86/events/msr.c
+++ b/arch/x86/events/msr.c
@@ -75,8 +75,9 @@ static bool test_intel(int idx, void *data)
case INTEL_FAM6_ATOM_GOLDMONT:
case INTEL_FAM6_ATOM_GOLDMONT_D:
-
case INTEL_FAM6_ATOM_GOLDMONT_PLUS:
+ case INTEL_FAM6_ATOM_TREMONT_D:
+ case INTEL_FAM6_ATOM_TREMONT:
case INTEL_FAM6_XEON_PHI_KNL:
case INTEL_FAM6_XEON_PHI_KNM:
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 930611db8f9a..f1cd1ca1a77b 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -77,6 +77,7 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode)
#define PERF_X86_EVENT_AUTO_RELOAD 0x0200 /* use PEBS auto-reload */
#define PERF_X86_EVENT_LARGE_PEBS 0x0400 /* use large PEBS */
#define PERF_X86_EVENT_PEBS_VIA_PT 0x0800 /* use PT buffer for PEBS */
+#define PERF_X86_EVENT_PAIR 0x1000 /* Large Increment per Cycle */
struct amd_nb {
int nb_id; /* NorthBridge id */
@@ -272,6 +273,7 @@ struct cpu_hw_events {
struct amd_nb *amd_nb;
/* Inverted mask of bits to clear in the perf_ctr ctrl registers */
u64 perf_ctr_virt_mask;
+ int n_pair; /* Large increment events */
void *kfree_on_online[X86_PERF_KFREE_MAX];
};
@@ -694,6 +696,7 @@ struct x86_pmu {
* AMD bits
*/
unsigned int amd_nb_constraints : 1;
+ u64 perf_ctr_pair_en;
/*
* Extra registers for events
@@ -743,6 +746,7 @@ do { \
#define PMU_FL_EXCL_ENABLED 0x8 /* exclusive counter active */
#define PMU_FL_PEBS_ALL 0x10 /* all events are valid PEBS events */
#define PMU_FL_TFA 0x20 /* deal with TSX force abort */
+#define PMU_FL_PAIR 0x40 /* merge counters for large incr. events */
#define EVENT_VAR(_id) event_attr_##_id
#define EVENT_PTR(_id) &event_attr_##_id.attr.attr
@@ -838,6 +842,11 @@ int x86_pmu_hw_config(struct perf_event *event);
void x86_pmu_disable_all(void);
+static inline bool is_counter_pair(struct hw_perf_event *hwc)
+{
+ return hwc->flags & PERF_X86_EVENT_PAIR;
+}
+
static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
u64 enable_mask)
{
@@ -845,6 +854,14 @@ static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
if (hwc->extra_reg.reg)
wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
+
+ /*
+ * Add enabled Merge event on next counter
+ * if large increment event being enabled on this counter
+ */
+ if (is_counter_pair(hwc))
+ wrmsrl(x86_pmu_config_addr(hwc->idx + 1), x86_pmu.perf_ctr_pair_en);
+
wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask);
}
@@ -861,6 +878,9 @@ static inline void x86_pmu_disable_event(struct perf_event *event)
struct hw_perf_event *hwc = &event->hw;
wrmsrl(hwc->config_base, hwc->config);
+
+ if (is_counter_pair(hwc))
+ wrmsrl(x86_pmu_config_addr(hwc->idx + 1), 0);
}
void x86_pmu_enable_event(struct perf_event *event);
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index caaf4dce99bf..b0da5320bcff 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -21,11 +21,15 @@
#include <linux/hyperv.h>
#include <linux/slab.h>
#include <linux/cpuhotplug.h>
+#include <linux/syscore_ops.h>
#include <clocksource/hyperv_timer.h>
void *hv_hypercall_pg;
EXPORT_SYMBOL_GPL(hv_hypercall_pg);
+/* Storage to save the hypercall page temporarily for hibernation */
+static void *hv_hypercall_pg_saved;
+
u32 *hv_vp_index;
EXPORT_SYMBOL_GPL(hv_vp_index);
@@ -246,6 +250,48 @@ static int __init hv_pci_init(void)
return 1;
}
+static int hv_suspend(void)
+{
+ union hv_x64_msr_hypercall_contents hypercall_msr;
+
+ /*
+ * Reset the hypercall page as it is going to be invalidated
+ * accross hibernation. Setting hv_hypercall_pg to NULL ensures
+ * that any subsequent hypercall operation fails safely instead of
+ * crashing due to an access of an invalid page. The hypercall page
+ * pointer is restored on resume.
+ */
+ hv_hypercall_pg_saved = hv_hypercall_pg;
+ hv_hypercall_pg = NULL;
+
+ /* Disable the hypercall page in the hypervisor */
+ rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
+ hypercall_msr.enable = 0;
+ wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
+
+ return 0;
+}
+
+static void hv_resume(void)
+{
+ union hv_x64_msr_hypercall_contents hypercall_msr;
+
+ /* Re-enable the hypercall page */
+ rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
+ hypercall_msr.enable = 1;
+ hypercall_msr.guest_physical_address =
+ vmalloc_to_pfn(hv_hypercall_pg_saved);
+ wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
+
+ hv_hypercall_pg = hv_hypercall_pg_saved;
+ hv_hypercall_pg_saved = NULL;
+}
+
+static struct syscore_ops hv_syscore_ops = {
+ .suspend = hv_suspend,
+ .resume = hv_resume,
+};
+
/*
* This function is to be invoked early in the boot sequence after the
* hypervisor has been detected.
@@ -330,6 +376,8 @@ void __init hyperv_init(void)
x86_init.pci.arch_init = hv_pci_init;
+ register_syscore_ops(&hv_syscore_ops);
+
return;
remove_cpuhp_state:
@@ -349,6 +397,8 @@ void hyperv_cleanup(void)
{
union hv_x64_msr_hypercall_contents hypercall_msr;
+ unregister_syscore_ops(&hv_syscore_ops);
+
/* Reset our OS id */
wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0);
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 30416d7f19d4..a3aefe9b9401 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -114,8 +114,6 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
err |= fpu__restore_sig(buf, 1);
- force_iret();
-
return err;
}
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 8b52bc5ddf69..ea34464d6221 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -7,7 +7,6 @@ generated-y += unistd_32_ia32.h
generated-y += unistd_64_x32.h
generated-y += xen-hypercalls.h
-generic-y += dma-contiguous.h
generic-y += early_ioremap.h
generic-y += export.h
generic-y += mcs_spinlock.h
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index bc9693c9107e..ca0976456a6b 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -13,7 +13,6 @@
#include <asm/processor.h>
#include <asm/mmu.h>
#include <asm/mpspec.h>
-#include <asm/realmode.h>
#include <asm/x86_init.h>
#ifdef CONFIG_ACPI_APEI
@@ -62,7 +61,7 @@ static inline void acpi_disable_pci(void)
extern int (*acpi_suspend_lowlevel)(void);
/* Physical address to resume after wakeup */
-#define acpi_wakeup_address ((unsigned long)(real_mode_header->wakeup_start))
+unsigned long acpi_get_wakeup_address(void);
/*
* Check if the CPU can handle C2 and deeper
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 2ebc17d9c72c..19e94af9cc5d 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -140,6 +140,7 @@ extern void apic_soft_disable(void);
extern void lapic_shutdown(void);
extern void sync_Arb_IDs(void);
extern void init_bsp_APIC(void);
+extern void apic_intr_mode_select(void);
extern void apic_intr_mode_init(void);
extern void init_apic_mappings(void);
void register_lapic_address(unsigned long address);
@@ -188,6 +189,7 @@ static inline void disable_local_APIC(void) { }
# define setup_secondary_APIC_clock x86_init_noop
static inline void lapic_update_tsc_freq(void) { }
static inline void init_bsp_APIC(void) { }
+static inline void apic_intr_mode_select(void) { }
static inline void apic_intr_mode_init(void) { }
static inline void lapic_assign_system_vectors(void) { }
static inline void lapic_assign_legacy_vector(unsigned int i, bool r) { }
@@ -452,6 +454,14 @@ static inline void ack_APIC_irq(void)
apic_eoi();
}
+
+static inline bool lapic_vector_set_in_irr(unsigned int vector)
+{
+ u32 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
+
+ return !!(irr & (1U << (vector % 32)));
+}
+
static inline unsigned default_get_apic_id(unsigned long x)
{
unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h
index af45e1452f09..7a4bb1bd4bdb 100644
--- a/arch/x86/include/asm/archrandom.h
+++ b/arch/x86/include/asm/archrandom.h
@@ -27,7 +27,7 @@
/* Unconditional execution of RDRAND and RDSEED */
-static inline bool rdrand_long(unsigned long *v)
+static inline bool __must_check rdrand_long(unsigned long *v)
{
bool ok;
unsigned int retry = RDRAND_RETRY_LOOPS;
@@ -41,7 +41,7 @@ static inline bool rdrand_long(unsigned long *v)
return false;
}
-static inline bool rdrand_int(unsigned int *v)
+static inline bool __must_check rdrand_int(unsigned int *v)
{
bool ok;
unsigned int retry = RDRAND_RETRY_LOOPS;
@@ -55,7 +55,7 @@ static inline bool rdrand_int(unsigned int *v)
return false;
}
-static inline bool rdseed_long(unsigned long *v)
+static inline bool __must_check rdseed_long(unsigned long *v)
{
bool ok;
asm volatile(RDSEED_LONG
@@ -64,7 +64,7 @@ static inline bool rdseed_long(unsigned long *v)
return ok;
}
-static inline bool rdseed_int(unsigned int *v)
+static inline bool __must_check rdseed_int(unsigned int *v)
{
bool ok;
asm volatile(RDSEED_INT
@@ -73,10 +73,6 @@ static inline bool rdseed_int(unsigned int *v)
return ok;
}
-/* Conditional execution based on CPU type */
-#define arch_has_random() static_cpu_has(X86_FEATURE_RDRAND)
-#define arch_has_random_seed() static_cpu_has(X86_FEATURE_RDSEED)
-
/*
* These are the generic interfaces; they must not be declared if the
* stubs in <linux/random.h> are to be invoked,
@@ -84,24 +80,24 @@ static inline bool rdseed_int(unsigned int *v)
*/
#ifdef CONFIG_ARCH_RANDOM
-static inline bool arch_get_random_long(unsigned long *v)
+static inline bool __must_check arch_get_random_long(unsigned long *v)
{
- return arch_has_random() ? rdrand_long(v) : false;
+ return static_cpu_has(X86_FEATURE_RDRAND) ? rdrand_long(v) : false;
}
-static inline bool arch_get_random_int(unsigned int *v)
+static inline bool __must_check arch_get_random_int(unsigned int *v)
{
- return arch_has_random() ? rdrand_int(v) : false;
+ return static_cpu_has(X86_FEATURE_RDRAND) ? rdrand_int(v) : false;
}
-static inline bool arch_get_random_seed_long(unsigned long *v)
+static inline bool __must_check arch_get_random_seed_long(unsigned long *v)
{
- return arch_has_random_seed() ? rdseed_long(v) : false;
+ return static_cpu_has(X86_FEATURE_RDSEED) ? rdseed_long(v) : false;
}
-static inline bool arch_get_random_seed_int(unsigned int *v)
+static inline bool __must_check arch_get_random_seed_int(unsigned int *v)
{
- return arch_has_random_seed() ? rdseed_int(v) : false;
+ return static_cpu_has(X86_FEATURE_RDSEED) ? rdseed_int(v) : false;
}
extern void x86_init_rdrand(struct cpuinfo_x86 *c);
diff --git a/arch/x86/include/asm/bugs.h b/arch/x86/include/asm/bugs.h
index 794eb2129bc6..92ae28389940 100644
--- a/arch/x86/include/asm/bugs.h
+++ b/arch/x86/include/asm/bugs.h
@@ -6,12 +6,6 @@
extern void check_bugs(void);
-#if defined(CONFIG_CPU_SUP_INTEL)
-void check_mpx_erratum(struct cpuinfo_x86 *c);
-#else
-static inline void check_mpx_erratum(struct cpuinfo_x86 *c) {}
-#endif
-
#if defined(CONFIG_CPU_SUP_INTEL) && defined(CONFIG_X86_32)
int ppro_with_ram_bug(void);
#else
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 22c4dfe65992..52e9f3480f69 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -177,23 +177,6 @@ typedef struct user_regs_struct compat_elf_gregset_t;
(!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT))
#endif
-/*
- * A pointer passed in from user mode. This should not
- * be used for syscall parameters, just declare them
- * as pointers because the syscall entry code will have
- * appropriately converted them already.
- */
-
-static inline void __user *compat_ptr(compat_uptr_t uptr)
-{
- return (void __user *)(unsigned long)uptr;
-}
-
-static inline compat_uptr_t ptr_to_compat(void __user *uptr)
-{
- return (u32)(unsigned long)uptr;
-}
-
static inline void __user *arch_compat_alloc_user_space(long len)
{
compat_uptr_t sp;
diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
index 804734058c77..02c0078d3787 100644
--- a/arch/x86/include/asm/cpu_entry_area.h
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -6,6 +6,7 @@
#include <linux/percpu-defs.h>
#include <asm/processor.h>
#include <asm/intel_ds.h>
+#include <asm/pgtable_areas.h>
#ifdef CONFIG_X86_64
@@ -134,15 +135,6 @@ DECLARE_PER_CPU(struct cea_exception_stacks *, cea_exception_stacks);
extern void setup_cpu_entry_areas(void);
extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags);
-/* Single page reserved for the readonly IDT mapping: */
-#define CPU_ENTRY_AREA_RO_IDT CPU_ENTRY_AREA_BASE
-#define CPU_ENTRY_AREA_PER_CPU (CPU_ENTRY_AREA_RO_IDT + PAGE_SIZE)
-
-#define CPU_ENTRY_AREA_RO_IDT_VADDR ((void *)CPU_ENTRY_AREA_RO_IDT)
-
-#define CPU_ENTRY_AREA_MAP_SIZE \
- (CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_ARRAY_SIZE - CPU_ENTRY_AREA_BASE)
-
extern struct cpu_entry_area *get_cpu_entry_area(int cpu);
static inline struct entry_stack *cpu_entry_stack(int cpu)
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index e9b62498fe75..f3327cb56edf 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -220,6 +220,7 @@
#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */
#define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */
#define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */
+#define X86_FEATURE_MSR_IA32_FEAT_CTL ( 7*32+31) /* "" MSR IA32_FEAT_CTL configured */
/* Virtualization flags: Linux defined, word 8 */
#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
@@ -357,6 +358,7 @@
/* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */
#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */
#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */
+#define X86_FEATURE_FSRM (18*32+ 4) /* Fast Short Rep Mov */
#define X86_FEATURE_AVX512_VP2INTERSECT (18*32+ 8) /* AVX-512 Intersect for D/Q */
#define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */
#define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */
diff --git a/arch/x86/include/asm/crypto/camellia.h b/arch/x86/include/asm/crypto/camellia.h
index a5d86fc0593f..f6d91861cb14 100644
--- a/arch/x86/include/asm/crypto/camellia.h
+++ b/arch/x86/include/asm/crypto/camellia.h
@@ -26,71 +26,66 @@ struct camellia_xts_ctx {
extern int __camellia_setkey(struct camellia_ctx *cctx,
const unsigned char *key,
- unsigned int key_len, u32 *flags);
+ unsigned int key_len);
extern int xts_camellia_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int keylen);
/* regular block cipher functions */
-asmlinkage void __camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
- const u8 *src, bool xor);
-asmlinkage void camellia_dec_blk(struct camellia_ctx *ctx, u8 *dst,
- const u8 *src);
+asmlinkage void __camellia_enc_blk(const void *ctx, u8 *dst, const u8 *src,
+ bool xor);
+asmlinkage void camellia_dec_blk(const void *ctx, u8 *dst, const u8 *src);
/* 2-way parallel cipher functions */
-asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
- const u8 *src, bool xor);
-asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst,
- const u8 *src);
+asmlinkage void __camellia_enc_blk_2way(const void *ctx, u8 *dst, const u8 *src,
+ bool xor);
+asmlinkage void camellia_dec_blk_2way(const void *ctx, u8 *dst, const u8 *src);
/* 16-way parallel cipher functions (avx/aes-ni) */
-asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst,
- const u8 *src);
-asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst,
- const u8 *src);
-
-asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst,
- const u8 *src);
-asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst,
- const u8 *src, le128 *iv);
-
-asmlinkage void camellia_xts_enc_16way(struct camellia_ctx *ctx, u8 *dst,
- const u8 *src, le128 *iv);
-asmlinkage void camellia_xts_dec_16way(struct camellia_ctx *ctx, u8 *dst,
- const u8 *src, le128 *iv);
-
-static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
- const u8 *src)
+asmlinkage void camellia_ecb_enc_16way(const void *ctx, u8 *dst, const u8 *src);
+asmlinkage void camellia_ecb_dec_16way(const void *ctx, u8 *dst, const u8 *src);
+
+asmlinkage void camellia_cbc_dec_16way(const void *ctx, u8 *dst, const u8 *src);
+asmlinkage void camellia_ctr_16way(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
+
+asmlinkage void camellia_xts_enc_16way(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
+asmlinkage void camellia_xts_dec_16way(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
+
+static inline void camellia_enc_blk(const void *ctx, u8 *dst, const u8 *src)
{
__camellia_enc_blk(ctx, dst, src, false);
}
-static inline void camellia_enc_blk_xor(struct camellia_ctx *ctx, u8 *dst,
- const u8 *src)
+static inline void camellia_enc_blk_xor(const void *ctx, u8 *dst, const u8 *src)
{
__camellia_enc_blk(ctx, dst, src, true);
}
-static inline void camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
+static inline void camellia_enc_blk_2way(const void *ctx, u8 *dst,
const u8 *src)
{
__camellia_enc_blk_2way(ctx, dst, src, false);
}
-static inline void camellia_enc_blk_xor_2way(struct camellia_ctx *ctx, u8 *dst,
+static inline void camellia_enc_blk_xor_2way(const void *ctx, u8 *dst,
const u8 *src)
{
__camellia_enc_blk_2way(ctx, dst, src, true);
}
/* glue helpers */
-extern void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src);
-extern void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src,
+extern void camellia_decrypt_cbc_2way(const void *ctx, u8 *dst, const u8 *src);
+extern void camellia_crypt_ctr(const void *ctx, u8 *dst, const u8 *src,
le128 *iv);
-extern void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src,
+extern void camellia_crypt_ctr_2way(const void *ctx, u8 *dst, const u8 *src,
le128 *iv);
-extern void camellia_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv);
-extern void camellia_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv);
+extern void camellia_xts_enc(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
+extern void camellia_xts_dec(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
#endif /* ASM_X86_CAMELLIA_H */
diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h
index 8d4a8e1226ee..777c0f63418c 100644
--- a/arch/x86/include/asm/crypto/glue_helper.h
+++ b/arch/x86/include/asm/crypto/glue_helper.h
@@ -11,18 +11,13 @@
#include <asm/fpu/api.h>
#include <crypto/b128ops.h>
-typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src);
-typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, const u128 *src);
-typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, const u128 *src,
+typedef void (*common_glue_func_t)(const void *ctx, u8 *dst, const u8 *src);
+typedef void (*common_glue_cbc_func_t)(const void *ctx, u8 *dst, const u8 *src);
+typedef void (*common_glue_ctr_func_t)(const void *ctx, u8 *dst, const u8 *src,
le128 *iv);
-typedef void (*common_glue_xts_func_t)(void *ctx, u128 *dst, const u128 *src,
+typedef void (*common_glue_xts_func_t)(const void *ctx, u8 *dst, const u8 *src,
le128 *iv);
-#define GLUE_FUNC_CAST(fn) ((common_glue_func_t)(fn))
-#define GLUE_CBC_FUNC_CAST(fn) ((common_glue_cbc_func_t)(fn))
-#define GLUE_CTR_FUNC_CAST(fn) ((common_glue_ctr_func_t)(fn))
-#define GLUE_XTS_FUNC_CAST(fn) ((common_glue_xts_func_t)(fn))
-
struct common_glue_func_entry {
unsigned int num_blocks; /* number of blocks that @fn will process */
union {
@@ -116,7 +111,8 @@ extern int glue_xts_req_128bit(const struct common_glue_ctx *gctx,
common_glue_func_t tweak_fn, void *tweak_ctx,
void *crypt_ctx, bool decrypt);
-extern void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src,
- le128 *iv, common_glue_func_t fn);
+extern void glue_xts_crypt_128bit_one(const void *ctx, u8 *dst,
+ const u8 *src, le128 *iv,
+ common_glue_func_t fn);
#endif /* _CRYPTO_GLUE_HELPER_H */
diff --git a/arch/x86/include/asm/crypto/serpent-avx.h b/arch/x86/include/asm/crypto/serpent-avx.h
index db7c9cc32234..251c2c89d7cf 100644
--- a/arch/x86/include/asm/crypto/serpent-avx.h
+++ b/arch/x86/include/asm/crypto/serpent-avx.h
@@ -15,26 +15,26 @@ struct serpent_xts_ctx {
struct serpent_ctx crypt_ctx;
};
-asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+asmlinkage void serpent_ecb_enc_8way_avx(const void *ctx, u8 *dst,
const u8 *src);
-asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+asmlinkage void serpent_ecb_dec_8way_avx(const void *ctx, u8 *dst,
const u8 *src);
-asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+asmlinkage void serpent_cbc_dec_8way_avx(const void *ctx, u8 *dst,
const u8 *src);
-asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst,
- const u8 *src, le128 *iv);
+asmlinkage void serpent_ctr_8way_avx(const void *ctx, u8 *dst, const u8 *src,
+ le128 *iv);
-asmlinkage void serpent_xts_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+asmlinkage void serpent_xts_enc_8way_avx(const void *ctx, u8 *dst,
const u8 *src, le128 *iv);
-asmlinkage void serpent_xts_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
+asmlinkage void serpent_xts_dec_8way_avx(const void *ctx, u8 *dst,
const u8 *src, le128 *iv);
-extern void __serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src,
+extern void __serpent_crypt_ctr(const void *ctx, u8 *dst, const u8 *src,
le128 *iv);
-extern void serpent_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv);
-extern void serpent_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv);
+extern void serpent_xts_enc(const void *ctx, u8 *dst, const u8 *src, le128 *iv);
+extern void serpent_xts_dec(const void *ctx, u8 *dst, const u8 *src, le128 *iv);
extern int xts_serpent_setkey(struct crypto_skcipher *tfm, const u8 *key,
unsigned int keylen);
diff --git a/arch/x86/include/asm/crypto/serpent-sse2.h b/arch/x86/include/asm/crypto/serpent-sse2.h
index 1a345e8a7496..860ca248914b 100644
--- a/arch/x86/include/asm/crypto/serpent-sse2.h
+++ b/arch/x86/include/asm/crypto/serpent-sse2.h
@@ -9,25 +9,23 @@
#define SERPENT_PARALLEL_BLOCKS 4
-asmlinkage void __serpent_enc_blk_4way(struct serpent_ctx *ctx, u8 *dst,
+asmlinkage void __serpent_enc_blk_4way(const struct serpent_ctx *ctx, u8 *dst,
const u8 *src, bool xor);
-asmlinkage void serpent_dec_blk_4way(struct serpent_ctx *ctx, u8 *dst,
+asmlinkage void serpent_dec_blk_4way(const struct serpent_ctx *ctx, u8 *dst,
const u8 *src);
-static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst,
- const u8 *src)
+static inline void serpent_enc_blk_xway(const void *ctx, u8 *dst, const u8 *src)
{
__serpent_enc_blk_4way(ctx, dst, src, false);
}
-static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst,
- const u8 *src)
+static inline void serpent_enc_blk_xway_xor(const struct serpent_ctx *ctx,
+ u8 *dst, const u8 *src)
{
__serpent_enc_blk_4way(ctx, dst, src, true);
}
-static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst,
- const u8 *src)
+static inline void serpent_dec_blk_xway(const void *ctx, u8 *dst, const u8 *src)
{
serpent_dec_blk_4way(ctx, dst, src);
}
@@ -36,25 +34,23 @@ static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst,
#define SERPENT_PARALLEL_BLOCKS 8
-asmlinkage void __serpent_enc_blk_8way(struct serpent_ctx *ctx, u8 *dst,
+asmlinkage void __serpent_enc_blk_8way(const struct serpent_ctx *ctx, u8 *dst,
const u8 *src, bool xor);
-asmlinkage void serpent_dec_blk_8way(struct serpent_ctx *ctx, u8 *dst,
+asmlinkage void serpent_dec_blk_8way(const struct serpent_ctx *ctx, u8 *dst,
const u8 *src);
-static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst,
- const u8 *src)
+static inline void serpent_enc_blk_xway(const void *ctx, u8 *dst, const u8 *src)
{
__serpent_enc_blk_8way(ctx, dst, src, false);
}
-static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst,
- const u8 *src)
+static inline void serpent_enc_blk_xway_xor(const struct serpent_ctx *ctx,
+ u8 *dst, const u8 *src)
{
__serpent_enc_blk_8way(ctx, dst, src, true);
}
-static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst,
- const u8 *src)
+static inline void serpent_dec_blk_xway(const void *ctx, u8 *dst, const u8 *src)
{
serpent_dec_blk_8way(ctx, dst, src);
}
diff --git a/arch/x86/include/asm/crypto/twofish.h b/arch/x86/include/asm/crypto/twofish.h
index f618bf272b90..2c377a8042e1 100644
--- a/arch/x86/include/asm/crypto/twofish.h
+++ b/arch/x86/include/asm/crypto/twofish.h
@@ -7,22 +7,19 @@
#include <crypto/b128ops.h>
/* regular block cipher functions from twofish_x86_64 module */
-asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst,
- const u8 *src);
-asmlinkage void twofish_dec_blk(struct twofish_ctx *ctx, u8 *dst,
- const u8 *src);
+asmlinkage void twofish_enc_blk(const void *ctx, u8 *dst, const u8 *src);
+asmlinkage void twofish_dec_blk(const void *ctx, u8 *dst, const u8 *src);
/* 3-way parallel cipher functions */
-asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
- const u8 *src, bool xor);
-asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst,
- const u8 *src);
+asmlinkage void __twofish_enc_blk_3way(const void *ctx, u8 *dst, const u8 *src,
+ bool xor);
+asmlinkage void twofish_dec_blk_3way(const void *ctx, u8 *dst, const u8 *src);
/* helpers from twofish_x86_64-3way module */
-extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src);
-extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src,
+extern void twofish_dec_blk_cbc_3way(const void *ctx, u8 *dst, const u8 *src);
+extern void twofish_enc_blk_ctr(const void *ctx, u8 *dst, const u8 *src,
le128 *iv);
-extern void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src,
+extern void twofish_enc_blk_ctr_3way(const void *ctx, u8 *dst, const u8 *src,
le128 *iv);
#endif /* ASM_X86_TWOFISH_H */
diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h
index 5e12c63b47aa..7e31f7f1bb06 100644
--- a/arch/x86/include/asm/device.h
+++ b/arch/x86/include/asm/device.h
@@ -8,16 +8,6 @@ struct dev_archdata {
#endif
};
-#if defined(CONFIG_X86_DEV_DMA_OPS) && defined(CONFIG_PCI_DOMAINS)
-struct dma_domain {
- struct list_head node;
- const struct dma_map_ops *dma_ops;
- int domain_nr;
-};
-void add_dma_domain(struct dma_domain *domain);
-void del_dma_domain(struct dma_domain *domain);
-#endif
-
struct pdev_archdata {
};
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index 8e1d0bb46361..4ea8584682f9 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -10,12 +10,6 @@
* cpu_feature_enabled().
*/
-#ifdef CONFIG_X86_INTEL_MPX
-# define DISABLE_MPX 0
-#else
-# define DISABLE_MPX (1<<(X86_FEATURE_MPX & 31))
-#endif
-
#ifdef CONFIG_X86_SMAP
# define DISABLE_SMAP 0
#else
@@ -74,7 +68,7 @@
#define DISABLED_MASK6 0
#define DISABLED_MASK7 (DISABLE_PTI)
#define DISABLED_MASK8 0
-#define DISABLED_MASK9 (DISABLE_MPX|DISABLE_SMAP)
+#define DISABLED_MASK9 (DISABLE_SMAP)
#define DISABLED_MASK10 0
#define DISABLED_MASK11 0
#define DISABLED_MASK12 0
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index d028e9acdf1c..86169a24b0d8 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -8,6 +8,7 @@
#include <asm/tlb.h>
#include <asm/nospec-branch.h>
#include <asm/mmu_context.h>
+#include <linux/build_bug.h>
/*
* We map the EFI regions needed for runtime services non-contiguously,
@@ -19,13 +20,16 @@
* This is the main reason why we're doing stable VA mappings for RT
* services.
*
- * This flag is used in conjunction with a chicken bit called
- * "efi=old_map" which can be used as a fallback to the old runtime
- * services mapping method in case there's some b0rkage with a
- * particular EFI implementation (haha, it is hard to hold up the
- * sarcasm here...).
+ * SGI UV1 machines are known to be incompatible with this scheme, so we
+ * provide an opt-out for these machines via a DMI quirk that sets the
+ * attribute below.
*/
-#define EFI_OLD_MEMMAP EFI_ARCH_1
+#define EFI_UV1_MEMMAP EFI_ARCH_1
+
+static inline bool efi_have_uv1_memmap(void)
+{
+ return IS_ENABLED(CONFIG_X86_UV) && efi_enabled(EFI_UV1_MEMMAP);
+}
#define EFI32_LOADER_SIGNATURE "EL32"
#define EFI64_LOADER_SIGNATURE "EL64"
@@ -34,10 +38,46 @@
#define ARCH_EFI_IRQ_FLAGS_MASK X86_EFLAGS_IF
-#ifdef CONFIG_X86_32
+/*
+ * The EFI services are called through variadic functions in many cases. These
+ * functions are implemented in assembler and support only a fixed number of
+ * arguments. The macros below allows us to check at build time that we don't
+ * try to call them with too many arguments.
+ *
+ * __efi_nargs() will return the number of arguments if it is 7 or less, and
+ * cause a BUILD_BUG otherwise. The limitations of the C preprocessor make it
+ * impossible to calculate the exact number of arguments beyond some
+ * pre-defined limit. The maximum number of arguments currently supported by
+ * any of the thunks is 7, so this is good enough for now and can be extended
+ * in the obvious way if we ever need more.
+ */
-extern asmlinkage unsigned long efi_call_phys(void *, ...);
+#define __efi_nargs(...) __efi_nargs_(__VA_ARGS__)
+#define __efi_nargs_(...) __efi_nargs__(0, ##__VA_ARGS__, \
+ __efi_arg_sentinel(7), __efi_arg_sentinel(6), \
+ __efi_arg_sentinel(5), __efi_arg_sentinel(4), \
+ __efi_arg_sentinel(3), __efi_arg_sentinel(2), \
+ __efi_arg_sentinel(1), __efi_arg_sentinel(0))
+#define __efi_nargs__(_0, _1, _2, _3, _4, _5, _6, _7, n, ...) \
+ __take_second_arg(n, \
+ ({ BUILD_BUG_ON_MSG(1, "__efi_nargs limit exceeded"); 8; }))
+#define __efi_arg_sentinel(n) , n
+/*
+ * __efi_nargs_check(f, n, ...) will cause a BUILD_BUG if the ellipsis
+ * represents more than n arguments.
+ */
+
+#define __efi_nargs_check(f, n, ...) \
+ __efi_nargs_check_(f, __efi_nargs(__VA_ARGS__), n)
+#define __efi_nargs_check_(f, p, n) __efi_nargs_check__(f, p, n)
+#define __efi_nargs_check__(f, p, n) ({ \
+ BUILD_BUG_ON_MSG( \
+ (p) > (n), \
+ #f " called with too many arguments (" #p ">" #n ")"); \
+})
+
+#ifdef CONFIG_X86_32
#define arch_efi_call_virt_setup() \
({ \
kernel_fpu_begin(); \
@@ -51,13 +91,7 @@ extern asmlinkage unsigned long efi_call_phys(void *, ...);
})
-/*
- * Wrap all the virtual calls in a way that forces the parameters on the stack.
- */
-#define arch_efi_call_virt(p, f, args...) \
-({ \
- ((efi_##f##_t __attribute__((regparm(0)))*) p->f)(args); \
-})
+#define arch_efi_call_virt(p, f, args...) p->f(args)
#define efi_ioremap(addr, size, type, attr) ioremap_cache(addr, size)
@@ -65,9 +99,12 @@ extern asmlinkage unsigned long efi_call_phys(void *, ...);
#define EFI_LOADER_SIGNATURE "EL64"
-extern asmlinkage u64 efi_call(void *fp, ...);
+extern asmlinkage u64 __efi_call(void *fp, ...);
-#define efi_call_phys(f, args...) efi_call((f), args)
+#define efi_call(...) ({ \
+ __efi_nargs_check(efi_call, 7, __VA_ARGS__); \
+ __efi_call(__VA_ARGS__); \
+})
/*
* struct efi_scratch - Scratch space used while switching to/from efi_mm
@@ -85,7 +122,7 @@ struct efi_scratch {
kernel_fpu_begin(); \
firmware_restrict_branch_speculation_start(); \
\
- if (!efi_enabled(EFI_OLD_MEMMAP)) \
+ if (!efi_have_uv1_memmap()) \
efi_switch_mm(&efi_mm); \
})
@@ -94,7 +131,7 @@ struct efi_scratch {
#define arch_efi_call_virt_teardown() \
({ \
- if (!efi_enabled(EFI_OLD_MEMMAP)) \
+ if (!efi_have_uv1_memmap()) \
efi_switch_mm(efi_scratch.prev_mm); \
\
firmware_restrict_branch_speculation_end(); \
@@ -121,8 +158,6 @@ extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size,
extern struct efi_scratch efi_scratch;
extern void __init efi_set_executable(efi_memory_desc_t *md, bool executable);
extern int __init efi_memblock_x86_reserve_range(void);
-extern pgd_t * __init efi_call_phys_prolog(void);
-extern void __init efi_call_phys_epilog(pgd_t *save_pgd);
extern void __init efi_print_memmap(void);
extern void __init efi_memory_uc(u64 addr, unsigned long size);
extern void __init efi_map_region(efi_memory_desc_t *md);
@@ -140,6 +175,8 @@ extern void efi_delete_dummy_variable(void);
extern void efi_switch_mm(struct mm_struct *mm);
extern void efi_recover_from_page_fault(unsigned long phys_addr);
extern void efi_free_boot_services(void);
+extern pgd_t * __init efi_uv1_memmap_phys_prolog(void);
+extern void __init efi_uv1_memmap_phys_epilog(pgd_t *save_pgd);
struct efi_setup_data {
u64 fw_vendor;
@@ -152,93 +189,144 @@ struct efi_setup_data {
extern u64 efi_setup;
#ifdef CONFIG_EFI
+extern efi_status_t __efi64_thunk(u32, ...);
-static inline bool efi_is_native(void)
+#define efi64_thunk(...) ({ \
+ __efi_nargs_check(efi64_thunk, 6, __VA_ARGS__); \
+ __efi64_thunk(__VA_ARGS__); \
+})
+
+static inline bool efi_is_mixed(void)
{
- return IS_ENABLED(CONFIG_X86_64) == efi_enabled(EFI_64BIT);
+ if (!IS_ENABLED(CONFIG_EFI_MIXED))
+ return false;
+ return IS_ENABLED(CONFIG_X86_64) && !efi_enabled(EFI_64BIT);
}
static inline bool efi_runtime_supported(void)
{
- if (efi_is_native())
- return true;
-
- if (IS_ENABLED(CONFIG_EFI_MIXED) && !efi_enabled(EFI_OLD_MEMMAP))
+ if (IS_ENABLED(CONFIG_X86_64) == efi_enabled(EFI_64BIT))
return true;
- return false;
+ return IS_ENABLED(CONFIG_EFI_MIXED);
}
extern void parse_efi_setup(u64 phys_addr, u32 data_len);
extern void efifb_setup_from_dmi(struct screen_info *si, const char *opt);
-#ifdef CONFIG_EFI_MIXED
extern void efi_thunk_runtime_setup(void);
-extern efi_status_t efi_thunk_set_virtual_address_map(
- void *phys_set_virtual_address_map,
- unsigned long memory_map_size,
- unsigned long descriptor_size,
- u32 descriptor_version,
- efi_memory_desc_t *virtual_map);
-#else
-static inline void efi_thunk_runtime_setup(void) {}
-static inline efi_status_t efi_thunk_set_virtual_address_map(
- void *phys_set_virtual_address_map,
- unsigned long memory_map_size,
- unsigned long descriptor_size,
- u32 descriptor_version,
- efi_memory_desc_t *virtual_map)
-{
- return EFI_SUCCESS;
-}
-#endif /* CONFIG_EFI_MIXED */
-
+efi_status_t efi_set_virtual_address_map(unsigned long memory_map_size,
+ unsigned long descriptor_size,
+ u32 descriptor_version,
+ efi_memory_desc_t *virtual_map);
/* arch specific definitions used by the stub code */
-struct efi_config {
- u64 image_handle;
- u64 table;
- u64 runtime_services;
- u64 boot_services;
- u64 text_output;
- efi_status_t (*call)(unsigned long, ...);
- bool is64;
-} __packed;
-
-__pure const struct efi_config *__efi_early(void);
+__attribute_const__ bool efi_is_64bit(void);
-static inline bool efi_is_64bit(void)
+static inline bool efi_is_native(void)
{
if (!IS_ENABLED(CONFIG_X86_64))
- return false;
-
+ return true;
if (!IS_ENABLED(CONFIG_EFI_MIXED))
return true;
+ return efi_is_64bit();
+}
+
+#define efi_mixed_mode_cast(attr) \
+ __builtin_choose_expr( \
+ __builtin_types_compatible_p(u32, __typeof__(attr)), \
+ (unsigned long)(attr), (attr))
- return __efi_early()->is64;
+#define efi_table_attr(inst, attr) \
+ (efi_is_native() \
+ ? inst->attr \
+ : (__typeof__(inst->attr)) \
+ efi_mixed_mode_cast(inst->mixed_mode.attr))
+
+/*
+ * The following macros allow translating arguments if necessary from native to
+ * mixed mode. The use case for this is to initialize the upper 32 bits of
+ * output parameters, and where the 32-bit method requires a 64-bit argument,
+ * which must be split up into two arguments to be thunked properly.
+ *
+ * As examples, the AllocatePool boot service returns the address of the
+ * allocation, but it will not set the high 32 bits of the address. To ensure
+ * that the full 64-bit address is initialized, we zero-init the address before
+ * calling the thunk.
+ *
+ * The FreePages boot service takes a 64-bit physical address even in 32-bit
+ * mode. For the thunk to work correctly, a native 64-bit call of
+ * free_pages(addr, size)
+ * must be translated to
+ * efi64_thunk(free_pages, addr & U32_MAX, addr >> 32, size)
+ * so that the two 32-bit halves of addr get pushed onto the stack separately.
+ */
+
+static inline void *efi64_zero_upper(void *p)
+{
+ ((u32 *)p)[1] = 0;
+ return p;
}
-#define efi_table_attr(table, attr, instance) \
- (efi_is_64bit() ? \
- ((table##_64_t *)(unsigned long)instance)->attr : \
- ((table##_32_t *)(unsigned long)instance)->attr)
+#define __efi64_argmap_free_pages(addr, size) \
+ ((addr), 0, (size))
+
+#define __efi64_argmap_get_memory_map(mm_size, mm, key, size, ver) \
+ ((mm_size), (mm), efi64_zero_upper(key), efi64_zero_upper(size), (ver))
+
+#define __efi64_argmap_allocate_pool(type, size, buffer) \
+ ((type), (size), efi64_zero_upper(buffer))
-#define efi_call_proto(protocol, f, instance, ...) \
- __efi_early()->call(efi_table_attr(protocol, f, instance), \
- instance, ##__VA_ARGS__)
+#define __efi64_argmap_handle_protocol(handle, protocol, interface) \
+ ((handle), (protocol), efi64_zero_upper(interface))
-#define efi_call_early(f, ...) \
- __efi_early()->call(efi_table_attr(efi_boot_services, f, \
- __efi_early()->boot_services), __VA_ARGS__)
+#define __efi64_argmap_locate_protocol(protocol, reg, interface) \
+ ((protocol), (reg), efi64_zero_upper(interface))
-#define __efi_call_early(f, ...) \
- __efi_early()->call((unsigned long)f, __VA_ARGS__);
+/* PCI I/O */
+#define __efi64_argmap_get_location(protocol, seg, bus, dev, func) \
+ ((protocol), efi64_zero_upper(seg), efi64_zero_upper(bus), \
+ efi64_zero_upper(dev), efi64_zero_upper(func))
+
+/*
+ * The macros below handle the plumbing for the argument mapping. To add a
+ * mapping for a specific EFI method, simply define a macro
+ * __efi64_argmap_<method name>, following the examples above.
+ */
-#define efi_call_runtime(f, ...) \
- __efi_early()->call(efi_table_attr(efi_runtime_services, f, \
- __efi_early()->runtime_services), __VA_ARGS__)
+#define __efi64_thunk_map(inst, func, ...) \
+ efi64_thunk(inst->mixed_mode.func, \
+ __efi64_argmap(__efi64_argmap_ ## func(__VA_ARGS__), \
+ (__VA_ARGS__)))
+
+#define __efi64_argmap(mapped, args) \
+ __PASTE(__efi64_argmap__, __efi_nargs(__efi_eat mapped))(mapped, args)
+#define __efi64_argmap__0(mapped, args) __efi_eval mapped
+#define __efi64_argmap__1(mapped, args) __efi_eval args
+
+#define __efi_eat(...)
+#define __efi_eval(...) __VA_ARGS__
+
+/* The three macros below handle dispatching via the thunk if needed */
+
+#define efi_call_proto(inst, func, ...) \
+ (efi_is_native() \
+ ? inst->func(inst, ##__VA_ARGS__) \
+ : __efi64_thunk_map(inst, func, inst, ##__VA_ARGS__))
+
+#define efi_bs_call(func, ...) \
+ (efi_is_native() \
+ ? efi_system_table()->boottime->func(__VA_ARGS__) \
+ : __efi64_thunk_map(efi_table_attr(efi_system_table(), \
+ boottime), func, __VA_ARGS__))
+
+#define efi_rt_call(func, ...) \
+ (efi_is_native() \
+ ? efi_system_table()->runtime->func(__VA_ARGS__) \
+ : __efi64_thunk_map(efi_table_attr(efi_system_table(), \
+ runtime), func, __VA_ARGS__))
extern bool efi_reboot_required(void);
extern bool efi_is_table_address(unsigned long phys_addr);
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index c2a7458f912c..85be2f506272 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -47,8 +47,6 @@ struct dyn_arch_ftrace {
/* No extra data needed for x86 */
};
-int ftrace_int3_handler(struct pt_regs *regs);
-
#define FTRACE_GRAPH_TRAMP_ADDR FTRACE_GRAPH_ADDR
#endif /* CONFIG_DYNAMIC_FTRACE */
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index 5f10f7f2098d..92abc1e42bfc 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -809,7 +809,8 @@ union hv_synic_sint {
u64 reserved1:8;
u64 masked:1;
u64 auto_eoi:1;
- u64 reserved2:46;
+ u64 polling:1;
+ u64 reserved2:45;
} __packed;
};
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index c606c0b70738..4981c293f926 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -111,6 +111,7 @@
#define INTEL_FAM6_ATOM_TREMONT_D 0x86 /* Jacobsville */
#define INTEL_FAM6_ATOM_TREMONT 0x96 /* Elkhart Lake */
+#define INTEL_FAM6_ATOM_TREMONT_L 0x9C /* Jasper Lake */
/* Xeon Phi */
diff --git a/arch/x86/include/asm/intel_pmc_ipc.h b/arch/x86/include/asm/intel_pmc_ipc.h
index 9e7adcdbe031..e6da1ce26256 100644
--- a/arch/x86/include/asm/intel_pmc_ipc.h
+++ b/arch/x86/include/asm/intel_pmc_ipc.h
@@ -31,30 +31,13 @@
#if IS_ENABLED(CONFIG_INTEL_PMC_IPC)
-int intel_pmc_ipc_simple_command(int cmd, int sub);
-int intel_pmc_ipc_raw_cmd(u32 cmd, u32 sub, u8 *in, u32 inlen,
- u32 *out, u32 outlen, u32 dptr, u32 sptr);
int intel_pmc_ipc_command(u32 cmd, u32 sub, u8 *in, u32 inlen,
u32 *out, u32 outlen);
int intel_pmc_s0ix_counter_read(u64 *data);
-int intel_pmc_gcr_read(u32 offset, u32 *data);
int intel_pmc_gcr_read64(u32 offset, u64 *data);
-int intel_pmc_gcr_write(u32 offset, u32 data);
-int intel_pmc_gcr_update(u32 offset, u32 mask, u32 val);
#else
-static inline int intel_pmc_ipc_simple_command(int cmd, int sub)
-{
- return -EINVAL;
-}
-
-static inline int intel_pmc_ipc_raw_cmd(u32 cmd, u32 sub, u8 *in, u32 inlen,
- u32 *out, u32 outlen, u32 dptr, u32 sptr)
-{
- return -EINVAL;
-}
-
static inline int intel_pmc_ipc_command(u32 cmd, u32 sub, u8 *in, u32 inlen,
u32 *out, u32 outlen)
{
@@ -66,26 +49,11 @@ static inline int intel_pmc_s0ix_counter_read(u64 *data)
return -EINVAL;
}
-static inline int intel_pmc_gcr_read(u32 offset, u32 *data)
-{
- return -EINVAL;
-}
-
static inline int intel_pmc_gcr_read64(u32 offset, u64 *data)
{
return -EINVAL;
}
-static inline int intel_pmc_gcr_write(u32 offset, u32 data)
-{
- return -EINVAL;
-}
-
-static inline int intel_pmc_gcr_update(u32 offset, u32 mask, u32 val)
-{
- return -EINVAL;
-}
-
#endif /*CONFIG_INTEL_PMC_IPC*/
#endif
diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h
index 4a8c6e817398..2a1442ba6e78 100644
--- a/arch/x86/include/asm/intel_scu_ipc.h
+++ b/arch/x86/include/asm/intel_scu_ipc.h
@@ -22,24 +22,12 @@
/* Read single register */
int intel_scu_ipc_ioread8(u16 addr, u8 *data);
-/* Read two sequential registers */
-int intel_scu_ipc_ioread16(u16 addr, u16 *data);
-
-/* Read four sequential registers */
-int intel_scu_ipc_ioread32(u16 addr, u32 *data);
-
/* Read a vector */
int intel_scu_ipc_readv(u16 *addr, u8 *data, int len);
/* Write single register */
int intel_scu_ipc_iowrite8(u16 addr, u8 data);
-/* Write two sequential registers */
-int intel_scu_ipc_iowrite16(u16 addr, u16 data);
-
-/* Write four sequential registers */
-int intel_scu_ipc_iowrite32(u16 addr, u32 data);
-
/* Write a vector */
int intel_scu_ipc_writev(u16 *addr, u8 *data, int len);
@@ -50,14 +38,6 @@ int intel_scu_ipc_update_register(u16 addr, u8 data, u8 mask);
int intel_scu_ipc_simple_command(int cmd, int sub);
int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen,
u32 *out, int outlen);
-int intel_scu_ipc_raw_command(int cmd, int sub, u8 *in, int inlen,
- u32 *out, int outlen, u32 dptr, u32 sptr);
-
-/* I2C control api */
-int intel_scu_ipc_i2c_cntrl(u32 addr, u32 *data);
-
-/* Update FW version */
-int intel_scu_ipc_fw_update(u8 *buffer, u32 length);
extern struct blocking_notifier_head intel_scu_notifier;
diff --git a/arch/x86/include/asm/intel_telemetry.h b/arch/x86/include/asm/intel_telemetry.h
index 214394860632..2f77e31a1283 100644
--- a/arch/x86/include/asm/intel_telemetry.h
+++ b/arch/x86/include/asm/intel_telemetry.h
@@ -40,13 +40,10 @@ struct telemetry_evtmap {
struct telemetry_unit_config {
struct telemetry_evtmap *telem_evts;
void __iomem *regmap;
- u32 ssram_base_addr;
u8 ssram_evts_used;
u8 curr_period;
u8 max_period;
u8 min_period;
- u32 ssram_size;
-
};
struct telemetry_plt_config {
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 9997521fc5cd..e1aa17a468a8 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -399,4 +399,40 @@ extern bool arch_memremap_can_ram_remap(resource_size_t offset,
extern bool phys_mem_access_encrypted(unsigned long phys_addr,
unsigned long size);
+/**
+ * iosubmit_cmds512 - copy data to single MMIO location, in 512-bit units
+ * @__dst: destination, in MMIO space (must be 512-bit aligned)
+ * @src: source
+ * @count: number of 512 bits quantities to submit
+ *
+ * Submit data from kernel space to MMIO space, in units of 512 bits at a
+ * time. Order of access is not guaranteed, nor is a memory barrier
+ * performed afterwards.
+ *
+ * Warning: Do not use this helper unless your driver has checked that the CPU
+ * instruction is supported on the platform.
+ */
+static inline void iosubmit_cmds512(void __iomem *__dst, const void *src,
+ size_t count)
+{
+ /*
+ * Note that this isn't an "on-stack copy", just definition of "dst"
+ * as a pointer to 64-bytes of stuff that is going to be overwritten.
+ * In the MOVDIR64B case that may be needed as you can use the
+ * MOVDIR64B instruction to copy arbitrary memory around. This trick
+ * lets the compiler know how much gets clobbered.
+ */
+ volatile struct { char _[64]; } *dst = __dst;
+ const u8 *from = src;
+ const u8 *end = from + count * 64;
+
+ while (from < end) {
+ /* MOVDIR64B [rdx], rax */
+ asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02"
+ : "=m" (dst)
+ : "d" (from), "a" (dst));
+ from += 64;
+ }
+}
+
#endif /* _ASM_X86_IO_H */
diff --git a/arch/x86/include/asm/io_bitmap.h b/arch/x86/include/asm/io_bitmap.h
index 02c6ef8f7667..07344d82e88e 100644
--- a/arch/x86/include/asm/io_bitmap.h
+++ b/arch/x86/include/asm/io_bitmap.h
@@ -19,7 +19,14 @@ struct task_struct;
void io_bitmap_share(struct task_struct *tsk);
void io_bitmap_exit(void);
-void tss_update_io_bitmap(void);
+void native_tss_update_io_bitmap(void);
+
+#ifdef CONFIG_PARAVIRT_XXL
+#include <asm/paravirt.h>
+#else
+#define tss_update_io_bitmap native_tss_update_io_bitmap
+#endif
+
#else
static inline void io_bitmap_share(struct task_struct *tsk) { }
static inline void io_bitmap_exit(void) { }
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index 75f1e35e7c15..247ab14c6309 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -33,6 +33,7 @@ enum show_regs_mode {
};
extern void die(const char *, struct pt_regs *,long);
+void die_addr(const char *str, struct pt_regs *regs, long err, long gp_addr);
extern int __must_check __die(const char *, struct pt_regs *, long);
extern void show_stack_regs(struct pt_regs *regs);
extern void __show_regs(struct pt_regs *regs, enum show_regs_mode);
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 5dc909d9ad81..95b1f053bd96 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -11,12 +11,11 @@
#include <asm-generic/kprobes.h>
-#define BREAKPOINT_INSTRUCTION 0xcc
-
#ifdef CONFIG_KPROBES
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/percpu.h>
+#include <asm/text-patching.h>
#include <asm/insn.h>
#define __ARCH_WANT_KPROBES_INSN_SLOT
@@ -25,10 +24,7 @@ struct pt_regs;
struct kprobe;
typedef u8 kprobe_opcode_t;
-#define RELATIVEJUMP_OPCODE 0xe9
-#define RELATIVEJUMP_SIZE 5
-#define RELATIVECALL_OPCODE 0xe8
-#define RELATIVE_ADDR_SIZE 4
+
#define MAX_STACK_SIZE 64
#define CUR_STACK_SIZE(ADDR) \
(current_top_of_stack() - (unsigned long)(ADDR))
@@ -43,11 +39,11 @@ extern __visible kprobe_opcode_t optprobe_template_entry[];
extern __visible kprobe_opcode_t optprobe_template_val[];
extern __visible kprobe_opcode_t optprobe_template_call[];
extern __visible kprobe_opcode_t optprobe_template_end[];
-#define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + RELATIVE_ADDR_SIZE)
+#define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + DISP32_SIZE)
#define MAX_OPTINSN_SIZE \
(((unsigned long)optprobe_template_end - \
(unsigned long)optprobe_template_entry) + \
- MAX_OPTIMIZED_LENGTH + RELATIVEJUMP_SIZE)
+ MAX_OPTIMIZED_LENGTH + JMP32_INSN_SIZE)
extern const int kretprobe_blacklist_size;
@@ -73,7 +69,7 @@ struct arch_specific_insn {
struct arch_optimized_insn {
/* copy of the original instructions */
- kprobe_opcode_t copied_insn[RELATIVE_ADDR_SIZE];
+ kprobe_opcode_t copied_insn[DISP32_SIZE];
/* detour code buffer */
kprobe_opcode_t *insn;
/* the size of instructions copied to detour code buffer */
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 77cf6c11f66b..c06e8353efd3 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -222,6 +222,10 @@ struct x86_emulate_ops {
bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx,
u32 *ecx, u32 *edx, bool check_limit);
+ bool (*guest_has_long_mode)(struct x86_emulate_ctxt *ctxt);
+ bool (*guest_has_movbe)(struct x86_emulate_ctxt *ctxt);
+ bool (*guest_has_fxsr)(struct x86_emulate_ctxt *ctxt);
+
void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked);
unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt);
@@ -288,6 +292,14 @@ enum x86emul_mode {
#define X86EMUL_SMM_MASK (1 << 6)
#define X86EMUL_SMM_INSIDE_NMI_MASK (1 << 7)
+/*
+ * fastop functions are declared as taking a never-defined fastop parameter,
+ * so they can't be called from C directly.
+ */
+struct fastop;
+
+typedef void (*fastop_t)(struct fastop *);
+
struct x86_emulate_ctxt {
const struct x86_emulate_ops *ops;
@@ -320,7 +332,10 @@ struct x86_emulate_ctxt {
struct operand src;
struct operand src2;
struct operand dst;
- int (*execute)(struct x86_emulate_ctxt *ctxt);
+ union {
+ int (*execute)(struct x86_emulate_ctxt *ctxt);
+ fastop_t fop;
+ };
int (*check_perm)(struct x86_emulate_ctxt *ctxt);
/*
* The following six fields are cleared together,
@@ -345,7 +360,6 @@ struct x86_emulate_ctxt {
u64 d;
unsigned long _eip;
struct operand memop;
- /* Fields above regs are cleared together. */
unsigned long _regs[NR_VCPU_REGS];
struct operand *memopp;
struct fetch_cache fetch;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b79cd6aa4075..98959e8cd448 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -78,6 +78,8 @@
#define KVM_REQ_HV_STIMER KVM_ARCH_REQ(22)
#define KVM_REQ_LOAD_EOI_EXITMAP KVM_ARCH_REQ(23)
#define KVM_REQ_GET_VMCS12_PAGES KVM_ARCH_REQ(24)
+#define KVM_REQ_APICV_UPDATE \
+ KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define CR0_RESERVED_BITS \
(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
@@ -175,6 +177,11 @@ enum {
VCPU_SREG_LDTR,
};
+enum exit_fastpath_completion {
+ EXIT_FASTPATH_NONE,
+ EXIT_FASTPATH_SKIP_EMUL_INS,
+};
+
#include <asm/kvm_emulate.h>
#define KVM_NR_MEM_OBJS 40
@@ -378,12 +385,12 @@ struct kvm_mmu {
void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index);
- int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err,
+ int (*page_fault)(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 err,
bool prefault);
void (*inject_page_fault)(struct kvm_vcpu *vcpu,
struct x86_exception *fault);
- gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
- struct x86_exception *exception);
+ gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t gva_or_gpa,
+ u32 access, struct x86_exception *exception);
gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
struct x86_exception *exception);
int (*sync_page)(struct kvm_vcpu *vcpu,
@@ -606,7 +613,7 @@ struct kvm_vcpu_arch {
* Paging state of an L2 guest (used for nested npt)
*
* This context will save all necessary information to walk page tables
- * of the an L2 guest. This context is only initialized for page table
+ * of an L2 guest. This context is only initialized for page table
* walking and not for faulting since we never handle l2 page faults on
* the host.
*/
@@ -685,10 +692,10 @@ struct kvm_vcpu_arch {
bool pvclock_set_guest_stopped_request;
struct {
+ u8 preempted;
u64 msr_val;
u64 last_steal;
- struct gfn_to_hva_cache stime;
- struct kvm_steal_time steal;
+ struct gfn_to_pfn_cache cache;
} st;
u64 tsc_offset;
@@ -774,9 +781,19 @@ struct kvm_vcpu_arch {
u64 msr_kvm_poll_control;
/*
- * Indicate whether the access faults on its page table in guest
- * which is set when fix page fault and used to detect unhandeable
- * instruction.
+ * Indicates the guest is trying to write a gfn that contains one or
+ * more of the PTEs used to translate the write itself, i.e. the access
+ * is changing its own translation in the guest page tables. KVM exits
+ * to userspace if emulation of the faulting instruction fails and this
+ * flag is set, as KVM cannot make forward progress.
+ *
+ * If emulation fails for a write to guest page tables, KVM unprotects
+ * (zaps) the shadow page for the target gfn and resumes the guest to
+ * retry the non-emulatable instruction (on hardware). Unprotecting the
+ * gfn doesn't allow forward progress for a self-changing access because
+ * doing so also zaps the translation for the gfn, i.e. retrying the
+ * instruction will hit a !PRESENT fault, which results in a new shadow
+ * page and sends KVM back to square one.
*/
bool write_fault_to_shadow_pgtable;
@@ -868,6 +885,12 @@ enum kvm_irqchip_mode {
KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */
};
+#define APICV_INHIBIT_REASON_DISABLE 0
+#define APICV_INHIBIT_REASON_HYPERV 1
+#define APICV_INHIBIT_REASON_NESTED 2
+#define APICV_INHIBIT_REASON_IRQWIN 3
+#define APICV_INHIBIT_REASON_PIT_REINJ 4
+
struct kvm_arch {
unsigned long n_used_mmu_pages;
unsigned long n_requested_mmu_pages;
@@ -899,6 +922,7 @@ struct kvm_arch {
struct kvm_apic_map *apic_map;
bool apic_access_page_done;
+ unsigned long apicv_inhibit_reasons;
gpa_t wall_clock;
@@ -1022,6 +1046,11 @@ struct kvm_lapic_irq {
bool msi_redir_hint;
};
+static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical)
+{
+ return dest_mode_logical ? APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL;
+}
+
struct kvm_x86_ops {
int (*cpu_has_kvm_support)(void); /* __init */
int (*disabled_by_bios)(void); /* __init */
@@ -1040,7 +1069,7 @@ struct kvm_x86_ops {
void (*vm_destroy)(struct kvm *kvm);
/* Create, but do not attach this VCPU */
- struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
+ int (*vcpu_create)(struct kvm_vcpu *vcpu);
void (*vcpu_free)(struct kvm_vcpu *vcpu);
void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event);
@@ -1090,8 +1119,10 @@ struct kvm_x86_ops {
void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr);
void (*run)(struct kvm_vcpu *vcpu);
- int (*handle_exit)(struct kvm_vcpu *vcpu);
+ int (*handle_exit)(struct kvm_vcpu *vcpu,
+ enum exit_fastpath_completion exit_fastpath);
int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
+ void (*update_emulated_instruction)(struct kvm_vcpu *vcpu);
void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu);
void (*patch_hypercall)(struct kvm_vcpu *vcpu,
@@ -1107,7 +1138,8 @@ struct kvm_x86_ops {
void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
void (*enable_irq_window)(struct kvm_vcpu *vcpu);
void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
- bool (*get_enable_apicv)(struct kvm *kvm);
+ bool (*check_apicv_inhibit_reasons)(ulong bit);
+ void (*pre_update_apicv_exec_ctrl)(struct kvm *kvm, bool activate);
void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr);
@@ -1115,7 +1147,7 @@ struct kvm_x86_ops {
void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu);
void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
- void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
+ int (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr);
@@ -1140,11 +1172,13 @@ struct kvm_x86_ops {
int (*check_intercept)(struct kvm_vcpu *vcpu,
struct x86_instruction_info *info,
enum x86_intercept_stage stage);
- void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu);
+ void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu,
+ enum exit_fastpath_completion *exit_fastpath);
bool (*mpx_supported)(void);
bool (*xsaves_supported)(void);
bool (*umip_emulated)(void);
bool (*pt_supported)(void);
+ bool (*pku_supported)(void);
int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
void (*request_immediate_exit)(struct kvm_vcpu *vcpu);
@@ -1464,11 +1498,15 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception);
-void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu);
+bool kvm_apicv_activated(struct kvm *kvm);
+void kvm_apicv_init(struct kvm *kvm, bool enable);
+void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu);
+void kvm_request_apicv_update(struct kvm *kvm, bool activate,
+ unsigned long bit);
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code,
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
void *insn, int insn_len);
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
@@ -1614,7 +1652,6 @@ void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu);
int kvm_is_in_guest(void);
int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size);
-int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size);
bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu);
bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index dc2d4b206ab7..4359b955e0b7 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -144,7 +144,7 @@ struct mce_log_buffer {
enum mce_notifier_prios {
MCE_PRIO_FIRST = INT_MAX,
- MCE_PRIO_SRAO = INT_MAX - 1,
+ MCE_PRIO_UC = INT_MAX - 1,
MCE_PRIO_EXTLOG = INT_MAX - 2,
MCE_PRIO_NFIT = INT_MAX - 3,
MCE_PRIO_EDAC = INT_MAX - 4,
@@ -290,6 +290,7 @@ extern void apei_mce_report_mem_error(int corrected,
/* These may be used by multiple smca_hwid_mcatypes */
enum smca_bank_types {
SMCA_LS = 0, /* Load Store */
+ SMCA_LS_V2, /* Load Store */
SMCA_IF, /* Instruction Fetch */
SMCA_L2_CACHE, /* L2 Cache */
SMCA_DE, /* Decoder Unit */
diff --git a/arch/x86/include/asm/memtype.h b/arch/x86/include/asm/memtype.h
new file mode 100644
index 000000000000..9c2447b3555d
--- /dev/null
+++ b/arch/x86/include/asm/memtype.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_MEMTYPE_H
+#define _ASM_X86_MEMTYPE_H
+
+#include <linux/types.h>
+#include <asm/pgtable_types.h>
+
+extern bool pat_enabled(void);
+extern void pat_disable(const char *reason);
+extern void pat_init(void);
+extern void init_cache_modes(void);
+
+extern int memtype_reserve(u64 start, u64 end,
+ enum page_cache_mode req_pcm, enum page_cache_mode *ret_pcm);
+extern int memtype_free(u64 start, u64 end);
+
+extern int memtype_kernel_map_sync(u64 base, unsigned long size,
+ enum page_cache_mode pcm);
+
+extern int memtype_reserve_io(resource_size_t start, resource_size_t end,
+ enum page_cache_mode *pcm);
+
+extern void memtype_free_io(resource_size_t start, resource_size_t end);
+
+extern bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn);
+
+#endif /* _ASM_X86_MEMTYPE_H */
diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h
index 209492849566..6685e1218959 100644
--- a/arch/x86/include/asm/microcode_amd.h
+++ b/arch/x86/include/asm/microcode_amd.h
@@ -53,6 +53,6 @@ static inline void __init load_ucode_amd_bsp(unsigned int family) {}
static inline void load_ucode_amd_ap(unsigned int family) {}
static inline int __init
save_microcode_in_initrd_amd(unsigned int family) { return -EINVAL; }
-void reload_ucode_amd(void) {}
+static inline void reload_ucode_amd(void) {}
#endif
#endif /* _ASM_X86_MICROCODE_AMD_H */
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index e78c7db87801..bdeae9291e5c 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -50,10 +50,6 @@ typedef struct {
u16 pkey_allocation_map;
s16 execute_only_pkey;
#endif
-#ifdef CONFIG_X86_INTEL_MPX
- /* address of the bounds directory */
- void __user *bd_addr;
-#endif
} mm_context_t;
#define INIT_MM_CONTEXT(mm) \
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 5f33924e200f..b538d9ddee9c 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -12,7 +12,6 @@
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/paravirt.h>
-#include <asm/mpx.h>
#include <asm/debugreg.h>
extern atomic64_t last_mm_ctx_id;
@@ -69,14 +68,6 @@ struct ldt_struct {
int slot;
};
-/* This is a multiple of PAGE_SIZE. */
-#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
-
-static inline void *ldt_slot_va(int slot)
-{
- return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
-}
-
/*
* Used for LDT copy/destruction.
*/
@@ -99,87 +90,21 @@ static inline void destroy_context_ldt(struct mm_struct *mm) { }
static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }
#endif
-static inline void load_mm_ldt(struct mm_struct *mm)
-{
#ifdef CONFIG_MODIFY_LDT_SYSCALL
- struct ldt_struct *ldt;
-
- /* READ_ONCE synchronizes with smp_store_release */
- ldt = READ_ONCE(mm->context.ldt);
-
- /*
- * Any change to mm->context.ldt is followed by an IPI to all
- * CPUs with the mm active. The LDT will not be freed until
- * after the IPI is handled by all such CPUs. This means that,
- * if the ldt_struct changes before we return, the values we see
- * will be safe, and the new values will be loaded before we run
- * any user code.
- *
- * NB: don't try to convert this to use RCU without extreme care.
- * We would still need IRQs off, because we don't want to change
- * the local LDT after an IPI loaded a newer value than the one
- * that we can see.
- */
-
- if (unlikely(ldt)) {
- if (static_cpu_has(X86_FEATURE_PTI)) {
- if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
- /*
- * Whoops -- either the new LDT isn't mapped
- * (if slot == -1) or is mapped into a bogus
- * slot (if slot > 1).
- */
- clear_LDT();
- return;
- }
-
- /*
- * If page table isolation is enabled, ldt->entries
- * will not be mapped in the userspace pagetables.
- * Tell the CPU to access the LDT through the alias
- * at ldt_slot_va(ldt->slot).
- */
- set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
- } else {
- set_ldt(ldt->entries, ldt->nr_entries);
- }
- } else {
- clear_LDT();
- }
+extern void load_mm_ldt(struct mm_struct *mm);
+extern void switch_ldt(struct mm_struct *prev, struct mm_struct *next);
#else
+static inline void load_mm_ldt(struct mm_struct *mm)
+{
clear_LDT();
-#endif
}
-
static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
{
-#ifdef CONFIG_MODIFY_LDT_SYSCALL
- /*
- * Load the LDT if either the old or new mm had an LDT.
- *
- * An mm will never go from having an LDT to not having an LDT. Two
- * mms never share an LDT, so we don't gain anything by checking to
- * see whether the LDT changed. There's also no guarantee that
- * prev->context.ldt actually matches LDTR, but, if LDTR is non-NULL,
- * then prev->context.ldt will also be non-NULL.
- *
- * If we really cared, we could optimize the case where prev == next
- * and we're exiting lazy mode. Most of the time, if this happens,
- * we don't actually need to reload LDTR, but modify_ldt() is mostly
- * used by legacy code and emulators where we don't need this level of
- * performance.
- *
- * This uses | instead of || because it generates better code.
- */
- if (unlikely((unsigned long)prev->context.ldt |
- (unsigned long)next->context.ldt))
- load_mm_ldt(next);
-#endif
-
DEBUG_LOCKS_WARN_ON(preemptible());
}
+#endif
-void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
+extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
/*
* Init a new mm. Used on mm copies, like at fork()
@@ -274,34 +199,9 @@ static inline bool is_64bit_mm(struct mm_struct *mm)
}
#endif
-static inline void arch_bprm_mm_init(struct mm_struct *mm,
- struct vm_area_struct *vma)
-{
- mpx_mm_init(mm);
-}
-
static inline void arch_unmap(struct mm_struct *mm, unsigned long start,
unsigned long end)
{
- /*
- * mpx_notify_unmap() goes and reads a rarely-hot
- * cacheline in the mm_struct. That can be expensive
- * enough to be seen in profiles.
- *
- * The mpx_notify_unmap() call and its contents have been
- * observed to affect munmap() performance on hardware
- * where MPX is not present.
- *
- * The unlikely() optimizes for the fast case: no MPX
- * in the CPU, or no MPX use in the process. Even if
- * we get this wrong (in the unlikely event that MPX
- * is widely enabled on some system) the overhead of
- * MPX itself (reading bounds tables) is expected to
- * overwhelm the overhead of getting this unlikely()
- * consistently wrong.
- */
- if (unlikely(cpu_feature_enabled(X86_FEATURE_MPX)))
- mpx_notify_unmap(mm, start, end);
}
/*
diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h
deleted file mode 100644
index 143a5c193ed3..000000000000
--- a/arch/x86/include/asm/mpx.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_MPX_H
-#define _ASM_X86_MPX_H
-
-#include <linux/types.h>
-#include <linux/mm_types.h>
-
-#include <asm/ptrace.h>
-#include <asm/insn.h>
-
-/*
- * NULL is theoretically a valid place to put the bounds
- * directory, so point this at an invalid address.
- */
-#define MPX_INVALID_BOUNDS_DIR ((void __user *)-1)
-#define MPX_BNDCFG_ENABLE_FLAG 0x1
-#define MPX_BD_ENTRY_VALID_FLAG 0x1
-
-/*
- * The upper 28 bits [47:20] of the virtual address in 64-bit
- * are used to index into bounds directory (BD).
- *
- * The directory is 2G (2^31) in size, and with 8-byte entries
- * it has 2^28 entries.
- */
-#define MPX_BD_SIZE_BYTES_64 (1UL<<31)
-#define MPX_BD_ENTRY_BYTES_64 8
-#define MPX_BD_NR_ENTRIES_64 (MPX_BD_SIZE_BYTES_64/MPX_BD_ENTRY_BYTES_64)
-
-/*
- * The 32-bit directory is 4MB (2^22) in size, and with 4-byte
- * entries it has 2^20 entries.
- */
-#define MPX_BD_SIZE_BYTES_32 (1UL<<22)
-#define MPX_BD_ENTRY_BYTES_32 4
-#define MPX_BD_NR_ENTRIES_32 (MPX_BD_SIZE_BYTES_32/MPX_BD_ENTRY_BYTES_32)
-
-/*
- * A 64-bit table is 4MB total in size, and an entry is
- * 4 64-bit pointers in size.
- */
-#define MPX_BT_SIZE_BYTES_64 (1UL<<22)
-#define MPX_BT_ENTRY_BYTES_64 32
-#define MPX_BT_NR_ENTRIES_64 (MPX_BT_SIZE_BYTES_64/MPX_BT_ENTRY_BYTES_64)
-
-/*
- * A 32-bit table is 16kB total in size, and an entry is
- * 4 32-bit pointers in size.
- */
-#define MPX_BT_SIZE_BYTES_32 (1UL<<14)
-#define MPX_BT_ENTRY_BYTES_32 16
-#define MPX_BT_NR_ENTRIES_32 (MPX_BT_SIZE_BYTES_32/MPX_BT_ENTRY_BYTES_32)
-
-#define MPX_BNDSTA_TAIL 2
-#define MPX_BNDCFG_TAIL 12
-#define MPX_BNDSTA_ADDR_MASK (~((1UL<<MPX_BNDSTA_TAIL)-1))
-#define MPX_BNDCFG_ADDR_MASK (~((1UL<<MPX_BNDCFG_TAIL)-1))
-#define MPX_BNDSTA_ERROR_CODE 0x3
-
-struct mpx_fault_info {
- void __user *addr;
- void __user *lower;
- void __user *upper;
-};
-
-#ifdef CONFIG_X86_INTEL_MPX
-
-extern int mpx_fault_info(struct mpx_fault_info *info, struct pt_regs *regs);
-extern int mpx_handle_bd_fault(void);
-
-static inline int kernel_managing_mpx_tables(struct mm_struct *mm)
-{
- return (mm->context.bd_addr != MPX_INVALID_BOUNDS_DIR);
-}
-
-static inline void mpx_mm_init(struct mm_struct *mm)
-{
- /*
- * NULL is theoretically a valid place to put the bounds
- * directory, so point this at an invalid address.
- */
- mm->context.bd_addr = MPX_INVALID_BOUNDS_DIR;
-}
-
-extern void mpx_notify_unmap(struct mm_struct *mm, unsigned long start, unsigned long end);
-extern unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len, unsigned long flags);
-
-#else
-static inline int mpx_fault_info(struct mpx_fault_info *info, struct pt_regs *regs)
-{
- return -EINVAL;
-}
-static inline int mpx_handle_bd_fault(void)
-{
- return -EINVAL;
-}
-static inline int kernel_managing_mpx_tables(struct mm_struct *mm)
-{
- return 0;
-}
-static inline void mpx_mm_init(struct mm_struct *mm)
-{
-}
-static inline void mpx_notify_unmap(struct mm_struct *mm,
- unsigned long start, unsigned long end)
-{
-}
-
-static inline unsigned long mpx_unmapped_area_check(unsigned long addr,
- unsigned long len, unsigned long flags)
-{
- return addr;
-}
-#endif /* CONFIG_X86_INTEL_MPX */
-
-#endif /* _ASM_X86_MPX_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 084e98da04a7..d5e517d1c3dd 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -512,6 +512,8 @@
#define MSR_K7_HWCR 0xc0010015
#define MSR_K7_HWCR_SMMLOCK_BIT 0
#define MSR_K7_HWCR_SMMLOCK BIT_ULL(MSR_K7_HWCR_SMMLOCK_BIT)
+#define MSR_K7_HWCR_IRPERF_EN_BIT 30
+#define MSR_K7_HWCR_IRPERF_EN BIT_ULL(MSR_K7_HWCR_IRPERF_EN_BIT)
#define MSR_K7_FID_VID_CTL 0xc0010041
#define MSR_K7_FID_VID_STATUS 0xc0010042
@@ -558,7 +560,14 @@
#define MSR_IA32_EBL_CR_POWERON 0x0000002a
#define MSR_EBC_FREQUENCY_ID 0x0000002c
#define MSR_SMI_COUNT 0x00000034
-#define MSR_IA32_FEATURE_CONTROL 0x0000003a
+
+/* Referred to as IA32_FEATURE_CONTROL in Intel's SDM. */
+#define MSR_IA32_FEAT_CTL 0x0000003a
+#define FEAT_CTL_LOCKED BIT(0)
+#define FEAT_CTL_VMX_ENABLED_INSIDE_SMX BIT(1)
+#define FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX BIT(2)
+#define FEAT_CTL_LMCE_ENABLED BIT(20)
+
#define MSR_IA32_TSC_ADJUST 0x0000003b
#define MSR_IA32_BNDCFGS 0x00000d90
@@ -566,11 +575,6 @@
#define MSR_IA32_XSS 0x00000da0
-#define FEATURE_CONTROL_LOCKED (1<<0)
-#define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1)
-#define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2)
-#define FEATURE_CONTROL_LMCE (1<<20)
-
#define MSR_IA32_APICBASE 0x0000001b
#define MSR_IA32_APICBASE_BSP (1<<8)
#define MSR_IA32_APICBASE_ENABLE (1<<11)
diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h
index dbff1456d215..829df26fd7a3 100644
--- a/arch/x86/include/asm/mtrr.h
+++ b/arch/x86/include/asm/mtrr.h
@@ -24,7 +24,7 @@
#define _ASM_X86_MTRR_H
#include <uapi/asm/mtrr.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
/*
@@ -86,7 +86,7 @@ static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
}
static inline void mtrr_bp_init(void)
{
- pat_disable("MTRRs disabled, skipping PAT initialization too.");
+ pat_disable("PAT support disabled because CONFIG_MTRR is disabled in the kernel.");
}
#define mtrr_ap_init() do {} while (0)
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 75ded1d13d98..9d5d949e662e 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -41,7 +41,6 @@ struct nmiaction {
struct list_head list;
nmi_handler_t handler;
u64 max_duration;
- struct irq_work irq_work;
unsigned long flags;
const char *name;
};
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index 5c24a7b35166..07e95dcb40ad 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -37,7 +37,6 @@
*/
#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */
-#define RSB_FILL_LOOPS 16 /* To avoid underflow */
/*
* Google experimented with loop-unrolling and this turned out to be
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 86e7317eb31f..694d8daf4983 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -295,6 +295,13 @@ static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
PVOP_VCALL3(cpu.write_idt_entry, dt, entry, g);
}
+#ifdef CONFIG_X86_IOPL_IOPERM
+static inline void tss_update_io_bitmap(void)
+{
+ PVOP_VCALL0(cpu.update_io_bitmap);
+}
+#endif
+
static inline void paravirt_activate_mm(struct mm_struct *prev,
struct mm_struct *next)
{
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 84812964d3dd..732f62e04ddb 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -140,6 +140,10 @@ struct pv_cpu_ops {
void (*load_sp0)(unsigned long sp0);
+#ifdef CONFIG_X86_IOPL_IOPERM
+ void (*update_io_bitmap)(void);
+#endif
+
void (*wbinvd)(void);
/* cpuid emulation, mostly so that caps bits can be disabled */
diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h
deleted file mode 100644
index 92015c65fa2a..000000000000
--- a/arch/x86/include/asm/pat.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_PAT_H
-#define _ASM_X86_PAT_H
-
-#include <linux/types.h>
-#include <asm/pgtable_types.h>
-
-bool pat_enabled(void);
-void pat_disable(const char *reason);
-extern void pat_init(void);
-extern void init_cache_modes(void);
-
-extern int reserve_memtype(u64 start, u64 end,
- enum page_cache_mode req_pcm, enum page_cache_mode *ret_pcm);
-extern int free_memtype(u64 start, u64 end);
-
-extern int kernel_map_sync_memtype(u64 base, unsigned long size,
- enum page_cache_mode pcm);
-
-int io_reserve_memtype(resource_size_t start, resource_size_t end,
- enum page_cache_mode *pcm);
-
-void io_free_memtype(resource_size_t start, resource_size_t end);
-
-bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn);
-
-#endif /* _ASM_X86_PAT_H */
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 90d0731fdcb6..7ccb338507e3 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -9,7 +9,7 @@
#include <linux/scatterlist.h>
#include <linux/numa.h>
#include <asm/io.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include <asm/x86_init.h>
struct pci_sysdata {
@@ -25,7 +25,7 @@ struct pci_sysdata {
void *fwnode; /* IRQ domain for MSI assignment */
#endif
#if IS_ENABLED(CONFIG_VMD)
- bool vmd_domain; /* True if in Intel VMD domain */
+ struct pci_dev *vmd_dev; /* VMD Device if in Intel VMD domain */
#endif
};
@@ -33,14 +33,17 @@ extern int pci_routeirq;
extern int noioapicquirk;
extern int noioapicreroute;
+static inline struct pci_sysdata *to_pci_sysdata(const struct pci_bus *bus)
+{
+ return bus->sysdata;
+}
+
#ifdef CONFIG_PCI
#ifdef CONFIG_PCI_DOMAINS
static inline int pci_domain_nr(struct pci_bus *bus)
{
- struct pci_sysdata *sd = bus->sysdata;
-
- return sd->domain;
+ return to_pci_sysdata(bus)->domain;
}
static inline int pci_proc_domain(struct pci_bus *bus)
@@ -52,24 +55,20 @@ static inline int pci_proc_domain(struct pci_bus *bus)
#ifdef CONFIG_PCI_MSI_IRQ_DOMAIN
static inline void *_pci_root_bus_fwnode(struct pci_bus *bus)
{
- struct pci_sysdata *sd = bus->sysdata;
-
- return sd->fwnode;
+ return to_pci_sysdata(bus)->fwnode;
}
#define pci_root_bus_fwnode _pci_root_bus_fwnode
#endif
+#if IS_ENABLED(CONFIG_VMD)
static inline bool is_vmd(struct pci_bus *bus)
{
-#if IS_ENABLED(CONFIG_VMD)
- struct pci_sysdata *sd = bus->sysdata;
-
- return sd->vmd_domain;
-#else
- return false;
-#endif
+ return to_pci_sysdata(bus)->vmd_dev != NULL;
}
+#else
+#define is_vmd(bus) false
+#endif /* CONFIG_VMD */
/* Can be used to override the logic in pci_scan_bus for skipping
already-configured bus numbers - to be used for buggy BIOSes
@@ -124,9 +123,7 @@ void native_restore_msi_irqs(struct pci_dev *dev);
/* Returns the node based on pci bus */
static inline int __pcibus_to_node(const struct pci_bus *bus)
{
- const struct pci_sysdata *sd = bus->sysdata;
-
- return sd->node;
+ return to_pci_sysdata(bus)->node;
}
static inline const struct cpumask *
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index ee26e9215f18..29964b0e1075 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -322,17 +322,10 @@ struct perf_guest_switch_msr {
u64 host, guest;
};
-extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap);
extern void perf_check_microcode(void);
extern int x86_perf_rdpmc_index(struct perf_event *event);
#else
-static inline struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
-{
- *nr = 0;
- return NULL;
-}
-
static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
{
memset(cap, 0, sizeof(*cap));
@@ -342,8 +335,23 @@ static inline void perf_events_lapic_init(void) { }
static inline void perf_check_microcode(void) { }
#endif
+#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
+extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
+#else
+static inline struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
+{
+ *nr = 0;
+ return NULL;
+}
+#endif
+
#ifdef CONFIG_CPU_SUP_INTEL
extern void intel_pt_handle_vmx(int on);
+#else
+static inline void intel_pt_handle_vmx(int on)
+{
+
+}
#endif
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index ad97dc155195..7e118660bbd9 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -29,8 +29,9 @@
extern pgd_t early_top_pgt[PTRS_PER_PGD];
int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
-void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
-void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user);
+void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm);
+void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm,
+ bool user);
void ptdump_walk_pgd_level_checkwx(void);
void ptdump_walk_user_pgd_level_checkwx(void);
@@ -239,6 +240,7 @@ static inline unsigned long pgd_pfn(pgd_t pgd)
return (pgd_val(pgd) & PTE_PFN_MASK) >> PAGE_SHIFT;
}
+#define p4d_leaf p4d_large
static inline int p4d_large(p4d_t p4d)
{
/* No 512 GiB pages yet */
@@ -247,6 +249,7 @@ static inline int p4d_large(p4d_t p4d)
#define pte_page(pte) pfn_to_page(pte_pfn(pte))
+#define pmd_leaf pmd_large
static inline int pmd_large(pmd_t pte)
{
return pmd_flags(pte) & _PAGE_PSE;
@@ -874,6 +877,7 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address);
}
+#define pud_leaf pud_large
static inline int pud_large(pud_t pud)
{
return (pud_val(pud) & (_PAGE_PSE | _PAGE_PRESENT)) ==
@@ -885,6 +889,7 @@ static inline int pud_bad(pud_t pud)
return (pud_flags(pud) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0;
}
#else
+#define pud_leaf pud_large
static inline int pud_large(pud_t pud)
{
return 0;
@@ -1233,6 +1238,7 @@ static inline bool pgdp_maps_userspace(void *__ptr)
return (((ptr & ~PAGE_MASK) / sizeof(pgd_t)) < PGD_KERNEL_START);
}
+#define pgd_leaf pgd_large
static inline int pgd_large(pgd_t pgd) { return 0; }
#ifdef CONFIG_PAGE_TABLE_ISOLATION
diff --git a/arch/x86/include/asm/pgtable_32_areas.h b/arch/x86/include/asm/pgtable_32_areas.h
new file mode 100644
index 000000000000..b6355416a15a
--- /dev/null
+++ b/arch/x86/include/asm/pgtable_32_areas.h
@@ -0,0 +1,53 @@
+#ifndef _ASM_X86_PGTABLE_32_AREAS_H
+#define _ASM_X86_PGTABLE_32_AREAS_H
+
+#include <asm/cpu_entry_area.h>
+
+/*
+ * Just any arbitrary offset to the start of the vmalloc VM area: the
+ * current 8MB value just means that there will be a 8MB "hole" after the
+ * physical memory until the kernel virtual memory starts. That means that
+ * any out-of-bounds memory accesses will hopefully be caught.
+ * The vmalloc() routines leaves a hole of 4kB between each vmalloced
+ * area for the same reason. ;)
+ */
+#define VMALLOC_OFFSET (8 * 1024 * 1024)
+
+#ifndef __ASSEMBLY__
+extern bool __vmalloc_start_set; /* set once high_memory is set */
+#endif
+
+#define VMALLOC_START ((unsigned long)high_memory + VMALLOC_OFFSET)
+#ifdef CONFIG_X86_PAE
+#define LAST_PKMAP 512
+#else
+#define LAST_PKMAP 1024
+#endif
+
+#define CPU_ENTRY_AREA_PAGES (NR_CPUS * DIV_ROUND_UP(sizeof(struct cpu_entry_area), PAGE_SIZE))
+
+/* The +1 is for the readonly IDT page: */
+#define CPU_ENTRY_AREA_BASE \
+ ((FIXADDR_TOT_START - PAGE_SIZE*(CPU_ENTRY_AREA_PAGES+1)) & PMD_MASK)
+
+#define LDT_BASE_ADDR \
+ ((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK)
+
+#define LDT_END_ADDR (LDT_BASE_ADDR + PMD_SIZE)
+
+#define PKMAP_BASE \
+ ((LDT_BASE_ADDR - PAGE_SIZE) & PMD_MASK)
+
+#ifdef CONFIG_HIGHMEM
+# define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE)
+#else
+# define VMALLOC_END (LDT_BASE_ADDR - 2 * PAGE_SIZE)
+#endif
+
+#define MODULES_VADDR VMALLOC_START
+#define MODULES_END VMALLOC_END
+#define MODULES_LEN (MODULES_VADDR - MODULES_END)
+
+#define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE)
+
+#endif /* _ASM_X86_PGTABLE_32_AREAS_H */
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
index 0416d42e5bdd..5356a46b0373 100644
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_PGTABLE_32_DEFS_H
-#define _ASM_X86_PGTABLE_32_DEFS_H
+#ifndef _ASM_X86_PGTABLE_32_TYPES_H
+#define _ASM_X86_PGTABLE_32_TYPES_H
/*
* The Linux x86 paging architecture is 'compile-time dual-mode', it
@@ -20,55 +20,4 @@
#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
#define PGDIR_MASK (~(PGDIR_SIZE - 1))
-/* Just any arbitrary offset to the start of the vmalloc VM area: the
- * current 8MB value just means that there will be a 8MB "hole" after the
- * physical memory until the kernel virtual memory starts. That means that
- * any out-of-bounds memory accesses will hopefully be caught.
- * The vmalloc() routines leaves a hole of 4kB between each vmalloced
- * area for the same reason. ;)
- */
-#define VMALLOC_OFFSET (8 * 1024 * 1024)
-
-#ifndef __ASSEMBLY__
-extern bool __vmalloc_start_set; /* set once high_memory is set */
-#endif
-
-#define VMALLOC_START ((unsigned long)high_memory + VMALLOC_OFFSET)
-#ifdef CONFIG_X86_PAE
-#define LAST_PKMAP 512
-#else
-#define LAST_PKMAP 1024
-#endif
-
-/*
- * This is an upper bound on sizeof(struct cpu_entry_area) / PAGE_SIZE.
- * Define this here and validate with BUILD_BUG_ON() in cpu_entry_area.c
- * to avoid include recursion hell.
- */
-#define CPU_ENTRY_AREA_PAGES (NR_CPUS * 43)
-
-/* The +1 is for the readonly IDT page: */
-#define CPU_ENTRY_AREA_BASE \
- ((FIXADDR_TOT_START - PAGE_SIZE*(CPU_ENTRY_AREA_PAGES+1)) & PMD_MASK)
-
-#define LDT_BASE_ADDR \
- ((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK)
-
-#define LDT_END_ADDR (LDT_BASE_ADDR + PMD_SIZE)
-
-#define PKMAP_BASE \
- ((LDT_BASE_ADDR - PAGE_SIZE) & PMD_MASK)
-
-#ifdef CONFIG_HIGHMEM
-# define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE)
-#else
-# define VMALLOC_END (LDT_BASE_ADDR - 2 * PAGE_SIZE)
-#endif
-
-#define MODULES_VADDR VMALLOC_START
-#define MODULES_END VMALLOC_END
-#define MODULES_LEN (MODULES_VADDR - MODULES_END)
-
-#define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE)
-
-#endif /* _ASM_X86_PGTABLE_32_DEFS_H */
+#endif /* _ASM_X86_PGTABLE_32_TYPES_H */
diff --git a/arch/x86/include/asm/pgtable_areas.h b/arch/x86/include/asm/pgtable_areas.h
new file mode 100644
index 000000000000..d34cce1b995c
--- /dev/null
+++ b/arch/x86/include/asm/pgtable_areas.h
@@ -0,0 +1,16 @@
+#ifndef _ASM_X86_PGTABLE_AREAS_H
+#define _ASM_X86_PGTABLE_AREAS_H
+
+#ifdef CONFIG_X86_32
+# include <asm/pgtable_32_areas.h>
+#endif
+
+/* Single page reserved for the readonly IDT mapping: */
+#define CPU_ENTRY_AREA_RO_IDT CPU_ENTRY_AREA_BASE
+#define CPU_ENTRY_AREA_PER_CPU (CPU_ENTRY_AREA_RO_IDT + PAGE_SIZE)
+
+#define CPU_ENTRY_AREA_RO_IDT_VADDR ((void *)CPU_ENTRY_AREA_RO_IDT)
+
+#define CPU_ENTRY_AREA_MAP_SIZE (CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_ARRAY_SIZE - CPU_ENTRY_AREA_BASE)
+
+#endif /* _ASM_X86_PGTABLE_AREAS_H */
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index b5e49e6bac63..0239998d8cdc 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -110,11 +110,6 @@
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
-#define _PAGE_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |\
- _PAGE_ACCESSED | _PAGE_DIRTY)
-#define _KERNPG_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | \
- _PAGE_ACCESSED | _PAGE_DIRTY)
-
/*
* Set of bits not changed in pte_modify. The pte's
* protection key is treated like _PAGE_RW, for
@@ -136,80 +131,93 @@
*/
#ifndef __ASSEMBLY__
enum page_cache_mode {
- _PAGE_CACHE_MODE_WB = 0,
- _PAGE_CACHE_MODE_WC = 1,
+ _PAGE_CACHE_MODE_WB = 0,
+ _PAGE_CACHE_MODE_WC = 1,
_PAGE_CACHE_MODE_UC_MINUS = 2,
- _PAGE_CACHE_MODE_UC = 3,
- _PAGE_CACHE_MODE_WT = 4,
- _PAGE_CACHE_MODE_WP = 5,
- _PAGE_CACHE_MODE_NUM = 8
+ _PAGE_CACHE_MODE_UC = 3,
+ _PAGE_CACHE_MODE_WT = 4,
+ _PAGE_CACHE_MODE_WP = 5,
+
+ _PAGE_CACHE_MODE_NUM = 8
};
#endif
-#define _PAGE_CACHE_MASK (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)
-#define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC))
-#define _PAGE_CACHE_WP (cachemode2protval(_PAGE_CACHE_MODE_WP))
+#define _PAGE_ENC (_AT(pteval_t, sme_me_mask))
-#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
-#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
- _PAGE_ACCESSED | _PAGE_NX)
-
-#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \
- _PAGE_USER | _PAGE_ACCESSED)
-#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
- _PAGE_ACCESSED | _PAGE_NX)
-#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
- _PAGE_ACCESSED)
-#define PAGE_COPY PAGE_COPY_NOEXEC
-#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \
- _PAGE_ACCESSED | _PAGE_NX)
-#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
- _PAGE_ACCESSED)
-
-#define __PAGE_KERNEL_EXEC \
- (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_GLOBAL)
-#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
-
-#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
-#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
-#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_NOCACHE)
-#define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER)
-#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
-#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
-#define __PAGE_KERNEL_WP (__PAGE_KERNEL | _PAGE_CACHE_WP)
-
-#define __PAGE_KERNEL_IO (__PAGE_KERNEL)
-#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE)
+#define _PAGE_CACHE_MASK (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)
-#ifndef __ASSEMBLY__
+#define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC))
+#define _PAGE_CACHE_WP (cachemode2protval(_PAGE_CACHE_MODE_WP))
-#define _PAGE_ENC (_AT(pteval_t, sme_me_mask))
+#define __PP _PAGE_PRESENT
+#define __RW _PAGE_RW
+#define _USR _PAGE_USER
+#define ___A _PAGE_ACCESSED
+#define ___D _PAGE_DIRTY
+#define ___G _PAGE_GLOBAL
+#define __NX _PAGE_NX
+
+#define _ENC _PAGE_ENC
+#define __WP _PAGE_CACHE_WP
+#define __NC _PAGE_NOCACHE
+#define _PSE _PAGE_PSE
+
+#define pgprot_val(x) ((x).pgprot)
+#define __pgprot(x) ((pgprot_t) { (x) } )
+#define __pg(x) __pgprot(x)
+
+#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
+
+#define PAGE_NONE __pg( 0| 0| 0|___A| 0| 0| 0|___G)
+#define PAGE_SHARED __pg(__PP|__RW|_USR|___A|__NX| 0| 0| 0)
+#define PAGE_SHARED_EXEC __pg(__PP|__RW|_USR|___A| 0| 0| 0| 0)
+#define PAGE_COPY_NOEXEC __pg(__PP| 0|_USR|___A|__NX| 0| 0| 0)
+#define PAGE_COPY_EXEC __pg(__PP| 0|_USR|___A| 0| 0| 0| 0)
+#define PAGE_COPY __pg(__PP| 0|_USR|___A|__NX| 0| 0| 0)
+#define PAGE_READONLY __pg(__PP| 0|_USR|___A|__NX| 0| 0| 0)
+#define PAGE_READONLY_EXEC __pg(__PP| 0|_USR|___A| 0| 0| 0| 0)
+
+#define __PAGE_KERNEL (__PP|__RW| 0|___A|__NX|___D| 0|___G)
+#define __PAGE_KERNEL_EXEC (__PP|__RW| 0|___A| 0|___D| 0|___G)
+#define _KERNPG_TABLE_NOENC (__PP|__RW| 0|___A| 0|___D| 0| 0)
+#define _KERNPG_TABLE (__PP|__RW| 0|___A| 0|___D| 0| 0| _ENC)
+#define _PAGE_TABLE_NOENC (__PP|__RW|_USR|___A| 0|___D| 0| 0)
+#define _PAGE_TABLE (__PP|__RW|_USR|___A| 0|___D| 0| 0| _ENC)
+#define __PAGE_KERNEL_RO (__PP| 0| 0|___A|__NX|___D| 0|___G)
+#define __PAGE_KERNEL_RX (__PP| 0| 0|___A| 0|___D| 0|___G)
+#define __PAGE_KERNEL_NOCACHE (__PP|__RW| 0|___A|__NX|___D| 0|___G| __NC)
+#define __PAGE_KERNEL_VVAR (__PP| 0|_USR|___A|__NX|___D| 0|___G)
+#define __PAGE_KERNEL_LARGE (__PP|__RW| 0|___A|__NX|___D|_PSE|___G)
+#define __PAGE_KERNEL_LARGE_EXEC (__PP|__RW| 0|___A| 0|___D|_PSE|___G)
+#define __PAGE_KERNEL_WP (__PP|__RW| 0|___A|__NX|___D| 0|___G| __WP)
+
+
+#define __PAGE_KERNEL_IO __PAGE_KERNEL
+#define __PAGE_KERNEL_IO_NOCACHE __PAGE_KERNEL_NOCACHE
-#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
- _PAGE_DIRTY | _PAGE_ENC)
-#define _PAGE_TABLE (_KERNPG_TABLE | _PAGE_USER)
-#define __PAGE_KERNEL_ENC (__PAGE_KERNEL | _PAGE_ENC)
-#define __PAGE_KERNEL_ENC_WP (__PAGE_KERNEL_WP | _PAGE_ENC)
+#ifndef __ASSEMBLY__
-#define __PAGE_KERNEL_NOENC (__PAGE_KERNEL)
-#define __PAGE_KERNEL_NOENC_WP (__PAGE_KERNEL_WP)
+#define __PAGE_KERNEL_ENC (__PAGE_KERNEL | _ENC)
+#define __PAGE_KERNEL_ENC_WP (__PAGE_KERNEL_WP | _ENC)
+#define __PAGE_KERNEL_NOENC (__PAGE_KERNEL | 0)
+#define __PAGE_KERNEL_NOENC_WP (__PAGE_KERNEL_WP | 0)
-#define default_pgprot(x) __pgprot((x) & __default_kernel_pte_mask)
+#define __pgprot_mask(x) __pgprot((x) & __default_kernel_pte_mask)
-#define PAGE_KERNEL default_pgprot(__PAGE_KERNEL | _PAGE_ENC)
-#define PAGE_KERNEL_NOENC default_pgprot(__PAGE_KERNEL)
-#define PAGE_KERNEL_RO default_pgprot(__PAGE_KERNEL_RO | _PAGE_ENC)
-#define PAGE_KERNEL_EXEC default_pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC)
-#define PAGE_KERNEL_EXEC_NOENC default_pgprot(__PAGE_KERNEL_EXEC)
-#define PAGE_KERNEL_RX default_pgprot(__PAGE_KERNEL_RX | _PAGE_ENC)
-#define PAGE_KERNEL_NOCACHE default_pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC)
-#define PAGE_KERNEL_LARGE default_pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC)
-#define PAGE_KERNEL_LARGE_EXEC default_pgprot(__PAGE_KERNEL_LARGE_EXEC | _PAGE_ENC)
-#define PAGE_KERNEL_VVAR default_pgprot(__PAGE_KERNEL_VVAR | _PAGE_ENC)
+#define PAGE_KERNEL __pgprot_mask(__PAGE_KERNEL | _ENC)
+#define PAGE_KERNEL_NOENC __pgprot_mask(__PAGE_KERNEL | 0)
+#define PAGE_KERNEL_RO __pgprot_mask(__PAGE_KERNEL_RO | _ENC)
+#define PAGE_KERNEL_EXEC __pgprot_mask(__PAGE_KERNEL_EXEC | _ENC)
+#define PAGE_KERNEL_EXEC_NOENC __pgprot_mask(__PAGE_KERNEL_EXEC | 0)
+#define PAGE_KERNEL_RX __pgprot_mask(__PAGE_KERNEL_RX | _ENC)
+#define PAGE_KERNEL_NOCACHE __pgprot_mask(__PAGE_KERNEL_NOCACHE | _ENC)
+#define PAGE_KERNEL_LARGE __pgprot_mask(__PAGE_KERNEL_LARGE | _ENC)
+#define PAGE_KERNEL_LARGE_EXEC __pgprot_mask(__PAGE_KERNEL_LARGE_EXEC | _ENC)
+#define PAGE_KERNEL_VVAR __pgprot_mask(__PAGE_KERNEL_VVAR | _ENC)
-#define PAGE_KERNEL_IO default_pgprot(__PAGE_KERNEL_IO)
-#define PAGE_KERNEL_IO_NOCACHE default_pgprot(__PAGE_KERNEL_IO_NOCACHE)
+#define PAGE_KERNEL_IO __pgprot_mask(__PAGE_KERNEL_IO)
+#define PAGE_KERNEL_IO_NOCACHE __pgprot_mask(__PAGE_KERNEL_IO_NOCACHE)
#endif /* __ASSEMBLY__ */
@@ -449,9 +457,6 @@ static inline pteval_t pte_flags(pte_t pte)
return native_pte_val(pte) & PTE_FLAGS_MASK;
}
-#define pgprot_val(x) ((x).pgprot)
-#define __pgprot(x) ((pgprot_t) { (x) } )
-
extern uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM];
extern uint8_t __pte2cachemode_tbl[8];
@@ -561,6 +566,10 @@ static inline void update_page_count(int level, unsigned long pages) { }
extern pte_t *lookup_address(unsigned long address, unsigned int *level);
extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
unsigned int *level);
+
+struct mm_struct;
+extern pte_t *lookup_address_in_mm(struct mm_struct *mm, unsigned long address,
+ unsigned int *level);
extern pmd_t *lookup_pmd_address(unsigned long address);
extern phys_addr_t slow_virt_to_phys(void *__address);
extern int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn,
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 0340aad3f2fc..09705ccc393c 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -25,6 +25,7 @@ struct vm86;
#include <asm/special_insns.h>
#include <asm/fpu/types.h>
#include <asm/unwind_hints.h>
+#include <asm/vmxfeatures.h>
#include <linux/personality.h>
#include <linux/cache.h>
@@ -85,6 +86,9 @@ struct cpuinfo_x86 {
/* Number of 4K pages in DTLB/ITLB combined(in pages): */
int x86_tlbsize;
#endif
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+ __u32 vmx_capability[NVMXINTS];
+#endif
__u8 x86_virt_bits;
__u8 x86_phys_bits;
/* CPUID returned core id bits: */
@@ -943,24 +947,6 @@ extern int set_tsc_mode(unsigned int val);
DECLARE_PER_CPU(u64, msr_misc_features_shadow);
-/* Register/unregister a process' MPX related resource */
-#define MPX_ENABLE_MANAGEMENT() mpx_enable_management()
-#define MPX_DISABLE_MANAGEMENT() mpx_disable_management()
-
-#ifdef CONFIG_X86_INTEL_MPX
-extern int mpx_enable_management(void);
-extern int mpx_disable_management(void);
-#else
-static inline int mpx_enable_management(void)
-{
- return -EINVAL;
-}
-static inline int mpx_disable_management(void)
-{
- return -EINVAL;
-}
-#endif /* CONFIG_X86_INTEL_MPX */
-
#ifdef CONFIG_CPU_SUP_AMD
extern u16 amd_get_nb_id(int cpu);
extern u32 amd_get_nodes_per_socket(void);
@@ -1015,11 +1001,4 @@ enum mds_mitigations {
MDS_MITIGATION_VMWERV,
};
-enum taa_mitigations {
- TAA_MITIGATION_OFF,
- TAA_MITIGATION_UCODE_NEEDED,
- TAA_MITIGATION_VERW,
- TAA_MITIGATION_TSX_DISABLED,
-};
-
#endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 5057a8ed100b..6d6475fdd327 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -159,6 +159,19 @@ static inline bool user_64bit_mode(struct pt_regs *regs)
#endif
}
+/*
+ * Determine whether the register set came from any context that is running in
+ * 64-bit mode.
+ */
+static inline bool any_64bit_mode(struct pt_regs *regs)
+{
+#ifdef CONFIG_X86_64
+ return !user_mode(regs) || user_64bit_mode(regs);
+#else
+ return false;
+#endif
+}
+
#ifdef CONFIG_X86_64
#define current_user_stack_pointer() current_pt_regs()->sp
#define compat_user_stack_pointer() current_pt_regs()->sp
@@ -339,22 +352,6 @@ static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs,
#define ARCH_HAS_USER_SINGLE_STEP_REPORT
-/*
- * When hitting ptrace_stop(), we cannot return using SYSRET because
- * that does not restore the full CPU state, only a minimal set. The
- * ptracer can change arbitrary register values, which is usually okay
- * because the usual ptrace stops run off the signal delivery path which
- * forces IRET; however, ptrace_event() stops happen in arbitrary places
- * in the kernel and don't force IRET path.
- *
- * So force IRET path after a ptrace stop.
- */
-#define arch_ptrace_stop_needed(code, info) \
-({ \
- force_iret(); \
- false; \
-})
-
struct user_desc;
extern int do_get_thread_area(struct task_struct *p, int idx,
struct user_desc __user *info);
diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index 09ecc32f6524..b35030eeec36 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -14,7 +14,7 @@
#include <linux/types.h>
#include <asm/io.h>
-/* This must match data at realmode.S */
+/* This must match data at realmode/rm/header.S */
struct real_mode_header {
u32 text_start;
u32 ro_end;
@@ -36,7 +36,7 @@ struct real_mode_header {
#endif
};
-/* This must match data at trampoline_32/64.S */
+/* This must match data at realmode/rm/trampoline_{32,64}.S */
struct trampoline_header {
#ifdef CONFIG_X86_32
u32 start;
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index 2ee8e469dcf5..64c3dce374e5 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -81,8 +81,6 @@ int set_direct_map_invalid_noflush(struct page *page);
int set_direct_map_default_noflush(struct page *page);
extern int kernel_set_to_readonly;
-void set_kernel_text_rw(void);
-void set_kernel_text_ro(void);
#ifdef CONFIG_X86_64
static inline int set_mce_nospec(unsigned long pfn)
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index 23c626a742e8..67315fa3956a 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -25,14 +25,6 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
*/
#define POKE_MAX_OPCODE_SIZE 5
-struct text_poke_loc {
- void *addr;
- int len;
- s32 rel32;
- u8 opcode;
- const u8 text[POKE_MAX_OPCODE_SIZE];
-};
-
extern void text_poke_early(void *addr, const void *opcode, size_t len);
/*
@@ -50,21 +42,13 @@ extern void text_poke_early(void *addr, const void *opcode, size_t len);
* an inconsistent instruction while you patch.
*/
extern void *text_poke(void *addr, const void *opcode, size_t len);
+extern void text_poke_sync(void);
extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len);
extern int poke_int3_handler(struct pt_regs *regs);
extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate);
-extern void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries);
-extern void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
- const void *opcode, size_t len, const void *emulate);
-extern int after_bootmem;
-extern __ro_after_init struct mm_struct *poking_mm;
-extern __ro_after_init unsigned long poking_addr;
-#ifndef CONFIG_UML_X86
-static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip)
-{
- regs->ip = ip;
-}
+extern void text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate);
+extern void text_poke_finish(void);
#define INT3_INSN_SIZE 1
#define INT3_INSN_OPCODE 0xCC
@@ -78,6 +62,67 @@ static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip)
#define JMP8_INSN_SIZE 2
#define JMP8_INSN_OPCODE 0xEB
+#define DISP32_SIZE 4
+
+static inline int text_opcode_size(u8 opcode)
+{
+ int size = 0;
+
+#define __CASE(insn) \
+ case insn##_INSN_OPCODE: size = insn##_INSN_SIZE; break
+
+ switch(opcode) {
+ __CASE(INT3);
+ __CASE(CALL);
+ __CASE(JMP32);
+ __CASE(JMP8);
+ }
+
+#undef __CASE
+
+ return size;
+}
+
+union text_poke_insn {
+ u8 text[POKE_MAX_OPCODE_SIZE];
+ struct {
+ u8 opcode;
+ s32 disp;
+ } __attribute__((packed));
+};
+
+static __always_inline
+void *text_gen_insn(u8 opcode, const void *addr, const void *dest)
+{
+ static union text_poke_insn insn; /* per instance */
+ int size = text_opcode_size(opcode);
+
+ insn.opcode = opcode;
+
+ if (size > 1) {
+ insn.disp = (long)dest - (long)(addr + size);
+ if (size == 2) {
+ /*
+ * Ensure that for JMP9 the displacement
+ * actually fits the signed byte.
+ */
+ BUG_ON((insn.disp >> 31) != (insn.disp >> 7));
+ }
+ }
+
+ return &insn.text;
+}
+
+extern int after_bootmem;
+extern __ro_after_init struct mm_struct *poking_mm;
+extern __ro_after_init unsigned long poking_addr;
+
+#ifndef CONFIG_UML_X86
+static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip)
+{
+ regs->ip = ip;
+}
+
static inline void int3_emulate_push(struct pt_regs *regs, unsigned long val)
{
/*
@@ -85,6 +130,9 @@ static inline void int3_emulate_push(struct pt_regs *regs, unsigned long val)
* stack where the break point happened, and the saving of
* pt_regs. We can extend the original stack because of
* this gap. See the idtentry macro's create_gap option.
+ *
+ * Similarly entry_32.S will have a gap on the stack for (any) hardware
+ * exception and pt_regs; see FIXUP_FRAME.
*/
regs->sp -= sizeof(unsigned long);
*(unsigned long *)regs->sp = val;
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index d779366ce3f8..cf4327986e98 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -239,15 +239,6 @@ static inline int arch_within_stack_frames(const void * const stack,
current_thread_info()->status & TS_COMPAT)
#endif
-/*
- * Force syscall return via IRET by making it look as if there was
- * some work pending. IRET is our most capable (but slowest) syscall
- * return path, which is able to restore modified SS, CS and certain
- * EFLAGS values that other (fast) syscall return instructions
- * are not able to restore properly.
- */
-#define force_iret() set_thread_flag(TIF_NOTIFY_RESUME)
-
extern void arch_task_cache_init(void);
extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
extern void arch_release_task_struct(struct task_struct *tsk);
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index f23e7aaff4cd..820082bd6880 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -29,8 +29,8 @@ static inline void tlb_flush(struct mmu_gather *tlb)
* shootdown, enablement code for several hypervisors overrides
* .flush_tlb_others hook in pv_mmu_ops and implements it by issuing
* a hypercall. To keep software pagetable walkers safe in this case we
- * switch to RCU based table free (HAVE_RCU_TABLE_FREE). See the comment
- * below 'ifdef CONFIG_HAVE_RCU_TABLE_FREE' in include/asm-generic/tlb.h
+ * switch to RCU based table free (MMU_GATHER_RCU_TABLE_FREE). See the comment
+ * below 'ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE' in include/asm-generic/tlb.h
* for more details.
*/
static inline void __tlb_remove_table(void *table)
diff --git a/arch/x86/include/asm/trace/mpx.h b/arch/x86/include/asm/trace/mpx.h
deleted file mode 100644
index 54133017267c..000000000000
--- a/arch/x86/include/asm/trace/mpx.h
+++ /dev/null
@@ -1,134 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM mpx
-
-#if !defined(_TRACE_MPX_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_MPX_H
-
-#include <linux/tracepoint.h>
-
-#ifdef CONFIG_X86_INTEL_MPX
-
-TRACE_EVENT(mpx_bounds_register_exception,
-
- TP_PROTO(void __user *addr_referenced,
- const struct mpx_bndreg *bndreg),
- TP_ARGS(addr_referenced, bndreg),
-
- TP_STRUCT__entry(
- __field(void __user *, addr_referenced)
- __field(u64, lower_bound)
- __field(u64, upper_bound)
- ),
-
- TP_fast_assign(
- __entry->addr_referenced = addr_referenced;
- __entry->lower_bound = bndreg->lower_bound;
- __entry->upper_bound = bndreg->upper_bound;
- ),
- /*
- * Note that we are printing out the '~' of the upper
- * bounds register here. It is actually stored in its
- * one's complement form so that its 'init' state
- * corresponds to all 0's. But, that looks like
- * gibberish when printed out, so print out the 1's
- * complement instead of the actual value here. Note
- * though that you still need to specify filters for the
- * actual value, not the displayed one.
- */
- TP_printk("address referenced: 0x%p bounds: lower: 0x%llx ~upper: 0x%llx",
- __entry->addr_referenced,
- __entry->lower_bound,
- ~__entry->upper_bound
- )
-);
-
-TRACE_EVENT(bounds_exception_mpx,
-
- TP_PROTO(const struct mpx_bndcsr *bndcsr),
- TP_ARGS(bndcsr),
-
- TP_STRUCT__entry(
- __field(u64, bndcfgu)
- __field(u64, bndstatus)
- ),
-
- TP_fast_assign(
- /* need to get rid of the 'const' on bndcsr */
- __entry->bndcfgu = (u64)bndcsr->bndcfgu;
- __entry->bndstatus = (u64)bndcsr->bndstatus;
- ),
-
- TP_printk("bndcfgu:0x%llx bndstatus:0x%llx",
- __entry->bndcfgu,
- __entry->bndstatus)
-);
-
-DECLARE_EVENT_CLASS(mpx_range_trace,
-
- TP_PROTO(unsigned long start,
- unsigned long end),
- TP_ARGS(start, end),
-
- TP_STRUCT__entry(
- __field(unsigned long, start)
- __field(unsigned long, end)
- ),
-
- TP_fast_assign(
- __entry->start = start;
- __entry->end = end;
- ),
-
- TP_printk("[0x%p:0x%p]",
- (void *)__entry->start,
- (void *)__entry->end
- )
-);
-
-DEFINE_EVENT(mpx_range_trace, mpx_unmap_zap,
- TP_PROTO(unsigned long start, unsigned long end),
- TP_ARGS(start, end)
-);
-
-DEFINE_EVENT(mpx_range_trace, mpx_unmap_search,
- TP_PROTO(unsigned long start, unsigned long end),
- TP_ARGS(start, end)
-);
-
-TRACE_EVENT(mpx_new_bounds_table,
-
- TP_PROTO(unsigned long table_vaddr),
- TP_ARGS(table_vaddr),
-
- TP_STRUCT__entry(
- __field(unsigned long, table_vaddr)
- ),
-
- TP_fast_assign(
- __entry->table_vaddr = table_vaddr;
- ),
-
- TP_printk("table vaddr:%p", (void *)__entry->table_vaddr)
-);
-
-#else
-
-/*
- * This gets used outside of MPX-specific code, so we need a stub.
- */
-static inline
-void trace_bounds_exception_mpx(const struct mpx_bndcsr *bndcsr)
-{
-}
-
-#endif /* CONFIG_X86_INTEL_MPX */
-
-#undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH asm/trace/
-#undef TRACE_INCLUDE_FILE
-#define TRACE_INCLUDE_FILE mpx
-#endif /* _TRACE_MPX_H */
-
-/* This part must be outside protection */
-#include <trace/define_trace.h>
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index 230474e2ddb5..bbcdc7b8f963 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -21,6 +21,7 @@ struct vdso_image {
long sym_vvar_page;
long sym_pvclock_page;
long sym_hvclock_page;
+ long sym_timens_page;
long sym_VDSO32_NOTE_MASK;
long sym___kernel_sigreturn;
long sym___kernel_rt_sigreturn;
diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h
index e9ee139cf29e..6ee1f7dba34b 100644
--- a/arch/x86/include/asm/vdso/gettimeofday.h
+++ b/arch/x86/include/asm/vdso/gettimeofday.h
@@ -21,6 +21,7 @@
#include <clocksource/hyperv_timer.h>
#define __vdso_data (VVAR(_vdso_data))
+#define __timens_vdso_data (TIMENS(_vdso_data))
#define VDSO_HAS_TIME 1
@@ -56,6 +57,13 @@ extern struct ms_hyperv_tsc_page hvclock_page
__attribute__((visibility("hidden")));
#endif
+#ifdef CONFIG_TIME_NS
+static __always_inline const struct vdso_data *__arch_get_timens_vdso_data(void)
+{
+ return __timens_vdso_data;
+}
+#endif
+
#ifndef BUILD_VDSO32
static __always_inline
@@ -96,8 +104,6 @@ long clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts)
#else
-#define VDSO_HAS_32BIT_FALLBACK 1
-
static __always_inline
long clock_gettime_fallback(clockid_t _clkid, struct __kernel_timespec *_ts)
{
diff --git a/arch/x86/include/asm/vmalloc.h b/arch/x86/include/asm/vmalloc.h
new file mode 100644
index 000000000000..29837740b520
--- /dev/null
+++ b/arch/x86/include/asm/vmalloc.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_X86_VMALLOC_H
+#define _ASM_X86_VMALLOC_H
+
+#include <asm/pgtable_areas.h>
+
+#endif /* _ASM_X86_VMALLOC_H */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 1835767aa335..8521af3fef27 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -15,67 +15,70 @@
#include <linux/bitops.h>
#include <linux/types.h>
#include <uapi/asm/vmx.h>
+#include <asm/vmxfeatures.h>
+
+#define VMCS_CONTROL_BIT(x) BIT(VMX_FEATURE_##x & 0x1f)
/*
* Definitions of Primary Processor-Based VM-Execution Controls.
*/
-#define CPU_BASED_VIRTUAL_INTR_PENDING 0x00000004
-#define CPU_BASED_USE_TSC_OFFSETING 0x00000008
-#define CPU_BASED_HLT_EXITING 0x00000080
-#define CPU_BASED_INVLPG_EXITING 0x00000200
-#define CPU_BASED_MWAIT_EXITING 0x00000400
-#define CPU_BASED_RDPMC_EXITING 0x00000800
-#define CPU_BASED_RDTSC_EXITING 0x00001000
-#define CPU_BASED_CR3_LOAD_EXITING 0x00008000
-#define CPU_BASED_CR3_STORE_EXITING 0x00010000
-#define CPU_BASED_CR8_LOAD_EXITING 0x00080000
-#define CPU_BASED_CR8_STORE_EXITING 0x00100000
-#define CPU_BASED_TPR_SHADOW 0x00200000
-#define CPU_BASED_VIRTUAL_NMI_PENDING 0x00400000
-#define CPU_BASED_MOV_DR_EXITING 0x00800000
-#define CPU_BASED_UNCOND_IO_EXITING 0x01000000
-#define CPU_BASED_USE_IO_BITMAPS 0x02000000
-#define CPU_BASED_MONITOR_TRAP_FLAG 0x08000000
-#define CPU_BASED_USE_MSR_BITMAPS 0x10000000
-#define CPU_BASED_MONITOR_EXITING 0x20000000
-#define CPU_BASED_PAUSE_EXITING 0x40000000
-#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000
+#define CPU_BASED_INTR_WINDOW_EXITING VMCS_CONTROL_BIT(INTR_WINDOW_EXITING)
+#define CPU_BASED_USE_TSC_OFFSETTING VMCS_CONTROL_BIT(USE_TSC_OFFSETTING)
+#define CPU_BASED_HLT_EXITING VMCS_CONTROL_BIT(HLT_EXITING)
+#define CPU_BASED_INVLPG_EXITING VMCS_CONTROL_BIT(INVLPG_EXITING)
+#define CPU_BASED_MWAIT_EXITING VMCS_CONTROL_BIT(MWAIT_EXITING)
+#define CPU_BASED_RDPMC_EXITING VMCS_CONTROL_BIT(RDPMC_EXITING)
+#define CPU_BASED_RDTSC_EXITING VMCS_CONTROL_BIT(RDTSC_EXITING)
+#define CPU_BASED_CR3_LOAD_EXITING VMCS_CONTROL_BIT(CR3_LOAD_EXITING)
+#define CPU_BASED_CR3_STORE_EXITING VMCS_CONTROL_BIT(CR3_STORE_EXITING)
+#define CPU_BASED_CR8_LOAD_EXITING VMCS_CONTROL_BIT(CR8_LOAD_EXITING)
+#define CPU_BASED_CR8_STORE_EXITING VMCS_CONTROL_BIT(CR8_STORE_EXITING)
+#define CPU_BASED_TPR_SHADOW VMCS_CONTROL_BIT(VIRTUAL_TPR)
+#define CPU_BASED_NMI_WINDOW_EXITING VMCS_CONTROL_BIT(NMI_WINDOW_EXITING)
+#define CPU_BASED_MOV_DR_EXITING VMCS_CONTROL_BIT(MOV_DR_EXITING)
+#define CPU_BASED_UNCOND_IO_EXITING VMCS_CONTROL_BIT(UNCOND_IO_EXITING)
+#define CPU_BASED_USE_IO_BITMAPS VMCS_CONTROL_BIT(USE_IO_BITMAPS)
+#define CPU_BASED_MONITOR_TRAP_FLAG VMCS_CONTROL_BIT(MONITOR_TRAP_FLAG)
+#define CPU_BASED_USE_MSR_BITMAPS VMCS_CONTROL_BIT(USE_MSR_BITMAPS)
+#define CPU_BASED_MONITOR_EXITING VMCS_CONTROL_BIT(MONITOR_EXITING)
+#define CPU_BASED_PAUSE_EXITING VMCS_CONTROL_BIT(PAUSE_EXITING)
+#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS VMCS_CONTROL_BIT(SEC_CONTROLS)
#define CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x0401e172
/*
* Definitions of Secondary Processor-Based VM-Execution Controls.
*/
-#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
-#define SECONDARY_EXEC_ENABLE_EPT 0x00000002
-#define SECONDARY_EXEC_DESC 0x00000004
-#define SECONDARY_EXEC_RDTSCP 0x00000008
-#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE 0x00000010
-#define SECONDARY_EXEC_ENABLE_VPID 0x00000020
-#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
-#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080
-#define SECONDARY_EXEC_APIC_REGISTER_VIRT 0x00000100
-#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200
-#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
-#define SECONDARY_EXEC_RDRAND_EXITING 0x00000800
-#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000
-#define SECONDARY_EXEC_ENABLE_VMFUNC 0x00002000
-#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000
-#define SECONDARY_EXEC_ENCLS_EXITING 0x00008000
-#define SECONDARY_EXEC_RDSEED_EXITING 0x00010000
-#define SECONDARY_EXEC_ENABLE_PML 0x00020000
-#define SECONDARY_EXEC_PT_CONCEAL_VMX 0x00080000
-#define SECONDARY_EXEC_XSAVES 0x00100000
-#define SECONDARY_EXEC_PT_USE_GPA 0x01000000
-#define SECONDARY_EXEC_MODE_BASED_EPT_EXEC 0x00400000
-#define SECONDARY_EXEC_TSC_SCALING 0x02000000
-#define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE 0x04000000
-
-#define PIN_BASED_EXT_INTR_MASK 0x00000001
-#define PIN_BASED_NMI_EXITING 0x00000008
-#define PIN_BASED_VIRTUAL_NMIS 0x00000020
-#define PIN_BASED_VMX_PREEMPTION_TIMER 0x00000040
-#define PIN_BASED_POSTED_INTR 0x00000080
+#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES VMCS_CONTROL_BIT(VIRT_APIC_ACCESSES)
+#define SECONDARY_EXEC_ENABLE_EPT VMCS_CONTROL_BIT(EPT)
+#define SECONDARY_EXEC_DESC VMCS_CONTROL_BIT(DESC_EXITING)
+#define SECONDARY_EXEC_RDTSCP VMCS_CONTROL_BIT(RDTSCP)
+#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE VMCS_CONTROL_BIT(VIRTUAL_X2APIC)
+#define SECONDARY_EXEC_ENABLE_VPID VMCS_CONTROL_BIT(VPID)
+#define SECONDARY_EXEC_WBINVD_EXITING VMCS_CONTROL_BIT(WBINVD_EXITING)
+#define SECONDARY_EXEC_UNRESTRICTED_GUEST VMCS_CONTROL_BIT(UNRESTRICTED_GUEST)
+#define SECONDARY_EXEC_APIC_REGISTER_VIRT VMCS_CONTROL_BIT(APIC_REGISTER_VIRT)
+#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY VMCS_CONTROL_BIT(VIRT_INTR_DELIVERY)
+#define SECONDARY_EXEC_PAUSE_LOOP_EXITING VMCS_CONTROL_BIT(PAUSE_LOOP_EXITING)
+#define SECONDARY_EXEC_RDRAND_EXITING VMCS_CONTROL_BIT(RDRAND_EXITING)
+#define SECONDARY_EXEC_ENABLE_INVPCID VMCS_CONTROL_BIT(INVPCID)
+#define SECONDARY_EXEC_ENABLE_VMFUNC VMCS_CONTROL_BIT(VMFUNC)
+#define SECONDARY_EXEC_SHADOW_VMCS VMCS_CONTROL_BIT(SHADOW_VMCS)
+#define SECONDARY_EXEC_ENCLS_EXITING VMCS_CONTROL_BIT(ENCLS_EXITING)
+#define SECONDARY_EXEC_RDSEED_EXITING VMCS_CONTROL_BIT(RDSEED_EXITING)
+#define SECONDARY_EXEC_ENABLE_PML VMCS_CONTROL_BIT(PAGE_MOD_LOGGING)
+#define SECONDARY_EXEC_PT_CONCEAL_VMX VMCS_CONTROL_BIT(PT_CONCEAL_VMX)
+#define SECONDARY_EXEC_XSAVES VMCS_CONTROL_BIT(XSAVES)
+#define SECONDARY_EXEC_MODE_BASED_EPT_EXEC VMCS_CONTROL_BIT(MODE_BASED_EPT_EXEC)
+#define SECONDARY_EXEC_PT_USE_GPA VMCS_CONTROL_BIT(PT_USE_GPA)
+#define SECONDARY_EXEC_TSC_SCALING VMCS_CONTROL_BIT(TSC_SCALING)
+#define SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE VMCS_CONTROL_BIT(USR_WAIT_PAUSE)
+
+#define PIN_BASED_EXT_INTR_MASK VMCS_CONTROL_BIT(INTR_EXITING)
+#define PIN_BASED_NMI_EXITING VMCS_CONTROL_BIT(NMI_EXITING)
+#define PIN_BASED_VIRTUAL_NMIS VMCS_CONTROL_BIT(VIRTUAL_NMIS)
+#define PIN_BASED_VMX_PREEMPTION_TIMER VMCS_CONTROL_BIT(PREEMPTION_TIMER)
+#define PIN_BASED_POSTED_INTR VMCS_CONTROL_BIT(POSTED_INTR)
#define PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x00000016
@@ -114,7 +117,9 @@
#define VMX_MISC_MSR_LIST_MULTIPLIER 512
/* VMFUNC functions */
-#define VMX_VMFUNC_EPTP_SWITCHING 0x00000001
+#define VMFUNC_CONTROL_BIT(x) BIT((VMX_FEATURE_##x & 0x1f) - 28)
+
+#define VMX_VMFUNC_EPTP_SWITCHING VMFUNC_CONTROL_BIT(EPTP_SWITCHING)
#define VMFUNC_EPTP_ENTRIES 512
static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic)
diff --git a/arch/x86/include/asm/vmxfeatures.h b/arch/x86/include/asm/vmxfeatures.h
new file mode 100644
index 000000000000..9915990fd8cf
--- /dev/null
+++ b/arch/x86/include/asm/vmxfeatures.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_VMXFEATURES_H
+#define _ASM_X86_VMXFEATURES_H
+
+/*
+ * Defines VMX CPU feature bits
+ */
+#define NVMXINTS 3 /* N 32-bit words worth of info */
+
+/*
+ * Note: If the comment begins with a quoted string, that string is used
+ * in /proc/cpuinfo instead of the macro name. If the string is "",
+ * this feature bit is not displayed in /proc/cpuinfo at all.
+ */
+
+/* Pin-Based VM-Execution Controls, EPT/VPID, APIC and VM-Functions, word 0 */
+#define VMX_FEATURE_INTR_EXITING ( 0*32+ 0) /* "" VM-Exit on vectored interrupts */
+#define VMX_FEATURE_NMI_EXITING ( 0*32+ 3) /* "" VM-Exit on NMIs */
+#define VMX_FEATURE_VIRTUAL_NMIS ( 0*32+ 5) /* "vnmi" NMI virtualization */
+#define VMX_FEATURE_PREEMPTION_TIMER ( 0*32+ 6) /* VMX Preemption Timer */
+#define VMX_FEATURE_POSTED_INTR ( 0*32+ 7) /* Posted Interrupts */
+
+/* EPT/VPID features, scattered to bits 16-23 */
+#define VMX_FEATURE_INVVPID ( 0*32+ 16) /* INVVPID is supported */
+#define VMX_FEATURE_EPT_EXECUTE_ONLY ( 0*32+ 17) /* "ept_x_only" EPT entries can be execute only */
+#define VMX_FEATURE_EPT_AD ( 0*32+ 18) /* EPT Accessed/Dirty bits */
+#define VMX_FEATURE_EPT_1GB ( 0*32+ 19) /* 1GB EPT pages */
+
+/* Aggregated APIC features 24-27 */
+#define VMX_FEATURE_FLEXPRIORITY ( 0*32+ 24) /* TPR shadow + virt APIC */
+#define VMX_FEATURE_APICV ( 0*32+ 25) /* TPR shadow + APIC reg virt + virt intr delivery + posted interrupts */
+
+/* VM-Functions, shifted to bits 28-31 */
+#define VMX_FEATURE_EPTP_SWITCHING ( 0*32+ 28) /* EPTP switching (in guest) */
+
+/* Primary Processor-Based VM-Execution Controls, word 1 */
+#define VMX_FEATURE_INTR_WINDOW_EXITING ( 1*32+ 2) /* "" VM-Exit if INTRs are unblocked in guest */
+#define VMX_FEATURE_USE_TSC_OFFSETTING ( 1*32+ 3) /* "tsc_offset" Offset hardware TSC when read in guest */
+#define VMX_FEATURE_HLT_EXITING ( 1*32+ 7) /* "" VM-Exit on HLT */
+#define VMX_FEATURE_INVLPG_EXITING ( 1*32+ 9) /* "" VM-Exit on INVLPG */
+#define VMX_FEATURE_MWAIT_EXITING ( 1*32+ 10) /* "" VM-Exit on MWAIT */
+#define VMX_FEATURE_RDPMC_EXITING ( 1*32+ 11) /* "" VM-Exit on RDPMC */
+#define VMX_FEATURE_RDTSC_EXITING ( 1*32+ 12) /* "" VM-Exit on RDTSC */
+#define VMX_FEATURE_CR3_LOAD_EXITING ( 1*32+ 15) /* "" VM-Exit on writes to CR3 */
+#define VMX_FEATURE_CR3_STORE_EXITING ( 1*32+ 16) /* "" VM-Exit on reads from CR3 */
+#define VMX_FEATURE_CR8_LOAD_EXITING ( 1*32+ 19) /* "" VM-Exit on writes to CR8 */
+#define VMX_FEATURE_CR8_STORE_EXITING ( 1*32+ 20) /* "" VM-Exit on reads from CR8 */
+#define VMX_FEATURE_VIRTUAL_TPR ( 1*32+ 21) /* "vtpr" TPR virtualization, a.k.a. TPR shadow */
+#define VMX_FEATURE_NMI_WINDOW_EXITING ( 1*32+ 22) /* "" VM-Exit if NMIs are unblocked in guest */
+#define VMX_FEATURE_MOV_DR_EXITING ( 1*32+ 23) /* "" VM-Exit on accesses to debug registers */
+#define VMX_FEATURE_UNCOND_IO_EXITING ( 1*32+ 24) /* "" VM-Exit on *all* IN{S} and OUT{S}*/
+#define VMX_FEATURE_USE_IO_BITMAPS ( 1*32+ 25) /* "" VM-Exit based on I/O port */
+#define VMX_FEATURE_MONITOR_TRAP_FLAG ( 1*32+ 27) /* "mtf" VMX single-step VM-Exits */
+#define VMX_FEATURE_USE_MSR_BITMAPS ( 1*32+ 28) /* "" VM-Exit based on MSR index */
+#define VMX_FEATURE_MONITOR_EXITING ( 1*32+ 29) /* "" VM-Exit on MONITOR (MWAIT's accomplice) */
+#define VMX_FEATURE_PAUSE_EXITING ( 1*32+ 30) /* "" VM-Exit on PAUSE (unconditionally) */
+#define VMX_FEATURE_SEC_CONTROLS ( 1*32+ 31) /* "" Enable Secondary VM-Execution Controls */
+
+/* Secondary Processor-Based VM-Execution Controls, word 2 */
+#define VMX_FEATURE_VIRT_APIC_ACCESSES ( 2*32+ 0) /* "vapic" Virtualize memory mapped APIC accesses */
+#define VMX_FEATURE_EPT ( 2*32+ 1) /* Extended Page Tables, a.k.a. Two-Dimensional Paging */
+#define VMX_FEATURE_DESC_EXITING ( 2*32+ 2) /* "" VM-Exit on {S,L}*DT instructions */
+#define VMX_FEATURE_RDTSCP ( 2*32+ 3) /* "" Enable RDTSCP in guest */
+#define VMX_FEATURE_VIRTUAL_X2APIC ( 2*32+ 4) /* "" Virtualize X2APIC for the guest */
+#define VMX_FEATURE_VPID ( 2*32+ 5) /* Virtual Processor ID (TLB ASID modifier) */
+#define VMX_FEATURE_WBINVD_EXITING ( 2*32+ 6) /* "" VM-Exit on WBINVD */
+#define VMX_FEATURE_UNRESTRICTED_GUEST ( 2*32+ 7) /* Allow Big Real Mode and other "invalid" states */
+#define VMX_FEATURE_APIC_REGISTER_VIRT ( 2*32+ 8) /* "vapic_reg" Hardware emulation of reads to the virtual-APIC */
+#define VMX_FEATURE_VIRT_INTR_DELIVERY ( 2*32+ 9) /* "vid" Evaluation and delivery of pending virtual interrupts */
+#define VMX_FEATURE_PAUSE_LOOP_EXITING ( 2*32+ 10) /* "ple" Conditionally VM-Exit on PAUSE at CPL0 */
+#define VMX_FEATURE_RDRAND_EXITING ( 2*32+ 11) /* "" VM-Exit on RDRAND*/
+#define VMX_FEATURE_INVPCID ( 2*32+ 12) /* "" Enable INVPCID in guest */
+#define VMX_FEATURE_VMFUNC ( 2*32+ 13) /* "" Enable VM-Functions (leaf dependent) */
+#define VMX_FEATURE_SHADOW_VMCS ( 2*32+ 14) /* VMREAD/VMWRITE in guest can access shadow VMCS */
+#define VMX_FEATURE_ENCLS_EXITING ( 2*32+ 15) /* "" VM-Exit on ENCLS (leaf dependent) */
+#define VMX_FEATURE_RDSEED_EXITING ( 2*32+ 16) /* "" VM-Exit on RDSEED */
+#define VMX_FEATURE_PAGE_MOD_LOGGING ( 2*32+ 17) /* "pml" Log dirty pages into buffer */
+#define VMX_FEATURE_EPT_VIOLATION_VE ( 2*32+ 18) /* "" Conditionally reflect EPT violations as #VE exceptions */
+#define VMX_FEATURE_PT_CONCEAL_VMX ( 2*32+ 19) /* "" Suppress VMX indicators in Processor Trace */
+#define VMX_FEATURE_XSAVES ( 2*32+ 20) /* "" Enable XSAVES and XRSTORS in guest */
+#define VMX_FEATURE_MODE_BASED_EPT_EXEC ( 2*32+ 22) /* "ept_mode_based_exec" Enable separate EPT EXEC bits for supervisor vs. user */
+#define VMX_FEATURE_PT_USE_GPA ( 2*32+ 24) /* "" Processor Trace logs GPAs */
+#define VMX_FEATURE_TSC_SCALING ( 2*32+ 25) /* Scale hardware TSC when read in guest */
+#define VMX_FEATURE_USR_WAIT_PAUSE ( 2*32+ 26) /* Enable TPAUSE, UMONITOR, UMWAIT in guest */
+#define VMX_FEATURE_ENCLV_EXITING ( 2*32+ 28) /* "" VM-Exit on ENCLV (leaf dependent) */
+
+#endif /* _ASM_X86_VMXFEATURES_H */
diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
index 32f5d9a0b90e..183e98e49ab9 100644
--- a/arch/x86/include/asm/vvar.h
+++ b/arch/x86/include/asm/vvar.h
@@ -19,10 +19,10 @@
#ifndef _ASM_X86_VVAR_H
#define _ASM_X86_VVAR_H
-#if defined(__VVAR_KERNEL_LDS)
-
-/* The kernel linker script defines its own magic to put vvars in the
- * right place.
+#ifdef EMIT_VVAR
+/*
+ * EMIT_VVAR() is used by the kernel linker script to put vvars in the
+ * right place. Also, it's used by kernel code to import offsets values.
*/
#define DECLARE_VVAR(offset, type, name) \
EMIT_VVAR(name, offset)
@@ -33,9 +33,12 @@ extern char __vvar_page;
#define DECLARE_VVAR(offset, type, name) \
extern type vvar_ ## name[CS_BASES] \
- __attribute__((visibility("hidden")));
+ __attribute__((visibility("hidden"))); \
+ extern type timens_ ## name[CS_BASES] \
+ __attribute__((visibility("hidden"))); \
#define VVAR(name) (vvar_ ## name)
+#define TIMENS(name) (timens_ ## name)
#define DEFINE_VVAR(type, name) \
type name[CS_BASES] \
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 19435858df5f..96d9cd208610 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -51,12 +51,14 @@ struct x86_init_resources {
* are set up.
* @intr_init: interrupt init code
* @trap_init: platform specific trap setup
+ * @intr_mode_select: interrupt delivery mode selection
* @intr_mode_init: interrupt delivery mode setup
*/
struct x86_init_irqs {
void (*pre_vector_init)(void);
void (*intr_init)(void);
void (*trap_init)(void);
+ void (*intr_mode_select)(void);
void (*intr_mode_init)(void);
};
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 503d3f42da16..3f3f780c8c65 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -390,6 +390,7 @@ struct kvm_sync_regs {
#define KVM_STATE_NESTED_GUEST_MODE 0x00000001
#define KVM_STATE_NESTED_RUN_PENDING 0x00000002
#define KVM_STATE_NESTED_EVMCS 0x00000004
+#define KVM_STATE_NESTED_MTF_PENDING 0x00000008
#define KVM_STATE_NESTED_SMM_GUEST_MODE 0x00000001
#define KVM_STATE_NESTED_SMM_VMXON 0x00000002
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 3eb8411ab60e..e95b72ec19bc 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -33,7 +33,7 @@
#define EXIT_REASON_TRIPLE_FAULT 2
#define EXIT_REASON_INIT_SIGNAL 3
-#define EXIT_REASON_PENDING_INTERRUPT 7
+#define EXIT_REASON_INTERRUPT_WINDOW 7
#define EXIT_REASON_NMI_WINDOW 8
#define EXIT_REASON_TASK_SWITCH 9
#define EXIT_REASON_CPUID 10
@@ -94,7 +94,7 @@
{ EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \
{ EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \
{ EXIT_REASON_INIT_SIGNAL, "INIT_SIGNAL" }, \
- { EXIT_REASON_PENDING_INTERRUPT, "PENDING_INTERRUPT" }, \
+ { EXIT_REASON_INTERRUPT_WINDOW, "INTERRUPT_WINDOW" }, \
{ EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \
{ EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \
{ EXIT_REASON_CPUID, "CPUID" }, \
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 6175e370ee4a..9b294c13809a 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -94,6 +94,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += ftrace_$(BITS).o
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
obj-$(CONFIG_X86_TSC) += trace_clock.o
+obj-$(CONFIG_CRASH_CORE) += crash_core_$(BITS).o
obj-$(CONFIG_KEXEC_CORE) += machine_kexec_$(BITS).o
obj-$(CONFIG_KEXEC_CORE) += relocate_kernel_$(BITS).o crash.o
obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index ca13851f0570..26b7256f590f 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -27,6 +27,17 @@ static char temp_stack[4096];
#endif
/**
+ * acpi_get_wakeup_address - provide physical address for S3 wakeup
+ *
+ * Returns the physical address where the kernel should be resumed after the
+ * system awakes from S3, e.g. for programming into the firmware waking vector.
+ */
+unsigned long acpi_get_wakeup_address(void)
+{
+ return ((unsigned long)(real_mode_header->wakeup_start));
+}
+
+/**
* x86_acpi_enter_sleep_state - enter sleep state
* @state: Sleep state to enter.
*
diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h
index fbb60ca4255c..d06c2079b6c1 100644
--- a/arch/x86/kernel/acpi/sleep.h
+++ b/arch/x86/kernel/acpi/sleep.h
@@ -3,7 +3,7 @@
* Variables and functions used by the code in sleep.c
*/
-#include <asm/realmode.h>
+#include <linux/linkage.h>
extern unsigned long saved_video_mode;
extern long saved_magic;
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 9ec463fe96f2..15ac0d5f4b40 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -23,6 +23,7 @@
#include <asm/nmi.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
+#include <asm/insn.h>
#include <asm/io.h>
#include <asm/fixmap.h>
@@ -936,44 +937,81 @@ static void do_sync_core(void *info)
sync_core();
}
-static struct bp_patching_desc {
+void text_poke_sync(void)
+{
+ on_each_cpu(do_sync_core, NULL, 1);
+}
+
+struct text_poke_loc {
+ s32 rel_addr; /* addr := _stext + rel_addr */
+ s32 rel32;
+ u8 opcode;
+ const u8 text[POKE_MAX_OPCODE_SIZE];
+};
+
+struct bp_patching_desc {
struct text_poke_loc *vec;
int nr_entries;
-} bp_patching;
+ atomic_t refs;
+};
+
+static struct bp_patching_desc *bp_desc;
+
+static inline struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp)
+{
+ struct bp_patching_desc *desc = READ_ONCE(*descp); /* rcu_dereference */
+
+ if (!desc || !atomic_inc_not_zero(&desc->refs))
+ return NULL;
+
+ return desc;
+}
+
+static inline void put_desc(struct bp_patching_desc *desc)
+{
+ smp_mb__before_atomic();
+ atomic_dec(&desc->refs);
+}
-static int patch_cmp(const void *key, const void *elt)
+static inline void *text_poke_addr(struct text_poke_loc *tp)
+{
+ return _stext + tp->rel_addr;
+}
+
+static int notrace patch_cmp(const void *key, const void *elt)
{
struct text_poke_loc *tp = (struct text_poke_loc *) elt;
- if (key < tp->addr)
+ if (key < text_poke_addr(tp))
return -1;
- if (key > tp->addr)
+ if (key > text_poke_addr(tp))
return 1;
return 0;
}
NOKPROBE_SYMBOL(patch_cmp);
-int poke_int3_handler(struct pt_regs *regs)
+int notrace poke_int3_handler(struct pt_regs *regs)
{
+ struct bp_patching_desc *desc;
struct text_poke_loc *tp;
+ int len, ret = 0;
void *ip;
+ if (user_mode(regs))
+ return 0;
+
/*
* Having observed our INT3 instruction, we now must observe
- * bp_patching.nr_entries.
+ * bp_desc:
*
- * nr_entries != 0 INT3
+ * bp_desc = desc INT3
* WMB RMB
- * write INT3 if (nr_entries)
- *
- * Idem for other elements in bp_patching.
+ * write INT3 if (desc)
*/
smp_rmb();
- if (likely(!bp_patching.nr_entries))
- return 0;
-
- if (user_mode(regs))
+ desc = try_get_desc(&bp_desc);
+ if (!desc)
return 0;
/*
@@ -984,19 +1022,20 @@ int poke_int3_handler(struct pt_regs *regs)
/*
* Skip the binary search if there is a single member in the vector.
*/
- if (unlikely(bp_patching.nr_entries > 1)) {
- tp = bsearch(ip, bp_patching.vec, bp_patching.nr_entries,
+ if (unlikely(desc->nr_entries > 1)) {
+ tp = bsearch(ip, desc->vec, desc->nr_entries,
sizeof(struct text_poke_loc),
patch_cmp);
if (!tp)
- return 0;
+ goto out_put;
} else {
- tp = bp_patching.vec;
- if (tp->addr != ip)
- return 0;
+ tp = desc->vec;
+ if (text_poke_addr(tp) != ip)
+ goto out_put;
}
- ip += tp->len;
+ len = text_opcode_size(tp->opcode);
+ ip += len;
switch (tp->opcode) {
case INT3_INSN_OPCODE:
@@ -1004,7 +1043,7 @@ int poke_int3_handler(struct pt_regs *regs)
* Someone poked an explicit INT3, they'll want to handle it,
* do not consume.
*/
- return 0;
+ goto out_put;
case CALL_INSN_OPCODE:
int3_emulate_call(regs, (long)ip + tp->rel32);
@@ -1019,10 +1058,18 @@ int poke_int3_handler(struct pt_regs *regs)
BUG();
}
- return 1;
+ ret = 1;
+
+out_put:
+ put_desc(desc);
+ return ret;
}
NOKPROBE_SYMBOL(poke_int3_handler);
+#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
+static struct text_poke_loc tp_vec[TP_VEC_MAX];
+static int tp_vec_nr;
+
/**
* text_poke_bp_batch() -- update instructions on live kernel on SMP
* @tp: vector of instructions to patch
@@ -1044,16 +1091,20 @@ NOKPROBE_SYMBOL(poke_int3_handler);
* replacing opcode
* - sync cores
*/
-void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
+static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
{
+ struct bp_patching_desc desc = {
+ .vec = tp,
+ .nr_entries = nr_entries,
+ .refs = ATOMIC_INIT(1),
+ };
unsigned char int3 = INT3_INSN_OPCODE;
unsigned int i;
int do_sync;
lockdep_assert_held(&text_mutex);
- bp_patching.vec = tp;
- bp_patching.nr_entries = nr_entries;
+ smp_store_release(&bp_desc, &desc); /* rcu_assign_pointer */
/*
* Corresponding read barrier in int3 notifier for making sure the
@@ -1065,18 +1116,20 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
* First step: add a int3 trap to the address that will be patched.
*/
for (i = 0; i < nr_entries; i++)
- text_poke(tp[i].addr, &int3, sizeof(int3));
+ text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE);
- on_each_cpu(do_sync_core, NULL, 1);
+ text_poke_sync();
/*
* Second step: update all but the first byte of the patched range.
*/
for (do_sync = 0, i = 0; i < nr_entries; i++) {
- if (tp[i].len - sizeof(int3) > 0) {
- text_poke((char *)tp[i].addr + sizeof(int3),
- (const char *)tp[i].text + sizeof(int3),
- tp[i].len - sizeof(int3));
+ int len = text_opcode_size(tp[i].opcode);
+
+ if (len - INT3_INSN_SIZE > 0) {
+ text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
+ (const char *)tp[i].text + INT3_INSN_SIZE,
+ len - INT3_INSN_SIZE);
do_sync++;
}
}
@@ -1087,7 +1140,7 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
* not necessary and we'd be safe even without it. But
* better safe than sorry (plus there's not only Intel).
*/
- on_each_cpu(do_sync_core, NULL, 1);
+ text_poke_sync();
}
/*
@@ -1098,19 +1151,20 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
if (tp[i].text[0] == INT3_INSN_OPCODE)
continue;
- text_poke(tp[i].addr, tp[i].text, sizeof(int3));
+ text_poke(text_poke_addr(&tp[i]), tp[i].text, INT3_INSN_SIZE);
do_sync++;
}
if (do_sync)
- on_each_cpu(do_sync_core, NULL, 1);
+ text_poke_sync();
/*
- * sync_core() implies an smp_mb() and orders this store against
- * the writing of the new instruction.
+ * Remove and synchronize_rcu(), except we have a very primitive
+ * refcount based completion.
*/
- bp_patching.vec = NULL;
- bp_patching.nr_entries = 0;
+ WRITE_ONCE(bp_desc, NULL); /* RCU_INIT_POINTER */
+ if (!atomic_dec_and_test(&desc.refs))
+ atomic_cond_read_acquire(&desc.refs, !VAL);
}
void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
@@ -1118,11 +1172,7 @@ void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
{
struct insn insn;
- if (!opcode)
- opcode = (void *)tp->text;
- else
- memcpy((void *)tp->text, opcode, len);
-
+ memcpy((void *)tp->text, opcode, len);
if (!emulate)
emulate = opcode;
@@ -1132,8 +1182,7 @@ void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
BUG_ON(!insn_complete(&insn));
BUG_ON(len != insn.length);
- tp->addr = addr;
- tp->len = len;
+ tp->rel_addr = addr - (void *)_stext;
tp->opcode = insn.opcode.bytes[0];
switch (tp->opcode) {
@@ -1167,6 +1216,55 @@ void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
}
}
+/*
+ * We hard rely on the tp_vec being ordered; ensure this is so by flushing
+ * early if needed.
+ */
+static bool tp_order_fail(void *addr)
+{
+ struct text_poke_loc *tp;
+
+ if (!tp_vec_nr)
+ return false;
+
+ if (!addr) /* force */
+ return true;
+
+ tp = &tp_vec[tp_vec_nr - 1];
+ if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr)
+ return true;
+
+ return false;
+}
+
+static void text_poke_flush(void *addr)
+{
+ if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) {
+ text_poke_bp_batch(tp_vec, tp_vec_nr);
+ tp_vec_nr = 0;
+ }
+}
+
+void text_poke_finish(void)
+{
+ text_poke_flush(NULL);
+}
+
+void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate)
+{
+ struct text_poke_loc *tp;
+
+ if (unlikely(system_state == SYSTEM_BOOTING)) {
+ text_poke_early(addr, opcode, len);
+ return;
+ }
+
+ text_poke_flush(addr);
+
+ tp = &tp_vec[tp_vec_nr++];
+ text_poke_loc_init(tp, addr, opcode, len, emulate);
+}
+
/**
* text_poke_bp() -- update instructions on live kernel on SMP
* @addr: address to patch
@@ -1178,10 +1276,15 @@ void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
* dynamically allocated memory. This function should be used when it is
* not possible to allocate memory.
*/
-void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
+void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
{
struct text_poke_loc tp;
+ if (unlikely(system_state == SYSTEM_BOOTING)) {
+ text_poke_early(addr, opcode, len);
+ return;
+ }
+
text_poke_loc_init(&tp, addr, opcode, len, emulate);
text_poke_bp_batch(&tp, 1);
}
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 251c795b4eb3..69aed0ebbdfc 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -22,6 +22,7 @@
#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec
#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494
#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444
+#define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654
/* Protect the PCI config register pairs used for SMN and DF indirect access. */
static DEFINE_MUTEX(smn_mutex);
@@ -52,6 +53,7 @@ const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) },
{}
};
EXPORT_SYMBOL_GPL(amd_nb_misc_ids);
@@ -66,6 +68,7 @@ static const struct pci_device_id amd_nb_link_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) },
{}
};
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 5da106f84e84..fe698f96617c 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -95,7 +95,7 @@ static inline void apbt_set_mapping(void)
printk(KERN_WARNING "No timer base from SFI, use default\n");
apbt_address = APBT_DEFAULT_BASE;
}
- apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE);
+ apbt_virt_address = ioremap(apbt_address, APBT_MMAP_SIZE);
if (!apbt_virt_address) {
pr_debug("Failed mapping APBT phy address at %lu\n",\
(unsigned long)apbt_address);
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 28446fa6bf18..5f973fed3c9f 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -830,8 +830,17 @@ bool __init apic_needs_pit(void)
if (!tsc_khz || !cpu_khz)
return true;
- /* Is there an APIC at all? */
- if (!boot_cpu_has(X86_FEATURE_APIC))
+ /* Is there an APIC at all or is it disabled? */
+ if (!boot_cpu_has(X86_FEATURE_APIC) || disable_apic)
+ return true;
+
+ /*
+ * If interrupt delivery mode is legacy PIC or virtual wire without
+ * configuration, the local APIC timer wont be set up. Make sure
+ * that the PIT is initialized.
+ */
+ if (apic_intr_mode == APIC_PIC ||
+ apic_intr_mode == APIC_VIRTUAL_WIRE_NO_CONFIG)
return true;
/* Virt guests may lack ARAT, but still have DEADLINE */
@@ -1322,7 +1331,7 @@ void __init sync_Arb_IDs(void)
enum apic_intr_mode_id apic_intr_mode __ro_after_init;
-static int __init apic_intr_mode_select(void)
+static int __init __apic_intr_mode_select(void)
{
/* Check kernel option */
if (disable_apic) {
@@ -1384,6 +1393,12 @@ static int __init apic_intr_mode_select(void)
return APIC_SYMMETRIC_IO;
}
+/* Select the interrupt delivery mode for the BSP */
+void __init apic_intr_mode_select(void)
+{
+ apic_intr_mode = __apic_intr_mode_select();
+}
+
/*
* An initial setup of the virtual wire mode.
*/
@@ -1440,8 +1455,6 @@ void __init apic_intr_mode_init(void)
{
bool upmode = IS_ENABLED(CONFIG_UP_LATE_INIT);
- apic_intr_mode = apic_intr_mode_select();
-
switch (apic_intr_mode) {
case APIC_PIC:
pr_info("APIC: Keep in PIC mode(8259)\n");
@@ -2626,6 +2639,13 @@ static int lapic_suspend(void)
#endif
local_irq_save(flags);
+
+ /*
+ * Mask IOAPIC before disabling the local APIC to prevent stale IRR
+ * entries on some implementations.
+ */
+ mask_ioapic_entries();
+
disable_local_APIC();
irq_remapping_disable();
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 7f7533462474..159bd0cb8548 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -23,10 +23,8 @@
static struct irq_domain *msi_default_domain;
-static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
+static void __irq_msi_compose_msg(struct irq_cfg *cfg, struct msi_msg *msg)
{
- struct irq_cfg *cfg = irqd_cfg(data);
-
msg->address_hi = MSI_ADDR_BASE_HI;
if (x2apic_enabled())
@@ -47,6 +45,127 @@ static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
MSI_DATA_VECTOR(cfg->vector);
}
+static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
+{
+ __irq_msi_compose_msg(irqd_cfg(data), msg);
+}
+
+static void irq_msi_update_msg(struct irq_data *irqd, struct irq_cfg *cfg)
+{
+ struct msi_msg msg[2] = { [1] = { }, };
+
+ __irq_msi_compose_msg(cfg, msg);
+ irq_data_get_irq_chip(irqd)->irq_write_msi_msg(irqd, msg);
+}
+
+static int
+msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force)
+{
+ struct irq_cfg old_cfg, *cfg = irqd_cfg(irqd);
+ struct irq_data *parent = irqd->parent_data;
+ unsigned int cpu;
+ int ret;
+
+ /* Save the current configuration */
+ cpu = cpumask_first(irq_data_get_effective_affinity_mask(irqd));
+ old_cfg = *cfg;
+
+ /* Allocate a new target vector */
+ ret = parent->chip->irq_set_affinity(parent, mask, force);
+ if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
+ return ret;
+
+ /*
+ * For non-maskable and non-remapped MSI interrupts the migration
+ * to a different destination CPU and a different vector has to be
+ * done careful to handle the possible stray interrupt which can be
+ * caused by the non-atomic update of the address/data pair.
+ *
+ * Direct update is possible when:
+ * - The MSI is maskable (remapped MSI does not use this code path)).
+ * The quirk bit is not set in this case.
+ * - The new vector is the same as the old vector
+ * - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up)
+ * - The new destination CPU is the same as the old destination CPU
+ */
+ if (!irqd_msi_nomask_quirk(irqd) ||
+ cfg->vector == old_cfg.vector ||
+ old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR ||
+ cfg->dest_apicid == old_cfg.dest_apicid) {
+ irq_msi_update_msg(irqd, cfg);
+ return ret;
+ }
+
+ /*
+ * Paranoia: Validate that the interrupt target is the local
+ * CPU.
+ */
+ if (WARN_ON_ONCE(cpu != smp_processor_id())) {
+ irq_msi_update_msg(irqd, cfg);
+ return ret;
+ }
+
+ /*
+ * Redirect the interrupt to the new vector on the current CPU
+ * first. This might cause a spurious interrupt on this vector if
+ * the device raises an interrupt right between this update and the
+ * update to the final destination CPU.
+ *
+ * If the vector is in use then the installed device handler will
+ * denote it as spurious which is no harm as this is a rare event
+ * and interrupt handlers have to cope with spurious interrupts
+ * anyway. If the vector is unused, then it is marked so it won't
+ * trigger the 'No irq handler for vector' warning in do_IRQ().
+ *
+ * This requires to hold vector lock to prevent concurrent updates to
+ * the affected vector.
+ */
+ lock_vector_lock();
+
+ /*
+ * Mark the new target vector on the local CPU if it is currently
+ * unused. Reuse the VECTOR_RETRIGGERED state which is also used in
+ * the CPU hotplug path for a similar purpose. This cannot be
+ * undone here as the current CPU has interrupts disabled and
+ * cannot handle the interrupt before the whole set_affinity()
+ * section is done. In the CPU unplug case, the current CPU is
+ * about to vanish and will not handle any interrupts anymore. The
+ * vector is cleaned up when the CPU comes online again.
+ */
+ if (IS_ERR_OR_NULL(this_cpu_read(vector_irq[cfg->vector])))
+ this_cpu_write(vector_irq[cfg->vector], VECTOR_RETRIGGERED);
+
+ /* Redirect it to the new vector on the local CPU temporarily */
+ old_cfg.vector = cfg->vector;
+ irq_msi_update_msg(irqd, &old_cfg);
+
+ /* Now transition it to the target CPU */
+ irq_msi_update_msg(irqd, cfg);
+
+ /*
+ * All interrupts after this point are now targeted at the new
+ * vector/CPU.
+ *
+ * Drop vector lock before testing whether the temporary assignment
+ * to the local CPU was hit by an interrupt raised in the device,
+ * because the retrigger function acquires vector lock again.
+ */
+ unlock_vector_lock();
+
+ /*
+ * Check whether the transition raced with a device interrupt and
+ * is pending in the local APICs IRR. It is safe to do this outside
+ * of vector lock as the irq_desc::lock of this interrupt is still
+ * held and interrupts are disabled: The check is not accessing the
+ * underlying vector store. It's just checking the local APIC's
+ * IRR.
+ */
+ if (lapic_vector_set_in_irr(cfg->vector))
+ irq_data_get_irq_chip(irqd)->irq_retrigger(irqd);
+
+ return ret;
+}
+
/*
* IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
* which implement the MSI or MSI-X Capability Structure.
@@ -58,6 +177,7 @@ static struct irq_chip pci_msi_controller = {
.irq_ack = irq_chip_ack_parent,
.irq_retrigger = irq_chip_retrigger_hierarchy,
.irq_compose_msi_msg = irq_msi_compose_msg,
+ .irq_set_affinity = msi_set_affinity,
.flags = IRQCHIP_SKIP_SET_WAKE,
};
@@ -146,6 +266,8 @@ void __init arch_init_msi_domain(struct irq_domain *parent)
}
if (!msi_default_domain)
pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n");
+ else
+ msi_default_domain->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK;
}
#ifdef CONFIG_IRQ_REMAP
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 2c5676b0a6e7..48293d15f1e1 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -838,13 +838,15 @@ static void free_moved_vector(struct apic_chip_data *apicd)
bool managed = apicd->is_managed;
/*
- * This should never happen. Managed interrupts are not
- * migrated except on CPU down, which does not involve the
- * cleanup vector. But try to keep the accounting correct
- * nevertheless.
+ * Managed interrupts are usually not migrated away
+ * from an online CPU, but CPU isolation 'managed_irq'
+ * can make that happen.
+ * 1) Activation does not take the isolation into account
+ * to keep the code simple
+ * 2) Migration away from an isolated CPU can happen when
+ * a non-isolated CPU which is in the calculated
+ * affinity mask comes online.
*/
- WARN_ON_ONCE(managed);
-
trace_vector_free_moved(apicd->irq, cpu, vector, managed);
irq_matrix_free(vector_matrix, cpu, vector, managed);
per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index d5b51a740524..ad53b2abc859 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -1493,65 +1493,34 @@ static void check_efi_reboot(void)
}
/* Setup user proc fs files */
-static int proc_hubbed_show(struct seq_file *file, void *data)
+static int __maybe_unused proc_hubbed_show(struct seq_file *file, void *data)
{
seq_printf(file, "0x%x\n", uv_hubbed_system);
return 0;
}
-static int proc_hubless_show(struct seq_file *file, void *data)
+static int __maybe_unused proc_hubless_show(struct seq_file *file, void *data)
{
seq_printf(file, "0x%x\n", uv_hubless_system);
return 0;
}
-static int proc_oemid_show(struct seq_file *file, void *data)
+static int __maybe_unused proc_oemid_show(struct seq_file *file, void *data)
{
seq_printf(file, "%s/%s\n", oem_id, oem_table_id);
return 0;
}
-static int proc_hubbed_open(struct inode *inode, struct file *file)
-{
- return single_open(file, proc_hubbed_show, (void *)NULL);
-}
-
-static int proc_hubless_open(struct inode *inode, struct file *file)
-{
- return single_open(file, proc_hubless_show, (void *)NULL);
-}
-
-static int proc_oemid_open(struct inode *inode, struct file *file)
-{
- return single_open(file, proc_oemid_show, (void *)NULL);
-}
-
-/* (struct is "non-const" as open function is set at runtime) */
-static struct file_operations proc_version_fops = {
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static const struct file_operations proc_oemid_fops = {
- .open = proc_oemid_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
static __init void uv_setup_proc_files(int hubless)
{
struct proc_dir_entry *pde;
- char *name = hubless ? "hubless" : "hubbed";
pde = proc_mkdir(UV_PROC_NODE, NULL);
- proc_create("oemid", 0, pde, &proc_oemid_fops);
- proc_create(name, 0, pde, &proc_version_fops);
+ proc_create_single("oemid", 0, pde, proc_oemid_show);
if (hubless)
- proc_version_fops.open = proc_hubless_open;
+ proc_create_single("hubless", 0, pde, proc_hubless_show);
else
- proc_version_fops.open = proc_hubbed_open;
+ proc_create_single("hubbed", 0, pde, proc_hubbed_show);
}
/* Initialize UV hubless systems */
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 890f60083eca..7dc4ad68eb41 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -29,6 +29,7 @@ obj-y += umwait.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
+obj-$(CONFIG_IA32_FEAT_CTL) += feat_ctl.o
ifdef CONFIG_CPU_SUP_INTEL
obj-y += intel.o intel_pconfig.o tsx.o
obj-$(CONFIG_PM) += intel_epb.o
@@ -53,11 +54,12 @@ obj-$(CONFIG_ACRN_GUEST) += acrn.o
ifdef CONFIG_X86_FEATURE_NAMES
quiet_cmd_mkcapflags = MKCAP $@
- cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $< $@
+ cmd_mkcapflags = $(CONFIG_SHELL) $(srctree)/$(src)/mkcapflags.sh $@ $^
cpufeature = $(src)/../../include/asm/cpufeatures.h
+vmxfeature = $(src)/../../include/asm/vmxfeatures.h
-$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE
+$(obj)/capflags.c: $(cpufeature) $(vmxfeature) $(src)/mkcapflags.sh FORCE
$(call if_changed,mkcapflags)
endif
targets += capflags.c
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 62c30279be77..1f875fbe1384 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -28,6 +28,7 @@
static const int amd_erratum_383[];
static const int amd_erratum_400[];
+static const int amd_erratum_1054[];
static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum);
/*
@@ -319,13 +320,6 @@ static void legacy_fixup_core_id(struct cpuinfo_x86 *c)
c->cpu_core_id %= cus_per_node;
}
-
-static void amd_get_topology_early(struct cpuinfo_x86 *c)
-{
- if (cpu_has(c, X86_FEATURE_TOPOEXT))
- smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1;
-}
-
/*
* Fixup core topology information for
* (1) AMD multi-node processors
@@ -717,7 +711,8 @@ static void early_init_amd(struct cpuinfo_x86 *c)
}
}
- amd_get_topology_early(c);
+ if (cpu_has(c, X86_FEATURE_TOPOEXT))
+ smp_num_siblings = ((cpuid_ebx(0x8000001e) >> 8) & 0xff) + 1;
}
static void init_amd_k8(struct cpuinfo_x86 *c)
@@ -978,6 +973,15 @@ static void init_amd(struct cpuinfo_x86 *c)
/* AMD CPUs don't reset SS attributes on SYSRET, Xen does. */
if (!cpu_has(c, X86_FEATURE_XENPV))
set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
+
+ /*
+ * Turn on the Instructions Retired free counter on machines not
+ * susceptible to erratum #1054 "Instructions Retired Performance
+ * Counter May Be Inaccurate".
+ */
+ if (cpu_has(c, X86_FEATURE_IRPERF) &&
+ !cpu_has_amd_erratum(c, amd_erratum_1054))
+ msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT);
}
#ifdef CONFIG_X86_32
@@ -1105,6 +1109,10 @@ static const int amd_erratum_400[] =
static const int amd_erratum_383[] =
AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf));
+/* #1054: Instructions Retired Performance Counter May Be Inaccurate */
+static const int amd_erratum_1054[] =
+ AMD_OSVW_ERRATUM(0, AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf));
+
static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum)
{
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 8bf64899f56a..ed54b3b21c39 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -286,6 +286,13 @@ early_param("mds", mds_cmdline);
#undef pr_fmt
#define pr_fmt(fmt) "TAA: " fmt
+enum taa_mitigations {
+ TAA_MITIGATION_OFF,
+ TAA_MITIGATION_UCODE_NEEDED,
+ TAA_MITIGATION_VERW,
+ TAA_MITIGATION_TSX_DISABLED,
+};
+
/* Default mitigation for TAA-affected CPUs */
static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW;
static bool taa_nosmt __ro_after_init;
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 14433ff5b828..426792565d86 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -18,13 +18,6 @@
#define RNG_ENABLED (1 << 3)
#define RNG_ENABLE (1 << 6) /* MSR_VIA_RNG */
-#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
-#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000
-#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000
-#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
-#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
-#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
-
static void init_c3(struct cpuinfo_x86 *c)
{
u32 lo, hi;
@@ -71,8 +64,6 @@ static void init_c3(struct cpuinfo_x86 *c)
c->x86_cache_alignment = c->x86_clflush_size * 2;
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
}
-
- cpu_detect_cache_sizes(c);
}
enum {
@@ -119,31 +110,6 @@ static void early_init_centaur(struct cpuinfo_x86 *c)
}
}
-static void centaur_detect_vmx_virtcap(struct cpuinfo_x86 *c)
-{
- u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
-
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
- msr_ctl = vmx_msr_high | vmx_msr_low;
-
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
- set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
- set_cpu_cap(c, X86_FEATURE_VNMI);
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
- vmx_msr_low, vmx_msr_high);
- msr_ctl2 = vmx_msr_high | vmx_msr_low;
- if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
- (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
- set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
- if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT)
- set_cpu_cap(c, X86_FEATURE_EPT);
- if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
- set_cpu_cap(c, X86_FEATURE_VPID);
- }
-}
-
static void init_centaur(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_32
@@ -250,8 +216,7 @@ static void init_centaur(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
#endif
- if (cpu_has(c, X86_FEATURE_VMX))
- centaur_detect_vmx_virtcap(c);
+ init_ia32_feat_ctl(c);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 2e4d90294fe6..4cdb123ff66a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -14,6 +14,7 @@
#include <linux/sched/mm.h>
#include <linux/sched/clock.h>
#include <linux/sched/task.h>
+#include <linux/sched/smt.h>
#include <linux/init.h>
#include <linux/kprobes.h>
#include <linux/kgdb.h>
@@ -49,7 +50,7 @@
#include <asm/cpu.h>
#include <asm/mce.h>
#include <asm/msr.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include <asm/microcode.h>
#include <asm/microcode_intel.h>
#include <asm/intel-family.h>
@@ -163,22 +164,6 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
} };
EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
-static int __init x86_mpx_setup(char *s)
-{
- /* require an exact match without trailing characters */
- if (strlen(s))
- return 0;
-
- /* do not emit a message if the feature is not present */
- if (!boot_cpu_has(X86_FEATURE_MPX))
- return 1;
-
- setup_clear_cpu_cap(X86_FEATURE_MPX);
- pr_info("nompx: Intel Memory Protection Extensions (MPX) disabled\n");
- return 1;
-}
-__setup("nompx", x86_mpx_setup);
-
#ifdef CONFIG_X86_64
static int __init x86_nopcid_setup(char *s)
{
@@ -305,8 +290,6 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
static __init int setup_disable_smep(char *arg)
{
setup_clear_cpu_cap(X86_FEATURE_SMEP);
- /* Check for things that depend on SMEP being enabled: */
- check_mpx_erratum(&boot_cpu_data);
return 1;
}
__setup("nosmep", setup_disable_smep);
@@ -462,7 +445,7 @@ static __always_inline void setup_pku(struct cpuinfo_x86 *c)
* cpuid bit to be set. We need to ensure that we
* update that bit in this CPU's "cpu_info".
*/
- get_cpu_cap(c);
+ set_cpu_cap(c, X86_FEATURE_OSPKE);
}
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
@@ -1023,6 +1006,7 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
#define MSBDS_ONLY BIT(5)
#define NO_SWAPGS BIT(6)
#define NO_ITLB_MULTIHIT BIT(7)
+#define NO_SPECTRE_V2 BIT(8)
#define VULNWL(_vendor, _family, _model, _whitelist) \
{ X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist }
@@ -1084,6 +1068,10 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
/* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */
VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT),
+
+ /* Zhaoxin Family 7 */
+ VULNWL(CENTAUR, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS),
+ VULNWL(ZHAOXIN, 7, X86_MODEL_ANY, NO_SPECTRE_V2 | NO_SWAPGS),
{}
};
@@ -1116,7 +1104,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
return;
setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
- setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
+
+ if (!cpu_matches(NO_SPECTRE_V2))
+ setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) &&
!cpu_has(c, X86_FEATURE_AMD_SSB_NO))
@@ -1449,6 +1439,9 @@ static void identify_cpu(struct cpuinfo_x86 *c)
#endif
c->x86_cache_alignment = c->x86_clflush_size;
memset(&c->x86_capability, 0, sizeof(c->x86_capability));
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+ memset(&c->vmx_capability, 0, sizeof(c->vmx_capability));
+#endif
generic_identify(c);
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 38ab6e115eac..37fdefd14f28 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -80,4 +80,8 @@ extern void x86_spec_ctrl_setup_ap(void);
extern u64 x86_read_arch_cap_msr(void);
+#ifdef CONFIG_IA32_FEAT_CTL
+void init_ia32_feat_ctl(struct cpuinfo_x86 *c);
+#endif
+
#endif /* ARCH_X86_CPU_H */
diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c
new file mode 100644
index 000000000000..0268185bef94
--- /dev/null
+++ b/arch/x86/kernel/cpu/feat_ctl.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/tboot.h>
+
+#include <asm/cpufeature.h>
+#include <asm/msr-index.h>
+#include <asm/processor.h>
+#include <asm/vmx.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "x86/cpu: " fmt
+
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+enum vmx_feature_leafs {
+ MISC_FEATURES = 0,
+ PRIMARY_CTLS,
+ SECONDARY_CTLS,
+ NR_VMX_FEATURE_WORDS,
+};
+
+#define VMX_F(x) BIT(VMX_FEATURE_##x & 0x1f)
+
+static void init_vmx_capabilities(struct cpuinfo_x86 *c)
+{
+ u32 supported, funcs, ept, vpid, ign;
+
+ BUILD_BUG_ON(NVMXINTS != NR_VMX_FEATURE_WORDS);
+
+ /*
+ * The high bits contain the allowed-1 settings, i.e. features that can
+ * be turned on. The low bits contain the allowed-0 settings, i.e.
+ * features that can be turned off. Ignore the allowed-0 settings,
+ * if a feature can be turned on then it's supported.
+ *
+ * Use raw rdmsr() for primary processor controls and pin controls MSRs
+ * as they exist on any CPU that supports VMX, i.e. we want the WARN if
+ * the RDMSR faults.
+ */
+ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, ign, supported);
+ c->vmx_capability[PRIMARY_CTLS] = supported;
+
+ rdmsr_safe(MSR_IA32_VMX_PROCBASED_CTLS2, &ign, &supported);
+ c->vmx_capability[SECONDARY_CTLS] = supported;
+
+ rdmsr(MSR_IA32_VMX_PINBASED_CTLS, ign, supported);
+ rdmsr_safe(MSR_IA32_VMX_VMFUNC, &ign, &funcs);
+
+ /*
+ * Except for EPT+VPID, which enumerates support for both in a single
+ * MSR, low for EPT, high for VPID.
+ */
+ rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, &ept, &vpid);
+
+ /* Pin, EPT, VPID and VM-Func are merged into a single word. */
+ WARN_ON_ONCE(supported >> 16);
+ WARN_ON_ONCE(funcs >> 4);
+ c->vmx_capability[MISC_FEATURES] = (supported & 0xffff) |
+ ((vpid & 0x1) << 16) |
+ ((funcs & 0xf) << 28);
+
+ /* EPT bits are full on scattered and must be manually handled. */
+ if (ept & VMX_EPT_EXECUTE_ONLY_BIT)
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_EXECUTE_ONLY);
+ if (ept & VMX_EPT_AD_BIT)
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_AD);
+ if (ept & VMX_EPT_1GB_PAGE_BIT)
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(EPT_1GB);
+
+ /* Synthetic APIC features that are aggregates of multiple features. */
+ if ((c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR)) &&
+ (c->vmx_capability[SECONDARY_CTLS] & VMX_F(VIRT_APIC_ACCESSES)))
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(FLEXPRIORITY);
+
+ if ((c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR)) &&
+ (c->vmx_capability[SECONDARY_CTLS] & VMX_F(APIC_REGISTER_VIRT)) &&
+ (c->vmx_capability[SECONDARY_CTLS] & VMX_F(VIRT_INTR_DELIVERY)) &&
+ (c->vmx_capability[MISC_FEATURES] & VMX_F(POSTED_INTR)))
+ c->vmx_capability[MISC_FEATURES] |= VMX_F(APICV);
+
+ /* Set the synthetic cpufeatures to preserve /proc/cpuinfo's ABI. */
+ if (c->vmx_capability[PRIMARY_CTLS] & VMX_F(VIRTUAL_TPR))
+ set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
+ if (c->vmx_capability[MISC_FEATURES] & VMX_F(FLEXPRIORITY))
+ set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
+ if (c->vmx_capability[MISC_FEATURES] & VMX_F(VIRTUAL_NMIS))
+ set_cpu_cap(c, X86_FEATURE_VNMI);
+ if (c->vmx_capability[SECONDARY_CTLS] & VMX_F(EPT))
+ set_cpu_cap(c, X86_FEATURE_EPT);
+ if (c->vmx_capability[MISC_FEATURES] & VMX_F(EPT_AD))
+ set_cpu_cap(c, X86_FEATURE_EPT_AD);
+ if (c->vmx_capability[MISC_FEATURES] & VMX_F(VPID))
+ set_cpu_cap(c, X86_FEATURE_VPID);
+}
+#endif /* CONFIG_X86_VMX_FEATURE_NAMES */
+
+void init_ia32_feat_ctl(struct cpuinfo_x86 *c)
+{
+ bool tboot = tboot_enabled();
+ u64 msr;
+
+ if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) {
+ clear_cpu_cap(c, X86_FEATURE_VMX);
+ return;
+ }
+
+ if (msr & FEAT_CTL_LOCKED)
+ goto update_caps;
+
+ /*
+ * Ignore whatever value BIOS left in the MSR to avoid enabling random
+ * features or faulting on the WRMSR.
+ */
+ msr = FEAT_CTL_LOCKED;
+
+ /*
+ * Enable VMX if and only if the kernel may do VMXON at some point,
+ * i.e. KVM is enabled, to avoid unnecessarily adding an attack vector
+ * for the kernel, e.g. using VMX to hide malicious code.
+ */
+ if (cpu_has(c, X86_FEATURE_VMX) && IS_ENABLED(CONFIG_KVM_INTEL)) {
+ msr |= FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
+
+ if (tboot)
+ msr |= FEAT_CTL_VMX_ENABLED_INSIDE_SMX;
+ }
+
+ wrmsrl(MSR_IA32_FEAT_CTL, msr);
+
+update_caps:
+ set_cpu_cap(c, X86_FEATURE_MSR_IA32_FEAT_CTL);
+
+ if (!cpu_has(c, X86_FEATURE_VMX))
+ return;
+
+ if ( (tboot && !(msr & FEAT_CTL_VMX_ENABLED_INSIDE_SMX)) ||
+ (!tboot && !(msr & FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX))) {
+ if (IS_ENABLED(CONFIG_KVM_INTEL))
+ pr_err_once("VMX (%s TXT) disabled by BIOS\n",
+ tboot ? "inside" : "outside");
+ clear_cpu_cap(c, X86_FEATURE_VMX);
+ } else {
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+ init_vmx_capabilities(c);
+#endif
+ }
+}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 4a900804a023..be82cd5841c3 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -32,41 +32,6 @@
#endif
/*
- * Just in case our CPU detection goes bad, or you have a weird system,
- * allow a way to override the automatic disabling of MPX.
- */
-static int forcempx;
-
-static int __init forcempx_setup(char *__unused)
-{
- forcempx = 1;
-
- return 1;
-}
-__setup("intel-skd-046-workaround=disable", forcempx_setup);
-
-void check_mpx_erratum(struct cpuinfo_x86 *c)
-{
- if (forcempx)
- return;
- /*
- * Turn off the MPX feature on CPUs where SMEP is not
- * available or disabled.
- *
- * Works around Intel Erratum SKD046: "Branch Instructions
- * May Initialize MPX Bound Registers Incorrectly".
- *
- * This might falsely disable MPX on systems without
- * SMEP, like Atom processors without SMEP. But there
- * is no such hardware known at the moment.
- */
- if (cpu_has(c, X86_FEATURE_MPX) && !cpu_has(c, X86_FEATURE_SMEP)) {
- setup_clear_cpu_cap(X86_FEATURE_MPX);
- pr_warn("x86/mpx: Disabling MPX since SMEP not present\n");
- }
-}
-
-/*
* Processors which have self-snooping capability can handle conflicting
* memory type across CPUs by snooping its own cache. However, there exists
* CPU models in which having conflicting memory types still leads to
@@ -330,7 +295,6 @@ static void early_init_intel(struct cpuinfo_x86 *c)
c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff);
}
- check_mpx_erratum(c);
check_memory_type_self_snoop_errata(c);
/*
@@ -494,52 +458,6 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
#endif
}
-static void detect_vmx_virtcap(struct cpuinfo_x86 *c)
-{
- /* Intel VMX MSR indicated features */
-#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
-#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000
-#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000
-#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
-#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
-#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
-#define x86_VMX_FEATURE_EPT_CAP_AD 0x00200000
-
- u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
- u32 msr_vpid_cap, msr_ept_cap;
-
- clear_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
- clear_cpu_cap(c, X86_FEATURE_VNMI);
- clear_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
- clear_cpu_cap(c, X86_FEATURE_EPT);
- clear_cpu_cap(c, X86_FEATURE_VPID);
- clear_cpu_cap(c, X86_FEATURE_EPT_AD);
-
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
- msr_ctl = vmx_msr_high | vmx_msr_low;
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
- set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
- set_cpu_cap(c, X86_FEATURE_VNMI);
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
- vmx_msr_low, vmx_msr_high);
- msr_ctl2 = vmx_msr_high | vmx_msr_low;
- if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
- (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
- set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
- if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT) {
- set_cpu_cap(c, X86_FEATURE_EPT);
- rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
- msr_ept_cap, msr_vpid_cap);
- if (msr_ept_cap & x86_VMX_FEATURE_EPT_CAP_AD)
- set_cpu_cap(c, X86_FEATURE_EPT_AD);
- }
- if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
- set_cpu_cap(c, X86_FEATURE_VPID);
- }
-}
-
#define MSR_IA32_TME_ACTIVATE 0x982
/* Helpers to access TME_ACTIVATE MSR */
@@ -755,8 +673,7 @@ static void init_intel(struct cpuinfo_x86 *c)
/* Work around errata */
srat_detect_node(c);
- if (cpu_has(c, X86_FEATURE_VMX))
- detect_vmx_virtcap(c);
+ init_ia32_feat_ctl(c);
if (cpu_has(c, X86_FEATURE_TME))
detect_tme(c);
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index d6cf5c18a7e0..52de616a8065 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -78,6 +78,7 @@ struct smca_bank_name {
static struct smca_bank_name smca_names[] = {
[SMCA_LS] = { "load_store", "Load Store Unit" },
+ [SMCA_LS_V2] = { "load_store", "Load Store Unit" },
[SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" },
[SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" },
[SMCA_DE] = { "decode_unit", "Decode Unit" },
@@ -138,6 +139,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
/* ZN Core (HWID=0xB0) MCA types */
{ SMCA_LS, HWID_MCATYPE(0xB0, 0x0), 0x1FFFFF },
+ { SMCA_LS_V2, HWID_MCATYPE(0xB0, 0x10), 0xFFFFFF },
{ SMCA_IF, HWID_MCATYPE(0xB0, 0x1), 0x3FFF },
{ SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2), 0xF },
{ SMCA_DE, HWID_MCATYPE(0xB0, 0x3), 0x1FF },
@@ -1161,9 +1163,12 @@ static const struct sysfs_ops threshold_ops = {
.store = store,
};
+static void threshold_block_release(struct kobject *kobj);
+
static struct kobj_type threshold_ktype = {
.sysfs_ops = &threshold_ops,
.default_attrs = default_attrs,
+ .release = threshold_block_release,
};
static const char *get_name(unsigned int bank, struct threshold_block *b)
@@ -1196,8 +1201,9 @@ static const char *get_name(unsigned int bank, struct threshold_block *b)
return buf_mcatype;
}
-static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
- unsigned int block, u32 address)
+static int allocate_threshold_blocks(unsigned int cpu, struct threshold_bank *tb,
+ unsigned int bank, unsigned int block,
+ u32 address)
{
struct threshold_block *b = NULL;
u32 low, high;
@@ -1241,16 +1247,12 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
INIT_LIST_HEAD(&b->miscj);
- if (per_cpu(threshold_banks, cpu)[bank]->blocks) {
- list_add(&b->miscj,
- &per_cpu(threshold_banks, cpu)[bank]->blocks->miscj);
- } else {
- per_cpu(threshold_banks, cpu)[bank]->blocks = b;
- }
+ if (tb->blocks)
+ list_add(&b->miscj, &tb->blocks->miscj);
+ else
+ tb->blocks = b;
- err = kobject_init_and_add(&b->kobj, &threshold_ktype,
- per_cpu(threshold_banks, cpu)[bank]->kobj,
- get_name(bank, b));
+ err = kobject_init_and_add(&b->kobj, &threshold_ktype, tb->kobj, get_name(bank, b));
if (err)
goto out_free;
recurse:
@@ -1258,7 +1260,7 @@ recurse:
if (!address)
return 0;
- err = allocate_threshold_blocks(cpu, bank, block, address);
+ err = allocate_threshold_blocks(cpu, tb, bank, block, address);
if (err)
goto out_free;
@@ -1343,8 +1345,6 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
goto out_free;
}
- per_cpu(threshold_banks, cpu)[bank] = b;
-
if (is_shared_bank(bank)) {
refcount_set(&b->cpus, 1);
@@ -1355,9 +1355,13 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
}
}
- err = allocate_threshold_blocks(cpu, bank, 0, msr_ops.misc(bank));
- if (!err)
- goto out;
+ err = allocate_threshold_blocks(cpu, b, bank, 0, msr_ops.misc(bank));
+ if (err)
+ goto out_free;
+
+ per_cpu(threshold_banks, cpu)[bank] = b;
+
+ return 0;
out_free:
kfree(b);
@@ -1366,8 +1370,12 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank)
return err;
}
-static void deallocate_threshold_block(unsigned int cpu,
- unsigned int bank)
+static void threshold_block_release(struct kobject *kobj)
+{
+ kfree(to_block(kobj));
+}
+
+static void deallocate_threshold_block(unsigned int cpu, unsigned int bank)
{
struct threshold_block *pos = NULL;
struct threshold_block *tmp = NULL;
@@ -1377,13 +1385,11 @@ static void deallocate_threshold_block(unsigned int cpu,
return;
list_for_each_entry_safe(pos, tmp, &head->blocks->miscj, miscj) {
- kobject_put(&pos->kobj);
list_del(&pos->miscj);
- kfree(pos);
+ kobject_put(&pos->kobj);
}
- kfree(per_cpu(threshold_banks, cpu)[bank]->blocks);
- per_cpu(threshold_banks, cpu)[bank]->blocks = NULL;
+ kobject_put(&head->blocks->kobj);
}
static void __threshold_remove_blocks(struct threshold_bank *b)
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 2e2a421c8528..2c4f949611e4 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -53,8 +53,6 @@
#include "internal.h"
-static DEFINE_MUTEX(mce_log_mutex);
-
/* sysfs synchronization */
static DEFINE_MUTEX(mce_sysfs_mutex);
@@ -156,19 +154,10 @@ void mce_log(struct mce *m)
if (!mce_gen_pool_add(m))
irq_work_queue(&mce_irq_work);
}
-
-void mce_inject_log(struct mce *m)
-{
- mutex_lock(&mce_log_mutex);
- mce_log(m);
- mutex_unlock(&mce_log_mutex);
-}
-EXPORT_SYMBOL_GPL(mce_inject_log);
-
-static struct notifier_block mce_srao_nb;
+EXPORT_SYMBOL_GPL(mce_log);
/*
- * We run the default notifier if we have only the SRAO, the first and the
+ * We run the default notifier if we have only the UC, the first and the
* default notifier registered. I.e., the mandatory NUM_DEFAULT_NOTIFIERS
* notifiers registered on the chain.
*/
@@ -594,26 +583,29 @@ static struct notifier_block first_nb = {
.priority = MCE_PRIO_FIRST,
};
-static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
- void *data)
+static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
{
struct mce *mce = (struct mce *)data;
unsigned long pfn;
- if (!mce)
+ if (!mce || !mce_usable_address(mce))
return NOTIFY_DONE;
- if (mce_usable_address(mce) && (mce->severity == MCE_AO_SEVERITY)) {
- pfn = mce->addr >> PAGE_SHIFT;
- if (!memory_failure(pfn, 0))
- set_mce_nospec(pfn);
- }
+ if (mce->severity != MCE_AO_SEVERITY &&
+ mce->severity != MCE_DEFERRED_SEVERITY)
+ return NOTIFY_DONE;
+
+ pfn = mce->addr >> PAGE_SHIFT;
+ if (!memory_failure(pfn, 0))
+ set_mce_nospec(pfn);
return NOTIFY_OK;
}
-static struct notifier_block mce_srao_nb = {
- .notifier_call = srao_decode_notifier,
- .priority = MCE_PRIO_SRAO,
+
+static struct notifier_block mce_uc_nb = {
+ .notifier_call = uc_decode_notifier,
+ .priority = MCE_PRIO_UC,
};
static int mce_default_notifier(struct notifier_block *nb, unsigned long val,
@@ -763,26 +755,22 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
log_it:
error_seen = true;
- mce_read_aux(&m, i);
+ if (flags & MCP_DONTLOG)
+ goto clear_it;
+ mce_read_aux(&m, i);
m.severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
-
/*
* Don't get the IP here because it's unlikely to
* have anything to do with the actual error location.
*/
- if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
- mce_log(&m);
- else if (mce_usable_address(&m)) {
- /*
- * Although we skipped logging this, we still want
- * to take action. Add to the pool so the registered
- * notifiers will see it.
- */
- if (!mce_gen_pool_add(&m))
- mce_schedule_work();
- }
+ if (mca_cfg.dont_log_ce && !mce_usable_address(&m))
+ goto clear_it;
+
+ mce_log(&m);
+
+clear_it:
/*
* Clear state for this bank.
*/
@@ -807,7 +795,7 @@ EXPORT_SYMBOL_GPL(machine_check_poll);
static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
struct pt_regs *regs)
{
- char *tmp;
+ char *tmp = *msg;
int i;
for (i = 0; i < this_cpu_read(mce_num_banks); i++) {
@@ -1232,8 +1220,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
DECLARE_BITMAP(toclear, MAX_NR_BANKS);
struct mca_config *cfg = &mca_cfg;
int cpu = smp_processor_id();
- char *msg = "Unknown";
struct mce m, *final;
+ char *msg = NULL;
int worst = 0;
/*
@@ -1365,7 +1353,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
ist_end_non_atomic();
} else {
if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0))
- mce_panic("Failed kernel mode recovery", &m, NULL);
+ mce_panic("Failed kernel mode recovery", &m, msg);
}
out_ist:
@@ -2041,7 +2029,7 @@ int __init mcheck_init(void)
{
mcheck_intel_therm_init();
mce_register_decode_chain(&first_nb);
- mce_register_decode_chain(&mce_srao_nb);
+ mce_register_decode_chain(&mce_uc_nb);
mce_register_decode_chain(&mce_default_nb);
mcheck_vendor_init_severity();
diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c
index 1f30117b24ba..3413b41b8d55 100644
--- a/arch/x86/kernel/cpu/mce/inject.c
+++ b/arch/x86/kernel/cpu/mce/inject.c
@@ -494,7 +494,7 @@ static void do_inject(void)
i_mce.status |= MCI_STATUS_SYNDV;
if (inj_type == SW_INJ) {
- mce_inject_log(&i_mce);
+ mce_log(&i_mce);
return;
}
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index e270d0770134..f996ffb887bc 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -115,15 +115,16 @@ static bool lmce_supported(void)
/*
* BIOS should indicate support for LMCE by setting bit 20 in
- * IA32_FEATURE_CONTROL without which touching MCG_EXT_CTL will
- * generate a #GP fault.
+ * IA32_FEAT_CTL without which touching MCG_EXT_CTL will generate a #GP
+ * fault. The MSR must also be locked for LMCE_ENABLED to take effect.
+ * WARN if the MSR isn't locked as init_ia32_feat_ctl() unconditionally
+ * locks the MSR in the event that it wasn't already locked by BIOS.
*/
- rdmsrl(MSR_IA32_FEATURE_CONTROL, tmp);
- if ((tmp & (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE)) ==
- (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE))
- return true;
+ rdmsrl(MSR_IA32_FEAT_CTL, tmp);
+ if (WARN_ON_ONCE(!(tmp & FEAT_CTL_LOCKED)))
+ return false;
- return false;
+ return tmp & FEAT_CTL_LMCE_ENABLED;
}
bool mce_intel_cmci_poll(void)
@@ -492,17 +493,18 @@ static void intel_ppin_init(struct cpuinfo_x86 *c)
return;
if ((val & 3UL) == 1UL) {
- /* PPIN available but disabled: */
+ /* PPIN locked in disabled mode */
return;
}
- /* If PPIN is disabled, but not locked, try to enable: */
- if (!(val & 3UL)) {
+ /* If PPIN is disabled, try to enable */
+ if (!(val & 2UL)) {
wrmsrl_safe(MSR_PPIN_CTL, val | 2UL);
rdmsrl_safe(MSR_PPIN_CTL, &val);
}
- if ((val & 3UL) == 2UL)
+ /* Is the enable bit set? */
+ if (val & 2UL)
set_cpu_cap(c, X86_FEATURE_INTEL_PPIN);
}
}
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 842b273bce31..b785c0d0b590 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -84,8 +84,6 @@ static inline int apei_clear_mce(u64 record_id)
}
#endif
-void mce_inject_log(struct mce *m);
-
/*
* We consider records to be equivalent if bank+status+addr+misc all match.
* This is only used when the system is going down because of a fatal error
diff --git a/arch/x86/kernel/cpu/mce/therm_throt.c b/arch/x86/kernel/cpu/mce/therm_throt.c
index 6c3e1c92f183..f36dc0742085 100644
--- a/arch/x86/kernel/cpu/mce/therm_throt.c
+++ b/arch/x86/kernel/cpu/mce/therm_throt.c
@@ -235,7 +235,7 @@ static void get_therm_status(int level, bool *proc_hot, u8 *temp)
*temp = (msr_val >> 16) & 0x7F;
}
-static void throttle_active_work(struct work_struct *work)
+static void __maybe_unused throttle_active_work(struct work_struct *work)
{
struct _thermal_state *state = container_of(to_delayed_work(work),
struct _thermal_state, therm_work);
@@ -486,9 +486,14 @@ static int thermal_throttle_offline(unsigned int cpu)
{
struct thermal_state *state = &per_cpu(thermal_state, cpu);
struct device *dev = get_cpu_device(cpu);
+ u32 l;
+
+ /* Mask the thermal vector before draining evtl. pending work */
+ l = apic_read(APIC_LVTTHMR);
+ apic_write(APIC_LVTTHMR, l | APIC_LVT_MASKED);
- cancel_delayed_work(&state->package_throttle.therm_work);
- cancel_delayed_work(&state->core_throttle.therm_work);
+ cancel_delayed_work_sync(&state->package_throttle.therm_work);
+ cancel_delayed_work_sync(&state->core_throttle.therm_work);
state->package_throttle.rate_control_active = false;
state->core_throttle.rate_control_active = false;
diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh
index aed45b8895d5..1db560ed2ca3 100644
--- a/arch/x86/kernel/cpu/mkcapflags.sh
+++ b/arch/x86/kernel/cpu/mkcapflags.sh
@@ -6,8 +6,7 @@
set -e
-IN=$1
-OUT=$2
+OUT=$1
dump_array()
{
@@ -15,6 +14,7 @@ dump_array()
SIZE=$2
PFX=$3
POSTFIX=$4
+ IN=$5
PFX_SZ=$(echo $PFX | wc -c)
TABS="$(printf '\t\t\t\t\t')"
@@ -57,11 +57,18 @@ trap 'rm "$OUT"' EXIT
echo "#endif"
echo ""
- dump_array "x86_cap_flags" "NCAPINTS*32" "X86_FEATURE_" ""
+ dump_array "x86_cap_flags" "NCAPINTS*32" "X86_FEATURE_" "" $2
echo ""
- dump_array "x86_bug_flags" "NBUGINTS*32" "X86_BUG_" "NCAPINTS*32"
+ dump_array "x86_bug_flags" "NBUGINTS*32" "X86_BUG_" "NCAPINTS*32" $2
+ echo ""
+ echo "#ifdef CONFIG_X86_VMX_FEATURE_NAMES"
+ echo "#ifndef _ASM_X86_VMXFEATURES_H"
+ echo "#include <asm/vmxfeatures.h>"
+ echo "#endif"
+ dump_array "x86_vmx_flags" "NVMXINTS*32" "VMX_FEATURE_" "" $3
+ echo "#endif /* CONFIG_X86_VMX_FEATURE_NAMES */"
) > $OUT
trap - EXIT
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index aa5c064a6a22..51b9190c628b 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -15,7 +15,7 @@
#include <asm/tlbflush.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include "mtrr.h"
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 4d36dcc1cf87..a5c506f6da7f 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -101,9 +101,6 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
int length;
size_t linelen;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
memset(line, 0, LINE_SIZE);
len = min_t(size_t, len, LINE_SIZE - 1);
@@ -226,8 +223,6 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#ifdef CONFIG_COMPAT
case MTRRIOC32_ADD_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err =
mtrr_file_add(sentry.base, sentry.size, sentry.type, true,
file, 0);
@@ -236,24 +231,18 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#ifdef CONFIG_COMPAT
case MTRRIOC32_SET_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err = mtrr_add(sentry.base, sentry.size, sentry.type, false);
break;
case MTRRIOC_DEL_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_DEL_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err = mtrr_file_del(sentry.base, sentry.size, file, 0);
break;
case MTRRIOC_KILL_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_KILL_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err = mtrr_del(-1, sentry.base, sentry.size);
break;
case MTRRIOC_GET_ENTRY:
@@ -279,8 +268,6 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#ifdef CONFIG_COMPAT
case MTRRIOC32_ADD_PAGE_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err =
mtrr_file_add(sentry.base, sentry.size, sentry.type, true,
file, 1);
@@ -289,8 +276,6 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#ifdef CONFIG_COMPAT
case MTRRIOC32_SET_PAGE_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err =
mtrr_add_page(sentry.base, sentry.size, sentry.type, false);
break;
@@ -298,16 +283,12 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
#ifdef CONFIG_COMPAT
case MTRRIOC32_DEL_PAGE_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err = mtrr_file_del(sentry.base, sentry.size, file, 1);
break;
case MTRRIOC_KILL_PAGE_ENTRY:
#ifdef CONFIG_COMPAT
case MTRRIOC32_KILL_PAGE_ENTRY:
#endif
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
err = mtrr_del_page(-1, sentry.base, sentry.size);
break;
case MTRRIOC_GET_PAGE_ENTRY:
@@ -373,28 +354,6 @@ static int mtrr_close(struct inode *ino, struct file *file)
return single_release(ino, file);
}
-static int mtrr_seq_show(struct seq_file *seq, void *offset);
-
-static int mtrr_open(struct inode *inode, struct file *file)
-{
- if (!mtrr_if)
- return -EIO;
- if (!mtrr_if->get)
- return -ENXIO;
- return single_open(file, mtrr_seq_show, NULL);
-}
-
-static const struct file_operations mtrr_fops = {
- .owner = THIS_MODULE,
- .open = mtrr_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .write = mtrr_write,
- .unlocked_ioctl = mtrr_ioctl,
- .compat_ioctl = mtrr_ioctl,
- .release = mtrr_close,
-};
-
static int mtrr_seq_show(struct seq_file *seq, void *offset)
{
char factor;
@@ -426,6 +385,29 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
return 0;
}
+static int mtrr_open(struct inode *inode, struct file *file)
+{
+ if (!mtrr_if)
+ return -EIO;
+ if (!mtrr_if->get)
+ return -ENXIO;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ return single_open(file, mtrr_seq_show, NULL);
+}
+
+static const struct proc_ops mtrr_proc_ops = {
+ .proc_open = mtrr_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = mtrr_write,
+ .proc_ioctl = mtrr_ioctl,
+#ifdef CONFIG_COMPAT
+ .proc_compat_ioctl = mtrr_ioctl,
+#endif
+ .proc_release = mtrr_close,
+};
+
static int __init mtrr_if_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
@@ -436,7 +418,7 @@ static int __init mtrr_if_init(void)
(!cpu_has(c, X86_FEATURE_CENTAUR_MCR)))
return -ENODEV;
- proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_fops);
+ proc_create("mtrr", S_IWUSR | S_IRUGO, NULL, &mtrr_proc_ops);
return 0;
}
arch_initcall(mtrr_if_init);
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.c b/arch/x86/kernel/cpu/mtrr/mtrr.c
index 507039c20128..6a80f36b5d59 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.c
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.c
@@ -52,7 +52,7 @@
#include <asm/e820/api.h>
#include <asm/mtrr.h>
#include <asm/msr.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include "mtrr.h"
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index cb2e49810d68..4eec8889b0ff 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -7,6 +7,10 @@
#include "cpu.h"
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+extern const char * const x86_vmx_flags[NVMXINTS*32];
+#endif
+
/*
* Get CPU information for use by the procfs.
*/
@@ -102,6 +106,17 @@ static int show_cpuinfo(struct seq_file *m, void *v)
if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
seq_printf(m, " %s", x86_cap_flags[i]);
+#ifdef CONFIG_X86_VMX_FEATURE_NAMES
+ if (cpu_has(c, X86_FEATURE_VMX) && c->vmx_capability[0]) {
+ seq_puts(m, "\nvmx flags\t:");
+ for (i = 0; i < 32*NVMXINTS; i++) {
+ if (test_bit(i, (unsigned long *)c->vmx_capability) &&
+ x86_vmx_flags[i] != NULL)
+ seq_printf(m, " %s", x86_vmx_flags[i]);
+ }
+ }
+#endif
+
seq_puts(m, "\nbugs\t\t:");
for (i = 0; i < 32*NBUGINTS; i++) {
unsigned int bug_bit = 32*NCAPINTS + i;
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index e49b77283924..181c992f448c 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -57,6 +57,7 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc)
}
DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
+DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key);
/**
* struct mon_evt - Entry in the event list of a resource
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 397206f23d14..773124b0e18a 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -514,7 +514,7 @@ void mbm_handle_overflow(struct work_struct *work)
mutex_lock(&rdtgroup_mutex);
- if (!static_branch_likely(&rdt_enable_key))
+ if (!static_branch_likely(&rdt_mon_enable_key))
goto out_unlock;
d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]);
@@ -543,7 +543,7 @@ void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms)
unsigned long delay = msecs_to_jiffies(delay_ms);
int cpu;
- if (!static_branch_likely(&rdt_enable_key))
+ if (!static_branch_likely(&rdt_mon_enable_key))
return;
cpu = cpumask_any(&dom->cpu_mask);
dom->mbm_work_cpu = cpu;
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index dac7209a0708..064e9ef44cd6 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -532,11 +532,15 @@ static void move_myself(struct callback_head *head)
kfree(rdtgrp);
}
+ if (unlikely(current->flags & PF_EXITING))
+ goto out;
+
preempt_disable();
/* update PQR_ASSOC MSR to make resource group go into effect */
resctrl_sched_in();
preempt_enable();
+out:
kfree(callback);
}
@@ -725,6 +729,92 @@ static int rdtgroup_tasks_show(struct kernfs_open_file *of,
return ret;
}
+#ifdef CONFIG_PROC_CPU_RESCTRL
+
+/*
+ * A task can only be part of one resctrl control group and of one monitor
+ * group which is associated to that control group.
+ *
+ * 1) res:
+ * mon:
+ *
+ * resctrl is not available.
+ *
+ * 2) res:/
+ * mon:
+ *
+ * Task is part of the root resctrl control group, and it is not associated
+ * to any monitor group.
+ *
+ * 3) res:/
+ * mon:mon0
+ *
+ * Task is part of the root resctrl control group and monitor group mon0.
+ *
+ * 4) res:group0
+ * mon:
+ *
+ * Task is part of resctrl control group group0, and it is not associated
+ * to any monitor group.
+ *
+ * 5) res:group0
+ * mon:mon1
+ *
+ * Task is part of resctrl control group group0 and monitor group mon1.
+ */
+int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *tsk)
+{
+ struct rdtgroup *rdtg;
+ int ret = 0;
+
+ mutex_lock(&rdtgroup_mutex);
+
+ /* Return empty if resctrl has not been mounted. */
+ if (!static_branch_unlikely(&rdt_enable_key)) {
+ seq_puts(s, "res:\nmon:\n");
+ goto unlock;
+ }
+
+ list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) {
+ struct rdtgroup *crg;
+
+ /*
+ * Task information is only relevant for shareable
+ * and exclusive groups.
+ */
+ if (rdtg->mode != RDT_MODE_SHAREABLE &&
+ rdtg->mode != RDT_MODE_EXCLUSIVE)
+ continue;
+
+ if (rdtg->closid != tsk->closid)
+ continue;
+
+ seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "",
+ rdtg->kn->name);
+ seq_puts(s, "mon:");
+ list_for_each_entry(crg, &rdtg->mon.crdtgrp_list,
+ mon.crdtgrp_list) {
+ if (tsk->rmid != crg->mon.rmid)
+ continue;
+ seq_printf(s, "%s", crg->kn->name);
+ break;
+ }
+ seq_putc(s, '\n');
+ goto unlock;
+ }
+ /*
+ * The above search should succeed. Otherwise return
+ * with an error.
+ */
+ ret = -ENOENT;
+unlock:
+ mutex_unlock(&rdtgroup_mutex);
+
+ return ret;
+}
+#endif
+
static int rdt_last_cmd_status_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
@@ -1970,7 +2060,7 @@ static int rdt_get_tree(struct fs_context *fc)
if (rdt_mon_capable) {
ret = mongroup_create_dir(rdtgroup_default.kn,
- NULL, "mon_groups",
+ &rdtgroup_default, "mon_groups",
&kn_mongrp);
if (ret < 0)
goto out_info;
@@ -2037,25 +2127,20 @@ enum rdt_param {
nr__rdt_params
};
-static const struct fs_parameter_spec rdt_param_specs[] = {
+static const struct fs_parameter_spec rdt_fs_parameters[] = {
fsparam_flag("cdp", Opt_cdp),
fsparam_flag("cdpl2", Opt_cdpl2),
fsparam_flag("mba_MBps", Opt_mba_mbps),
{}
};
-static const struct fs_parameter_description rdt_fs_parameters = {
- .name = "rdt",
- .specs = rdt_param_specs,
-};
-
static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct rdt_fs_context *ctx = rdt_fc2context(fc);
struct fs_parse_result result;
int opt;
- opt = fs_parse(fc, &rdt_fs_parameters, param, &result);
+ opt = fs_parse(fc, rdt_fs_parameters, param, &result);
if (opt < 0)
return opt;
@@ -2205,7 +2290,11 @@ static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) {
free_rmid(sentry->mon.rmid);
list_del(&sentry->mon.crdtgrp_list);
- kfree(sentry);
+
+ if (atomic_read(&sentry->waitcount) != 0)
+ sentry->flags = RDT_DELETED;
+ else
+ kfree(sentry);
}
}
@@ -2243,7 +2332,11 @@ static void rmdir_all_sub(void)
kernfs_remove(rdtgrp->kn);
list_del(&rdtgrp->rdtgroup_list);
- kfree(rdtgrp);
+
+ if (atomic_read(&rdtgrp->waitcount) != 0)
+ rdtgrp->flags = RDT_DELETED;
+ else
+ kfree(rdtgrp);
}
/* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
update_closid_rmid(cpu_online_mask, &rdtgroup_default);
@@ -2280,7 +2373,7 @@ static void rdt_kill_sb(struct super_block *sb)
static struct file_system_type rdt_fs_type = {
.name = "resctrl",
.init_fs_context = rdt_init_fs_context,
- .parameters = &rdt_fs_parameters,
+ .parameters = rdt_fs_parameters,
.kill_sb = rdt_kill_sb,
};
@@ -2446,7 +2539,7 @@ static int mkdir_mondata_all(struct kernfs_node *parent_kn,
/*
* Create the mon_data directory first.
*/
- ret = mongroup_create_dir(parent_kn, NULL, "mon_data", &kn);
+ ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn);
if (ret)
return ret;
@@ -2636,7 +2729,6 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
}
static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
- struct kernfs_node *prgrp_kn,
const char *name, umode_t mode,
enum rdt_group_type rtype, struct rdtgroup **r)
{
@@ -2645,7 +2737,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
uint files = 0;
int ret;
- prdtgrp = rdtgroup_kn_lock_live(prgrp_kn);
+ prdtgrp = rdtgroup_kn_lock_live(parent_kn);
if (!prdtgrp) {
ret = -ENODEV;
goto out_unlock;
@@ -2718,7 +2810,7 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
kernfs_activate(kn);
/*
- * The caller unlocks the prgrp_kn upon success.
+ * The caller unlocks the parent_kn upon success.
*/
return 0;
@@ -2729,7 +2821,7 @@ out_destroy:
out_free_rgrp:
kfree(rdtgrp);
out_unlock:
- rdtgroup_kn_unlock(prgrp_kn);
+ rdtgroup_kn_unlock(parent_kn);
return ret;
}
@@ -2746,15 +2838,12 @@ static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
* to monitor a subset of tasks and cpus in its parent ctrl_mon group.
*/
static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
- struct kernfs_node *prgrp_kn,
- const char *name,
- umode_t mode)
+ const char *name, umode_t mode)
{
struct rdtgroup *rdtgrp, *prgrp;
int ret;
- ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTMON_GROUP,
- &rdtgrp);
+ ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &rdtgrp);
if (ret)
return ret;
@@ -2767,7 +2856,7 @@ static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
*/
list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list);
- rdtgroup_kn_unlock(prgrp_kn);
+ rdtgroup_kn_unlock(parent_kn);
return ret;
}
@@ -2776,7 +2865,6 @@ static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
* to allocate and monitor resources.
*/
static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
- struct kernfs_node *prgrp_kn,
const char *name, umode_t mode)
{
struct rdtgroup *rdtgrp;
@@ -2784,8 +2872,7 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
u32 closid;
int ret;
- ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTCTRL_GROUP,
- &rdtgrp);
+ ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp);
if (ret)
return ret;
@@ -2810,7 +2897,7 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
* Create an empty mon_groups directory to hold the subset
* of tasks and cpus to monitor.
*/
- ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL);
+ ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL);
if (ret) {
rdt_last_cmd_puts("kernfs subdir error\n");
goto out_del_list;
@@ -2826,7 +2913,7 @@ out_id_free:
out_common_fail:
mkdir_rdt_prepare_clean(rdtgrp);
out_unlock:
- rdtgroup_kn_unlock(prgrp_kn);
+ rdtgroup_kn_unlock(parent_kn);
return ret;
}
@@ -2859,14 +2946,14 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
* subdirectory
*/
if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn)
- return rdtgroup_mkdir_ctrl_mon(parent_kn, parent_kn, name, mode);
+ return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode);
/*
* If RDT monitoring is supported and the parent directory is a valid
* "mon_groups" directory, add a monitoring subdirectory.
*/
if (rdt_mon_capable && is_mon_groups(parent_kn, name))
- return rdtgroup_mkdir_mon(parent_kn, parent_kn->parent, name, mode);
+ return rdtgroup_mkdir_mon(parent_kn, name, mode);
return -EPERM;
}
@@ -2952,13 +3039,13 @@ static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
closid_free(rdtgrp->closid);
free_rmid(rdtgrp->mon.rmid);
+ rdtgroup_ctrl_remove(kn, rdtgrp);
+
/*
* Free all the child monitor group rmids.
*/
free_all_child_rdtgrp(rdtgrp);
- rdtgroup_ctrl_remove(kn, rdtgrp);
-
return 0;
}
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index adf9b71386ef..62b137c3c97a 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -4,7 +4,7 @@
*/
#include <linux/cpu.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include <asm/apic.h>
#include <asm/processor.h>
diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c
index ee48c3fc8a65..d3a0791bc052 100644
--- a/arch/x86/kernel/cpu/topology.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -7,7 +7,7 @@
#include <linux/cpu.h>
#include <asm/apic.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include <asm/processor.h>
#include "cpu.h"
diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c
index 3e20d322bc98..e2ad30e474f8 100644
--- a/arch/x86/kernel/cpu/tsx.c
+++ b/arch/x86/kernel/cpu/tsx.c
@@ -14,6 +14,9 @@
#include "cpu.h"
+#undef pr_fmt
+#define pr_fmt(fmt) "tsx: " fmt
+
enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED;
void tsx_disable(void)
@@ -99,7 +102,7 @@ void __init tsx_init(void)
tsx_ctrl_state = x86_get_tsx_auto_mode();
} else {
tsx_ctrl_state = TSX_CTRL_DISABLE;
- pr_err("tsx: invalid option, defaulting to off\n");
+ pr_err("invalid option, defaulting to off\n");
}
} else {
/* tsx= not provided */
@@ -115,11 +118,12 @@ void __init tsx_init(void)
tsx_disable();
/*
- * tsx_disable() will change the state of the
- * RTM CPUID bit. Clear it here since it is now
- * expected to be not set.
+ * tsx_disable() will change the state of the RTM and HLE CPUID
+ * bits. Clear them here since they are now expected to be not
+ * set.
*/
setup_clear_cpu_cap(X86_FEATURE_RTM);
+ setup_clear_cpu_cap(X86_FEATURE_HLE);
} else if (tsx_ctrl_state == TSX_CTRL_ENABLE) {
/*
@@ -131,10 +135,10 @@ void __init tsx_init(void)
tsx_enable();
/*
- * tsx_enable() will change the state of the
- * RTM CPUID bit. Force it here since it is now
- * expected to be set.
+ * tsx_enable() will change the state of the RTM and HLE CPUID
+ * bits. Force them here since they are now expected to be set.
*/
setup_force_cpu_cap(X86_FEATURE_RTM);
+ setup_force_cpu_cap(X86_FEATURE_HLE);
}
}
diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c
index 8e6f2f4b4afe..df1358ba622b 100644
--- a/arch/x86/kernel/cpu/zhaoxin.c
+++ b/arch/x86/kernel/cpu/zhaoxin.c
@@ -16,13 +16,6 @@
#define RNG_ENABLED (1 << 3)
#define RNG_ENABLE (1 << 8) /* MSR_ZHAOXIN_RNG */
-#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
-#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000
-#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000
-#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
-#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
-#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
-
static void init_zhaoxin_cap(struct cpuinfo_x86 *c)
{
u32 lo, hi;
@@ -58,8 +51,6 @@ static void init_zhaoxin_cap(struct cpuinfo_x86 *c)
if (c->x86 >= 0x6)
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
-
- cpu_detect_cache_sizes(c);
}
static void early_init_zhaoxin(struct cpuinfo_x86 *c)
@@ -89,31 +80,6 @@ static void early_init_zhaoxin(struct cpuinfo_x86 *c)
}
-static void zhaoxin_detect_vmx_virtcap(struct cpuinfo_x86 *c)
-{
- u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
-
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
- msr_ctl = vmx_msr_high | vmx_msr_low;
-
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
- set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
- set_cpu_cap(c, X86_FEATURE_VNMI);
- if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
- vmx_msr_low, vmx_msr_high);
- msr_ctl2 = vmx_msr_high | vmx_msr_low;
- if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
- (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
- set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
- if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT)
- set_cpu_cap(c, X86_FEATURE_EPT);
- if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
- set_cpu_cap(c, X86_FEATURE_VPID);
- }
-}
-
static void init_zhaoxin(struct cpuinfo_x86 *c)
{
early_init_zhaoxin(c);
@@ -141,8 +107,7 @@ static void init_zhaoxin(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
#endif
- if (cpu_has(c, X86_FEATURE_VMX))
- zhaoxin_detect_vmx_virtcap(c);
+ init_ia32_feat_ctl(c);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 00fc55ac7ffa..fd87b59452a3 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -370,7 +370,7 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
/* Add crashk_low_res region */
if (crashk_low_res.end) {
ei.addr = crashk_low_res.start;
- ei.size = crashk_low_res.end - crashk_low_res.start + 1;
+ ei.size = resource_size(&crashk_low_res);
ei.type = E820_TYPE_RAM;
add_e820_entry(params, &ei);
}
diff --git a/arch/x86/kernel/crash_core_32.c b/arch/x86/kernel/crash_core_32.c
new file mode 100644
index 000000000000..c0159a7bca6d
--- /dev/null
+++ b/arch/x86/kernel/crash_core_32.c
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/crash_core.h>
+
+#include <asm/pgtable.h>
+#include <asm/setup.h>
+
+void arch_crash_save_vmcoreinfo(void)
+{
+#ifdef CONFIG_NUMA
+ VMCOREINFO_SYMBOL(node_data);
+ VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
+#endif
+#ifdef CONFIG_X86_PAE
+ VMCOREINFO_CONFIG(X86_PAE);
+#endif
+}
diff --git a/arch/x86/kernel/crash_core_64.c b/arch/x86/kernel/crash_core_64.c
new file mode 100644
index 000000000000..845a57eb4eb7
--- /dev/null
+++ b/arch/x86/kernel/crash_core_64.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/crash_core.h>
+
+#include <asm/pgtable.h>
+#include <asm/setup.h>
+
+void arch_crash_save_vmcoreinfo(void)
+{
+ u64 sme_mask = sme_me_mask;
+
+ VMCOREINFO_NUMBER(phys_base);
+ VMCOREINFO_SYMBOL(init_top_pgt);
+ vmcoreinfo_append_str("NUMBER(pgtable_l5_enabled)=%d\n",
+ pgtable_l5_enabled());
+
+#ifdef CONFIG_NUMA
+ VMCOREINFO_SYMBOL(node_data);
+ VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
+#endif
+ vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset());
+ VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
+ VMCOREINFO_NUMBER(sme_mask);
+}
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index e07424e19274..ae64ec7f752f 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -365,7 +365,7 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr)
}
NOKPROBE_SYMBOL(oops_end);
-int __die(const char *str, struct pt_regs *regs, long err)
+static void __die_header(const char *str, struct pt_regs *regs, long err)
{
const char *pr = "";
@@ -384,7 +384,11 @@ int __die(const char *str, struct pt_regs *regs, long err)
IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "",
IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ?
(boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : "");
+}
+NOKPROBE_SYMBOL(__die_header);
+static int __die_body(const char *str, struct pt_regs *regs, long err)
+{
show_regs(regs);
print_modules();
@@ -394,6 +398,13 @@ int __die(const char *str, struct pt_regs *regs, long err)
return 0;
}
+NOKPROBE_SYMBOL(__die_body);
+
+int __die(const char *str, struct pt_regs *regs, long err)
+{
+ __die_header(str, regs, err);
+ return __die_body(str, regs, err);
+}
NOKPROBE_SYMBOL(__die);
/*
@@ -410,6 +421,19 @@ void die(const char *str, struct pt_regs *regs, long err)
oops_end(flags, regs, sig);
}
+void die_addr(const char *str, struct pt_regs *regs, long err, long gp_addr)
+{
+ unsigned long flags = oops_begin();
+ int sig = SIGSEGV;
+
+ __die_header(str, regs, err);
+ if (gp_addr)
+ kasan_non_canonical_hook(gp_addr);
+ if (__die_body(str, regs, err))
+ sig = 0;
+ oops_end(flags, regs, sig);
+}
+
void show_regs(struct pt_regs *regs)
{
show_regs_print_info(KERN_DEFAULT);
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 0071b794ed19..400a05e1c1c5 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -352,6 +352,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
fpregs_unlock();
return 0;
}
+ fpregs_deactivate(fpu);
fpregs_unlock();
}
@@ -403,6 +404,8 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
}
if (!ret)
fpregs_mark_activate();
+ else
+ fpregs_deactivate(fpu);
fpregs_unlock();
err_out:
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index fa31470bbf24..a1806598aaa4 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -107,23 +107,20 @@ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
}
EXPORT_SYMBOL_GPL(cpu_has_xfeatures);
-static int xfeature_is_supervisor(int xfeature_nr)
+static bool xfeature_is_supervisor(int xfeature_nr)
{
/*
- * We currently do not support supervisor states, but if
- * we did, we could find out like this.
- *
- * SDM says: If state component 'i' is a user state component,
- * ECX[0] return 0; if state component i is a supervisor
- * state component, ECX[0] returns 1.
+ * Extended State Enumeration Sub-leaves (EAX = 0DH, ECX = n, n > 1)
+ * returns ECX[0] set to (1) for a supervisor state, and cleared (0)
+ * for a user state.
*/
u32 eax, ebx, ecx, edx;
cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx);
- return !!(ecx & 1);
+ return ecx & 1;
}
-static int xfeature_is_user(int xfeature_nr)
+static bool xfeature_is_user(int xfeature_nr)
{
return !xfeature_is_supervisor(xfeature_nr);
}
@@ -419,7 +416,8 @@ static void __init setup_init_fpu_buf(void)
print_xstate_features();
if (boot_cpu_has(X86_FEATURE_XSAVES))
- init_fpstate.xsave.header.xcomp_bv = (u64)1 << 63 | xfeatures_mask;
+ init_fpstate.xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT |
+ xfeatures_mask;
/*
* Init all the features state with header.xfeatures being 0x0
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 024c3053dbba..37a0aeaf89e7 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -23,6 +23,7 @@
#include <linux/list.h>
#include <linux/module.h>
#include <linux/memory.h>
+#include <linux/vmalloc.h>
#include <trace/syscall.h>
@@ -34,6 +35,8 @@
#ifdef CONFIG_DYNAMIC_FTRACE
+static int ftrace_poke_late = 0;
+
int ftrace_arch_code_modify_prepare(void)
__acquires(&text_mutex)
{
@@ -43,84 +46,37 @@ int ftrace_arch_code_modify_prepare(void)
* ftrace has it set to "read/write".
*/
mutex_lock(&text_mutex);
- set_kernel_text_rw();
- set_all_modules_text_rw();
+ ftrace_poke_late = 1;
return 0;
}
int ftrace_arch_code_modify_post_process(void)
__releases(&text_mutex)
{
- set_all_modules_text_ro();
- set_kernel_text_ro();
+ /*
+ * ftrace_make_{call,nop}() may be called during
+ * module load, and we need to finish the text_poke_queue()
+ * that they do, here.
+ */
+ text_poke_finish();
+ ftrace_poke_late = 0;
mutex_unlock(&text_mutex);
return 0;
}
-union ftrace_code_union {
- char code[MCOUNT_INSN_SIZE];
- struct {
- unsigned char op;
- int offset;
- } __attribute__((packed));
-};
-
-static int ftrace_calc_offset(long ip, long addr)
-{
- return (int)(addr - ip);
-}
-
-static unsigned char *
-ftrace_text_replace(unsigned char op, unsigned long ip, unsigned long addr)
+static const char *ftrace_nop_replace(void)
{
- static union ftrace_code_union calc;
-
- calc.op = op;
- calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr);
-
- return calc.code;
-}
-
-static unsigned char *
-ftrace_call_replace(unsigned long ip, unsigned long addr)
-{
- return ftrace_text_replace(0xe8, ip, addr);
-}
-
-static inline int
-within(unsigned long addr, unsigned long start, unsigned long end)
-{
- return addr >= start && addr < end;
+ return ideal_nops[NOP_ATOMIC5];
}
-static unsigned long text_ip_addr(unsigned long ip)
+static const char *ftrace_call_replace(unsigned long ip, unsigned long addr)
{
- /*
- * On x86_64, kernel text mappings are mapped read-only, so we use
- * the kernel identity mapping instead of the kernel text mapping
- * to modify the kernel text.
- *
- * For 32bit kernels, these mappings are same and we can use
- * kernel identity mapping to modify code.
- */
- if (within(ip, (unsigned long)_text, (unsigned long)_etext))
- ip = (unsigned long)__va(__pa_symbol(ip));
-
- return ip;
+ return text_gen_insn(CALL_INSN_OPCODE, (void *)ip, (void *)addr);
}
-static const unsigned char *ftrace_nop_replace(void)
+static int ftrace_verify_code(unsigned long ip, const char *old_code)
{
- return ideal_nops[NOP_ATOMIC5];
-}
-
-static int
-ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,
- unsigned const char *new_code)
-{
- unsigned char replaced[MCOUNT_INSN_SIZE];
-
- ftrace_expected = old_code;
+ char cur_code[MCOUNT_INSN_SIZE];
/*
* Note:
@@ -129,31 +85,46 @@ ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,
* Carefully read and modify the code with probe_kernel_*(), and make
* sure what we read is what we expected it to be before modifying it.
*/
-
/* read the text we want to modify */
- if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
+ if (probe_kernel_read(cur_code, (void *)ip, MCOUNT_INSN_SIZE)) {
+ WARN_ON(1);
return -EFAULT;
+ }
/* Make sure it is what we expect it to be */
- if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
+ if (memcmp(cur_code, old_code, MCOUNT_INSN_SIZE) != 0) {
+ WARN_ON(1);
return -EINVAL;
+ }
- ip = text_ip_addr(ip);
-
- /* replace the text with the new text */
- if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE))
- return -EPERM;
+ return 0;
+}
- sync_core();
+/*
+ * Marked __ref because it calls text_poke_early() which is .init.text. That is
+ * ok because that call will happen early, during boot, when .init sections are
+ * still present.
+ */
+static int __ref
+ftrace_modify_code_direct(unsigned long ip, const char *old_code,
+ const char *new_code)
+{
+ int ret = ftrace_verify_code(ip, old_code);
+ if (ret)
+ return ret;
+ /* replace the text with the new text */
+ if (ftrace_poke_late)
+ text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL);
+ else
+ text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE);
return 0;
}
-int ftrace_make_nop(struct module *mod,
- struct dyn_ftrace *rec, unsigned long addr)
+int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr)
{
- unsigned const char *new, *old;
unsigned long ip = rec->ip;
+ const char *new, *old;
old = ftrace_call_replace(ip, addr);
new = ftrace_nop_replace();
@@ -167,19 +138,20 @@ int ftrace_make_nop(struct module *mod,
* just modify the code directly.
*/
if (addr == MCOUNT_ADDR)
- return ftrace_modify_code_direct(rec->ip, old, new);
+ return ftrace_modify_code_direct(ip, old, new);
- ftrace_expected = NULL;
-
- /* Normal cases use add_brk_on_nop */
+ /*
+ * x86 overrides ftrace_replace_code -- this function will never be used
+ * in this case.
+ */
WARN_ONCE(1, "invalid use of ftrace_make_nop");
return -EINVAL;
}
int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
{
- unsigned const char *new, *old;
unsigned long ip = rec->ip;
+ const char *new, *old;
old = ftrace_nop_replace();
new = ftrace_call_replace(ip, addr);
@@ -189,43 +161,6 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
}
/*
- * The modifying_ftrace_code is used to tell the breakpoint
- * handler to call ftrace_int3_handler(). If it fails to
- * call this handler for a breakpoint added by ftrace, then
- * the kernel may crash.
- *
- * As atomic_writes on x86 do not need a barrier, we do not
- * need to add smp_mb()s for this to work. It is also considered
- * that we can not read the modifying_ftrace_code before
- * executing the breakpoint. That would be quite remarkable if
- * it could do that. Here's the flow that is required:
- *
- * CPU-0 CPU-1
- *
- * atomic_inc(mfc);
- * write int3s
- * <trap-int3> // implicit (r)mb
- * if (atomic_read(mfc))
- * call ftrace_int3_handler()
- *
- * Then when we are finished:
- *
- * atomic_dec(mfc);
- *
- * If we hit a breakpoint that was not set by ftrace, it does not
- * matter if ftrace_int3_handler() is called or not. It will
- * simply be ignored. But it is crucial that a ftrace nop/caller
- * breakpoint is handled. No other user should ever place a
- * breakpoint on an ftrace nop/caller location. It must only
- * be done by this code.
- */
-atomic_t modifying_ftrace_code __read_mostly;
-
-static int
-ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
- unsigned const char *new_code);
-
-/*
* Should never be called:
* As it is only called by __ftrace_replace_code() which is called by
* ftrace_replace_code() that x86 overrides, and by ftrace_update_code()
@@ -237,452 +172,84 @@ int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
unsigned long addr)
{
WARN_ON(1);
- ftrace_expected = NULL;
return -EINVAL;
}
-static unsigned long ftrace_update_func;
-static unsigned long ftrace_update_func_call;
-
-static int update_ftrace_func(unsigned long ip, void *new)
-{
- unsigned char old[MCOUNT_INSN_SIZE];
- int ret;
-
- memcpy(old, (void *)ip, MCOUNT_INSN_SIZE);
-
- ftrace_update_func = ip;
- /* Make sure the breakpoints see the ftrace_update_func update */
- smp_wmb();
-
- /* See comment above by declaration of modifying_ftrace_code */
- atomic_inc(&modifying_ftrace_code);
-
- ret = ftrace_modify_code(ip, old, new);
-
- atomic_dec(&modifying_ftrace_code);
-
- return ret;
-}
-
int ftrace_update_ftrace_func(ftrace_func_t func)
{
- unsigned long ip = (unsigned long)(&ftrace_call);
- unsigned char *new;
- int ret;
-
- ftrace_update_func_call = (unsigned long)func;
-
- new = ftrace_call_replace(ip, (unsigned long)func);
- ret = update_ftrace_func(ip, new);
-
- /* Also update the regs callback function */
- if (!ret) {
- ip = (unsigned long)(&ftrace_regs_call);
- new = ftrace_call_replace(ip, (unsigned long)func);
- ret = update_ftrace_func(ip, new);
- }
-
- return ret;
-}
-
-static nokprobe_inline int is_ftrace_caller(unsigned long ip)
-{
- if (ip == ftrace_update_func)
- return 1;
-
- return 0;
-}
-
-/*
- * A breakpoint was added to the code address we are about to
- * modify, and this is the handle that will just skip over it.
- * We are either changing a nop into a trace call, or a trace
- * call to a nop. While the change is taking place, we treat
- * it just like it was a nop.
- */
-int ftrace_int3_handler(struct pt_regs *regs)
-{
unsigned long ip;
+ const char *new;
- if (WARN_ON_ONCE(!regs))
- return 0;
-
- ip = regs->ip - INT3_INSN_SIZE;
-
- if (ftrace_location(ip)) {
- int3_emulate_call(regs, (unsigned long)ftrace_regs_caller);
- return 1;
- } else if (is_ftrace_caller(ip)) {
- if (!ftrace_update_func_call) {
- int3_emulate_jmp(regs, ip + CALL_INSN_SIZE);
- return 1;
- }
- int3_emulate_call(regs, ftrace_update_func_call);
- return 1;
- }
-
- return 0;
-}
-NOKPROBE_SYMBOL(ftrace_int3_handler);
-
-static int ftrace_write(unsigned long ip, const char *val, int size)
-{
- ip = text_ip_addr(ip);
-
- if (probe_kernel_write((void *)ip, val, size))
- return -EPERM;
-
- return 0;
-}
-
-static int add_break(unsigned long ip, const char *old)
-{
- unsigned char replaced[MCOUNT_INSN_SIZE];
- unsigned char brk = BREAKPOINT_INSTRUCTION;
-
- if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
- return -EFAULT;
-
- ftrace_expected = old;
-
- /* Make sure it is what we expect it to be */
- if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0)
- return -EINVAL;
-
- return ftrace_write(ip, &brk, 1);
-}
-
-static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr)
-{
- unsigned const char *old;
- unsigned long ip = rec->ip;
-
- old = ftrace_call_replace(ip, addr);
-
- return add_break(rec->ip, old);
-}
-
-
-static int add_brk_on_nop(struct dyn_ftrace *rec)
-{
- unsigned const char *old;
-
- old = ftrace_nop_replace();
-
- return add_break(rec->ip, old);
-}
-
-static int add_breakpoints(struct dyn_ftrace *rec, bool enable)
-{
- unsigned long ftrace_addr;
- int ret;
-
- ftrace_addr = ftrace_get_addr_curr(rec);
-
- ret = ftrace_test_record(rec, enable);
-
- switch (ret) {
- case FTRACE_UPDATE_IGNORE:
- return 0;
-
- case FTRACE_UPDATE_MAKE_CALL:
- /* converting nop to call */
- return add_brk_on_nop(rec);
-
- case FTRACE_UPDATE_MODIFY_CALL:
- case FTRACE_UPDATE_MAKE_NOP:
- /* converting a call to a nop */
- return add_brk_on_call(rec, ftrace_addr);
- }
- return 0;
-}
-
-/*
- * On error, we need to remove breakpoints. This needs to
- * be done caefully. If the address does not currently have a
- * breakpoint, we know we are done. Otherwise, we look at the
- * remaining 4 bytes of the instruction. If it matches a nop
- * we replace the breakpoint with the nop. Otherwise we replace
- * it with the call instruction.
- */
-static int remove_breakpoint(struct dyn_ftrace *rec)
-{
- unsigned char ins[MCOUNT_INSN_SIZE];
- unsigned char brk = BREAKPOINT_INSTRUCTION;
- const unsigned char *nop;
- unsigned long ftrace_addr;
- unsigned long ip = rec->ip;
-
- /* If we fail the read, just give up */
- if (probe_kernel_read(ins, (void *)ip, MCOUNT_INSN_SIZE))
- return -EFAULT;
-
- /* If this does not have a breakpoint, we are done */
- if (ins[0] != brk)
- return 0;
-
- nop = ftrace_nop_replace();
-
- /*
- * If the last 4 bytes of the instruction do not match
- * a nop, then we assume that this is a call to ftrace_addr.
- */
- if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) {
- /*
- * For extra paranoidism, we check if the breakpoint is on
- * a call that would actually jump to the ftrace_addr.
- * If not, don't touch the breakpoint, we make just create
- * a disaster.
- */
- ftrace_addr = ftrace_get_addr_new(rec);
- nop = ftrace_call_replace(ip, ftrace_addr);
-
- if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) == 0)
- goto update;
-
- /* Check both ftrace_addr and ftrace_old_addr */
- ftrace_addr = ftrace_get_addr_curr(rec);
- nop = ftrace_call_replace(ip, ftrace_addr);
-
- ftrace_expected = nop;
-
- if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0)
- return -EINVAL;
- }
-
- update:
- return ftrace_write(ip, nop, 1);
-}
-
-static int add_update_code(unsigned long ip, unsigned const char *new)
-{
- /* skip breakpoint */
- ip++;
- new++;
- return ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1);
-}
-
-static int add_update_call(struct dyn_ftrace *rec, unsigned long addr)
-{
- unsigned long ip = rec->ip;
- unsigned const char *new;
-
- new = ftrace_call_replace(ip, addr);
- return add_update_code(ip, new);
-}
-
-static int add_update_nop(struct dyn_ftrace *rec)
-{
- unsigned long ip = rec->ip;
- unsigned const char *new;
-
- new = ftrace_nop_replace();
- return add_update_code(ip, new);
-}
-
-static int add_update(struct dyn_ftrace *rec, bool enable)
-{
- unsigned long ftrace_addr;
- int ret;
-
- ret = ftrace_test_record(rec, enable);
-
- ftrace_addr = ftrace_get_addr_new(rec);
-
- switch (ret) {
- case FTRACE_UPDATE_IGNORE:
- return 0;
-
- case FTRACE_UPDATE_MODIFY_CALL:
- case FTRACE_UPDATE_MAKE_CALL:
- /* converting nop to call */
- return add_update_call(rec, ftrace_addr);
-
- case FTRACE_UPDATE_MAKE_NOP:
- /* converting a call to a nop */
- return add_update_nop(rec);
- }
-
- return 0;
-}
-
-static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr)
-{
- unsigned long ip = rec->ip;
- unsigned const char *new;
-
- new = ftrace_call_replace(ip, addr);
-
- return ftrace_write(ip, new, 1);
-}
-
-static int finish_update_nop(struct dyn_ftrace *rec)
-{
- unsigned long ip = rec->ip;
- unsigned const char *new;
-
- new = ftrace_nop_replace();
-
- return ftrace_write(ip, new, 1);
-}
-
-static int finish_update(struct dyn_ftrace *rec, bool enable)
-{
- unsigned long ftrace_addr;
- int ret;
-
- ret = ftrace_update_record(rec, enable);
-
- ftrace_addr = ftrace_get_addr_new(rec);
-
- switch (ret) {
- case FTRACE_UPDATE_IGNORE:
- return 0;
-
- case FTRACE_UPDATE_MODIFY_CALL:
- case FTRACE_UPDATE_MAKE_CALL:
- /* converting nop to call */
- return finish_update_call(rec, ftrace_addr);
+ ip = (unsigned long)(&ftrace_call);
+ new = ftrace_call_replace(ip, (unsigned long)func);
+ text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
- case FTRACE_UPDATE_MAKE_NOP:
- /* converting a call to a nop */
- return finish_update_nop(rec);
- }
+ ip = (unsigned long)(&ftrace_regs_call);
+ new = ftrace_call_replace(ip, (unsigned long)func);
+ text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
return 0;
}
-static void do_sync_core(void *data)
-{
- sync_core();
-}
-
-static void run_sync(void)
-{
- int enable_irqs;
-
- /* No need to sync if there's only one CPU */
- if (num_online_cpus() == 1)
- return;
-
- enable_irqs = irqs_disabled();
-
- /* We may be called with interrupts disabled (on bootup). */
- if (enable_irqs)
- local_irq_enable();
- on_each_cpu(do_sync_core, NULL, 1);
- if (enable_irqs)
- local_irq_disable();
-}
-
void ftrace_replace_code(int enable)
{
struct ftrace_rec_iter *iter;
struct dyn_ftrace *rec;
- const char *report = "adding breakpoints";
- int count = 0;
+ const char *new, *old;
int ret;
for_ftrace_rec_iter(iter) {
rec = ftrace_rec_iter_record(iter);
- ret = add_breakpoints(rec, enable);
- if (ret)
- goto remove_breakpoints;
- count++;
- }
-
- run_sync();
+ switch (ftrace_test_record(rec, enable)) {
+ case FTRACE_UPDATE_IGNORE:
+ default:
+ continue;
- report = "updating code";
- count = 0;
+ case FTRACE_UPDATE_MAKE_CALL:
+ old = ftrace_nop_replace();
+ break;
- for_ftrace_rec_iter(iter) {
- rec = ftrace_rec_iter_record(iter);
+ case FTRACE_UPDATE_MODIFY_CALL:
+ case FTRACE_UPDATE_MAKE_NOP:
+ old = ftrace_call_replace(rec->ip, ftrace_get_addr_curr(rec));
+ break;
+ }
- ret = add_update(rec, enable);
- if (ret)
- goto remove_breakpoints;
- count++;
+ ret = ftrace_verify_code(rec->ip, old);
+ if (ret) {
+ ftrace_bug(ret, rec);
+ return;
+ }
}
- run_sync();
-
- report = "removing breakpoints";
- count = 0;
-
for_ftrace_rec_iter(iter) {
rec = ftrace_rec_iter_record(iter);
- ret = finish_update(rec, enable);
- if (ret)
- goto remove_breakpoints;
- count++;
- }
+ switch (ftrace_test_record(rec, enable)) {
+ case FTRACE_UPDATE_IGNORE:
+ default:
+ continue;
- run_sync();
+ case FTRACE_UPDATE_MAKE_CALL:
+ case FTRACE_UPDATE_MODIFY_CALL:
+ new = ftrace_call_replace(rec->ip, ftrace_get_addr_new(rec));
+ break;
- return;
+ case FTRACE_UPDATE_MAKE_NOP:
+ new = ftrace_nop_replace();
+ break;
+ }
- remove_breakpoints:
- pr_warn("Failed on %s (%d):\n", report, count);
- ftrace_bug(ret, rec);
- for_ftrace_rec_iter(iter) {
- rec = ftrace_rec_iter_record(iter);
- /*
- * Breakpoints are handled only when this function is in
- * progress. The system could not work with them.
- */
- if (remove_breakpoint(rec))
- BUG();
+ text_poke_queue((void *)rec->ip, new, MCOUNT_INSN_SIZE, NULL);
+ ftrace_update_record(rec, enable);
}
- run_sync();
-}
-
-static int
-ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
- unsigned const char *new_code)
-{
- int ret;
-
- ret = add_break(ip, old_code);
- if (ret)
- goto out;
-
- run_sync();
-
- ret = add_update_code(ip, new_code);
- if (ret)
- goto fail_update;
-
- run_sync();
-
- ret = ftrace_write(ip, new_code, 1);
- /*
- * The breakpoint is handled only when this function is in progress.
- * The system could not work if we could not remove it.
- */
- BUG_ON(ret);
- out:
- run_sync();
- return ret;
-
- fail_update:
- /* Also here the system could not work with the breakpoint */
- if (ftrace_write(ip, old_code, 1))
- BUG();
- goto out;
+ text_poke_finish();
}
void arch_ftrace_update_code(int command)
{
- /* See comment above by declaration of modifying_ftrace_code */
- atomic_inc(&modifying_ftrace_code);
-
ftrace_modify_all_code(command);
-
- atomic_dec(&modifying_ftrace_code);
}
int __init ftrace_dyn_arch_init(void)
@@ -747,6 +314,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
unsigned long start_offset;
unsigned long end_offset;
unsigned long op_offset;
+ unsigned long call_offset;
unsigned long offset;
unsigned long npages;
unsigned long size;
@@ -763,10 +331,12 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
start_offset = (unsigned long)ftrace_regs_caller;
end_offset = (unsigned long)ftrace_regs_caller_end;
op_offset = (unsigned long)ftrace_regs_caller_op_ptr;
+ call_offset = (unsigned long)ftrace_regs_call;
} else {
start_offset = (unsigned long)ftrace_caller;
end_offset = (unsigned long)ftrace_epilogue;
op_offset = (unsigned long)ftrace_caller_op_ptr;
+ call_offset = (unsigned long)ftrace_call;
}
size = end_offset - start_offset;
@@ -823,16 +393,21 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
/* put in the new offset to the ftrace_ops */
memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE);
+ /* put in the call to the function */
+ mutex_lock(&text_mutex);
+ call_offset -= start_offset;
+ memcpy(trampoline + call_offset,
+ text_gen_insn(CALL_INSN_OPCODE,
+ trampoline + call_offset,
+ ftrace_ops_get_func(ops)), CALL_INSN_SIZE);
+ mutex_unlock(&text_mutex);
+
/* ALLOC_TRAMP flags lets us know we created it */
ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
set_vm_flush_reset_perms(trampoline);
- /*
- * Module allocation needs to be completed by making the page
- * executable. The page is still writable, which is a security hazard,
- * but anyhow ftrace breaks W^X completely.
- */
+ set_memory_ro((unsigned long)trampoline, npages);
set_memory_x((unsigned long)trampoline, npages);
return (unsigned long)trampoline;
fail:
@@ -859,62 +434,54 @@ static unsigned long calc_trampoline_call_offset(bool save_regs)
void arch_ftrace_update_trampoline(struct ftrace_ops *ops)
{
ftrace_func_t func;
- unsigned char *new;
unsigned long offset;
unsigned long ip;
unsigned int size;
- int ret, npages;
+ const char *new;
- if (ops->trampoline) {
- /*
- * The ftrace_ops caller may set up its own trampoline.
- * In such a case, this code must not modify it.
- */
- if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
- return;
- npages = PAGE_ALIGN(ops->trampoline_size) >> PAGE_SHIFT;
- set_memory_rw(ops->trampoline, npages);
- } else {
+ if (!ops->trampoline) {
ops->trampoline = create_trampoline(ops, &size);
if (!ops->trampoline)
return;
ops->trampoline_size = size;
- npages = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ return;
}
+ /*
+ * The ftrace_ops caller may set up its own trampoline.
+ * In such a case, this code must not modify it.
+ */
+ if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
+ return;
+
offset = calc_trampoline_call_offset(ops->flags & FTRACE_OPS_FL_SAVE_REGS);
ip = ops->trampoline + offset;
-
func = ftrace_ops_get_func(ops);
- ftrace_update_func_call = (unsigned long)func;
-
+ mutex_lock(&text_mutex);
/* Do a safe modify in case the trampoline is executing */
new = ftrace_call_replace(ip, (unsigned long)func);
- ret = update_ftrace_func(ip, new);
- set_memory_ro(ops->trampoline, npages);
-
- /* The update should never fail */
- WARN_ON(ret);
+ text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
+ mutex_unlock(&text_mutex);
}
/* Return the address of the function the trampoline calls */
static void *addr_from_call(void *ptr)
{
- union ftrace_code_union calc;
+ union text_poke_insn call;
int ret;
- ret = probe_kernel_read(&calc, ptr, MCOUNT_INSN_SIZE);
+ ret = probe_kernel_read(&call, ptr, CALL_INSN_SIZE);
if (WARN_ON_ONCE(ret < 0))
return NULL;
/* Make sure this is a call */
- if (WARN_ON_ONCE(calc.op != 0xe8)) {
- pr_warn("Expected e8, got %x\n", calc.op);
+ if (WARN_ON_ONCE(call.opcode != CALL_INSN_OPCODE)) {
+ pr_warn("Expected E8, got %x\n", call.opcode);
return NULL;
}
- return ptr + MCOUNT_INSN_SIZE + calc.offset;
+ return ptr + CALL_INSN_SIZE + call.disp;
}
void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
@@ -981,19 +548,18 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops)
#ifdef CONFIG_DYNAMIC_FTRACE
extern void ftrace_graph_call(void);
-static unsigned char *ftrace_jmp_replace(unsigned long ip, unsigned long addr)
+static const char *ftrace_jmp_replace(unsigned long ip, unsigned long addr)
{
- return ftrace_text_replace(0xe9, ip, addr);
+ return text_gen_insn(JMP32_INSN_OPCODE, (void *)ip, (void *)addr);
}
static int ftrace_mod_jmp(unsigned long ip, void *func)
{
- unsigned char *new;
+ const char *new;
- ftrace_update_func_call = 0UL;
new = ftrace_jmp_replace(ip, (unsigned long)func);
-
- return update_ftrace_func(ip, new);
+ text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL);
+ return 0;
}
int ftrace_enable_ftrace_graph_caller(void)
@@ -1019,10 +585,9 @@ int ftrace_disable_ftrace_graph_caller(void)
void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent,
unsigned long frame_pointer)
{
+ unsigned long return_hooker = (unsigned long)&return_to_handler;
unsigned long old;
int faulted;
- unsigned long return_hooker = (unsigned long)
- &return_to_handler;
/*
* When resuming from suspend-to-ram, this function can be indirectly
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index c6f791bc481e..7a50f0b62a70 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -84,7 +84,7 @@ static inline void hpet_writel(unsigned int d, unsigned int a)
static inline void hpet_set_mapping(void)
{
- hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
+ hpet_virt_address = ioremap(hpet_address, HPET_MMAP_SIZE);
}
static inline void hpet_clear_mapping(void)
diff --git a/arch/x86/kernel/ima_arch.c b/arch/x86/kernel/ima_arch.c
index 4d4f5d9faac3..23054909c8dd 100644
--- a/arch/x86/kernel/ima_arch.c
+++ b/arch/x86/kernel/ima_arch.c
@@ -10,8 +10,6 @@ extern struct boot_params boot_params;
static enum efi_secureboot_mode get_sb_mode(void)
{
- efi_char16_t efi_SecureBoot_name[] = L"SecureBoot";
- efi_char16_t efi_SetupMode_name[] = L"SecureBoot";
efi_guid_t efi_variable_guid = EFI_GLOBAL_VARIABLE_GUID;
efi_status_t status;
unsigned long size;
@@ -25,7 +23,7 @@ static enum efi_secureboot_mode get_sb_mode(void)
}
/* Get variable contents into buffer */
- status = efi.get_variable(efi_SecureBoot_name, &efi_variable_guid,
+ status = efi.get_variable(L"SecureBoot", &efi_variable_guid,
NULL, &size, &secboot);
if (status == EFI_NOT_FOUND) {
pr_info("ima: secureboot mode disabled\n");
@@ -38,7 +36,7 @@ static enum efi_secureboot_mode get_sb_mode(void)
}
size = sizeof(setupmode);
- status = efi.get_variable(efi_SetupMode_name, &efi_variable_guid,
+ status = efi.get_variable(L"SetupMode", &efi_variable_guid,
NULL, &size, &setupmode);
if (status != EFI_SUCCESS) /* ignore unknown SetupMode */
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index c1a8b9e71408..9c4498ea0b3c 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -16,15 +16,7 @@
#include <asm/alternative.h>
#include <asm/text-patching.h>
-union jump_code_union {
- char code[JUMP_LABEL_NOP_SIZE];
- struct {
- char jump;
- int offset;
- } __attribute__((packed));
-};
-
-static void bug_at(unsigned char *ip, int line)
+static void bug_at(const void *ip, int line)
{
/*
* The location is not an op that we were expecting.
@@ -35,42 +27,42 @@ static void bug_at(unsigned char *ip, int line)
BUG();
}
-static void __jump_label_set_jump_code(struct jump_entry *entry,
- enum jump_label_type type,
- union jump_code_union *code,
- int init)
+static const void *
+__jump_label_set_jump_code(struct jump_entry *entry, enum jump_label_type type, int init)
{
const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP };
const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5];
- const void *expect;
+ const void *expect, *code;
+ const void *addr, *dest;
int line;
- code->jump = 0xe9;
- code->offset = jump_entry_target(entry) -
- (jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE);
+ addr = (void *)jump_entry_code(entry);
+ dest = (void *)jump_entry_target(entry);
+
+ code = text_gen_insn(JMP32_INSN_OPCODE, addr, dest);
if (init) {
expect = default_nop; line = __LINE__;
} else if (type == JUMP_LABEL_JMP) {
expect = ideal_nop; line = __LINE__;
} else {
- expect = code->code; line = __LINE__;
+ expect = code; line = __LINE__;
}
- if (memcmp((void *)jump_entry_code(entry), expect, JUMP_LABEL_NOP_SIZE))
- bug_at((void *)jump_entry_code(entry), line);
+ if (memcmp(addr, expect, JUMP_LABEL_NOP_SIZE))
+ bug_at(addr, line);
if (type == JUMP_LABEL_NOP)
- memcpy(code, ideal_nop, JUMP_LABEL_NOP_SIZE);
+ code = ideal_nop;
+
+ return code;
}
-static void __ref __jump_label_transform(struct jump_entry *entry,
- enum jump_label_type type,
- int init)
+static void inline __jump_label_transform(struct jump_entry *entry,
+ enum jump_label_type type,
+ int init)
{
- union jump_code_union code;
-
- __jump_label_set_jump_code(entry, type, &code, init);
+ const void *opcode = __jump_label_set_jump_code(entry, type, init);
/*
* As long as only a single processor is running and the code is still
@@ -84,31 +76,33 @@ static void __ref __jump_label_transform(struct jump_entry *entry,
* always nop being the 'currently valid' instruction
*/
if (init || system_state == SYSTEM_BOOTING) {
- text_poke_early((void *)jump_entry_code(entry), &code,
+ text_poke_early((void *)jump_entry_code(entry), opcode,
JUMP_LABEL_NOP_SIZE);
return;
}
- text_poke_bp((void *)jump_entry_code(entry), &code, JUMP_LABEL_NOP_SIZE, NULL);
+ text_poke_bp((void *)jump_entry_code(entry), opcode, JUMP_LABEL_NOP_SIZE, NULL);
}
-void arch_jump_label_transform(struct jump_entry *entry,
- enum jump_label_type type)
+static void __ref jump_label_transform(struct jump_entry *entry,
+ enum jump_label_type type,
+ int init)
{
mutex_lock(&text_mutex);
- __jump_label_transform(entry, type, 0);
+ __jump_label_transform(entry, type, init);
mutex_unlock(&text_mutex);
}
-#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
-static struct text_poke_loc tp_vec[TP_VEC_MAX];
-static int tp_vec_nr;
+void arch_jump_label_transform(struct jump_entry *entry,
+ enum jump_label_type type)
+{
+ jump_label_transform(entry, type, 0);
+}
bool arch_jump_label_transform_queue(struct jump_entry *entry,
enum jump_label_type type)
{
- struct text_poke_loc *tp;
- void *entry_code;
+ const void *opcode;
if (system_state == SYSTEM_BOOTING) {
/*
@@ -118,53 +112,19 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry,
return true;
}
- /*
- * No more space in the vector, tell upper layer to apply
- * the queue before continuing.
- */
- if (tp_vec_nr == TP_VEC_MAX)
- return false;
-
- tp = &tp_vec[tp_vec_nr];
-
- entry_code = (void *)jump_entry_code(entry);
-
- /*
- * The INT3 handler will do a bsearch in the queue, so we need entries
- * to be sorted. We can survive an unsorted list by rejecting the entry,
- * forcing the generic jump_label code to apply the queue. Warning once,
- * to raise the attention to the case of an unsorted entry that is
- * better not happen, because, in the worst case we will perform in the
- * same way as we do without batching - with some more overhead.
- */
- if (tp_vec_nr > 0) {
- int prev = tp_vec_nr - 1;
- struct text_poke_loc *prev_tp = &tp_vec[prev];
-
- if (WARN_ON_ONCE(prev_tp->addr > entry_code))
- return false;
- }
-
- __jump_label_set_jump_code(entry, type,
- (union jump_code_union *)&tp->text, 0);
-
- text_poke_loc_init(tp, entry_code, NULL, JUMP_LABEL_NOP_SIZE, NULL);
-
- tp_vec_nr++;
-
+ mutex_lock(&text_mutex);
+ opcode = __jump_label_set_jump_code(entry, type, 0);
+ text_poke_queue((void *)jump_entry_code(entry),
+ opcode, JUMP_LABEL_NOP_SIZE, NULL);
+ mutex_unlock(&text_mutex);
return true;
}
void arch_jump_label_transform_apply(void)
{
- if (!tp_vec_nr)
- return;
-
mutex_lock(&text_mutex);
- text_poke_bp_batch(tp_vec, tp_vec_nr);
+ text_poke_finish();
mutex_unlock(&text_mutex);
-
- tp_vec_nr = 0;
}
static enum {
@@ -193,5 +153,5 @@ __init_or_module void arch_jump_label_transform_static(struct jump_entry *entry,
jlstate = JL_STATE_NO_UPDATE;
}
if (jlstate == JL_STATE_UPDATE)
- __jump_label_transform(entry, type, 1);
+ jump_label_transform(entry, type, 1);
}
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index d2f4e706a428..f293d872602a 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -177,7 +177,7 @@ setup_efi_state(struct boot_params *params, unsigned long params_load_addr,
* acpi_rsdp=<addr> on kernel command line to make second kernel boot
* without efi.
*/
- if (efi_enabled(EFI_OLD_MEMMAP))
+ if (efi_have_uv1_memmap())
return 0;
params->secure_boot = boot_params.secure_boot;
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 4f13af7cbcdb..4d7022a740ab 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -40,6 +40,7 @@
#include <linux/frame.h>
#include <linux/kasan.h>
#include <linux/moduleloader.h>
+#include <linux/vmalloc.h>
#include <asm/text-patching.h>
#include <asm/cacheflush.h>
@@ -119,14 +120,14 @@ __synthesize_relative_insn(void *dest, void *from, void *to, u8 op)
/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
void synthesize_reljump(void *dest, void *from, void *to)
{
- __synthesize_relative_insn(dest, from, to, RELATIVEJUMP_OPCODE);
+ __synthesize_relative_insn(dest, from, to, JMP32_INSN_OPCODE);
}
NOKPROBE_SYMBOL(synthesize_reljump);
/* Insert a call instruction at address 'from', which calls address 'to'.*/
void synthesize_relcall(void *dest, void *from, void *to)
{
- __synthesize_relative_insn(dest, from, to, RELATIVECALL_OPCODE);
+ __synthesize_relative_insn(dest, from, to, CALL_INSN_OPCODE);
}
NOKPROBE_SYMBOL(synthesize_relcall);
@@ -301,7 +302,7 @@ static int can_probe(unsigned long paddr)
* Another debugging subsystem might insert this breakpoint.
* In that case, we can't recover it.
*/
- if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
+ if (insn.opcode.bytes[0] == INT3_INSN_OPCODE)
return 0;
addr += insn.length;
}
@@ -356,7 +357,7 @@ int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn)
return 0;
/* Another subsystem puts a breakpoint, failed to recover */
- if (insn->opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
+ if (insn->opcode.bytes[0] == INT3_INSN_OPCODE)
return 0;
/* We should not singlestep on the exception masking instructions */
@@ -400,14 +401,14 @@ static int prepare_boost(kprobe_opcode_t *buf, struct kprobe *p,
int len = insn->length;
if (can_boost(insn, p->addr) &&
- MAX_INSN_SIZE - len >= RELATIVEJUMP_SIZE) {
+ MAX_INSN_SIZE - len >= JMP32_INSN_SIZE) {
/*
* These instructions can be executed directly if it
* jumps back to correct address.
*/
synthesize_reljump(buf + len, p->ainsn.insn + len,
p->addr + insn->length);
- len += RELATIVEJUMP_SIZE;
+ len += JMP32_INSN_SIZE;
p->ainsn.boostable = true;
} else {
p->ainsn.boostable = false;
@@ -501,12 +502,14 @@ int arch_prepare_kprobe(struct kprobe *p)
void arch_arm_kprobe(struct kprobe *p)
{
- text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
+ text_poke(p->addr, ((unsigned char []){INT3_INSN_OPCODE}), 1);
+ text_poke_sync();
}
void arch_disarm_kprobe(struct kprobe *p)
{
text_poke(p->addr, &p->opcode, 1);
+ text_poke_sync();
}
void arch_remove_kprobe(struct kprobe *p)
@@ -609,7 +612,7 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs,
regs->flags |= X86_EFLAGS_TF;
regs->flags &= ~X86_EFLAGS_IF;
/* single step inline if the instruction is an int3 */
- if (p->opcode == BREAKPOINT_INSTRUCTION)
+ if (p->opcode == INT3_INSN_OPCODE)
regs->ip = (unsigned long)p->addr;
else
regs->ip = (unsigned long)p->ainsn.insn;
@@ -695,7 +698,7 @@ int kprobe_int3_handler(struct pt_regs *regs)
reset_current_kprobe();
return 1;
}
- } else if (*addr != BREAKPOINT_INSTRUCTION) {
+ } else if (*addr != INT3_INSN_OPCODE) {
/*
* The breakpoint instruction was removed right
* after we hit it. Another cpu has removed
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 8900329c28a7..3f45b5c43a71 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -38,7 +38,7 @@ unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
long offs;
int i;
- for (i = 0; i < RELATIVEJUMP_SIZE; i++) {
+ for (i = 0; i < JMP32_INSN_SIZE; i++) {
kp = get_kprobe((void *)addr - i);
/* This function only handles jump-optimized kprobe */
if (kp && kprobe_optimized(kp)) {
@@ -62,10 +62,10 @@ found:
if (addr == (unsigned long)kp->addr) {
buf[0] = kp->opcode;
- memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
+ memcpy(buf + 1, op->optinsn.copied_insn, DISP32_SIZE);
} else {
offs = addr - (unsigned long)kp->addr - 1;
- memcpy(buf, op->optinsn.copied_insn + offs, RELATIVE_ADDR_SIZE - offs);
+ memcpy(buf, op->optinsn.copied_insn + offs, DISP32_SIZE - offs);
}
return (unsigned long)buf;
@@ -141,8 +141,6 @@ STACK_FRAME_NON_STANDARD(optprobe_template_func);
#define TMPL_END_IDX \
((long)optprobe_template_end - (long)optprobe_template_entry)
-#define INT3_SIZE sizeof(kprobe_opcode_t)
-
/* Optimized kprobe call back function: called from optinsn */
static void
optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
@@ -162,7 +160,7 @@ optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
regs->cs |= get_kernel_rpl();
regs->gs = 0;
#endif
- regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
+ regs->ip = (unsigned long)op->kp.addr + INT3_INSN_SIZE;
regs->orig_ax = ~0UL;
__this_cpu_write(current_kprobe, &op->kp);
@@ -179,7 +177,7 @@ static int copy_optimized_instructions(u8 *dest, u8 *src, u8 *real)
struct insn insn;
int len = 0, ret;
- while (len < RELATIVEJUMP_SIZE) {
+ while (len < JMP32_INSN_SIZE) {
ret = __copy_instruction(dest + len, src + len, real + len, &insn);
if (!ret || !can_boost(&insn, src + len))
return -EINVAL;
@@ -271,7 +269,7 @@ static int can_optimize(unsigned long paddr)
return 0;
/* Check there is enough space for a relative jump. */
- if (size - offset < RELATIVEJUMP_SIZE)
+ if (size - offset < JMP32_INSN_SIZE)
return 0;
/* Decode instructions */
@@ -290,15 +288,15 @@ static int can_optimize(unsigned long paddr)
kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
insn_get_length(&insn);
/* Another subsystem puts a breakpoint */
- if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
+ if (insn.opcode.bytes[0] == INT3_INSN_OPCODE)
return 0;
/* Recover address */
insn.kaddr = (void *)addr;
insn.next_byte = (void *)(addr + insn.length);
/* Check any instructions don't jump into target */
if (insn_is_indirect_jump(&insn) ||
- insn_jump_into_range(&insn, paddr + INT3_SIZE,
- RELATIVE_ADDR_SIZE))
+ insn_jump_into_range(&insn, paddr + INT3_INSN_SIZE,
+ DISP32_SIZE))
return 0;
addr += insn.length;
}
@@ -374,7 +372,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
* Verify if the address gap is in 2GB range, because this uses
* a relative jump.
*/
- rel = (long)slot - (long)op->kp.addr + RELATIVEJUMP_SIZE;
+ rel = (long)slot - (long)op->kp.addr + JMP32_INSN_SIZE;
if (abs(rel) > 0x7fffffff) {
ret = -ERANGE;
goto err;
@@ -401,7 +399,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
/* Set returning jmp instruction at the tail of out-of-line buffer */
synthesize_reljump(buf + len, slot + len,
(u8 *)op->kp.addr + op->optinsn.size);
- len += RELATIVEJUMP_SIZE;
+ len += JMP32_INSN_SIZE;
/* We have to use text_poke() for instruction buffer because it is RO */
text_poke(slot, buf, len);
@@ -416,49 +414,50 @@ err:
}
/*
- * Replace breakpoints (int3) with relative jumps.
+ * Replace breakpoints (INT3) with relative jumps (JMP.d32).
* Caller must call with locking kprobe_mutex and text_mutex.
+ *
+ * The caller will have installed a regular kprobe and after that issued
+ * syncrhonize_rcu_tasks(), this ensures that the instruction(s) that live in
+ * the 4 bytes after the INT3 are unused and can now be overwritten.
*/
void arch_optimize_kprobes(struct list_head *oplist)
{
struct optimized_kprobe *op, *tmp;
- u8 insn_buff[RELATIVEJUMP_SIZE];
+ u8 insn_buff[JMP32_INSN_SIZE];
list_for_each_entry_safe(op, tmp, oplist, list) {
s32 rel = (s32)((long)op->optinsn.insn -
- ((long)op->kp.addr + RELATIVEJUMP_SIZE));
+ ((long)op->kp.addr + JMP32_INSN_SIZE));
WARN_ON(kprobe_disabled(&op->kp));
/* Backup instructions which will be replaced by jump address */
- memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
- RELATIVE_ADDR_SIZE);
+ memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_INSN_SIZE,
+ DISP32_SIZE);
- insn_buff[0] = RELATIVEJUMP_OPCODE;
+ insn_buff[0] = JMP32_INSN_OPCODE;
*(s32 *)(&insn_buff[1]) = rel;
- text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE, NULL);
+ text_poke_bp(op->kp.addr, insn_buff, JMP32_INSN_SIZE, NULL);
list_del_init(&op->list);
}
}
-/* Replace a relative jump with a breakpoint (int3). */
+/*
+ * Replace a relative jump (JMP.d32) with a breakpoint (INT3).
+ *
+ * After that, we can restore the 4 bytes after the INT3 to undo what
+ * arch_optimize_kprobes() scribbled. This is safe since those bytes will be
+ * unused once the INT3 lands.
+ */
void arch_unoptimize_kprobe(struct optimized_kprobe *op)
{
- u8 insn_buff[RELATIVEJUMP_SIZE];
- u8 emulate_buff[RELATIVEJUMP_SIZE];
-
- /* Set int3 to first byte for kprobes */
- insn_buff[0] = BREAKPOINT_INSTRUCTION;
- memcpy(insn_buff + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
-
- emulate_buff[0] = RELATIVEJUMP_OPCODE;
- *(s32 *)(&emulate_buff[1]) = (s32)((long)op->optinsn.insn -
- ((long)op->kp.addr + RELATIVEJUMP_SIZE));
-
- text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE,
- emulate_buff);
+ arch_arm_kprobe(&op->kp);
+ text_poke(op->kp.addr + INT3_INSN_SIZE,
+ op->optinsn.copied_insn, DISP32_SIZE);
+ text_poke_sync();
}
/*
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 32ef1ee733b7..6efe0410fb72 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -245,17 +245,13 @@ NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason);
dotraplinkage void
do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
- enum ctx_state prev_state;
-
switch (kvm_read_and_reset_pf_reason()) {
default:
do_page_fault(regs, error_code, address);
break;
case KVM_PV_REASON_PAGE_NOT_PRESENT:
/* page is swapped out by the host. */
- prev_state = exception_enter();
kvm_async_pf_task_wait((u32)address, !user_mode(regs));
- exception_exit(prev_state);
break;
case KVM_PV_REASON_PAGE_READY:
rcu_irq_enter();
@@ -429,7 +425,29 @@ static void __init sev_map_percpu_data(void)
}
}
+static bool pv_tlb_flush_supported(void)
+{
+ return (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
+ !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
+ kvm_para_has_feature(KVM_FEATURE_STEAL_TIME));
+}
+
+static DEFINE_PER_CPU(cpumask_var_t, __pv_cpu_mask);
+
#ifdef CONFIG_SMP
+
+static bool pv_ipi_supported(void)
+{
+ return kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI);
+}
+
+static bool pv_sched_yield_supported(void)
+{
+ return (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) &&
+ !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
+ kvm_para_has_feature(KVM_FEATURE_STEAL_TIME));
+}
+
#define KVM_IPI_CLUSTER_SIZE (2 * BITS_PER_LONG)
static void __send_ipi_mask(const struct cpumask *mask, int vector)
@@ -494,12 +512,12 @@ static void kvm_send_ipi_mask(const struct cpumask *mask, int vector)
static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector)
{
unsigned int this_cpu = smp_processor_id();
- struct cpumask new_mask;
+ struct cpumask *new_mask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
const struct cpumask *local_mask;
- cpumask_copy(&new_mask, mask);
- cpumask_clear_cpu(this_cpu, &new_mask);
- local_mask = &new_mask;
+ cpumask_copy(new_mask, mask);
+ cpumask_clear_cpu(this_cpu, new_mask);
+ local_mask = new_mask;
__send_ipi_mask(local_mask, vector);
}
@@ -579,7 +597,6 @@ static void __init kvm_apf_trap_init(void)
update_intr_gate(X86_TRAP_PF, async_page_fault);
}
-static DEFINE_PER_CPU(cpumask_var_t, __pv_tlb_mask);
static void kvm_flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
@@ -587,7 +604,7 @@ static void kvm_flush_tlb_others(const struct cpumask *cpumask,
u8 state;
int cpu;
struct kvm_steal_time *src;
- struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_tlb_mask);
+ struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_cpu_mask);
cpumask_copy(flushmask, cpumask);
/*
@@ -623,11 +640,10 @@ static void __init kvm_guest_init(void)
pv_ops.time.steal_clock = kvm_steal_clock;
}
- if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
- !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
- kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
+ if (pv_tlb_flush_supported()) {
pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others;
pv_ops.mmu.tlb_remove_table = tlb_remove_table;
+ pr_info("KVM setup pv remote TLB flush\n");
}
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
@@ -636,9 +652,7 @@ static void __init kvm_guest_init(void)
#ifdef CONFIG_SMP
smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus;
smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
- if (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) &&
- !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
- kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
+ if (pv_sched_yield_supported()) {
smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi;
pr_info("KVM setup pv sched yield\n");
}
@@ -704,7 +718,7 @@ static uint32_t __init kvm_detect(void)
static void __init kvm_apic_init(void)
{
#if defined(CONFIG_SMP)
- if (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI))
+ if (pv_ipi_supported())
kvm_setup_pv_ipi();
#endif
}
@@ -736,23 +750,31 @@ static __init int activate_jump_labels(void)
}
arch_initcall(activate_jump_labels);
-static __init int kvm_setup_pv_tlb_flush(void)
+static __init int kvm_alloc_cpumask(void)
{
int cpu;
+ bool alloc = false;
+
+ if (!kvm_para_available() || nopv)
+ return 0;
+
+ if (pv_tlb_flush_supported())
+ alloc = true;
- if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
- !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
- kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
+#if defined(CONFIG_SMP)
+ if (pv_ipi_supported())
+ alloc = true;
+#endif
+
+ if (alloc)
for_each_possible_cpu(cpu) {
- zalloc_cpumask_var_node(per_cpu_ptr(&__pv_tlb_mask, cpu),
+ zalloc_cpumask_var_node(per_cpu_ptr(&__pv_cpu_mask, cpu),
GFP_KERNEL, cpu_to_node(cpu));
}
- pr_info("KVM setup pv remote TLB flush\n");
- }
return 0;
}
-arch_initcall(kvm_setup_pv_tlb_flush);
+arch_initcall(kvm_alloc_cpumask);
#ifdef CONFIG_PARAVIRT_SPINLOCKS
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index b2463fcb20a8..c57e1ca70fd1 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -28,6 +28,89 @@
#include <asm/desc.h>
#include <asm/mmu_context.h>
#include <asm/syscalls.h>
+#include <asm/pgtable_areas.h>
+
+/* This is a multiple of PAGE_SIZE. */
+#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
+
+static inline void *ldt_slot_va(int slot)
+{
+ return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
+}
+
+void load_mm_ldt(struct mm_struct *mm)
+{
+ struct ldt_struct *ldt;
+
+ /* READ_ONCE synchronizes with smp_store_release */
+ ldt = READ_ONCE(mm->context.ldt);
+
+ /*
+ * Any change to mm->context.ldt is followed by an IPI to all
+ * CPUs with the mm active. The LDT will not be freed until
+ * after the IPI is handled by all such CPUs. This means that,
+ * if the ldt_struct changes before we return, the values we see
+ * will be safe, and the new values will be loaded before we run
+ * any user code.
+ *
+ * NB: don't try to convert this to use RCU without extreme care.
+ * We would still need IRQs off, because we don't want to change
+ * the local LDT after an IPI loaded a newer value than the one
+ * that we can see.
+ */
+
+ if (unlikely(ldt)) {
+ if (static_cpu_has(X86_FEATURE_PTI)) {
+ if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
+ /*
+ * Whoops -- either the new LDT isn't mapped
+ * (if slot == -1) or is mapped into a bogus
+ * slot (if slot > 1).
+ */
+ clear_LDT();
+ return;
+ }
+
+ /*
+ * If page table isolation is enabled, ldt->entries
+ * will not be mapped in the userspace pagetables.
+ * Tell the CPU to access the LDT through the alias
+ * at ldt_slot_va(ldt->slot).
+ */
+ set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
+ } else {
+ set_ldt(ldt->entries, ldt->nr_entries);
+ }
+ } else {
+ clear_LDT();
+ }
+}
+
+void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
+{
+ /*
+ * Load the LDT if either the old or new mm had an LDT.
+ *
+ * An mm will never go from having an LDT to not having an LDT. Two
+ * mms never share an LDT, so we don't gain anything by checking to
+ * see whether the LDT changed. There's also no guarantee that
+ * prev->context.ldt actually matches LDTR, but, if LDTR is non-NULL,
+ * then prev->context.ldt will also be non-NULL.
+ *
+ * If we really cared, we could optimize the case where prev == next
+ * and we're exiting lazy mode. Most of the time, if this happens,
+ * we don't actually need to reload LDTR, but modify_ldt() is mostly
+ * used by legacy code and emulators where we don't need this level of
+ * performance.
+ *
+ * This uses | instead of || because it generates better code.
+ */
+ if (unlikely((unsigned long)prev->context.ldt |
+ (unsigned long)next->context.ldt))
+ load_mm_ldt(next);
+
+ DEBUG_LOCKS_WARN_ON(preemptible());
+}
static void refresh_ldt_segments(void)
{
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 7b45e8daad22..02bddfc122a4 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -250,15 +250,3 @@ void machine_kexec(struct kimage *image)
__ftrace_enabled_restore(save_ftrace_enabled);
}
-
-void arch_crash_save_vmcoreinfo(void)
-{
-#ifdef CONFIG_NUMA
- VMCOREINFO_SYMBOL(node_data);
- VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
-#endif
-#ifdef CONFIG_X86_PAE
- VMCOREINFO_CONFIG(X86_PAE);
-#endif
-}
-
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 16e125a50b33..ad5cdd6a5f23 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -398,25 +398,6 @@ void machine_kexec(struct kimage *image)
__ftrace_enabled_restore(save_ftrace_enabled);
}
-void arch_crash_save_vmcoreinfo(void)
-{
- u64 sme_mask = sme_me_mask;
-
- VMCOREINFO_NUMBER(phys_base);
- VMCOREINFO_SYMBOL(init_top_pgt);
- vmcoreinfo_append_str("NUMBER(pgtable_l5_enabled)=%d\n",
- pgtable_l5_enabled());
-
-#ifdef CONFIG_NUMA
- VMCOREINFO_SYMBOL(node_data);
- VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
-#endif
- vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
- kaslr_offset());
- VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
- VMCOREINFO_NUMBER(sme_mask);
-}
-
/* arch-dependent functionality related to kexec file-based syscall */
#ifdef CONFIG_KEXEC_FILE
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index e676a9916c49..54c21d6abd5a 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -104,18 +104,22 @@ static int __init nmi_warning_debugfs(void)
}
fs_initcall(nmi_warning_debugfs);
-static void nmi_max_handler(struct irq_work *w)
+static void nmi_check_duration(struct nmiaction *action, u64 duration)
{
- struct nmiaction *a = container_of(w, struct nmiaction, irq_work);
+ u64 whole_msecs = READ_ONCE(action->max_duration);
int remainder_ns, decimal_msecs;
- u64 whole_msecs = READ_ONCE(a->max_duration);
+
+ if (duration < nmi_longest_ns || duration < action->max_duration)
+ return;
+
+ action->max_duration = duration;
remainder_ns = do_div(whole_msecs, (1000 * 1000));
decimal_msecs = remainder_ns / 1000;
printk_ratelimited(KERN_INFO
"INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n",
- a->handler, whole_msecs, decimal_msecs);
+ action->handler, whole_msecs, decimal_msecs);
}
static int nmi_handle(unsigned int type, struct pt_regs *regs)
@@ -142,11 +146,7 @@ static int nmi_handle(unsigned int type, struct pt_regs *regs)
delta = sched_clock() - delta;
trace_nmi_handler(a->handler, (int)delta, thishandled);
- if (delta < nmi_longest_ns || delta < a->max_duration)
- continue;
-
- a->max_duration = delta;
- irq_work_queue(&a->irq_work);
+ nmi_check_duration(a, delta);
}
rcu_read_unlock();
@@ -164,8 +164,6 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action)
if (!action->handler)
return -EINVAL;
- init_irq_work(&action->irq_work, nmi_max_handler);
-
raw_spin_lock_irqsave(&desc->lock, flags);
/*
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 789f5e4f89de..c131ba4e70ef 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -30,6 +30,7 @@
#include <asm/timer.h>
#include <asm/special_insns.h>
#include <asm/tlb.h>
+#include <asm/io_bitmap.h>
/*
* nop stub, which must not clobber anything *including the stack* to
@@ -341,6 +342,10 @@ struct paravirt_patch_template pv_ops = {
.cpu.iret = native_iret,
.cpu.swapgs = native_swapgs,
+#ifdef CONFIG_X86_IOPL_IOPERM
+ .cpu.update_io_bitmap = native_tss_update_io_bitmap,
+#endif
+
.cpu.start_context_switch = paravirt_nop,
.cpu.end_context_switch = paravirt_nop,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 61e93a318983..3053c85e0e42 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -374,7 +374,7 @@ static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm)
/**
* tss_update_io_bitmap - Update I/O bitmap before exiting to usermode
*/
-void tss_update_io_bitmap(void)
+void native_tss_update_io_bitmap(void)
{
struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
struct thread_struct *t = &current->thread;
@@ -615,12 +615,8 @@ void speculation_ctrl_update_current(void)
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
{
- struct thread_struct *prev, *next;
unsigned long tifp, tifn;
- prev = &prev_p->thread;
- next = &next_p->thread;
-
tifn = READ_ONCE(task_thread_info(next_p)->flags);
tifp = READ_ONCE(task_thread_info(prev_p)->flags);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 323499f48858..5052ced43373 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -124,7 +124,6 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
regs->ip = new_ip;
regs->sp = new_sp;
regs->flags = X86_EFLAGS_IF;
- force_iret();
}
EXPORT_SYMBOL_GPL(start_thread);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 506d66830d4d..ffd497804dbc 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -394,7 +394,6 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
regs->cs = _cs;
regs->ss = _ss;
regs->flags = X86_EFLAGS_IF;
- force_iret();
}
void
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 1daf8f2aa21f..896d74cb5081 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -110,7 +110,7 @@ static void ich_force_enable_hpet(struct pci_dev *dev)
}
/* use bits 31:14, 16 kB aligned */
- rcba_base = ioremap_nocache(rcba, 0x4000);
+ rcba_base = ioremap(rcba, 0x4000);
if (rcba_base == NULL) {
dev_printk(KERN_DEBUG, &dev->dev, "ioremap failed; "
"cannot force enable HPET\n");
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index cedfe2077a69..a74262c71484 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -2,130 +2,54 @@
/*
* Copyright (C) 1995 Linus Torvalds
*
- * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
- *
- * Memory region support
- * David Parsons <orc@pell.chi.il.us>, July-August 1999
- *
- * Added E820 sanitization routine (removes overlapping memory regions);
- * Brian Moyle <bmoyle@mvista.com>, February 2001
- *
- * Moved CPU detection code to cpu/${cpu}.c
- * Patrick Mochel <mochel@osdl.org>, March 2002
- *
- * Provisions for empty E820 memory regions (reported by certain BIOSes).
- * Alex Achenbach <xela@slit.de>, December 2002.
- *
- */
-
-/*
- * This file handles the architecture-dependent parts of initialization
+ * This file contains the setup_arch() code, which handles the architecture-dependent
+ * parts of early kernel initialization.
*/
-
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/mmzone.h>
-#include <linux/screen_info.h>
-#include <linux/ioport.h>
-#include <linux/acpi.h>
-#include <linux/sfi.h>
-#include <linux/apm_bios.h>
-#include <linux/initrd.h>
-#include <linux/memblock.h>
-#include <linux/seq_file.h>
#include <linux/console.h>
-#include <linux/root_dev.h>
-#include <linux/highmem.h>
-#include <linux/export.h>
+#include <linux/crash_dump.h>
+#include <linux/dmi.h>
#include <linux/efi.h>
-#include <linux/init.h>
-#include <linux/edd.h>
+#include <linux/init_ohci1394_dma.h>
+#include <linux/initrd.h>
#include <linux/iscsi_ibft.h>
-#include <linux/nodemask.h>
-#include <linux/kexec.h>
-#include <linux/dmi.h>
-#include <linux/pfn.h>
+#include <linux/memblock.h>
#include <linux/pci.h>
-#include <asm/pci-direct.h>
-#include <linux/init_ohci1394_dma.h>
-#include <linux/kvm_para.h>
-#include <linux/dma-contiguous.h>
-#include <xen/xen.h>
-#include <uapi/linux/mount.h>
-
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/stddef.h>
-#include <linux/unistd.h>
-#include <linux/ptrace.h>
-#include <linux/user.h>
-#include <linux/delay.h>
-
-#include <linux/kallsyms.h>
-#include <linux/cpufreq.h>
-#include <linux/dma-mapping.h>
-#include <linux/ctype.h>
-#include <linux/uaccess.h>
-
-#include <linux/percpu.h>
-#include <linux/crash_dump.h>
+#include <linux/root_dev.h>
+#include <linux/sfi.h>
#include <linux/tboot.h>
-#include <linux/jiffies.h>
-#include <linux/mem_encrypt.h>
-#include <linux/sizes.h>
-
#include <linux/usb/xhci-dbgp.h>
-#include <video/edid.h>
-#include <asm/mtrr.h>
+#include <uapi/linux/mount.h>
+
+#include <xen/xen.h>
+
#include <asm/apic.h>
-#include <asm/realmode.h>
-#include <asm/e820/api.h>
-#include <asm/mpspec.h>
-#include <asm/setup.h>
-#include <asm/efi.h>
-#include <asm/timer.h>
-#include <asm/i8259.h>
-#include <asm/sections.h>
-#include <asm/io_apic.h>
-#include <asm/ist.h>
-#include <asm/setup_arch.h>
#include <asm/bios_ebda.h>
-#include <asm/cacheflush.h>
-#include <asm/processor.h>
#include <asm/bugs.h>
-#include <asm/kasan.h>
-
-#include <asm/vsyscall.h>
#include <asm/cpu.h>
-#include <asm/desc.h>
-#include <asm/dma.h>
-#include <asm/iommu.h>
+#include <asm/efi.h>
#include <asm/gart.h>
-#include <asm/mmu_context.h>
-#include <asm/proto.h>
-
-#include <asm/paravirt.h>
#include <asm/hypervisor.h>
-#include <asm/olpc_ofw.h>
-
-#include <asm/percpu.h>
-#include <asm/topology.h>
-#include <asm/apicdef.h>
-#include <asm/amd_nb.h>
+#include <asm/io_apic.h>
+#include <asm/kasan.h>
+#include <asm/kaslr.h>
#include <asm/mce.h>
-#include <asm/alternative.h>
+#include <asm/mtrr.h>
+#include <asm/realmode.h>
+#include <asm/olpc_ofw.h>
+#include <asm/pci-direct.h>
#include <asm/prom.h>
-#include <asm/microcode.h>
-#include <asm/kaslr.h>
+#include <asm/proto.h>
#include <asm/unwind.h>
+#include <asm/vsyscall.h>
+#include <linux/vmalloc.h>
/*
- * max_low_pfn_mapped: highest direct mapped pfn under 4GB
- * max_pfn_mapped: highest direct mapped pfn over 4GB
+ * max_low_pfn_mapped: highest directly mapped pfn < 4 GB
+ * max_pfn_mapped: highest directly mapped pfn > 4 GB
*
* The direct mapping only covers E820_TYPE_RAM regions, so the ranges and gaps are
- * represented by pfn_mapped
+ * represented by pfn_mapped[].
*/
unsigned long max_low_pfn_mapped;
unsigned long max_pfn_mapped;
@@ -135,14 +59,23 @@ RESERVE_BRK(dmi_alloc, 65536);
#endif
-static __initdata unsigned long _brk_start = (unsigned long)__brk_base;
-unsigned long _brk_end = (unsigned long)__brk_base;
+/*
+ * Range of the BSS area. The size of the BSS area is determined
+ * at link time, with RESERVE_BRK*() facility reserving additional
+ * chunks.
+ */
+static __initdata
+unsigned long _brk_start = (unsigned long)__brk_base;
+unsigned long _brk_end = (unsigned long)__brk_base;
struct boot_params boot_params;
/*
- * Machine setup..
+ * These are the four main kernel memory regions, we put them into
+ * the resource tree so that kdump tools and other debugging tools
+ * recover it:
*/
+
static struct resource rodata_resource = {
.name = "Kernel rodata",
.start = 0,
@@ -173,16 +106,16 @@ static struct resource bss_resource = {
#ifdef CONFIG_X86_32
-/* cpu data as detected by the assembly code in head_32.S */
+/* CPU data as detected by the assembly code in head_32.S */
struct cpuinfo_x86 new_cpu_data;
-/* common cpu data for all cpus */
+/* Common CPU data for all CPUs */
struct cpuinfo_x86 boot_cpu_data __read_mostly;
EXPORT_SYMBOL(boot_cpu_data);
unsigned int def_to_bigsmp;
-/* for MCA, but anyone else can use it if they want */
+/* For MCA, but anyone else can use it if they want */
unsigned int machine_id;
unsigned int machine_submodel_id;
unsigned int BIOS_revision;
@@ -468,15 +401,15 @@ static void __init memblock_x86_reserve_range_setup_data(void)
/*
* Keep the crash kernel below this limit.
*
- * On 32 bits earlier kernels would limit the kernel to the low 512 MiB
+ * Earlier 32-bits kernels would limit the kernel to the low 512 MB range
* due to mapping restrictions.
*
- * On 64bit, kdump kernel need be restricted to be under 64TB, which is
+ * 64-bit kdump kernels need to be restricted to be under 64 TB, which is
* the upper limit of system RAM in 4-level paging mode. Since the kdump
- * jumping could be from 5-level to 4-level, the jumping will fail if
- * kernel is put above 64TB, and there's no way to detect the paging mode
- * of the kernel which will be loaded for dumping during the 1st kernel
- * bootup.
+ * jump could be from 5-level paging to 4-level paging, the jump will fail if
+ * the kernel is put above 64 TB, and during the 1st kernel bootup there's
+ * no good way to detect the paging mode of the target kernel which will be
+ * loaded for dumping.
*/
#ifdef CONFIG_X86_32
# define CRASH_ADDR_LOW_MAX SZ_512M
@@ -887,7 +820,7 @@ void __init setup_arch(char **cmdline_p)
/*
* Note: Quark X1000 CPUs advertise PGE incorrectly and require
* a cr3 based tlb flush, so the following __flush_tlb_all()
- * will not flush anything because the cpu quirk which clears
+ * will not flush anything because the CPU quirk which clears
* X86_FEATURE_PGE has not been invoked yet. Though due to the
* load_cr3() above the TLB has been flushed already. The
* quirk is invoked before subsequent calls to __flush_tlb_all()
@@ -960,8 +893,6 @@ void __init setup_arch(char **cmdline_p)
init_mm.end_data = (unsigned long) _edata;
init_mm.brk = _brk_end;
- mpx_mm_init(&init_mm);
-
code_resource.start = __pa_symbol(_text);
code_resource.end = __pa_symbol(_etext)-1;
rodata_resource.start = __pa_symbol(__start_rodata);
@@ -1295,8 +1226,6 @@ void __init setup_arch(char **cmdline_p)
#if defined(CONFIG_VGA_CONSOLE)
if (!efi_enabled(EFI_BOOT) || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
conswitchp = &vga_con;
-#elif defined(CONFIG_DUMMY_CONSOLE)
- conswitchp = &dummy_con;
#endif
#endif
x86_init.oem.banner();
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 8eb7193e158d..8a29573851a3 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -151,8 +151,6 @@ static int restore_sigcontext(struct pt_regs *regs,
err |= fpu__restore_sig(buf, IS_ENABLED(CONFIG_X86_32));
- force_iret();
-
return err;
}
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index f7476ce23b6e..ca3c11a17b5a 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -22,7 +22,6 @@
#include <asm/elf.h>
#include <asm/ia32.h>
#include <asm/syscalls.h>
-#include <asm/mpx.h>
/*
* Align a virtual address to avoid aliasing in the I$ on AMD F15h.
@@ -137,10 +136,6 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
struct vm_unmapped_area_info info;
unsigned long begin, end;
- addr = mpx_unmapped_area_check(addr, len, flags);
- if (IS_ERR_VALUE(addr))
- return addr;
-
if (flags & MAP_FIXED)
return addr;
@@ -180,10 +175,6 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
unsigned long addr = addr0;
struct vm_unmapped_area_info info;
- addr = mpx_unmapped_area_check(addr, len, flags);
- if (IS_ERR_VALUE(addr))
- return addr;
-
/* requested length too big for entire address space */
if (len > TASK_SIZE)
return -ENOMEM;
diff --git a/arch/x86/kernel/sysfb_simplefb.c b/arch/x86/kernel/sysfb_simplefb.c
index 01f0e2263b86..298fc1edd9c9 100644
--- a/arch/x86/kernel/sysfb_simplefb.c
+++ b/arch/x86/kernel/sysfb_simplefb.c
@@ -90,11 +90,11 @@ __init int create_simplefb(const struct screen_info *si,
if (si->orig_video_isVGA == VIDEO_TYPE_VLFB)
size <<= 16;
length = mode->height * mode->stride;
- length = PAGE_ALIGN(length);
if (length > size) {
printk(KERN_WARNING "sysfb: VRAM smaller than advertised\n");
return -EINVAL;
}
+ length = PAGE_ALIGN(length);
/* setup IORESOURCE_MEM as framebuffer memory */
memset(&res, 0, sizeof(res));
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 4c61f0713832..b89f6ac6a0c0 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -354,7 +354,7 @@ static ssize_t tboot_log_read(struct file *file, char __user *user_buf, size_t c
void *kbuf;
int ret = -EFAULT;
- log_base = ioremap_nocache(TBOOT_SERIAL_LOG_ADDR, TBOOT_SERIAL_LOG_SIZE);
+ log_base = ioremap(TBOOT_SERIAL_LOG_ADDR, TBOOT_SERIAL_LOG_SIZE);
if (!log_base)
return ret;
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index 7ce29cee9f9e..d8673d8a779b 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -91,10 +91,18 @@ void __init hpet_time_init(void)
static __init void x86_late_time_init(void)
{
+ /*
+ * Before PIT/HPET init, select the interrupt mode. This is required
+ * to make the decision whether PIT should be initialized correct.
+ */
+ x86_init.irqs.intr_mode_select();
+
+ /* Setup the legacy timers */
x86_init.timers.timer_init();
+
/*
- * After PIT/HPET timers init, select and setup
- * the final interrupt mode for delivering IRQs.
+ * After PIT/HPET timers init, set up the final interrupt mode for
+ * delivering IRQs.
*/
x86_init.irqs.intr_mode_init();
tsc_init();
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 05da6b5b167b..6ef00eb6fbb9 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -52,10 +52,10 @@
#include <asm/mach_traps.h>
#include <asm/alternative.h>
#include <asm/fpu/xstate.h>
-#include <asm/trace/mpx.h>
-#include <asm/mpx.h>
#include <asm/vm86.h>
#include <asm/umip.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>
#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
@@ -434,8 +434,6 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign
dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
{
- const struct mpx_bndcsr *bndcsr;
-
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
if (notify_die(DIE_TRAP, "bounds", regs, error_code,
X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
@@ -445,84 +443,60 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
if (!user_mode(regs))
die("bounds", regs, error_code);
- if (!cpu_feature_enabled(X86_FEATURE_MPX)) {
- /* The exception is not from Intel MPX */
- goto exit_trap;
- }
+ do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, 0, NULL);
+}
- /*
- * We need to look at BNDSTATUS to resolve this exception.
- * A NULL here might mean that it is in its 'init state',
- * which is all zeros which indicates MPX was not
- * responsible for the exception.
- */
- bndcsr = get_xsave_field_ptr(XFEATURE_BNDCSR);
- if (!bndcsr)
- goto exit_trap;
+enum kernel_gp_hint {
+ GP_NO_HINT,
+ GP_NON_CANONICAL,
+ GP_CANONICAL
+};
- trace_bounds_exception_mpx(bndcsr);
- /*
- * The error code field of the BNDSTATUS register communicates status
- * information of a bound range exception #BR or operation involving
- * bound directory.
- */
- switch (bndcsr->bndstatus & MPX_BNDSTA_ERROR_CODE) {
- case 2: /* Bound directory has invalid entry. */
- if (mpx_handle_bd_fault())
- goto exit_trap;
- break; /* Success, it was handled */
- case 1: /* Bound violation. */
- {
- struct task_struct *tsk = current;
- struct mpx_fault_info mpx;
-
- if (mpx_fault_info(&mpx, regs)) {
- /*
- * We failed to decode the MPX instruction. Act as if
- * the exception was not caused by MPX.
- */
- goto exit_trap;
- }
- /*
- * Success, we decoded the instruction and retrieved
- * an 'mpx' containing the address being accessed
- * which caused the exception. This information
- * allows and application to possibly handle the
- * #BR exception itself.
- */
- if (!do_trap_no_signal(tsk, X86_TRAP_BR, "bounds", regs,
- error_code))
- break;
+/*
+ * When an uncaught #GP occurs, try to determine the memory address accessed by
+ * the instruction and return that address to the caller. Also, try to figure
+ * out whether any part of the access to that address was non-canonical.
+ */
+static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs,
+ unsigned long *addr)
+{
+ u8 insn_buf[MAX_INSN_SIZE];
+ struct insn insn;
- show_signal(tsk, SIGSEGV, "trap ", "bounds", regs, error_code);
+ if (probe_kernel_read(insn_buf, (void *)regs->ip, MAX_INSN_SIZE))
+ return GP_NO_HINT;
- force_sig_bnderr(mpx.addr, mpx.lower, mpx.upper);
- break;
- }
- case 0: /* No exception caused by Intel MPX operations. */
- goto exit_trap;
- default:
- die("bounds", regs, error_code);
- }
+ kernel_insn_init(&insn, insn_buf, MAX_INSN_SIZE);
+ insn_get_modrm(&insn);
+ insn_get_sib(&insn);
- return;
+ *addr = (unsigned long)insn_get_addr_ref(&insn, regs);
+ if (*addr == -1UL)
+ return GP_NO_HINT;
-exit_trap:
+#ifdef CONFIG_X86_64
/*
- * This path out is for all the cases where we could not
- * handle the exception in some way (like allocating a
- * table or telling userspace about it. We will also end
- * up here if the kernel has MPX turned off at compile
- * time..
+ * Check that:
+ * - the operand is not in the kernel half
+ * - the last byte of the operand is not in the user canonical half
*/
- do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, 0, NULL);
+ if (*addr < ~__VIRTUAL_MASK &&
+ *addr + insn.opnd_bytes - 1 > __VIRTUAL_MASK)
+ return GP_NON_CANONICAL;
+#endif
+
+ return GP_CANONICAL;
}
-dotraplinkage void
-do_general_protection(struct pt_regs *regs, long error_code)
+#define GPFSTR "general protection fault"
+
+dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code)
{
- const char *desc = "general protection fault";
+ char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR;
+ enum kernel_gp_hint hint = GP_NO_HINT;
struct task_struct *tsk;
+ unsigned long gp_addr;
+ int ret;
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
cond_local_irq_enable(regs);
@@ -539,48 +513,61 @@ do_general_protection(struct pt_regs *regs, long error_code)
}
tsk = current;
- if (!user_mode(regs)) {
- if (fixup_exception(regs, X86_TRAP_GP, error_code, 0))
- return;
+ if (user_mode(regs)) {
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_GP;
- /*
- * To be potentially processing a kprobe fault and to
- * trust the result from kprobe_running(), we have to
- * be non-preemptible.
- */
- if (!preemptible() && kprobe_running() &&
- kprobe_fault_handler(regs, X86_TRAP_GP))
- return;
+ show_signal(tsk, SIGSEGV, "", desc, regs, error_code);
+ force_sig(SIGSEGV);
- if (notify_die(DIE_GPF, desc, regs, error_code,
- X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
- die(desc, regs, error_code);
return;
}
+ if (fixup_exception(regs, X86_TRAP_GP, error_code, 0))
+ return;
+
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_GP;
- show_signal(tsk, SIGSEGV, "", desc, regs, error_code);
+ /*
+ * To be potentially processing a kprobe fault and to trust the result
+ * from kprobe_running(), we have to be non-preemptible.
+ */
+ if (!preemptible() &&
+ kprobe_running() &&
+ kprobe_fault_handler(regs, X86_TRAP_GP))
+ return;
+
+ ret = notify_die(DIE_GPF, desc, regs, error_code, X86_TRAP_GP, SIGSEGV);
+ if (ret == NOTIFY_STOP)
+ return;
+
+ if (error_code)
+ snprintf(desc, sizeof(desc), "segment-related " GPFSTR);
+ else
+ hint = get_kernel_gp_address(regs, &gp_addr);
+
+ if (hint != GP_NO_HINT)
+ snprintf(desc, sizeof(desc), GPFSTR ", %s 0x%lx",
+ (hint == GP_NON_CANONICAL) ? "probably for non-canonical address"
+ : "maybe for address",
+ gp_addr);
+
+ /*
+ * KASAN is interested only in the non-canonical case, clear it
+ * otherwise.
+ */
+ if (hint != GP_NON_CANONICAL)
+ gp_addr = 0;
+
+ die_addr(desc, regs, error_code, gp_addr);
- force_sig(SIGSEGV);
}
NOKPROBE_SYMBOL(do_general_protection);
dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
{
-#ifdef CONFIG_DYNAMIC_FTRACE
- /*
- * ftrace must be first, everything else may cause a recursive crash.
- * See note by declaration of modifying_ftrace_code in ftrace.c
- */
- if (unlikely(atomic_read(&modifying_ftrace_code)) &&
- ftrace_int3_handler(regs))
- return;
-#endif
if (poke_int3_handler(regs))
return;
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index b8acf639abd1..32a818764e03 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -233,7 +233,6 @@ static cycles_t check_tsc_warp(unsigned int timeout)
* The measurement runs for 'timeout' msecs:
*/
end = start + (cycles_t) tsc_khz * timeout;
- now = start;
for (i = 0; ; i++) {
/*
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 332ae6530fa8..e9cc182aa97e 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -187,6 +187,8 @@ static struct orc_entry *orc_find(unsigned long ip)
return orc_ftrace_find(ip);
}
+#ifdef CONFIG_MODULES
+
static void orc_sort_swap(void *_a, void *_b, int size)
{
struct orc_entry *orc_a, *orc_b;
@@ -229,7 +231,6 @@ static int orc_sort_cmp(const void *_a, const void *_b)
return orc_a->sp_reg == ORC_REG_UNDEFINED && !orc_a->end ? -1 : 1;
}
-#ifdef CONFIG_MODULES
void unwind_module_init(struct module *mod, void *_orc_ip, size_t orc_ip_size,
void *_orc, size_t orc_size)
{
@@ -273,9 +274,11 @@ void __init unwind_init(void)
return;
}
- /* Sort the .orc_unwind and .orc_unwind_ip tables: */
- sort(__start_orc_unwind_ip, num_entries, sizeof(int), orc_sort_cmp,
- orc_sort_swap);
+ /*
+ * Note, the orc_unwind and orc_unwind_ip tables were already
+ * sorted at build time via the 'sorttable' tool.
+ * It's ready for binary search straight away, no need to sort it.
+ */
/* Initialize the fast lookup table: */
lookup_num_blocks = orc_lookup_end - orc_lookup;
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index a76c12b38e92..91d55454e702 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -381,7 +381,6 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
mark_screen_rdonly(tsk->mm);
memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs));
- force_iret();
return regs->ax;
}
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 3a1a819da137..e3296aa028fe 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -193,12 +193,10 @@ SECTIONS
__vvar_beginning_hack = .;
/* Place all vvars at the offsets in asm/vvar.h. */
-#define EMIT_VVAR(name, offset) \
+#define EMIT_VVAR(name, offset) \
. = __vvar_beginning_hack + offset; \
*(.vvar_ ## name)
-#define __VVAR_KERNEL_LDS
#include <asm/vvar.h>
-#undef __VVAR_KERNEL_LDS
#undef EMIT_VVAR
/*
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index ce89430a7f80..85f1a90c55cd 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -20,7 +20,7 @@
#include <asm/irq.h>
#include <asm/io_apic.h>
#include <asm/hpet.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include <asm/tsc.h>
#include <asm/iommu.h>
#include <asm/mach_traps.h>
@@ -80,6 +80,7 @@ struct x86_init_ops x86_init __initdata = {
.pre_vector_init = init_ISA_irqs,
.intr_init = native_init_IRQ,
.trap_init = x86_init_noop,
+ .intr_mode_select = apic_intr_mode_select,
.intr_mode_init = apic_intr_mode_init
},
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 840e12583b85..9fea0757db92 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -59,14 +59,25 @@ config KVM
If unsure, say N.
+config KVM_WERROR
+ bool "Compile KVM with -Werror"
+ # KASAN may cause the build to fail due to larger frames
+ default y if X86_64 && !KASAN
+ # We use the dependency on !COMPILE_TEST to not be enabled
+ # blindly in allmodconfig or allyesconfig configurations
+ depends on (X86_64 && !KASAN) || !COMPILE_TEST
+ depends on EXPERT
+ help
+ Add -Werror to the build flags for KVM.
+
+ If in doubt, say "N".
+
config KVM_INTEL
- tristate "KVM for Intel processors support"
- depends on KVM
- # for perf_guest_get_msrs():
- depends on CPU_SUP_INTEL
+ tristate "KVM for Intel (and compatible) processors support"
+ depends on KVM && IA32_FEAT_CTL
---help---
- Provides support for KVM on Intel processors equipped with the VT
- extensions.
+ Provides support for KVM on processors equipped with Intel's VT
+ extensions, a.k.a. Virtual Machine Extensions (VMX).
To compile this as a module, choose M here: the module
will be called kvm-intel.
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index b19ef421084d..e553f0fdd87d 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
ccflags-y += -Iarch/x86/kvm
+ccflags-$(CONFIG_KVM_WERROR) += -Werror
KVM := ../../../virt/kvm
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index cf55629ff0ff..b1c469446b07 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -62,7 +62,7 @@ u64 kvm_supported_xcr0(void)
return xcr0;
}
-#define F(x) bit(X86_FEATURE_##x)
+#define F feature_bit
int kvm_update_cpuid(struct kvm_vcpu *vcpu)
{
@@ -281,8 +281,9 @@ out:
return r;
}
-static void cpuid_mask(u32 *word, int wordnum)
+static __always_inline void cpuid_mask(u32 *word, int wordnum)
{
+ reverse_cpuid_check(wordnum);
*word &= boot_cpu_data.x86_capability[wordnum];
}
@@ -352,6 +353,7 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index)
unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0;
unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0;
unsigned f_la57;
+ unsigned f_pku = kvm_x86_ops->pku_supported() ? F(PKU) : 0;
/* cpuid 7.0.ebx */
const u32 kvm_cpuid_7_0_ebx_x86_features =
@@ -363,7 +365,7 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index)
/* cpuid 7.0.ecx*/
const u32 kvm_cpuid_7_0_ecx_x86_features =
- F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) |
+ F(AVX512VBMI) | F(LA57) | 0 /*PKU*/ | 0 /*OSPKE*/ | F(RDPID) |
F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/;
@@ -392,6 +394,7 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index)
/* Set LA57 based on hardware capability. */
entry->ecx |= f_la57;
entry->ecx |= f_umip;
+ entry->ecx |= f_pku;
/* PKU is not yet implemented for shadow paging. */
if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
entry->ecx &= ~F(PKU);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index d78a61408243..7366c618aa04 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -53,15 +53,46 @@ static const struct cpuid_reg reverse_cpuid[] = {
[CPUID_7_ECX] = { 7, 0, CPUID_ECX},
[CPUID_8000_0007_EBX] = {0x80000007, 0, CPUID_EBX},
[CPUID_7_EDX] = { 7, 0, CPUID_EDX},
+ [CPUID_7_1_EAX] = { 7, 1, CPUID_EAX},
};
-static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature)
+/*
+ * Reverse CPUID and its derivatives can only be used for hardware-defined
+ * feature words, i.e. words whose bits directly correspond to a CPUID leaf.
+ * Retrieving a feature bit or masking guest CPUID from a Linux-defined word
+ * is nonsensical as the bit number/mask is an arbitrary software-defined value
+ * and can't be used by KVM to query/control guest capabilities. And obviously
+ * the leaf being queried must have an entry in the lookup table.
+ */
+static __always_inline void reverse_cpuid_check(unsigned x86_leaf)
{
- unsigned x86_leaf = x86_feature / 32;
-
+ BUILD_BUG_ON(x86_leaf == CPUID_LNX_1);
+ BUILD_BUG_ON(x86_leaf == CPUID_LNX_2);
+ BUILD_BUG_ON(x86_leaf == CPUID_LNX_3);
+ BUILD_BUG_ON(x86_leaf == CPUID_LNX_4);
BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid));
BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0);
+}
+
+/*
+ * Retrieve the bit mask from an X86_FEATURE_* definition. Features contain
+ * the hardware defined bit number (stored in bits 4:0) and a software defined
+ * "word" (stored in bits 31:5). The word is used to index into arrays of
+ * bit masks that hold the per-cpu feature capabilities, e.g. this_cpu_has().
+ */
+static __always_inline u32 __feature_bit(int x86_feature)
+{
+ reverse_cpuid_check(x86_feature / 32);
+ return 1 << (x86_feature & 31);
+}
+#define feature_bit(name) __feature_bit(X86_FEATURE_##name)
+
+static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature)
+{
+ unsigned x86_leaf = x86_feature / 32;
+
+ reverse_cpuid_check(x86_leaf);
return reverse_cpuid[x86_leaf];
}
@@ -93,15 +124,11 @@ static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, unsigned x86_
{
int *reg;
- if (x86_feature == X86_FEATURE_XSAVE &&
- !static_cpu_has(X86_FEATURE_XSAVE))
- return false;
-
reg = guest_cpuid_get_register(vcpu, x86_feature);
if (!reg)
return false;
- return *reg & bit(x86_feature);
+ return *reg & __feature_bit(x86_feature);
}
static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu, unsigned x86_feature)
@@ -110,7 +137,7 @@ static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu, unsigned x8
reg = guest_cpuid_get_register(vcpu, x86_feature);
if (reg)
- *reg &= ~bit(x86_feature);
+ *reg &= ~__feature_bit(x86_feature);
}
static inline bool guest_cpuid_is_amd(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 952d1a4f4d7e..bc00642e5d3b 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -22,6 +22,7 @@
#include "kvm_cache_regs.h"
#include <asm/kvm_emulate.h>
#include <linux/stringify.h>
+#include <asm/fpu/api.h>
#include <asm/debugreg.h>
#include <asm/nospec-branch.h>
@@ -190,25 +191,6 @@
#define NR_FASTOP (ilog2(sizeof(ulong)) + 1)
#define FASTOP_SIZE 8
-/*
- * fastop functions have a special calling convention:
- *
- * dst: rax (in/out)
- * src: rdx (in/out)
- * src2: rcx (in)
- * flags: rflags (in/out)
- * ex: rsi (in:fastop pointer, out:zero if exception)
- *
- * Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for
- * different operand sizes can be reached by calculation, rather than a jump
- * table (which would be bigger than the code).
- *
- * fastop functions are declared as taking a never-defined fastop parameter,
- * so they can't be called from C directly.
- */
-
-struct fastop;
-
struct opcode {
u64 flags : 56;
u64 intercept : 8;
@@ -310,7 +292,20 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt)
#define ON64(x)
#endif
-static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
+/*
+ * fastop functions have a special calling convention:
+ *
+ * dst: rax (in/out)
+ * src: rdx (in/out)
+ * src2: rcx (in)
+ * flags: rflags (in/out)
+ * ex: rsi (in:fastop pointer, out:zero if exception)
+ *
+ * Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for
+ * different operand sizes can be reached by calculation, rather than a jump
+ * table (which would be bigger than the code).
+ */
+static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop);
#define __FOP_FUNC(name) \
".align " __stringify(FASTOP_SIZE) " \n\t" \
@@ -1075,8 +1070,23 @@ static void fetch_register_operand(struct operand *op)
}
}
-static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg)
+static void emulator_get_fpu(void)
+{
+ fpregs_lock();
+
+ fpregs_assert_state_consistent();
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ switch_fpu_return();
+}
+
+static void emulator_put_fpu(void)
+{
+ fpregs_unlock();
+}
+
+static void read_sse_reg(sse128_t *data, int reg)
{
+ emulator_get_fpu();
switch (reg) {
case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break;
case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break;
@@ -1098,11 +1108,12 @@ static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg)
#endif
default: BUG();
}
+ emulator_put_fpu();
}
-static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
- int reg)
+static void write_sse_reg(sse128_t *data, int reg)
{
+ emulator_get_fpu();
switch (reg) {
case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break;
case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break;
@@ -1124,10 +1135,12 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
#endif
default: BUG();
}
+ emulator_put_fpu();
}
-static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
+static void read_mmx_reg(u64 *data, int reg)
{
+ emulator_get_fpu();
switch (reg) {
case 0: asm("movq %%mm0, %0" : "=m"(*data)); break;
case 1: asm("movq %%mm1, %0" : "=m"(*data)); break;
@@ -1139,10 +1152,12 @@ static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
case 7: asm("movq %%mm7, %0" : "=m"(*data)); break;
default: BUG();
}
+ emulator_put_fpu();
}
-static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
+static void write_mmx_reg(u64 *data, int reg)
{
+ emulator_get_fpu();
switch (reg) {
case 0: asm("movq %0, %%mm0" : : "m"(*data)); break;
case 1: asm("movq %0, %%mm1" : : "m"(*data)); break;
@@ -1154,6 +1169,7 @@ static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
case 7: asm("movq %0, %%mm7" : : "m"(*data)); break;
default: BUG();
}
+ emulator_put_fpu();
}
static int em_fninit(struct x86_emulate_ctxt *ctxt)
@@ -1161,7 +1177,9 @@ static int em_fninit(struct x86_emulate_ctxt *ctxt)
if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
return emulate_nm(ctxt);
+ emulator_get_fpu();
asm volatile("fninit");
+ emulator_put_fpu();
return X86EMUL_CONTINUE;
}
@@ -1172,7 +1190,9 @@ static int em_fnstcw(struct x86_emulate_ctxt *ctxt)
if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
return emulate_nm(ctxt);
+ emulator_get_fpu();
asm volatile("fnstcw %0": "+m"(fcw));
+ emulator_put_fpu();
ctxt->dst.val = fcw;
@@ -1186,7 +1206,9 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt)
if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
return emulate_nm(ctxt);
+ emulator_get_fpu();
asm volatile("fnstsw %0": "+m"(fsw));
+ emulator_put_fpu();
ctxt->dst.val = fsw;
@@ -1205,7 +1227,7 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
op->type = OP_XMM;
op->bytes = 16;
op->addr.xmm = reg;
- read_sse_reg(ctxt, &op->vec_val, reg);
+ read_sse_reg(&op->vec_val, reg);
return;
}
if (ctxt->d & Mmx) {
@@ -1256,7 +1278,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
op->type = OP_XMM;
op->bytes = 16;
op->addr.xmm = ctxt->modrm_rm;
- read_sse_reg(ctxt, &op->vec_val, ctxt->modrm_rm);
+ read_sse_reg(&op->vec_val, ctxt->modrm_rm);
return rc;
}
if (ctxt->d & Mmx) {
@@ -1833,10 +1855,10 @@ static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
op->bytes * op->count);
break;
case OP_XMM:
- write_sse_reg(ctxt, &op->vec_val, op->addr.xmm);
+ write_sse_reg(&op->vec_val, op->addr.xmm);
break;
case OP_MM:
- write_mmx_reg(ctxt, &op->mm_val, op->addr.mm);
+ write_mmx_reg(&op->mm_val, op->addr.mm);
break;
case OP_NONE:
/* no writeback */
@@ -2348,12 +2370,7 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt)
static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt)
{
#ifdef CONFIG_X86_64
- u32 eax, ebx, ecx, edx;
-
- eax = 0x80000001;
- ecx = 0;
- ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
- return edx & bit(X86_FEATURE_LM);
+ return ctxt->ops->guest_has_long_mode(ctxt);
#else
return false;
#endif
@@ -3618,18 +3635,11 @@ static int em_mov(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
-#define FFL(x) bit(X86_FEATURE_##x)
-
static int em_movbe(struct x86_emulate_ctxt *ctxt)
{
- u32 ebx, ecx, edx, eax = 1;
u16 tmp;
- /*
- * Check MOVBE is set in the guest-visible CPUID leaf.
- */
- ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
- if (!(ecx & FFL(MOVBE)))
+ if (!ctxt->ops->guest_has_movbe(ctxt))
return emulate_ud(ctxt);
switch (ctxt->op_bytes) {
@@ -4027,10 +4037,7 @@ static int em_movsxd(struct x86_emulate_ctxt *ctxt)
static int check_fxsr(struct x86_emulate_ctxt *ctxt)
{
- u32 eax = 1, ebx, ecx = 0, edx;
-
- ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
- if (!(edx & FFL(FXSR)))
+ if (!ctxt->ops->guest_has_fxsr(ctxt))
return emulate_ud(ctxt);
if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
@@ -4092,8 +4099,12 @@ static int em_fxsave(struct x86_emulate_ctxt *ctxt)
if (rc != X86EMUL_CONTINUE)
return rc;
+ emulator_get_fpu();
+
rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state));
+ emulator_put_fpu();
+
if (rc != X86EMUL_CONTINUE)
return rc;
@@ -4136,6 +4147,8 @@ static int em_fxrstor(struct x86_emulate_ctxt *ctxt)
if (rc != X86EMUL_CONTINUE)
return rc;
+ emulator_get_fpu();
+
if (size < __fxstate_size(16)) {
rc = fxregs_fixup(&fx_state, size);
if (rc != X86EMUL_CONTINUE)
@@ -4151,6 +4164,8 @@ static int em_fxrstor(struct x86_emulate_ctxt *ctxt)
rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state));
out:
+ emulator_put_fpu();
+
return rc;
}
@@ -5158,6 +5173,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
ctxt->fetch.ptr = ctxt->fetch.data;
ctxt->fetch.end = ctxt->fetch.data + insn_len;
ctxt->opcode_len = 1;
+ ctxt->intercept = x86_intercept_none;
if (insn_len > 0)
memcpy(ctxt->fetch.data, insn, insn_len);
else {
@@ -5210,16 +5226,28 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
ctxt->ad_bytes = def_ad_bytes ^ 6;
break;
case 0x26: /* ES override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_ES;
+ break;
case 0x2e: /* CS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_CS;
+ break;
case 0x36: /* SS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_SS;
+ break;
case 0x3e: /* DS override */
has_seg_override = true;
- ctxt->seg_override = (ctxt->b >> 3) & 3;
+ ctxt->seg_override = VCPU_SREG_DS;
break;
case 0x64: /* FS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_FS;
+ break;
case 0x65: /* GS override */
has_seg_override = true;
- ctxt->seg_override = ctxt->b & 7;
+ ctxt->seg_override = VCPU_SREG_GS;
break;
case 0x40 ... 0x4f: /* REX */
if (mode != X86EMUL_MODE_PROT64)
@@ -5303,10 +5331,15 @@ done_prefixes:
}
break;
case Escape:
- if (ctxt->modrm > 0xbf)
- opcode = opcode.u.esc->high[ctxt->modrm - 0xc0];
- else
+ if (ctxt->modrm > 0xbf) {
+ size_t size = ARRAY_SIZE(opcode.u.esc->high);
+ u32 index = array_index_nospec(
+ ctxt->modrm - 0xc0, size);
+
+ opcode = opcode.u.esc->high[index];
+ } else {
opcode = opcode.u.esc->op[(ctxt->modrm >> 3) & 7];
+ }
break;
case InstrDual:
if ((ctxt->modrm >> 6) == 3)
@@ -5448,7 +5481,9 @@ static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt)
{
int rc;
+ emulator_get_fpu();
rc = asm_safe("fwait");
+ emulator_put_fpu();
if (unlikely(rc != X86EMUL_CONTINUE))
return emulate_exception(ctxt, MF_VECTOR, 0, false);
@@ -5456,14 +5491,13 @@ static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt)
return X86EMUL_CONTINUE;
}
-static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt,
- struct operand *op)
+static void fetch_possible_mmx_operand(struct operand *op)
{
if (op->type == OP_MM)
- read_mmx_reg(ctxt, &op->mm_val, op->addr.mm);
+ read_mmx_reg(&op->mm_val, op->addr.mm);
}
-static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *))
+static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop)
{
ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF;
@@ -5539,10 +5573,10 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
* Now that we know the fpu is exception safe, we can fetch
* operands from it.
*/
- fetch_possible_mmx_operand(ctxt, &ctxt->src);
- fetch_possible_mmx_operand(ctxt, &ctxt->src2);
+ fetch_possible_mmx_operand(&ctxt->src);
+ fetch_possible_mmx_operand(&ctxt->src2);
if (!(ctxt->d & Mov))
- fetch_possible_mmx_operand(ctxt, &ctxt->dst);
+ fetch_possible_mmx_operand(&ctxt->dst);
}
if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) {
@@ -5641,14 +5675,10 @@ special_insn:
ctxt->eflags &= ~X86_EFLAGS_RF;
if (ctxt->execute) {
- if (ctxt->d & Fastop) {
- void (*fop)(struct fastop *) = (void *)ctxt->execute;
- rc = fastop(ctxt, fop);
- if (rc != X86EMUL_CONTINUE)
- goto done;
- goto writeback;
- }
- rc = ctxt->execute(ctxt);
+ if (ctxt->d & Fastop)
+ rc = fastop(ctxt, ctxt->fop);
+ else
+ rc = ctxt->execute(ctxt);
if (rc != X86EMUL_CONTINUE)
goto done;
goto writeback;
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 23ff65504d7e..a86fda7a1d03 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -33,6 +33,7 @@
#include <trace/events/kvm.h>
#include "trace.h"
+#include "irq.h"
#define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, 64)
@@ -775,9 +776,10 @@ int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages)
/*
* Hyper-V SynIC auto EOI SINT's are
- * not compatible with APICV, so deactivate APICV
+ * not compatible with APICV, so request
+ * to deactivate APICV permanently.
*/
- kvm_vcpu_deactivate_apicv(vcpu);
+ kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_HYPERV);
synic->active = true;
synic->dont_zero_synic_pages = dont_zero_synic_pages;
return 0;
@@ -809,11 +811,12 @@ static int kvm_hv_msr_get_crash_data(struct kvm_vcpu *vcpu,
u32 index, u64 *pdata)
{
struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+ size_t size = ARRAY_SIZE(hv->hv_crash_param);
- if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param)))
+ if (WARN_ON_ONCE(index >= size))
return -EINVAL;
- *pdata = hv->hv_crash_param[index];
+ *pdata = hv->hv_crash_param[array_index_nospec(index, size)];
return 0;
}
@@ -852,11 +855,12 @@ static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu,
u32 index, u64 data)
{
struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+ size_t size = ARRAY_SIZE(hv->hv_crash_param);
- if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param)))
+ if (WARN_ON_ONCE(index >= size))
return -EINVAL;
- hv->hv_crash_param[index] = data;
+ hv->hv_crash_param[array_index_nospec(index, size)] = data;
return 0;
}
@@ -1058,7 +1062,7 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
return 1;
break;
default:
- vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
+ vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n",
msr, data);
return 1;
}
@@ -1121,7 +1125,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
return 1;
/*
- * Clear apic_assist portion of f(struct hv_vp_assist_page
+ * Clear apic_assist portion of struct hv_vp_assist_page
* only, there can be valuable data in the rest which needs
* to be preserved e.g. on migration.
*/
@@ -1178,7 +1182,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
return 1;
break;
default:
- vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
+ vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n",
msr, data);
return 1;
}
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 4a6dc54cc12b..b24c606ac04b 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -295,12 +295,24 @@ void kvm_pit_set_reinject(struct kvm_pit *pit, bool reinject)
if (atomic_read(&ps->reinject) == reinject)
return;
+ /*
+ * AMD SVM AVIC accelerates EOI write and does not trap.
+ * This cause in-kernel PIT re-inject mode to fail
+ * since it checks ps->irq_ack before kvm_set_irq()
+ * and relies on the ack notifier to timely queue
+ * the pt->worker work iterm and reinject the missed tick.
+ * So, deactivate APICv when PIT is in reinject mode.
+ */
if (reinject) {
+ kvm_request_apicv_update(kvm, false,
+ APICV_INHIBIT_REASON_PIT_REINJ);
/* The initial state is preserved while ps->reinject == 0. */
kvm_pit_reset_reinject(pit);
kvm_register_irq_ack_notifier(kvm, &ps->irq_ack_notifier);
kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
} else {
+ kvm_request_apicv_update(kvm, true,
+ APICV_INHIBIT_REASON_PIT_REINJ);
kvm_unregister_irq_ack_notifier(kvm, &ps->irq_ack_notifier);
kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
}
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 8b38bb4868a6..629a09ca9860 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -460,10 +460,14 @@ static int picdev_write(struct kvm_pic *s,
switch (addr) {
case 0x20:
case 0x21:
+ pic_lock(s);
+ pic_ioport_write(&s->pics[0], addr, data);
+ pic_unlock(s);
+ break;
case 0xa0:
case 0xa1:
pic_lock(s);
- pic_ioport_write(&s->pics[addr >> 7], addr, data);
+ pic_ioport_write(&s->pics[1], addr, data);
pic_unlock(s);
break;
case 0x4d0:
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 9fd2dd89a1c5..750ff0b29404 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -36,6 +36,7 @@
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/export.h>
+#include <linux/nospec.h>
#include <asm/processor.h>
#include <asm/page.h>
#include <asm/current.h>
@@ -48,6 +49,11 @@
static int ioapic_service(struct kvm_ioapic *vioapic, int irq,
bool line_status);
+static void kvm_ioapic_update_eoi_one(struct kvm_vcpu *vcpu,
+ struct kvm_ioapic *ioapic,
+ int trigger_mode,
+ int pin);
+
static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
unsigned long addr,
unsigned long length)
@@ -68,13 +74,14 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
default:
{
u32 redir_index = (ioapic->ioregsel - 0x10) >> 1;
- u64 redir_content;
+ u64 redir_content = ~0ULL;
+
+ if (redir_index < IOAPIC_NUM_PINS) {
+ u32 index = array_index_nospec(
+ redir_index, IOAPIC_NUM_PINS);
- if (redir_index < IOAPIC_NUM_PINS)
- redir_content =
- ioapic->redirtbl[redir_index].bits;
- else
- redir_content = ~0ULL;
+ redir_content = ioapic->redirtbl[index].bits;
+ }
result = (ioapic->ioregsel & 0x1) ?
(redir_content >> 32) & 0xffffffff :
@@ -108,8 +115,9 @@ static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu)
union kvm_ioapic_redirect_entry *e;
e = &ioapic->redirtbl[RTC_GSI];
- if (!kvm_apic_match_dest(vcpu, NULL, 0, e->fields.dest_id,
- e->fields.dest_mode))
+ if (!kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT,
+ e->fields.dest_id,
+ kvm_lapic_irq_dest_mode(!!e->fields.dest_mode)))
return;
new_val = kvm_apic_pending_eoi(vcpu, e->fields.vector);
@@ -151,10 +159,16 @@ static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic)
__rtc_irq_eoi_tracking_restore_one(vcpu);
}
-static void rtc_irq_eoi(struct kvm_ioapic *ioapic, struct kvm_vcpu *vcpu)
+static void rtc_irq_eoi(struct kvm_ioapic *ioapic, struct kvm_vcpu *vcpu,
+ int vector)
{
- if (test_and_clear_bit(vcpu->vcpu_id,
- ioapic->rtc_status.dest_map.map)) {
+ struct dest_map *dest_map = &ioapic->rtc_status.dest_map;
+
+ /* RTC special handling */
+ if (test_bit(vcpu->vcpu_id, dest_map->map) &&
+ (vector == dest_map->vectors[vcpu->vcpu_id]) &&
+ (test_and_clear_bit(vcpu->vcpu_id,
+ ioapic->rtc_status.dest_map.map))) {
--ioapic->rtc_status.pending_eoi;
rtc_status_pending_eoi_check_valid(ioapic);
}
@@ -168,6 +182,31 @@ static bool rtc_irq_check_coalesced(struct kvm_ioapic *ioapic)
return false;
}
+static void ioapic_lazy_update_eoi(struct kvm_ioapic *ioapic, int irq)
+{
+ int i;
+ struct kvm_vcpu *vcpu;
+ union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq];
+
+ kvm_for_each_vcpu(i, vcpu, ioapic->kvm) {
+ if (!kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT,
+ entry->fields.dest_id,
+ entry->fields.dest_mode) ||
+ kvm_apic_pending_eoi(vcpu, entry->fields.vector))
+ continue;
+
+ /*
+ * If no longer has pending EOI in LAPICs, update
+ * EOI for this vetor.
+ */
+ rtc_irq_eoi(ioapic, vcpu, entry->fields.vector);
+ kvm_ioapic_update_eoi_one(vcpu, ioapic,
+ entry->fields.trig_mode,
+ irq);
+ break;
+ }
+}
+
static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq,
int irq_level, bool line_status)
{
@@ -186,9 +225,18 @@ static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq,
}
/*
+ * AMD SVM AVIC accelerate EOI write and do not trap,
+ * in-kernel IOAPIC will not be able to receive the EOI.
+ * In this case, we do lazy update of the pending EOI when
+ * trying to set IOAPIC irq.
+ */
+ if (kvm_apicv_activated(ioapic->kvm))
+ ioapic_lazy_update_eoi(ioapic, irq);
+
+ /*
* Return 0 for coalesced interrupts; for edge-triggered interrupts,
* this only happens if a previous edge has not been delivered due
- * do masking. For level interrupts, the remote_irr field tells
+ * to masking. For level interrupts, the remote_irr field tells
* us if the interrupt is waiting for an EOI.
*
* RTC is special: it is edge-triggered, but userspace likes to know
@@ -250,8 +298,10 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors)
if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG ||
kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) ||
index == RTC_GSI) {
- if (kvm_apic_match_dest(vcpu, NULL, 0,
- e->fields.dest_id, e->fields.dest_mode) ||
+ u16 dm = kvm_lapic_irq_dest_mode(!!e->fields.dest_mode);
+
+ if (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT,
+ e->fields.dest_id, dm) ||
kvm_apic_pending_eoi(vcpu, e->fields.vector))
__set_bit(e->fields.vector,
ioapic_handled_vectors);
@@ -292,6 +342,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
if (index >= IOAPIC_NUM_PINS)
return;
+ index = array_index_nospec(index, IOAPIC_NUM_PINS);
e = &ioapic->redirtbl[index];
mask_before = e->fields.mask;
/* Preserve read-only fields */
@@ -327,11 +378,15 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
if (e->fields.delivery_mode == APIC_DM_FIXED) {
struct kvm_lapic_irq irq;
- irq.shorthand = 0;
irq.vector = e->fields.vector;
irq.delivery_mode = e->fields.delivery_mode << 8;
+ irq.dest_mode =
+ kvm_lapic_irq_dest_mode(!!e->fields.dest_mode);
+ irq.level = false;
+ irq.trig_mode = e->fields.trig_mode;
+ irq.shorthand = APIC_DEST_NOSHORT;
irq.dest_id = e->fields.dest_id;
- irq.dest_mode = e->fields.dest_mode;
+ irq.msi_redir_hint = false;
bitmap_zero(&vcpu_bitmap, 16);
kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq,
&vcpu_bitmap);
@@ -343,7 +398,9 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
* keep ioapic_handled_vectors synchronized.
*/
irq.dest_id = old_dest_id;
- irq.dest_mode = old_dest_mode;
+ irq.dest_mode =
+ kvm_lapic_irq_dest_mode(
+ !!e->fields.dest_mode);
kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq,
&vcpu_bitmap);
}
@@ -369,11 +426,11 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
irqe.dest_id = entry->fields.dest_id;
irqe.vector = entry->fields.vector;
- irqe.dest_mode = entry->fields.dest_mode;
+ irqe.dest_mode = kvm_lapic_irq_dest_mode(!!entry->fields.dest_mode);
irqe.trig_mode = entry->fields.trig_mode;
irqe.delivery_mode = entry->fields.delivery_mode << 8;
irqe.level = 1;
- irqe.shorthand = 0;
+ irqe.shorthand = APIC_DEST_NOSHORT;
irqe.msi_redir_hint = false;
if (irqe.trig_mode == IOAPIC_EDGE_TRIG)
@@ -445,72 +502,68 @@ static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
}
#define IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT 10000
-
-static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
- struct kvm_ioapic *ioapic, int vector, int trigger_mode)
+static void kvm_ioapic_update_eoi_one(struct kvm_vcpu *vcpu,
+ struct kvm_ioapic *ioapic,
+ int trigger_mode,
+ int pin)
{
- struct dest_map *dest_map = &ioapic->rtc_status.dest_map;
struct kvm_lapic *apic = vcpu->arch.apic;
- int i;
+ union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[pin];
- /* RTC special handling */
- if (test_bit(vcpu->vcpu_id, dest_map->map) &&
- vector == dest_map->vectors[vcpu->vcpu_id])
- rtc_irq_eoi(ioapic, vcpu);
-
- for (i = 0; i < IOAPIC_NUM_PINS; i++) {
- union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
-
- if (ent->fields.vector != vector)
- continue;
-
- /*
- * We are dropping lock while calling ack notifiers because ack
- * notifier callbacks for assigned devices call into IOAPIC
- * recursively. Since remote_irr is cleared only after call
- * to notifiers if the same vector will be delivered while lock
- * is dropped it will be put into irr and will be delivered
- * after ack notifier returns.
- */
- spin_unlock(&ioapic->lock);
- kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, i);
- spin_lock(&ioapic->lock);
+ /*
+ * We are dropping lock while calling ack notifiers because ack
+ * notifier callbacks for assigned devices call into IOAPIC
+ * recursively. Since remote_irr is cleared only after call
+ * to notifiers if the same vector will be delivered while lock
+ * is dropped it will be put into irr and will be delivered
+ * after ack notifier returns.
+ */
+ spin_unlock(&ioapic->lock);
+ kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, pin);
+ spin_lock(&ioapic->lock);
- if (trigger_mode != IOAPIC_LEVEL_TRIG ||
- kvm_lapic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)
- continue;
+ if (trigger_mode != IOAPIC_LEVEL_TRIG ||
+ kvm_lapic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)
+ return;
- ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
- ent->fields.remote_irr = 0;
- if (!ent->fields.mask && (ioapic->irr & (1 << i))) {
- ++ioapic->irq_eoi[i];
- if (ioapic->irq_eoi[i] == IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) {
- /*
- * Real hardware does not deliver the interrupt
- * immediately during eoi broadcast, and this
- * lets a buggy guest make slow progress
- * even if it does not correctly handle a
- * level-triggered interrupt. Emulate this
- * behavior if we detect an interrupt storm.
- */
- schedule_delayed_work(&ioapic->eoi_inject, HZ / 100);
- ioapic->irq_eoi[i] = 0;
- trace_kvm_ioapic_delayed_eoi_inj(ent->bits);
- } else {
- ioapic_service(ioapic, i, false);
- }
+ ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
+ ent->fields.remote_irr = 0;
+ if (!ent->fields.mask && (ioapic->irr & (1 << pin))) {
+ ++ioapic->irq_eoi[pin];
+ if (ioapic->irq_eoi[pin] == IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) {
+ /*
+ * Real hardware does not deliver the interrupt
+ * immediately during eoi broadcast, and this
+ * lets a buggy guest make slow progress
+ * even if it does not correctly handle a
+ * level-triggered interrupt. Emulate this
+ * behavior if we detect an interrupt storm.
+ */
+ schedule_delayed_work(&ioapic->eoi_inject, HZ / 100);
+ ioapic->irq_eoi[pin] = 0;
+ trace_kvm_ioapic_delayed_eoi_inj(ent->bits);
} else {
- ioapic->irq_eoi[i] = 0;
+ ioapic_service(ioapic, pin, false);
}
+ } else {
+ ioapic->irq_eoi[pin] = 0;
}
}
void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode)
{
+ int i;
struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
spin_lock(&ioapic->lock);
- __kvm_ioapic_update_eoi(vcpu, ioapic, vector, trigger_mode);
+ rtc_irq_eoi(ioapic, vcpu, vector);
+ for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+ union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
+
+ if (ent->fields.vector != vector)
+ continue;
+ kvm_ioapic_update_eoi_one(vcpu, ioapic, trigger_mode, i);
+ }
spin_unlock(&ioapic->lock);
}
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index ea1a4e0297da..2fb2e3c80724 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -116,9 +116,6 @@ static inline int ioapic_in_kernel(struct kvm *kvm)
}
void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
-bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
- int short_hand, unsigned int dest, int dest_mode);
-int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector,
int trigger_mode);
int kvm_ioapic_init(struct kvm *kvm);
@@ -126,9 +123,6 @@ void kvm_ioapic_destroy(struct kvm *kvm);
int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
int level, bool line_status);
void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id);
-int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
- struct kvm_lapic_irq *irq,
- struct dest_map *dest_map);
void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 7c6233d37c64..f173ab6b407e 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -113,5 +113,8 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu);
int kvm_setup_default_irq_routing(struct kvm *kvm);
int kvm_setup_empty_irq_routing(struct kvm *kvm);
+int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq,
+ struct dest_map *dest_map);
#endif
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 8ecd48d31800..c47d2acec529 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -52,15 +52,15 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
unsigned long dest_vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)];
unsigned int dest_vcpus = 0;
- if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
- kvm_lowest_prio_delivery(irq)) {
+ if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
+ return r;
+
+ if (irq->dest_mode == APIC_DEST_PHYSICAL &&
+ irq->dest_id == 0xff && kvm_lowest_prio_delivery(irq)) {
printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
irq->delivery_mode = APIC_DM_FIXED;
}
- if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
- return r;
-
memset(dest_vcpu_bitmap, 0, sizeof(dest_vcpu_bitmap));
kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -114,13 +114,14 @@ void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
irq->dest_id |= MSI_ADDR_EXT_DEST_ID(e->msi.address_hi);
irq->vector = (e->msi.data &
MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT;
- irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo;
+ irq->dest_mode = kvm_lapic_irq_dest_mode(
+ !!((1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo));
irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data;
irq->delivery_mode = e->msi.data & 0x700;
irq->msi_redir_hint = ((e->msi.address_lo
& MSI_ADDR_REDIRECTION_LOWPRI) > 0);
irq->level = 1;
- irq->shorthand = 0;
+ irq->shorthand = APIC_DEST_NOSHORT;
}
EXPORT_SYMBOL_GPL(kvm_set_msi_irq);
@@ -416,7 +417,8 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
kvm_set_msi_irq(vcpu->kvm, entry, &irq);
- if (irq.level && kvm_apic_match_dest(vcpu, NULL, 0,
+ if (irq.trig_mode &&
+ kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT,
irq.dest_id, irq.dest_mode))
__set_bit(irq.vector, ioapic_handled_vectors);
}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index cf9177b4a07f..e3099c642fec 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -56,9 +56,6 @@
#define APIC_VERSION (0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16))
#define LAPIC_MMIO_LENGTH (1 << 12)
/* followed define is not in apicdef.h */
-#define APIC_SHORT_MASK 0xc0000
-#define APIC_DEST_NOSHORT 0x0
-#define APIC_DEST_MASK 0x800
#define MAX_APIC_VECTOR 256
#define APIC_VECTORS_PER_REG 32
@@ -630,9 +627,11 @@ static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu)
static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu)
{
u8 val;
- if (pv_eoi_get_user(vcpu, &val) < 0)
+ if (pv_eoi_get_user(vcpu, &val) < 0) {
printk(KERN_WARNING "Can't read EOI MSR value: 0x%llx\n",
(unsigned long long)vcpu->arch.pv_eoi.msr_val);
+ return false;
+ }
return val & 0x1;
}
@@ -792,13 +791,13 @@ static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id,
}
bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
- int short_hand, unsigned int dest, int dest_mode)
+ int shorthand, unsigned int dest, int dest_mode)
{
struct kvm_lapic *target = vcpu->arch.apic;
u32 mda = kvm_apic_mda(vcpu, dest, source, target);
ASSERT(target);
- switch (short_hand) {
+ switch (shorthand) {
case APIC_DEST_NOSHORT:
if (dest_mode == APIC_DEST_PHYSICAL)
return kvm_apic_match_physical_addr(target, mda);
@@ -967,12 +966,12 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
}
/*
- * This routine tries to handler interrupts in posted mode, here is how
+ * This routine tries to handle interrupts in posted mode, here is how
* it deals with different cases:
* - For single-destination interrupts, handle it in posted mode
* - Else if vector hashing is enabled and it is a lowest-priority
* interrupt, handle it in posted mode and use the following mechanism
- * to find the destinaiton vCPU.
+ * to find the destination vCPU.
* 1. For lowest-priority interrupts, store all the possible
* destination vCPUs in an array.
* 2. Use "guest vector % max number of destination vCPUs" to find
@@ -1049,11 +1048,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
apic->regs + APIC_TMR);
}
- if (vcpu->arch.apicv_active)
- kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
- else {
+ if (kvm_x86_ops->deliver_posted_interrupt(vcpu, vector)) {
kvm_lapic_set_irr(vector, apic);
-
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
}
@@ -1083,9 +1079,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
result = 1;
/* assumes that there are only KVM_APIC_INIT/SIPI */
apic->pending_events = (1UL << KVM_APIC_INIT);
- /* make sure pending_events is visible before sending
- * the request */
- smp_wmb();
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
}
@@ -1151,7 +1144,7 @@ void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq,
if (!kvm_apic_present(vcpu))
continue;
if (!kvm_apic_match_dest(vcpu, NULL,
- irq->delivery_mode,
+ irq->shorthand,
irq->dest_id,
irq->dest_mode))
continue;
@@ -1574,9 +1567,9 @@ static void kvm_apic_inject_pending_timer_irqs(struct kvm_lapic *apic)
struct kvm_timer *ktimer = &apic->lapic_timer;
kvm_apic_local_deliver(apic, APIC_LVTT);
- if (apic_lvtt_tscdeadline(apic))
+ if (apic_lvtt_tscdeadline(apic)) {
ktimer->tscdeadline = 0;
- if (apic_lvtt_oneshot(apic)) {
+ } else if (apic_lvtt_oneshot(apic)) {
ktimer->tscdeadline = 0;
ktimer->target_expiration = 0;
}
@@ -1963,15 +1956,20 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
case APIC_LVTTHMR:
case APIC_LVTPC:
case APIC_LVT1:
- case APIC_LVTERR:
+ case APIC_LVTERR: {
/* TODO: Check vector */
+ size_t size;
+ u32 index;
+
if (!kvm_apic_sw_enabled(apic))
val |= APIC_LVT_MASKED;
-
- val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4];
+ size = ARRAY_SIZE(apic_lvt_mask);
+ index = array_index_nospec(
+ (reg - APIC_LVTT) >> 4, size);
+ val &= apic_lvt_mask[index];
kvm_lapic_set_reg(apic, reg, val);
-
break;
+ }
case APIC_LVTT:
if (!kvm_apic_sw_enabled(apic))
@@ -2185,6 +2183,21 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
pr_warn_once("APIC base relocation is unsupported by KVM");
}
+void kvm_apic_update_apicv(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (vcpu->arch.apicv_active) {
+ /* irr_pending is always true when apicv is activated. */
+ apic->irr_pending = true;
+ apic->isr_count = 1;
+ } else {
+ apic->irr_pending = (apic_search_irr(apic) != -1);
+ apic->isr_count = count_vectors(apic->regs + APIC_ISR);
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_apic_update_apicv);
+
void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct kvm_lapic *apic = vcpu->arch.apic;
@@ -2227,8 +2240,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_lapic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
kvm_lapic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
}
- apic->irr_pending = vcpu->arch.apicv_active;
- apic->isr_count = vcpu->arch.apicv_active ? 1 : 0;
+ kvm_apic_update_apicv(vcpu);
apic->highest_isr_cache = -1;
update_divide_count(apic);
atomic_set(&apic->lapic_timer.pending, 0);
@@ -2373,14 +2385,13 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
{
u32 lvt0 = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVT0);
- int r = 0;
if (!kvm_apic_hw_enabled(vcpu->arch.apic))
- r = 1;
+ return 1;
if ((lvt0 & APIC_LVT_MASKED) == 0 &&
GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
- r = 1;
- return r;
+ return 1;
+ return 0;
}
void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
@@ -2486,9 +2497,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
update_divide_count(apic);
start_apic_timer(apic);
- apic->irr_pending = true;
- apic->isr_count = vcpu->arch.apicv_active ?
- 1 : count_vectors(apic->regs + APIC_ISR);
+ kvm_apic_update_apicv(vcpu);
apic->highest_isr_cache = -1;
if (vcpu->arch.apicv_active) {
kvm_x86_ops->apicv_post_state_restore(vcpu);
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 39925afdfcdc..ec6fbfe325cf 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -10,8 +10,9 @@
#define KVM_APIC_SIPI 1
#define KVM_APIC_LVT_NUM 6
-#define KVM_APIC_SHORT_MASK 0xc0000
-#define KVM_APIC_DEST_MASK 0x800
+#define APIC_SHORT_MASK 0xc0000
+#define APIC_DEST_NOSHORT 0x0
+#define APIC_DEST_MASK 0x800
#define APIC_BUS_CYCLE_NS 1
#define APIC_BUS_FREQUENCY (1000000000ULL / APIC_BUS_CYCLE_NS)
@@ -82,14 +83,15 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val);
int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
void *data);
bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
- int short_hand, unsigned int dest, int dest_mode);
-
+ int shorthand, unsigned int dest, int dest_mode);
+int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr);
bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr);
void kvm_apic_update_ppr(struct kvm_vcpu *vcpu);
int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
struct dest_map *dest_map);
int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
+void kvm_apic_update_apicv(struct kvm_vcpu *vcpu);
bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index d55674f44a18..a647601c9e1c 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -102,6 +102,19 @@ static inline void kvm_mmu_load_cr3(struct kvm_vcpu *vcpu)
kvm_get_active_pcid(vcpu));
}
+int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
+ bool prefault);
+
+static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ u32 err, bool prefault)
+{
+#ifdef CONFIG_RETPOLINE
+ if (likely(vcpu->arch.mmu->page_fault == kvm_tdp_page_fault))
+ return kvm_tdp_page_fault(vcpu, cr2_or_gpa, err, prefault);
+#endif
+ return vcpu->arch.mmu->page_fault(vcpu, cr2_or_gpa, err, prefault);
+}
+
/*
* Currently, we have two sorts of write-protection, a) the first one
* write-protects guest page to sync the guest modification, b) another one is
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 6f92b40d798c..87e9ba27ada1 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -40,7 +40,7 @@
#include <linux/kthread.h>
#include <asm/page.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include <asm/cmpxchg.h>
#include <asm/e820/api.h>
#include <asm/io.h>
@@ -418,22 +418,24 @@ static inline bool is_access_track_spte(u64 spte)
* requires a full MMU zap). The flag is instead explicitly queried when
* checking for MMIO spte cache hits.
*/
-#define MMIO_SPTE_GEN_MASK GENMASK_ULL(18, 0)
+#define MMIO_SPTE_GEN_MASK GENMASK_ULL(17, 0)
#define MMIO_SPTE_GEN_LOW_START 3
#define MMIO_SPTE_GEN_LOW_END 11
#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
MMIO_SPTE_GEN_LOW_START)
-#define MMIO_SPTE_GEN_HIGH_START 52
-#define MMIO_SPTE_GEN_HIGH_END 61
+#define MMIO_SPTE_GEN_HIGH_START PT64_SECOND_AVAIL_BITS_SHIFT
+#define MMIO_SPTE_GEN_HIGH_END 62
#define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
MMIO_SPTE_GEN_HIGH_START)
+
static u64 generation_mmio_spte_mask(u64 gen)
{
u64 mask;
WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
+ BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK);
mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK;
mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK;
@@ -444,16 +446,14 @@ static u64 get_mmio_spte_generation(u64 spte)
{
u64 gen;
- spte &= ~shadow_mmio_mask;
-
gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START;
gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START;
return gen;
}
-static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
- unsigned access)
+static u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access)
{
+
u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
u64 mask = generation_mmio_spte_mask(gen);
u64 gpa = gfn << PAGE_SHIFT;
@@ -464,6 +464,17 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
<< shadow_nonpresent_or_rsvd_mask_len;
+ return mask;
+}
+
+static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
+ unsigned int access)
+{
+ u64 mask = make_mmio_spte(vcpu, gfn, access);
+ unsigned int gen = get_mmio_spte_generation(mask);
+
+ access = mask & ACC_ALL;
+
trace_mark_mmio_spte(sptep, gfn, access, gen);
mmu_spte_set(sptep, mask);
}
@@ -484,7 +495,7 @@ static unsigned get_mmio_spte_access(u64 spte)
}
static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
- kvm_pfn_t pfn, unsigned access)
+ kvm_pfn_t pfn, unsigned int access)
{
if (unlikely(is_noslot_pfn(pfn))) {
mark_mmio_spte(vcpu, sptep, gfn, access);
@@ -538,16 +549,20 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
static u8 kvm_get_shadow_phys_bits(void)
{
/*
- * boot_cpu_data.x86_phys_bits is reduced when MKTME is detected
- * in CPU detection code, but MKTME treats those reduced bits as
- * 'keyID' thus they are not reserved bits. Therefore for MKTME
- * we should still return physical address bits reported by CPUID.
+ * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected
+ * in CPU detection code, but the processor treats those reduced bits as
+ * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at
+ * the physical address bits reported by CPUID.
*/
- if (!boot_cpu_has(X86_FEATURE_TME) ||
- WARN_ON_ONCE(boot_cpu_data.extended_cpuid_level < 0x80000008))
- return boot_cpu_data.x86_phys_bits;
+ if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
+ return cpuid_eax(0x80000008) & 0xff;
- return cpuid_eax(0x80000008) & 0xff;
+ /*
+ * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with
+ * custom CPUID. Proceed with whatever the kernel found since these features
+ * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008).
+ */
+ return boot_cpu_data.x86_phys_bits;
}
static void kvm_mmu_reset_all_pte_masks(void)
@@ -1260,56 +1275,6 @@ static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
list_del(&sp->lpage_disallowed_link);
}
-static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
- struct kvm_memory_slot *slot)
-{
- struct kvm_lpage_info *linfo;
-
- if (slot) {
- linfo = lpage_info_slot(gfn, slot, level);
- return !!linfo->disallow_lpage;
- }
-
- return true;
-}
-
-static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn,
- int level)
-{
- struct kvm_memory_slot *slot;
-
- slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
- return __mmu_gfn_lpage_is_disallowed(gfn, level, slot);
-}
-
-static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
-{
- unsigned long page_size;
- int i, ret = 0;
-
- page_size = kvm_host_page_size(kvm, gfn);
-
- for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
- if (page_size >= KVM_HPAGE_SIZE(i))
- ret = i;
- else
- break;
- }
-
- return ret;
-}
-
-static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot,
- bool no_dirty_log)
-{
- if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
- return false;
- if (no_dirty_log && slot->dirty_bitmap)
- return false;
-
- return true;
-}
-
static struct kvm_memory_slot *
gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
bool no_dirty_log)
@@ -1317,40 +1282,14 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
struct kvm_memory_slot *slot;
slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
- if (!memslot_valid_for_gpte(slot, no_dirty_log))
- slot = NULL;
+ if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
+ return NULL;
+ if (no_dirty_log && slot->dirty_bitmap)
+ return NULL;
return slot;
}
-static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
- bool *force_pt_level)
-{
- int host_level, level, max_level;
- struct kvm_memory_slot *slot;
-
- if (unlikely(*force_pt_level))
- return PT_PAGE_TABLE_LEVEL;
-
- slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn);
- *force_pt_level = !memslot_valid_for_gpte(slot, true);
- if (unlikely(*force_pt_level))
- return PT_PAGE_TABLE_LEVEL;
-
- host_level = host_mapping_level(vcpu->kvm, large_gfn);
-
- if (host_level == PT_PAGE_TABLE_LEVEL)
- return host_level;
-
- max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
-
- for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
- if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot))
- break;
-
- return level - 1;
-}
-
/*
* About rmap_head encoding:
*
@@ -1410,7 +1349,7 @@ pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
if (j != 0)
return;
if (!prev_desc && !desc->more)
- rmap_head->val = (unsigned long)desc->sptes[0];
+ rmap_head->val = 0;
else
if (prev_desc)
prev_desc->more = desc->more;
@@ -1525,7 +1464,7 @@ struct rmap_iterator {
/*
* Iteration must be started by this function. This should also be used after
* removing/dropping sptes from the rmap link because in such cases the
- * information in the itererator may not be valid.
+ * information in the iterator may not be valid.
*
* Returns sptep if found, NULL otherwise.
*/
@@ -2547,7 +2486,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
gva_t gaddr,
unsigned level,
int direct,
- unsigned access)
+ unsigned int access)
{
union kvm_mmu_page_role role;
unsigned quadrant;
@@ -2899,6 +2838,26 @@ static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
}
+static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
+{
+ LIST_HEAD(invalid_list);
+
+ if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
+ return 0;
+
+ while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
+ if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
+ break;
+
+ ++vcpu->kvm->stat.mmu_recycled;
+ }
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+
+ if (!kvm_mmu_available_pages(vcpu->kvm))
+ return -ENOSPC;
+ return 0;
+}
+
/*
* Changing the number of mmu pages allocated to the vm
* Note: if goal_nr_mmu_pages is too small, you will get dead lock
@@ -3042,7 +3001,7 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
- unsigned pte_access, int level,
+ unsigned int pte_access, int level,
gfn_t gfn, kvm_pfn_t pfn, bool speculative,
bool can_unsync, bool host_writable)
{
@@ -3099,17 +3058,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte |= (u64)pfn << PAGE_SHIFT;
if (pte_access & ACC_WRITE_MASK) {
-
- /*
- * Other vcpu creates new sp in the window between
- * mapping_level() and acquiring mmu-lock. We can
- * allow guest to retry the access, the mapping can
- * be fixed if guest refault.
- */
- if (level > PT_PAGE_TABLE_LEVEL &&
- mmu_gfn_lpage_is_disallowed(vcpu, gfn, level))
- goto done;
-
spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
/*
@@ -3141,13 +3089,13 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
set_pte:
if (mmu_spte_update(sptep, spte))
ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
-done:
return ret;
}
-static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
- int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
- bool speculative, bool host_writable)
+static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
+ unsigned int pte_access, int write_fault, int level,
+ gfn_t gfn, kvm_pfn_t pfn, bool speculative,
+ bool host_writable)
{
int was_rmapped = 0;
int rmap_count;
@@ -3229,7 +3177,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
{
struct page *pages[PTE_PREFETCH_NUM];
struct kvm_memory_slot *slot;
- unsigned access = sp->role.access;
+ unsigned int access = sp->role.access;
int i, ret;
gfn_t gfn;
@@ -3294,6 +3242,83 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
__direct_pte_prefetch(vcpu, sp, sptep);
}
+static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn,
+ kvm_pfn_t pfn, struct kvm_memory_slot *slot)
+{
+ unsigned long hva;
+ pte_t *pte;
+ int level;
+
+ BUILD_BUG_ON(PT_PAGE_TABLE_LEVEL != (int)PG_LEVEL_4K ||
+ PT_DIRECTORY_LEVEL != (int)PG_LEVEL_2M ||
+ PT_PDPE_LEVEL != (int)PG_LEVEL_1G);
+
+ if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn))
+ return PT_PAGE_TABLE_LEVEL;
+
+ /*
+ * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
+ * is not solely for performance, it's also necessary to avoid the
+ * "writable" check in __gfn_to_hva_many(), which will always fail on
+ * read-only memslots due to gfn_to_hva() assuming writes. Earlier
+ * page fault steps have already verified the guest isn't writing a
+ * read-only memslot.
+ */
+ hva = __gfn_to_hva_memslot(slot, gfn);
+
+ pte = lookup_address_in_mm(vcpu->kvm->mm, hva, &level);
+ if (unlikely(!pte))
+ return PT_PAGE_TABLE_LEVEL;
+
+ return level;
+}
+
+static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn,
+ int max_level, kvm_pfn_t *pfnp)
+{
+ struct kvm_memory_slot *slot;
+ struct kvm_lpage_info *linfo;
+ kvm_pfn_t pfn = *pfnp;
+ kvm_pfn_t mask;
+ int level;
+
+ if (unlikely(max_level == PT_PAGE_TABLE_LEVEL))
+ return PT_PAGE_TABLE_LEVEL;
+
+ if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn))
+ return PT_PAGE_TABLE_LEVEL;
+
+ slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, true);
+ if (!slot)
+ return PT_PAGE_TABLE_LEVEL;
+
+ max_level = min(max_level, kvm_x86_ops->get_lpage_level());
+ for ( ; max_level > PT_PAGE_TABLE_LEVEL; max_level--) {
+ linfo = lpage_info_slot(gfn, slot, max_level);
+ if (!linfo->disallow_lpage)
+ break;
+ }
+
+ if (max_level == PT_PAGE_TABLE_LEVEL)
+ return PT_PAGE_TABLE_LEVEL;
+
+ level = host_pfn_mapping_level(vcpu, gfn, pfn, slot);
+ if (level == PT_PAGE_TABLE_LEVEL)
+ return level;
+
+ level = min(level, max_level);
+
+ /*
+ * mmu_notifier_retry() was successful and mmu_lock is held, so
+ * the pmd can't be split from under us.
+ */
+ mask = KVM_PAGES_PER_HPAGE(level) - 1;
+ VM_BUG_ON((gfn & mask) != (pfn & mask));
+ *pfnp = pfn & ~mask;
+
+ return level;
+}
+
static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
gfn_t gfn, kvm_pfn_t *pfnp, int *levelp)
{
@@ -3318,18 +3343,20 @@ static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
}
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
- int map_writable, int level, kvm_pfn_t pfn,
- bool prefault, bool lpage_disallowed)
+ int map_writable, int max_level, kvm_pfn_t pfn,
+ bool prefault, bool account_disallowed_nx_lpage)
{
struct kvm_shadow_walk_iterator it;
struct kvm_mmu_page *sp;
- int ret;
+ int level, ret;
gfn_t gfn = gpa >> PAGE_SHIFT;
gfn_t base_gfn = gfn;
- if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
+ if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
return RET_PF_RETRY;
+ level = kvm_mmu_hugepage_adjust(vcpu, gfn, max_level, &pfn);
+
trace_kvm_mmu_spte_requested(gpa, level, pfn);
for_each_shadow_entry(vcpu, gpa, it) {
/*
@@ -3348,7 +3375,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
it.level - 1, true, ACC_ALL);
link_shadow_page(vcpu, it.sptep, sp);
- if (lpage_disallowed)
+ if (account_disallowed_nx_lpage)
account_huge_nx_page(vcpu->kvm, sp);
}
}
@@ -3384,47 +3411,9 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
return -EFAULT;
}
-static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
- gfn_t gfn, kvm_pfn_t *pfnp,
- int *levelp)
-{
- kvm_pfn_t pfn = *pfnp;
- int level = *levelp;
-
- /*
- * Check if it's a transparent hugepage. If this would be an
- * hugetlbfs page, level wouldn't be set to
- * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
- * here.
- */
- if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
- !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL &&
- PageTransCompoundMap(pfn_to_page(pfn)) &&
- !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
- unsigned long mask;
- /*
- * mmu_notifier_retry was successful and we hold the
- * mmu_lock here, so the pmd can't become splitting
- * from under us, and in turn
- * __split_huge_page_refcount() can't run from under
- * us and we can safely transfer the refcount from
- * PG_tail to PG_head as we switch the pfn to tail to
- * head.
- */
- *levelp = level = PT_DIRECTORY_LEVEL;
- mask = KVM_PAGES_PER_HPAGE(level) - 1;
- VM_BUG_ON((gfn & mask) != (pfn & mask));
- if (pfn & mask) {
- kvm_release_pfn_clean(pfn);
- pfn &= ~mask;
- kvm_get_pfn(pfn);
- *pfnp = pfn;
- }
- }
-}
-
static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
- kvm_pfn_t pfn, unsigned access, int *ret_val)
+ kvm_pfn_t pfn, unsigned int access,
+ int *ret_val)
{
/* The pfn is invalid, report the error! */
if (unlikely(is_error_pfn(pfn))) {
@@ -3528,7 +3517,7 @@ static bool is_access_allowed(u32 fault_err_code, u64 spte)
* - true: let the vcpu to access on the same address again.
* - false: let the real page fault path to fix it.
*/
-static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
+static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
u32 error_code)
{
struct kvm_shadow_walk_iterator iterator;
@@ -3537,9 +3526,6 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
u64 spte = 0ull;
uint retry_count = 0;
- if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
- return false;
-
if (!page_fault_can_be_fast(error_code))
return false;
@@ -3548,9 +3534,8 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
do {
u64 new_spte;
- for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
- if (!is_shadow_present_pte(spte) ||
- iterator.level < level)
+ for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte)
+ if (!is_shadow_present_pte(spte))
break;
sp = page_header(__pa(iterator.sptep));
@@ -3626,71 +3611,13 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
} while (true);
- trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
+ trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep,
spte, fault_handled);
walk_shadow_page_lockless_end(vcpu);
return fault_handled;
}
-static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
- gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable);
-static int make_mmu_pages_available(struct kvm_vcpu *vcpu);
-
-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
- gfn_t gfn, bool prefault)
-{
- int r;
- int level;
- bool force_pt_level;
- kvm_pfn_t pfn;
- unsigned long mmu_seq;
- bool map_writable, write = error_code & PFERR_WRITE_MASK;
- bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
- is_nx_huge_page_enabled();
-
- force_pt_level = lpage_disallowed;
- level = mapping_level(vcpu, gfn, &force_pt_level);
- if (likely(!force_pt_level)) {
- /*
- * This path builds a PAE pagetable - so we can map
- * 2mb pages at maximum. Therefore check if the level
- * is larger than that.
- */
- if (level > PT_DIRECTORY_LEVEL)
- level = PT_DIRECTORY_LEVEL;
-
- gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
- }
-
- if (fast_page_fault(vcpu, v, level, error_code))
- return RET_PF_RETRY;
-
- mmu_seq = vcpu->kvm->mmu_notifier_seq;
- smp_rmb();
-
- if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
- return RET_PF_RETRY;
-
- if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
- return r;
-
- r = RET_PF_RETRY;
- spin_lock(&vcpu->kvm->mmu_lock);
- if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
- goto out_unlock;
- if (make_mmu_pages_available(vcpu) < 0)
- goto out_unlock;
- if (likely(!force_pt_level))
- transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
- r = __direct_map(vcpu, v, write, map_writable, level, pfn,
- prefault, false);
-out_unlock:
- spin_unlock(&vcpu->kvm->mmu_lock);
- kvm_release_pfn_clean(pfn);
- return r;
-}
-
static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
struct list_head *invalid_list)
{
@@ -3981,7 +3908,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
-static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
+static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr,
u32 access, struct x86_exception *exception)
{
if (exception)
@@ -3989,7 +3916,7 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
return vaddr;
}
-static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
+static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr,
u32 access,
struct x86_exception *exception)
{
@@ -4001,20 +3928,14 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
static bool
__is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level)
{
- int bit7 = (pte >> 7) & 1, low6 = pte & 0x3f;
-
- return (pte & rsvd_check->rsvd_bits_mask[bit7][level-1]) |
- ((rsvd_check->bad_mt_xwr & (1ull << low6)) != 0);
-}
+ int bit7 = (pte >> 7) & 1;
-static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
-{
- return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level);
+ return pte & rsvd_check->rsvd_bits_mask[bit7][level-1];
}
-static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level)
+static bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check, u64 pte)
{
- return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level);
+ return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f);
}
static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
@@ -4038,11 +3959,11 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
{
struct kvm_shadow_walk_iterator iterator;
u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull;
+ struct rsvd_bits_validate *rsvd_check;
int root, leaf;
bool reserved = false;
- if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
- goto exit;
+ rsvd_check = &vcpu->arch.mmu->shadow_zero_check;
walk_shadow_page_lockless_begin(vcpu);
@@ -4058,8 +3979,13 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
if (!is_shadow_present_pte(spte))
break;
- reserved |= is_shadow_zero_bits_set(vcpu->arch.mmu, spte,
- iterator.level);
+ /*
+ * Use a bitwise-OR instead of a logical-OR to aggregate the
+ * reserved bit and EPT's invalid memtype/XWR checks to avoid
+ * adding a Jcc in the loop.
+ */
+ reserved |= __is_bad_mt_xwr(rsvd_check, spte) |
+ __is_rsvd_bits_set(rsvd_check, spte, iterator.level);
}
walk_shadow_page_lockless_end(vcpu);
@@ -4073,7 +3999,7 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
root--;
}
}
-exit:
+
*sptep = spte;
return reserved;
}
@@ -4092,7 +4018,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
if (is_mmio_spte(spte)) {
gfn_t gfn = get_mmio_spte_gfn(spte);
- unsigned access = get_mmio_spte_access(spte);
+ unsigned int access = get_mmio_spte_access(spte);
if (!check_mmio_spte(vcpu, spte))
return RET_PF_INVALID;
@@ -4137,9 +4063,6 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
struct kvm_shadow_walk_iterator iterator;
u64 spte;
- if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
- return;
-
walk_shadow_page_lockless_begin(vcpu);
for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
clear_sp_write_flooding_count(iterator.sptep);
@@ -4149,29 +4072,8 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
walk_shadow_page_lockless_end(vcpu);
}
-static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
- u32 error_code, bool prefault)
-{
- gfn_t gfn = gva >> PAGE_SHIFT;
- int r;
-
- pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
-
- if (page_fault_handle_page_track(vcpu, error_code, gfn))
- return RET_PF_EMULATE;
-
- r = mmu_topup_memory_caches(vcpu);
- if (r)
- return r;
-
- MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
-
-
- return nonpaging_map(vcpu, gva & PAGE_MASK,
- error_code, gfn, prefault);
-}
-
-static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
+static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ gfn_t gfn)
{
struct kvm_arch_async_pf arch;
@@ -4180,11 +4082,13 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
arch.direct_map = vcpu->arch.mmu->direct_map;
arch.cr3 = vcpu->arch.mmu->get_cr3(vcpu);
- return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
+ return kvm_setup_async_pf(vcpu, cr2_or_gpa,
+ kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
}
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
- gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable)
+ gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
+ bool *writable)
{
struct kvm_memory_slot *slot;
bool async;
@@ -4204,12 +4108,12 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
return false; /* *pfn has correct page already */
if (!prefault && kvm_can_do_async_pf(vcpu)) {
- trace_kvm_try_async_get_page(gva, gfn);
+ trace_kvm_try_async_get_page(cr2_or_gpa, gfn);
if (kvm_find_async_pf_gfn(vcpu, gfn)) {
- trace_kvm_async_pf_doublefault(gva, gfn);
+ trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn);
kvm_make_request(KVM_REQ_APF_HALT, vcpu);
return true;
- } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
+ } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn))
return true;
}
@@ -4217,11 +4121,77 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
return false;
}
+static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
+ bool prefault, int max_level, bool is_tdp)
+{
+ bool write = error_code & PFERR_WRITE_MASK;
+ bool exec = error_code & PFERR_FETCH_MASK;
+ bool lpage_disallowed = exec && is_nx_huge_page_enabled();
+ bool map_writable;
+
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ unsigned long mmu_seq;
+ kvm_pfn_t pfn;
+ int r;
+
+ if (page_fault_handle_page_track(vcpu, error_code, gfn))
+ return RET_PF_EMULATE;
+
+ r = mmu_topup_memory_caches(vcpu);
+ if (r)
+ return r;
+
+ if (lpage_disallowed)
+ max_level = PT_PAGE_TABLE_LEVEL;
+
+ if (fast_page_fault(vcpu, gpa, error_code))
+ return RET_PF_RETRY;
+
+ mmu_seq = vcpu->kvm->mmu_notifier_seq;
+ smp_rmb();
+
+ if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
+ return RET_PF_RETRY;
+
+ if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r))
+ return r;
+
+ r = RET_PF_RETRY;
+ spin_lock(&vcpu->kvm->mmu_lock);
+ if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
+ goto out_unlock;
+ if (make_mmu_pages_available(vcpu) < 0)
+ goto out_unlock;
+ r = __direct_map(vcpu, gpa, write, map_writable, max_level, pfn,
+ prefault, is_tdp && lpage_disallowed);
+
+out_unlock:
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_release_pfn_clean(pfn);
+ return r;
+}
+
+static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa,
+ u32 error_code, bool prefault)
+{
+ pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code);
+
+ /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */
+ return direct_page_fault(vcpu, gpa & PAGE_MASK, error_code, prefault,
+ PT_DIRECTORY_LEVEL, false);
+}
+
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
u64 fault_address, char *insn, int insn_len)
{
int r = 1;
+#ifndef CONFIG_X86_64
+ /* A 64-bit CR2 should be impossible on 32-bit KVM. */
+ if (WARN_ON_ONCE(fault_address >> 32))
+ return -EFAULT;
+#endif
+
vcpu->arch.l1tf_flush_l1d = true;
switch (vcpu->arch.apf.host_apf_reason) {
default:
@@ -4249,76 +4219,23 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
}
EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
-static bool
-check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
-{
- int page_num = KVM_PAGES_PER_HPAGE(level);
-
- gfn &= ~(page_num - 1);
-
- return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num);
-}
-
-static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
- bool prefault)
+int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
+ bool prefault)
{
- kvm_pfn_t pfn;
- int r;
- int level;
- bool force_pt_level;
- gfn_t gfn = gpa >> PAGE_SHIFT;
- unsigned long mmu_seq;
- int write = error_code & PFERR_WRITE_MASK;
- bool map_writable;
- bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
- is_nx_huge_page_enabled();
-
- MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
+ int max_level;
- if (page_fault_handle_page_track(vcpu, error_code, gfn))
- return RET_PF_EMULATE;
-
- r = mmu_topup_memory_caches(vcpu);
- if (r)
- return r;
+ for (max_level = PT_MAX_HUGEPAGE_LEVEL;
+ max_level > PT_PAGE_TABLE_LEVEL;
+ max_level--) {
+ int page_num = KVM_PAGES_PER_HPAGE(max_level);
+ gfn_t base = (gpa >> PAGE_SHIFT) & ~(page_num - 1);
- force_pt_level =
- lpage_disallowed ||
- !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL);
- level = mapping_level(vcpu, gfn, &force_pt_level);
- if (likely(!force_pt_level)) {
- if (level > PT_DIRECTORY_LEVEL &&
- !check_hugepage_cache_consistency(vcpu, gfn, level))
- level = PT_DIRECTORY_LEVEL;
- gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+ if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
+ break;
}
- if (fast_page_fault(vcpu, gpa, level, error_code))
- return RET_PF_RETRY;
-
- mmu_seq = vcpu->kvm->mmu_notifier_seq;
- smp_rmb();
-
- if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
- return RET_PF_RETRY;
-
- if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
- return r;
-
- r = RET_PF_RETRY;
- spin_lock(&vcpu->kvm->mmu_lock);
- if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
- goto out_unlock;
- if (make_mmu_pages_available(vcpu) < 0)
- goto out_unlock;
- if (likely(!force_pt_level))
- transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
- r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
- prefault, lpage_disallowed);
-out_unlock:
- spin_unlock(&vcpu->kvm->mmu_lock);
- kvm_release_pfn_clean(pfn);
- return r;
+ return direct_page_fault(vcpu, gpa, error_code, prefault,
+ max_level, true);
}
static void nonpaging_init_context(struct kvm_vcpu *vcpu,
@@ -4445,7 +4362,7 @@ static void inject_page_fault(struct kvm_vcpu *vcpu,
}
static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
- unsigned access, int *nr_present)
+ unsigned int access, int *nr_present)
{
if (unlikely(is_mmio_spte(*sptep))) {
if (gfn != get_mmio_spte_gfn(*sptep)) {
@@ -5008,7 +4925,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
return;
context->mmu_role.as_u64 = new_role.as_u64;
- context->page_fault = tdp_page_fault;
+ context->page_fault = kvm_tdp_page_fault;
context->sync_page = nonpaging_sync_page;
context->invlpg = nonpaging_invlpg;
context->update_pte = nonpaging_update_pte;
@@ -5496,49 +5413,31 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
}
EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
-static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
-{
- LIST_HEAD(invalid_list);
-
- if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
- return 0;
-
- while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
- if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
- break;
-
- ++vcpu->kvm->stat.mmu_recycled;
- }
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
-
- if (!kvm_mmu_available_pages(vcpu->kvm))
- return -ENOSPC;
- return 0;
-}
-
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
void *insn, int insn_len)
{
int r, emulation_type = 0;
bool direct = vcpu->arch.mmu->direct_map;
+ if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
+ return RET_PF_RETRY;
+
/* With shadow page tables, fault_address contains a GVA or nGPA. */
if (vcpu->arch.mmu->direct_map) {
vcpu->arch.gpa_available = true;
- vcpu->arch.gpa_val = cr2;
+ vcpu->arch.gpa_val = cr2_or_gpa;
}
r = RET_PF_INVALID;
if (unlikely(error_code & PFERR_RSVD_MASK)) {
- r = handle_mmio_page_fault(vcpu, cr2, direct);
+ r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
if (r == RET_PF_EMULATE)
goto emulate;
}
if (r == RET_PF_INVALID) {
- r = vcpu->arch.mmu->page_fault(vcpu, cr2,
- lower_32_bits(error_code),
- false);
+ r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
+ lower_32_bits(error_code), false);
WARN_ON(r == RET_PF_INVALID);
}
@@ -5556,7 +5455,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
*/
if (vcpu->arch.mmu->direct_map &&
(error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
- kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
+ kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
return 1;
}
@@ -5571,7 +5470,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
* explicitly shadowing L1's page tables, i.e. unprotecting something
* for L1 isn't going to magically fix whatever issue cause L2 to fail.
*/
- if (!mmio_info_in_cache(vcpu, cr2, direct) && !is_guest_mode(vcpu))
+ if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
emulation_type = EMULTYPE_ALLOW_RETRY;
emulate:
/*
@@ -5586,7 +5485,7 @@ emulate:
return 1;
}
- return x86_emulate_instruction(vcpu, cr2, emulation_type, insn,
+ return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
insn_len);
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
@@ -6015,8 +5914,8 @@ restart:
* mapping if the indirect sp has level = 1.
*/
if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
- !kvm_is_zone_device_pfn(pfn) &&
- PageTransCompoundMap(pfn_to_page(pfn))) {
+ (kvm_is_zone_device_pfn(pfn) ||
+ PageCompound(pfn_to_page(pfn)))) {
pte_list_remove(rmap_head, sptep);
if (kvm_available_flush_tlb_with_range())
@@ -6249,7 +6148,7 @@ static void kvm_set_mmio_spte_mask(void)
* If reserved bit is not supported, clear the present bit to disable
* mmio page fault.
*/
- if (IS_ENABLED(CONFIG_X86_64) && shadow_phys_bits == 52)
+ if (shadow_phys_bits == 52)
mask &= ~1ull;
kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK);
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 97b21e7fd013..e4c8a4cbf407 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -33,7 +33,7 @@
#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
#define PT_HAVE_ACCESSED_DIRTY(mmu) true
#ifdef CONFIG_X86_64
- #define PT_MAX_FULL_LEVELS 4
+ #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
#define CMPXCHG cmpxchg
#else
#define CMPXCHG cmpxchg64
@@ -128,6 +128,21 @@ static inline int FNAME(is_present_gpte)(unsigned long pte)
#endif
}
+static bool FNAME(is_bad_mt_xwr)(struct rsvd_bits_validate *rsvd_check, u64 gpte)
+{
+#if PTTYPE != PTTYPE_EPT
+ return false;
+#else
+ return __is_bad_mt_xwr(rsvd_check, gpte);
+#endif
+}
+
+static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
+{
+ return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level) ||
+ FNAME(is_bad_mt_xwr)(&mmu->guest_rsvd_check, gpte);
+}
+
static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
pt_element_t __user *ptep_user, unsigned index,
pt_element_t orig_pte, pt_element_t new_pte)
@@ -175,9 +190,6 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp, u64 *spte,
u64 gpte)
{
- if (is_rsvd_bits_set(vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
- goto no_present;
-
if (!FNAME(is_present_gpte)(gpte))
goto no_present;
@@ -186,6 +198,9 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
!(gpte & PT_GUEST_ACCESSED_MASK))
goto no_present;
+ if (FNAME(is_rsvd_bits_set)(vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+ goto no_present;
+
return false;
no_present:
@@ -291,11 +306,11 @@ static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
}
/*
- * Fetch a guest pte for a guest virtual address
+ * Fetch a guest pte for a guest virtual address, or for an L2's GPA.
*/
static int FNAME(walk_addr_generic)(struct guest_walker *walker,
struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- gva_t addr, u32 access)
+ gpa_t addr, u32 access)
{
int ret;
pt_element_t pte;
@@ -400,7 +415,7 @@ retry_walk:
if (unlikely(!FNAME(is_present_gpte)(pte)))
goto error;
- if (unlikely(is_rsvd_bits_set(mmu, pte, walker->level))) {
+ if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, walker->level))) {
errcode = PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
goto error;
}
@@ -496,7 +511,7 @@ error:
}
static int FNAME(walk_addr)(struct guest_walker *walker,
- struct kvm_vcpu *vcpu, gva_t addr, u32 access)
+ struct kvm_vcpu *vcpu, gpa_t addr, u32 access)
{
return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu, addr,
access);
@@ -611,17 +626,17 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
* If the guest tries to write a write-protected page, we need to
* emulate this operation, return 1 to indicate this case.
*/
-static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
+static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
struct guest_walker *gw,
- int write_fault, int hlevel,
+ int write_fault, int max_level,
kvm_pfn_t pfn, bool map_writable, bool prefault,
bool lpage_disallowed)
{
struct kvm_mmu_page *sp = NULL;
struct kvm_shadow_walk_iterator it;
unsigned direct_access, access = gw->pt_access;
- int top_level, ret;
- gfn_t gfn, base_gfn;
+ int top_level, hlevel, ret;
+ gfn_t base_gfn = gw->gfn;
direct_access = gw->pte_access;
@@ -637,7 +652,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
if (FNAME(gpte_changed)(vcpu, gw, top_level))
goto out_gpte_changed;
- if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
+ if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
goto out_gpte_changed;
for (shadow_walk_init(&it, vcpu, addr);
@@ -666,12 +681,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
link_shadow_page(vcpu, it.sptep, sp);
}
- /*
- * FNAME(page_fault) might have clobbered the bottom bits of
- * gw->gfn, restore them from the virtual address.
- */
- gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT);
- base_gfn = gfn;
+ hlevel = kvm_mmu_hugepage_adjust(vcpu, gw->gfn, max_level, &pfn);
trace_kvm_mmu_spte_requested(addr, gw->level, pfn);
@@ -682,9 +692,9 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
* We cannot overwrite existing page tables with an NX
* large page, as the leaf could be executable.
*/
- disallowed_hugepage_adjust(it, gfn, &pfn, &hlevel);
+ disallowed_hugepage_adjust(it, gw->gfn, &pfn, &hlevel);
- base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+ base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
if (it.level == hlevel)
break;
@@ -765,7 +775,7 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
* Returns: 1 if we need to emulate the instruction, 0 otherwise, or
* a negative value on error.
*/
-static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
+static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
bool prefault)
{
int write_fault = error_code & PFERR_WRITE_MASK;
@@ -773,12 +783,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
struct guest_walker walker;
int r;
kvm_pfn_t pfn;
- int level = PT_PAGE_TABLE_LEVEL;
unsigned long mmu_seq;
bool map_writable, is_self_change_mapping;
bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
is_nx_huge_page_enabled();
- bool force_pt_level = lpage_disallowed;
+ int max_level;
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
@@ -818,14 +827,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
&walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
- if (walker.level >= PT_DIRECTORY_LEVEL && !is_self_change_mapping) {
- level = mapping_level(vcpu, walker.gfn, &force_pt_level);
- if (likely(!force_pt_level)) {
- level = min(walker.level, level);
- walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
- }
- } else
- force_pt_level = true;
+ if (lpage_disallowed || is_self_change_mapping)
+ max_level = PT_PAGE_TABLE_LEVEL;
+ else
+ max_level = walker.level;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
@@ -865,10 +870,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
if (make_mmu_pages_available(vcpu) < 0)
goto out_unlock;
- if (!force_pt_level)
- transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level);
- r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
- level, pfn, map_writable, prefault, lpage_disallowed);
+ r = FNAME(fetch)(vcpu, addr, &walker, write_fault, max_level, pfn,
+ map_writable, prefault, lpage_disallowed);
kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
out_unlock:
@@ -945,18 +948,19 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
spin_unlock(&vcpu->kvm->mmu_lock);
}
-static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
+/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
+static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t addr, u32 access,
struct x86_exception *exception)
{
struct guest_walker walker;
gpa_t gpa = UNMAPPED_GVA;
int r;
- r = FNAME(walk_addr)(&walker, vcpu, vaddr, access);
+ r = FNAME(walk_addr)(&walker, vcpu, addr, access);
if (r) {
gpa = gfn_to_gpa(walker.gfn);
- gpa |= vaddr & ~PAGE_MASK;
+ gpa |= addr & ~PAGE_MASK;
} else if (exception)
*exception = walker.fault;
@@ -964,7 +968,8 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
}
#if PTTYPE != PTTYPE_EPT
-static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
+/* Note, gva_to_gpa_nested() is only used to translate L2 GVAs. */
+static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr,
u32 access,
struct x86_exception *exception)
{
@@ -972,6 +977,11 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
gpa_t gpa = UNMAPPED_GVA;
int r;
+#ifndef CONFIG_X86_64
+ /* A 64-bit GVA should be impossible on 32-bit KVM. */
+ WARN_ON_ONCE(vaddr >> 32);
+#endif
+
r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access);
if (r) {
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 7ca8831c7d1a..ffcd96fc02d0 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -249,13 +249,13 @@ TRACE_EVENT(
TRACE_EVENT(
fast_page_fault,
- TP_PROTO(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
+ TP_PROTO(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 error_code,
u64 *sptep, u64 old_spte, bool retry),
- TP_ARGS(vcpu, gva, error_code, sptep, old_spte, retry),
+ TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, retry),
TP_STRUCT__entry(
__field(int, vcpu_id)
- __field(gva_t, gva)
+ __field(gpa_t, cr2_or_gpa)
__field(u32, error_code)
__field(u64 *, sptep)
__field(u64, old_spte)
@@ -265,7 +265,7 @@ TRACE_EVENT(
TP_fast_assign(
__entry->vcpu_id = vcpu->vcpu_id;
- __entry->gva = gva;
+ __entry->cr2_or_gpa = cr2_or_gpa;
__entry->error_code = error_code;
__entry->sptep = sptep;
__entry->old_spte = old_spte;
@@ -273,9 +273,9 @@ TRACE_EVENT(
__entry->retry = retry;
),
- TP_printk("vcpu %d gva %lx error_code %s sptep %p old %#llx"
+ TP_printk("vcpu %d gva %llx error_code %s sptep %p old %#llx"
" new %llx spurious %d fixed %d", __entry->vcpu_id,
- __entry->gva, __print_flags(__entry->error_code, "|",
+ __entry->cr2_or_gpa, __print_flags(__entry->error_code, "|",
kvm_mmu_trace_pferr_flags), __entry->sptep,
__entry->old_spte, __entry->new_spte,
__spte_satisfied(old_spte), __spte_satisfied(new_spte)
@@ -339,7 +339,7 @@ TRACE_EVENT(
/* These depend on page entry type, so compute them now. */
__field(bool, r)
__field(bool, x)
- __field(u8, u)
+ __field(signed char, u)
),
TP_fast_assign(
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index 25ce3edd1872..7f0059aa30e1 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -192,11 +192,15 @@ static bool fixed_msr_to_seg_unit(u32 msr, int *seg, int *unit)
break;
case MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000:
*seg = 1;
- *unit = msr - MSR_MTRRfix16K_80000;
+ *unit = array_index_nospec(
+ msr - MSR_MTRRfix16K_80000,
+ MSR_MTRRfix16K_A0000 - MSR_MTRRfix16K_80000 + 1);
break;
case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000:
*seg = 2;
- *unit = msr - MSR_MTRRfix4K_C0000;
+ *unit = array_index_nospec(
+ msr - MSR_MTRRfix4K_C0000,
+ MSR_MTRRfix4K_F8000 - MSR_MTRRfix4K_C0000 + 1);
break;
default:
return false;
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 7ebb62326c14..13332984b6d5 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -2,6 +2,8 @@
#ifndef __KVM_X86_PMU_H
#define __KVM_X86_PMU_H
+#include <linux/nospec.h>
+
#define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu)
#define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu))
#define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu)
@@ -102,8 +104,12 @@ static inline bool kvm_valid_perf_global_ctrl(struct kvm_pmu *pmu,
static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
u32 base)
{
- if (msr >= base && msr < base + pmu->nr_arch_gp_counters)
- return &pmu->gp_counters[msr - base];
+ if (msr >= base && msr < base + pmu->nr_arch_gp_counters) {
+ u32 index = array_index_nospec(msr - base,
+ pmu->nr_arch_gp_counters);
+
+ return &pmu->gp_counters[index];
+ }
return NULL;
}
@@ -113,8 +119,12 @@ static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr)
{
int base = MSR_CORE_PERF_FIXED_CTR0;
- if (msr >= base && msr < base + pmu->nr_arch_fixed_counters)
- return &pmu->fixed_counters[msr - base];
+ if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) {
+ u32 index = array_index_nospec(msr - base,
+ pmu->nr_arch_fixed_counters);
+
+ return &pmu->fixed_counters[index];
+ }
return NULL;
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 122d4ce3b1ab..91000501756e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -57,11 +57,13 @@
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
+#ifdef MODULE
static const struct x86_cpu_id svm_cpu_id[] = {
X86_FEATURE_MATCH(X86_FEATURE_SVM),
{}
};
MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
+#endif
#define IOPM_ALLOC_ORDER 2
#define MSRPM_ALLOC_ORDER 1
@@ -387,6 +389,8 @@ static u8 rsm_ins_bytes[] = "\x0f\xaa";
static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa);
static void svm_complete_interrupts(struct vcpu_svm *svm);
+static void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate);
+static inline void avic_post_state_restore(struct kvm_vcpu *vcpu);
static int nested_svm_exit_handled(struct vcpu_svm *svm);
static int nested_svm_intercept(struct vcpu_svm *svm);
@@ -1003,33 +1007,32 @@ static void svm_cpu_uninit(int cpu)
static int svm_cpu_init(int cpu)
{
struct svm_cpu_data *sd;
- int r;
sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
if (!sd)
return -ENOMEM;
sd->cpu = cpu;
- r = -ENOMEM;
sd->save_area = alloc_page(GFP_KERNEL);
if (!sd->save_area)
- goto err_1;
+ goto free_cpu_data;
if (svm_sev_enabled()) {
- r = -ENOMEM;
sd->sev_vmcbs = kmalloc_array(max_sev_asid + 1,
sizeof(void *),
GFP_KERNEL);
if (!sd->sev_vmcbs)
- goto err_1;
+ goto free_save_area;
}
per_cpu(svm_data, cpu) = sd;
return 0;
-err_1:
+free_save_area:
+ __free_page(sd->save_area);
+free_cpu_data:
kfree(sd);
- return r;
+ return -ENOMEM;
}
@@ -1307,6 +1310,65 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
}
}
+/*
+ * The default MMIO mask is a single bit (excluding the present bit),
+ * which could conflict with the memory encryption bit. Check for
+ * memory encryption support and override the default MMIO mask if
+ * memory encryption is enabled.
+ */
+static __init void svm_adjust_mmio_mask(void)
+{
+ unsigned int enc_bit, mask_bit;
+ u64 msr, mask;
+
+ /* If there is no memory encryption support, use existing mask */
+ if (cpuid_eax(0x80000000) < 0x8000001f)
+ return;
+
+ /* If memory encryption is not enabled, use existing mask */
+ rdmsrl(MSR_K8_SYSCFG, msr);
+ if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+ return;
+
+ enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
+ mask_bit = boot_cpu_data.x86_phys_bits;
+
+ /* Increment the mask bit if it is the same as the encryption bit */
+ if (enc_bit == mask_bit)
+ mask_bit++;
+
+ /*
+ * If the mask bit location is below 52, then some bits above the
+ * physical addressing limit will always be reserved, so use the
+ * rsvd_bits() function to generate the mask. This mask, along with
+ * the present bit, will be used to generate a page fault with
+ * PFER.RSV = 1.
+ *
+ * If the mask bit location is 52 (or above), then clear the mask.
+ */
+ mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
+
+ kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
+}
+
+static void svm_hardware_teardown(void)
+{
+ int cpu;
+
+ if (svm_sev_enabled()) {
+ bitmap_free(sev_asid_bitmap);
+ bitmap_free(sev_reclaim_asid_bitmap);
+
+ sev_flush_asids();
+ }
+
+ for_each_possible_cpu(cpu)
+ svm_cpu_uninit(cpu);
+
+ __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
+ iopm_base = 0;
+}
+
static __init int svm_hardware_setup(void)
{
int cpu;
@@ -1361,6 +1423,8 @@ static __init int svm_hardware_setup(void)
}
}
+ svm_adjust_mmio_mask();
+
for_each_possible_cpu(cpu) {
r = svm_cpu_init(cpu);
if (r)
@@ -1418,29 +1482,10 @@ static __init int svm_hardware_setup(void)
return 0;
err:
- __free_pages(iopm_pages, IOPM_ALLOC_ORDER);
- iopm_base = 0;
+ svm_hardware_teardown();
return r;
}
-static __exit void svm_hardware_unsetup(void)
-{
- int cpu;
-
- if (svm_sev_enabled()) {
- bitmap_free(sev_asid_bitmap);
- bitmap_free(sev_reclaim_asid_bitmap);
-
- sev_flush_asids();
- }
-
- for_each_possible_cpu(cpu)
- svm_cpu_uninit(cpu);
-
- __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
- iopm_base = 0;
-}
-
static void init_seg(struct vmcb_seg *seg)
{
seg->selector = 0;
@@ -1502,7 +1547,10 @@ static void avic_init_vmcb(struct vcpu_svm *svm)
vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT;
- vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
+ if (kvm_apicv_activated(svm->vcpu.kvm))
+ vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
+ else
+ vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
}
static void init_vmcb(struct vcpu_svm *svm)
@@ -1686,23 +1734,28 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
* field of the VMCB. Therefore, we set up the
* APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here.
*/
-static int avic_init_access_page(struct kvm_vcpu *vcpu)
+static int avic_update_access_page(struct kvm *kvm, bool activate)
{
- struct kvm *kvm = vcpu->kvm;
int ret = 0;
mutex_lock(&kvm->slots_lock);
- if (kvm->arch.apic_access_page_done)
+ /*
+ * During kvm_destroy_vm(), kvm_pit_set_reinject() could trigger
+ * APICv mode change, which update APIC_ACCESS_PAGE_PRIVATE_MEMSLOT
+ * memory region. So, we need to ensure that kvm->mm == current->mm.
+ */
+ if ((kvm->arch.apic_access_page_done == activate) ||
+ (kvm->mm != current->mm))
goto out;
ret = __x86_set_memory_region(kvm,
APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
APIC_DEFAULT_PHYS_BASE,
- PAGE_SIZE);
+ activate ? PAGE_SIZE : 0);
if (ret)
goto out;
- kvm->arch.apic_access_page_done = true;
+ kvm->arch.apic_access_page_done = activate;
out:
mutex_unlock(&kvm->slots_lock);
return ret;
@@ -1710,21 +1763,24 @@ out:
static int avic_init_backing_page(struct kvm_vcpu *vcpu)
{
- int ret;
u64 *entry, new_entry;
int id = vcpu->vcpu_id;
struct vcpu_svm *svm = to_svm(vcpu);
- ret = avic_init_access_page(vcpu);
- if (ret)
- return ret;
-
if (id >= AVIC_MAX_PHYSICAL_ID_COUNT)
return -EINVAL;
if (!svm->vcpu.arch.apic->regs)
return -EINVAL;
+ if (kvm_apicv_activated(vcpu->kvm)) {
+ int ret;
+
+ ret = avic_update_access_page(vcpu->kvm, true);
+ if (ret)
+ return ret;
+ }
+
svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs);
/* Setting AVIC backing page address in the phy APIC ID table */
@@ -2009,6 +2065,18 @@ free_avic:
return err;
}
+static int svm_vm_init(struct kvm *kvm)
+{
+ if (avic) {
+ int ret = avic_vm_init(kvm);
+ if (ret)
+ return ret;
+ }
+
+ kvm_apicv_init(kvm, avic);
+ return 0;
+}
+
static inline int
avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
{
@@ -2107,7 +2175,6 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
u32 dummy;
u32 eax = 1;
- vcpu->arch.microcode_version = 0x01000065;
svm->spec_ctrl = 0;
svm->virt_spec_ctrl = 0;
@@ -2129,8 +2196,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
static int avic_init_vcpu(struct vcpu_svm *svm)
{
int ret;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
- if (!kvm_vcpu_apicv_active(&svm->vcpu))
+ if (!avic || !irqchip_in_kernel(vcpu->kvm))
return 0;
ret = avic_init_backing_page(&svm->vcpu);
@@ -2144,7 +2212,7 @@ static int avic_init_vcpu(struct vcpu_svm *svm)
return ret;
}
-static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
+static int svm_create_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm;
struct page *page;
@@ -2153,39 +2221,13 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
struct page *nested_msrpm_pages;
int err;
- BUILD_BUG_ON_MSG(offsetof(struct vcpu_svm, vcpu) != 0,
- "struct kvm_vcpu must be at offset 0 for arch usercopy region");
-
- svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
- if (!svm) {
- err = -ENOMEM;
- goto out;
- }
-
- svm->vcpu.arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache,
- GFP_KERNEL_ACCOUNT);
- if (!svm->vcpu.arch.user_fpu) {
- printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n");
- err = -ENOMEM;
- goto free_partial_svm;
- }
-
- svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
- GFP_KERNEL_ACCOUNT);
- if (!svm->vcpu.arch.guest_fpu) {
- printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n");
- err = -ENOMEM;
- goto free_user_fpu;
- }
-
- err = kvm_vcpu_init(&svm->vcpu, kvm, id);
- if (err)
- goto free_svm;
+ BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
+ svm = to_svm(vcpu);
err = -ENOMEM;
page = alloc_page(GFP_KERNEL_ACCOUNT);
if (!page)
- goto uninit;
+ goto out;
msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
if (!msrpm_pages)
@@ -2206,7 +2248,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
/* We initialize this flag to true to make sure that the is_running
* bit would be set the first time the vcpu is loaded.
*/
- svm->avic_is_running = true;
+ if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm))
+ svm->avic_is_running = true;
svm->nested.hsave = page_address(hsave_page);
@@ -2222,9 +2265,10 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
svm->asid_generation = 0;
init_vmcb(svm);
- svm_init_osvw(&svm->vcpu);
+ svm_init_osvw(vcpu);
+ vcpu->arch.microcode_version = 0x01000065;
- return &svm->vcpu;
+ return 0;
free_page4:
__free_page(hsave_page);
@@ -2234,16 +2278,8 @@ free_page2:
__free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
free_page1:
__free_page(page);
-uninit:
- kvm_vcpu_uninit(&svm->vcpu);
-free_svm:
- kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.guest_fpu);
-free_user_fpu:
- kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.user_fpu);
-free_partial_svm:
- kmem_cache_free(kvm_vcpu_cache, svm);
out:
- return ERR_PTR(err);
+ return err;
}
static void svm_clear_current_vmcb(struct vmcb *vmcb)
@@ -2269,10 +2305,6 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
__free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
__free_page(virt_to_page(svm->nested.hsave));
__free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
- kvm_vcpu_uninit(vcpu);
- kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.user_fpu);
- kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.guest_fpu);
- kmem_cache_free(kvm_vcpu_cache, svm);
}
static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -2343,6 +2375,8 @@ static void svm_vcpu_blocking(struct kvm_vcpu *vcpu)
static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
+ if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu))
+ kvm_vcpu_update_apicv(vcpu);
avic_set_running(vcpu, true);
}
@@ -4192,6 +4226,8 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_SPEC_CTRL:
if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) &&
!guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) &&
!guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
return 1;
@@ -4277,16 +4313,16 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
break;
case MSR_IA32_SPEC_CTRL:
if (!msr->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) &&
!guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) &&
!guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
return 1;
- /* The STIBP bit doesn't fault even if it's not advertised */
- if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
+ if (data & ~kvm_spec_ctrl_valid_bits(vcpu))
return 1;
svm->spec_ctrl = data;
-
if (!data)
break;
@@ -4310,13 +4346,12 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
if (data & ~PRED_CMD_IBPB)
return 1;
-
+ if (!boot_cpu_has(X86_FEATURE_AMD_IBPB))
+ return 1;
if (!data)
break;
wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
- if (is_guest_mode(vcpu))
- break;
set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
break;
case MSR_AMD64_VIRT_SPEC_CTRL:
@@ -4438,6 +4473,14 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
{
kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
svm_clear_vintr(svm);
+
+ /*
+ * For AVIC, the only reason to end up here is ExtINTs.
+ * In this case AVIC was temporarily disabled for
+ * requesting the IRQ window and we have to re-enable it.
+ */
+ svm_toggle_avic_for_irq_window(&svm->vcpu, true);
+
svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
mark_dirty(svm->vmcb, VMCB_INTR);
++svm->vcpu.stat.irq_window_exits;
@@ -4519,9 +4562,9 @@ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
*/
kvm_for_each_vcpu(i, vcpu, kvm) {
bool m = kvm_apic_match_dest(vcpu, apic,
- icrl & KVM_APIC_SHORT_MASK,
+ icrl & APIC_SHORT_MASK,
GET_APIC_DEST_FIELD(icrh),
- icrl & KVM_APIC_DEST_MASK);
+ icrl & APIC_DEST_MASK);
if (m && !avic_vcpu_is_running(vcpu))
kvm_vcpu_wake_up(vcpu);
@@ -4935,7 +4978,8 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
*info2 = control->exit_info_2;
}
-static int handle_exit(struct kvm_vcpu *vcpu)
+static int handle_exit(struct kvm_vcpu *vcpu,
+ enum exit_fastpath_completion exit_fastpath)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct kvm_run *kvm_run = vcpu->run;
@@ -4993,7 +5037,10 @@ static int handle_exit(struct kvm_vcpu *vcpu)
__func__, svm->vmcb->control.exit_int_info,
exit_code);
- if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
+ if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) {
+ kvm_skip_emulated_instruction(vcpu);
+ return 1;
+ } else if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
|| !svm_exit_handlers[exit_code]) {
vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%x\n", exit_code);
dump_vmcb(vcpu);
@@ -5129,30 +5176,82 @@ static void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
return;
}
-static bool svm_get_enable_apicv(struct kvm *kvm)
+static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
{
- return avic && irqchip_split(kvm);
}
-static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
+static void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
{
}
-static void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
+static void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate)
{
+ if (!avic || !lapic_in_kernel(vcpu))
+ return;
+
+ srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+ kvm_request_apicv_update(vcpu->kvm, activate,
+ APICV_INHIBIT_REASON_IRQWIN);
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+}
+
+static int svm_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate)
+{
+ int ret = 0;
+ unsigned long flags;
+ struct amd_svm_iommu_ir *ir;
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!kvm_arch_has_assigned_device(vcpu->kvm))
+ return 0;
+
+ /*
+ * Here, we go through the per-vcpu ir_list to update all existing
+ * interrupt remapping table entry targeting this vcpu.
+ */
+ spin_lock_irqsave(&svm->ir_list_lock, flags);
+
+ if (list_empty(&svm->ir_list))
+ goto out;
+
+ list_for_each_entry(ir, &svm->ir_list, node) {
+ if (activate)
+ ret = amd_iommu_activate_guest_mode(ir->data);
+ else
+ ret = amd_iommu_deactivate_guest_mode(ir->data);
+ if (ret)
+ break;
+ }
+out:
+ spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+ return ret;
}
-/* Note: Currently only used by Hyper-V. */
static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb *vmcb = svm->vmcb;
+ bool activated = kvm_vcpu_apicv_active(vcpu);
- if (kvm_vcpu_apicv_active(vcpu))
+ if (!avic)
+ return;
+
+ if (activated) {
+ /**
+ * During AVIC temporary deactivation, guest could update
+ * APIC ID, DFR and LDR registers, which would not be trapped
+ * by avic_unaccelerated_access_interception(). In this case,
+ * we need to check and update the AVIC logical APIC ID table
+ * accordingly before re-activating.
+ */
+ avic_post_state_restore(vcpu);
vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
- else
+ } else {
vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
+ }
mark_dirty(vmcb, VMCB_AVIC);
+
+ svm_set_pi_irte_mode(vcpu, activated);
}
static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
@@ -5160,8 +5259,11 @@ static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
return;
}
-static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
+static int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
{
+ if (!vcpu->arch.apicv_active)
+ return -1;
+
kvm_lapic_set_irr(vec, vcpu->arch.apic);
smp_mb__after_atomic();
@@ -5173,6 +5275,8 @@ static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
put_cpu();
} else
kvm_vcpu_wake_up(vcpu);
+
+ return 0;
}
static bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
@@ -5439,9 +5543,6 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (kvm_vcpu_apicv_active(vcpu))
- return;
-
/*
* In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
* 1, because that's a separate STGI/VMRUN intercept. The next time we
@@ -5451,6 +5552,13 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
* window under the assumption that the hardware will set the GIF.
*/
if ((vgif_enabled(svm) || gif_set(svm)) && nested_svm_intr(svm)) {
+ /*
+ * IRQ window is not needed when AVIC is enabled,
+ * unless we have pending ExtINT since it cannot be injected
+ * via AVIC. In such case, we need to temporarily disable AVIC,
+ * and fallback to injecting IRQ via V_IRQ.
+ */
+ svm_toggle_avic_for_irq_window(vcpu, false);
svm_set_vintr(svm);
svm_inject_irq(svm, 0x0);
}
@@ -5913,6 +6021,7 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
+ boot_cpu_has(X86_FEATURE_XSAVE) &&
boot_cpu_has(X86_FEATURE_XSAVES);
/* Update nrips enabled cache */
@@ -5922,16 +6031,24 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
return;
guest_cpuid_clear(vcpu, X86_FEATURE_X2APIC);
+
+ /*
+ * Currently, AVIC does not work with nested virtualization.
+ * So, we disable AVIC when cpuid for SVM is set in the L1 guest.
+ */
+ if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM))
+ kvm_request_apicv_update(vcpu->kvm, false,
+ APICV_INHIBIT_REASON_NESTED);
}
-#define F(x) bit(X86_FEATURE_##x)
+#define F feature_bit
static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
{
switch (func) {
case 0x1:
if (avic)
- entry->ecx &= ~bit(X86_FEATURE_X2APIC);
+ entry->ecx &= ~F(X2APIC);
break;
case 0x80000001:
if (nested)
@@ -6001,6 +6118,11 @@ static bool svm_has_wbinvd_exit(void)
return true;
}
+static bool svm_pku_supported(void)
+{
+ return false;
+}
+
#define PRE_EX(exit) { .exit_code = (exit), \
.stage = X86_ICPT_PRE_EXCEPT, }
#define POST_EX(exit) { .exit_code = (exit), \
@@ -6186,9 +6308,13 @@ out:
return ret;
}
-static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
+static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu,
+ enum exit_fastpath_completion *exit_fastpath)
{
-
+ if (!is_guest_mode(vcpu) &&
+ to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
+ to_svm(vcpu)->vmcb->control.exit_info_1)
+ *exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu);
}
static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
@@ -7242,11 +7368,27 @@ static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
(svm->vmcb->control.intercept & (1ULL << INTERCEPT_INIT));
}
+static bool svm_check_apicv_inhibit_reasons(ulong bit)
+{
+ ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
+ BIT(APICV_INHIBIT_REASON_HYPERV) |
+ BIT(APICV_INHIBIT_REASON_NESTED) |
+ BIT(APICV_INHIBIT_REASON_IRQWIN) |
+ BIT(APICV_INHIBIT_REASON_PIT_REINJ);
+
+ return supported & BIT(bit);
+}
+
+static void svm_pre_update_apicv_exec_ctrl(struct kvm *kvm, bool activate)
+{
+ avic_update_access_page(kvm, activate);
+}
+
static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
.hardware_setup = svm_hardware_setup,
- .hardware_unsetup = svm_hardware_unsetup,
+ .hardware_unsetup = svm_hardware_teardown,
.check_processor_compatibility = svm_check_processor_compat,
.hardware_enable = svm_hardware_enable,
.hardware_disable = svm_hardware_disable,
@@ -7259,7 +7401,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.vm_alloc = svm_vm_alloc,
.vm_free = svm_vm_free,
- .vm_init = avic_vm_init,
+ .vm_init = svm_vm_init,
.vm_destroy = svm_vm_destroy,
.prepare_guest_switch = svm_prepare_guest_switch,
@@ -7301,6 +7443,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.run = svm_vcpu_run,
.handle_exit = handle_exit,
.skip_emulated_instruction = skip_emulated_instruction,
+ .update_emulated_instruction = NULL,
.set_interrupt_shadow = svm_set_interrupt_shadow,
.get_interrupt_shadow = svm_get_interrupt_shadow,
.patch_hypercall = svm_patch_hypercall,
@@ -7316,8 +7459,9 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.enable_irq_window = enable_irq_window,
.update_cr8_intercept = update_cr8_intercept,
.set_virtual_apic_mode = svm_set_virtual_apic_mode,
- .get_enable_apicv = svm_get_enable_apicv,
.refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
+ .check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons,
+ .pre_update_apicv_exec_ctrl = svm_pre_update_apicv_exec_ctrl,
.load_eoi_exitmap = svm_load_eoi_exitmap,
.hwapic_irr_update = svm_hwapic_irr_update,
.hwapic_isr_update = svm_hwapic_isr_update,
@@ -7341,6 +7485,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.xsaves_supported = svm_xsaves_supported,
.umip_emulated = svm_umip_emulated,
.pt_supported = svm_pt_supported,
+ .pku_supported = svm_pku_supported,
.set_supported_cpuid = svm_set_supported_cpuid,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 7c741a0c5f80..f194dd058470 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -1291,6 +1291,25 @@ TRACE_EVENT(kvm_hv_stimer_cleanup,
__entry->vcpu_id, __entry->timer_index)
);
+TRACE_EVENT(kvm_apicv_update_request,
+ TP_PROTO(bool activate, unsigned long bit),
+ TP_ARGS(activate, bit),
+
+ TP_STRUCT__entry(
+ __field(bool, activate)
+ __field(unsigned long, bit)
+ ),
+
+ TP_fast_assign(
+ __entry->activate = activate;
+ __entry->bit = bit;
+ ),
+
+ TP_printk("%s bit=%lu",
+ __entry->activate ? "activate" : "deactivate",
+ __entry->bit)
+);
+
/*
* Tracepoint for AMD AVIC
*/
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 7aa69716d516..f486e2606247 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -12,6 +12,7 @@ extern bool __read_mostly enable_ept;
extern bool __read_mostly enable_unrestricted_guest;
extern bool __read_mostly enable_ept_ad_bits;
extern bool __read_mostly enable_pml;
+extern bool __read_mostly enable_apicv;
extern int __read_mostly pt_mode;
#define PT_MODE_SYSTEM 0
@@ -145,6 +146,11 @@ static inline bool vmx_umip_emulated(void)
SECONDARY_EXEC_DESC;
}
+static inline bool vmx_pku_supported(void)
+{
+ return boot_cpu_has(X86_FEATURE_PKU);
+}
+
static inline bool cpu_has_vmx_rdtscp(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c
index 72359709cdc1..303813423c3e 100644
--- a/arch/x86/kvm/vmx/evmcs.c
+++ b/arch/x86/kvm/vmx/evmcs.c
@@ -7,6 +7,7 @@
#include "evmcs.h"
#include "vmcs.h"
#include "vmx.h"
+#include "trace.h"
DEFINE_STATIC_KEY_FALSE(enable_evmcs);
@@ -346,26 +347,93 @@ uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu)
return 0;
}
+void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata)
+{
+ u32 ctl_low = (u32)*pdata;
+ u32 ctl_high = (u32)(*pdata >> 32);
+
+ /*
+ * Hyper-V 2016 and 2019 try using these features even when eVMCS
+ * is enabled but there are no corresponding fields.
+ */
+ switch (msr_index) {
+ case MSR_IA32_VMX_EXIT_CTLS:
+ case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+ ctl_high &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
+ break;
+ case MSR_IA32_VMX_ENTRY_CTLS:
+ case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+ ctl_high &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+ break;
+ case MSR_IA32_VMX_PROCBASED_CTLS2:
+ ctl_high &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ break;
+ }
+
+ *pdata = ctl_low | ((u64)ctl_high << 32);
+}
+
+int nested_evmcs_check_controls(struct vmcs12 *vmcs12)
+{
+ int ret = 0;
+ u32 unsupp_ctl;
+
+ unsupp_ctl = vmcs12->pin_based_vm_exec_control &
+ EVMCS1_UNSUPPORTED_PINCTRL;
+ if (unsupp_ctl) {
+ trace_kvm_nested_vmenter_failed(
+ "eVMCS: unsupported pin-based VM-execution controls",
+ unsupp_ctl);
+ ret = -EINVAL;
+ }
+
+ unsupp_ctl = vmcs12->secondary_vm_exec_control &
+ EVMCS1_UNSUPPORTED_2NDEXEC;
+ if (unsupp_ctl) {
+ trace_kvm_nested_vmenter_failed(
+ "eVMCS: unsupported secondary VM-execution controls",
+ unsupp_ctl);
+ ret = -EINVAL;
+ }
+
+ unsupp_ctl = vmcs12->vm_exit_controls &
+ EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
+ if (unsupp_ctl) {
+ trace_kvm_nested_vmenter_failed(
+ "eVMCS: unsupported VM-exit controls",
+ unsupp_ctl);
+ ret = -EINVAL;
+ }
+
+ unsupp_ctl = vmcs12->vm_entry_controls &
+ EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
+ if (unsupp_ctl) {
+ trace_kvm_nested_vmenter_failed(
+ "eVMCS: unsupported VM-entry controls",
+ unsupp_ctl);
+ ret = -EINVAL;
+ }
+
+ unsupp_ctl = vmcs12->vm_function_control & EVMCS1_UNSUPPORTED_VMFUNC;
+ if (unsupp_ctl) {
+ trace_kvm_nested_vmenter_failed(
+ "eVMCS: unsupported VM-function controls",
+ unsupp_ctl);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
int nested_enable_evmcs(struct kvm_vcpu *vcpu,
uint16_t *vmcs_version)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- bool evmcs_already_enabled = vmx->nested.enlightened_vmcs_enabled;
vmx->nested.enlightened_vmcs_enabled = true;
if (vmcs_version)
*vmcs_version = nested_get_evmcs_version(vcpu);
- /* We don't support disabling the feature for simplicity. */
- if (evmcs_already_enabled)
- return 0;
-
- vmx->nested.msrs.pinbased_ctls_high &= ~EVMCS1_UNSUPPORTED_PINCTRL;
- vmx->nested.msrs.entry_ctls_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
- vmx->nested.msrs.exit_ctls_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
- vmx->nested.msrs.secondary_ctls_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
- vmx->nested.msrs.vmfunc_controls &= ~EVMCS1_UNSUPPORTED_VMFUNC;
-
return 0;
}
diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h
index 07ebf6882a45..6de47f2569c9 100644
--- a/arch/x86/kvm/vmx/evmcs.h
+++ b/arch/x86/kvm/vmx/evmcs.h
@@ -10,6 +10,7 @@
#include "capabilities.h"
#include "vmcs.h"
+#include "vmcs12.h"
struct vmcs_config;
@@ -201,5 +202,7 @@ bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa);
uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu);
int nested_enable_evmcs(struct kvm_vcpu *vcpu,
uint16_t *vmcs_version);
+void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata);
+int nested_evmcs_check_controls(struct vmcs12 *vmcs12);
#endif /* __KVM_X86_VMX_EVMCS_H */
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 4aea7d304beb..9750e590c89d 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -28,16 +28,6 @@ module_param(nested_early_check, bool, S_IRUGO);
failed; \
})
-#define SET_MSR_OR_WARN(vcpu, idx, data) \
-({ \
- bool failed = kvm_set_msr(vcpu, idx, data); \
- if (failed) \
- pr_warn_ratelimited( \
- "%s cannot write MSR (0x%x, 0x%llx)\n", \
- __func__, idx, data); \
- failed; \
-})
-
/*
* Hyper-V requires all of these, so mark them as supported even though
* they are just treated the same as all-context.
@@ -234,7 +224,7 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
return;
kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
- vmx->nested.hv_evmcs_vmptr = -1ull;
+ vmx->nested.hv_evmcs_vmptr = 0;
vmx->nested.hv_evmcs = NULL;
}
@@ -554,7 +544,8 @@ static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
}
}
-static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) {
+static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
+{
int msr;
for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
@@ -1084,10 +1075,10 @@ static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
}
/*
- * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are
- * emulating VM entry into a guest with EPT enabled.
- * Returns 0 on success, 1 on failure. Invalid state exit qualification code
- * is assigned to entry_failure_code on failure.
+ * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are
+ * emulating VM-Entry into a guest with EPT enabled. On failure, the expected
+ * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
+ * @entry_failure_code.
*/
static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
u32 *entry_failure_code)
@@ -1932,7 +1923,8 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa))
return 1;
- if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
+ if (unlikely(!vmx->nested.hv_evmcs ||
+ evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
if (!vmx->nested.hv_evmcs)
vmx->nested.current_vmptr = -1ull;
@@ -1991,7 +1983,7 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
}
/*
- * Clean fields data can't de used on VMLAUNCH and when we switch
+ * Clean fields data can't be used on VMLAUNCH and when we switch
* between different L2 guests as KVM keeps a single VMCS12 per L1.
*/
if (from_launch || evmcs_gpa_changed)
@@ -2172,8 +2164,8 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
* EXEC CONTROLS
*/
exec_control = vmx_exec_control(vmx); /* L0's desires */
- exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
- exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
+ exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING;
+ exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING;
exec_control &= ~CPU_BASED_TPR_SHADOW;
exec_control |= vmcs12->cpu_based_vm_exec_control;
@@ -2550,8 +2542,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
- SET_MSR_OR_WARN(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
- vmcs12->guest_ia32_perf_global_ctrl))
+ WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
+ vmcs12->guest_ia32_perf_global_ctrl)))
return -EINVAL;
kvm_rsp_write(vcpu, vmcs12->guest_rsp);
@@ -2566,7 +2558,7 @@ static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
return -EINVAL;
if (CC(!nested_cpu_has_virtual_nmis(vmcs12) &&
- nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING)))
+ nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING)))
return -EINVAL;
return 0;
@@ -2767,6 +2759,9 @@ static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
nested_check_vm_entry_controls(vcpu, vmcs12))
return -EINVAL;
+ if (to_vmx(vcpu)->nested.enlightened_vmcs_enabled)
+ return nested_evmcs_check_controls(vmcs12);
+
return 0;
}
@@ -2823,7 +2818,6 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
CC(vmcs12->host_ss_selector == 0 && !ia32e))
return -EINVAL;
-#ifdef CONFIG_X86_64
if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) ||
CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) ||
CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) ||
@@ -2831,7 +2825,6 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) ||
CC(is_noncanonical_address(vmcs12->host_rip, vcpu)))
return -EINVAL;
-#endif
/*
* If the load IA32_EFER VM-exit control is 1, bits reserved in the
@@ -2899,6 +2892,10 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)))
return -EINVAL;
+ if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
+ CC(!kvm_dr7_valid(vmcs12->guest_dr7)))
+ return -EINVAL;
+
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
CC(!kvm_pat_valid(vmcs12->guest_ia32_pat)))
return -EINVAL;
@@ -3048,9 +3045,6 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
return 0;
}
-static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
- struct vmcs12 *vmcs12);
-
static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@ -3168,10 +3162,10 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
* or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume.
*
* Returns:
- * NVMX_ENTRY_SUCCESS: Entered VMX non-root mode
- * NVMX_ENTRY_VMFAIL: Consistency check VMFail
- * NVMX_ENTRY_VMEXIT: Consistency check VMExit
- * NVMX_ENTRY_KVM_INTERNAL_ERROR: KVM internal error
+ * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode
+ * NVMX_VMENTRY_VMFAIL: Consistency check VMFail
+ * NVMX_VMENTRY_VMEXIT: Consistency check VMExit
+ * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error
*/
enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
bool from_vmentry)
@@ -3183,7 +3177,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
u32 exit_qual;
evaluate_pending_interrupts = exec_controls_get(vmx) &
- (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING);
+ (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING);
if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
@@ -3230,7 +3224,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
}
enter_guest_mode(vcpu);
- if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+ if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
vcpu->arch.tsc_offset += vmcs12->tsc_offset;
if (prepare_vmcs02(vcpu, vmcs12, &exit_qual))
@@ -3294,7 +3288,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
* 26.7 "VM-entry failures during or after loading guest state".
*/
vmentry_fail_vmexit_guest_mode:
- if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+ if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
leave_guest_mode(vcpu);
@@ -3407,8 +3401,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
*/
if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) &&
!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
- !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_NMI_PENDING) &&
- !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_INTR_PENDING) &&
+ !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_NMI_WINDOW_EXITING) &&
+ !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_INTR_WINDOW_EXITING) &&
(vmcs12->guest_rflags & X86_EFLAGS_IF))) {
vmx->nested.nested_run_pending = 0;
return kvm_vcpu_halt(vcpu);
@@ -3427,7 +3421,7 @@ vmentry_failed:
/*
* On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
- * because L2 may have changed some cr0 bits directly (CRO_GUEST_HOST_MASK).
+ * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
* This function returns the new value we should put in vmcs12.guest_cr0.
* It's not enough to just return the vmcs02 GUEST_CR0. Rather,
* 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
@@ -3583,25 +3577,80 @@ static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
}
+/*
+ * Returns true if a debug trap is pending delivery.
+ *
+ * In KVM, debug traps bear an exception payload. As such, the class of a #DB
+ * exception may be inferred from the presence of an exception payload.
+ */
+static inline bool vmx_pending_dbg_trap(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.exception.pending &&
+ vcpu->arch.exception.nr == DB_VECTOR &&
+ vcpu->arch.exception.payload;
+}
+
+/*
+ * Certain VM-exits set the 'pending debug exceptions' field to indicate a
+ * recognized #DB (data or single-step) that has yet to be delivered. Since KVM
+ * represents these debug traps with a payload that is said to be compatible
+ * with the 'pending debug exceptions' field, write the payload to the VMCS
+ * field if a VM-exit is delivered before the debug trap.
+ */
+static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu)
+{
+ if (vmx_pending_dbg_trap(vcpu))
+ vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+ vcpu->arch.exception.payload);
+}
+
static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long exit_qual;
bool block_nested_events =
vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
+ bool mtf_pending = vmx->nested.mtf_pending;
struct kvm_lapic *apic = vcpu->arch.apic;
+ /*
+ * Clear the MTF state. If a higher priority VM-exit is delivered first,
+ * this state is discarded.
+ */
+ vmx->nested.mtf_pending = false;
+
if (lapic_in_kernel(vcpu) &&
test_bit(KVM_APIC_INIT, &apic->pending_events)) {
if (block_nested_events)
return -EBUSY;
+ nested_vmx_update_pending_dbg(vcpu);
clear_bit(KVM_APIC_INIT, &apic->pending_events);
nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);
return 0;
}
+ /*
+ * Process any exceptions that are not debug traps before MTF.
+ */
+ if (vcpu->arch.exception.pending &&
+ !vmx_pending_dbg_trap(vcpu) &&
+ nested_vmx_check_exception(vcpu, &exit_qual)) {
+ if (block_nested_events)
+ return -EBUSY;
+ nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
+ return 0;
+ }
+
+ if (mtf_pending) {
+ if (block_nested_events)
+ return -EBUSY;
+ nested_vmx_update_pending_dbg(vcpu);
+ nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0);
+ return 0;
+ }
+
if (vcpu->arch.exception.pending &&
- nested_vmx_check_exception(vcpu, &exit_qual)) {
+ nested_vmx_check_exception(vcpu, &exit_qual)) {
if (block_nested_events)
return -EBUSY;
nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
@@ -3999,8 +4048,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
vcpu->arch.pat = vmcs12->host_ia32_pat;
}
if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
- SET_MSR_OR_WARN(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
- vmcs12->host_ia32_perf_global_ctrl);
+ WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
+ vmcs12->host_ia32_perf_global_ctrl));
/* Set L1 segment info according to Intel SDM
27.5.2 Loading Host Segment and Descriptor-Table Registers */
@@ -4209,7 +4258,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
if (nested_cpu_has_preemption_timer(vmcs12))
hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
- if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+ if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
if (likely(!vmx->fail)) {
@@ -4588,8 +4637,8 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
gpa_t vmptr;
uint32_t revision;
struct vcpu_vmx *vmx = to_vmx(vcpu);
- const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
- | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+ const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED
+ | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
/*
* The Intel VMX Instruction Reference lists a bunch of bits that are
@@ -4734,8 +4783,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
return nested_vmx_succeed(vcpu);
}
-static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
-
/* Emulate the VMLAUNCH instruction */
static int handle_vmlaunch(struct kvm_vcpu *vcpu)
{
@@ -4751,36 +4798,32 @@ static int handle_vmresume(struct kvm_vcpu *vcpu)
static int handle_vmread(struct kvm_vcpu *vcpu)
{
- unsigned long field;
- u64 field_value;
+ struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
+ : get_vmcs12(vcpu);
unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
- int len;
- gva_t gva = 0;
- struct vmcs12 *vmcs12;
+ u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
struct x86_exception e;
+ unsigned long field;
+ u64 value;
+ gva_t gva = 0;
short offset;
+ int len;
if (!nested_vmx_check_permission(vcpu))
return 1;
- if (to_vmx(vcpu)->nested.current_vmptr == -1ull)
+ /*
+ * In VMX non-root operation, when the VMCS-link pointer is -1ull,
+ * any VMREAD sets the ALU flags for VMfailInvalid.
+ */
+ if (vmx->nested.current_vmptr == -1ull ||
+ (is_guest_mode(vcpu) &&
+ get_vmcs12(vcpu)->vmcs_link_pointer == -1ull))
return nested_vmx_failInvalid(vcpu);
- if (!is_guest_mode(vcpu))
- vmcs12 = get_vmcs12(vcpu);
- else {
- /*
- * When vmcs->vmcs_link_pointer is -1ull, any VMREAD
- * to shadowed-field sets the ALU flags for VMfailInvalid.
- */
- if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
- return nested_vmx_failInvalid(vcpu);
- vmcs12 = get_shadow_vmcs12(vcpu);
- }
-
/* Decode instruction info and find the field to read */
- field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
+ field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf));
offset = vmcs_field_to_offset(field);
if (offset < 0)
@@ -4790,25 +4833,26 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
- /* Read the field, zero-extended to a u64 field_value */
- field_value = vmcs12_read_any(vmcs12, field, offset);
+ /* Read the field, zero-extended to a u64 value */
+ value = vmcs12_read_any(vmcs12, field, offset);
/*
* Now copy part of this value to register or memory, as requested.
* Note that the number of bits actually copied is 32 or 64 depending
* on the guest's mode (32 or 64 bit), not on the given field's length.
*/
- if (vmx_instruction_info & (1u << 10)) {
- kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
- field_value);
+ if (instr_info & BIT(10)) {
+ kvm_register_writel(vcpu, (((instr_info) >> 3) & 0xf), value);
} else {
len = is_64_bit_mode(vcpu) ? 8 : 4;
if (get_vmx_mem_address(vcpu, exit_qualification,
- vmx_instruction_info, true, len, &gva))
+ instr_info, true, len, &gva))
return 1;
/* _system ok, nested_vmx_check_permission has verified cpl=0 */
- if (kvm_write_guest_virt_system(vcpu, gva, &field_value, len, &e))
+ if (kvm_write_guest_virt_system(vcpu, gva, &value, len, &e)) {
kvm_inject_page_fault(vcpu, &e);
+ return 1;
+ }
}
return nested_vmx_succeed(vcpu);
@@ -4840,46 +4884,58 @@ static bool is_shadow_field_ro(unsigned long field)
static int handle_vmwrite(struct kvm_vcpu *vcpu)
{
+ struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
+ : get_vmcs12(vcpu);
+ unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct x86_exception e;
unsigned long field;
- int len;
+ short offset;
gva_t gva;
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ int len;
- /* The value to write might be 32 or 64 bits, depending on L1's long
+ /*
+ * The value to write might be 32 or 64 bits, depending on L1's long
* mode, and eventually we need to write that into a field of several
* possible lengths. The code below first zero-extends the value to 64
- * bit (field_value), and then copies only the appropriate number of
+ * bit (value), and then copies only the appropriate number of
* bits into the vmcs12 field.
*/
- u64 field_value = 0;
- struct x86_exception e;
- struct vmcs12 *vmcs12;
- short offset;
+ u64 value = 0;
if (!nested_vmx_check_permission(vcpu))
return 1;
- if (vmx->nested.current_vmptr == -1ull)
+ /*
+ * In VMX non-root operation, when the VMCS-link pointer is -1ull,
+ * any VMWRITE sets the ALU flags for VMfailInvalid.
+ */
+ if (vmx->nested.current_vmptr == -1ull ||
+ (is_guest_mode(vcpu) &&
+ get_vmcs12(vcpu)->vmcs_link_pointer == -1ull))
return nested_vmx_failInvalid(vcpu);
- if (vmx_instruction_info & (1u << 10))
- field_value = kvm_register_readl(vcpu,
- (((vmx_instruction_info) >> 3) & 0xf));
+ if (instr_info & BIT(10))
+ value = kvm_register_readl(vcpu, (((instr_info) >> 3) & 0xf));
else {
len = is_64_bit_mode(vcpu) ? 8 : 4;
if (get_vmx_mem_address(vcpu, exit_qualification,
- vmx_instruction_info, false, len, &gva))
+ instr_info, false, len, &gva))
return 1;
- if (kvm_read_guest_virt(vcpu, gva, &field_value, len, &e)) {
+ if (kvm_read_guest_virt(vcpu, gva, &value, len, &e)) {
kvm_inject_page_fault(vcpu, &e);
return 1;
}
}
+ field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf));
+
+ offset = vmcs_field_to_offset(field);
+ if (offset < 0)
+ return nested_vmx_failValid(vcpu,
+ VMXERR_UNSUPPORTED_VMCS_COMPONENT);
- field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
/*
* If the vCPU supports "VMWRITE to any supported field in the
* VMCS," then the "read-only" fields are actually read/write.
@@ -4889,29 +4945,12 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
return nested_vmx_failValid(vcpu,
VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
- if (!is_guest_mode(vcpu)) {
- vmcs12 = get_vmcs12(vcpu);
-
- /*
- * Ensure vmcs12 is up-to-date before any VMWRITE that dirties
- * vmcs12, else we may crush a field or consume a stale value.
- */
- if (!is_shadow_field_rw(field))
- copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
- } else {
- /*
- * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE
- * to shadowed-field sets the ALU flags for VMfailInvalid.
- */
- if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
- return nested_vmx_failInvalid(vcpu);
- vmcs12 = get_shadow_vmcs12(vcpu);
- }
-
- offset = vmcs_field_to_offset(field);
- if (offset < 0)
- return nested_vmx_failValid(vcpu,
- VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+ /*
+ * Ensure vmcs12 is up-to-date before any VMWRITE that dirties
+ * vmcs12, else we may crush a field or consume a stale value.
+ */
+ if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field))
+ copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
/*
* Some Intel CPUs intentionally drop the reserved bits of the AR byte
@@ -4922,9 +4961,9 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
* the stripped down value, L2 sees the full value as stored by KVM).
*/
if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES)
- field_value &= 0x1f0ff;
+ value &= 0x1f0ff;
- vmcs12_write_any(vmcs12, field, offset, field_value);
+ vmcs12_write_any(vmcs12, field, offset, value);
/*
* Do not track vmcs12 dirty-state if in guest-mode as we actually
@@ -4941,7 +4980,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
preempt_disable();
vmcs_load(vmx->vmcs01.shadow_vmcs);
- __vmcs_writel(field, field_value);
+ __vmcs_writel(field, value);
vmcs_clear(vmx->vmcs01.shadow_vmcs);
vmcs_load(vmx->loaded_vmcs->vmcs);
@@ -5274,24 +5313,17 @@ fail:
return 1;
}
-
-static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
- struct vmcs12 *vmcs12)
+/*
+ * Return true if an IO instruction with the specified port and size should cause
+ * a VM-exit into L1.
+ */
+bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
+ int size)
{
- unsigned long exit_qualification;
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
gpa_t bitmap, last_bitmap;
- unsigned int port;
- int size;
u8 b;
- if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
- return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
-
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-
- port = exit_qualification >> 16;
- size = (exit_qualification & 7) + 1;
-
last_bitmap = (gpa_t)-1;
b = -1;
@@ -5318,8 +5350,26 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
return false;
}
+static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ unsigned long exit_qualification;
+ unsigned short port;
+ int size;
+
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+ return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
+
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+ port = exit_qualification >> 16;
+ size = (exit_qualification & 7) + 1;
+
+ return nested_vmx_check_io_bitmaps(vcpu, port, size);
+}
+
/*
- * Return 1 if we should exit from L2 to L1 to handle an MSR access access,
+ * Return 1 if we should exit from L2 to L1 to handle an MSR access,
* rather than handle it ourselves in L0. I.e., check whether L1 expressed
* disinterest in the current event (read or write a specific MSR) by using an
* MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
@@ -5524,10 +5574,10 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
return false;
case EXIT_REASON_TRIPLE_FAULT:
return true;
- case EXIT_REASON_PENDING_INTERRUPT:
- return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING);
+ case EXIT_REASON_INTERRUPT_WINDOW:
+ return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING);
case EXIT_REASON_NMI_WINDOW:
- return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING);
+ return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING);
case EXIT_REASON_TASK_SWITCH:
return true;
case EXIT_REASON_CPUID:
@@ -5701,6 +5751,9 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
if (vmx->nested.nested_run_pending)
kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
+
+ if (vmx->nested.mtf_pending)
+ kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING;
}
}
@@ -5881,6 +5934,9 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
vmx->nested.nested_run_pending =
!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
+ vmx->nested.mtf_pending =
+ !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING);
+
ret = -EINVAL;
if (nested_cpu_has_shadow_vmcs(vmcs12) &&
vmcs12->vmcs_link_pointer != -1ull) {
@@ -5938,8 +5994,7 @@ void nested_vmx_set_vmcs_shadowing_bitmap(void)
* bit in the high half is on if the corresponding bit in the control field
* may be on. See also vmx_control_verify().
*/
-void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps,
- bool apicv)
+void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
{
/*
* Note that as a general rule, the high half of the MSRs (bits in
@@ -5966,7 +6021,7 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps,
PIN_BASED_EXT_INTR_MASK |
PIN_BASED_NMI_EXITING |
PIN_BASED_VIRTUAL_NMIS |
- (apicv ? PIN_BASED_POSTED_INTR : 0);
+ (enable_apicv ? PIN_BASED_POSTED_INTR : 0);
msrs->pinbased_ctls_high |=
PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
PIN_BASED_VMX_PREEMPTION_TIMER;
@@ -6015,8 +6070,8 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps,
msrs->procbased_ctls_low =
CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
msrs->procbased_ctls_high &=
- CPU_BASED_VIRTUAL_INTR_PENDING |
- CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
+ CPU_BASED_INTR_WINDOW_EXITING |
+ CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_CR3_STORE_EXITING |
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index fc874d4ead0f..9aeda46f473e 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -17,8 +17,7 @@ enum nvmx_vmentry_status {
};
void vmx_leave_nested(struct kvm_vcpu *vcpu);
-void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps,
- bool apicv);
+void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps);
void nested_vmx_hardware_unsetup(void);
__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *));
void nested_vmx_set_vmcs_shadowing_bitmap(void);
@@ -34,6 +33,8 @@ int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata);
int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
u32 vmx_instruction_info, bool wr, int len, gva_t *ret);
void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu);
+bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
+ int size);
static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
{
@@ -175,6 +176,11 @@ static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
}
+static inline int nested_cpu_has_mtf(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG);
+}
+
static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
{
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 7023138b1cb0..fd21cdb10b79 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -86,10 +86,14 @@ static unsigned intel_find_arch_event(struct kvm_pmu *pmu,
static unsigned intel_find_fixed_event(int idx)
{
- if (idx >= ARRAY_SIZE(fixed_pmc_events))
+ u32 event;
+ size_t size = ARRAY_SIZE(fixed_pmc_events);
+
+ if (idx >= size)
return PERF_COUNT_HW_MAX;
- return intel_arch_events[fixed_pmc_events[idx]].event_type;
+ event = fixed_pmc_events[array_index_nospec(idx, size)];
+ return intel_arch_events[event].event_type;
}
/* check if a PMC is enabled by comparing it with globl_ctrl bits. */
@@ -130,16 +134,20 @@ static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
bool fixed = idx & (1u << 30);
struct kvm_pmc *counters;
+ unsigned int num_counters;
idx &= ~(3u << 30);
- if (!fixed && idx >= pmu->nr_arch_gp_counters)
- return NULL;
- if (fixed && idx >= pmu->nr_arch_fixed_counters)
+ if (fixed) {
+ counters = pmu->fixed_counters;
+ num_counters = pmu->nr_arch_fixed_counters;
+ } else {
+ counters = pmu->gp_counters;
+ num_counters = pmu->nr_arch_gp_counters;
+ }
+ if (idx >= num_counters)
return NULL;
- counters = fixed ? pmu->fixed_counters : pmu->gp_counters;
*mask &= pmu->counter_bitmask[fixed ? KVM_PMC_FIXED : KVM_PMC_GP];
-
- return &counters[idx];
+ return &counters[array_index_nospec(idx, num_counters)];
}
static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
@@ -252,13 +260,12 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
default:
if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) {
- if (msr_info->host_initiated)
- pmc->counter = data;
- else
- pmc->counter = (s32)data;
+ if (!msr_info->host_initiated)
+ data = (s64)(s32)data;
+ pmc->counter += data - pmc_read_counter(pmc);
return 0;
} else if ((pmc = get_fixed_pmc(pmu, msr))) {
- pmc->counter = data;
+ pmc->counter += data - pmc_read_counter(pmc);
return 0;
} else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
if (data == pmc->eventsel)
diff --git a/arch/x86/kvm/vmx/vmcs_shadow_fields.h b/arch/x86/kvm/vmx/vmcs_shadow_fields.h
index eb1ecd16fd22..cad128d1657b 100644
--- a/arch/x86/kvm/vmx/vmcs_shadow_fields.h
+++ b/arch/x86/kvm/vmx/vmcs_shadow_fields.h
@@ -23,12 +23,12 @@ BUILD_BUG_ON(1)
*
* When adding or removing fields here, note that shadowed
* fields must always be synced by prepare_vmcs02, not just
- * prepare_vmcs02_full.
+ * prepare_vmcs02_rare.
*/
/*
* Keeping the fields ordered by size is an attempt at improving
- * branch prediction in vmcs_read_any and vmcs_write_any.
+ * branch prediction in vmcs12_read_any and vmcs12_write_any.
*/
/* 16-bits */
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index e3394c839dea..26f8f31563e9 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -64,11 +64,13 @@
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
+#ifdef MODULE
static const struct x86_cpu_id vmx_cpu_id[] = {
X86_FEATURE_MATCH(X86_FEATURE_VMX),
{}
};
MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
+#endif
bool __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);
@@ -95,7 +97,7 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, S_IRUGO);
-static bool __read_mostly enable_apicv = 1;
+bool __read_mostly enable_apicv = 1;
module_param(enable_apicv, bool, S_IRUGO);
/*
@@ -1057,6 +1059,12 @@ static unsigned long segment_base(u16 selector)
}
#endif
+static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
+{
+ return (pt_mode == PT_MODE_HOST_GUEST) &&
+ !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
+}
+
static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
{
u32 i;
@@ -1169,6 +1177,10 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
vmx->guest_msrs[i].mask);
}
+
+ if (vmx->nested.need_vmcs12_to_shadow_sync)
+ nested_sync_vmcs12_to_shadow(vcpu);
+
if (vmx->guest_state_loaded)
return;
@@ -1422,8 +1434,6 @@ static bool emulation_required(struct kvm_vcpu *vcpu)
return emulate_invalid_guest_state && !guest_state_valid(vcpu);
}
-static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
-
unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -1595,6 +1605,40 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
return 1;
}
+
+/*
+ * Recognizes a pending MTF VM-exit and records the nested state for later
+ * delivery.
+ */
+static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!is_guest_mode(vcpu))
+ return;
+
+ /*
+ * Per the SDM, MTF takes priority over debug-trap exceptions besides
+ * T-bit traps. As instruction emulation is completed (i.e. at the
+ * instruction boundary), any #DB exception pending delivery must be a
+ * debug-trap. Record the pending MTF state to be delivered in
+ * vmx_check_nested_events().
+ */
+ if (nested_cpu_has_mtf(vmcs12) &&
+ (!vcpu->arch.exception.pending ||
+ vcpu->arch.exception.nr == DB_VECTOR))
+ vmx->nested.mtf_pending = true;
+ else
+ vmx->nested.mtf_pending = false;
+}
+
+static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+ vmx_update_emulated_instruction(vcpu);
+ return skip_emulated_instruction(vcpu);
+}
+
static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
{
/*
@@ -1716,7 +1760,7 @@ static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
if (is_guest_mode(vcpu) &&
- (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
+ (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING))
return vcpu->arch.tsc_offset - vmcs12->tsc_offset;
return vcpu->arch.tsc_offset;
@@ -1734,7 +1778,7 @@ static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
* to the newly set TSC to get L2's TSC.
*/
if (is_guest_mode(vcpu) &&
- (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
+ (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING))
g_tsc_offset = vmcs12->tsc_offset;
trace_kvm_write_tsc_offset(vcpu->vcpu_id,
@@ -1773,8 +1817,6 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
default:
return 1;
}
-
- return 0;
}
/*
@@ -1839,18 +1881,30 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_MCG_EXT_CTL:
if (!msr_info->host_initiated &&
!(vmx->msr_ia32_feature_control &
- FEATURE_CONTROL_LMCE))
+ FEAT_CTL_LMCE_ENABLED))
return 1;
msr_info->data = vcpu->arch.mcg_ext_ctl;
break;
- case MSR_IA32_FEATURE_CONTROL:
+ case MSR_IA32_FEAT_CTL:
msr_info->data = vmx->msr_ia32_feature_control;
break;
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
if (!nested_vmx_allowed(vcpu))
return 1;
- return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
- &msr_info->data);
+ if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
+ &msr_info->data))
+ return 1;
+ /*
+ * Enlightened VMCS v1 doesn't have certain fields, but buggy
+ * Hyper-V versions are still trying to use corresponding
+ * features when they are exposed. Filter out the essential
+ * minimum.
+ */
+ if (!msr_info->host_initiated &&
+ vmx->nested.enlightened_vmcs_enabled)
+ nested_evmcs_filter_control_msr(msr_info->index,
+ &msr_info->data);
+ break;
case MSR_IA32_RTIT_CTL:
if (pt_mode != PT_MODE_HOST_GUEST)
return 1;
@@ -1916,7 +1970,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
}
/*
- * Writes msr value into into the appropriate "register".
+ * Writes msr value into the appropriate "register".
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
@@ -1994,12 +2048,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
return 1;
- /* The STIBP bit doesn't fault even if it's not advertised */
- if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
+ if (data & ~kvm_spec_ctrl_valid_bits(vcpu))
return 1;
vmx->spec_ctrl = data;
-
if (!data)
break;
@@ -2010,7 +2062,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
*
* For nested:
* The handling of the MSR bitmap for L2 guests is done in
- * nested_vmx_merge_msr_bitmap. We should not touch the
+ * nested_vmx_prepare_msr_bitmap. We should not touch the
* vmcs02.msr_bitmap here since it gets completely overwritten
* in the merging. We update the vmcs01 here for L1 as well
* since it will end up touching the MSR anyway now.
@@ -2033,7 +2085,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (data & ~PRED_CMD_IBPB)
return 1;
-
+ if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL))
+ return 1;
if (!data)
break;
@@ -2046,7 +2099,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
*
* For nested:
* The handling of the MSR bitmap for L2 guests is done in
- * nested_vmx_merge_msr_bitmap. We should not touch the
+ * nested_vmx_prepare_msr_bitmap. We should not touch the
* vmcs02.msr_bitmap here since it gets completely overwritten
* in the merging.
*/
@@ -2074,15 +2127,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_MCG_EXT_CTL:
if ((!msr_info->host_initiated &&
!(to_vmx(vcpu)->msr_ia32_feature_control &
- FEATURE_CONTROL_LMCE)) ||
+ FEAT_CTL_LMCE_ENABLED)) ||
(data & ~MCG_EXT_CTL_LMCE_EN))
return 1;
vcpu->arch.mcg_ext_ctl = data;
break;
- case MSR_IA32_FEATURE_CONTROL:
+ case MSR_IA32_FEAT_CTL:
if (!vmx_feature_control_msr_valid(vcpu, data) ||
(to_vmx(vcpu)->msr_ia32_feature_control &
- FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
+ FEAT_CTL_LOCKED && !msr_info->host_initiated))
return 1;
vmx->msr_ia32_feature_control = data;
if (msr_info->host_initiated && data == 0)
@@ -2104,47 +2157,50 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
pt_update_intercept_for_msr(vmx);
break;
case MSR_IA32_RTIT_STATUS:
- if ((pt_mode != PT_MODE_HOST_GUEST) ||
- (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
- (data & MSR_IA32_RTIT_STATUS_MASK))
+ if (!pt_can_write_msr(vmx))
+ return 1;
+ if (data & MSR_IA32_RTIT_STATUS_MASK)
return 1;
vmx->pt_desc.guest.status = data;
break;
case MSR_IA32_RTIT_CR3_MATCH:
- if ((pt_mode != PT_MODE_HOST_GUEST) ||
- (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
- !intel_pt_validate_cap(vmx->pt_desc.caps,
- PT_CAP_cr3_filtering))
+ if (!pt_can_write_msr(vmx))
+ return 1;
+ if (!intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_cr3_filtering))
return 1;
vmx->pt_desc.guest.cr3_match = data;
break;
case MSR_IA32_RTIT_OUTPUT_BASE:
- if ((pt_mode != PT_MODE_HOST_GUEST) ||
- (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
- (!intel_pt_validate_cap(vmx->pt_desc.caps,
- PT_CAP_topa_output) &&
- !intel_pt_validate_cap(vmx->pt_desc.caps,
- PT_CAP_single_range_output)) ||
- (data & MSR_IA32_RTIT_OUTPUT_BASE_MASK))
+ if (!pt_can_write_msr(vmx))
+ return 1;
+ if (!intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_topa_output) &&
+ !intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_single_range_output))
+ return 1;
+ if (data & MSR_IA32_RTIT_OUTPUT_BASE_MASK)
return 1;
vmx->pt_desc.guest.output_base = data;
break;
case MSR_IA32_RTIT_OUTPUT_MASK:
- if ((pt_mode != PT_MODE_HOST_GUEST) ||
- (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
- (!intel_pt_validate_cap(vmx->pt_desc.caps,
- PT_CAP_topa_output) &&
- !intel_pt_validate_cap(vmx->pt_desc.caps,
- PT_CAP_single_range_output)))
+ if (!pt_can_write_msr(vmx))
+ return 1;
+ if (!intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_topa_output) &&
+ !intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_single_range_output))
return 1;
vmx->pt_desc.guest.output_mask = data;
break;
case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
+ if (!pt_can_write_msr(vmx))
+ return 1;
index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
- if ((pt_mode != PT_MODE_HOST_GUEST) ||
- (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
- (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
- PT_CAP_num_address_ranges)))
+ if (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_num_address_ranges))
+ return 1;
+ if (is_noncanonical_address(data, vcpu))
return 1;
if (index % 2)
vmx->pt_desc.guest.addr_b[index / 2] = data;
@@ -2204,29 +2260,8 @@ static __init int cpu_has_kvm_support(void)
static __init int vmx_disabled_by_bios(void)
{
- u64 msr;
-
- rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
- if (msr & FEATURE_CONTROL_LOCKED) {
- /* launched w/ TXT and VMX disabled */
- if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
- && tboot_enabled())
- return 1;
- /* launched w/o TXT and VMX only enabled w/ TXT */
- if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
- && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
- && !tboot_enabled()) {
- printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
- "activate TXT before enabling KVM\n");
- return 1;
- }
- /* launched w/o TXT and VMX disabled */
- if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
- && !tboot_enabled())
- return 1;
- }
-
- return 0;
+ return !boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
+ !boot_cpu_has(X86_FEATURE_VMX);
}
static void kvm_cpu_vmxon(u64 addr)
@@ -2241,7 +2276,6 @@ static int hardware_enable(void)
{
int cpu = raw_smp_processor_id();
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
- u64 old, test_bits;
if (cr4_read_shadow() & X86_CR4_VMXE)
return -EBUSY;
@@ -2269,17 +2303,6 @@ static int hardware_enable(void)
*/
crash_enable_local_vmclear(cpu);
- rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
-
- test_bits = FEATURE_CONTROL_LOCKED;
- test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
- if (tboot_enabled())
- test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
-
- if ((old & test_bits) != test_bits) {
- /* enable and lock */
- wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
- }
kvm_cpu_vmxon(phys_addr);
if (enable_ept)
ept_sync_global();
@@ -2315,6 +2338,17 @@ static void hardware_disable(void)
kvm_cpu_vmxoff();
}
+/*
+ * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID
+ * directly instead of going through cpu_has(), to ensure KVM is trapping
+ * ENCLS whenever it's supported in hardware. It does not matter whether
+ * the host OS supports or has enabled SGX.
+ */
+static bool cpu_has_sgx(void)
+{
+ return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0));
+}
+
static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
u32 msr, u32 *result)
{
@@ -2355,7 +2389,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
CPU_BASED_CR3_STORE_EXITING |
CPU_BASED_UNCOND_IO_EXITING |
CPU_BASED_MOV_DR_EXITING |
- CPU_BASED_USE_TSC_OFFSETING |
+ CPU_BASED_USE_TSC_OFFSETTING |
CPU_BASED_MWAIT_EXITING |
CPU_BASED_MONITOR_EXITING |
CPU_BASED_INVLPG_EXITING |
@@ -2395,8 +2429,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
SECONDARY_EXEC_PT_USE_GPA |
SECONDARY_EXEC_PT_CONCEAL_VMX |
- SECONDARY_EXEC_ENABLE_VMFUNC |
- SECONDARY_EXEC_ENCLS_EXITING;
+ SECONDARY_EXEC_ENABLE_VMFUNC;
+ if (cpu_has_sgx())
+ opt2 |= SECONDARY_EXEC_ENCLS_EXITING;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
@@ -2690,8 +2725,6 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
vmx->rmode.vm86_active = 0;
- vmx_segment_cache_clear(vmx);
-
vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
flags = vmcs_readl(GUEST_RFLAGS);
@@ -2966,6 +2999,9 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
static int get_ept_level(struct kvm_vcpu *vcpu)
{
+ /* Nested EPT currently only supports 4-level walks. */
+ if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu)))
+ return 4;
if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
return 5;
return 4;
@@ -3477,7 +3513,7 @@ out:
static int init_rmode_identity_map(struct kvm *kvm)
{
struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
- int i, idx, r = 0;
+ int i, r = 0;
kvm_pfn_t identity_map_pfn;
u32 tmp;
@@ -3485,7 +3521,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
mutex_lock(&kvm->slots_lock);
if (likely(kvm_vmx->ept_identity_pagetable_done))
- goto out2;
+ goto out;
if (!kvm_vmx->ept_identity_map_addr)
kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
@@ -3494,9 +3530,8 @@ static int init_rmode_identity_map(struct kvm *kvm)
r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
kvm_vmx->ept_identity_map_addr, PAGE_SIZE);
if (r < 0)
- goto out2;
+ goto out;
- idx = srcu_read_lock(&kvm->srcu);
r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
if (r < 0)
goto out;
@@ -3512,9 +3547,6 @@ static int init_rmode_identity_map(struct kvm *kvm)
kvm_vmx->ept_identity_pagetable_done = true;
out:
- srcu_read_unlock(&kvm->srcu, idx);
-
-out2:
mutex_unlock(&kvm->slots_lock);
return r;
}
@@ -3752,11 +3784,6 @@ void pt_update_intercept_for_msr(struct vcpu_vmx *vmx)
}
}
-static bool vmx_get_enable_apicv(struct kvm *kvm)
-{
- return enable_apicv;
-}
-
static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3843,24 +3870,29 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
* 2. If target vcpu isn't running(root mode), kick it to pick up the
* interrupt from PIR in next vmentry.
*/
-static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
+static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int r;
r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
if (!r)
- return;
+ return 0;
+
+ if (!vcpu->arch.apicv_active)
+ return -1;
if (pi_test_and_set_pir(vector, &vmx->pi_desc))
- return;
+ return 0;
/* If a previous notification has sent the IPI, nothing to do. */
if (pi_test_and_set_on(&vmx->pi_desc))
- return;
+ return 0;
if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
kvm_vcpu_kick(vcpu);
+
+ return 0;
}
/*
@@ -4042,6 +4074,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
if (vmx_xsaves_supported()) {
/* Exposing XSAVES only when XSAVE is exposed */
bool xsaves_enabled =
+ boot_cpu_has(X86_FEATURE_XSAVE) &&
guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
@@ -4265,7 +4298,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmx->msr_ia32_umwait_control = 0;
- vcpu->arch.microcode_version = 0x100000000ULL;
vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
vmx->hv_deadline_tsc = -1;
kvm_set_cr8(vcpu, 0);
@@ -4352,7 +4384,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
static void enable_irq_window(struct kvm_vcpu *vcpu)
{
- exec_controls_setbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_INTR_PENDING);
+ exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
}
static void enable_nmi_window(struct kvm_vcpu *vcpu)
@@ -4363,7 +4395,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
return;
}
- exec_controls_setbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_NMI_PENDING);
+ exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
}
static void vmx_inject_irq(struct kvm_vcpu *vcpu)
@@ -4488,8 +4520,11 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
if (enable_unrestricted_guest)
return 0;
- ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
- PAGE_SIZE * 3);
+ mutex_lock(&kvm->slots_lock);
+ ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
+ PAGE_SIZE * 3);
+ mutex_unlock(&kvm->slots_lock);
+
if (ret)
return ret;
to_kvm_vmx(kvm)->tss_addr = addr;
@@ -4971,7 +5006,7 @@ static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
static int handle_interrupt_window(struct kvm_vcpu *vcpu)
{
- exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_INTR_PENDING);
+ exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -5184,7 +5219,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
static int handle_nmi_window(struct kvm_vcpu *vcpu)
{
WARN_ON_ONCE(!enable_vnmi);
- exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_NMI_PENDING);
+ exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
++vcpu->stat.nmi_window_exits;
kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -5205,7 +5240,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending);
intr_window_requested = exec_controls_get(vmx) &
- CPU_BASED_VIRTUAL_INTR_PENDING;
+ CPU_BASED_INTR_WINDOW_EXITING;
while (vmx->emulation_required && count-- != 0) {
if (intr_window_requested && vmx_interrupt_allowed(vcpu))
@@ -5529,7 +5564,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_CPUID] = kvm_emulate_cpuid,
[EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr,
[EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr,
- [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
+ [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window,
[EXIT_REASON_HLT] = kvm_emulate_halt,
[EXIT_REASON_INVD] = handle_invd,
[EXIT_REASON_INVLPG] = handle_invlpg,
@@ -5816,7 +5851,8 @@ void dump_vmcs(void)
* The guest has exited. See if we can fix it or if we need userspace
* assistance.
*/
-static int vmx_handle_exit(struct kvm_vcpu *vcpu)
+static int vmx_handle_exit(struct kvm_vcpu *vcpu,
+ enum exit_fastpath_completion exit_fastpath)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 exit_reason = vmx->exit_reason;
@@ -5902,34 +5938,44 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
}
}
- if (exit_reason < kvm_vmx_max_exit_handlers
- && kvm_vmx_exit_handlers[exit_reason]) {
+ if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) {
+ kvm_skip_emulated_instruction(vcpu);
+ return 1;
+ }
+
+ if (exit_reason >= kvm_vmx_max_exit_handlers)
+ goto unexpected_vmexit;
#ifdef CONFIG_RETPOLINE
- if (exit_reason == EXIT_REASON_MSR_WRITE)
- return kvm_emulate_wrmsr(vcpu);
- else if (exit_reason == EXIT_REASON_PREEMPTION_TIMER)
- return handle_preemption_timer(vcpu);
- else if (exit_reason == EXIT_REASON_PENDING_INTERRUPT)
- return handle_interrupt_window(vcpu);
- else if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
- return handle_external_interrupt(vcpu);
- else if (exit_reason == EXIT_REASON_HLT)
- return kvm_emulate_halt(vcpu);
- else if (exit_reason == EXIT_REASON_EPT_MISCONFIG)
- return handle_ept_misconfig(vcpu);
+ if (exit_reason == EXIT_REASON_MSR_WRITE)
+ return kvm_emulate_wrmsr(vcpu);
+ else if (exit_reason == EXIT_REASON_PREEMPTION_TIMER)
+ return handle_preemption_timer(vcpu);
+ else if (exit_reason == EXIT_REASON_INTERRUPT_WINDOW)
+ return handle_interrupt_window(vcpu);
+ else if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
+ return handle_external_interrupt(vcpu);
+ else if (exit_reason == EXIT_REASON_HLT)
+ return kvm_emulate_halt(vcpu);
+ else if (exit_reason == EXIT_REASON_EPT_MISCONFIG)
+ return handle_ept_misconfig(vcpu);
#endif
- return kvm_vmx_exit_handlers[exit_reason](vcpu);
- } else {
- vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
- exit_reason);
- dump_vmcs();
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror =
+
+ exit_reason = array_index_nospec(exit_reason,
+ kvm_vmx_max_exit_handlers);
+ if (!kvm_vmx_exit_handlers[exit_reason])
+ goto unexpected_vmexit;
+
+ return kvm_vmx_exit_handlers[exit_reason](vcpu);
+
+unexpected_vmexit:
+ vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", exit_reason);
+ dump_vmcs();
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror =
KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
- vcpu->run->internal.ndata = 1;
- vcpu->run->internal.data[0] = exit_reason;
- return 0;
- }
+ vcpu->run->internal.ndata = 1;
+ vcpu->run->internal.data[0] = exit_reason;
+ return 0;
}
/*
@@ -6250,7 +6296,8 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
}
STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff);
-static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
+static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu,
+ enum exit_fastpath_completion *exit_fastpath)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6258,6 +6305,9 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
handle_external_interrupt_irqoff(vcpu);
else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)
handle_exception_nmi_irqoff(vmx);
+ else if (!is_guest_mode(vcpu) &&
+ vmx->exit_reason == EXIT_REASON_MSR_WRITE)
+ *exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu);
}
static bool vmx_has_emulated_msr(int index)
@@ -6489,8 +6539,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmcs_write32(PLE_WINDOW, vmx->ple_window);
}
- if (vmx->nested.need_vmcs12_to_shadow_sync)
- nested_sync_vmcs12_to_shadow(vcpu);
+ /*
+ * We did this in prepare_switch_to_guest, because it needs to
+ * be within srcu_read_lock.
+ */
+ WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync);
if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
@@ -6666,60 +6719,31 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
free_vpid(vmx->vpid);
nested_vmx_free_vcpu(vcpu);
free_loaded_vmcs(vmx->loaded_vmcs);
- kvm_vcpu_uninit(vcpu);
- kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu);
- kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu);
- kmem_cache_free(kvm_vcpu_cache, vmx);
}
-static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
+static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
{
- int err;
struct vcpu_vmx *vmx;
unsigned long *msr_bitmap;
- int i, cpu;
-
- BUILD_BUG_ON_MSG(offsetof(struct vcpu_vmx, vcpu) != 0,
- "struct kvm_vcpu must be at offset 0 for arch usercopy region");
+ int i, cpu, err;
- vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
- if (!vmx)
- return ERR_PTR(-ENOMEM);
-
- vmx->vcpu.arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache,
- GFP_KERNEL_ACCOUNT);
- if (!vmx->vcpu.arch.user_fpu) {
- printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n");
- err = -ENOMEM;
- goto free_partial_vcpu;
- }
+ BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
+ vmx = to_vmx(vcpu);
- vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
- GFP_KERNEL_ACCOUNT);
- if (!vmx->vcpu.arch.guest_fpu) {
- printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n");
- err = -ENOMEM;
- goto free_user_fpu;
- }
+ err = -ENOMEM;
vmx->vpid = allocate_vpid();
- err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
- if (err)
- goto free_vcpu;
-
- err = -ENOMEM;
-
/*
* If PML is turned on, failure on enabling PML just results in failure
* of creating the vcpu, therefore we can simplify PML logic (by
* avoiding dealing with cases, such as enabling PML partially on vcpus
- * for the guest, etc.
+ * for the guest), etc.
*/
if (enable_pml) {
vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!vmx->pml_pg)
- goto uninit_vcpu;
+ goto free_vpid;
}
BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) != NR_SHARED_MSRS);
@@ -6764,7 +6788,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
- if (kvm_cstate_in_guest(kvm)) {
+ if (kvm_cstate_in_guest(vcpu->kvm)) {
vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C1_RES, MSR_TYPE_R);
vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
@@ -6774,34 +6798,34 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
vmx->loaded_vmcs = &vmx->vmcs01;
cpu = get_cpu();
- vmx_vcpu_load(&vmx->vcpu, cpu);
- vmx->vcpu.cpu = cpu;
+ vmx_vcpu_load(vcpu, cpu);
+ vcpu->cpu = cpu;
init_vmcs(vmx);
- vmx_vcpu_put(&vmx->vcpu);
+ vmx_vcpu_put(vcpu);
put_cpu();
- if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
- err = alloc_apic_access_page(kvm);
+ if (cpu_need_virtualize_apic_accesses(vcpu)) {
+ err = alloc_apic_access_page(vcpu->kvm);
if (err)
goto free_vmcs;
}
if (enable_ept && !enable_unrestricted_guest) {
- err = init_rmode_identity_map(kvm);
+ err = init_rmode_identity_map(vcpu->kvm);
if (err)
goto free_vmcs;
}
if (nested)
nested_vmx_setup_ctls_msrs(&vmx->nested.msrs,
- vmx_capability.ept,
- kvm_vcpu_apicv_active(&vmx->vcpu));
+ vmx_capability.ept);
else
memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));
vmx->nested.posted_intr_nv = -1;
vmx->nested.current_vmptr = -1ull;
- vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
+ vcpu->arch.microcode_version = 0x100000000ULL;
+ vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;
/*
* Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
@@ -6812,22 +6836,15 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
vmx->ept_pointer = INVALID_PAGE;
- return &vmx->vcpu;
+ return 0;
free_vmcs:
free_loaded_vmcs(vmx->loaded_vmcs);
free_pml:
vmx_destroy_pml_buffer(vmx);
-uninit_vcpu:
- kvm_vcpu_uninit(&vmx->vcpu);
-free_vcpu:
+free_vpid:
free_vpid(vmx->vpid);
- kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu);
-free_user_fpu:
- kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu);
-free_partial_vcpu:
- kmem_cache_free(kvm_vcpu_cache, vmx);
- return ERR_PTR(err);
+ return err;
}
#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
@@ -6863,6 +6880,7 @@ static int vmx_vm_init(struct kvm *kvm)
break;
}
}
+ kvm_apicv_init(kvm, enable_apicv);
return 0;
}
@@ -6871,11 +6889,16 @@ static int __init vmx_check_processor_compat(void)
struct vmcs_config vmcs_conf;
struct vmx_capability vmx_cap;
+ if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
+ !this_cpu_has(X86_FEATURE_VMX)) {
+ pr_err("kvm: VMX is disabled on CPU %d\n", smp_processor_id());
+ return -EIO;
+ }
+
if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0)
return -EIO;
if (nested)
- nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept,
- enable_apicv);
+ nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept);
if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
smp_processor_id());
@@ -6973,28 +6996,28 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
} while (0)
entry = kvm_find_cpuid_entry(vcpu, 0x1, 0);
- cr4_fixed1_update(X86_CR4_VME, edx, bit(X86_FEATURE_VME));
- cr4_fixed1_update(X86_CR4_PVI, edx, bit(X86_FEATURE_VME));
- cr4_fixed1_update(X86_CR4_TSD, edx, bit(X86_FEATURE_TSC));
- cr4_fixed1_update(X86_CR4_DE, edx, bit(X86_FEATURE_DE));
- cr4_fixed1_update(X86_CR4_PSE, edx, bit(X86_FEATURE_PSE));
- cr4_fixed1_update(X86_CR4_PAE, edx, bit(X86_FEATURE_PAE));
- cr4_fixed1_update(X86_CR4_MCE, edx, bit(X86_FEATURE_MCE));
- cr4_fixed1_update(X86_CR4_PGE, edx, bit(X86_FEATURE_PGE));
- cr4_fixed1_update(X86_CR4_OSFXSR, edx, bit(X86_FEATURE_FXSR));
- cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM));
- cr4_fixed1_update(X86_CR4_VMXE, ecx, bit(X86_FEATURE_VMX));
- cr4_fixed1_update(X86_CR4_SMXE, ecx, bit(X86_FEATURE_SMX));
- cr4_fixed1_update(X86_CR4_PCIDE, ecx, bit(X86_FEATURE_PCID));
- cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, bit(X86_FEATURE_XSAVE));
+ cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME));
+ cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME));
+ cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC));
+ cr4_fixed1_update(X86_CR4_DE, edx, feature_bit(DE));
+ cr4_fixed1_update(X86_CR4_PSE, edx, feature_bit(PSE));
+ cr4_fixed1_update(X86_CR4_PAE, edx, feature_bit(PAE));
+ cr4_fixed1_update(X86_CR4_MCE, edx, feature_bit(MCE));
+ cr4_fixed1_update(X86_CR4_PGE, edx, feature_bit(PGE));
+ cr4_fixed1_update(X86_CR4_OSFXSR, edx, feature_bit(FXSR));
+ cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM));
+ cr4_fixed1_update(X86_CR4_VMXE, ecx, feature_bit(VMX));
+ cr4_fixed1_update(X86_CR4_SMXE, ecx, feature_bit(SMX));
+ cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID));
+ cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE));
entry = kvm_find_cpuid_entry(vcpu, 0x7, 0);
- cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, bit(X86_FEATURE_FSGSBASE));
- cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP));
- cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP));
- cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU));
- cr4_fixed1_update(X86_CR4_UMIP, ecx, bit(X86_FEATURE_UMIP));
- cr4_fixed1_update(X86_CR4_LA57, ecx, bit(X86_FEATURE_LA57));
+ cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE));
+ cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP));
+ cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP));
+ cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU));
+ cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP));
+ cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57));
#undef cr4_fixed1_update
}
@@ -7099,12 +7122,12 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
if (nested_vmx_allowed(vcpu))
to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
- FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX |
- FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+ FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
+ FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
else
to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
- ~(FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX |
- FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX);
+ ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
+ FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
if (nested_vmx_allowed(vcpu)) {
nested_vmx_cr_fixed1_bits_update(vcpu);
@@ -7128,7 +7151,7 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
{
if (func == 1 && nested)
- entry->ecx |= bit(X86_FEATURE_VMX);
+ entry->ecx |= feature_bit(VMX);
}
static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
@@ -7136,6 +7159,40 @@ static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
to_vmx(vcpu)->req_immediate_exit = true;
}
+static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ unsigned short port;
+ bool intercept;
+ int size;
+
+ if (info->intercept == x86_intercept_in ||
+ info->intercept == x86_intercept_ins) {
+ port = info->src_val;
+ size = info->dst_bytes;
+ } else {
+ port = info->dst_val;
+ size = info->src_bytes;
+ }
+
+ /*
+ * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
+ * VM-exits depend on the 'unconditional IO exiting' VM-execution
+ * control.
+ *
+ * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
+ */
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+ intercept = nested_cpu_has(vmcs12,
+ CPU_BASED_UNCOND_IO_EXITING);
+ else
+ intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
+
+ /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */
+ return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
+}
+
static int vmx_check_intercept(struct kvm_vcpu *vcpu,
struct x86_instruction_info *info,
enum x86_intercept_stage stage)
@@ -7143,19 +7200,45 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+ switch (info->intercept) {
/*
* RDPID causes #UD if disabled through secondary execution controls.
* Because it is marked as EmulateOnUD, we need to intercept it here.
*/
- if (info->intercept == x86_intercept_rdtscp &&
- !nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
- ctxt->exception.vector = UD_VECTOR;
- ctxt->exception.error_code_valid = false;
- return X86EMUL_PROPAGATE_FAULT;
- }
+ case x86_intercept_rdtscp:
+ if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
+ ctxt->exception.vector = UD_VECTOR;
+ ctxt->exception.error_code_valid = false;
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+ break;
+
+ case x86_intercept_in:
+ case x86_intercept_ins:
+ case x86_intercept_out:
+ case x86_intercept_outs:
+ return vmx_check_intercept_io(vcpu, info);
+
+ case x86_intercept_lgdt:
+ case x86_intercept_lidt:
+ case x86_intercept_lldt:
+ case x86_intercept_ltr:
+ case x86_intercept_sgdt:
+ case x86_intercept_sidt:
+ case x86_intercept_sldt:
+ case x86_intercept_str:
+ if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
+ return X86EMUL_CONTINUE;
+
+ /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */
+ break;
/* TODO: check more intercepts... */
- return X86EMUL_CONTINUE;
+ default:
+ break;
+ }
+
+ return X86EMUL_UNHANDLEABLE;
}
#ifdef CONFIG_X86_64
@@ -7523,10 +7606,10 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu)
{
if (vcpu->arch.mcg_cap & MCG_LMCE_P)
to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
- FEATURE_CONTROL_LMCE;
+ FEAT_CTL_LMCE_ENABLED;
else
to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
- ~FEATURE_CONTROL_LMCE;
+ ~FEAT_CTL_LMCE_ENABLED;
}
static int vmx_smi_allowed(struct kvm_vcpu *vcpu)
@@ -7737,7 +7820,7 @@ static __init int hardware_setup(void)
if (nested) {
nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
- vmx_capability.ept, enable_apicv);
+ vmx_capability.ept);
r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
if (r)
@@ -7758,6 +7841,14 @@ static __exit void hardware_unsetup(void)
free_kvm_area();
}
+static bool vmx_check_apicv_inhibit_reasons(ulong bit)
+{
+ ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
+ BIT(APICV_INHIBIT_REASON_HYPERV);
+
+ return supported & BIT(bit);
+}
+
static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
@@ -7813,7 +7904,8 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.run = vmx_vcpu_run,
.handle_exit = vmx_handle_exit,
- .skip_emulated_instruction = skip_emulated_instruction,
+ .skip_emulated_instruction = vmx_skip_emulated_instruction,
+ .update_emulated_instruction = vmx_update_emulated_instruction,
.set_interrupt_shadow = vmx_set_interrupt_shadow,
.get_interrupt_shadow = vmx_get_interrupt_shadow,
.patch_hypercall = vmx_patch_hypercall,
@@ -7830,10 +7922,10 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.update_cr8_intercept = update_cr8_intercept,
.set_virtual_apic_mode = vmx_set_virtual_apic_mode,
.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
- .get_enable_apicv = vmx_get_enable_apicv,
.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
.load_eoi_exitmap = vmx_load_eoi_exitmap,
.apicv_post_state_restore = vmx_apicv_post_state_restore,
+ .check_apicv_inhibit_reasons = vmx_check_apicv_inhibit_reasons,
.hwapic_irr_update = vmx_hwapic_irr_update,
.hwapic_isr_update = vmx_hwapic_isr_update,
.guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
@@ -7870,6 +7962,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.xsaves_supported = vmx_xsaves_supported,
.umip_emulated = vmx_umip_emulated,
.pt_supported = vmx_pt_supported,
+ .pku_supported = vmx_pku_supported,
.request_immediate_exit = vmx_request_immediate_exit,
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index a4f7f737c5d4..e64da06c7009 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -150,6 +150,9 @@ struct nested_vmx {
/* L2 must run next, and mustn't decide to exit to L1. */
bool nested_run_pending;
+ /* Pending MTF VM-exit into L1. */
+ bool mtf_pending;
+
struct loaded_vmcs vmcs02;
/*
@@ -289,7 +292,7 @@ struct vcpu_vmx {
/*
* Only bits masked by msr_ia32_feature_control_valid_bits can be set in
- * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
+ * msr_ia32_feature_control. FEAT_CTL_LOCKED is always included
* in msr_ia32_feature_control_valid_bits.
*/
u64 msr_ia32_feature_control;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cf917139de6b..3156e25b0774 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -26,6 +26,7 @@
#include "cpuid.h"
#include "pmu.h"
#include "hyperv.h"
+#include "lapic.h"
#include <linux/clocksource.h>
#include <linux/interrupt.h>
@@ -93,6 +94,8 @@ u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
#endif
+static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
+
#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__
#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__
@@ -435,6 +438,14 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
* for #DB exceptions under VMX.
*/
vcpu->arch.dr6 ^= payload & DR6_RTM;
+
+ /*
+ * The #DB payload is defined as compatible with the 'pending
+ * debug exceptions' field under VMX, not DR6. While bit 12 is
+ * defined in the 'pending debug exceptions' field (enabled
+ * breakpoint), it is reserved and must be zero in DR6.
+ */
+ vcpu->arch.dr6 &= ~BIT(12);
break;
case PF_VECTOR:
vcpu->arch.cr2 = payload;
@@ -487,19 +498,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
vcpu->arch.exception.error_code = error_code;
vcpu->arch.exception.has_payload = has_payload;
vcpu->arch.exception.payload = payload;
- /*
- * In guest mode, payload delivery should be deferred,
- * so that the L1 hypervisor can intercept #PF before
- * CR2 is modified (or intercept #DB before DR6 is
- * modified under nVMX). However, for ABI
- * compatibility with KVM_GET_VCPU_EVENTS and
- * KVM_SET_VCPU_EVENTS, we can't delay payload
- * delivery unless userspace has enabled this
- * functionality via the per-VM capability,
- * KVM_CAP_EXCEPTION_PAYLOAD.
- */
- if (!vcpu->kvm->arch.exception_payload_enabled ||
- !is_guest_mode(vcpu))
+ if (!is_guest_mode(vcpu))
kvm_deliver_exception_payload(vcpu);
return;
}
@@ -879,30 +878,46 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
}
EXPORT_SYMBOL_GPL(kvm_set_xcr);
-static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
-{
- if (cr4 & CR4_RESERVED_BITS)
- return -EINVAL;
-
- if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE))
- return -EINVAL;
+#define __cr4_reserved_bits(__cpu_has, __c) \
+({ \
+ u64 __reserved_bits = CR4_RESERVED_BITS; \
+ \
+ if (!__cpu_has(__c, X86_FEATURE_XSAVE)) \
+ __reserved_bits |= X86_CR4_OSXSAVE; \
+ if (!__cpu_has(__c, X86_FEATURE_SMEP)) \
+ __reserved_bits |= X86_CR4_SMEP; \
+ if (!__cpu_has(__c, X86_FEATURE_SMAP)) \
+ __reserved_bits |= X86_CR4_SMAP; \
+ if (!__cpu_has(__c, X86_FEATURE_FSGSBASE)) \
+ __reserved_bits |= X86_CR4_FSGSBASE; \
+ if (!__cpu_has(__c, X86_FEATURE_PKU)) \
+ __reserved_bits |= X86_CR4_PKE; \
+ if (!__cpu_has(__c, X86_FEATURE_LA57)) \
+ __reserved_bits |= X86_CR4_LA57; \
+ if (!__cpu_has(__c, X86_FEATURE_UMIP)) \
+ __reserved_bits |= X86_CR4_UMIP; \
+ __reserved_bits; \
+})
- if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP))
- return -EINVAL;
+static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c)
+{
+ u64 reserved_bits = __cr4_reserved_bits(cpu_has, c);
- if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP))
- return -EINVAL;
+ if (cpuid_ecx(0x7) & feature_bit(LA57))
+ reserved_bits &= ~X86_CR4_LA57;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE))
- return -EINVAL;
+ if (kvm_x86_ops->umip_emulated())
+ reserved_bits &= ~X86_CR4_UMIP;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE))
- return -EINVAL;
+ return reserved_bits;
+}
- if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57))
+static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ if (cr4 & cr4_reserved_bits)
return -EINVAL;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP))
+ if (cr4 & __cr4_reserved_bits(guest_cpuid_has, vcpu))
return -EINVAL;
return 0;
@@ -1047,9 +1062,11 @@ static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
+ size_t size = ARRAY_SIZE(vcpu->arch.db);
+
switch (dr) {
case 0 ... 3:
- vcpu->arch.db[dr] = val;
+ vcpu->arch.db[array_index_nospec(dr, size)] = val;
if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
vcpu->arch.eff_db[dr] = val;
break;
@@ -1064,7 +1081,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
case 5:
/* fall through */
default: /* 7 */
- if (val & 0xffffffff00000000ULL)
+ if (!kvm_dr7_valid(val))
return -1; /* #GP */
vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
kvm_update_dr7(vcpu);
@@ -1086,9 +1103,11 @@ EXPORT_SYMBOL_GPL(kvm_set_dr);
int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
{
+ size_t size = ARRAY_SIZE(vcpu->arch.db);
+
switch (dr) {
case 0 ... 3:
- *val = vcpu->arch.db[dr];
+ *val = vcpu->arch.db[array_index_nospec(dr, size)];
break;
case 4:
/* fall through */
@@ -1142,7 +1161,7 @@ static const u32 msrs_to_save_all[] = {
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
- MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
+ MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
MSR_IA32_SPEC_CTRL,
MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
@@ -1212,6 +1231,7 @@ static const u32 emulated_msrs_all[] = {
MSR_MISC_FEATURES_ENABLES,
MSR_AMD64_VIRT_SPEC_CTRL,
MSR_IA32_POWER_CTL,
+ MSR_IA32_UCODE_REV,
/*
* The following list leaves out MSRs whose values are determined
@@ -1526,6 +1546,49 @@ int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
/*
+ * The fast path for frequent and performance sensitive wrmsr emulation,
+ * i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces
+ * the latency of virtual IPI by avoiding the expensive bits of transitioning
+ * from guest to host, e.g. reacquiring KVM's SRCU lock. In contrast to the
+ * other cases which must be called after interrupts are enabled on the host.
+ */
+static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data)
+{
+ if (lapic_in_kernel(vcpu) && apic_x2apic_mode(vcpu->arch.apic) &&
+ ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
+ ((data & APIC_MODE_MASK) == APIC_DM_FIXED)) {
+
+ kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32));
+ return kvm_lapic_reg_write(vcpu->arch.apic, APIC_ICR, (u32)data);
+ }
+
+ return 1;
+}
+
+enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
+{
+ u32 msr = kvm_rcx_read(vcpu);
+ u64 data = kvm_read_edx_eax(vcpu);
+ int ret = 0;
+
+ switch (msr) {
+ case APIC_BASE_MSR + (APIC_ICR >> 4):
+ ret = handle_fastpath_set_x2apic_icr_irqoff(vcpu, data);
+ break;
+ default:
+ return EXIT_FASTPATH_NONE;
+ }
+
+ if (!ret) {
+ trace_kvm_msr_write(msr, data);
+ return EXIT_FASTPATH_SKIP_EMUL_INS;
+ }
+
+ return EXIT_FASTPATH_NONE;
+}
+EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff);
+
+/*
* Adapt set_msr() to msr_io()'s calling convention
*/
static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
@@ -1545,6 +1608,8 @@ struct pvclock_clock {
u64 mask;
u32 mult;
u32 shift;
+ u64 base_cycles;
+ u64 offset;
};
struct pvclock_gtod_data {
@@ -1553,11 +1618,8 @@ struct pvclock_gtod_data {
struct pvclock_clock clock; /* extract of a clocksource struct */
struct pvclock_clock raw_clock; /* extract of a clocksource struct */
- u64 boot_ns_raw;
- u64 boot_ns;
- u64 nsec_base;
+ ktime_t offs_boot;
u64 wall_time_sec;
- u64 monotonic_raw_nsec;
};
static struct pvclock_gtod_data pvclock_gtod_data;
@@ -1565,10 +1627,6 @@ static struct pvclock_gtod_data pvclock_gtod_data;
static void update_pvclock_gtod(struct timekeeper *tk)
{
struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
- u64 boot_ns, boot_ns_raw;
-
- boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
- boot_ns_raw = ktime_to_ns(ktime_add(tk->tkr_raw.base, tk->offs_boot));
write_seqcount_begin(&vdata->seq);
@@ -1578,23 +1636,35 @@ static void update_pvclock_gtod(struct timekeeper *tk)
vdata->clock.mask = tk->tkr_mono.mask;
vdata->clock.mult = tk->tkr_mono.mult;
vdata->clock.shift = tk->tkr_mono.shift;
+ vdata->clock.base_cycles = tk->tkr_mono.xtime_nsec;
+ vdata->clock.offset = tk->tkr_mono.base;
vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->archdata.vclock_mode;
vdata->raw_clock.cycle_last = tk->tkr_raw.cycle_last;
vdata->raw_clock.mask = tk->tkr_raw.mask;
vdata->raw_clock.mult = tk->tkr_raw.mult;
vdata->raw_clock.shift = tk->tkr_raw.shift;
-
- vdata->boot_ns = boot_ns;
- vdata->nsec_base = tk->tkr_mono.xtime_nsec;
+ vdata->raw_clock.base_cycles = tk->tkr_raw.xtime_nsec;
+ vdata->raw_clock.offset = tk->tkr_raw.base;
vdata->wall_time_sec = tk->xtime_sec;
- vdata->boot_ns_raw = boot_ns_raw;
- vdata->monotonic_raw_nsec = tk->tkr_raw.xtime_nsec;
+ vdata->offs_boot = tk->offs_boot;
write_seqcount_end(&vdata->seq);
}
+
+static s64 get_kvmclock_base_ns(void)
+{
+ /* Count up from boot time, but with the frequency of the raw clock. */
+ return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot));
+}
+#else
+static s64 get_kvmclock_base_ns(void)
+{
+ /* Master clock not used, so we can just use CLOCK_BOOTTIME. */
+ return ktime_get_boottime_ns();
+}
#endif
void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
@@ -1608,7 +1678,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
int version;
int r;
struct pvclock_wall_clock wc;
- struct timespec64 boot;
+ u64 wall_nsec;
if (!wall_clock)
return;
@@ -1628,17 +1698,12 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
/*
* The guest calculates current wall clock time by adding
* system time (updated by kvm_guest_time_update below) to the
- * wall clock specified here. guest system time equals host
- * system time for us, thus we must fill in host boot time here.
+ * wall clock specified here. We do the reverse here.
*/
- getboottime64(&boot);
+ wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
- if (kvm->arch.kvmclock_offset) {
- struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
- boot = timespec64_sub(boot, ts);
- }
- wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
- wc.nsec = boot.tv_nsec;
+ wc.nsec = do_div(wall_nsec, 1000000000);
+ wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
wc.version = version;
kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
@@ -1886,7 +1951,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
offset = kvm_compute_tsc_offset(vcpu, data);
- ns = ktime_get_boottime_ns();
+ ns = get_kvmclock_base_ns();
elapsed = ns - kvm->arch.last_tsc_nsec;
if (vcpu->arch.virtual_tsc_khz) {
@@ -2061,10 +2126,10 @@ static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
do {
seq = read_seqcount_begin(&gtod->seq);
- ns = gtod->monotonic_raw_nsec;
+ ns = gtod->raw_clock.base_cycles;
ns += vgettsc(&gtod->raw_clock, tsc_timestamp, &mode);
- ns >>= gtod->clock.shift;
- ns += gtod->boot_ns_raw;
+ ns >>= gtod->raw_clock.shift;
+ ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot));
} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
*t = ns;
@@ -2081,7 +2146,7 @@ static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
do {
seq = read_seqcount_begin(&gtod->seq);
ts->tv_sec = gtod->wall_time_sec;
- ns = gtod->nsec_base;
+ ns = gtod->clock.base_cycles;
ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
ns >>= gtod->clock.shift;
} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
@@ -2224,7 +2289,7 @@ u64 get_kvmclock_ns(struct kvm *kvm)
spin_lock(&ka->pvclock_gtod_sync_lock);
if (!ka->use_master_clock) {
spin_unlock(&ka->pvclock_gtod_sync_lock);
- return ktime_get_boottime_ns() + ka->kvmclock_offset;
+ return get_kvmclock_base_ns() + ka->kvmclock_offset;
}
hv_clock.tsc_timestamp = ka->master_cycle_now;
@@ -2240,7 +2305,7 @@ u64 get_kvmclock_ns(struct kvm *kvm)
&hv_clock.tsc_to_system_mul);
ret = __pvclock_read_cycles(&hv_clock, rdtsc());
} else
- ret = ktime_get_boottime_ns() + ka->kvmclock_offset;
+ ret = get_kvmclock_base_ns() + ka->kvmclock_offset;
put_cpu();
@@ -2339,7 +2404,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
}
if (!use_master_clock) {
host_tsc = rdtsc();
- kernel_ns = ktime_get_boottime_ns();
+ kernel_ns = get_kvmclock_base_ns();
}
tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
@@ -2379,6 +2444,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
vcpu->last_guest_tsc = tsc_timestamp;
+ WARN_ON((s64)vcpu->hv_clock.system_time < 0);
/* If the host uses TSC clocksource, then it is stable */
pvclock_flags = 0;
@@ -2485,7 +2551,10 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
default:
if (msr >= MSR_IA32_MC0_CTL &&
msr < MSR_IA32_MCx_CTL(bank_num)) {
- u32 offset = msr - MSR_IA32_MC0_CTL;
+ u32 offset = array_index_nospec(
+ msr - MSR_IA32_MC0_CTL,
+ MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
+
/* only 0 or all 1s can be written to IA32_MCi_CTL
* some Linux kernels though clear bit 10 in bank 4 to
* workaround a BIOS/GART TBL issue on AMD K8s, ignore
@@ -2581,45 +2650,47 @@ static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
static void record_steal_time(struct kvm_vcpu *vcpu)
{
+ struct kvm_host_map map;
+ struct kvm_steal_time *st;
+
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
- if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
- &vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
+ /* -EAGAIN is returned in atomic context so we can just return. */
+ if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT,
+ &map, &vcpu->arch.st.cache, false))
return;
+ st = map.hva +
+ offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+
/*
* Doing a TLB flush here, on the guest's behalf, can avoid
* expensive IPIs.
*/
trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
- vcpu->arch.st.steal.preempted & KVM_VCPU_FLUSH_TLB);
- if (xchg(&vcpu->arch.st.steal.preempted, 0) & KVM_VCPU_FLUSH_TLB)
+ st->preempted & KVM_VCPU_FLUSH_TLB);
+ if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
kvm_vcpu_flush_tlb(vcpu, false);
- if (vcpu->arch.st.steal.version & 1)
- vcpu->arch.st.steal.version += 1; /* first time write, random junk */
+ vcpu->arch.st.preempted = 0;
- vcpu->arch.st.steal.version += 1;
+ if (st->version & 1)
+ st->version += 1; /* first time write, random junk */
- kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
- &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
+ st->version += 1;
smp_wmb();
- vcpu->arch.st.steal.steal += current->sched_info.run_delay -
+ st->steal += current->sched_info.run_delay -
vcpu->arch.st.last_steal;
vcpu->arch.st.last_steal = current->sched_info.run_delay;
- kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
- &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
-
smp_wmb();
- vcpu->arch.st.steal.version += 1;
+ st->version += 1;
- kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
- &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
+ kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false);
}
int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
@@ -2786,11 +2857,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (data & KVM_STEAL_RESERVED_MASK)
return 1;
- if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
- data & KVM_STEAL_VALID_BITS,
- sizeof(struct kvm_steal_time)))
- return 1;
-
vcpu->arch.st.msr_val = data;
if (!(data & KVM_MSR_ENABLED))
@@ -2926,7 +2992,10 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
default:
if (msr >= MSR_IA32_MC0_CTL &&
msr < MSR_IA32_MCx_CTL(bank_num)) {
- u32 offset = msr - MSR_IA32_MC0_CTL;
+ u32 offset = array_index_nospec(
+ msr - MSR_IA32_MC0_CTL,
+ MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
+
data = vcpu->arch.mce_banks[offset];
break;
}
@@ -3458,10 +3527,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
kvm_x86_ops->vcpu_load(vcpu, cpu);
- fpregs_assert_state_consistent();
- if (test_thread_flag(TIF_NEED_FPU_LOAD))
- switch_fpu_return();
-
/* Apply any externally detected TSC adjustments (due to suspend) */
if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
@@ -3501,15 +3566,25 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
{
+ struct kvm_host_map map;
+ struct kvm_steal_time *st;
+
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
- vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED;
+ if (vcpu->arch.st.preempted)
+ return;
+
+ if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
+ &vcpu->arch.st.cache, true))
+ return;
+
+ st = map.hva +
+ offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+
+ st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
- kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime,
- &vcpu->arch.st.steal.preempted,
- offsetof(struct kvm_steal_time, preempted),
- sizeof(vcpu->arch.st.steal.preempted));
+ kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -3717,6 +3792,21 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
process_nmi(vcpu);
/*
+ * In guest mode, payload delivery should be deferred,
+ * so that the L1 hypervisor can intercept #PF before
+ * CR2 is modified (or intercept #DB before DR6 is
+ * modified under nVMX). Unless the per-VM capability,
+ * KVM_CAP_EXCEPTION_PAYLOAD, is set, we may not defer the delivery of
+ * an exception payload and handle after a KVM_GET_VCPU_EVENTS. Since we
+ * opportunistically defer the exception payload, deliver it if the
+ * capability hasn't been requested before processing a
+ * KVM_GET_VCPU_EVENTS.
+ */
+ if (!vcpu->kvm->arch.exception_payload_enabled &&
+ vcpu->arch.exception.pending && vcpu->arch.exception.has_payload)
+ kvm_deliver_exception_payload(vcpu);
+
+ /*
* The API doesn't provide the instruction length for software
* exceptions, so don't report them. As long as the guest RIP
* isn't advanced, we should expect to encounter the exception
@@ -4660,9 +4750,6 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
{
struct kvm_pit *pit = kvm->arch.vpit;
- if (!pit)
- return -ENXIO;
-
/* pit->pit_state.lock was overloaded to prevent userspace from getting
* an inconsistent state after running multiple KVM_REINJECT_CONTROL
* ioctls in parallel. Use a separate lock if that ioctl isn't rare.
@@ -5029,6 +5116,9 @@ set_identity_unlock:
r = -EFAULT;
if (copy_from_user(&control, argp, sizeof(control)))
goto out;
+ r = -ENXIO;
+ if (!kvm->arch.vpit)
+ goto out;
r = kvm_vm_ioctl_reinject(kvm, &control);
break;
}
@@ -6186,6 +6276,21 @@ static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit);
}
+static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt)
+{
+ return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_LM);
+}
+
+static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt)
+{
+ return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE);
+}
+
+static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt)
+{
+ return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR);
+}
+
static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
{
return kvm_register_read(emul_to_vcpu(ctxt), reg);
@@ -6263,6 +6368,9 @@ static const struct x86_emulate_ops emulate_ops = {
.fix_hypercall = emulator_fix_hypercall,
.intercept = emulator_intercept,
.get_cpuid = emulator_get_cpuid,
+ .guest_has_long_mode = emulator_guest_has_long_mode,
+ .guest_has_movbe = emulator_guest_has_movbe,
+ .guest_has_fxsr = emulator_guest_has_fxsr,
.set_nmi_mask = emulator_set_nmi_mask,
.get_hflags = emulator_get_hflags,
.set_hflags = emulator_set_hflags,
@@ -6379,11 +6487,11 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
return 1;
}
-static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
+static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
bool write_fault_to_shadow_pgtable,
int emulation_type)
{
- gpa_t gpa = cr2;
+ gpa_t gpa = cr2_or_gpa;
kvm_pfn_t pfn;
if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
@@ -6397,7 +6505,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
* Write permission should be allowed since only
* write access need to be emulated.
*/
- gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
/*
* If the mapping is invalid in guest, let cpu retry
@@ -6454,10 +6562,10 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
}
static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
- unsigned long cr2, int emulation_type)
+ gpa_t cr2_or_gpa, int emulation_type)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- unsigned long last_retry_eip, last_retry_addr, gpa = cr2;
+ unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa;
last_retry_eip = vcpu->arch.last_retry_eip;
last_retry_addr = vcpu->arch.last_retry_addr;
@@ -6486,14 +6594,14 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
if (x86_page_table_writing_insn(ctxt))
return false;
- if (ctxt->eip == last_retry_eip && last_retry_addr == cr2)
+ if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa)
return false;
vcpu->arch.last_retry_eip = ctxt->eip;
- vcpu->arch.last_retry_addr = cr2;
+ vcpu->arch.last_retry_addr = cr2_or_gpa;
if (!vcpu->arch.mmu->direct_map)
- gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
@@ -6639,11 +6747,8 @@ static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
return false;
}
-int x86_emulate_instruction(struct kvm_vcpu *vcpu,
- unsigned long cr2,
- int emulation_type,
- void *insn,
- int insn_len)
+int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ int emulation_type, void *insn, int insn_len)
{
int r;
struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
@@ -6689,8 +6794,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
- if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
- emulation_type))
+ if (reexecute_instruction(vcpu, cr2_or_gpa,
+ write_fault_to_spt,
+ emulation_type))
return 1;
if (ctxt->have_exception) {
/*
@@ -6724,7 +6830,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
return 1;
}
- if (retry_instruction(ctxt, cr2, emulation_type))
+ if (retry_instruction(ctxt, cr2_or_gpa, emulation_type))
return 1;
/* this is needed for vmware backdoor interface to work since it
@@ -6736,7 +6842,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
restart:
/* Save the faulting GPA (cr2) in the address field */
- ctxt->exception.address = cr2;
+ ctxt->exception.address = cr2_or_gpa;
r = x86_emulate_insn(ctxt);
@@ -6744,7 +6850,7 @@ restart:
return 1;
if (r == EMULATION_FAILED) {
- if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
+ if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt,
emulation_type))
return 1;
@@ -6785,6 +6891,8 @@ restart:
kvm_rip_write(vcpu, ctxt->eip);
if (r && ctxt->tf)
r = kvm_vcpu_do_singlestep(vcpu);
+ if (kvm_x86_ops->update_emulated_instruction)
+ kvm_x86_ops->update_emulated_instruction(vcpu);
__kvm_set_rflags(vcpu, ctxt->eflags);
}
@@ -7082,14 +7190,16 @@ static void kvm_timer_init(void)
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
#ifdef CONFIG_CPU_FREQ
- struct cpufreq_policy policy;
+ struct cpufreq_policy *policy;
int cpu;
- memset(&policy, 0, sizeof(policy));
cpu = get_cpu();
- cpufreq_get_policy(&policy, cpu);
- if (policy.cpuinfo.max_freq)
- max_tsc_khz = policy.cpuinfo.max_freq;
+ policy = cpufreq_cpu_get(cpu);
+ if (policy) {
+ if (policy->cpuinfo.max_freq)
+ max_tsc_khz = policy->cpuinfo.max_freq;
+ cpufreq_cpu_put(policy);
+ }
put_cpu();
#endif
cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
@@ -7200,12 +7310,12 @@ int kvm_arch_init(void *opaque)
}
if (!ops->cpu_has_kvm_support()) {
- printk(KERN_ERR "kvm: no hardware support\n");
+ pr_err_ratelimited("kvm: no hardware support\n");
r = -EOPNOTSUPP;
goto out;
}
if (ops->disabled_by_bios()) {
- printk(KERN_ERR "kvm: disabled by bios\n");
+ pr_err_ratelimited("kvm: disabled by bios\n");
r = -EOPNOTSUPP;
goto out;
}
@@ -7357,8 +7467,8 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
{
struct kvm_lapic_irq lapic_irq;
- lapic_irq.shorthand = 0;
- lapic_irq.dest_mode = 0;
+ lapic_irq.shorthand = APIC_DEST_NOSHORT;
+ lapic_irq.dest_mode = APIC_DEST_PHYSICAL;
lapic_irq.level = 0;
lapic_irq.dest_id = apicid;
lapic_irq.msi_redir_hint = false;
@@ -7367,18 +7477,22 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
}
-void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
+bool kvm_apicv_activated(struct kvm *kvm)
{
- if (!lapic_in_kernel(vcpu)) {
- WARN_ON_ONCE(vcpu->arch.apicv_active);
- return;
- }
- if (!vcpu->arch.apicv_active)
- return;
+ return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0);
+}
+EXPORT_SYMBOL_GPL(kvm_apicv_activated);
- vcpu->arch.apicv_active = false;
- kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
+void kvm_apicv_init(struct kvm *kvm, bool enable)
+{
+ if (enable)
+ clear_bit(APICV_INHIBIT_REASON_DISABLE,
+ &kvm->arch.apicv_inhibit_reasons);
+ else
+ set_bit(APICV_INHIBIT_REASON_DISABLE,
+ &kvm->arch.apicv_inhibit_reasons);
}
+EXPORT_SYMBOL_GPL(kvm_apicv_init);
static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id)
{
@@ -7907,6 +8021,47 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm)
kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
}
+void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
+{
+ if (!lapic_in_kernel(vcpu))
+ return;
+
+ vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm);
+ kvm_apic_update_apicv(vcpu);
+ kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
+
+/*
+ * NOTE: Do not hold any lock prior to calling this.
+ *
+ * In particular, kvm_request_apicv_update() expects kvm->srcu not to be
+ * locked, because it calls __x86_set_memory_region() which does
+ * synchronize_srcu(&kvm->srcu).
+ */
+void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
+{
+ if (!kvm_x86_ops->check_apicv_inhibit_reasons ||
+ !kvm_x86_ops->check_apicv_inhibit_reasons(bit))
+ return;
+
+ if (activate) {
+ if (!test_and_clear_bit(bit, &kvm->arch.apicv_inhibit_reasons) ||
+ !kvm_apicv_activated(kvm))
+ return;
+ } else {
+ if (test_and_set_bit(bit, &kvm->arch.apicv_inhibit_reasons) ||
+ kvm_apicv_activated(kvm))
+ return;
+ }
+
+ trace_kvm_apicv_update_request(activate, bit);
+ if (kvm_x86_ops->pre_update_apicv_exec_ctrl)
+ kvm_x86_ops->pre_update_apicv_exec_ctrl(kvm, activate);
+ kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE);
+}
+EXPORT_SYMBOL_GPL(kvm_request_apicv_update);
+
static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
{
if (!kvm_apic_present(vcpu))
@@ -7997,6 +8152,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
bool req_int_win =
dm_request_for_irq_injection(vcpu) &&
kvm_cpu_accept_dm_intr(vcpu);
+ enum exit_fastpath_completion exit_fastpath = EXIT_FASTPATH_NONE;
bool req_immediate_exit = false;
@@ -8096,6 +8252,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
*/
if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu))
kvm_hv_process_stimers(vcpu);
+ if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu))
+ kvm_vcpu_update_apicv(vcpu);
}
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
@@ -8198,8 +8356,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
trace_kvm_entry(vcpu->vcpu_id);
guest_enter_irqoff();
- /* The preempt notifier should have taken care of the FPU already. */
- WARN_ON_ONCE(test_thread_flag(TIF_NEED_FPU_LOAD));
+ fpregs_assert_state_consistent();
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ switch_fpu_return();
if (unlikely(vcpu->arch.switch_db_regs)) {
set_debugreg(0, 7);
@@ -8243,7 +8402,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
- kvm_x86_ops->handle_exit_irqoff(vcpu);
+ kvm_x86_ops->handle_exit_irqoff(vcpu, &exit_fastpath);
/*
* Consume any pending interrupts, including the possible source of
@@ -8287,7 +8446,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_lapic_sync_from_vapic(vcpu);
vcpu->arch.gpa_available = false;
- r = kvm_x86_ops->handle_exit(vcpu);
+ r = kvm_x86_ops->handle_exit(vcpu, exit_fastpath);
return r;
cancel_injection:
@@ -8471,12 +8630,26 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
return 0;
}
+static void kvm_save_current_fpu(struct fpu *fpu)
+{
+ /*
+ * If the target FPU state is not resident in the CPU registers, just
+ * memcpy() from current, else save CPU state directly to the target.
+ */
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ memcpy(&fpu->state, &current->thread.fpu.state,
+ fpu_kernel_xstate_size);
+ else
+ copy_fpregs_to_fpstate(fpu);
+}
+
/* Swap (qemu) user FPU context for the guest FPU context. */
static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
{
fpregs_lock();
- copy_fpregs_to_fpstate(vcpu->arch.user_fpu);
+ kvm_save_current_fpu(vcpu->arch.user_fpu);
+
/* PKRU is separately restored in kvm_x86_ops->run. */
__copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
~XFEATURE_MASK_PKRU);
@@ -8492,7 +8665,8 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
fpregs_lock();
- copy_fpregs_to_fpstate(vcpu->arch.guest_fpu);
+ kvm_save_current_fpu(vcpu->arch.guest_fpu);
+
copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state);
fpregs_mark_activate();
@@ -8714,6 +8888,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
{
vcpu_load(vcpu);
+ if (kvm_mpx_supported())
+ kvm_load_guest_fpu(vcpu);
kvm_apic_accept_events(vcpu);
if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED &&
@@ -8722,6 +8898,8 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
else
mp_state->mp_state = vcpu->arch.mp_state;
+ if (kvm_mpx_supported())
+ kvm_put_guest_fpu(vcpu);
vcpu_put(vcpu);
return 0;
}
@@ -8779,7 +8957,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
kvm_rip_write(vcpu, ctxt->eip);
kvm_set_rflags(vcpu, ctxt->eflags);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
return 1;
}
EXPORT_SYMBOL_GPL(kvm_task_switch);
@@ -9082,33 +9259,91 @@ static void fx_init(struct kvm_vcpu *vcpu)
vcpu->arch.cr0 |= X86_CR0_ET;
}
-void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
+int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
{
- void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask;
-
- kvmclock_reset(vcpu);
+ if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
+ pr_warn_once("kvm: SMP vm created on host with unstable TSC; "
+ "guest TSC will not be reliable\n");
- kvm_x86_ops->vcpu_free(vcpu);
- free_cpumask_var(wbinvd_dirty_mask);
+ return 0;
}
-struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
- unsigned int id)
+int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *vcpu;
+ struct page *page;
+ int r;
- if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
- printk_once(KERN_WARNING
- "kvm: SMP vm created on host with unstable TSC; "
- "guest TSC will not be reliable\n");
+ vcpu->arch.emulate_ctxt.ops = &emulate_ops;
+ if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ else
+ vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
- vcpu = kvm_x86_ops->vcpu_create(kvm, id);
+ kvm_set_tsc_khz(vcpu, max_tsc_khz);
- return vcpu;
-}
+ r = kvm_mmu_create(vcpu);
+ if (r < 0)
+ return r;
+
+ if (irqchip_in_kernel(vcpu->kvm)) {
+ r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
+ if (r < 0)
+ goto fail_mmu_destroy;
+ if (kvm_apicv_activated(vcpu->kvm))
+ vcpu->arch.apicv_active = true;
+ } else
+ static_key_slow_inc(&kvm_no_apic_vcpu);
+
+ r = -ENOMEM;
+
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!page)
+ goto fail_free_lapic;
+ vcpu->arch.pio_data = page_address(page);
+
+ vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
+ GFP_KERNEL_ACCOUNT);
+ if (!vcpu->arch.mce_banks)
+ goto fail_free_pio_data;
+ vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
+
+ if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
+ GFP_KERNEL_ACCOUNT))
+ goto fail_free_mce_banks;
+
+ vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache,
+ GFP_KERNEL_ACCOUNT);
+ if (!vcpu->arch.user_fpu) {
+ pr_err("kvm: failed to allocate userspace's fpu\n");
+ goto free_wbinvd_dirty_mask;
+ }
+
+ vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
+ GFP_KERNEL_ACCOUNT);
+ if (!vcpu->arch.guest_fpu) {
+ pr_err("kvm: failed to allocate vcpu's fpu\n");
+ goto free_user_fpu;
+ }
+ fx_init(vcpu);
+
+ vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
+
+ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
+
+ vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
+
+ kvm_async_pf_hash_reset(vcpu);
+ kvm_pmu_init(vcpu);
+
+ vcpu->arch.pending_external_vector = -1;
+ vcpu->arch.preempted_in_kernel = false;
+
+ kvm_hv_vcpu_init(vcpu);
+
+ r = kvm_x86_ops->vcpu_create(vcpu);
+ if (r)
+ goto free_guest_fpu;
-int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
-{
vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
kvm_vcpu_mtrr_init(vcpu);
@@ -9117,6 +9352,22 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
kvm_init_mmu(vcpu, false);
vcpu_put(vcpu);
return 0;
+
+free_guest_fpu:
+ kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
+free_user_fpu:
+ kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
+free_wbinvd_dirty_mask:
+ free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
+fail_free_mce_banks:
+ kfree(vcpu->arch.mce_banks);
+fail_free_pio_data:
+ free_page((unsigned long)vcpu->arch.pio_data);
+fail_free_lapic:
+ kvm_free_lapic(vcpu);
+fail_mmu_destroy:
+ kvm_mmu_destroy(vcpu);
+ return r;
}
void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
@@ -9149,13 +9400,29 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
- vcpu->arch.apf.msr_val = 0;
+ struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache;
+ int idx;
- vcpu_load(vcpu);
- kvm_mmu_unload(vcpu);
- vcpu_put(vcpu);
+ kvm_release_pfn(cache->pfn, cache->dirty, cache);
+
+ kvmclock_reset(vcpu);
kvm_x86_ops->vcpu_free(vcpu);
+
+ free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
+ kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
+ kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
+
+ kvm_hv_vcpu_uninit(vcpu);
+ kvm_pmu_destroy(vcpu);
+ kfree(vcpu->arch.mce_banks);
+ kvm_free_lapic(vcpu);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_mmu_destroy(vcpu);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ free_page((unsigned long)vcpu->arch.pio_data);
+ if (!lapic_in_kernel(vcpu))
+ static_key_slow_dec(&kvm_no_apic_vcpu);
}
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -9171,7 +9438,6 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vcpu->arch.nmi_injected = false;
kvm_clear_interrupt_queue(vcpu);
kvm_clear_exception_queue(vcpu);
- vcpu->arch.exception.pending = false;
memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
kvm_update_dr0123(vcpu);
@@ -9347,6 +9613,8 @@ int kvm_arch_hardware_setup(void)
if (r != 0)
return r;
+ cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data);
+
if (kvm_has_tsc_control) {
/*
* Make sure the user can only configure tsc_khz values that
@@ -9375,6 +9643,13 @@ void kvm_arch_hardware_unsetup(void)
int kvm_arch_check_processor_compat(void)
{
+ struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
+
+ WARN_ON(!irqs_disabled());
+
+ if (kvm_host_cr4_reserved_bits(c) != cr4_reserved_bits)
+ return -EIO;
+
return kvm_x86_ops->check_processor_compatibility();
}
@@ -9392,98 +9667,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
struct static_key kvm_no_apic_vcpu __read_mostly;
EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
-int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
-{
- struct page *page;
- int r;
-
- vcpu->arch.emulate_ctxt.ops = &emulate_ops;
- if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
- else
- vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
-
- page = alloc_page(GFP_KERNEL | __GFP_ZERO);
- if (!page) {
- r = -ENOMEM;
- goto fail;
- }
- vcpu->arch.pio_data = page_address(page);
-
- kvm_set_tsc_khz(vcpu, max_tsc_khz);
-
- r = kvm_mmu_create(vcpu);
- if (r < 0)
- goto fail_free_pio_data;
-
- if (irqchip_in_kernel(vcpu->kvm)) {
- vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu->kvm);
- r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
- if (r < 0)
- goto fail_mmu_destroy;
- } else
- static_key_slow_inc(&kvm_no_apic_vcpu);
-
- vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
- GFP_KERNEL_ACCOUNT);
- if (!vcpu->arch.mce_banks) {
- r = -ENOMEM;
- goto fail_free_lapic;
- }
- vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
-
- if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
- GFP_KERNEL_ACCOUNT)) {
- r = -ENOMEM;
- goto fail_free_mce_banks;
- }
-
- fx_init(vcpu);
-
- vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
-
- vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
-
- vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
-
- kvm_async_pf_hash_reset(vcpu);
- kvm_pmu_init(vcpu);
-
- vcpu->arch.pending_external_vector = -1;
- vcpu->arch.preempted_in_kernel = false;
-
- kvm_hv_vcpu_init(vcpu);
-
- return 0;
-
-fail_free_mce_banks:
- kfree(vcpu->arch.mce_banks);
-fail_free_lapic:
- kvm_free_lapic(vcpu);
-fail_mmu_destroy:
- kvm_mmu_destroy(vcpu);
-fail_free_pio_data:
- free_page((unsigned long)vcpu->arch.pio_data);
-fail:
- return r;
-}
-
-void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
-{
- int idx;
-
- kvm_hv_vcpu_uninit(vcpu);
- kvm_pmu_destroy(vcpu);
- kfree(vcpu->arch.mce_banks);
- kvm_free_lapic(vcpu);
- idx = srcu_read_lock(&vcpu->kvm->srcu);
- kvm_mmu_destroy(vcpu);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
- free_page((unsigned long)vcpu->arch.pio_data);
- if (!lapic_in_kernel(vcpu))
- static_key_slow_dec(&kvm_no_apic_vcpu);
-}
-
void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -9518,7 +9701,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
mutex_init(&kvm->arch.apic_map_lock);
spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
- kvm->arch.kvmclock_offset = -ktime_get_boottime_ns();
+ kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
pvclock_update_vm_gtod_copy(kvm);
kvm->arch.guest_can_read_msr_platform_info = true;
@@ -9558,7 +9741,7 @@ static void kvm_free_vcpus(struct kvm *kvm)
kvm_unload_vcpu_mmu(vcpu);
}
kvm_for_each_vcpu(i, vcpu, kvm)
- kvm_arch_vcpu_free(vcpu);
+ kvm_vcpu_destroy(vcpu);
mutex_lock(&kvm->lock);
for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
@@ -9627,18 +9810,6 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
}
EXPORT_SYMBOL_GPL(__x86_set_memory_region);
-int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
-{
- int r;
-
- mutex_lock(&kvm->slots_lock);
- r = __x86_set_memory_region(kvm, id, gpa, size);
- mutex_unlock(&kvm->slots_lock);
-
- return r;
-}
-EXPORT_SYMBOL_GPL(x86_set_memory_region);
-
void kvm_arch_pre_destroy_vm(struct kvm *kvm)
{
kvm_mmu_pre_destroy_vm(kvm);
@@ -9652,9 +9823,13 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
* unless the the memory map has changed due to process exit
* or fd copying.
*/
- x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0);
- x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 0, 0);
- x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
+ mutex_lock(&kvm->slots_lock);
+ __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
+ 0, 0);
+ __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
+ 0, 0);
+ __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
+ mutex_unlock(&kvm->slots_lock);
}
if (kvm_x86_ops->vm_destroy)
kvm_x86_ops->vm_destroy(kvm);
@@ -9758,11 +9933,18 @@ out_free:
void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
+ struct kvm_vcpu *vcpu;
+ int i;
+
/*
* memslots->generation has been incremented.
* mmio generation may have reached its maximum value.
*/
kvm_mmu_invalidate_mmio_sptes(kvm, gen);
+
+ /* Force re-initialization of steal_time cache */
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_vcpu_kick(vcpu);
}
int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -9792,7 +9974,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
*
* The reason is, in case of PML, we need to set D-bit for any slots
* with dirty logging disabled in order to eliminate unnecessary GPA
- * logging in PML buffer (and potential PML buffer full VMEXT). This
+ * logging in PML buffer (and potential PML buffer full VMEXIT). This
* guarantees leaving PML enabled during guest's lifetime won't have
* any additional overhead from PML when guest is running with dirty
* logging disabled for memory slots.
@@ -10014,7 +10196,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
work->arch.cr3 != vcpu->arch.mmu->get_cr3(vcpu))
return;
- vcpu->arch.mmu->page_fault(vcpu, work->gva, 0, true);
+ kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
}
static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
@@ -10127,7 +10309,7 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
{
struct x86_exception fault;
- trace_kvm_async_pf_not_present(work->arch.token, work->gva);
+ trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa);
kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
if (kvm_can_deliver_async_pf(vcpu) &&
@@ -10162,7 +10344,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
work->arch.token = ~0; /* broadcast wakeup */
else
kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
- trace_kvm_async_pf_ready(work->arch.token, work->gva);
+ trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa);
if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED &&
!apf_get_user(vcpu, &val)) {
@@ -10284,7 +10466,6 @@ bool kvm_vector_hashing_enabled(void)
{
return vector_hashing;
}
-EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled);
bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
{
@@ -10292,6 +10473,28 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
+u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu)
+{
+ uint64_t bits = SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD;
+
+ /* The STIBP bit doesn't fault even if it's not advertised */
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS))
+ bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP);
+ if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL) &&
+ !boot_cpu_has(X86_FEATURE_AMD_IBRS))
+ bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP);
+
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL_SSBD) &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
+ bits &= ~SPEC_CTRL_SSBD;
+ if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) &&
+ !boot_cpu_has(X86_FEATURE_AMD_SSBD))
+ bits &= ~SPEC_CTRL_SSBD;
+
+ return bits;
+}
+EXPORT_SYMBOL_GPL(kvm_spec_ctrl_valid_bits);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
@@ -10313,3 +10516,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 29391af8871d..3624665acee4 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -144,11 +144,6 @@ static inline bool is_pae_paging(struct kvm_vcpu *vcpu)
return !is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu);
}
-static inline u32 bit(int bitno)
-{
- return 1 << (bitno & 31);
-}
-
static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu)
{
return kvm_read_cr4_bits(vcpu, X86_CR4_LA57) ? 57 : 48;
@@ -166,21 +161,13 @@ static inline u64 get_canonical(u64 la, u8 vaddr_bits)
static inline bool is_noncanonical_address(u64 la, struct kvm_vcpu *vcpu)
{
-#ifdef CONFIG_X86_64
return get_canonical(la, vcpu_virt_addr_bits(vcpu)) != la;
-#else
- return false;
-#endif
}
static inline bool emul_is_noncanonical_address(u64 la,
struct x86_emulate_ctxt *ctxt)
{
-#ifdef CONFIG_X86_64
return get_canonical(la, ctxt_virt_addr_bits(ctxt)) != la;
-#else
- return false;
-#endif
}
static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
@@ -289,8 +276,9 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
int page_num);
bool kvm_vector_hashing_enabled(void);
-int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
+int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
int emulation_type, void *insn, int insn_len);
+enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu);
#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
@@ -369,7 +357,14 @@ static inline bool kvm_pat_valid(u64 data)
return (data | ((data & 0x0202020202020202ull) << 1)) == data;
}
+static inline bool kvm_dr7_valid(u64 data)
+{
+ /* Bits [63:32] are reserved */
+ return !(data >> 32);
+}
+
void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu);
void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu);
+u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu);
#endif
diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c
index 306c3a0902ba..31600d851fd8 100644
--- a/arch/x86/lib/insn-eval.c
+++ b/arch/x86/lib/insn-eval.c
@@ -155,7 +155,7 @@ static bool check_seg_overrides(struct insn *insn, int regoff)
*/
static int resolve_default_seg(struct insn *insn, struct pt_regs *regs, int off)
{
- if (user_64bit_mode(regs))
+ if (any_64bit_mode(regs))
return INAT_SEG_REG_IGNORE;
/*
* Resolve the default segment register as described in Section 3.7.4
@@ -266,7 +266,7 @@ static int resolve_seg_reg(struct insn *insn, struct pt_regs *regs, int regoff)
* which may be invalid at this point.
*/
if (regoff == offsetof(struct pt_regs, ip)) {
- if (user_64bit_mode(regs))
+ if (any_64bit_mode(regs))
return INAT_SEG_REG_IGNORE;
else
return INAT_SEG_REG_CS;
@@ -289,7 +289,7 @@ static int resolve_seg_reg(struct insn *insn, struct pt_regs *regs, int regoff)
* In long mode, segment override prefixes are ignored, except for
* overrides for FS and GS.
*/
- if (user_64bit_mode(regs)) {
+ if (any_64bit_mode(regs)) {
if (idx != INAT_SEG_REG_FS &&
idx != INAT_SEG_REG_GS)
idx = INAT_SEG_REG_IGNORE;
@@ -646,23 +646,27 @@ unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx)
*/
return (unsigned long)(sel << 4);
- if (user_64bit_mode(regs)) {
+ if (any_64bit_mode(regs)) {
/*
* Only FS or GS will have a base address, the rest of
* the segments' bases are forced to 0.
*/
unsigned long base;
- if (seg_reg_idx == INAT_SEG_REG_FS)
+ if (seg_reg_idx == INAT_SEG_REG_FS) {
rdmsrl(MSR_FS_BASE, base);
- else if (seg_reg_idx == INAT_SEG_REG_GS)
+ } else if (seg_reg_idx == INAT_SEG_REG_GS) {
/*
* swapgs was called at the kernel entry point. Thus,
* MSR_KERNEL_GS_BASE will have the user-space GS base.
*/
- rdmsrl(MSR_KERNEL_GS_BASE, base);
- else
+ if (user_mode(regs))
+ rdmsrl(MSR_KERNEL_GS_BASE, base);
+ else
+ rdmsrl(MSR_GS_BASE, base);
+ } else {
base = 0;
+ }
return base;
}
@@ -703,7 +707,7 @@ static unsigned long get_seg_limit(struct pt_regs *regs, int seg_reg_idx)
if (sel < 0)
return 0;
- if (user_64bit_mode(regs) || v8086_mode(regs))
+ if (any_64bit_mode(regs) || v8086_mode(regs))
return -1L;
if (!sel)
@@ -948,7 +952,7 @@ static int get_eff_addr_modrm(struct insn *insn, struct pt_regs *regs,
* following instruction.
*/
if (*regoff == -EDOM) {
- if (user_64bit_mode(regs))
+ if (any_64bit_mode(regs))
tmp = regs->ip + insn->length;
else
tmp = 0;
@@ -1250,7 +1254,7 @@ static void __user *get_addr_ref_32(struct insn *insn, struct pt_regs *regs)
* After computed, the effective address is treated as an unsigned
* quantity.
*/
- if (!user_64bit_mode(regs) && ((unsigned int)eff_addr > seg_limit))
+ if (!any_64bit_mode(regs) && ((unsigned int)eff_addr > seg_limit))
goto out;
/*
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 337830d7a59c..7ff00ea64e4f 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -29,10 +29,7 @@
SYM_FUNC_START_ALIAS(memmove)
SYM_FUNC_START(__memmove)
- /* Handle more 32 bytes in loop */
mov %rdi, %rax
- cmp $0x20, %rdx
- jb 1f
/* Decide forward/backward copy mode */
cmp %rdi, %rsi
@@ -42,7 +39,9 @@ SYM_FUNC_START(__memmove)
cmp %rdi, %r8
jg 2f
+ /* FSRM implies ERMS => no length checks, do the copy directly */
.Lmemmove_begin_forward:
+ ALTERNATIVE "cmp $0x20, %rdx; jb 1f", "", X86_FEATURE_FSRM
ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS
/*
@@ -114,6 +113,8 @@ SYM_FUNC_START(__memmove)
*/
.p2align 4
2:
+ cmp $0x20, %rdx
+ jb 1f
cmp $680, %rdx
jb 6f
cmp %dil, %sil
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index 8908c58bd6cd..53adc1762ec0 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -929,7 +929,7 @@ EndTable
GrpTable: Grp3_2
0: TEST Ev,Iz
-1:
+1: TEST Ev,Iz
2: NOT Ev
3: NEG Ev
4: MUL rAX,Ev
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 3b89c201ac26..98f7c6fa2eaa 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -12,8 +12,10 @@ CFLAGS_REMOVE_mem_encrypt.o = -pg
CFLAGS_REMOVE_mem_encrypt_identity.o = -pg
endif
-obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
- pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o maccess.o
+obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o mmap.o \
+ pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o maccess.o
+
+obj-y += pat/
# Make sure __phys_addr has no stackprotector
nostackp := $(call cc-option, -fno-stack-protector)
@@ -23,13 +25,11 @@ CFLAGS_mem_encrypt_identity.o := $(nostackp)
CFLAGS_fault.o := -I $(srctree)/$(src)/../include/asm/trace
-obj-$(CONFIG_X86_PAT) += pat_interval.o
-
obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
-obj-$(CONFIG_X86_PTDUMP_CORE) += dump_pagetables.o
-obj-$(CONFIG_X86_PTDUMP) += debug_pagetables.o
+obj-$(CONFIG_PTDUMP_CORE) += dump_pagetables.o
+obj-$(CONFIG_PTDUMP_DEBUGFS) += debug_pagetables.o
obj-$(CONFIG_HIGHMEM) += highmem_32.o
@@ -45,7 +45,6 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o
obj-$(CONFIG_ACPI_NUMA) += srat.o
obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
-obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o
diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c
index 39001a401eff..4a3b62f780b4 100644
--- a/arch/x86/mm/debug_pagetables.c
+++ b/arch/x86/mm/debug_pagetables.c
@@ -7,7 +7,7 @@
static int ptdump_show(struct seq_file *m, void *v)
{
- ptdump_walk_pgd_level_debugfs(m, NULL, false);
+ ptdump_walk_pgd_level_debugfs(m, &init_mm, false);
return 0;
}
@@ -15,11 +15,8 @@ DEFINE_SHOW_ATTRIBUTE(ptdump);
static int ptdump_curknl_show(struct seq_file *m, void *v)
{
- if (current->mm->pgd) {
- down_read(&current->mm->mmap_sem);
- ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, false);
- up_read(&current->mm->mmap_sem);
- }
+ if (current->mm->pgd)
+ ptdump_walk_pgd_level_debugfs(m, current->mm, false);
return 0;
}
@@ -28,11 +25,8 @@ DEFINE_SHOW_ATTRIBUTE(ptdump_curknl);
#ifdef CONFIG_PAGE_TABLE_ISOLATION
static int ptdump_curusr_show(struct seq_file *m, void *v)
{
- if (current->mm->pgd) {
- down_read(&current->mm->mmap_sem);
- ptdump_walk_pgd_level_debugfs(m, current->mm->pgd, true);
- up_read(&current->mm->mmap_sem);
- }
+ if (current->mm->pgd)
+ ptdump_walk_pgd_level_debugfs(m, current->mm, true);
return 0;
}
@@ -43,7 +37,7 @@ DEFINE_SHOW_ATTRIBUTE(ptdump_curusr);
static int ptdump_efi_show(struct seq_file *m, void *v)
{
if (efi_mm.pgd)
- ptdump_walk_pgd_level_debugfs(m, efi_mm.pgd, false);
+ ptdump_walk_pgd_level_debugfs(m, &efi_mm, false);
return 0;
}
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index ab67822fd2f4..69309cd56fdf 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -16,6 +16,7 @@
#include <linux/seq_file.h>
#include <linux/highmem.h>
#include <linux/pci.h>
+#include <linux/ptdump.h>
#include <asm/e820/types.h>
#include <asm/pgtable.h>
@@ -26,16 +27,18 @@
* when a "break" in the continuity is found.
*/
struct pg_state {
+ struct ptdump_state ptdump;
int level;
- pgprot_t current_prot;
+ pgprotval_t current_prot;
pgprotval_t effective_prot;
+ pgprotval_t prot_levels[5];
unsigned long start_address;
- unsigned long current_address;
const struct addr_marker *marker;
unsigned long lines;
bool to_dmesg;
bool check_wx;
unsigned long wx_pages;
+ struct seq_file *seq;
};
struct addr_marker {
@@ -174,11 +177,10 @@ static struct addr_marker address_markers[] = {
/*
* Print a readable form of a pgprot_t to the seq_file
*/
-static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
+static void printk_prot(struct seq_file *m, pgprotval_t pr, int level, bool dmsg)
{
- pgprotval_t pr = pgprot_val(prot);
static const char * const level_name[] =
- { "cr3", "pgd", "p4d", "pud", "pmd", "pte" };
+ { "pgd", "p4d", "pud", "pmd", "pte" };
if (!(pr & _PAGE_PRESENT)) {
/* Not present */
@@ -202,12 +204,12 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
pt_dump_cont_printf(m, dmsg, " ");
/* Bit 7 has a different meaning on level 3 vs 4 */
- if (level <= 4 && pr & _PAGE_PSE)
+ if (level <= 3 && pr & _PAGE_PSE)
pt_dump_cont_printf(m, dmsg, "PSE ");
else
pt_dump_cont_printf(m, dmsg, " ");
- if ((level == 5 && pr & _PAGE_PAT) ||
- ((level == 4 || level == 3) && pr & _PAGE_PAT_LARGE))
+ if ((level == 4 && pr & _PAGE_PAT) ||
+ ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE))
pt_dump_cont_printf(m, dmsg, "PAT ");
else
pt_dump_cont_printf(m, dmsg, " ");
@@ -223,24 +225,11 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]);
}
-/*
- * On 64 bits, sign-extend the 48 bit address to 64 bit
- */
-static unsigned long normalize_addr(unsigned long u)
-{
- int shift;
- if (!IS_ENABLED(CONFIG_X86_64))
- return u;
-
- shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
- return (signed long)(u << shift) >> shift;
-}
-
-static void note_wx(struct pg_state *st)
+static void note_wx(struct pg_state *st, unsigned long addr)
{
unsigned long npages;
- npages = (st->current_address - st->start_address) / PAGE_SIZE;
+ npages = (addr - st->start_address) / PAGE_SIZE;
#ifdef CONFIG_PCI_BIOS
/*
@@ -248,7 +237,7 @@ static void note_wx(struct pg_state *st)
* Inform about it, but avoid the warning.
*/
if (pcibios_enabled && st->start_address >= PAGE_OFFSET + BIOS_BEGIN &&
- st->current_address <= PAGE_OFFSET + BIOS_END) {
+ addr <= PAGE_OFFSET + BIOS_END) {
pr_warn_once("x86/mm: PCI BIOS W+X mapping %lu pages\n", npages);
return;
}
@@ -260,27 +249,47 @@ static void note_wx(struct pg_state *st)
(void *)st->start_address);
}
+static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2)
+{
+ return (prot1 & prot2 & (_PAGE_USER | _PAGE_RW)) |
+ ((prot1 | prot2) & _PAGE_NX);
+}
+
/*
* This function gets called on a break in a continuous series
* of PTE entries; the next one is different so we need to
* print what we collected so far.
*/
-static void note_page(struct seq_file *m, struct pg_state *st,
- pgprot_t new_prot, pgprotval_t new_eff, int level)
+static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level,
+ unsigned long val)
{
- pgprotval_t prot, cur, eff;
+ struct pg_state *st = container_of(pt_st, struct pg_state, ptdump);
+ pgprotval_t new_prot, new_eff;
+ pgprotval_t cur, eff;
static const char units[] = "BKMGTPE";
+ struct seq_file *m = st->seq;
+
+ new_prot = val & PTE_FLAGS_MASK;
+
+ if (level > 0) {
+ new_eff = effective_prot(st->prot_levels[level - 1],
+ new_prot);
+ } else {
+ new_eff = new_prot;
+ }
+
+ if (level >= 0)
+ st->prot_levels[level] = new_eff;
/*
* If we have a "break" in the series, we need to flush the state that
* we have now. "break" is either changing perms, levels or
* address space marker.
*/
- prot = pgprot_val(new_prot);
- cur = pgprot_val(st->current_prot);
+ cur = st->current_prot;
eff = st->effective_prot;
- if (!st->level) {
+ if (st->level == -1) {
/* First entry */
st->current_prot = new_prot;
st->effective_prot = new_eff;
@@ -289,14 +298,14 @@ static void note_page(struct seq_file *m, struct pg_state *st,
st->lines = 0;
pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
st->marker->name);
- } else if (prot != cur || new_eff != eff || level != st->level ||
- st->current_address >= st->marker[1].start_address) {
+ } else if (new_prot != cur || new_eff != eff || level != st->level ||
+ addr >= st->marker[1].start_address) {
const char *unit = units;
unsigned long delta;
int width = sizeof(unsigned long) * 2;
if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX))
- note_wx(st);
+ note_wx(st, addr);
/*
* Now print the actual finished series
@@ -306,9 +315,9 @@ static void note_page(struct seq_file *m, struct pg_state *st,
pt_dump_seq_printf(m, st->to_dmesg,
"0x%0*lx-0x%0*lx ",
width, st->start_address,
- width, st->current_address);
+ width, addr);
- delta = st->current_address - st->start_address;
+ delta = addr - st->start_address;
while (!(delta & 1023) && unit[1]) {
delta >>= 10;
unit++;
@@ -325,7 +334,7 @@ static void note_page(struct seq_file *m, struct pg_state *st,
* such as the start of vmalloc space etc.
* This helps in the interpretation.
*/
- if (st->current_address >= st->marker[1].start_address) {
+ if (addr >= st->marker[1].start_address) {
if (st->marker->max_lines &&
st->lines > st->marker->max_lines) {
unsigned long nskip =
@@ -341,222 +350,40 @@ static void note_page(struct seq_file *m, struct pg_state *st,
st->marker->name);
}
- st->start_address = st->current_address;
+ st->start_address = addr;
st->current_prot = new_prot;
st->effective_prot = new_eff;
st->level = level;
}
}
-static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2)
-{
- return (prot1 & prot2 & (_PAGE_USER | _PAGE_RW)) |
- ((prot1 | prot2) & _PAGE_NX);
-}
-
-static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
- pgprotval_t eff_in, unsigned long P)
-{
- int i;
- pte_t *pte;
- pgprotval_t prot, eff;
-
- for (i = 0; i < PTRS_PER_PTE; i++) {
- st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
- pte = pte_offset_map(&addr, st->current_address);
- prot = pte_flags(*pte);
- eff = effective_prot(eff_in, prot);
- note_page(m, st, __pgprot(prot), eff, 5);
- pte_unmap(pte);
- }
-}
-#ifdef CONFIG_KASAN
-
-/*
- * This is an optimization for KASAN=y case. Since all kasan page tables
- * eventually point to the kasan_early_shadow_page we could call note_page()
- * right away without walking through lower level page tables. This saves
- * us dozens of seconds (minutes for 5-level config) while checking for
- * W+X mapping or reading kernel_page_tables debugfs file.
- */
-static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
- void *pt)
-{
- if (__pa(pt) == __pa(kasan_early_shadow_pmd) ||
- (pgtable_l5_enabled() &&
- __pa(pt) == __pa(kasan_early_shadow_p4d)) ||
- __pa(pt) == __pa(kasan_early_shadow_pud)) {
- pgprotval_t prot = pte_flags(kasan_early_shadow_pte[0]);
- note_page(m, st, __pgprot(prot), 0, 5);
- return true;
- }
- return false;
-}
-#else
-static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
- void *pt)
-{
- return false;
-}
-#endif
-
-#if PTRS_PER_PMD > 1
-
-static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
- pgprotval_t eff_in, unsigned long P)
-{
- int i;
- pmd_t *start, *pmd_start;
- pgprotval_t prot, eff;
-
- pmd_start = start = (pmd_t *)pud_page_vaddr(addr);
- for (i = 0; i < PTRS_PER_PMD; i++) {
- st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
- if (!pmd_none(*start)) {
- prot = pmd_flags(*start);
- eff = effective_prot(eff_in, prot);
- if (pmd_large(*start) || !pmd_present(*start)) {
- note_page(m, st, __pgprot(prot), eff, 4);
- } else if (!kasan_page_table(m, st, pmd_start)) {
- walk_pte_level(m, st, *start, eff,
- P + i * PMD_LEVEL_MULT);
- }
- } else
- note_page(m, st, __pgprot(0), 0, 4);
- start++;
- }
-}
-
-#else
-#define walk_pmd_level(m,s,a,e,p) walk_pte_level(m,s,__pmd(pud_val(a)),e,p)
-#define pud_large(a) pmd_large(__pmd(pud_val(a)))
-#define pud_none(a) pmd_none(__pmd(pud_val(a)))
-#endif
-
-#if PTRS_PER_PUD > 1
-
-static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr,
- pgprotval_t eff_in, unsigned long P)
-{
- int i;
- pud_t *start, *pud_start;
- pgprotval_t prot, eff;
-
- pud_start = start = (pud_t *)p4d_page_vaddr(addr);
-
- for (i = 0; i < PTRS_PER_PUD; i++) {
- st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
- if (!pud_none(*start)) {
- prot = pud_flags(*start);
- eff = effective_prot(eff_in, prot);
- if (pud_large(*start) || !pud_present(*start)) {
- note_page(m, st, __pgprot(prot), eff, 3);
- } else if (!kasan_page_table(m, st, pud_start)) {
- walk_pmd_level(m, st, *start, eff,
- P + i * PUD_LEVEL_MULT);
- }
- } else
- note_page(m, st, __pgprot(0), 0, 3);
-
- start++;
- }
-}
-
-#else
-#define walk_pud_level(m,s,a,e,p) walk_pmd_level(m,s,__pud(p4d_val(a)),e,p)
-#define p4d_large(a) pud_large(__pud(p4d_val(a)))
-#define p4d_none(a) pud_none(__pud(p4d_val(a)))
-#endif
-
-static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
- pgprotval_t eff_in, unsigned long P)
-{
- int i;
- p4d_t *start, *p4d_start;
- pgprotval_t prot, eff;
-
- if (PTRS_PER_P4D == 1)
- return walk_pud_level(m, st, __p4d(pgd_val(addr)), eff_in, P);
-
- p4d_start = start = (p4d_t *)pgd_page_vaddr(addr);
-
- for (i = 0; i < PTRS_PER_P4D; i++) {
- st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT);
- if (!p4d_none(*start)) {
- prot = p4d_flags(*start);
- eff = effective_prot(eff_in, prot);
- if (p4d_large(*start) || !p4d_present(*start)) {
- note_page(m, st, __pgprot(prot), eff, 2);
- } else if (!kasan_page_table(m, st, p4d_start)) {
- walk_pud_level(m, st, *start, eff,
- P + i * P4D_LEVEL_MULT);
- }
- } else
- note_page(m, st, __pgprot(0), 0, 2);
-
- start++;
- }
-}
-
-#define pgd_large(a) (pgtable_l5_enabled() ? pgd_large(a) : p4d_large(__p4d(pgd_val(a))))
-#define pgd_none(a) (pgtable_l5_enabled() ? pgd_none(a) : p4d_none(__p4d(pgd_val(a))))
-
-static inline bool is_hypervisor_range(int idx)
+static void ptdump_walk_pgd_level_core(struct seq_file *m,
+ struct mm_struct *mm, pgd_t *pgd,
+ bool checkwx, bool dmesg)
{
+ const struct ptdump_range ptdump_ranges[] = {
#ifdef CONFIG_X86_64
- /*
- * A hole in the beginning of kernel address space reserved
- * for a hypervisor.
- */
- return (idx >= pgd_index(GUARD_HOLE_BASE_ADDR)) &&
- (idx < pgd_index(GUARD_HOLE_END_ADDR));
+ {0, PTRS_PER_PGD * PGD_LEVEL_MULT / 2},
+ {GUARD_HOLE_END_ADDR, ~0UL},
#else
- return false;
+ {0, ~0UL},
#endif
-}
-
-static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
- bool checkwx, bool dmesg)
-{
- pgd_t *start = INIT_PGD;
- pgprotval_t prot, eff;
- int i;
- struct pg_state st = {};
-
- if (pgd) {
- start = pgd;
- st.to_dmesg = dmesg;
- }
+ {0, 0}
+};
- st.check_wx = checkwx;
- if (checkwx)
- st.wx_pages = 0;
+ struct pg_state st = {
+ .ptdump = {
+ .note_page = note_page,
+ .range = ptdump_ranges
+ },
+ .level = -1,
+ .to_dmesg = dmesg,
+ .check_wx = checkwx,
+ .seq = m
+ };
- for (i = 0; i < PTRS_PER_PGD; i++) {
- st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
- if (!pgd_none(*start) && !is_hypervisor_range(i)) {
- prot = pgd_flags(*start);
-#ifdef CONFIG_X86_PAE
- eff = _PAGE_USER | _PAGE_RW;
-#else
- eff = prot;
-#endif
- if (pgd_large(*start) || !pgd_present(*start)) {
- note_page(m, &st, __pgprot(prot), eff, 1);
- } else {
- walk_p4d_level(m, &st, *start, eff,
- i * PGD_LEVEL_MULT);
- }
- } else
- note_page(m, &st, __pgprot(0), 0, 1);
-
- cond_resched();
- start++;
- }
+ ptdump_walk_pgd(&st.ptdump, mm, pgd);
- /* Flush out the last page */
- st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT);
- note_page(m, &st, __pgprot(0), 0, 0);
if (!checkwx)
return;
if (st.wx_pages)
@@ -566,18 +393,20 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n");
}
-void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
+void ptdump_walk_pgd_level(struct seq_file *m, struct mm_struct *mm)
{
- ptdump_walk_pgd_level_core(m, pgd, false, true);
+ ptdump_walk_pgd_level_core(m, mm, mm->pgd, false, true);
}
-void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user)
+void ptdump_walk_pgd_level_debugfs(struct seq_file *m, struct mm_struct *mm,
+ bool user)
{
+ pgd_t *pgd = mm->pgd;
#ifdef CONFIG_PAGE_TABLE_ISOLATION
if (user && boot_cpu_has(X86_FEATURE_PTI))
pgd = kernel_to_user_pgdp(pgd);
#endif
- ptdump_walk_pgd_level_core(m, pgd, false, false);
+ ptdump_walk_pgd_level_core(m, mm, pgd, false, false);
}
EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);
@@ -592,13 +421,13 @@ void ptdump_walk_user_pgd_level_checkwx(void)
pr_info("x86/mm: Checking user space page tables\n");
pgd = kernel_to_user_pgdp(pgd);
- ptdump_walk_pgd_level_core(NULL, pgd, true, false);
+ ptdump_walk_pgd_level_core(NULL, &init_mm, pgd, true, false);
#endif
}
void ptdump_walk_pgd_level_checkwx(void)
{
- ptdump_walk_pgd_level_core(NULL, NULL, true, false);
+ ptdump_walk_pgd_level_core(NULL, &init_mm, INIT_PGD, true, false);
}
static int __init pt_dump_init(void)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 304d31d8cbbc..629fdf13f846 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -29,6 +29,7 @@
#include <asm/efi.h> /* efi_recover_from_page_fault()*/
#include <asm/desc.h> /* store_idt(), ... */
#include <asm/cpu_entry_area.h> /* exception stack */
+#include <asm/pgtable_areas.h> /* VMALLOC_START, ... */
#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>
@@ -189,7 +190,7 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
return pmd_k;
}
-void vmalloc_sync_all(void)
+static void vmalloc_sync(void)
{
unsigned long address;
@@ -216,6 +217,16 @@ void vmalloc_sync_all(void)
}
}
+void vmalloc_sync_mappings(void)
+{
+ vmalloc_sync();
+}
+
+void vmalloc_sync_unmappings(void)
+{
+ vmalloc_sync();
+}
+
/*
* 32-bit:
*
@@ -318,11 +329,23 @@ out:
#else /* CONFIG_X86_64: */
-void vmalloc_sync_all(void)
+void vmalloc_sync_mappings(void)
{
+ /*
+ * 64-bit mappings might allocate new p4d/pud pages
+ * that need to be propagated to all tasks' PGDs.
+ */
sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
}
+void vmalloc_sync_unmappings(void)
+{
+ /*
+ * Unmappings never allocate or free p4d/pud pages.
+ * No work is required here.
+ */
+}
+
/*
* 64-bit:
*
@@ -1486,27 +1509,6 @@ good_area:
}
NOKPROBE_SYMBOL(do_user_addr_fault);
-/*
- * Explicitly marked noinline such that the function tracer sees this as the
- * page_fault entry point.
- */
-static noinline void
-__do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
- unsigned long address)
-{
- prefetchw(&current->mm->mmap_sem);
-
- if (unlikely(kmmio_fault(regs, address)))
- return;
-
- /* Was the fault on kernel-controlled part of the address space? */
- if (unlikely(fault_in_kernel_space(address)))
- do_kern_addr_fault(regs, hw_error_code, address);
- else
- do_user_addr_fault(regs, hw_error_code, address);
-}
-NOKPROBE_SYMBOL(__do_page_fault);
-
static __always_inline void
trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code,
unsigned long address)
@@ -1521,13 +1523,19 @@ trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code,
}
dotraplinkage void
-do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
+ unsigned long address)
{
- enum ctx_state prev_state;
+ prefetchw(&current->mm->mmap_sem);
+ trace_page_fault_entries(regs, hw_error_code, address);
+
+ if (unlikely(kmmio_fault(regs, address)))
+ return;
- prev_state = exception_enter();
- trace_page_fault_entries(regs, error_code, address);
- __do_page_fault(regs, error_code, address);
- exception_exit(prev_state);
+ /* Was the fault on kernel-controlled part of the address space? */
+ if (unlikely(fault_in_kernel_space(address)))
+ do_kern_addr_fault(regs, hw_error_code, address);
+ else
+ do_user_addr_fault(regs, hw_error_code, address);
}
NOKPROBE_SYMBOL(do_page_fault);
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index fab095362c50..5bfd5aef5378 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -19,7 +19,6 @@
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>
#include <asm/elf.h>
-#include <asm/mpx.h>
#if 0 /* This is just for testing */
struct page *
@@ -151,10 +150,6 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
if (len & ~huge_page_mask(h))
return -EINVAL;
- addr = mpx_unmapped_area_check(addr, len, flags);
- if (IS_ERR_VALUE(addr))
- return addr;
-
if (len > TASK_SIZE)
return -ENOMEM;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 0a74407ef92e..23df4885bbed 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -52,6 +52,7 @@
#include <asm/page_types.h>
#include <asm/cpu_entry_area.h>
#include <asm/init.h>
+#include <asm/pgtable_areas.h>
#include "mm_internal.h"
@@ -872,34 +873,6 @@ void arch_remove_memory(int nid, u64 start, u64 size,
int kernel_set_to_readonly __read_mostly;
-void set_kernel_text_rw(void)
-{
- unsigned long start = PFN_ALIGN(_text);
- unsigned long size = PFN_ALIGN(_etext) - start;
-
- if (!kernel_set_to_readonly)
- return;
-
- pr_debug("Set kernel text: %lx - %lx for read write\n",
- start, start+size);
-
- set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
-}
-
-void set_kernel_text_ro(void)
-{
- unsigned long start = PFN_ALIGN(_text);
- unsigned long size = PFN_ALIGN(_etext) - start;
-
- if (!kernel_set_to_readonly)
- return;
-
- pr_debug("Set kernel text: %lx - %lx for read only\n",
- start, start+size);
-
- set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
-}
-
static void mark_nxdata_nx(void)
{
/*
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index bcfede46fe02..abbdecb75fad 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1258,42 +1258,6 @@ void __init mem_init(void)
int kernel_set_to_readonly;
-void set_kernel_text_rw(void)
-{
- unsigned long start = PFN_ALIGN(_text);
- unsigned long end = PFN_ALIGN(_etext);
-
- if (!kernel_set_to_readonly)
- return;
-
- pr_debug("Set kernel text: %lx - %lx for read write\n",
- start, end);
-
- /*
- * Make the kernel identity mapping for text RW. Kernel text
- * mapping will always be RO. Refer to the comment in
- * static_protections() in pageattr.c
- */
- set_memory_rw(start, (end - start) >> PAGE_SHIFT);
-}
-
-void set_kernel_text_ro(void)
-{
- unsigned long start = PFN_ALIGN(_text);
- unsigned long end = PFN_ALIGN(_etext);
-
- if (!kernel_set_to_readonly)
- return;
-
- pr_debug("Set kernel text: %lx - %lx for read only\n",
- start, end);
-
- /*
- * Set the kernel identity mapping for text RO.
- */
- set_memory_ro(start, (end - start) >> PAGE_SHIFT);
-}
-
void mark_rodata_ro(void)
{
unsigned long start = PFN_ALIGN(_text);
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index 6748b4c2baff..f60398aeb644 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -4,7 +4,7 @@
*/
#include <asm/iomap.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include <linux/export.h>
#include <linux/highmem.h>
@@ -26,7 +26,7 @@ int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot)
if (!is_io_mapping_possible(base, size))
return -EINVAL;
- ret = io_reserve_memtype(base, base + size, &pcm);
+ ret = memtype_reserve_io(base, base + size, &pcm);
if (ret)
return ret;
@@ -40,7 +40,7 @@ EXPORT_SYMBOL_GPL(iomap_create_wc);
void iomap_free(resource_size_t base, unsigned long size)
{
- io_free_memtype(base, base + size);
+ memtype_free_io(base, base + size);
}
EXPORT_SYMBOL_GPL(iomap_free);
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index b3a2936377b5..935a91e1fd77 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -24,7 +24,7 @@
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include <asm/setup.h>
#include "physaddr.h"
@@ -106,6 +106,19 @@ static unsigned int __ioremap_check_encrypted(struct resource *res)
return 0;
}
+/*
+ * The EFI runtime services data area is not covered by walk_mem_res(), but must
+ * be mapped encrypted when SEV is active.
+ */
+static void __ioremap_check_other(resource_size_t addr, struct ioremap_desc *desc)
+{
+ if (!sev_active())
+ return;
+
+ if (efi_mem_type(addr) == EFI_RUNTIME_SERVICES_DATA)
+ desc->flags |= IORES_MAP_ENCRYPTED;
+}
+
static int __ioremap_collect_map_flags(struct resource *res, void *arg)
{
struct ioremap_desc *desc = arg;
@@ -124,6 +137,9 @@ static int __ioremap_collect_map_flags(struct resource *res, void *arg)
* To avoid multiple resource walks, this function walks resources marked as
* IORESOURCE_MEM and IORESOURCE_BUSY and looking for system RAM and/or a
* resource described not as IORES_DESC_NONE (e.g. IORES_DESC_ACPI_TABLES).
+ *
+ * After that, deal with misc other ranges in __ioremap_check_other() which do
+ * not fall into the above category.
*/
static void __ioremap_check_mem(resource_size_t addr, unsigned long size,
struct ioremap_desc *desc)
@@ -135,6 +151,8 @@ static void __ioremap_check_mem(resource_size_t addr, unsigned long size,
memset(desc, 0, sizeof(struct ioremap_desc));
walk_mem_res(start, end, desc, __ioremap_collect_map_flags);
+
+ __ioremap_check_other(addr, desc);
}
/*
@@ -196,10 +214,10 @@ __ioremap_caller(resource_size_t phys_addr, unsigned long size,
phys_addr &= PHYSICAL_PAGE_MASK;
size = PAGE_ALIGN(last_addr+1) - phys_addr;
- retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
+ retval = memtype_reserve(phys_addr, (u64)phys_addr + size,
pcm, &new_pcm);
if (retval) {
- printk(KERN_ERR "ioremap reserve_memtype failed %d\n", retval);
+ printk(KERN_ERR "ioremap memtype_reserve failed %d\n", retval);
return NULL;
}
@@ -255,7 +273,7 @@ __ioremap_caller(resource_size_t phys_addr, unsigned long size,
area->phys_addr = phys_addr;
vaddr = (unsigned long) area->addr;
- if (kernel_map_sync_memtype(phys_addr, size, pcm))
+ if (memtype_kernel_map_sync(phys_addr, size, pcm))
goto err_free_area;
if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot))
@@ -275,7 +293,7 @@ __ioremap_caller(resource_size_t phys_addr, unsigned long size,
err_free_area:
free_vm_area(area);
err_free_memtype:
- free_memtype(phys_addr, phys_addr + size);
+ memtype_free(phys_addr, phys_addr + size);
return NULL;
}
@@ -451,7 +469,7 @@ void iounmap(volatile void __iomem *addr)
return;
}
- free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
+ memtype_free(p->phys_addr, p->phys_addr + get_vm_area_size(p));
/* Finally remove it */
o = remove_vm_area((void __force *)addr);
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index cf5bc37c90ac..763e71abc0fe 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -288,23 +288,6 @@ static void __init kasan_shallow_populate_pgds(void *start, void *end)
} while (pgd++, addr = next, addr != (unsigned long)end);
}
-#ifdef CONFIG_KASAN_INLINE
-static int kasan_die_handler(struct notifier_block *self,
- unsigned long val,
- void *data)
-{
- if (val == DIE_GPF) {
- pr_emerg("CONFIG_KASAN_INLINE enabled\n");
- pr_emerg("GPF could be caused by NULL-ptr deref or user memory access\n");
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block kasan_die_notifier = {
- .notifier_call = kasan_die_handler,
-};
-#endif
-
void __init kasan_early_init(void)
{
int i;
@@ -341,10 +324,6 @@ void __init kasan_init(void)
int i;
void *shadow_cpu_entry_begin, *shadow_cpu_entry_end;
-#ifdef CONFIG_KASAN_INLINE
- register_die_notifier(&kasan_die_notifier);
-#endif
-
memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt));
/*
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index aae9a933dfd4..cb91eccc4960 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -163,8 +163,6 @@ unsigned long get_mmap_base(int is_legacy)
const char *arch_vma_name(struct vm_area_struct *vma)
{
- if (vma->vm_flags & VM_MPX)
- return "[mpx]";
return NULL;
}
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
deleted file mode 100644
index 895fb7a9294d..000000000000
--- a/arch/x86/mm/mpx.c
+++ /dev/null
@@ -1,938 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * mpx.c - Memory Protection eXtensions
- *
- * Copyright (c) 2014, Intel Corporation.
- * Qiaowei Ren <qiaowei.ren@intel.com>
- * Dave Hansen <dave.hansen@intel.com>
- */
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/mm_types.h>
-#include <linux/mman.h>
-#include <linux/syscalls.h>
-#include <linux/sched/sysctl.h>
-
-#include <asm/insn.h>
-#include <asm/insn-eval.h>
-#include <asm/mmu_context.h>
-#include <asm/mpx.h>
-#include <asm/processor.h>
-#include <asm/fpu/internal.h>
-
-#define CREATE_TRACE_POINTS
-#include <asm/trace/mpx.h>
-
-static inline unsigned long mpx_bd_size_bytes(struct mm_struct *mm)
-{
- if (is_64bit_mm(mm))
- return MPX_BD_SIZE_BYTES_64;
- else
- return MPX_BD_SIZE_BYTES_32;
-}
-
-static inline unsigned long mpx_bt_size_bytes(struct mm_struct *mm)
-{
- if (is_64bit_mm(mm))
- return MPX_BT_SIZE_BYTES_64;
- else
- return MPX_BT_SIZE_BYTES_32;
-}
-
-/*
- * This is really a simplified "vm_mmap". it only handles MPX
- * bounds tables (the bounds directory is user-allocated).
- */
-static unsigned long mpx_mmap(unsigned long len)
-{
- struct mm_struct *mm = current->mm;
- unsigned long addr, populate;
-
- /* Only bounds table can be allocated here */
- if (len != mpx_bt_size_bytes(mm))
- return -EINVAL;
-
- down_write(&mm->mmap_sem);
- addr = do_mmap(NULL, 0, len, PROT_READ | PROT_WRITE,
- MAP_ANONYMOUS | MAP_PRIVATE, VM_MPX, 0, &populate, NULL);
- up_write(&mm->mmap_sem);
- if (populate)
- mm_populate(addr, populate);
-
- return addr;
-}
-
-static int mpx_insn_decode(struct insn *insn,
- struct pt_regs *regs)
-{
- unsigned char buf[MAX_INSN_SIZE];
- int x86_64 = !test_thread_flag(TIF_IA32);
- int not_copied;
- int nr_copied;
-
- not_copied = copy_from_user(buf, (void __user *)regs->ip, sizeof(buf));
- nr_copied = sizeof(buf) - not_copied;
- /*
- * The decoder _should_ fail nicely if we pass it a short buffer.
- * But, let's not depend on that implementation detail. If we
- * did not get anything, just error out now.
- */
- if (!nr_copied)
- return -EFAULT;
- insn_init(insn, buf, nr_copied, x86_64);
- insn_get_length(insn);
- /*
- * copy_from_user() tries to get as many bytes as we could see in
- * the largest possible instruction. If the instruction we are
- * after is shorter than that _and_ we attempt to copy from
- * something unreadable, we might get a short read. This is OK
- * as long as the read did not stop in the middle of the
- * instruction. Check to see if we got a partial instruction.
- */
- if (nr_copied < insn->length)
- return -EFAULT;
-
- insn_get_opcode(insn);
- /*
- * We only _really_ need to decode bndcl/bndcn/bndcu
- * Error out on anything else.
- */
- if (insn->opcode.bytes[0] != 0x0f)
- goto bad_opcode;
- if ((insn->opcode.bytes[1] != 0x1a) &&
- (insn->opcode.bytes[1] != 0x1b))
- goto bad_opcode;
-
- return 0;
-bad_opcode:
- return -EINVAL;
-}
-
-/*
- * If a bounds overflow occurs then a #BR is generated. This
- * function decodes MPX instructions to get violation address
- * and set this address into extended struct siginfo.
- *
- * Note that this is not a super precise way of doing this.
- * Userspace could have, by the time we get here, written
- * anything it wants in to the instructions. We can not
- * trust anything about it. They might not be valid
- * instructions or might encode invalid registers, etc...
- */
-int mpx_fault_info(struct mpx_fault_info *info, struct pt_regs *regs)
-{
- const struct mpx_bndreg_state *bndregs;
- const struct mpx_bndreg *bndreg;
- struct insn insn;
- uint8_t bndregno;
- int err;
-
- err = mpx_insn_decode(&insn, regs);
- if (err)
- goto err_out;
-
- /*
- * We know at this point that we are only dealing with
- * MPX instructions.
- */
- insn_get_modrm(&insn);
- bndregno = X86_MODRM_REG(insn.modrm.value);
- if (bndregno > 3) {
- err = -EINVAL;
- goto err_out;
- }
- /* get bndregs field from current task's xsave area */
- bndregs = get_xsave_field_ptr(XFEATURE_BNDREGS);
- if (!bndregs) {
- err = -EINVAL;
- goto err_out;
- }
- /* now go select the individual register in the set of 4 */
- bndreg = &bndregs->bndreg[bndregno];
-
- /*
- * The registers are always 64-bit, but the upper 32
- * bits are ignored in 32-bit mode. Also, note that the
- * upper bounds are architecturally represented in 1's
- * complement form.
- *
- * The 'unsigned long' cast is because the compiler
- * complains when casting from integers to different-size
- * pointers.
- */
- info->lower = (void __user *)(unsigned long)bndreg->lower_bound;
- info->upper = (void __user *)(unsigned long)~bndreg->upper_bound;
- info->addr = insn_get_addr_ref(&insn, regs);
-
- /*
- * We were not able to extract an address from the instruction,
- * probably because there was something invalid in it.
- */
- if (info->addr == (void __user *)-1) {
- err = -EINVAL;
- goto err_out;
- }
- trace_mpx_bounds_register_exception(info->addr, bndreg);
- return 0;
-err_out:
- /* info might be NULL, but kfree() handles that */
- return err;
-}
-
-static __user void *mpx_get_bounds_dir(void)
-{
- const struct mpx_bndcsr *bndcsr;
-
- if (!cpu_feature_enabled(X86_FEATURE_MPX))
- return MPX_INVALID_BOUNDS_DIR;
-
- /*
- * The bounds directory pointer is stored in a register
- * only accessible if we first do an xsave.
- */
- bndcsr = get_xsave_field_ptr(XFEATURE_BNDCSR);
- if (!bndcsr)
- return MPX_INVALID_BOUNDS_DIR;
-
- /*
- * Make sure the register looks valid by checking the
- * enable bit.
- */
- if (!(bndcsr->bndcfgu & MPX_BNDCFG_ENABLE_FLAG))
- return MPX_INVALID_BOUNDS_DIR;
-
- /*
- * Lastly, mask off the low bits used for configuration
- * flags, and return the address of the bounds table.
- */
- return (void __user *)(unsigned long)
- (bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK);
-}
-
-int mpx_enable_management(void)
-{
- void __user *bd_base = MPX_INVALID_BOUNDS_DIR;
- struct mm_struct *mm = current->mm;
- int ret = 0;
-
- /*
- * runtime in the userspace will be responsible for allocation of
- * the bounds directory. Then, it will save the base of the bounds
- * directory into XSAVE/XRSTOR Save Area and enable MPX through
- * XRSTOR instruction.
- *
- * The copy_xregs_to_kernel() beneath get_xsave_field_ptr() is
- * expected to be relatively expensive. Storing the bounds
- * directory here means that we do not have to do xsave in the
- * unmap path; we can just use mm->context.bd_addr instead.
- */
- bd_base = mpx_get_bounds_dir();
- down_write(&mm->mmap_sem);
-
- /* MPX doesn't support addresses above 47 bits yet. */
- if (find_vma(mm, DEFAULT_MAP_WINDOW)) {
- pr_warn_once("%s (%d): MPX cannot handle addresses "
- "above 47-bits. Disabling.",
- current->comm, current->pid);
- ret = -ENXIO;
- goto out;
- }
- mm->context.bd_addr = bd_base;
- if (mm->context.bd_addr == MPX_INVALID_BOUNDS_DIR)
- ret = -ENXIO;
-out:
- up_write(&mm->mmap_sem);
- return ret;
-}
-
-int mpx_disable_management(void)
-{
- struct mm_struct *mm = current->mm;
-
- if (!cpu_feature_enabled(X86_FEATURE_MPX))
- return -ENXIO;
-
- down_write(&mm->mmap_sem);
- mm->context.bd_addr = MPX_INVALID_BOUNDS_DIR;
- up_write(&mm->mmap_sem);
- return 0;
-}
-
-static int mpx_cmpxchg_bd_entry(struct mm_struct *mm,
- unsigned long *curval,
- unsigned long __user *addr,
- unsigned long old_val, unsigned long new_val)
-{
- int ret;
- /*
- * user_atomic_cmpxchg_inatomic() actually uses sizeof()
- * the pointer that we pass to it to figure out how much
- * data to cmpxchg. We have to be careful here not to
- * pass a pointer to a 64-bit data type when we only want
- * a 32-bit copy.
- */
- if (is_64bit_mm(mm)) {
- ret = user_atomic_cmpxchg_inatomic(curval,
- addr, old_val, new_val);
- } else {
- u32 uninitialized_var(curval_32);
- u32 old_val_32 = old_val;
- u32 new_val_32 = new_val;
- u32 __user *addr_32 = (u32 __user *)addr;
-
- ret = user_atomic_cmpxchg_inatomic(&curval_32,
- addr_32, old_val_32, new_val_32);
- *curval = curval_32;
- }
- return ret;
-}
-
-/*
- * With 32-bit mode, a bounds directory is 4MB, and the size of each
- * bounds table is 16KB. With 64-bit mode, a bounds directory is 2GB,
- * and the size of each bounds table is 4MB.
- */
-static int allocate_bt(struct mm_struct *mm, long __user *bd_entry)
-{
- unsigned long expected_old_val = 0;
- unsigned long actual_old_val = 0;
- unsigned long bt_addr;
- unsigned long bd_new_entry;
- int ret = 0;
-
- /*
- * Carve the virtual space out of userspace for the new
- * bounds table:
- */
- bt_addr = mpx_mmap(mpx_bt_size_bytes(mm));
- if (IS_ERR((void *)bt_addr))
- return PTR_ERR((void *)bt_addr);
- /*
- * Set the valid flag (kinda like _PAGE_PRESENT in a pte)
- */
- bd_new_entry = bt_addr | MPX_BD_ENTRY_VALID_FLAG;
-
- /*
- * Go poke the address of the new bounds table in to the
- * bounds directory entry out in userspace memory. Note:
- * we may race with another CPU instantiating the same table.
- * In that case the cmpxchg will see an unexpected
- * 'actual_old_val'.
- *
- * This can fault, but that's OK because we do not hold
- * mmap_sem at this point, unlike some of the other part
- * of the MPX code that have to pagefault_disable().
- */
- ret = mpx_cmpxchg_bd_entry(mm, &actual_old_val, bd_entry,
- expected_old_val, bd_new_entry);
- if (ret)
- goto out_unmap;
-
- /*
- * The user_atomic_cmpxchg_inatomic() will only return nonzero
- * for faults, *not* if the cmpxchg itself fails. Now we must
- * verify that the cmpxchg itself completed successfully.
- */
- /*
- * We expected an empty 'expected_old_val', but instead found
- * an apparently valid entry. Assume we raced with another
- * thread to instantiate this table and desclare succecss.
- */
- if (actual_old_val & MPX_BD_ENTRY_VALID_FLAG) {
- ret = 0;
- goto out_unmap;
- }
- /*
- * We found a non-empty bd_entry but it did not have the
- * VALID_FLAG set. Return an error which will result in
- * a SEGV since this probably means that somebody scribbled
- * some invalid data in to a bounds table.
- */
- if (expected_old_val != actual_old_val) {
- ret = -EINVAL;
- goto out_unmap;
- }
- trace_mpx_new_bounds_table(bt_addr);
- return 0;
-out_unmap:
- vm_munmap(bt_addr, mpx_bt_size_bytes(mm));
- return ret;
-}
-
-/*
- * When a BNDSTX instruction attempts to save bounds to a bounds
- * table, it will first attempt to look up the table in the
- * first-level bounds directory. If it does not find a table in
- * the directory, a #BR is generated and we get here in order to
- * allocate a new table.
- *
- * With 32-bit mode, the size of BD is 4MB, and the size of each
- * bound table is 16KB. With 64-bit mode, the size of BD is 2GB,
- * and the size of each bound table is 4MB.
- */
-static int do_mpx_bt_fault(void)
-{
- unsigned long bd_entry, bd_base;
- const struct mpx_bndcsr *bndcsr;
- struct mm_struct *mm = current->mm;
-
- bndcsr = get_xsave_field_ptr(XFEATURE_BNDCSR);
- if (!bndcsr)
- return -EINVAL;
- /*
- * Mask off the preserve and enable bits
- */
- bd_base = bndcsr->bndcfgu & MPX_BNDCFG_ADDR_MASK;
- /*
- * The hardware provides the address of the missing or invalid
- * entry via BNDSTATUS, so we don't have to go look it up.
- */
- bd_entry = bndcsr->bndstatus & MPX_BNDSTA_ADDR_MASK;
- /*
- * Make sure the directory entry is within where we think
- * the directory is.
- */
- if ((bd_entry < bd_base) ||
- (bd_entry >= bd_base + mpx_bd_size_bytes(mm)))
- return -EINVAL;
-
- return allocate_bt(mm, (long __user *)bd_entry);
-}
-
-int mpx_handle_bd_fault(void)
-{
- /*
- * Userspace never asked us to manage the bounds tables,
- * so refuse to help.
- */
- if (!kernel_managing_mpx_tables(current->mm))
- return -EINVAL;
-
- return do_mpx_bt_fault();
-}
-
-/*
- * A thin wrapper around get_user_pages(). Returns 0 if the
- * fault was resolved or -errno if not.
- */
-static int mpx_resolve_fault(long __user *addr, int write)
-{
- long gup_ret;
- int nr_pages = 1;
-
- gup_ret = get_user_pages((unsigned long)addr, nr_pages,
- write ? FOLL_WRITE : 0, NULL, NULL);
- /*
- * get_user_pages() returns number of pages gotten.
- * 0 means we failed to fault in and get anything,
- * probably because 'addr' is bad.
- */
- if (!gup_ret)
- return -EFAULT;
- /* Other error, return it */
- if (gup_ret < 0)
- return gup_ret;
- /* must have gup'd a page and gup_ret>0, success */
- return 0;
-}
-
-static unsigned long mpx_bd_entry_to_bt_addr(struct mm_struct *mm,
- unsigned long bd_entry)
-{
- unsigned long bt_addr = bd_entry;
- int align_to_bytes;
- /*
- * Bit 0 in a bt_entry is always the valid bit.
- */
- bt_addr &= ~MPX_BD_ENTRY_VALID_FLAG;
- /*
- * Tables are naturally aligned at 8-byte boundaries
- * on 64-bit and 4-byte boundaries on 32-bit. The
- * documentation makes it appear that the low bits
- * are ignored by the hardware, so we do the same.
- */
- if (is_64bit_mm(mm))
- align_to_bytes = 8;
- else
- align_to_bytes = 4;
- bt_addr &= ~(align_to_bytes-1);
- return bt_addr;
-}
-
-/*
- * We only want to do a 4-byte get_user() on 32-bit. Otherwise,
- * we might run off the end of the bounds table if we are on
- * a 64-bit kernel and try to get 8 bytes.
- */
-static int get_user_bd_entry(struct mm_struct *mm, unsigned long *bd_entry_ret,
- long __user *bd_entry_ptr)
-{
- u32 bd_entry_32;
- int ret;
-
- if (is_64bit_mm(mm))
- return get_user(*bd_entry_ret, bd_entry_ptr);
-
- /*
- * Note that get_user() uses the type of the *pointer* to
- * establish the size of the get, not the destination.
- */
- ret = get_user(bd_entry_32, (u32 __user *)bd_entry_ptr);
- *bd_entry_ret = bd_entry_32;
- return ret;
-}
-
-/*
- * Get the base of bounds tables pointed by specific bounds
- * directory entry.
- */
-static int get_bt_addr(struct mm_struct *mm,
- long __user *bd_entry_ptr,
- unsigned long *bt_addr_result)
-{
- int ret;
- int valid_bit;
- unsigned long bd_entry;
- unsigned long bt_addr;
-
- if (!access_ok((bd_entry_ptr), sizeof(*bd_entry_ptr)))
- return -EFAULT;
-
- while (1) {
- int need_write = 0;
-
- pagefault_disable();
- ret = get_user_bd_entry(mm, &bd_entry, bd_entry_ptr);
- pagefault_enable();
- if (!ret)
- break;
- if (ret == -EFAULT)
- ret = mpx_resolve_fault(bd_entry_ptr, need_write);
- /*
- * If we could not resolve the fault, consider it
- * userspace's fault and error out.
- */
- if (ret)
- return ret;
- }
-
- valid_bit = bd_entry & MPX_BD_ENTRY_VALID_FLAG;
- bt_addr = mpx_bd_entry_to_bt_addr(mm, bd_entry);
-
- /*
- * When the kernel is managing bounds tables, a bounds directory
- * entry will either have a valid address (plus the valid bit)
- * *OR* be completely empty. If we see a !valid entry *and* some
- * data in the address field, we know something is wrong. This
- * -EINVAL return will cause a SIGSEGV.
- */
- if (!valid_bit && bt_addr)
- return -EINVAL;
- /*
- * Do we have an completely zeroed bt entry? That is OK. It
- * just means there was no bounds table for this memory. Make
- * sure to distinguish this from -EINVAL, which will cause
- * a SEGV.
- */
- if (!valid_bit)
- return -ENOENT;
-
- *bt_addr_result = bt_addr;
- return 0;
-}
-
-static inline int bt_entry_size_bytes(struct mm_struct *mm)
-{
- if (is_64bit_mm(mm))
- return MPX_BT_ENTRY_BYTES_64;
- else
- return MPX_BT_ENTRY_BYTES_32;
-}
-
-/*
- * Take a virtual address and turns it in to the offset in bytes
- * inside of the bounds table where the bounds table entry
- * controlling 'addr' can be found.
- */
-static unsigned long mpx_get_bt_entry_offset_bytes(struct mm_struct *mm,
- unsigned long addr)
-{
- unsigned long bt_table_nr_entries;
- unsigned long offset = addr;
-
- if (is_64bit_mm(mm)) {
- /* Bottom 3 bits are ignored on 64-bit */
- offset >>= 3;
- bt_table_nr_entries = MPX_BT_NR_ENTRIES_64;
- } else {
- /* Bottom 2 bits are ignored on 32-bit */
- offset >>= 2;
- bt_table_nr_entries = MPX_BT_NR_ENTRIES_32;
- }
- /*
- * We know the size of the table in to which we are
- * indexing, and we have eliminated all the low bits
- * which are ignored for indexing.
- *
- * Mask out all the high bits which we do not need
- * to index in to the table. Note that the tables
- * are always powers of two so this gives us a proper
- * mask.
- */
- offset &= (bt_table_nr_entries-1);
- /*
- * We now have an entry offset in terms of *entries* in
- * the table. We need to scale it back up to bytes.
- */
- offset *= bt_entry_size_bytes(mm);
- return offset;
-}
-
-/*
- * How much virtual address space does a single bounds
- * directory entry cover?
- *
- * Note, we need a long long because 4GB doesn't fit in
- * to a long on 32-bit.
- */
-static inline unsigned long bd_entry_virt_space(struct mm_struct *mm)
-{
- unsigned long long virt_space;
- unsigned long long GB = (1ULL << 30);
-
- /*
- * This covers 32-bit emulation as well as 32-bit kernels
- * running on 64-bit hardware.
- */
- if (!is_64bit_mm(mm))
- return (4ULL * GB) / MPX_BD_NR_ENTRIES_32;
-
- /*
- * 'x86_virt_bits' returns what the hardware is capable
- * of, and returns the full >32-bit address space when
- * running 32-bit kernels on 64-bit hardware.
- */
- virt_space = (1ULL << boot_cpu_data.x86_virt_bits);
- return virt_space / MPX_BD_NR_ENTRIES_64;
-}
-
-/*
- * Free the backing physical pages of bounds table 'bt_addr'.
- * Assume start...end is within that bounds table.
- */
-static noinline int zap_bt_entries_mapping(struct mm_struct *mm,
- unsigned long bt_addr,
- unsigned long start_mapping, unsigned long end_mapping)
-{
- struct vm_area_struct *vma;
- unsigned long addr, len;
- unsigned long start;
- unsigned long end;
-
- /*
- * if we 'end' on a boundary, the offset will be 0 which
- * is not what we want. Back it up a byte to get the
- * last bt entry. Then once we have the entry itself,
- * move 'end' back up by the table entry size.
- */
- start = bt_addr + mpx_get_bt_entry_offset_bytes(mm, start_mapping);
- end = bt_addr + mpx_get_bt_entry_offset_bytes(mm, end_mapping - 1);
- /*
- * Move end back up by one entry. Among other things
- * this ensures that it remains page-aligned and does
- * not screw up zap_page_range()
- */
- end += bt_entry_size_bytes(mm);
-
- /*
- * Find the first overlapping vma. If vma->vm_start > start, there
- * will be a hole in the bounds table. This -EINVAL return will
- * cause a SIGSEGV.
- */
- vma = find_vma(mm, start);
- if (!vma || vma->vm_start > start)
- return -EINVAL;
-
- /*
- * A NUMA policy on a VM_MPX VMA could cause this bounds table to
- * be split. So we need to look across the entire 'start -> end'
- * range of this bounds table, find all of the VM_MPX VMAs, and
- * zap only those.
- */
- addr = start;
- while (vma && vma->vm_start < end) {
- /*
- * We followed a bounds directory entry down
- * here. If we find a non-MPX VMA, that's bad,
- * so stop immediately and return an error. This
- * probably results in a SIGSEGV.
- */
- if (!(vma->vm_flags & VM_MPX))
- return -EINVAL;
-
- len = min(vma->vm_end, end) - addr;
- zap_page_range(vma, addr, len);
- trace_mpx_unmap_zap(addr, addr+len);
-
- vma = vma->vm_next;
- addr = vma->vm_start;
- }
- return 0;
-}
-
-static unsigned long mpx_get_bd_entry_offset(struct mm_struct *mm,
- unsigned long addr)
-{
- /*
- * There are several ways to derive the bd offsets. We
- * use the following approach here:
- * 1. We know the size of the virtual address space
- * 2. We know the number of entries in a bounds table
- * 3. We know that each entry covers a fixed amount of
- * virtual address space.
- * So, we can just divide the virtual address by the
- * virtual space used by one entry to determine which
- * entry "controls" the given virtual address.
- */
- if (is_64bit_mm(mm)) {
- int bd_entry_size = 8; /* 64-bit pointer */
- /*
- * Take the 64-bit addressing hole in to account.
- */
- addr &= ((1UL << boot_cpu_data.x86_virt_bits) - 1);
- return (addr / bd_entry_virt_space(mm)) * bd_entry_size;
- } else {
- int bd_entry_size = 4; /* 32-bit pointer */
- /*
- * 32-bit has no hole so this case needs no mask
- */
- return (addr / bd_entry_virt_space(mm)) * bd_entry_size;
- }
- /*
- * The two return calls above are exact copies. If we
- * pull out a single copy and put it in here, gcc won't
- * realize that we're doing a power-of-2 divide and use
- * shifts. It uses a real divide. If we put them up
- * there, it manages to figure it out (gcc 4.8.3).
- */
-}
-
-static int unmap_entire_bt(struct mm_struct *mm,
- long __user *bd_entry, unsigned long bt_addr)
-{
- unsigned long expected_old_val = bt_addr | MPX_BD_ENTRY_VALID_FLAG;
- unsigned long uninitialized_var(actual_old_val);
- int ret;
-
- while (1) {
- int need_write = 1;
- unsigned long cleared_bd_entry = 0;
-
- pagefault_disable();
- ret = mpx_cmpxchg_bd_entry(mm, &actual_old_val,
- bd_entry, expected_old_val, cleared_bd_entry);
- pagefault_enable();
- if (!ret)
- break;
- if (ret == -EFAULT)
- ret = mpx_resolve_fault(bd_entry, need_write);
- /*
- * If we could not resolve the fault, consider it
- * userspace's fault and error out.
- */
- if (ret)
- return ret;
- }
- /*
- * The cmpxchg was performed, check the results.
- */
- if (actual_old_val != expected_old_val) {
- /*
- * Someone else raced with us to unmap the table.
- * That is OK, since we were both trying to do
- * the same thing. Declare success.
- */
- if (!actual_old_val)
- return 0;
- /*
- * Something messed with the bounds directory
- * entry. We hold mmap_sem for read or write
- * here, so it could not be a _new_ bounds table
- * that someone just allocated. Something is
- * wrong, so pass up the error and SIGSEGV.
- */
- return -EINVAL;
- }
- /*
- * Note, we are likely being called under do_munmap() already. To
- * avoid recursion, do_munmap() will check whether it comes
- * from one bounds table through VM_MPX flag.
- */
- return do_munmap(mm, bt_addr, mpx_bt_size_bytes(mm), NULL);
-}
-
-static int try_unmap_single_bt(struct mm_struct *mm,
- unsigned long start, unsigned long end)
-{
- struct vm_area_struct *next;
- struct vm_area_struct *prev;
- /*
- * "bta" == Bounds Table Area: the area controlled by the
- * bounds table that we are unmapping.
- */
- unsigned long bta_start_vaddr = start & ~(bd_entry_virt_space(mm)-1);
- unsigned long bta_end_vaddr = bta_start_vaddr + bd_entry_virt_space(mm);
- unsigned long uninitialized_var(bt_addr);
- void __user *bde_vaddr;
- int ret;
- /*
- * We already unlinked the VMAs from the mm's rbtree so 'start'
- * is guaranteed to be in a hole. This gets us the first VMA
- * before the hole in to 'prev' and the next VMA after the hole
- * in to 'next'.
- */
- next = find_vma_prev(mm, start, &prev);
- /*
- * Do not count other MPX bounds table VMAs as neighbors.
- * Although theoretically possible, we do not allow bounds
- * tables for bounds tables so our heads do not explode.
- * If we count them as neighbors here, we may end up with
- * lots of tables even though we have no actual table
- * entries in use.
- */
- while (next && (next->vm_flags & VM_MPX))
- next = next->vm_next;
- while (prev && (prev->vm_flags & VM_MPX))
- prev = prev->vm_prev;
- /*
- * We know 'start' and 'end' lie within an area controlled
- * by a single bounds table. See if there are any other
- * VMAs controlled by that bounds table. If there are not
- * then we can "expand" the are we are unmapping to possibly
- * cover the entire table.
- */
- next = find_vma_prev(mm, start, &prev);
- if ((!prev || prev->vm_end <= bta_start_vaddr) &&
- (!next || next->vm_start >= bta_end_vaddr)) {
- /*
- * No neighbor VMAs controlled by same bounds
- * table. Try to unmap the whole thing
- */
- start = bta_start_vaddr;
- end = bta_end_vaddr;
- }
-
- bde_vaddr = mm->context.bd_addr + mpx_get_bd_entry_offset(mm, start);
- ret = get_bt_addr(mm, bde_vaddr, &bt_addr);
- /*
- * No bounds table there, so nothing to unmap.
- */
- if (ret == -ENOENT) {
- ret = 0;
- return 0;
- }
- if (ret)
- return ret;
- /*
- * We are unmapping an entire table. Either because the
- * unmap that started this whole process was large enough
- * to cover an entire table, or that the unmap was small
- * but was the area covered by a bounds table.
- */
- if ((start == bta_start_vaddr) &&
- (end == bta_end_vaddr))
- return unmap_entire_bt(mm, bde_vaddr, bt_addr);
- return zap_bt_entries_mapping(mm, bt_addr, start, end);
-}
-
-static int mpx_unmap_tables(struct mm_struct *mm,
- unsigned long start, unsigned long end)
-{
- unsigned long one_unmap_start;
- trace_mpx_unmap_search(start, end);
-
- one_unmap_start = start;
- while (one_unmap_start < end) {
- int ret;
- unsigned long next_unmap_start = ALIGN(one_unmap_start+1,
- bd_entry_virt_space(mm));
- unsigned long one_unmap_end = end;
- /*
- * if the end is beyond the current bounds table,
- * move it back so we only deal with a single one
- * at a time
- */
- if (one_unmap_end > next_unmap_start)
- one_unmap_end = next_unmap_start;
- ret = try_unmap_single_bt(mm, one_unmap_start, one_unmap_end);
- if (ret)
- return ret;
-
- one_unmap_start = next_unmap_start;
- }
- return 0;
-}
-
-/*
- * Free unused bounds tables covered in a virtual address region being
- * munmap()ed. Assume end > start.
- *
- * This function will be called by do_munmap(), and the VMAs covering
- * the virtual address region start...end have already been split if
- * necessary, and the 'vma' is the first vma in this range (start -> end).
- */
-void mpx_notify_unmap(struct mm_struct *mm, unsigned long start,
- unsigned long end)
-{
- struct vm_area_struct *vma;
- int ret;
-
- /*
- * Refuse to do anything unless userspace has asked
- * the kernel to help manage the bounds tables,
- */
- if (!kernel_managing_mpx_tables(current->mm))
- return;
- /*
- * This will look across the entire 'start -> end' range,
- * and find all of the non-VM_MPX VMAs.
- *
- * To avoid recursion, if a VM_MPX vma is found in the range
- * (start->end), we will not continue follow-up work. This
- * recursion represents having bounds tables for bounds tables,
- * which should not occur normally. Being strict about it here
- * helps ensure that we do not have an exploitable stack overflow.
- */
- vma = find_vma(mm, start);
- while (vma && vma->vm_start < end) {
- if (vma->vm_flags & VM_MPX)
- return;
- vma = vma->vm_next;
- }
-
- ret = mpx_unmap_tables(mm, start, end);
- if (ret)
- force_sig(SIGSEGV);
-}
-
-/* MPX cannot handle addresses above 47 bits yet. */
-unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len,
- unsigned long flags)
-{
- if (!kernel_managing_mpx_tables(current->mm))
- return addr;
- if (addr + len <= DEFAULT_MAP_WINDOW)
- return addr;
- if (flags & MAP_FIXED)
- return -ENOMEM;
-
- /*
- * Requested len is larger than the whole area we're allowed to map in.
- * Resetting hinting address wouldn't do much good -- fail early.
- */
- if (len > DEFAULT_MAP_WINDOW)
- return -ENOMEM;
-
- /* Look for unmap area within DEFAULT_MAP_WINDOW */
- return 0;
-}
diff --git a/arch/x86/mm/pat/Makefile b/arch/x86/mm/pat/Makefile
new file mode 100644
index 000000000000..ea464c995161
--- /dev/null
+++ b/arch/x86/mm/pat/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-y := set_memory.o memtype.o
+
+obj-$(CONFIG_X86_PAT) += memtype_interval.o
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pat/cpa-test.c
index facce271e8b9..facce271e8b9 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pat/cpa-test.c
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat/memtype.c
index 2d758e19ef22..394be8611748 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -1,11 +1,34 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * Handle caching attributes in page tables (PAT)
+ * Page Attribute Table (PAT) support: handle memory caching attributes in page tables.
*
* Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
* Suresh B Siddha <suresh.b.siddha@intel.com>
*
* Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
+ *
+ * Basic principles:
+ *
+ * PAT is a CPU feature supported by all modern x86 CPUs, to allow the firmware and
+ * the kernel to set one of a handful of 'caching type' attributes for physical
+ * memory ranges: uncached, write-combining, write-through, write-protected,
+ * and the most commonly used and default attribute: write-back caching.
+ *
+ * PAT support supercedes and augments MTRR support in a compatible fashion: MTRR is
+ * a hardware interface to enumerate a limited number of physical memory ranges
+ * and set their caching attributes explicitly, programmed into the CPU via MSRs.
+ * Even modern CPUs have MTRRs enabled - but these are typically not touched
+ * by the kernel or by user-space (such as the X server), we rely on PAT for any
+ * additional cache attribute logic.
+ *
+ * PAT doesn't work via explicit memory ranges, but uses page table entries to add
+ * cache attribute information to the mapped memory range: there's 3 bits used,
+ * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT), with the 8 possible values mapped by the
+ * CPU to actual cache attributes via an MSR loaded into the CPU (MSR_IA32_CR_PAT).
+ *
+ * ( There's a metric ton of finer details, such as compatibility with CPU quirks
+ * that only support 4 types of PAT entries, and interaction with MTRRs, see
+ * below for details. )
*/
#include <linux/seq_file.h>
@@ -29,44 +52,48 @@
#include <asm/mtrr.h>
#include <asm/page.h>
#include <asm/msr.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include <asm/io.h>
-#include "pat_internal.h"
-#include "mm_internal.h"
+#include "memtype.h"
+#include "../mm_internal.h"
#undef pr_fmt
#define pr_fmt(fmt) "" fmt
-static bool __read_mostly boot_cpu_done;
+static bool __read_mostly pat_bp_initialized;
static bool __read_mostly pat_disabled = !IS_ENABLED(CONFIG_X86_PAT);
-static bool __read_mostly pat_initialized;
-static bool __read_mostly init_cm_done;
+static bool __read_mostly pat_bp_enabled;
+static bool __read_mostly pat_cm_initialized;
-void pat_disable(const char *reason)
+/*
+ * PAT support is enabled by default, but can be disabled for
+ * various user-requested or hardware-forced reasons:
+ */
+void pat_disable(const char *msg_reason)
{
if (pat_disabled)
return;
- if (boot_cpu_done) {
+ if (pat_bp_initialized) {
WARN_ONCE(1, "x86/PAT: PAT cannot be disabled after initialization\n");
return;
}
pat_disabled = true;
- pr_info("x86/PAT: %s\n", reason);
+ pr_info("x86/PAT: %s\n", msg_reason);
}
static int __init nopat(char *str)
{
- pat_disable("PAT support disabled.");
+ pat_disable("PAT support disabled via boot option.");
return 0;
}
early_param("nopat", nopat);
bool pat_enabled(void)
{
- return pat_initialized;
+ return pat_bp_enabled;
}
EXPORT_SYMBOL_GPL(pat_enabled);
@@ -197,6 +224,8 @@ static void __init_cache_modes(u64 pat)
char pat_msg[33];
int i;
+ WARN_ON_ONCE(pat_cm_initialized);
+
pat_msg[32] = 0;
for (i = 7; i >= 0; i--) {
cache = pat_get_cache_mode((pat >> (i * 8)) & 7,
@@ -205,28 +234,28 @@ static void __init_cache_modes(u64 pat)
}
pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg);
- init_cm_done = true;
+ pat_cm_initialized = true;
}
#define PAT(x, y) ((u64)PAT_ ## y << ((x)*8))
-static void pat_bsp_init(u64 pat)
+static void pat_bp_init(u64 pat)
{
u64 tmp_pat;
if (!boot_cpu_has(X86_FEATURE_PAT)) {
- pat_disable("PAT not supported by CPU.");
+ pat_disable("PAT not supported by the CPU.");
return;
}
rdmsrl(MSR_IA32_CR_PAT, tmp_pat);
if (!tmp_pat) {
- pat_disable("PAT MSR is 0, disabled.");
+ pat_disable("PAT support disabled by the firmware.");
return;
}
wrmsrl(MSR_IA32_CR_PAT, pat);
- pat_initialized = true;
+ pat_bp_enabled = true;
__init_cache_modes(pat);
}
@@ -248,7 +277,7 @@ void init_cache_modes(void)
{
u64 pat = 0;
- if (init_cm_done)
+ if (pat_cm_initialized)
return;
if (boot_cpu_has(X86_FEATURE_PAT)) {
@@ -291,7 +320,7 @@ void init_cache_modes(void)
}
/**
- * pat_init - Initialize PAT MSR and PAT table
+ * pat_init - Initialize the PAT MSR and PAT table on the current CPU
*
* This function initializes PAT MSR and PAT table with an OS-defined value
* to enable additional cache attributes, WC, WT and WP.
@@ -305,6 +334,10 @@ void pat_init(void)
u64 pat;
struct cpuinfo_x86 *c = &boot_cpu_data;
+#ifndef CONFIG_X86_PAT
+ pr_info_once("x86/PAT: PAT support disabled because CONFIG_X86_PAT is disabled in the kernel.\n");
+#endif
+
if (pat_disabled)
return;
@@ -364,9 +397,9 @@ void pat_init(void)
PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT);
}
- if (!boot_cpu_done) {
- pat_bsp_init(pat);
- boot_cpu_done = true;
+ if (!pat_bp_initialized) {
+ pat_bp_init(pat);
+ pat_bp_initialized = true;
} else {
pat_ap_init(pat);
}
@@ -542,10 +575,10 @@ static u64 sanitize_phys(u64 address)
* available type in new_type in case of no error. In case of any error
* it will return a negative return value.
*/
-int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type,
+int memtype_reserve(u64 start, u64 end, enum page_cache_mode req_type,
enum page_cache_mode *new_type)
{
- struct memtype *new;
+ struct memtype *entry_new;
enum page_cache_mode actual_type;
int is_range_ram;
int err = 0;
@@ -593,22 +626,22 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type,
return -EINVAL;
}
- new = kzalloc(sizeof(struct memtype), GFP_KERNEL);
- if (!new)
+ entry_new = kzalloc(sizeof(struct memtype), GFP_KERNEL);
+ if (!entry_new)
return -ENOMEM;
- new->start = start;
- new->end = end;
- new->type = actual_type;
+ entry_new->start = start;
+ entry_new->end = end;
+ entry_new->type = actual_type;
spin_lock(&memtype_lock);
- err = memtype_check_insert(new, new_type);
+ err = memtype_check_insert(entry_new, new_type);
if (err) {
- pr_info("x86/PAT: reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
+ pr_info("x86/PAT: memtype_reserve failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
start, end - 1,
- cattr_name(new->type), cattr_name(req_type));
- kfree(new);
+ cattr_name(entry_new->type), cattr_name(req_type));
+ kfree(entry_new);
spin_unlock(&memtype_lock);
return err;
@@ -616,18 +649,17 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type,
spin_unlock(&memtype_lock);
- dprintk("reserve_memtype added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n",
- start, end - 1, cattr_name(new->type), cattr_name(req_type),
+ dprintk("memtype_reserve added [mem %#010Lx-%#010Lx], track %s, req %s, ret %s\n",
+ start, end - 1, cattr_name(entry_new->type), cattr_name(req_type),
new_type ? cattr_name(*new_type) : "-");
return err;
}
-int free_memtype(u64 start, u64 end)
+int memtype_free(u64 start, u64 end)
{
- int err = -EINVAL;
int is_range_ram;
- struct memtype *entry;
+ struct memtype *entry_old;
if (!pat_enabled())
return 0;
@@ -640,28 +672,24 @@ int free_memtype(u64 start, u64 end)
return 0;
is_range_ram = pat_pagerange_is_ram(start, end);
- if (is_range_ram == 1) {
-
- err = free_ram_pages_type(start, end);
-
- return err;
- } else if (is_range_ram < 0) {
+ if (is_range_ram == 1)
+ return free_ram_pages_type(start, end);
+ if (is_range_ram < 0)
return -EINVAL;
- }
spin_lock(&memtype_lock);
- entry = memtype_erase(start, end);
+ entry_old = memtype_erase(start, end);
spin_unlock(&memtype_lock);
- if (IS_ERR(entry)) {
+ if (IS_ERR(entry_old)) {
pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
current->comm, current->pid, start, end - 1);
return -EINVAL;
}
- kfree(entry);
+ kfree(entry_old);
- dprintk("free_memtype request [mem %#010Lx-%#010Lx]\n", start, end - 1);
+ dprintk("memtype_free request [mem %#010Lx-%#010Lx]\n", start, end - 1);
return 0;
}
@@ -700,6 +728,7 @@ static enum page_cache_mode lookup_memtype(u64 paddr)
rettype = _PAGE_CACHE_MODE_UC_MINUS;
spin_unlock(&memtype_lock);
+
return rettype;
}
@@ -723,7 +752,7 @@ bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn)
EXPORT_SYMBOL_GPL(pat_pfn_immune_to_uc_mtrr);
/**
- * io_reserve_memtype - Request a memory type mapping for a region of memory
+ * memtype_reserve_io - Request a memory type mapping for a region of memory
* @start: start (physical address) of the region
* @end: end (physical address) of the region
* @type: A pointer to memtype, with requested type. On success, requested
@@ -732,7 +761,7 @@ EXPORT_SYMBOL_GPL(pat_pfn_immune_to_uc_mtrr);
* On success, returns 0
* On failure, returns non-zero
*/
-int io_reserve_memtype(resource_size_t start, resource_size_t end,
+int memtype_reserve_io(resource_size_t start, resource_size_t end,
enum page_cache_mode *type)
{
resource_size_t size = end - start;
@@ -742,47 +771,47 @@ int io_reserve_memtype(resource_size_t start, resource_size_t end,
WARN_ON_ONCE(iomem_map_sanity_check(start, size));
- ret = reserve_memtype(start, end, req_type, &new_type);
+ ret = memtype_reserve(start, end, req_type, &new_type);
if (ret)
goto out_err;
if (!is_new_memtype_allowed(start, size, req_type, new_type))
goto out_free;
- if (kernel_map_sync_memtype(start, size, new_type) < 0)
+ if (memtype_kernel_map_sync(start, size, new_type) < 0)
goto out_free;
*type = new_type;
return 0;
out_free:
- free_memtype(start, end);
+ memtype_free(start, end);
ret = -EBUSY;
out_err:
return ret;
}
/**
- * io_free_memtype - Release a memory type mapping for a region of memory
+ * memtype_free_io - Release a memory type mapping for a region of memory
* @start: start (physical address) of the region
* @end: end (physical address) of the region
*/
-void io_free_memtype(resource_size_t start, resource_size_t end)
+void memtype_free_io(resource_size_t start, resource_size_t end)
{
- free_memtype(start, end);
+ memtype_free(start, end);
}
int arch_io_reserve_memtype_wc(resource_size_t start, resource_size_t size)
{
enum page_cache_mode type = _PAGE_CACHE_MODE_WC;
- return io_reserve_memtype(start, start + size, &type);
+ return memtype_reserve_io(start, start + size, &type);
}
EXPORT_SYMBOL(arch_io_reserve_memtype_wc);
void arch_io_free_memtype_wc(resource_size_t start, resource_size_t size)
{
- io_free_memtype(start, start + size);
+ memtype_free_io(start, start + size);
}
EXPORT_SYMBOL(arch_io_free_memtype_wc);
@@ -839,10 +868,10 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
}
/*
- * Change the memory type for the physial address range in kernel identity
+ * Change the memory type for the physical address range in kernel identity
* mapping space if that range is a part of identity map.
*/
-int kernel_map_sync_memtype(u64 base, unsigned long size,
+int memtype_kernel_map_sync(u64 base, unsigned long size,
enum page_cache_mode pcm)
{
unsigned long id_sz;
@@ -851,15 +880,14 @@ int kernel_map_sync_memtype(u64 base, unsigned long size,
return 0;
/*
- * some areas in the middle of the kernel identity range
- * are not mapped, like the PCI space.
+ * Some areas in the middle of the kernel identity range
+ * are not mapped, for example the PCI space.
*/
if (!page_is_ram(base >> PAGE_SHIFT))
return 0;
id_sz = (__pa(high_memory-1) <= base + size) ?
- __pa(high_memory) - base :
- size;
+ __pa(high_memory) - base : size;
if (ioremap_change_attr((unsigned long)__va(base), id_sz, pcm) < 0) {
pr_info("x86/PAT: %s:%d ioremap_change_attr failed %s for [mem %#010Lx-%#010Lx]\n",
@@ -873,7 +901,7 @@ int kernel_map_sync_memtype(u64 base, unsigned long size,
/*
* Internal interface to reserve a range of physical memory with prot.
- * Reserved non RAM regions only and after successful reserve_memtype,
+ * Reserved non RAM regions only and after successful memtype_reserve,
* this func also keeps identity mapping (if any) in sync with this new prot.
*/
static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
@@ -910,14 +938,14 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
return 0;
}
- ret = reserve_memtype(paddr, paddr + size, want_pcm, &pcm);
+ ret = memtype_reserve(paddr, paddr + size, want_pcm, &pcm);
if (ret)
return ret;
if (pcm != want_pcm) {
if (strict_prot ||
!is_new_memtype_allowed(paddr, size, want_pcm, pcm)) {
- free_memtype(paddr, paddr + size);
+ memtype_free(paddr, paddr + size);
pr_err("x86/PAT: %s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n",
current->comm, current->pid,
cattr_name(want_pcm),
@@ -935,8 +963,8 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
cachemode2protval(pcm));
}
- if (kernel_map_sync_memtype(paddr, size, pcm) < 0) {
- free_memtype(paddr, paddr + size);
+ if (memtype_kernel_map_sync(paddr, size, pcm) < 0) {
+ memtype_free(paddr, paddr + size);
return -EINVAL;
}
return 0;
@@ -952,7 +980,7 @@ static void free_pfn_range(u64 paddr, unsigned long size)
is_ram = pat_pagerange_is_ram(paddr, paddr + size);
if (is_ram == 0)
- free_memtype(paddr, paddr + size);
+ memtype_free(paddr, paddr + size);
}
/*
@@ -1099,25 +1127,30 @@ EXPORT_SYMBOL_GPL(pgprot_writethrough);
#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
+/*
+ * We are allocating a temporary printout-entry to be passed
+ * between seq_start()/next() and seq_show():
+ */
static struct memtype *memtype_get_idx(loff_t pos)
{
- struct memtype *print_entry;
+ struct memtype *entry_print;
int ret;
- print_entry = kzalloc(sizeof(struct memtype), GFP_KERNEL);
- if (!print_entry)
+ entry_print = kzalloc(sizeof(struct memtype), GFP_KERNEL);
+ if (!entry_print)
return NULL;
spin_lock(&memtype_lock);
- ret = memtype_copy_nth_element(print_entry, pos);
+ ret = memtype_copy_nth_element(entry_print, pos);
spin_unlock(&memtype_lock);
- if (!ret) {
- return print_entry;
- } else {
- kfree(print_entry);
+ /* Free it on error: */
+ if (ret) {
+ kfree(entry_print);
return NULL;
}
+
+ return entry_print;
}
static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
@@ -1142,11 +1175,14 @@ static void memtype_seq_stop(struct seq_file *seq, void *v)
static int memtype_seq_show(struct seq_file *seq, void *v)
{
- struct memtype *print_entry = (struct memtype *)v;
+ struct memtype *entry_print = (struct memtype *)v;
+
+ seq_printf(seq, "PAT: [mem 0x%016Lx-0x%016Lx] %s\n",
+ entry_print->start,
+ entry_print->end,
+ cattr_name(entry_print->type));
- seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
- print_entry->start, print_entry->end);
- kfree(print_entry);
+ kfree(entry_print);
return 0;
}
@@ -1178,7 +1214,6 @@ static int __init pat_memtype_list_init(void)
}
return 0;
}
-
late_initcall(pat_memtype_list_init);
#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */
diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat/memtype.h
index 79a06684349e..cacecdbceb55 100644
--- a/arch/x86/mm/pat_internal.h
+++ b/arch/x86/mm/pat/memtype.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __PAT_INTERNAL_H_
-#define __PAT_INTERNAL_H_
+#ifndef __MEMTYPE_H_
+#define __MEMTYPE_H_
extern int pat_debug_enable;
@@ -29,13 +29,13 @@ static inline char *cattr_name(enum page_cache_mode pcm)
}
#ifdef CONFIG_X86_PAT
-extern int memtype_check_insert(struct memtype *new,
+extern int memtype_check_insert(struct memtype *entry_new,
enum page_cache_mode *new_type);
extern struct memtype *memtype_erase(u64 start, u64 end);
extern struct memtype *memtype_lookup(u64 addr);
-extern int memtype_copy_nth_element(struct memtype *out, loff_t pos);
+extern int memtype_copy_nth_element(struct memtype *entry_out, loff_t pos);
#else
-static inline int memtype_check_insert(struct memtype *new,
+static inline int memtype_check_insert(struct memtype *entry_new,
enum page_cache_mode *new_type)
{ return 0; }
static inline struct memtype *memtype_erase(u64 start, u64 end)
@@ -46,4 +46,4 @@ static inline int memtype_copy_nth_element(struct memtype *out, loff_t pos)
{ return 0; }
#endif
-#endif /* __PAT_INTERNAL_H_ */
+#endif /* __MEMTYPE_H_ */
diff --git a/arch/x86/mm/pat/memtype_interval.c b/arch/x86/mm/pat/memtype_interval.c
new file mode 100644
index 000000000000..a07e4882bf36
--- /dev/null
+++ b/arch/x86/mm/pat/memtype_interval.c
@@ -0,0 +1,194 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Handle caching attributes in page tables (PAT)
+ *
+ * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ * Suresh B Siddha <suresh.b.siddha@intel.com>
+ *
+ * Interval tree used to store the PAT memory type reservations.
+ */
+
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/kernel.h>
+#include <linux/interval_tree_generic.h>
+#include <linux/sched.h>
+#include <linux/gfp.h>
+
+#include <asm/pgtable.h>
+#include <asm/memtype.h>
+
+#include "memtype.h"
+
+/*
+ * The memtype tree keeps track of memory type for specific
+ * physical memory areas. Without proper tracking, conflicting memory
+ * types in different mappings can cause CPU cache corruption.
+ *
+ * The tree is an interval tree (augmented rbtree) which tree is ordered
+ * by the starting address. The tree can contain multiple entries for
+ * different regions which overlap. All the aliases have the same
+ * cache attributes of course, as enforced by the PAT logic.
+ *
+ * memtype_lock protects the rbtree.
+ */
+
+static inline u64 interval_start(struct memtype *entry)
+{
+ return entry->start;
+}
+
+static inline u64 interval_end(struct memtype *entry)
+{
+ return entry->end - 1;
+}
+
+INTERVAL_TREE_DEFINE(struct memtype, rb, u64, subtree_max_end,
+ interval_start, interval_end,
+ static, interval)
+
+static struct rb_root_cached memtype_rbroot = RB_ROOT_CACHED;
+
+enum {
+ MEMTYPE_EXACT_MATCH = 0,
+ MEMTYPE_END_MATCH = 1
+};
+
+static struct memtype *memtype_match(u64 start, u64 end, int match_type)
+{
+ struct memtype *entry_match;
+
+ entry_match = interval_iter_first(&memtype_rbroot, start, end-1);
+
+ while (entry_match != NULL && entry_match->start < end) {
+ if ((match_type == MEMTYPE_EXACT_MATCH) &&
+ (entry_match->start == start) && (entry_match->end == end))
+ return entry_match;
+
+ if ((match_type == MEMTYPE_END_MATCH) &&
+ (entry_match->start < start) && (entry_match->end == end))
+ return entry_match;
+
+ entry_match = interval_iter_next(entry_match, start, end-1);
+ }
+
+ return NULL; /* Returns NULL if there is no match */
+}
+
+static int memtype_check_conflict(u64 start, u64 end,
+ enum page_cache_mode reqtype,
+ enum page_cache_mode *newtype)
+{
+ struct memtype *entry_match;
+ enum page_cache_mode found_type = reqtype;
+
+ entry_match = interval_iter_first(&memtype_rbroot, start, end-1);
+ if (entry_match == NULL)
+ goto success;
+
+ if (entry_match->type != found_type && newtype == NULL)
+ goto failure;
+
+ dprintk("Overlap at 0x%Lx-0x%Lx\n", entry_match->start, entry_match->end);
+ found_type = entry_match->type;
+
+ entry_match = interval_iter_next(entry_match, start, end-1);
+ while (entry_match) {
+ if (entry_match->type != found_type)
+ goto failure;
+
+ entry_match = interval_iter_next(entry_match, start, end-1);
+ }
+success:
+ if (newtype)
+ *newtype = found_type;
+
+ return 0;
+
+failure:
+ pr_info("x86/PAT: %s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
+ current->comm, current->pid, start, end,
+ cattr_name(found_type), cattr_name(entry_match->type));
+
+ return -EBUSY;
+}
+
+int memtype_check_insert(struct memtype *entry_new, enum page_cache_mode *ret_type)
+{
+ int err = 0;
+
+ err = memtype_check_conflict(entry_new->start, entry_new->end, entry_new->type, ret_type);
+ if (err)
+ return err;
+
+ if (ret_type)
+ entry_new->type = *ret_type;
+
+ interval_insert(entry_new, &memtype_rbroot);
+ return 0;
+}
+
+struct memtype *memtype_erase(u64 start, u64 end)
+{
+ struct memtype *entry_old;
+
+ /*
+ * Since the memtype_rbroot tree allows overlapping ranges,
+ * memtype_erase() checks with EXACT_MATCH first, i.e. free
+ * a whole node for the munmap case. If no such entry is found,
+ * it then checks with END_MATCH, i.e. shrink the size of a node
+ * from the end for the mremap case.
+ */
+ entry_old = memtype_match(start, end, MEMTYPE_EXACT_MATCH);
+ if (!entry_old) {
+ entry_old = memtype_match(start, end, MEMTYPE_END_MATCH);
+ if (!entry_old)
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (entry_old->start == start) {
+ /* munmap: erase this node */
+ interval_remove(entry_old, &memtype_rbroot);
+ } else {
+ /* mremap: update the end value of this node */
+ interval_remove(entry_old, &memtype_rbroot);
+ entry_old->end = start;
+ interval_insert(entry_old, &memtype_rbroot);
+
+ return NULL;
+ }
+
+ return entry_old;
+}
+
+struct memtype *memtype_lookup(u64 addr)
+{
+ return interval_iter_first(&memtype_rbroot, addr, addr + PAGE_SIZE-1);
+}
+
+/*
+ * Debugging helper, copy the Nth entry of the tree into a
+ * a copy for printout. This allows us to print out the tree
+ * via debugfs, without holding the memtype_lock too long:
+ */
+#ifdef CONFIG_DEBUG_FS
+int memtype_copy_nth_element(struct memtype *entry_out, loff_t pos)
+{
+ struct memtype *entry_match;
+ int i = 1;
+
+ entry_match = interval_iter_first(&memtype_rbroot, 0, ULONG_MAX);
+
+ while (entry_match && pos != i) {
+ entry_match = interval_iter_next(entry_match, 0, ULONG_MAX);
+ i++;
+ }
+
+ if (entry_match) { /* pos == i */
+ *entry_out = *entry_match;
+ return 0;
+ } else {
+ return 1;
+ }
+}
+#endif
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pat/set_memory.c
index 1b99ad05b117..c4aedd00c1ba 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -24,10 +24,10 @@
#include <linux/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/proto.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include <asm/set_memory.h>
-#include "mm_internal.h"
+#include "../mm_internal.h"
/*
* The current flushing context - we pass it instead of 5 arguments:
@@ -331,7 +331,7 @@ static void cpa_flush_all(unsigned long cache)
on_each_cpu(__cpa_flush_all, (void *) cache, 1);
}
-void __cpa_flush_tlb(void *data)
+static void __cpa_flush_tlb(void *data)
{
struct cpa_data *cpa = data;
unsigned int i;
@@ -618,6 +618,17 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
}
EXPORT_SYMBOL_GPL(lookup_address);
+/*
+ * Lookup the page table entry for a virtual address in a given mm. Return a
+ * pointer to the entry and the level of the mapping.
+ */
+pte_t *lookup_address_in_mm(struct mm_struct *mm, unsigned long address,
+ unsigned int *level)
+{
+ return lookup_address_in_pgd(pgd_offset(mm, address), address, level);
+}
+EXPORT_SYMBOL_GPL(lookup_address_in_mm);
+
static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
unsigned int *level)
{
@@ -1801,7 +1812,7 @@ int set_memory_uc(unsigned long addr, int numpages)
/*
* for now UC MINUS. see comments in ioremap()
*/
- ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
+ ret = memtype_reserve(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
_PAGE_CACHE_MODE_UC_MINUS, NULL);
if (ret)
goto out_err;
@@ -1813,7 +1824,7 @@ int set_memory_uc(unsigned long addr, int numpages)
return 0;
out_free:
- free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+ memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
out_err:
return ret;
}
@@ -1839,14 +1850,14 @@ int set_memory_wc(unsigned long addr, int numpages)
{
int ret;
- ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
+ ret = memtype_reserve(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
_PAGE_CACHE_MODE_WC, NULL);
if (ret)
return ret;
ret = _set_memory_wc(addr, numpages);
if (ret)
- free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+ memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
return ret;
}
@@ -1873,7 +1884,7 @@ int set_memory_wb(unsigned long addr, int numpages)
if (ret)
return ret;
- free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+ memtype_free(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
return 0;
}
EXPORT_SYMBOL(set_memory_wb);
@@ -2014,7 +2025,7 @@ static int _set_pages_array(struct page **pages, int numpages,
continue;
start = page_to_pfn(pages[i]) << PAGE_SHIFT;
end = start + PAGE_SIZE;
- if (reserve_memtype(start, end, new_type, NULL))
+ if (memtype_reserve(start, end, new_type, NULL))
goto err_out;
}
@@ -2040,7 +2051,7 @@ err_out:
continue;
start = page_to_pfn(pages[i]) << PAGE_SHIFT;
end = start + PAGE_SIZE;
- free_memtype(start, end);
+ memtype_free(start, end);
}
return -EINVAL;
}
@@ -2089,7 +2100,7 @@ int set_pages_array_wb(struct page **pages, int numpages)
continue;
start = page_to_pfn(pages[i]) << PAGE_SHIFT;
end = start + PAGE_SIZE;
- free_memtype(start, end);
+ memtype_free(start, end);
}
return 0;
@@ -2215,7 +2226,7 @@ int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
.pgd = pgd,
.numpages = numpages,
.mask_set = __pgprot(0),
- .mask_clr = __pgprot(0),
+ .mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW)),
.flags = 0,
};
@@ -2224,12 +2235,6 @@ int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
if (!(__supported_pte_mask & _PAGE_NX))
goto out;
- if (!(page_flags & _PAGE_NX))
- cpa.mask_clr = __pgprot(_PAGE_NX);
-
- if (!(page_flags & _PAGE_RW))
- cpa.mask_clr = __pgprot(_PAGE_RW);
-
if (!(page_flags & _PAGE_ENC))
cpa.mask_clr = pgprot_encrypted(cpa.mask_clr);
@@ -2281,5 +2286,5 @@ int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address,
* be exposed to the rest of the kernel. Include these directly here.
*/
#ifdef CONFIG_CPA_DEBUG
-#include "pageattr-test.c"
+#include "cpa-test.c"
#endif
diff --git a/arch/x86/mm/pat_interval.c b/arch/x86/mm/pat_interval.c
deleted file mode 100644
index 6855362eaf21..000000000000
--- a/arch/x86/mm/pat_interval.c
+++ /dev/null
@@ -1,185 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Handle caching attributes in page tables (PAT)
- *
- * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
- * Suresh B Siddha <suresh.b.siddha@intel.com>
- *
- * Interval tree used to store the PAT memory type reservations.
- */
-
-#include <linux/seq_file.h>
-#include <linux/debugfs.h>
-#include <linux/kernel.h>
-#include <linux/interval_tree_generic.h>
-#include <linux/sched.h>
-#include <linux/gfp.h>
-
-#include <asm/pgtable.h>
-#include <asm/pat.h>
-
-#include "pat_internal.h"
-
-/*
- * The memtype tree keeps track of memory type for specific
- * physical memory areas. Without proper tracking, conflicting memory
- * types in different mappings can cause CPU cache corruption.
- *
- * The tree is an interval tree (augmented rbtree) with tree ordered
- * on starting address. Tree can contain multiple entries for
- * different regions which overlap. All the aliases have the same
- * cache attributes of course.
- *
- * memtype_lock protects the rbtree.
- */
-static inline u64 memtype_interval_start(struct memtype *memtype)
-{
- return memtype->start;
-}
-
-static inline u64 memtype_interval_end(struct memtype *memtype)
-{
- return memtype->end - 1;
-}
-INTERVAL_TREE_DEFINE(struct memtype, rb, u64, subtree_max_end,
- memtype_interval_start, memtype_interval_end,
- static, memtype_interval)
-
-static struct rb_root_cached memtype_rbroot = RB_ROOT_CACHED;
-
-enum {
- MEMTYPE_EXACT_MATCH = 0,
- MEMTYPE_END_MATCH = 1
-};
-
-static struct memtype *memtype_match(u64 start, u64 end, int match_type)
-{
- struct memtype *match;
-
- match = memtype_interval_iter_first(&memtype_rbroot, start, end-1);
- while (match != NULL && match->start < end) {
- if ((match_type == MEMTYPE_EXACT_MATCH) &&
- (match->start == start) && (match->end == end))
- return match;
-
- if ((match_type == MEMTYPE_END_MATCH) &&
- (match->start < start) && (match->end == end))
- return match;
-
- match = memtype_interval_iter_next(match, start, end-1);
- }
-
- return NULL; /* Returns NULL if there is no match */
-}
-
-static int memtype_check_conflict(u64 start, u64 end,
- enum page_cache_mode reqtype,
- enum page_cache_mode *newtype)
-{
- struct memtype *match;
- enum page_cache_mode found_type = reqtype;
-
- match = memtype_interval_iter_first(&memtype_rbroot, start, end-1);
- if (match == NULL)
- goto success;
-
- if (match->type != found_type && newtype == NULL)
- goto failure;
-
- dprintk("Overlap at 0x%Lx-0x%Lx\n", match->start, match->end);
- found_type = match->type;
-
- match = memtype_interval_iter_next(match, start, end-1);
- while (match) {
- if (match->type != found_type)
- goto failure;
-
- match = memtype_interval_iter_next(match, start, end-1);
- }
-success:
- if (newtype)
- *newtype = found_type;
-
- return 0;
-
-failure:
- pr_info("x86/PAT: %s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
- current->comm, current->pid, start, end,
- cattr_name(found_type), cattr_name(match->type));
- return -EBUSY;
-}
-
-int memtype_check_insert(struct memtype *new,
- enum page_cache_mode *ret_type)
-{
- int err = 0;
-
- err = memtype_check_conflict(new->start, new->end, new->type, ret_type);
- if (err)
- return err;
-
- if (ret_type)
- new->type = *ret_type;
-
- memtype_interval_insert(new, &memtype_rbroot);
- return 0;
-}
-
-struct memtype *memtype_erase(u64 start, u64 end)
-{
- struct memtype *data;
-
- /*
- * Since the memtype_rbroot tree allows overlapping ranges,
- * memtype_erase() checks with EXACT_MATCH first, i.e. free
- * a whole node for the munmap case. If no such entry is found,
- * it then checks with END_MATCH, i.e. shrink the size of a node
- * from the end for the mremap case.
- */
- data = memtype_match(start, end, MEMTYPE_EXACT_MATCH);
- if (!data) {
- data = memtype_match(start, end, MEMTYPE_END_MATCH);
- if (!data)
- return ERR_PTR(-EINVAL);
- }
-
- if (data->start == start) {
- /* munmap: erase this node */
- memtype_interval_remove(data, &memtype_rbroot);
- } else {
- /* mremap: update the end value of this node */
- memtype_interval_remove(data, &memtype_rbroot);
- data->end = start;
- memtype_interval_insert(data, &memtype_rbroot);
- return NULL;
- }
-
- return data;
-}
-
-struct memtype *memtype_lookup(u64 addr)
-{
- return memtype_interval_iter_first(&memtype_rbroot, addr,
- addr + PAGE_SIZE-1);
-}
-
-#if defined(CONFIG_DEBUG_FS)
-int memtype_copy_nth_element(struct memtype *out, loff_t pos)
-{
- struct memtype *match;
- int i = 1;
-
- match = memtype_interval_iter_first(&memtype_rbroot, 0, ULONG_MAX);
- while (match && pos != i) {
- match = memtype_interval_iter_next(match, 0, ULONG_MAX);
- i++;
- }
-
- if (match) { /* pos == i */
- *out = *match;
- return 0;
- } else {
- return 1;
- }
-}
-#endif
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 9bb7f0ab9fe6..0e6700eaa4f9 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -18,6 +18,7 @@
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/io.h>
+#include <linux/vmalloc.h>
unsigned int __VMALLOC_RESERVE = 128 << 20;
diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c
index bdc98150d4db..fc3f3d3e2ef2 100644
--- a/arch/x86/mm/physaddr.c
+++ b/arch/x86/mm/physaddr.c
@@ -5,6 +5,7 @@
#include <linux/mm.h>
#include <asm/page.h>
+#include <linux/vmalloc.h>
#include "physaddr.h"
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
index 92153d054d6c..bda73cb7a044 100644
--- a/arch/x86/mm/testmmiotrace.c
+++ b/arch/x86/mm/testmmiotrace.c
@@ -79,7 +79,7 @@ static void do_read_far_test(void __iomem *p)
static void do_test(unsigned long size)
{
- void __iomem *p = ioremap_nocache(mmio_address, size);
+ void __iomem *p = ioremap(mmio_address, size);
if (!p) {
pr_err("could not ioremap, aborting.\n");
return;
@@ -104,7 +104,7 @@ static void do_test_bulk_ioremapping(void)
int i;
for (i = 0; i < 10; ++i) {
- p = ioremap_nocache(mmio_address, PAGE_SIZE);
+ p = ioremap(mmio_address, PAGE_SIZE);
if (p)
iounmap(p);
}
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index e6a9edc5baaf..66f96f21a7b6 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -708,7 +708,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
(void *)info, 1);
else
on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote,
- (void *)info, 1, GFP_ATOMIC, cpumask);
+ (void *)info, 1, cpumask);
}
/*
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index b8be18427277..9ba08e9abc09 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -10,10 +10,12 @@
#include <linux/if_vlan.h>
#include <linux/bpf.h>
#include <linux/memory.h>
+#include <linux/sort.h>
#include <asm/extable.h>
#include <asm/set_memory.h>
#include <asm/nospec-branch.h>
#include <asm/text-patching.h>
+#include <asm/asm-prototypes.h>
static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
{
@@ -1326,7 +1328,7 @@ emit_jmp:
return proglen;
}
-static void save_regs(struct btf_func_model *m, u8 **prog, int nr_args,
+static void save_regs(const struct btf_func_model *m, u8 **prog, int nr_args,
int stack_size)
{
int i;
@@ -1342,7 +1344,7 @@ static void save_regs(struct btf_func_model *m, u8 **prog, int nr_args,
-(stack_size - i * 8));
}
-static void restore_regs(struct btf_func_model *m, u8 **prog, int nr_args,
+static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_args,
int stack_size)
{
int i;
@@ -1359,7 +1361,7 @@ static void restore_regs(struct btf_func_model *m, u8 **prog, int nr_args,
-(stack_size - i * 8));
}
-static int invoke_bpf(struct btf_func_model *m, u8 **pprog,
+static int invoke_bpf(const struct btf_func_model *m, u8 **pprog,
struct bpf_prog **progs, int prog_cnt, int stack_size)
{
u8 *prog = *pprog;
@@ -1454,7 +1456,8 @@ static int invoke_bpf(struct btf_func_model *m, u8 **pprog,
* add rsp, 8 // skip eth_type_trans's frame
* ret // return to its caller
*/
-int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags,
+int arch_prepare_bpf_trampoline(void *image, void *image_end,
+ const struct btf_func_model *m, u32 flags,
struct bpf_prog **fentry_progs, int fentry_cnt,
struct bpf_prog **fexit_progs, int fexit_cnt,
void *orig_call)
@@ -1521,15 +1524,160 @@ int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags
/* skip our return address and return to parent */
EMIT4(0x48, 0x83, 0xC4, 8); /* add rsp, 8 */
EMIT1(0xC3); /* ret */
- /* One half of the page has active running trampoline.
- * Another half is an area for next trampoline.
- * Make sure the trampoline generation logic doesn't overflow.
- */
- if (WARN_ON_ONCE(prog - (u8 *)image > PAGE_SIZE / 2 - BPF_INSN_SAFETY))
+ /* Make sure the trampoline generation logic doesn't overflow */
+ if (WARN_ON_ONCE(prog > (u8 *)image_end - BPF_INSN_SAFETY))
return -EFAULT;
+ return prog - (u8 *)image;
+}
+
+static int emit_cond_near_jump(u8 **pprog, void *func, void *ip, u8 jmp_cond)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ s64 offset;
+
+ offset = func - (ip + 2 + 4);
+ if (!is_simm32(offset)) {
+ pr_err("Target %p is out of range\n", func);
+ return -EINVAL;
+ }
+ EMIT2_off32(0x0F, jmp_cond + 0x10, offset);
+ *pprog = prog;
+ return 0;
+}
+
+static void emit_nops(u8 **pprog, unsigned int len)
+{
+ unsigned int i, noplen;
+ u8 *prog = *pprog;
+ int cnt = 0;
+
+ while (len > 0) {
+ noplen = len;
+
+ if (noplen > ASM_NOP_MAX)
+ noplen = ASM_NOP_MAX;
+
+ for (i = 0; i < noplen; i++)
+ EMIT1(ideal_nops[noplen][i]);
+ len -= noplen;
+ }
+
+ *pprog = prog;
+}
+
+static int emit_fallback_jump(u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int err = 0;
+
+#ifdef CONFIG_RETPOLINE
+ /* Note that this assumes the the compiler uses external
+ * thunks for indirect calls. Both clang and GCC use the same
+ * naming convention for external thunks.
+ */
+ err = emit_jump(&prog, __x86_indirect_thunk_rdx, prog);
+#else
+ int cnt = 0;
+
+ EMIT2(0xFF, 0xE2); /* jmp rdx */
+#endif
+ *pprog = prog;
+ return err;
+}
+
+static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs)
+{
+ u8 *jg_reloc, *jg_target, *prog = *pprog;
+ int pivot, err, jg_bytes = 1, cnt = 0;
+ s64 jg_offset;
+
+ if (a == b) {
+ /* Leaf node of recursion, i.e. not a range of indices
+ * anymore.
+ */
+ EMIT1(add_1mod(0x48, BPF_REG_3)); /* cmp rdx,func */
+ if (!is_simm32(progs[a]))
+ return -1;
+ EMIT2_off32(0x81, add_1reg(0xF8, BPF_REG_3),
+ progs[a]);
+ err = emit_cond_near_jump(&prog, /* je func */
+ (void *)progs[a], prog,
+ X86_JE);
+ if (err)
+ return err;
+
+ err = emit_fallback_jump(&prog); /* jmp thunk/indirect */
+ if (err)
+ return err;
+
+ *pprog = prog;
+ return 0;
+ }
+
+ /* Not a leaf node, so we pivot, and recursively descend into
+ * the lower and upper ranges.
+ */
+ pivot = (b - a) / 2;
+ EMIT1(add_1mod(0x48, BPF_REG_3)); /* cmp rdx,func */
+ if (!is_simm32(progs[a + pivot]))
+ return -1;
+ EMIT2_off32(0x81, add_1reg(0xF8, BPF_REG_3), progs[a + pivot]);
+
+ if (pivot > 2) { /* jg upper_part */
+ /* Require near jump. */
+ jg_bytes = 4;
+ EMIT2_off32(0x0F, X86_JG + 0x10, 0);
+ } else {
+ EMIT2(X86_JG, 0);
+ }
+ jg_reloc = prog;
+
+ err = emit_bpf_dispatcher(&prog, a, a + pivot, /* emit lower_part */
+ progs);
+ if (err)
+ return err;
+
+ /* From Intel 64 and IA-32 Architectures Optimization
+ * Reference Manual, 3.4.1.4 Code Alignment, Assembly/Compiler
+ * Coding Rule 11: All branch targets should be 16-byte
+ * aligned.
+ */
+ jg_target = PTR_ALIGN(prog, 16);
+ if (jg_target != prog)
+ emit_nops(&prog, jg_target - prog);
+ jg_offset = prog - jg_reloc;
+ emit_code(jg_reloc - jg_bytes, jg_offset, jg_bytes);
+
+ err = emit_bpf_dispatcher(&prog, a + pivot + 1, /* emit upper_part */
+ b, progs);
+ if (err)
+ return err;
+
+ *pprog = prog;
return 0;
}
+static int cmp_ips(const void *a, const void *b)
+{
+ const s64 *ipa = a;
+ const s64 *ipb = b;
+
+ if (*ipa > *ipb)
+ return 1;
+ if (*ipa < *ipb)
+ return -1;
+ return 0;
+}
+
+int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs)
+{
+ u8 *prog = image;
+
+ sort(funcs, num_funcs, sizeof(funcs[0]), cmp_ips, NULL);
+ return emit_bpf_dispatcher(&prog, 0, num_funcs - 1, funcs);
+}
+
struct x64_jit_data {
struct bpf_binary_header *header;
int *addrs;
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 1e59df041456..df1d95913d4e 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -625,43 +625,6 @@ unsigned int pcibios_assign_all_busses(void)
return (pci_probe & PCI_ASSIGN_ALL_BUSSES) ? 1 : 0;
}
-#if defined(CONFIG_X86_DEV_DMA_OPS) && defined(CONFIG_PCI_DOMAINS)
-static LIST_HEAD(dma_domain_list);
-static DEFINE_SPINLOCK(dma_domain_list_lock);
-
-void add_dma_domain(struct dma_domain *domain)
-{
- spin_lock(&dma_domain_list_lock);
- list_add(&domain->node, &dma_domain_list);
- spin_unlock(&dma_domain_list_lock);
-}
-EXPORT_SYMBOL_GPL(add_dma_domain);
-
-void del_dma_domain(struct dma_domain *domain)
-{
- spin_lock(&dma_domain_list_lock);
- list_del(&domain->node);
- spin_unlock(&dma_domain_list_lock);
-}
-EXPORT_SYMBOL_GPL(del_dma_domain);
-
-static void set_dma_domain_ops(struct pci_dev *pdev)
-{
- struct dma_domain *domain;
-
- spin_lock(&dma_domain_list_lock);
- list_for_each_entry(domain, &dma_domain_list, node) {
- if (pci_domain_nr(pdev->bus) == domain->domain_nr) {
- pdev->dev.dma_ops = domain->dma_ops;
- break;
- }
- }
- spin_unlock(&dma_domain_list_lock);
-}
-#else
-static void set_dma_domain_ops(struct pci_dev *pdev) {}
-#endif
-
static void set_dev_domain_options(struct pci_dev *pdev)
{
if (is_vmd(pdev->bus))
@@ -697,7 +660,6 @@ int pcibios_add_device(struct pci_dev *dev)
pa_data = data->next;
memunmap(data);
}
- set_dma_domain_ops(dev);
set_dev_domain_options(dev);
return 0;
}
@@ -736,3 +698,13 @@ int pci_ext_cfg_avail(void)
else
return 0;
}
+
+#if IS_ENABLED(CONFIG_VMD)
+struct pci_dev *pci_real_dma_dev(struct pci_dev *dev)
+{
+ if (is_vmd(dev->bus))
+ return to_pci_sysdata(dev->bus)->vmd_dev;
+
+ return dev;
+}
+#endif
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 9df652d3d927..fa855bbaebaf 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -34,7 +34,7 @@
#include <linux/errno.h>
#include <linux/memblock.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include <asm/e820/api.h>
#include <asm/pci_x86.h>
#include <asm/io_apic.h>
diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c
index 887d181b769b..0c7b6e66c644 100644
--- a/arch/x86/pci/mmconfig_64.c
+++ b/arch/x86/pci/mmconfig_64.c
@@ -105,7 +105,7 @@ static void __iomem *mcfg_ioremap(struct pci_mmcfg_region *cfg)
start = cfg->address + PCI_MMCFG_BUS_OFFSET(cfg->start_bus);
num_buses = cfg->end_bus - cfg->start_bus + 1;
size = PCI_MMCFG_BUS_OFFSET(num_buses);
- addr = ioremap_nocache(start, size);
+ addr = ioremap(start, size);
if (addr)
addr -= PCI_MMCFG_BUS_OFFSET(cfg->start_bus);
return addr;
diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile
index fe29f3f5d384..84b09c230cbd 100644
--- a/arch/x86/platform/efi/Makefile
+++ b/arch/x86/platform/efi/Makefile
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
OBJECT_FILES_NON_STANDARD_efi_thunk_$(BITS).o := y
-OBJECT_FILES_NON_STANDARD_efi_stub_$(BITS).o := y
+KASAN_SANITIZE := n
+GCOV_PROFILE := n
obj-$(CONFIG_EFI) += quirks.o efi.o efi_$(BITS).o efi_stub_$(BITS).o
obj-$(CONFIG_EFI_MIXED) += efi_thunk_$(BITS).o
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 38d44f36d5ed..ae923ee8e2b4 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -54,8 +54,8 @@
#include <asm/x86_init.h>
#include <asm/uv/uv.h>
-static struct efi efi_phys __initdata;
static efi_system_table_t efi_systab __initdata;
+static u64 efi_systab_phys __initdata;
static efi_config_table_type_t arch_tables[] __initdata = {
#ifdef CONFIG_X86_UV
@@ -97,32 +97,6 @@ static int __init setup_add_efi_memmap(char *arg)
}
early_param("add_efi_memmap", setup_add_efi_memmap);
-static efi_status_t __init phys_efi_set_virtual_address_map(
- unsigned long memory_map_size,
- unsigned long descriptor_size,
- u32 descriptor_version,
- efi_memory_desc_t *virtual_map)
-{
- efi_status_t status;
- unsigned long flags;
- pgd_t *save_pgd;
-
- save_pgd = efi_call_phys_prolog();
- if (!save_pgd)
- return EFI_ABORTED;
-
- /* Disable interrupts around EFI calls: */
- local_irq_save(flags);
- status = efi_call_phys(efi_phys.set_virtual_address_map,
- memory_map_size, descriptor_size,
- descriptor_version, virtual_map);
- local_irq_restore(flags);
-
- efi_call_phys_epilog(save_pgd);
-
- return status;
-}
-
void __init efi_find_mirror(void)
{
efi_memory_desc_t *md;
@@ -330,10 +304,16 @@ static void __init efi_clean_memmap(void)
}
if (n_removal > 0) {
- u64 size = efi.memmap.nr_map - n_removal;
+ struct efi_memory_map_data data = {
+ .phys_map = efi.memmap.phys_map,
+ .desc_version = efi.memmap.desc_version,
+ .desc_size = efi.memmap.desc_size,
+ .size = efi.memmap.desc_size * (efi.memmap.nr_map - n_removal),
+ .flags = 0,
+ };
pr_warn("Removing %d invalid memory map entries.\n", n_removal);
- efi_memmap_install(efi.memmap.phys_map, size);
+ efi_memmap_install(&data);
}
}
@@ -353,89 +333,90 @@ void __init efi_print_memmap(void)
}
}
-static int __init efi_systab_init(void *phys)
+static int __init efi_systab_init(u64 phys)
{
+ int size = efi_enabled(EFI_64BIT) ? sizeof(efi_system_table_64_t)
+ : sizeof(efi_system_table_32_t);
+ bool over4g = false;
+ void *p;
+
+ p = early_memremap_ro(phys, size);
+ if (p == NULL) {
+ pr_err("Couldn't map the system table!\n");
+ return -ENOMEM;
+ }
+
if (efi_enabled(EFI_64BIT)) {
- efi_system_table_64_t *systab64;
- struct efi_setup_data *data = NULL;
- u64 tmp = 0;
+ const efi_system_table_64_t *systab64 = p;
+
+ efi_systab.hdr = systab64->hdr;
+ efi_systab.fw_vendor = systab64->fw_vendor;
+ efi_systab.fw_revision = systab64->fw_revision;
+ efi_systab.con_in_handle = systab64->con_in_handle;
+ efi_systab.con_in = systab64->con_in;
+ efi_systab.con_out_handle = systab64->con_out_handle;
+ efi_systab.con_out = (void *)(unsigned long)systab64->con_out;
+ efi_systab.stderr_handle = systab64->stderr_handle;
+ efi_systab.stderr = systab64->stderr;
+ efi_systab.runtime = (void *)(unsigned long)systab64->runtime;
+ efi_systab.boottime = (void *)(unsigned long)systab64->boottime;
+ efi_systab.nr_tables = systab64->nr_tables;
+ efi_systab.tables = systab64->tables;
+
+ over4g = systab64->con_in_handle > U32_MAX ||
+ systab64->con_in > U32_MAX ||
+ systab64->con_out_handle > U32_MAX ||
+ systab64->con_out > U32_MAX ||
+ systab64->stderr_handle > U32_MAX ||
+ systab64->stderr > U32_MAX ||
+ systab64->boottime > U32_MAX;
if (efi_setup) {
- data = early_memremap(efi_setup, sizeof(*data));
- if (!data)
+ struct efi_setup_data *data;
+
+ data = early_memremap_ro(efi_setup, sizeof(*data));
+ if (!data) {
+ early_memunmap(p, size);
return -ENOMEM;
- }
- systab64 = early_memremap((unsigned long)phys,
- sizeof(*systab64));
- if (systab64 == NULL) {
- pr_err("Couldn't map the system table!\n");
- if (data)
- early_memunmap(data, sizeof(*data));
- return -ENOMEM;
- }
+ }
+
+ efi_systab.fw_vendor = (unsigned long)data->fw_vendor;
+ efi_systab.runtime = (void *)(unsigned long)data->runtime;
+ efi_systab.tables = (unsigned long)data->tables;
+
+ over4g |= data->fw_vendor > U32_MAX ||
+ data->runtime > U32_MAX ||
+ data->tables > U32_MAX;
- efi_systab.hdr = systab64->hdr;
- efi_systab.fw_vendor = data ? (unsigned long)data->fw_vendor :
- systab64->fw_vendor;
- tmp |= data ? data->fw_vendor : systab64->fw_vendor;
- efi_systab.fw_revision = systab64->fw_revision;
- efi_systab.con_in_handle = systab64->con_in_handle;
- tmp |= systab64->con_in_handle;
- efi_systab.con_in = systab64->con_in;
- tmp |= systab64->con_in;
- efi_systab.con_out_handle = systab64->con_out_handle;
- tmp |= systab64->con_out_handle;
- efi_systab.con_out = systab64->con_out;
- tmp |= systab64->con_out;
- efi_systab.stderr_handle = systab64->stderr_handle;
- tmp |= systab64->stderr_handle;
- efi_systab.stderr = systab64->stderr;
- tmp |= systab64->stderr;
- efi_systab.runtime = data ?
- (void *)(unsigned long)data->runtime :
- (void *)(unsigned long)systab64->runtime;
- tmp |= data ? data->runtime : systab64->runtime;
- efi_systab.boottime = (void *)(unsigned long)systab64->boottime;
- tmp |= systab64->boottime;
- efi_systab.nr_tables = systab64->nr_tables;
- efi_systab.tables = data ? (unsigned long)data->tables :
- systab64->tables;
- tmp |= data ? data->tables : systab64->tables;
-
- early_memunmap(systab64, sizeof(*systab64));
- if (data)
early_memunmap(data, sizeof(*data));
-#ifdef CONFIG_X86_32
- if (tmp >> 32) {
- pr_err("EFI data located above 4GB, disabling EFI.\n");
- return -EINVAL;
+ } else {
+ over4g |= systab64->fw_vendor > U32_MAX ||
+ systab64->runtime > U32_MAX ||
+ systab64->tables > U32_MAX;
}
-#endif
} else {
- efi_system_table_32_t *systab32;
-
- systab32 = early_memremap((unsigned long)phys,
- sizeof(*systab32));
- if (systab32 == NULL) {
- pr_err("Couldn't map the system table!\n");
- return -ENOMEM;
- }
-
- efi_systab.hdr = systab32->hdr;
- efi_systab.fw_vendor = systab32->fw_vendor;
- efi_systab.fw_revision = systab32->fw_revision;
- efi_systab.con_in_handle = systab32->con_in_handle;
- efi_systab.con_in = systab32->con_in;
- efi_systab.con_out_handle = systab32->con_out_handle;
- efi_systab.con_out = systab32->con_out;
- efi_systab.stderr_handle = systab32->stderr_handle;
- efi_systab.stderr = systab32->stderr;
- efi_systab.runtime = (void *)(unsigned long)systab32->runtime;
- efi_systab.boottime = (void *)(unsigned long)systab32->boottime;
- efi_systab.nr_tables = systab32->nr_tables;
- efi_systab.tables = systab32->tables;
-
- early_memunmap(systab32, sizeof(*systab32));
+ const efi_system_table_32_t *systab32 = p;
+
+ efi_systab.hdr = systab32->hdr;
+ efi_systab.fw_vendor = systab32->fw_vendor;
+ efi_systab.fw_revision = systab32->fw_revision;
+ efi_systab.con_in_handle = systab32->con_in_handle;
+ efi_systab.con_in = systab32->con_in;
+ efi_systab.con_out_handle = systab32->con_out_handle;
+ efi_systab.con_out = (void *)(unsigned long)systab32->con_out;
+ efi_systab.stderr_handle = systab32->stderr_handle;
+ efi_systab.stderr = systab32->stderr;
+ efi_systab.runtime = (void *)(unsigned long)systab32->runtime;
+ efi_systab.boottime = (void *)(unsigned long)systab32->boottime;
+ efi_systab.nr_tables = systab32->nr_tables;
+ efi_systab.tables = systab32->tables;
+ }
+
+ early_memunmap(p, size);
+
+ if (IS_ENABLED(CONFIG_X86_32) && over4g) {
+ pr_err("EFI data located above 4GB, disabling EFI.\n");
+ return -EINVAL;
}
efi.systab = &efi_systab;
@@ -455,108 +436,23 @@ static int __init efi_systab_init(void *phys)
return 0;
}
-static int __init efi_runtime_init32(void)
-{
- efi_runtime_services_32_t *runtime;
-
- runtime = early_memremap((unsigned long)efi.systab->runtime,
- sizeof(efi_runtime_services_32_t));
- if (!runtime) {
- pr_err("Could not map the runtime service table!\n");
- return -ENOMEM;
- }
-
- /*
- * We will only need *early* access to the SetVirtualAddressMap
- * EFI runtime service. All other runtime services will be called
- * via the virtual mapping.
- */
- efi_phys.set_virtual_address_map =
- (efi_set_virtual_address_map_t *)
- (unsigned long)runtime->set_virtual_address_map;
- early_memunmap(runtime, sizeof(efi_runtime_services_32_t));
-
- return 0;
-}
-
-static int __init efi_runtime_init64(void)
-{
- efi_runtime_services_64_t *runtime;
-
- runtime = early_memremap((unsigned long)efi.systab->runtime,
- sizeof(efi_runtime_services_64_t));
- if (!runtime) {
- pr_err("Could not map the runtime service table!\n");
- return -ENOMEM;
- }
-
- /*
- * We will only need *early* access to the SetVirtualAddressMap
- * EFI runtime service. All other runtime services will be called
- * via the virtual mapping.
- */
- efi_phys.set_virtual_address_map =
- (efi_set_virtual_address_map_t *)
- (unsigned long)runtime->set_virtual_address_map;
- early_memunmap(runtime, sizeof(efi_runtime_services_64_t));
-
- return 0;
-}
-
-static int __init efi_runtime_init(void)
-{
- int rv;
-
- /*
- * Check out the runtime services table. We need to map
- * the runtime services table so that we can grab the physical
- * address of several of the EFI runtime functions, needed to
- * set the firmware into virtual mode.
- *
- * When EFI_PARAVIRT is in force then we could not map runtime
- * service memory region because we do not have direct access to it.
- * However, runtime services are available through proxy functions
- * (e.g. in case of Xen dom0 EFI implementation they call special
- * hypercall which executes relevant EFI functions) and that is why
- * they are always enabled.
- */
-
- if (!efi_enabled(EFI_PARAVIRT)) {
- if (efi_enabled(EFI_64BIT))
- rv = efi_runtime_init64();
- else
- rv = efi_runtime_init32();
-
- if (rv)
- return rv;
- }
-
- set_bit(EFI_RUNTIME_SERVICES, &efi.flags);
-
- return 0;
-}
-
void __init efi_init(void)
{
efi_char16_t *c16;
char vendor[100] = "unknown";
int i = 0;
- void *tmp;
-#ifdef CONFIG_X86_32
- if (boot_params.efi_info.efi_systab_hi ||
- boot_params.efi_info.efi_memmap_hi) {
+ if (IS_ENABLED(CONFIG_X86_32) &&
+ (boot_params.efi_info.efi_systab_hi ||
+ boot_params.efi_info.efi_memmap_hi)) {
pr_info("Table located above 4GB, disabling EFI.\n");
return;
}
- efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab;
-#else
- efi_phys.systab = (efi_system_table_t *)
- (boot_params.efi_info.efi_systab |
- ((__u64)boot_params.efi_info.efi_systab_hi<<32));
-#endif
- if (efi_systab_init(efi_phys.systab))
+ efi_systab_phys = boot_params.efi_info.efi_systab |
+ ((__u64)boot_params.efi_info.efi_systab_hi << 32);
+
+ if (efi_systab_init(efi_systab_phys))
return;
efi.config_table = (unsigned long)efi.systab->tables;
@@ -566,14 +462,16 @@ void __init efi_init(void)
/*
* Show what we know for posterity
*/
- c16 = tmp = early_memremap(efi.systab->fw_vendor, 2);
+ c16 = early_memremap_ro(efi.systab->fw_vendor,
+ sizeof(vendor) * sizeof(efi_char16_t));
if (c16) {
- for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i)
- vendor[i] = *c16++;
+ for (i = 0; i < sizeof(vendor) - 1 && c16[i]; ++i)
+ vendor[i] = c16[i];
vendor[i] = '\0';
- } else
+ early_memunmap(c16, sizeof(vendor) * sizeof(efi_char16_t));
+ } else {
pr_err("Could not map the firmware vendor!\n");
- early_memunmap(tmp, 2);
+ }
pr_info("EFI v%u.%.02u by %s\n",
efi.systab->hdr.revision >> 16,
@@ -592,19 +490,21 @@ void __init efi_init(void)
if (!efi_runtime_supported())
pr_info("No EFI runtime due to 32/64-bit mismatch with kernel\n");
- else {
- if (efi_runtime_disabled() || efi_runtime_init()) {
- efi_memmap_unmap();
- return;
- }
+
+ if (!efi_runtime_supported() || efi_runtime_disabled()) {
+ efi_memmap_unmap();
+ return;
}
+ set_bit(EFI_RUNTIME_SERVICES, &efi.flags);
efi_clean_memmap();
if (efi_enabled(EFI_DBG))
efi_print_memmap();
}
+#if defined(CONFIG_X86_32) || defined(CONFIG_X86_UV)
+
void __init efi_set_executable(efi_memory_desc_t *md, bool executable)
{
u64 addr, npages;
@@ -669,6 +569,8 @@ void __init old_map_region(efi_memory_desc_t *md)
(unsigned long long)md->phys_addr);
}
+#endif
+
/* Merge contiguous regions of the same type and attribute */
static void __init efi_merge_regions(void)
{
@@ -707,7 +609,7 @@ static void __init get_systab_virt_addr(efi_memory_desc_t *md)
size = md->num_pages << EFI_PAGE_SHIFT;
end = md->phys_addr + size;
- systab = (u64)(unsigned long)efi_phys.systab;
+ systab = efi_systab_phys;
if (md->phys_addr <= systab && systab < end) {
systab += md->virt_addr - md->phys_addr;
efi.systab = (efi_system_table_t *)(unsigned long)systab;
@@ -767,7 +669,7 @@ static inline void *efi_map_next_entry_reverse(void *entry)
*/
static void *efi_map_next_entry(void *entry)
{
- if (!efi_enabled(EFI_OLD_MEMMAP) && efi_enabled(EFI_64BIT)) {
+ if (!efi_have_uv1_memmap() && efi_enabled(EFI_64BIT)) {
/*
* Starting in UEFI v2.5 the EFI_PROPERTIES_TABLE
* config table feature requires us to map all entries
@@ -828,7 +730,7 @@ static bool should_map_region(efi_memory_desc_t *md)
* Map all of RAM so that we can access arguments in the 1:1
* mapping when making EFI runtime calls.
*/
- if (IS_ENABLED(CONFIG_EFI_MIXED) && !efi_is_native()) {
+ if (efi_is_mixed()) {
if (md->type == EFI_CONVENTIONAL_MEMORY ||
md->type == EFI_LOADER_DATA ||
md->type == EFI_LOADER_CODE)
@@ -899,11 +801,11 @@ static void __init kexec_enter_virtual_mode(void)
/*
* We don't do virtual mode, since we don't do runtime services, on
- * non-native EFI. With efi=old_map, we don't do runtime services in
+ * non-native EFI. With the UV1 memmap, we don't do runtime services in
* kexec kernel because in the initial boot something else might
* have been mapped at these virtual addresses.
*/
- if (!efi_is_native() || efi_enabled(EFI_OLD_MEMMAP)) {
+ if (efi_is_mixed() || efi_have_uv1_memmap()) {
efi_memmap_unmap();
clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
return;
@@ -958,11 +860,6 @@ static void __init kexec_enter_virtual_mode(void)
efi.runtime_version = efi_systab.hdr.revision;
efi_native_runtime_setup();
-
- efi.set_virtual_address_map = NULL;
-
- if (efi_enabled(EFI_OLD_MEMMAP) && (__supported_pte_mask & _PAGE_NX))
- runtime_code_page_mkexec();
#endif
}
@@ -974,9 +871,9 @@ static void __init kexec_enter_virtual_mode(void)
*
* The old method which used to update that memory descriptor with the
* virtual address obtained from ioremap() is still supported when the
- * kernel is booted with efi=old_map on its command line. Same old
- * method enabled the runtime services to be called without having to
- * thunk back into physical mode for every invocation.
+ * kernel is booted on SG1 UV1 hardware. Same old method enabled the
+ * runtime services to be called without having to thunk back into
+ * physical mode for every invocation.
*
* The new method does a pagetable switch in a preemption-safe manner
* so that we're in a different address space when calling a runtime
@@ -999,16 +896,14 @@ static void __init __efi_enter_virtual_mode(void)
if (efi_alloc_page_tables()) {
pr_err("Failed to allocate EFI page tables\n");
- clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
- return;
+ goto err;
}
efi_merge_regions();
new_memmap = efi_map_regions(&count, &pg_shift);
if (!new_memmap) {
pr_err("Error reallocating memory, EFI runtime non-functional!\n");
- clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
- return;
+ goto err;
}
pa = __pa(new_memmap);
@@ -1022,8 +917,7 @@ static void __init __efi_enter_virtual_mode(void)
if (efi_memmap_init_late(pa, efi.memmap.desc_size * count)) {
pr_err("Failed to remap late EFI memory map\n");
- clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
- return;
+ goto err;
}
if (efi_enabled(EFI_DBG)) {
@@ -1031,34 +925,22 @@ static void __init __efi_enter_virtual_mode(void)
efi_print_memmap();
}
- BUG_ON(!efi.systab);
+ if (WARN_ON(!efi.systab))
+ goto err;
- if (efi_setup_page_tables(pa, 1 << pg_shift)) {
- clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
- return;
- }
+ if (efi_setup_page_tables(pa, 1 << pg_shift))
+ goto err;
efi_sync_low_kernel_mappings();
- if (efi_is_native()) {
- status = phys_efi_set_virtual_address_map(
- efi.memmap.desc_size * count,
- efi.memmap.desc_size,
- efi.memmap.desc_version,
- (efi_memory_desc_t *)pa);
- } else {
- status = efi_thunk_set_virtual_address_map(
- efi_phys.set_virtual_address_map,
- efi.memmap.desc_size * count,
- efi.memmap.desc_size,
- efi.memmap.desc_version,
- (efi_memory_desc_t *)pa);
- }
-
+ status = efi_set_virtual_address_map(efi.memmap.desc_size * count,
+ efi.memmap.desc_size,
+ efi.memmap.desc_version,
+ (efi_memory_desc_t *)pa);
if (status != EFI_SUCCESS) {
- pr_alert("Unable to switch EFI into virtual mode (status=%lx)!\n",
- status);
- panic("EFI call to SetVirtualAddressMap() failed!");
+ pr_err("Unable to switch EFI into virtual mode (status=%lx)!\n",
+ status);
+ goto err;
}
efi_free_boot_services();
@@ -1071,13 +953,11 @@ static void __init __efi_enter_virtual_mode(void)
*/
efi.runtime_version = efi_systab.hdr.revision;
- if (efi_is_native())
+ if (!efi_is_mixed())
efi_native_runtime_setup();
else
efi_thunk_runtime_setup();
- efi.set_virtual_address_map = NULL;
-
/*
* Apply more restrictive page table mapping attributes now that
* SVAM() has been called and the firmware has performed all
@@ -1087,6 +967,10 @@ static void __init __efi_enter_virtual_mode(void)
/* clean DUMMY object */
efi_delete_dummy_variable();
+ return;
+
+err:
+ clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
}
void __init efi_enter_virtual_mode(void)
@@ -1102,20 +986,6 @@ void __init efi_enter_virtual_mode(void)
efi_dump_pagetable();
}
-static int __init arch_parse_efi_cmdline(char *str)
-{
- if (!str) {
- pr_warn("need at least one option\n");
- return -EINVAL;
- }
-
- if (parse_option_str(str, "old_map"))
- set_bit(EFI_OLD_MEMMAP, &efi.flags);
-
- return 0;
-}
-early_param("efi", arch_parse_efi_cmdline);
-
bool efi_is_table_address(unsigned long phys_addr)
{
unsigned int i;
diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
index 9959657127f4..081d466002c9 100644
--- a/arch/x86/platform/efi/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
@@ -49,7 +49,7 @@ void efi_sync_low_kernel_mappings(void) {}
void __init efi_dump_pagetable(void)
{
#ifdef CONFIG_EFI_PGT_DUMP
- ptdump_walk_pgd_level(NULL, swapper_pg_dir);
+ ptdump_walk_pgd_level(NULL, &init_mm);
#endif
}
@@ -66,9 +66,17 @@ void __init efi_map_region(efi_memory_desc_t *md)
void __init efi_map_region_fixed(efi_memory_desc_t *md) {}
void __init parse_efi_setup(u64 phys_addr, u32 data_len) {}
-pgd_t * __init efi_call_phys_prolog(void)
+efi_status_t efi_call_svam(efi_set_virtual_address_map_t *__efiapi *,
+ u32, u32, u32, void *);
+
+efi_status_t __init efi_set_virtual_address_map(unsigned long memory_map_size,
+ unsigned long descriptor_size,
+ u32 descriptor_version,
+ efi_memory_desc_t *virtual_map)
{
struct desc_ptr gdt_descr;
+ efi_status_t status;
+ unsigned long flags;
pgd_t *save_pgd;
/* Current pgd is swapper_pg_dir, we'll restore it later: */
@@ -80,14 +88,18 @@ pgd_t * __init efi_call_phys_prolog(void)
gdt_descr.size = GDT_SIZE - 1;
load_gdt(&gdt_descr);
- return save_pgd;
-}
+ /* Disable interrupts around EFI calls: */
+ local_irq_save(flags);
+ status = efi_call_svam(&efi.systab->runtime->set_virtual_address_map,
+ memory_map_size, descriptor_size,
+ descriptor_version, virtual_map);
+ local_irq_restore(flags);
-void __init efi_call_phys_epilog(pgd_t *save_pgd)
-{
load_fixmap_gdt(0);
load_cr3(save_pgd);
__flush_tlb_all();
+
+ return status;
}
void __init efi_runtime_update_mappings(void)
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 08ce8177c3af..d19a2edd63cb 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -57,142 +57,6 @@ static u64 efi_va = EFI_VA_START;
struct efi_scratch efi_scratch;
-static void __init early_code_mapping_set_exec(int executable)
-{
- efi_memory_desc_t *md;
-
- if (!(__supported_pte_mask & _PAGE_NX))
- return;
-
- /* Make EFI service code area executable */
- for_each_efi_memory_desc(md) {
- if (md->type == EFI_RUNTIME_SERVICES_CODE ||
- md->type == EFI_BOOT_SERVICES_CODE)
- efi_set_executable(md, executable);
- }
-}
-
-pgd_t * __init efi_call_phys_prolog(void)
-{
- unsigned long vaddr, addr_pgd, addr_p4d, addr_pud;
- pgd_t *save_pgd, *pgd_k, *pgd_efi;
- p4d_t *p4d, *p4d_k, *p4d_efi;
- pud_t *pud;
-
- int pgd;
- int n_pgds, i, j;
-
- if (!efi_enabled(EFI_OLD_MEMMAP)) {
- efi_switch_mm(&efi_mm);
- return efi_mm.pgd;
- }
-
- early_code_mapping_set_exec(1);
-
- n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT), PGDIR_SIZE);
- save_pgd = kmalloc_array(n_pgds, sizeof(*save_pgd), GFP_KERNEL);
- if (!save_pgd)
- return NULL;
-
- /*
- * Build 1:1 identity mapping for efi=old_map usage. Note that
- * PAGE_OFFSET is PGDIR_SIZE aligned when KASLR is disabled, while
- * it is PUD_SIZE ALIGNED with KASLR enabled. So for a given physical
- * address X, the pud_index(X) != pud_index(__va(X)), we can only copy
- * PUD entry of __va(X) to fill in pud entry of X to build 1:1 mapping.
- * This means here we can only reuse the PMD tables of the direct mapping.
- */
- for (pgd = 0; pgd < n_pgds; pgd++) {
- addr_pgd = (unsigned long)(pgd * PGDIR_SIZE);
- vaddr = (unsigned long)__va(pgd * PGDIR_SIZE);
- pgd_efi = pgd_offset_k(addr_pgd);
- save_pgd[pgd] = *pgd_efi;
-
- p4d = p4d_alloc(&init_mm, pgd_efi, addr_pgd);
- if (!p4d) {
- pr_err("Failed to allocate p4d table!\n");
- goto out;
- }
-
- for (i = 0; i < PTRS_PER_P4D; i++) {
- addr_p4d = addr_pgd + i * P4D_SIZE;
- p4d_efi = p4d + p4d_index(addr_p4d);
-
- pud = pud_alloc(&init_mm, p4d_efi, addr_p4d);
- if (!pud) {
- pr_err("Failed to allocate pud table!\n");
- goto out;
- }
-
- for (j = 0; j < PTRS_PER_PUD; j++) {
- addr_pud = addr_p4d + j * PUD_SIZE;
-
- if (addr_pud > (max_pfn << PAGE_SHIFT))
- break;
-
- vaddr = (unsigned long)__va(addr_pud);
-
- pgd_k = pgd_offset_k(vaddr);
- p4d_k = p4d_offset(pgd_k, vaddr);
- pud[j] = *pud_offset(p4d_k, vaddr);
- }
- }
- pgd_offset_k(pgd * PGDIR_SIZE)->pgd &= ~_PAGE_NX;
- }
-
- __flush_tlb_all();
- return save_pgd;
-out:
- efi_call_phys_epilog(save_pgd);
- return NULL;
-}
-
-void __init efi_call_phys_epilog(pgd_t *save_pgd)
-{
- /*
- * After the lock is released, the original page table is restored.
- */
- int pgd_idx, i;
- int nr_pgds;
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
-
- if (!efi_enabled(EFI_OLD_MEMMAP)) {
- efi_switch_mm(efi_scratch.prev_mm);
- return;
- }
-
- nr_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE);
-
- for (pgd_idx = 0; pgd_idx < nr_pgds; pgd_idx++) {
- pgd = pgd_offset_k(pgd_idx * PGDIR_SIZE);
- set_pgd(pgd_offset_k(pgd_idx * PGDIR_SIZE), save_pgd[pgd_idx]);
-
- if (!pgd_present(*pgd))
- continue;
-
- for (i = 0; i < PTRS_PER_P4D; i++) {
- p4d = p4d_offset(pgd,
- pgd_idx * PGDIR_SIZE + i * P4D_SIZE);
-
- if (!p4d_present(*p4d))
- continue;
-
- pud = (pud_t *)p4d_page_vaddr(*p4d);
- pud_free(&init_mm, pud);
- }
-
- p4d = (p4d_t *)pgd_page_vaddr(*pgd);
- p4d_free(&init_mm, p4d);
- }
-
- kfree(save_pgd);
-
- __flush_tlb_all();
- early_code_mapping_set_exec(0);
-}
-
EXPORT_SYMBOL_GPL(efi_mm);
/*
@@ -211,7 +75,7 @@ int __init efi_alloc_page_tables(void)
pud_t *pud;
gfp_t gfp_mask;
- if (efi_enabled(EFI_OLD_MEMMAP))
+ if (efi_have_uv1_memmap())
return 0;
gfp_mask = GFP_KERNEL | __GFP_ZERO;
@@ -252,7 +116,7 @@ void efi_sync_low_kernel_mappings(void)
pud_t *pud_k, *pud_efi;
pgd_t *efi_pgd = efi_mm.pgd;
- if (efi_enabled(EFI_OLD_MEMMAP))
+ if (efi_have_uv1_memmap())
return;
/*
@@ -316,7 +180,7 @@ void efi_sync_low_kernel_mappings(void)
static inline phys_addr_t
virt_to_phys_or_null_size(void *va, unsigned long size)
{
- bool bad_size;
+ phys_addr_t pa;
if (!va)
return 0;
@@ -324,16 +188,13 @@ virt_to_phys_or_null_size(void *va, unsigned long size)
if (virt_addr_valid(va))
return virt_to_phys(va);
- /*
- * A fully aligned variable on the stack is guaranteed not to
- * cross a page bounary. Try to catch strings on the stack by
- * checking that 'size' is a power of two.
- */
- bad_size = size > PAGE_SIZE || !is_power_of_2(size);
+ pa = slow_virt_to_phys(va);
- WARN_ON(!IS_ALIGNED((unsigned long)va, size) || bad_size);
+ /* check if the object crosses a page boundary */
+ if (WARN_ON((pa ^ (pa + size - 1)) & PAGE_MASK))
+ return 0;
- return slow_virt_to_phys(va);
+ return pa;
}
#define virt_to_phys_or_null(addr) \
@@ -346,7 +207,7 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
unsigned npages;
pgd_t *pgd = efi_mm.pgd;
- if (efi_enabled(EFI_OLD_MEMMAP))
+ if (efi_have_uv1_memmap())
return 0;
/*
@@ -373,10 +234,6 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
* as trim_bios_range() will reserve the first page and isolate it away
* from memory allocators anyway.
*/
- pf = _PAGE_RW;
- if (sev_active())
- pf |= _PAGE_ENC;
-
if (kernel_map_pages_in_pgd(pgd, 0x0, 0x0, 1, pf)) {
pr_err("Failed to create 1:1 mapping for the first page!\n");
return 1;
@@ -388,21 +245,22 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
* text and allocate a new stack because we can't rely on the
* stack pointer being < 4GB.
*/
- if (!IS_ENABLED(CONFIG_EFI_MIXED) || efi_is_native())
+ if (!efi_is_mixed())
return 0;
page = alloc_page(GFP_KERNEL|__GFP_DMA32);
- if (!page)
- panic("Unable to allocate EFI runtime stack < 4GB\n");
+ if (!page) {
+ pr_err("Unable to allocate EFI runtime stack < 4GB\n");
+ return 1;
+ }
- efi_scratch.phys_stack = virt_to_phys(page_address(page));
- efi_scratch.phys_stack += PAGE_SIZE; /* stack grows down */
+ efi_scratch.phys_stack = page_to_phys(page + 1); /* stack grows down */
- npages = (_etext - _text) >> PAGE_SHIFT;
+ npages = (__end_rodata_aligned - _text) >> PAGE_SHIFT;
text = __pa(_text);
pfn = text >> PAGE_SHIFT;
- pf = _PAGE_RW | _PAGE_ENC;
+ pf = _PAGE_ENC;
if (kernel_map_pages_in_pgd(pgd, pfn, text, npages, pf)) {
pr_err("Failed to map kernel text 1:1\n");
return 1;
@@ -417,6 +275,22 @@ static void __init __map_region(efi_memory_desc_t *md, u64 va)
unsigned long pfn;
pgd_t *pgd = efi_mm.pgd;
+ /*
+ * EFI_RUNTIME_SERVICES_CODE regions typically cover PE/COFF
+ * executable images in memory that consist of both R-X and
+ * RW- sections, so we cannot apply read-only or non-exec
+ * permissions just yet. However, modern EFI systems provide
+ * a memory attributes table that describes those sections
+ * with the appropriate restricted permissions, which are
+ * applied in efi_runtime_update_mappings() below. All other
+ * regions can be mapped non-executable at this point, with
+ * the exception of boot services code regions, but those will
+ * be unmapped again entirely in efi_free_boot_services().
+ */
+ if (md->type != EFI_BOOT_SERVICES_CODE &&
+ md->type != EFI_RUNTIME_SERVICES_CODE)
+ flags |= _PAGE_NX;
+
if (!(md->attribute & EFI_MEMORY_WB))
flags |= _PAGE_PCD;
@@ -434,7 +308,7 @@ void __init efi_map_region(efi_memory_desc_t *md)
unsigned long size = md->num_pages << PAGE_SHIFT;
u64 pa = md->phys_addr;
- if (efi_enabled(EFI_OLD_MEMMAP))
+ if (efi_have_uv1_memmap())
return old_map_region(md);
/*
@@ -449,7 +323,7 @@ void __init efi_map_region(efi_memory_desc_t *md)
* booting in EFI mixed mode, because even though we may be
* running a 64-bit kernel, the firmware may only be 32-bit.
*/
- if (!efi_is_native () && IS_ENABLED(CONFIG_EFI_MIXED)) {
+ if (efi_is_mixed()) {
md->virt_addr = md->phys_addr;
return;
}
@@ -491,26 +365,6 @@ void __init efi_map_region_fixed(efi_memory_desc_t *md)
__map_region(md, md->virt_addr);
}
-void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
- u32 type, u64 attribute)
-{
- unsigned long last_map_pfn;
-
- if (type == EFI_MEMORY_MAPPED_IO)
- return ioremap(phys_addr, size);
-
- last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size);
- if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) {
- unsigned long top = last_map_pfn << PAGE_SHIFT;
- efi_ioremap(top, size - (top - phys_addr), type, attribute);
- }
-
- if (!(attribute & EFI_MEMORY_WB))
- efi_memory_uc((u64)(unsigned long)__va(phys_addr), size);
-
- return (void __iomem *)__va(phys_addr);
-}
-
void __init parse_efi_setup(u64 phys_addr, u32 data_len)
{
efi_setup = phys_addr + sizeof(struct setup_data);
@@ -559,7 +413,7 @@ void __init efi_runtime_update_mappings(void)
{
efi_memory_desc_t *md;
- if (efi_enabled(EFI_OLD_MEMMAP)) {
+ if (efi_have_uv1_memmap()) {
if (__supported_pte_mask & _PAGE_NX)
runtime_code_page_mkexec();
return;
@@ -613,10 +467,10 @@ void __init efi_runtime_update_mappings(void)
void __init efi_dump_pagetable(void)
{
#ifdef CONFIG_EFI_PGT_DUMP
- if (efi_enabled(EFI_OLD_MEMMAP))
- ptdump_walk_pgd_level(NULL, swapper_pg_dir);
+ if (efi_have_uv1_memmap())
+ ptdump_walk_pgd_level(NULL, &init_mm);
else
- ptdump_walk_pgd_level(NULL, efi_mm.pgd);
+ ptdump_walk_pgd_level(NULL, &efi_mm);
#endif
}
@@ -634,63 +488,74 @@ void efi_switch_mm(struct mm_struct *mm)
switch_mm(efi_scratch.prev_mm, mm, NULL);
}
-#ifdef CONFIG_EFI_MIXED
-extern efi_status_t efi64_thunk(u32, ...);
-
static DEFINE_SPINLOCK(efi_runtime_lock);
-#define runtime_service32(func) \
-({ \
- u32 table = (u32)(unsigned long)efi.systab; \
- u32 *rt, *___f; \
- \
- rt = (u32 *)(table + offsetof(efi_system_table_32_t, runtime)); \
- ___f = (u32 *)(*rt + offsetof(efi_runtime_services_32_t, func)); \
- *___f; \
+/*
+ * DS and ES contain user values. We need to save them.
+ * The 32-bit EFI code needs a valid DS, ES, and SS. There's no
+ * need to save the old SS: __KERNEL_DS is always acceptable.
+ */
+#define __efi_thunk(func, ...) \
+({ \
+ efi_runtime_services_32_t *__rt; \
+ unsigned short __ds, __es; \
+ efi_status_t ____s; \
+ \
+ __rt = (void *)(unsigned long)efi.systab->mixed_mode.runtime; \
+ \
+ savesegment(ds, __ds); \
+ savesegment(es, __es); \
+ \
+ loadsegment(ss, __KERNEL_DS); \
+ loadsegment(ds, __KERNEL_DS); \
+ loadsegment(es, __KERNEL_DS); \
+ \
+ ____s = efi64_thunk(__rt->func, __VA_ARGS__); \
+ \
+ loadsegment(ds, __ds); \
+ loadsegment(es, __es); \
+ \
+ ____s ^= (____s & BIT(31)) | (____s & BIT_ULL(31)) << 32; \
+ ____s; \
})
/*
* Switch to the EFI page tables early so that we can access the 1:1
* runtime services mappings which are not mapped in any other page
- * tables. This function must be called before runtime_service32().
+ * tables.
*
* Also, disable interrupts because the IDT points to 64-bit handlers,
* which aren't going to function correctly when we switch to 32-bit.
*/
-#define efi_thunk(f, ...) \
+#define efi_thunk(func...) \
({ \
efi_status_t __s; \
- u32 __func; \
\
arch_efi_call_virt_setup(); \
\
- __func = runtime_service32(f); \
- __s = efi64_thunk(__func, __VA_ARGS__); \
+ __s = __efi_thunk(func); \
\
arch_efi_call_virt_teardown(); \
\
__s; \
})
-efi_status_t efi_thunk_set_virtual_address_map(
- void *phys_set_virtual_address_map,
- unsigned long memory_map_size,
- unsigned long descriptor_size,
- u32 descriptor_version,
- efi_memory_desc_t *virtual_map)
+static efi_status_t __init __no_sanitize_address
+efi_thunk_set_virtual_address_map(unsigned long memory_map_size,
+ unsigned long descriptor_size,
+ u32 descriptor_version,
+ efi_memory_desc_t *virtual_map)
{
efi_status_t status;
unsigned long flags;
- u32 func;
efi_sync_low_kernel_mappings();
local_irq_save(flags);
efi_switch_mm(&efi_mm);
- func = (u32)(unsigned long)phys_set_virtual_address_map;
- status = efi64_thunk(func, memory_map_size, descriptor_size,
- descriptor_version, virtual_map);
+ status = __efi_thunk(set_virtual_address_map, memory_map_size,
+ descriptor_size, descriptor_version, virtual_map);
efi_switch_mm(efi_scratch.prev_mm);
local_irq_restore(flags);
@@ -700,85 +565,25 @@ efi_status_t efi_thunk_set_virtual_address_map(
static efi_status_t efi_thunk_get_time(efi_time_t *tm, efi_time_cap_t *tc)
{
- efi_status_t status;
- u32 phys_tm, phys_tc;
- unsigned long flags;
-
- spin_lock(&rtc_lock);
- spin_lock_irqsave(&efi_runtime_lock, flags);
-
- phys_tm = virt_to_phys_or_null(tm);
- phys_tc = virt_to_phys_or_null(tc);
-
- status = efi_thunk(get_time, phys_tm, phys_tc);
-
- spin_unlock_irqrestore(&efi_runtime_lock, flags);
- spin_unlock(&rtc_lock);
-
- return status;
+ return EFI_UNSUPPORTED;
}
static efi_status_t efi_thunk_set_time(efi_time_t *tm)
{
- efi_status_t status;
- u32 phys_tm;
- unsigned long flags;
-
- spin_lock(&rtc_lock);
- spin_lock_irqsave(&efi_runtime_lock, flags);
-
- phys_tm = virt_to_phys_or_null(tm);
-
- status = efi_thunk(set_time, phys_tm);
-
- spin_unlock_irqrestore(&efi_runtime_lock, flags);
- spin_unlock(&rtc_lock);
-
- return status;
+ return EFI_UNSUPPORTED;
}
static efi_status_t
efi_thunk_get_wakeup_time(efi_bool_t *enabled, efi_bool_t *pending,
efi_time_t *tm)
{
- efi_status_t status;
- u32 phys_enabled, phys_pending, phys_tm;
- unsigned long flags;
-
- spin_lock(&rtc_lock);
- spin_lock_irqsave(&efi_runtime_lock, flags);
-
- phys_enabled = virt_to_phys_or_null(enabled);
- phys_pending = virt_to_phys_or_null(pending);
- phys_tm = virt_to_phys_or_null(tm);
-
- status = efi_thunk(get_wakeup_time, phys_enabled,
- phys_pending, phys_tm);
-
- spin_unlock_irqrestore(&efi_runtime_lock, flags);
- spin_unlock(&rtc_lock);
-
- return status;
+ return EFI_UNSUPPORTED;
}
static efi_status_t
efi_thunk_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm)
{
- efi_status_t status;
- u32 phys_tm;
- unsigned long flags;
-
- spin_lock(&rtc_lock);
- spin_lock_irqsave(&efi_runtime_lock, flags);
-
- phys_tm = virt_to_phys_or_null(tm);
-
- status = efi_thunk(set_wakeup_time, enabled, phys_tm);
-
- spin_unlock_irqrestore(&efi_runtime_lock, flags);
- spin_unlock(&rtc_lock);
-
- return status;
+ return EFI_UNSUPPORTED;
}
static unsigned long efi_name_size(efi_char16_t *name)
@@ -790,6 +595,8 @@ static efi_status_t
efi_thunk_get_variable(efi_char16_t *name, efi_guid_t *vendor,
u32 *attr, unsigned long *data_size, void *data)
{
+ u8 buf[24] __aligned(8);
+ efi_guid_t *vnd = PTR_ALIGN((efi_guid_t *)buf, sizeof(*vnd));
efi_status_t status;
u32 phys_name, phys_vendor, phys_attr;
u32 phys_data_size, phys_data;
@@ -797,14 +604,19 @@ efi_thunk_get_variable(efi_char16_t *name, efi_guid_t *vendor,
spin_lock_irqsave(&efi_runtime_lock, flags);
+ *vnd = *vendor;
+
phys_data_size = virt_to_phys_or_null(data_size);
- phys_vendor = virt_to_phys_or_null(vendor);
+ phys_vendor = virt_to_phys_or_null(vnd);
phys_name = virt_to_phys_or_null_size(name, efi_name_size(name));
phys_attr = virt_to_phys_or_null(attr);
phys_data = virt_to_phys_or_null_size(data, *data_size);
- status = efi_thunk(get_variable, phys_name, phys_vendor,
- phys_attr, phys_data_size, phys_data);
+ if (!phys_name || (data && !phys_data))
+ status = EFI_INVALID_PARAMETER;
+ else
+ status = efi_thunk(get_variable, phys_name, phys_vendor,
+ phys_attr, phys_data_size, phys_data);
spin_unlock_irqrestore(&efi_runtime_lock, flags);
@@ -815,19 +627,25 @@ static efi_status_t
efi_thunk_set_variable(efi_char16_t *name, efi_guid_t *vendor,
u32 attr, unsigned long data_size, void *data)
{
+ u8 buf[24] __aligned(8);
+ efi_guid_t *vnd = PTR_ALIGN((efi_guid_t *)buf, sizeof(*vnd));
u32 phys_name, phys_vendor, phys_data;
efi_status_t status;
unsigned long flags;
spin_lock_irqsave(&efi_runtime_lock, flags);
+ *vnd = *vendor;
+
phys_name = virt_to_phys_or_null_size(name, efi_name_size(name));
- phys_vendor = virt_to_phys_or_null(vendor);
+ phys_vendor = virt_to_phys_or_null(vnd);
phys_data = virt_to_phys_or_null_size(data, data_size);
- /* If data_size is > sizeof(u32) we've got problems */
- status = efi_thunk(set_variable, phys_name, phys_vendor,
- attr, data_size, phys_data);
+ if (!phys_name || !phys_data)
+ status = EFI_INVALID_PARAMETER;
+ else
+ status = efi_thunk(set_variable, phys_name, phys_vendor,
+ attr, data_size, phys_data);
spin_unlock_irqrestore(&efi_runtime_lock, flags);
@@ -839,6 +657,8 @@ efi_thunk_set_variable_nonblocking(efi_char16_t *name, efi_guid_t *vendor,
u32 attr, unsigned long data_size,
void *data)
{
+ u8 buf[24] __aligned(8);
+ efi_guid_t *vnd = PTR_ALIGN((efi_guid_t *)buf, sizeof(*vnd));
u32 phys_name, phys_vendor, phys_data;
efi_status_t status;
unsigned long flags;
@@ -846,13 +666,17 @@ efi_thunk_set_variable_nonblocking(efi_char16_t *name, efi_guid_t *vendor,
if (!spin_trylock_irqsave(&efi_runtime_lock, flags))
return EFI_NOT_READY;
+ *vnd = *vendor;
+
phys_name = virt_to_phys_or_null_size(name, efi_name_size(name));
- phys_vendor = virt_to_phys_or_null(vendor);
+ phys_vendor = virt_to_phys_or_null(vnd);
phys_data = virt_to_phys_or_null_size(data, data_size);
- /* If data_size is > sizeof(u32) we've got problems */
- status = efi_thunk(set_variable, phys_name, phys_vendor,
- attr, data_size, phys_data);
+ if (!phys_name || !phys_data)
+ status = EFI_INVALID_PARAMETER;
+ else
+ status = efi_thunk(set_variable, phys_name, phys_vendor,
+ attr, data_size, phys_data);
spin_unlock_irqrestore(&efi_runtime_lock, flags);
@@ -864,39 +688,36 @@ efi_thunk_get_next_variable(unsigned long *name_size,
efi_char16_t *name,
efi_guid_t *vendor)
{
+ u8 buf[24] __aligned(8);
+ efi_guid_t *vnd = PTR_ALIGN((efi_guid_t *)buf, sizeof(*vnd));
efi_status_t status;
u32 phys_name_size, phys_name, phys_vendor;
unsigned long flags;
spin_lock_irqsave(&efi_runtime_lock, flags);
+ *vnd = *vendor;
+
phys_name_size = virt_to_phys_or_null(name_size);
- phys_vendor = virt_to_phys_or_null(vendor);
+ phys_vendor = virt_to_phys_or_null(vnd);
phys_name = virt_to_phys_or_null_size(name, *name_size);
- status = efi_thunk(get_next_variable, phys_name_size,
- phys_name, phys_vendor);
+ if (!phys_name)
+ status = EFI_INVALID_PARAMETER;
+ else
+ status = efi_thunk(get_next_variable, phys_name_size,
+ phys_name, phys_vendor);
spin_unlock_irqrestore(&efi_runtime_lock, flags);
+ *vendor = *vnd;
return status;
}
static efi_status_t
efi_thunk_get_next_high_mono_count(u32 *count)
{
- efi_status_t status;
- u32 phys_count;
- unsigned long flags;
-
- spin_lock_irqsave(&efi_runtime_lock, flags);
-
- phys_count = virt_to_phys_or_null(count);
- status = efi_thunk(get_next_high_mono_count, phys_count);
-
- spin_unlock_irqrestore(&efi_runtime_lock, flags);
-
- return status;
+ return EFI_UNSUPPORTED;
}
static void
@@ -993,8 +814,11 @@ efi_thunk_query_capsule_caps(efi_capsule_header_t **capsules,
return EFI_UNSUPPORTED;
}
-void efi_thunk_runtime_setup(void)
+void __init efi_thunk_runtime_setup(void)
{
+ if (!IS_ENABLED(CONFIG_EFI_MIXED))
+ return;
+
efi.get_time = efi_thunk_get_time;
efi.set_time = efi_thunk_set_time;
efi.get_wakeup_time = efi_thunk_get_wakeup_time;
@@ -1010,4 +834,46 @@ void efi_thunk_runtime_setup(void)
efi.update_capsule = efi_thunk_update_capsule;
efi.query_capsule_caps = efi_thunk_query_capsule_caps;
}
-#endif /* CONFIG_EFI_MIXED */
+
+efi_status_t __init __no_sanitize_address
+efi_set_virtual_address_map(unsigned long memory_map_size,
+ unsigned long descriptor_size,
+ u32 descriptor_version,
+ efi_memory_desc_t *virtual_map)
+{
+ efi_status_t status;
+ unsigned long flags;
+ pgd_t *save_pgd = NULL;
+
+ if (efi_is_mixed())
+ return efi_thunk_set_virtual_address_map(memory_map_size,
+ descriptor_size,
+ descriptor_version,
+ virtual_map);
+
+ if (efi_have_uv1_memmap()) {
+ save_pgd = efi_uv1_memmap_phys_prolog();
+ if (!save_pgd)
+ return EFI_ABORTED;
+ } else {
+ efi_switch_mm(&efi_mm);
+ }
+
+ kernel_fpu_begin();
+
+ /* Disable interrupts around EFI calls: */
+ local_irq_save(flags);
+ status = efi_call(efi.systab->runtime->set_virtual_address_map,
+ memory_map_size, descriptor_size,
+ descriptor_version, virtual_map);
+ local_irq_restore(flags);
+
+ kernel_fpu_end();
+
+ if (save_pgd)
+ efi_uv1_memmap_phys_epilog(save_pgd);
+ else
+ efi_switch_mm(efi_scratch.prev_mm);
+
+ return status;
+}
diff --git a/arch/x86/platform/efi/efi_stub_32.S b/arch/x86/platform/efi/efi_stub_32.S
index eed8b5b441f8..75c46e7a809f 100644
--- a/arch/x86/platform/efi/efi_stub_32.S
+++ b/arch/x86/platform/efi/efi_stub_32.S
@@ -7,118 +7,43 @@
*/
#include <linux/linkage.h>
+#include <linux/init.h>
#include <asm/page_types.h>
-/*
- * efi_call_phys(void *, ...) is a function with variable parameters.
- * All the callers of this function assure that all the parameters are 4-bytes.
- */
-
-/*
- * In gcc calling convention, EBX, ESP, EBP, ESI and EDI are all callee save.
- * So we'd better save all of them at the beginning of this function and restore
- * at the end no matter how many we use, because we can not assure EFI runtime
- * service functions will comply with gcc calling convention, too.
- */
+ __INIT
+SYM_FUNC_START(efi_call_svam)
+ push 8(%esp)
+ push 8(%esp)
+ push %ecx
+ push %edx
-.text
-SYM_FUNC_START(efi_call_phys)
/*
- * 0. The function can only be called in Linux kernel. So CS has been
- * set to 0x0010, DS and SS have been set to 0x0018. In EFI, I found
- * the values of these registers are the same. And, the corresponding
- * GDT entries are identical. So I will do nothing about segment reg
- * and GDT, but change GDT base register in prolog and epilog.
- */
-
- /*
- * 1. Now I am running with EIP = <physical address> + PAGE_OFFSET.
- * But to make it smoothly switch from virtual mode to flat mode.
- * The mapping of lower virtual memory has been created in prolog and
- * epilog.
+ * Switch to the flat mapped alias of this routine, by jumping to the
+ * address of label '1' after subtracting PAGE_OFFSET from it.
*/
movl $1f, %edx
subl $__PAGE_OFFSET, %edx
jmp *%edx
1:
- /*
- * 2. Now on the top of stack is the return
- * address in the caller of efi_call_phys(), then parameter 1,
- * parameter 2, ..., param n. To make things easy, we save the return
- * address of efi_call_phys in a global variable.
- */
- popl %edx
- movl %edx, saved_return_addr
- /* get the function pointer into ECX*/
- popl %ecx
- movl %ecx, efi_rt_function_ptr
- movl $2f, %edx
- subl $__PAGE_OFFSET, %edx
- pushl %edx
-
- /*
- * 3. Clear PG bit in %CR0.
- */
+ /* disable paging */
movl %cr0, %edx
andl $0x7fffffff, %edx
movl %edx, %cr0
- jmp 1f
-1:
- /*
- * 4. Adjust stack pointer.
- */
+ /* convert the stack pointer to a flat mapped address */
subl $__PAGE_OFFSET, %esp
- /*
- * 5. Call the physical function.
- */
- jmp *%ecx
+ /* call the EFI routine */
+ call *(%eax)
-2:
- /*
- * 6. After EFI runtime service returns, control will return to
- * following instruction. We'd better readjust stack pointer first.
- */
- addl $__PAGE_OFFSET, %esp
+ /* convert ESP back to a kernel VA, and pop the outgoing args */
+ addl $__PAGE_OFFSET + 16, %esp
- /*
- * 7. Restore PG bit
- */
+ /* re-enable paging */
movl %cr0, %edx
orl $0x80000000, %edx
movl %edx, %cr0
- jmp 1f
-1:
- /*
- * 8. Now restore the virtual mode from flat mode by
- * adding EIP with PAGE_OFFSET.
- */
- movl $1f, %edx
- jmp *%edx
-1:
-
- /*
- * 9. Balance the stack. And because EAX contain the return value,
- * we'd better not clobber it.
- */
- leal efi_rt_function_ptr, %edx
- movl (%edx), %ecx
- pushl %ecx
- /*
- * 10. Push the saved return address onto the stack and return.
- */
- leal saved_return_addr, %edx
- movl (%edx), %ecx
- pushl %ecx
ret
-SYM_FUNC_END(efi_call_phys)
-.previous
-
-.data
-saved_return_addr:
- .long 0
-efi_rt_function_ptr:
- .long 0
+SYM_FUNC_END(efi_call_svam)
diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S
index b1d2313fe3bf..15da118f04f0 100644
--- a/arch/x86/platform/efi/efi_stub_64.S
+++ b/arch/x86/platform/efi/efi_stub_64.S
@@ -8,41 +8,12 @@
*/
#include <linux/linkage.h>
-#include <asm/segment.h>
-#include <asm/msr.h>
-#include <asm/processor-flags.h>
-#include <asm/page_types.h>
+#include <asm/nospec-branch.h>
-#define SAVE_XMM \
- mov %rsp, %rax; \
- subq $0x70, %rsp; \
- and $~0xf, %rsp; \
- mov %rax, (%rsp); \
- mov %cr0, %rax; \
- clts; \
- mov %rax, 0x8(%rsp); \
- movaps %xmm0, 0x60(%rsp); \
- movaps %xmm1, 0x50(%rsp); \
- movaps %xmm2, 0x40(%rsp); \
- movaps %xmm3, 0x30(%rsp); \
- movaps %xmm4, 0x20(%rsp); \
- movaps %xmm5, 0x10(%rsp)
-
-#define RESTORE_XMM \
- movaps 0x60(%rsp), %xmm0; \
- movaps 0x50(%rsp), %xmm1; \
- movaps 0x40(%rsp), %xmm2; \
- movaps 0x30(%rsp), %xmm3; \
- movaps 0x20(%rsp), %xmm4; \
- movaps 0x10(%rsp), %xmm5; \
- mov 0x8(%rsp), %rsi; \
- mov %rsi, %cr0; \
- mov (%rsp), %rsp
-
-SYM_FUNC_START(efi_call)
+SYM_FUNC_START(__efi_call)
pushq %rbp
movq %rsp, %rbp
- SAVE_XMM
+ and $~0xf, %rsp
mov 16(%rbp), %rax
subq $48, %rsp
mov %r9, 32(%rsp)
@@ -50,9 +21,7 @@ SYM_FUNC_START(efi_call)
mov %r8, %r9
mov %rcx, %r8
mov %rsi, %rcx
- call *%rdi
- addq $48, %rsp
- RESTORE_XMM
- popq %rbp
+ CALL_NOSPEC %rdi
+ leave
ret
-SYM_FUNC_END(efi_call)
+SYM_FUNC_END(__efi_call)
diff --git a/arch/x86/platform/efi/efi_thunk_64.S b/arch/x86/platform/efi/efi_thunk_64.S
index 3189f1394701..26f0da238c1c 100644
--- a/arch/x86/platform/efi/efi_thunk_64.S
+++ b/arch/x86/platform/efi/efi_thunk_64.S
@@ -25,15 +25,16 @@
.text
.code64
-SYM_FUNC_START(efi64_thunk)
+SYM_CODE_START(__efi64_thunk)
push %rbp
push %rbx
/*
* Switch to 1:1 mapped 32-bit stack pointer.
*/
- movq %rsp, efi_saved_sp(%rip)
+ movq %rsp, %rax
movq efi_scratch(%rip), %rsp
+ push %rax
/*
* Calculate the physical address of the kernel text.
@@ -41,113 +42,31 @@ SYM_FUNC_START(efi64_thunk)
movq $__START_KERNEL_map, %rax
subq phys_base(%rip), %rax
- /*
- * Push some physical addresses onto the stack. This is easier
- * to do now in a code64 section while the assembler can address
- * 64-bit values. Note that all the addresses on the stack are
- * 32-bit.
- */
- subq $16, %rsp
- leaq efi_exit32(%rip), %rbx
- subq %rax, %rbx
- movl %ebx, 8(%rsp)
-
- leaq __efi64_thunk(%rip), %rbx
+ leaq 1f(%rip), %rbp
+ leaq 2f(%rip), %rbx
+ subq %rax, %rbp
subq %rax, %rbx
- call *%rbx
-
- movq efi_saved_sp(%rip), %rsp
- pop %rbx
- pop %rbp
- retq
-SYM_FUNC_END(efi64_thunk)
-/*
- * We run this function from the 1:1 mapping.
- *
- * This function must be invoked with a 1:1 mapped stack.
- */
-SYM_FUNC_START_LOCAL(__efi64_thunk)
- movl %ds, %eax
- push %rax
- movl %es, %eax
- push %rax
- movl %ss, %eax
- push %rax
-
- subq $32, %rsp
- movl %esi, 0x0(%rsp)
- movl %edx, 0x4(%rsp)
- movl %ecx, 0x8(%rsp)
- movq %r8, %rsi
- movl %esi, 0xc(%rsp)
- movq %r9, %rsi
- movl %esi, 0x10(%rsp)
-
- leaq 1f(%rip), %rbx
- movq %rbx, func_rt_ptr(%rip)
+ subq $28, %rsp
+ movl %ebx, 0x0(%rsp) /* return address */
+ movl %esi, 0x4(%rsp)
+ movl %edx, 0x8(%rsp)
+ movl %ecx, 0xc(%rsp)
+ movl %r8d, 0x10(%rsp)
+ movl %r9d, 0x14(%rsp)
/* Switch to 32-bit descriptor */
pushq $__KERNEL32_CS
- leaq efi_enter32(%rip), %rax
- pushq %rax
+ pushq %rdi /* EFI runtime service address */
lretq
-1: addq $32, %rsp
-
+1: movq 24(%rsp), %rsp
pop %rbx
- movl %ebx, %ss
- pop %rbx
- movl %ebx, %es
- pop %rbx
- movl %ebx, %ds
-
- /*
- * Convert 32-bit status code into 64-bit.
- */
- test %rax, %rax
- jz 1f
- movl %eax, %ecx
- andl $0x0fffffff, %ecx
- andl $0xf0000000, %eax
- shl $32, %rax
- or %rcx, %rax
-1:
- ret
-SYM_FUNC_END(__efi64_thunk)
-
-SYM_FUNC_START_LOCAL(efi_exit32)
- movq func_rt_ptr(%rip), %rax
- push %rax
- mov %rdi, %rax
- ret
-SYM_FUNC_END(efi_exit32)
+ pop %rbp
+ retq
.code32
-/*
- * EFI service pointer must be in %edi.
- *
- * The stack should represent the 32-bit calling convention.
- */
-SYM_FUNC_START_LOCAL(efi_enter32)
- movl $__KERNEL_DS, %eax
- movl %eax, %ds
- movl %eax, %es
- movl %eax, %ss
-
- call *%edi
-
- /* We must preserve return value */
- movl %eax, %edi
-
- movl 72(%esp), %eax
- pushl $__KERNEL_CS
- pushl %eax
-
+2: pushl $__KERNEL_CS
+ pushl %ebp
lret
-SYM_FUNC_END(efi_enter32)
-
- .data
- .balign 8
-func_rt_ptr: .quad 0
-efi_saved_sp: .quad 0
+SYM_CODE_END(__efi64_thunk)
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index f8f0220b6a66..88d32c06cffa 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -16,6 +16,7 @@
#include <asm/efi.h>
#include <asm/uv/uv.h>
#include <asm/cpu_device_id.h>
+#include <asm/realmode.h>
#include <asm/reboot.h>
#define EFI_MIN_RESERVE 5120
@@ -243,7 +244,7 @@ EXPORT_SYMBOL_GPL(efi_query_variable_store);
*/
void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size)
{
- phys_addr_t new_phys, new_size;
+ struct efi_memory_map_data data = { 0 };
struct efi_mem_range mr;
efi_memory_desc_t md;
int num_entries;
@@ -271,24 +272,21 @@ void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size)
num_entries = efi_memmap_split_count(&md, &mr.range);
num_entries += efi.memmap.nr_map;
- new_size = efi.memmap.desc_size * num_entries;
-
- new_phys = efi_memmap_alloc(num_entries);
- if (!new_phys) {
+ if (efi_memmap_alloc(num_entries, &data) != 0) {
pr_err("Could not allocate boot services memmap\n");
return;
}
- new = early_memremap(new_phys, new_size);
+ new = early_memremap(data.phys_map, data.size);
if (!new) {
pr_err("Failed to map new boot services memmap\n");
return;
}
efi_memmap_insert(&efi.memmap, new, &mr);
- early_memunmap(new, new_size);
+ early_memunmap(new, data.size);
- efi_memmap_install(new_phys, num_entries);
+ efi_memmap_install(&data);
e820__range_update(addr, size, E820_TYPE_RAM, E820_TYPE_RESERVED);
e820__update_table(e820_table);
}
@@ -384,10 +382,10 @@ static void __init efi_unmap_pages(efi_memory_desc_t *md)
/*
* To Do: Remove this check after adding functionality to unmap EFI boot
- * services code/data regions from direct mapping area because
- * "efi=old_map" maps EFI regions in swapper_pg_dir.
+ * services code/data regions from direct mapping area because the UV1
+ * memory map maps EFI regions in swapper_pg_dir.
*/
- if (efi_enabled(EFI_OLD_MEMMAP))
+ if (efi_have_uv1_memmap())
return;
/*
@@ -395,7 +393,7 @@ static void __init efi_unmap_pages(efi_memory_desc_t *md)
* EFI runtime calls, hence don't unmap EFI boot services code/data
* regions.
*/
- if (!efi_is_native())
+ if (efi_is_mixed())
return;
if (kernel_unmap_pages_in_pgd(pgd, pa, md->num_pages))
@@ -407,7 +405,7 @@ static void __init efi_unmap_pages(efi_memory_desc_t *md)
void __init efi_free_boot_services(void)
{
- phys_addr_t new_phys, new_size;
+ struct efi_memory_map_data data = { 0 };
efi_memory_desc_t *md;
int num_entries = 0;
void *new, *new_md;
@@ -462,14 +460,12 @@ void __init efi_free_boot_services(void)
if (!num_entries)
return;
- new_size = efi.memmap.desc_size * num_entries;
- new_phys = efi_memmap_alloc(num_entries);
- if (!new_phys) {
+ if (efi_memmap_alloc(num_entries, &data) != 0) {
pr_err("Failed to allocate new EFI memmap\n");
return;
}
- new = memremap(new_phys, new_size, MEMREMAP_WB);
+ new = memremap(data.phys_map, data.size, MEMREMAP_WB);
if (!new) {
pr_err("Failed to map new EFI memmap\n");
return;
@@ -493,7 +489,7 @@ void __init efi_free_boot_services(void)
memunmap(new);
- if (efi_memmap_install(new_phys, num_entries)) {
+ if (efi_memmap_install(&data) != 0) {
pr_err("Could not install new EFI memmap\n");
return;
}
@@ -558,7 +554,7 @@ out:
return ret;
}
-static const struct dmi_system_id sgi_uv1_dmi[] = {
+static const struct dmi_system_id sgi_uv1_dmi[] __initconst = {
{ NULL, "SGI UV1",
{ DMI_MATCH(DMI_PRODUCT_NAME, "Stoutland Platform"),
DMI_MATCH(DMI_PRODUCT_VERSION, "1.0"),
@@ -581,8 +577,15 @@ void __init efi_apply_memmap_quirks(void)
}
/* UV2+ BIOS has a fix for this issue. UV1 still needs the quirk. */
- if (dmi_check_system(sgi_uv1_dmi))
- set_bit(EFI_OLD_MEMMAP, &efi.flags);
+ if (dmi_check_system(sgi_uv1_dmi)) {
+ if (IS_ENABLED(CONFIG_X86_UV)) {
+ set_bit(EFI_UV1_MEMMAP, &efi.flags);
+ } else {
+ pr_warn("EFI runtime disabled, needs CONFIG_X86_UV=y on UV1\n");
+ clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
+ efi_memmap_unmap();
+ }
+ }
}
/*
@@ -720,7 +723,7 @@ void efi_recover_from_page_fault(unsigned long phys_addr)
/*
* Make sure that an efi runtime service caused the page fault.
* "efi_mm" cannot be used to check if the page fault had occurred
- * in the firmware context because efi=old_map doesn't use efi_pgd.
+ * in the firmware context because the UV1 memmap doesn't use efi_pgd.
*/
if (efi_rts_work.efi_rts_id == EFI_NONE)
return;
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c b/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c
index 44d1f884c3d3..139738bbdd36 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c
@@ -6,21 +6,31 @@
* Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
*/
-#include <linux/gpio.h>
-#include <linux/platform_data/tc35876x.h>
+#include <linux/gpio/machine.h>
#include <asm/intel-mid.h>
+static struct gpiod_lookup_table tc35876x_gpio_table = {
+ .dev_id = "i2c_disp_brig",
+ .table = {
+ GPIO_LOOKUP("0000:00:0c.0", -1, "bridge-reset", GPIO_ACTIVE_HIGH),
+ GPIO_LOOKUP("0000:00:0c.0", -1, "bl-en", GPIO_ACTIVE_HIGH),
+ GPIO_LOOKUP("0000:00:0c.0", -1, "vadd", GPIO_ACTIVE_HIGH),
+ { },
+ },
+};
+
/*tc35876x DSI_LVDS bridge chip and panel platform data*/
static void *tc35876x_platform_data(void *data)
{
- static struct tc35876x_platform_data pdata;
+ struct gpiod_lookup_table *table = &tc35876x_gpio_table;
+ struct gpiod_lookup *lookup = table->table;
- /* gpio pins set to -1 will not be used by the driver */
- pdata.gpio_bridge_reset = get_gpio_by_name("LCMB_RXEN");
- pdata.gpio_panel_bl_en = get_gpio_by_name("6S6P_BL_EN");
- pdata.gpio_panel_vadd = get_gpio_by_name("EN_VREG_LCD_V3P3");
+ lookup[0].chip_hwnum = get_gpio_by_name("LCMB_RXEN");
+ lookup[1].chip_hwnum = get_gpio_by_name("6S6P_BL_EN");
+ lookup[2].chip_hwnum = get_gpio_by_name("EN_VREG_LCD_V3P3");
+ gpiod_add_lookup_table(table);
- return &pdata;
+ return NULL;
}
static const struct devs_id tc35876x_dev_id __initconst = {
diff --git a/arch/x86/platform/intel-quark/imr.c b/arch/x86/platform/intel-quark/imr.c
index 6dd25dc5f027..e9d97d52475e 100644
--- a/arch/x86/platform/intel-quark/imr.c
+++ b/arch/x86/platform/intel-quark/imr.c
@@ -29,6 +29,8 @@
#include <asm/cpu_device_id.h>
#include <asm/imr.h>
#include <asm/iosf_mbi.h>
+#include <asm/io.h>
+
#include <linux/debugfs.h>
#include <linux/init.h>
#include <linux/mm.h>
diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c
index 42f879b75f9b..4307830e1b6f 100644
--- a/arch/x86/platform/intel-quark/imr_selftest.c
+++ b/arch/x86/platform/intel-quark/imr_selftest.c
@@ -14,6 +14,8 @@
#include <asm-generic/sections.h>
#include <asm/cpu_device_id.h>
#include <asm/imr.h>
+#include <asm/io.h>
+
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/types.h>
diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c
index ece9cb9c1189..607f58147311 100644
--- a/arch/x86/platform/uv/bios_uv.c
+++ b/arch/x86/platform/uv/bios_uv.c
@@ -31,13 +31,16 @@ static s64 __uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
return BIOS_STATUS_UNIMPLEMENTED;
/*
- * If EFI_OLD_MEMMAP is set, we need to fall back to using our old EFI
+ * If EFI_UV1_MEMMAP is set, we need to fall back to using our old EFI
* callback method, which uses efi_call() directly, with the kernel page tables:
*/
- if (unlikely(efi_enabled(EFI_OLD_MEMMAP)))
+ if (unlikely(efi_enabled(EFI_UV1_MEMMAP))) {
+ kernel_fpu_begin();
ret = efi_call((void *)__va(tab->function), (u64)which, a1, a2, a3, a4, a5);
- else
+ kernel_fpu_end();
+ } else {
ret = efi_call_virt_pointer(tab, function, (u64)which, a1, a2, a3, a4, a5);
+ }
return ret;
}
@@ -214,3 +217,163 @@ int uv_bios_init(void)
pr_info("UV: UVsystab: Revision:%x\n", uv_systab->revision);
return 0;
}
+
+static void __init early_code_mapping_set_exec(int executable)
+{
+ efi_memory_desc_t *md;
+
+ if (!(__supported_pte_mask & _PAGE_NX))
+ return;
+
+ /* Make EFI service code area executable */
+ for_each_efi_memory_desc(md) {
+ if (md->type == EFI_RUNTIME_SERVICES_CODE ||
+ md->type == EFI_BOOT_SERVICES_CODE)
+ efi_set_executable(md, executable);
+ }
+}
+
+void __init efi_uv1_memmap_phys_epilog(pgd_t *save_pgd)
+{
+ /*
+ * After the lock is released, the original page table is restored.
+ */
+ int pgd_idx, i;
+ int nr_pgds;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+
+ nr_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE);
+
+ for (pgd_idx = 0; pgd_idx < nr_pgds; pgd_idx++) {
+ pgd = pgd_offset_k(pgd_idx * PGDIR_SIZE);
+ set_pgd(pgd_offset_k(pgd_idx * PGDIR_SIZE), save_pgd[pgd_idx]);
+
+ if (!pgd_present(*pgd))
+ continue;
+
+ for (i = 0; i < PTRS_PER_P4D; i++) {
+ p4d = p4d_offset(pgd,
+ pgd_idx * PGDIR_SIZE + i * P4D_SIZE);
+
+ if (!p4d_present(*p4d))
+ continue;
+
+ pud = (pud_t *)p4d_page_vaddr(*p4d);
+ pud_free(&init_mm, pud);
+ }
+
+ p4d = (p4d_t *)pgd_page_vaddr(*pgd);
+ p4d_free(&init_mm, p4d);
+ }
+
+ kfree(save_pgd);
+
+ __flush_tlb_all();
+ early_code_mapping_set_exec(0);
+}
+
+pgd_t * __init efi_uv1_memmap_phys_prolog(void)
+{
+ unsigned long vaddr, addr_pgd, addr_p4d, addr_pud;
+ pgd_t *save_pgd, *pgd_k, *pgd_efi;
+ p4d_t *p4d, *p4d_k, *p4d_efi;
+ pud_t *pud;
+
+ int pgd;
+ int n_pgds, i, j;
+
+ early_code_mapping_set_exec(1);
+
+ n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT), PGDIR_SIZE);
+ save_pgd = kmalloc_array(n_pgds, sizeof(*save_pgd), GFP_KERNEL);
+ if (!save_pgd)
+ return NULL;
+
+ /*
+ * Build 1:1 identity mapping for UV1 memmap usage. Note that
+ * PAGE_OFFSET is PGDIR_SIZE aligned when KASLR is disabled, while
+ * it is PUD_SIZE ALIGNED with KASLR enabled. So for a given physical
+ * address X, the pud_index(X) != pud_index(__va(X)), we can only copy
+ * PUD entry of __va(X) to fill in pud entry of X to build 1:1 mapping.
+ * This means here we can only reuse the PMD tables of the direct mapping.
+ */
+ for (pgd = 0; pgd < n_pgds; pgd++) {
+ addr_pgd = (unsigned long)(pgd * PGDIR_SIZE);
+ vaddr = (unsigned long)__va(pgd * PGDIR_SIZE);
+ pgd_efi = pgd_offset_k(addr_pgd);
+ save_pgd[pgd] = *pgd_efi;
+
+ p4d = p4d_alloc(&init_mm, pgd_efi, addr_pgd);
+ if (!p4d) {
+ pr_err("Failed to allocate p4d table!\n");
+ goto out;
+ }
+
+ for (i = 0; i < PTRS_PER_P4D; i++) {
+ addr_p4d = addr_pgd + i * P4D_SIZE;
+ p4d_efi = p4d + p4d_index(addr_p4d);
+
+ pud = pud_alloc(&init_mm, p4d_efi, addr_p4d);
+ if (!pud) {
+ pr_err("Failed to allocate pud table!\n");
+ goto out;
+ }
+
+ for (j = 0; j < PTRS_PER_PUD; j++) {
+ addr_pud = addr_p4d + j * PUD_SIZE;
+
+ if (addr_pud > (max_pfn << PAGE_SHIFT))
+ break;
+
+ vaddr = (unsigned long)__va(addr_pud);
+
+ pgd_k = pgd_offset_k(vaddr);
+ p4d_k = p4d_offset(pgd_k, vaddr);
+ pud[j] = *pud_offset(p4d_k, vaddr);
+ }
+ }
+ pgd_offset_k(pgd * PGDIR_SIZE)->pgd &= ~_PAGE_NX;
+ }
+
+ __flush_tlb_all();
+ return save_pgd;
+out:
+ efi_uv1_memmap_phys_epilog(save_pgd);
+ return NULL;
+}
+
+void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
+ u32 type, u64 attribute)
+{
+ unsigned long last_map_pfn;
+
+ if (type == EFI_MEMORY_MAPPED_IO)
+ return ioremap(phys_addr, size);
+
+ last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size);
+ if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) {
+ unsigned long top = last_map_pfn << PAGE_SHIFT;
+ efi_ioremap(top, size - (top - phys_addr), type, attribute);
+ }
+
+ if (!(attribute & EFI_MEMORY_WB))
+ efi_memory_uc((u64)(unsigned long)__va(phys_addr), size);
+
+ return (void __iomem *)__va(phys_addr);
+}
+
+static int __init arch_parse_efi_cmdline(char *str)
+{
+ if (!str) {
+ pr_warn("need at least one option\n");
+ return -EINVAL;
+ }
+
+ if (!efi_is_mixed() && parse_option_str(str, "old_map"))
+ set_bit(EFI_UV1_MEMMAP, &efi.flags);
+
+ return 0;
+}
+early_param("efi", arch_parse_efi_cmdline);
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 5f0a96bf27a1..1fd321f37f1b 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1668,12 +1668,12 @@ static int tunables_open(struct inode *inode, struct file *file)
return 0;
}
-static const struct file_operations proc_uv_ptc_operations = {
- .open = ptc_proc_open,
- .read = seq_read,
- .write = ptc_proc_write,
- .llseek = seq_lseek,
- .release = seq_release,
+static const struct proc_ops uv_ptc_proc_ops = {
+ .proc_open = ptc_proc_open,
+ .proc_read = seq_read,
+ .proc_write = ptc_proc_write,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release,
};
static const struct file_operations tunables_fops = {
@@ -1691,7 +1691,7 @@ static int __init uv_ptc_init(void)
return 0;
proc_uv_ptc = proc_create(UV_PTC_BASENAME, 0444, NULL,
- &proc_uv_ptc_operations);
+ &uv_ptc_proc_ops);
if (!proc_uv_ptc) {
pr_err("unable to create %s proc entry\n",
UV_PTC_BASENAME);
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index f60501a384f9..99b6332ba540 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -12,7 +12,7 @@ OBJECT_FILES_NON_STANDARD := y
# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
KCOV_INSTRUMENT := n
-always := realmode.bin realmode.relocs
+always-y := realmode.bin realmode.relocs
wakeup-objs := wakeup_asm.o wakemain.o video-mode.o
wakeup-objs += copy.o bioscall.o regs.o
diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile
index 09af7ff53044..55b1ab378974 100644
--- a/arch/x86/tools/Makefile
+++ b/arch/x86/tools/Makefile
@@ -26,7 +26,7 @@ posttest: $(obj)/insn_decoder_test vmlinux $(obj)/insn_sanity
$(call cmd,posttest)
$(call cmd,sanitytest)
-hostprogs-y += insn_decoder_test insn_sanity
+hostprogs += insn_decoder_test insn_sanity
# -I needed for generated C source and C source which in the kernel tree.
HOSTCFLAGS_insn_decoder_test.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/uapi/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/uapi/
@@ -39,7 +39,7 @@ $(obj)/insn_decoder_test.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/l
$(obj)/insn_sanity.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
HOST_EXTRACFLAGS += -I$(srctree)/tools/include
-hostprogs-y += relocs
+hostprogs += relocs
relocs-objs := relocs_32.o relocs_64.o relocs_common.o
PHONY += relocs
relocs: $(obj)/relocs
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index ba5a41828e9d..1aded63a95cb 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -62,10 +62,10 @@ config XEN_512GB
boot parameter "xen_512gb_limit".
config XEN_SAVE_RESTORE
- bool
- depends on XEN
- select HIBERNATE_CALLBACKS
- default y
+ bool
+ depends on XEN
+ select HIBERNATE_CALLBACKS
+ default y
config XEN_DEBUG_FS
bool "Enable Xen debug and tuning parameters in debugfs"
diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c
index a04551ee5568..1abe455d926a 100644
--- a/arch/x86/xen/efi.c
+++ b/arch/x86/xen/efi.c
@@ -31,7 +31,7 @@ static efi_system_table_t efi_systab_xen __initdata = {
.con_in_handle = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */
.con_in = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */
.con_out_handle = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */
- .con_out = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */
+ .con_out = NULL, /* Not used under Xen. */
.stderr_handle = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */
.stderr = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */
.runtime = (efi_runtime_services_t *)EFI_INVALID_TABLE_ADDR,
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index ae4a41ca19f6..507f4fb88fa7 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -72,6 +72,9 @@
#include <asm/mwait.h>
#include <asm/pci_x86.h>
#include <asm/cpu.h>
+#ifdef CONFIG_X86_IOPL_IOPERM
+#include <asm/io_bitmap.h>
+#endif
#ifdef CONFIG_ACPI
#include <linux/acpi.h>
@@ -837,6 +840,25 @@ static void xen_load_sp0(unsigned long sp0)
this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
}
+#ifdef CONFIG_X86_IOPL_IOPERM
+static void xen_update_io_bitmap(void)
+{
+ struct physdev_set_iobitmap iobitmap;
+ struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
+
+ native_tss_update_io_bitmap();
+
+ iobitmap.bitmap = (uint8_t *)(&tss->x86_tss) +
+ tss->x86_tss.io_bitmap_base;
+ if (tss->x86_tss.io_bitmap_base == IO_BITMAP_OFFSET_INVALID)
+ iobitmap.nr_ports = 0;
+ else
+ iobitmap.nr_ports = IO_BITMAP_BITS;
+
+ HYPERVISOR_physdev_op(PHYSDEVOP_set_iobitmap, &iobitmap);
+}
+#endif
+
static void xen_io_delay(void)
{
}
@@ -896,14 +918,15 @@ static u64 xen_read_msr_safe(unsigned int msr, int *err)
static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
{
int ret;
+#ifdef CONFIG_X86_64
+ unsigned int which;
+ u64 base;
+#endif
ret = 0;
switch (msr) {
#ifdef CONFIG_X86_64
- unsigned which;
- u64 base;
-
case MSR_FS_BASE: which = SEGBASE_FS; goto set;
case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set;
case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set;
@@ -1046,6 +1069,9 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.write_idt_entry = xen_write_idt_entry,
.load_sp0 = xen_load_sp0,
+#ifdef CONFIG_X86_IOPL_IOPERM
+ .update_io_bitmap = xen_update_io_bitmap,
+#endif
.io_delay = xen_io_delay,
/* Xen takes care of %gs when switching to usermode for us */
@@ -1205,6 +1231,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
x86_platform.get_nmi_reason = xen_get_nmi_reason;
x86_init.resources.memory_setup = xen_memory_setup;
+ x86_init.irqs.intr_mode_select = x86_init_noop;
x86_init.irqs.intr_mode_init = x86_init_noop;
x86_init.oem.arch_setup = xen_arch_setup;
x86_init.oem.banner = xen_banner;
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index c8dbee62ec2a..bbba8b17829a 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -67,7 +67,7 @@
#include <asm/linkage.h>
#include <asm/page.h>
#include <asm/init.h>
-#include <asm/pat.h>
+#include <asm/memtype.h>
#include <asm/smp.h>
#include <asm/tlb.h>