aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig31
-rw-r--r--arch/x86/boot/Makefile10
-rw-r--r--arch/x86/boot/compressed/Makefile6
-rw-r--r--arch/x86/configs/i386_defconfig1
-rw-r--r--arch/x86/configs/x86_64_defconfig1
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S75
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c11
-rw-r--r--arch/x86/crypto/camellia-aesni-avx-asm_64.S15
-rw-r--r--arch/x86/crypto/camellia-aesni-avx2-asm_64.S15
-rw-r--r--arch/x86/crypto/camellia_glue.c10
-rw-r--r--arch/x86/crypto/cast5-avx-x86_64-asm_64.S9
-rw-r--r--arch/x86/crypto/cast6-avx-x86_64-asm_64.S13
-rw-r--r--arch/x86/crypto/cast6_avx_glue.c10
-rw-r--r--arch/x86/crypto/crc32c-pcl-intel-asm_64.S8
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_asm.S5
-rw-r--r--arch/x86/crypto/serpent-avx-x86_64-asm_64.S13
-rw-r--r--arch/x86/crypto/serpent-avx2-asm_64.S13
-rw-r--r--arch/x86/crypto/serpent_avx_glue.c11
-rw-r--r--arch/x86/crypto/serpent_sse2_glue.c11
-rw-r--r--arch/x86/crypto/sha-mb/sha1_mb.c35
-rw-r--r--arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S35
-rw-r--r--arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S38
-rw-r--r--arch/x86/crypto/twofish-avx-x86_64-asm_64.S13
-rw-r--r--arch/x86/crypto/twofish_glue_3way.c10
-rw-r--r--arch/x86/entry/Makefile4
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl2
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl2
-rw-r--r--arch/x86/entry/thunk_64.S4
-rw-r--r--arch/x86/entry/vdso/Makefile9
-rw-r--r--arch/x86/entry/vdso/vclock_gettime.c2
-rw-r--r--arch/x86/events/Makefile1
-rw-r--r--arch/x86/events/amd/ibs.c37
-rw-r--r--arch/x86/events/amd/iommu.c5
-rw-r--r--arch/x86/events/amd/power.c353
-rw-r--r--arch/x86/events/core.c8
-rw-r--r--arch/x86/events/intel/cqm.c454
-rw-r--r--arch/x86/events/intel/ds.c5
-rw-r--r--arch/x86/events/intel/lbr.c2
-rw-r--r--arch/x86/events/intel/rapl.c2
-rw-r--r--arch/x86/events/intel/uncore_snbep.c7
-rw-r--r--arch/x86/events/perf_event.h2
-rw-r--r--arch/x86/include/asm/apic.h2
-rw-r--r--arch/x86/include/asm/checksum_32.h9
-rw-r--r--arch/x86/include/asm/checksum_64.h10
-rw-r--r--arch/x86/include/asm/compat.h3
-rw-r--r--arch/x86/include/asm/cpufeature.h55
-rw-r--r--arch/x86/include/asm/cpufeatures.h10
-rw-r--r--arch/x86/include/asm/disabled-features.h15
-rw-r--r--arch/x86/include/asm/efi.h30
-rw-r--r--arch/x86/include/asm/fpu/internal.h2
-rw-r--r--arch/x86/include/asm/fpu/types.h12
-rw-r--r--arch/x86/include/asm/fpu/xstate.h3
-rw-r--r--arch/x86/include/asm/ftrace.h4
-rw-r--r--arch/x86/include/asm/gpio.h4
-rw-r--r--arch/x86/include/asm/hw_irq.h15
-rw-r--r--arch/x86/include/asm/kvm_host.h14
-rw-r--r--arch/x86/include/asm/livepatch.h4
-rw-r--r--arch/x86/include/asm/mmu_context.h85
-rw-r--r--arch/x86/include/asm/msr.h8
-rw-r--r--arch/x86/include/asm/paravirt.h9
-rw-r--r--arch/x86/include/asm/paravirt_types.h18
-rw-r--r--arch/x86/include/asm/pci.h18
-rw-r--r--arch/x86/include/asm/pgtable.h44
-rw-r--r--arch/x86/include/asm/pgtable_types.h39
-rw-r--r--arch/x86/include/asm/pkeys.h34
-rw-r--r--arch/x86/include/asm/pmem.h5
-rw-r--r--arch/x86/include/asm/preempt.h13
-rw-r--r--arch/x86/include/asm/qspinlock_paravirt.h4
-rw-r--r--arch/x86/include/asm/required-features.h7
-rw-r--r--arch/x86/include/asm/rwsem.h2
-rw-r--r--arch/x86/include/asm/special_insns.h38
-rw-r--r--arch/x86/include/asm/stacktrace.h2
-rw-r--r--arch/x86/include/asm/string_64.h4
-rw-r--r--arch/x86/include/asm/uaccess.h10
-rw-r--r--arch/x86/include/asm/xen/hypercall.h5
-rw-r--r--arch/x86/include/asm/xen/hypervisor.h2
-rw-r--r--arch/x86/include/uapi/asm/mman.h22
-rw-r--r--arch/x86/include/uapi/asm/processor-flags.h2
-rw-r--r--arch/x86/kernel/Makefile17
-rw-r--r--arch/x86/kernel/acpi/boot.c4
-rw-r--r--arch/x86/kernel/acpi/wakeup_64.S3
-rw-r--r--arch/x86/kernel/apb_timer.c2
-rw-r--r--arch/x86/kernel/aperture_64.c12
-rw-r--r--arch/x86/kernel/apic/Makefile4
-rw-r--r--arch/x86/kernel/apic/apic.c2
-rw-r--r--arch/x86/kernel/apic/vector.c88
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c5
-rw-r--r--arch/x86/kernel/apm_32.c2
-rw-r--r--arch/x86/kernel/cpu/Makefile4
-rw-r--r--arch/x86/kernel/cpu/amd.c25
-rw-r--r--arch/x86/kernel/cpu/common.c52
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c2
-rw-r--r--arch/x86/kernel/dumpstack.c6
-rw-r--r--arch/x86/kernel/early_printk.c2
-rw-r--r--arch/x86/kernel/fpu/core.c63
-rw-r--r--arch/x86/kernel/fpu/regset.c2
-rw-r--r--arch/x86/kernel/fpu/xstate.c185
-rw-r--r--arch/x86/kernel/ftrace.c2
-rw-r--r--arch/x86/kernel/hpet.c2
-rw-r--r--arch/x86/kernel/ioport.c12
-rw-r--r--arch/x86/kernel/kexec-bzimage64.c2
-rw-r--r--arch/x86/kernel/kgdb.c4
-rw-r--r--arch/x86/kernel/kprobes/core.c59
-rw-r--r--arch/x86/kernel/kvm.c37
-rw-r--r--arch/x86/kernel/kvmclock.c2
-rw-r--r--arch/x86/kernel/ldt.c4
-rw-r--r--arch/x86/kernel/process_64.c16
-rw-r--r--arch/x86/kernel/setup.c9
-rw-r--r--arch/x86/kernel/smpboot.c27
-rw-r--r--arch/x86/kernel/stacktrace.c18
-rw-r--r--arch/x86/kernel/tboot.c2
-rw-r--r--arch/x86/kernel/tsc.c8
-rw-r--r--arch/x86/kernel/vmlinux.lds.S14
-rw-r--r--arch/x86/kvm/cpuid.c63
-rw-r--r--arch/x86/kvm/cpuid.h8
-rw-r--r--arch/x86/kvm/emulate.c31
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h5
-rw-r--r--arch/x86/kvm/mmu.c106
-rw-r--r--arch/x86/kvm/mmu.h42
-rw-r--r--arch/x86/kvm/page_track.c9
-rw-r--r--arch/x86/kvm/paging_tmpl.h30
-rw-r--r--arch/x86/kvm/svm.c8
-rw-r--r--arch/x86/kvm/vmx.c71
-rw-r--r--arch/x86/kvm/x86.c26
-rw-r--r--arch/x86/kvm/x86.h3
-rw-r--r--arch/x86/lib/Makefile3
-rw-r--r--arch/x86/lib/csum-wrappers_64.c2
-rw-r--r--arch/x86/lib/insn.c6
-rw-r--r--arch/x86/lib/memcpy_64.S7
-rw-r--r--arch/x86/lib/memset_64.S2
-rw-r--r--arch/x86/lib/rwsem.S11
-rw-r--r--arch/x86/mm/Makefile5
-rw-r--r--arch/x86/mm/extable.c108
-rw-r--r--arch/x86/mm/fault.c150
-rw-r--r--arch/x86/mm/gup.c47
-rw-r--r--arch/x86/mm/mpx.c8
-rw-r--r--arch/x86/mm/pageattr.c34
-rw-r--r--arch/x86/mm/pat.c2
-rw-r--r--arch/x86/mm/pkeys.c101
-rw-r--r--arch/x86/net/bpf_jit.S48
-rw-r--r--arch/x86/oprofile/backtrace.c3
-rw-r--r--arch/x86/oprofile/nmi_int.c3
-rw-r--r--arch/x86/pci/common.c1
-rw-r--r--arch/x86/pci/fixup.c28
-rw-r--r--arch/x86/pci/vmd.c35
-rw-r--r--arch/x86/platform/efi/Makefile2
-rw-r--r--arch/x86/platform/efi/efi-bgrt.c52
-rw-r--r--arch/x86/platform/efi/efi.c67
-rw-r--r--arch/x86/platform/efi/efi_32.c7
-rw-r--r--arch/x86/platform/efi/efi_64.c206
-rw-r--r--arch/x86/platform/efi/efi_stub_64.S46
-rw-r--r--arch/x86/platform/efi/quirks.c37
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_bma023.c2
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_emc1403.c2
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c2
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_lis331.c2
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_max7315.c2
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c2
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_msic.c2
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c2
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c2
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c2
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c2
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c2
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c2
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c2
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c2
-rw-r--r--arch/x86/platform/intel-mid/device_libs/platform_tca6416.c2
-rw-r--r--arch/x86/power/hibernate_asm_64.S7
-rw-r--r--arch/x86/purgatory/Makefile2
-rw-r--r--arch/x86/purgatory/stack.S2
-rw-r--r--arch/x86/realmode/Makefile4
-rw-r--r--arch/x86/realmode/rm/Makefile6
-rw-r--r--arch/x86/um/asm/checksum.h9
-rw-r--r--arch/x86/um/asm/checksum_32.h2
-rw-r--r--arch/x86/video/fbdev.c18
-rw-r--r--arch/x86/xen/enlighten.c5
-rw-r--r--arch/x86/xen/mmu.c8
-rw-r--r--arch/x86/xen/xen-asm.S10
-rw-r--r--arch/x86/xen/xen-asm_64.S1
-rw-r--r--arch/x86/xen/xen-head.S21
181 files changed, 3207 insertions, 902 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8f2e6659281b..2dc18605831f 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -28,6 +28,7 @@ config X86
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FAST_MULTIPLIER
select ARCH_HAS_GCOV_PROFILE_ALL
+ select ARCH_HAS_KCOV if X86_64
select ARCH_HAS_PMEM_API if X86_64
select ARCH_HAS_MMIO_FLUSH
select ARCH_HAS_SG_CHAIN
@@ -155,6 +156,9 @@ config X86
select VIRT_TO_BUS
select X86_DEV_DMA_OPS if X86_64
select X86_FEATURE_NAMES if PROC_FS
+ select HAVE_STACK_VALIDATION if X86_64
+ select ARCH_USES_HIGH_VMA_FLAGS if X86_INTEL_MEMORY_PROTECTION_KEYS
+ select ARCH_HAS_PKEYS if X86_INTEL_MEMORY_PROTECTION_KEYS
config INSTRUCTION_DECODER
def_bool y
@@ -1206,6 +1210,15 @@ config MICROCODE_OLD_INTERFACE
def_bool y
depends on MICROCODE
+config PERF_EVENTS_AMD_POWER
+ depends on PERF_EVENTS && CPU_SUP_AMD
+ tristate "AMD Processor Power Reporting Mechanism"
+ ---help---
+ Provide power reporting mechanism support for AMD processors.
+ Currently, it leverages X86_FEATURE_ACC_POWER
+ (CPUID Fn8000_0007_EDX[12]) interface to calculate the
+ average power consumption on Family 15h processors.
+
config X86_MSR
tristate "/dev/cpu/*/msr - Model-specific register support"
---help---
@@ -1718,6 +1731,20 @@ config X86_INTEL_MPX
If unsure, say N.
+config X86_INTEL_MEMORY_PROTECTION_KEYS
+ prompt "Intel Memory Protection Keys"
+ def_bool y
+ # Note: only available in 64-bit mode
+ depends on CPU_SUP_INTEL && X86_64
+ ---help---
+ Memory Protection Keys provides a mechanism for enforcing
+ page-based protections, but without requiring modification of the
+ page tables when an application changes protection domains.
+
+ For details, see Documentation/x86/protection-keys.txt
+
+ If unsure, say y.
+
config EFI
bool "EFI runtime service support"
depends on ACPI
@@ -2435,8 +2462,6 @@ config PCI_CNB20LE_QUIRK
You should say N unless you know you need this.
-source "drivers/pci/pcie/Kconfig"
-
source "drivers/pci/Kconfig"
# x86_64 have no ISA slots, but can have ISA-style DMA.
@@ -2592,8 +2617,6 @@ config AMD_NB
source "drivers/pcmcia/Kconfig"
-source "drivers/pci/hotplug/Kconfig"
-
config RAPIDIO
tristate "RapidIO support"
depends on PCI
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index bbe1a62efc02..b1ef9e489084 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -9,7 +9,15 @@
# Changed by many, many contributors over the years.
#
-KASAN_SANITIZE := n
+KASAN_SANITIZE := n
+OBJECT_FILES_NON_STANDARD := y
+
+# Kernel does not boot with kcov instrumentation here.
+# One of the problems observed was insertion of __sanitizer_cov_trace_pc()
+# callback into middle of per-cpu data enabling code. Thus the callback observed
+# inconsistent state and crashed. We are interested mostly in syscall coverage,
+# so boot code is not interesting anyway.
+KCOV_INSTRUMENT := n
# If you want to preset the SVGA mode, uncomment the next line and
# set SVGA_MODE to whatever number you want.
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index f9ce75d80101..6915ff2bd996 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -16,7 +16,11 @@
# (see scripts/Makefile.lib size_append)
# compressed vmlinux.bin.all + u32 size of vmlinux.bin.all
-KASAN_SANITIZE := n
+KASAN_SANITIZE := n
+OBJECT_FILES_NON_STANDARD := y
+
+# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
+KCOV_INSTRUMENT := n
targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \
vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index e25a1630320c..265901a84f3f 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -303,7 +303,6 @@ CONFIG_DEBUG_STACKOVERFLOW=y
# CONFIG_DEBUG_RODATA_TEST is not set
CONFIG_DEBUG_BOOT_PARAMS=y
CONFIG_OPTIMIZE_INLINING=y
-CONFIG_KEYS_DEBUG_PROC_KEYS=y
CONFIG_SECURITY=y
CONFIG_SECURITY_NETWORK=y
CONFIG_SECURITY_SELINUX=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index cb5b3ab5beec..4f404a64681b 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -300,7 +300,6 @@ CONFIG_DEBUG_STACKOVERFLOW=y
# CONFIG_DEBUG_RODATA_TEST is not set
CONFIG_DEBUG_BOOT_PARAMS=y
CONFIG_OPTIMIZE_INLINING=y
-CONFIG_KEYS_DEBUG_PROC_KEYS=y
CONFIG_SECURITY=y
CONFIG_SECURITY_NETWORK=y
CONFIG_SECURITY_SELINUX=y
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 6bd2c6c95373..383a6f84a060 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -31,6 +31,7 @@
#include <linux/linkage.h>
#include <asm/inst.h>
+#include <asm/frame.h>
/*
* The following macros are used to move an (un)aligned 16 byte value to/from
@@ -1800,11 +1801,12 @@ ENDPROC(_key_expansion_256b)
* unsigned int key_len)
*/
ENTRY(aesni_set_key)
+ FRAME_BEGIN
#ifndef __x86_64__
pushl KEYP
- movl 8(%esp), KEYP # ctx
- movl 12(%esp), UKEYP # in_key
- movl 16(%esp), %edx # key_len
+ movl (FRAME_OFFSET+8)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+12)(%esp), UKEYP # in_key
+ movl (FRAME_OFFSET+16)(%esp), %edx # key_len
#endif
movups (UKEYP), %xmm0 # user key (first 16 bytes)
movaps %xmm0, (KEYP)
@@ -1905,6 +1907,7 @@ ENTRY(aesni_set_key)
#ifndef __x86_64__
popl KEYP
#endif
+ FRAME_END
ret
ENDPROC(aesni_set_key)
@@ -1912,12 +1915,13 @@ ENDPROC(aesni_set_key)
* void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
*/
ENTRY(aesni_enc)
+ FRAME_BEGIN
#ifndef __x86_64__
pushl KEYP
pushl KLEN
- movl 12(%esp), KEYP
- movl 16(%esp), OUTP
- movl 20(%esp), INP
+ movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+16)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+20)(%esp), INP # src
#endif
movl 480(KEYP), KLEN # key length
movups (INP), STATE # input
@@ -1927,6 +1931,7 @@ ENTRY(aesni_enc)
popl KLEN
popl KEYP
#endif
+ FRAME_END
ret
ENDPROC(aesni_enc)
@@ -2101,12 +2106,13 @@ ENDPROC(_aesni_enc4)
* void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
*/
ENTRY(aesni_dec)
+ FRAME_BEGIN
#ifndef __x86_64__
pushl KEYP
pushl KLEN
- movl 12(%esp), KEYP
- movl 16(%esp), OUTP
- movl 20(%esp), INP
+ movl (FRAME_OFFSET+12)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+16)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+20)(%esp), INP # src
#endif
mov 480(KEYP), KLEN # key length
add $240, KEYP
@@ -2117,6 +2123,7 @@ ENTRY(aesni_dec)
popl KLEN
popl KEYP
#endif
+ FRAME_END
ret
ENDPROC(aesni_dec)
@@ -2292,14 +2299,15 @@ ENDPROC(_aesni_dec4)
* size_t len)
*/
ENTRY(aesni_ecb_enc)
+ FRAME_BEGIN
#ifndef __x86_64__
pushl LEN
pushl KEYP
pushl KLEN
- movl 16(%esp), KEYP
- movl 20(%esp), OUTP
- movl 24(%esp), INP
- movl 28(%esp), LEN
+ movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+20)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+24)(%esp), INP # src
+ movl (FRAME_OFFSET+28)(%esp), LEN # len
#endif
test LEN, LEN # check length
jz .Lecb_enc_ret
@@ -2342,6 +2350,7 @@ ENTRY(aesni_ecb_enc)
popl KEYP
popl LEN
#endif
+ FRAME_END
ret
ENDPROC(aesni_ecb_enc)
@@ -2350,14 +2359,15 @@ ENDPROC(aesni_ecb_enc)
* size_t len);
*/
ENTRY(aesni_ecb_dec)
+ FRAME_BEGIN
#ifndef __x86_64__
pushl LEN
pushl KEYP
pushl KLEN
- movl 16(%esp), KEYP
- movl 20(%esp), OUTP
- movl 24(%esp), INP
- movl 28(%esp), LEN
+ movl (FRAME_OFFSET+16)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+20)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+24)(%esp), INP # src
+ movl (FRAME_OFFSET+28)(%esp), LEN # len
#endif
test LEN, LEN
jz .Lecb_dec_ret
@@ -2401,6 +2411,7 @@ ENTRY(aesni_ecb_dec)
popl KEYP
popl LEN
#endif
+ FRAME_END
ret
ENDPROC(aesni_ecb_dec)
@@ -2409,16 +2420,17 @@ ENDPROC(aesni_ecb_dec)
* size_t len, u8 *iv)
*/
ENTRY(aesni_cbc_enc)
+ FRAME_BEGIN
#ifndef __x86_64__
pushl IVP
pushl LEN
pushl KEYP
pushl KLEN
- movl 20(%esp), KEYP
- movl 24(%esp), OUTP
- movl 28(%esp), INP
- movl 32(%esp), LEN
- movl 36(%esp), IVP
+ movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+24)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+28)(%esp), INP # src
+ movl (FRAME_OFFSET+32)(%esp), LEN # len
+ movl (FRAME_OFFSET+36)(%esp), IVP # iv
#endif
cmp $16, LEN
jb .Lcbc_enc_ret
@@ -2443,6 +2455,7 @@ ENTRY(aesni_cbc_enc)
popl LEN
popl IVP
#endif
+ FRAME_END
ret
ENDPROC(aesni_cbc_enc)
@@ -2451,16 +2464,17 @@ ENDPROC(aesni_cbc_enc)
* size_t len, u8 *iv)
*/
ENTRY(aesni_cbc_dec)
+ FRAME_BEGIN
#ifndef __x86_64__
pushl IVP
pushl LEN
pushl KEYP
pushl KLEN
- movl 20(%esp), KEYP
- movl 24(%esp), OUTP
- movl 28(%esp), INP
- movl 32(%esp), LEN
- movl 36(%esp), IVP
+ movl (FRAME_OFFSET+20)(%esp), KEYP # ctx
+ movl (FRAME_OFFSET+24)(%esp), OUTP # dst
+ movl (FRAME_OFFSET+28)(%esp), INP # src
+ movl (FRAME_OFFSET+32)(%esp), LEN # len
+ movl (FRAME_OFFSET+36)(%esp), IVP # iv
#endif
cmp $16, LEN
jb .Lcbc_dec_just_ret
@@ -2534,13 +2548,16 @@ ENTRY(aesni_cbc_dec)
popl LEN
popl IVP
#endif
+ FRAME_END
ret
ENDPROC(aesni_cbc_dec)
#ifdef __x86_64__
+.pushsection .rodata
.align 16
.Lbswap_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.popsection
/*
* _aesni_inc_init: internal ABI
@@ -2598,6 +2615,7 @@ ENDPROC(_aesni_inc)
* size_t len, u8 *iv)
*/
ENTRY(aesni_ctr_enc)
+ FRAME_BEGIN
cmp $16, LEN
jb .Lctr_enc_just_ret
mov 480(KEYP), KLEN
@@ -2651,6 +2669,7 @@ ENTRY(aesni_ctr_enc)
.Lctr_enc_ret:
movups IV, (IVP)
.Lctr_enc_just_ret:
+ FRAME_END
ret
ENDPROC(aesni_ctr_enc)
@@ -2677,6 +2696,7 @@ ENDPROC(aesni_ctr_enc)
* bool enc, u8 *iv)
*/
ENTRY(aesni_xts_crypt8)
+ FRAME_BEGIN
cmpb $0, %cl
movl $0, %ecx
movl $240, %r10d
@@ -2777,6 +2797,7 @@ ENTRY(aesni_xts_crypt8)
pxor INC, STATE4
movdqu STATE4, 0x70(OUTP)
+ FRAME_END
ret
ENDPROC(aesni_xts_crypt8)
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 3633ad6145c5..064c7e2bd7c8 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -639,16 +639,11 @@ static int xts_aesni_setkey(struct crypto_tfm *tfm, const u8 *key,
unsigned int keylen)
{
struct aesni_xts_ctx *ctx = crypto_tfm_ctx(tfm);
- u32 *flags = &tfm->crt_flags;
int err;
- /* key consists of keys of equal size concatenated, therefore
- * the length must be even
- */
- if (keylen % 2) {
- *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
- return -EINVAL;
- }
+ err = xts_check_key(tfm, key, keylen);
+ if (err)
+ return err;
/* first half of xts-key is for crypt */
err = aes_set_key_common(tfm, ctx->raw_crypt_ctx, key, keylen / 2);
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
index ce71f9212409..aa9e8bd163f6 100644
--- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
@@ -16,6 +16,7 @@
*/
#include <linux/linkage.h>
+#include <asm/frame.h>
#define CAMELLIA_TABLE_BYTE_LEN 272
@@ -726,6 +727,7 @@ __camellia_enc_blk16:
* %xmm0..%xmm15: 16 encrypted blocks, order swapped:
* 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
*/
+ FRAME_BEGIN
leaq 8 * 16(%rax), %rcx;
@@ -780,6 +782,7 @@ __camellia_enc_blk16:
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
%xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));
+ FRAME_END
ret;
.align 8
@@ -812,6 +815,7 @@ __camellia_dec_blk16:
* %xmm0..%xmm15: 16 plaintext blocks, order swapped:
* 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
*/
+ FRAME_BEGIN
leaq 8 * 16(%rax), %rcx;
@@ -865,6 +869,7 @@ __camellia_dec_blk16:
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
%xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));
+ FRAME_END
ret;
.align 8
@@ -890,6 +895,7 @@ ENTRY(camellia_ecb_enc_16way)
* %rsi: dst (16 blocks)
* %rdx: src (16 blocks)
*/
+ FRAME_BEGIN
inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
@@ -904,6 +910,7 @@ ENTRY(camellia_ecb_enc_16way)
%xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
%xmm8, %rsi);
+ FRAME_END
ret;
ENDPROC(camellia_ecb_enc_16way)
@@ -913,6 +920,7 @@ ENTRY(camellia_ecb_dec_16way)
* %rsi: dst (16 blocks)
* %rdx: src (16 blocks)
*/
+ FRAME_BEGIN
cmpl $16, key_length(CTX);
movl $32, %r8d;
@@ -932,6 +940,7 @@ ENTRY(camellia_ecb_dec_16way)
%xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
%xmm8, %rsi);
+ FRAME_END
ret;
ENDPROC(camellia_ecb_dec_16way)
@@ -941,6 +950,7 @@ ENTRY(camellia_cbc_dec_16way)
* %rsi: dst (16 blocks)
* %rdx: src (16 blocks)
*/
+ FRAME_BEGIN
cmpl $16, key_length(CTX);
movl $32, %r8d;
@@ -981,6 +991,7 @@ ENTRY(camellia_cbc_dec_16way)
%xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
%xmm8, %rsi);
+ FRAME_END
ret;
ENDPROC(camellia_cbc_dec_16way)
@@ -997,6 +1008,7 @@ ENTRY(camellia_ctr_16way)
* %rdx: src (16 blocks)
* %rcx: iv (little endian, 128bit)
*/
+ FRAME_BEGIN
subq $(16 * 16), %rsp;
movq %rsp, %rax;
@@ -1092,6 +1104,7 @@ ENTRY(camellia_ctr_16way)
%xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
%xmm8, %rsi);
+ FRAME_END
ret;
ENDPROC(camellia_ctr_16way)
@@ -1112,6 +1125,7 @@ camellia_xts_crypt_16way:
* %r8: index for input whitening key
* %r9: pointer to __camellia_enc_blk16 or __camellia_dec_blk16
*/
+ FRAME_BEGIN
subq $(16 * 16), %rsp;
movq %rsp, %rax;
@@ -1234,6 +1248,7 @@ camellia_xts_crypt_16way:
%xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
%xmm8, %rsi);
+ FRAME_END
ret;
ENDPROC(camellia_xts_crypt_16way)
diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
index 0e0b8863a34b..16186c18656d 100644
--- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
@@ -11,6 +11,7 @@
*/
#include <linux/linkage.h>
+#include <asm/frame.h>
#define CAMELLIA_TABLE_BYTE_LEN 272
@@ -766,6 +767,7 @@ __camellia_enc_blk32:
* %ymm0..%ymm15: 32 encrypted blocks, order swapped:
* 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
*/
+ FRAME_BEGIN
leaq 8 * 32(%rax), %rcx;
@@ -820,6 +822,7 @@ __camellia_enc_blk32:
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
%ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));
+ FRAME_END
ret;
.align 8
@@ -852,6 +855,7 @@ __camellia_dec_blk32:
* %ymm0..%ymm15: 16 plaintext blocks, order swapped:
* 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
*/
+ FRAME_BEGIN
leaq 8 * 32(%rax), %rcx;
@@ -905,6 +909,7 @@ __camellia_dec_blk32:
%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
%ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));
+ FRAME_END
ret;
.align 8
@@ -930,6 +935,7 @@ ENTRY(camellia_ecb_enc_32way)
* %rsi: dst (32 blocks)
* %rdx: src (32 blocks)
*/
+ FRAME_BEGIN
vzeroupper;
@@ -948,6 +954,7 @@ ENTRY(camellia_ecb_enc_32way)
vzeroupper;
+ FRAME_END
ret;
ENDPROC(camellia_ecb_enc_32way)
@@ -957,6 +964,7 @@ ENTRY(camellia_ecb_dec_32way)
* %rsi: dst (32 blocks)
* %rdx: src (32 blocks)
*/
+ FRAME_BEGIN
vzeroupper;
@@ -980,6 +988,7 @@ ENTRY(camellia_ecb_dec_32way)
vzeroupper;
+ FRAME_END
ret;
ENDPROC(camellia_ecb_dec_32way)
@@ -989,6 +998,7 @@ ENTRY(camellia_cbc_dec_32way)
* %rsi: dst (32 blocks)
* %rdx: src (32 blocks)
*/
+ FRAME_BEGIN
vzeroupper;
@@ -1046,6 +1056,7 @@ ENTRY(camellia_cbc_dec_32way)
vzeroupper;
+ FRAME_END
ret;
ENDPROC(camellia_cbc_dec_32way)
@@ -1070,6 +1081,7 @@ ENTRY(camellia_ctr_32way)
* %rdx: src (32 blocks)
* %rcx: iv (little endian, 128bit)
*/
+ FRAME_BEGIN
vzeroupper;
@@ -1184,6 +1196,7 @@ ENTRY(camellia_ctr_32way)
vzeroupper;
+ FRAME_END
ret;
ENDPROC(camellia_ctr_32way)
@@ -1216,6 +1229,7 @@ camellia_xts_crypt_32way:
* %r8: index for input whitening key
* %r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32
*/
+ FRAME_BEGIN
vzeroupper;
@@ -1349,6 +1363,7 @@ camellia_xts_crypt_32way:
vzeroupper;
+ FRAME_END
ret;
ENDPROC(camellia_xts_crypt_32way)
diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
index 5c8b6266a394..aa76cad9d262 100644
--- a/arch/x86/crypto/camellia_glue.c
+++ b/arch/x86/crypto/camellia_glue.c
@@ -1503,13 +1503,9 @@ int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
u32 *flags = &tfm->crt_flags;
int err;
- /* key consists of keys of equal size concatenated, therefore
- * the length must be even
- */
- if (keylen % 2) {
- *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
- return -EINVAL;
- }
+ err = xts_check_key(tfm, key, keylen);
+ if (err)
+ return err;
/* first half of xts-key is for crypt */
err = __camellia_setkey(&ctx->crypt_ctx, key, keylen / 2, flags);
diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
index c35fd5d6ecd2..14fa1966bf01 100644
--- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -24,6 +24,7 @@
*/
#include <linux/linkage.h>
+#include <asm/frame.h>
.file "cast5-avx-x86_64-asm_64.S"
@@ -365,6 +366,7 @@ ENTRY(cast5_ecb_enc_16way)
* %rsi: dst
* %rdx: src
*/
+ FRAME_BEGIN
movq %rsi, %r11;
@@ -388,6 +390,7 @@ ENTRY(cast5_ecb_enc_16way)
vmovdqu RR4, (6*4*4)(%r11);
vmovdqu RL4, (7*4*4)(%r11);
+ FRAME_END
ret;
ENDPROC(cast5_ecb_enc_16way)
@@ -398,6 +401,7 @@ ENTRY(cast5_ecb_dec_16way)
* %rdx: src
*/
+ FRAME_BEGIN
movq %rsi, %r11;
vmovdqu (0*4*4)(%rdx), RL1;
@@ -420,6 +424,7 @@ ENTRY(cast5_ecb_dec_16way)
vmovdqu RR4, (6*4*4)(%r11);
vmovdqu RL4, (7*4*4)(%r11);
+ FRAME_END
ret;
ENDPROC(cast5_ecb_dec_16way)
@@ -429,6 +434,7 @@ ENTRY(cast5_cbc_dec_16way)
* %rsi: dst
* %rdx: src
*/
+ FRAME_BEGIN
pushq %r12;
@@ -469,6 +475,7 @@ ENTRY(cast5_cbc_dec_16way)
popq %r12;
+ FRAME_END
ret;
ENDPROC(cast5_cbc_dec_16way)
@@ -479,6 +486,7 @@ ENTRY(cast5_ctr_16way)
* %rdx: src
* %rcx: iv (big endian, 64bit)
*/
+ FRAME_BEGIN
pushq %r12;
@@ -542,5 +550,6 @@ ENTRY(cast5_ctr_16way)
popq %r12;
+ FRAME_END
ret;
ENDPROC(cast5_ctr_16way)
diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
index e3531f833951..c419389889cd 100644
--- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -24,6 +24,7 @@
*/
#include <linux/linkage.h>
+#include <asm/frame.h>
#include "glue_helper-asm-avx.S"
.file "cast6-avx-x86_64-asm_64.S"
@@ -349,6 +350,7 @@ ENTRY(cast6_ecb_enc_8way)
* %rsi: dst
* %rdx: src
*/
+ FRAME_BEGIN
movq %rsi, %r11;
@@ -358,6 +360,7 @@ ENTRY(cast6_ecb_enc_8way)
store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+ FRAME_END
ret;
ENDPROC(cast6_ecb_enc_8way)
@@ -367,6 +370,7 @@ ENTRY(cast6_ecb_dec_8way)
* %rsi: dst
* %rdx: src
*/
+ FRAME_BEGIN
movq %rsi, %r11;
@@ -376,6 +380,7 @@ ENTRY(cast6_ecb_dec_8way)
store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+ FRAME_END
ret;
ENDPROC(cast6_ecb_dec_8way)
@@ -385,6 +390,7 @@ ENTRY(cast6_cbc_dec_8way)
* %rsi: dst
* %rdx: src
*/
+ FRAME_BEGIN
pushq %r12;
@@ -399,6 +405,7 @@ ENTRY(cast6_cbc_dec_8way)
popq %r12;
+ FRAME_END
ret;
ENDPROC(cast6_cbc_dec_8way)
@@ -409,6 +416,7 @@ ENTRY(cast6_ctr_8way)
* %rdx: src
* %rcx: iv (little endian, 128bit)
*/
+ FRAME_BEGIN
pushq %r12;
@@ -424,6 +432,7 @@ ENTRY(cast6_ctr_8way)
popq %r12;
+ FRAME_END
ret;
ENDPROC(cast6_ctr_8way)
@@ -434,6 +443,7 @@ ENTRY(cast6_xts_enc_8way)
* %rdx: src
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
*/
+ FRAME_BEGIN
movq %rsi, %r11;
@@ -446,6 +456,7 @@ ENTRY(cast6_xts_enc_8way)
/* dst <= regs xor IVs(in dst) */
store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+ FRAME_END
ret;
ENDPROC(cast6_xts_enc_8way)
@@ -456,6 +467,7 @@ ENTRY(cast6_xts_dec_8way)
* %rdx: src
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
*/
+ FRAME_BEGIN
movq %rsi, %r11;
@@ -468,5 +480,6 @@ ENTRY(cast6_xts_dec_8way)
/* dst <= regs xor IVs(in dst) */
store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+ FRAME_END
ret;
ENDPROC(cast6_xts_dec_8way)
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
index fca459578c35..50e684768c55 100644
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -329,13 +329,9 @@ static int xts_cast6_setkey(struct crypto_tfm *tfm, const u8 *key,
u32 *flags = &tfm->crt_flags;
int err;
- /* key consists of keys of equal size concatenated, therefore
- * the length must be even
- */
- if (keylen % 2) {
- *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
- return -EINVAL;
- }
+ err = xts_check_key(tfm, key, keylen);
+ if (err)
+ return err;
/* first half of xts-key is for crypt */
err = __cast6_setkey(&ctx->crypt_ctx, key, keylen / 2, flags);
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index 4fe27e074194..dc05f010ca9b 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -170,8 +170,8 @@ continue_block:
## branch into array
lea jump_table(%rip), bufp
movzxw (bufp, %rax, 2), len
- offset=crc_array-jump_table
- lea offset(bufp, len, 1), bufp
+ lea crc_array(%rip), bufp
+ lea (bufp, len, 1), bufp
jmp *bufp
################################################################
@@ -310,7 +310,9 @@ do_return:
popq %rdi
popq %rbx
ret
+ENDPROC(crc_pcl)
+.section .rodata, "a", %progbits
################################################################
## jump table Table is 129 entries x 2 bytes each
################################################################
@@ -324,13 +326,11 @@ JMPTBL_ENTRY %i
i=i+1
.endr
-ENDPROC(crc_pcl)
################################################################
## PCLMULQDQ tables
## Table is 128 entries x 2 words (8 bytes) each
################################################################
-.section .rodata, "a", %progbits
.align 8
K_table:
.long 0x493c7d27, 0x00000001
diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S
index 5d1e0075ac24..eed55c8cca4f 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_asm.S
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -18,6 +18,7 @@
#include <linux/linkage.h>
#include <asm/inst.h>
+#include <asm/frame.h>
.data
@@ -94,6 +95,7 @@ ENDPROC(__clmul_gf128mul_ble)
/* void clmul_ghash_mul(char *dst, const u128 *shash) */
ENTRY(clmul_ghash_mul)
+ FRAME_BEGIN
movups (%rdi), DATA
movups (%rsi), SHASH
movaps .Lbswap_mask, BSWAP
@@ -101,6 +103,7 @@ ENTRY(clmul_ghash_mul)
call __clmul_gf128mul_ble
PSHUFB_XMM BSWAP DATA
movups DATA, (%rdi)
+ FRAME_END
ret
ENDPROC(clmul_ghash_mul)
@@ -109,6 +112,7 @@ ENDPROC(clmul_ghash_mul)
* const u128 *shash);
*/
ENTRY(clmul_ghash_update)
+ FRAME_BEGIN
cmp $16, %rdx
jb .Lupdate_just_ret # check length
movaps .Lbswap_mask, BSWAP
@@ -128,5 +132,6 @@ ENTRY(clmul_ghash_update)
PSHUFB_XMM BSWAP DATA
movups DATA, (%rdi)
.Lupdate_just_ret:
+ FRAME_END
ret
ENDPROC(clmul_ghash_update)
diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
index 2f202f49872b..8be571808342 100644
--- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
@@ -24,6 +24,7 @@
*/
#include <linux/linkage.h>
+#include <asm/frame.h>
#include "glue_helper-asm-avx.S"
.file "serpent-avx-x86_64-asm_64.S"
@@ -681,6 +682,7 @@ ENTRY(serpent_ecb_enc_8way_avx)
* %rsi: dst
* %rdx: src
*/
+ FRAME_BEGIN
load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
@@ -688,6 +690,7 @@ ENTRY(serpent_ecb_enc_8way_avx)
store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+ FRAME_END
ret;
ENDPROC(serpent_ecb_enc_8way_avx)
@@ -697,6 +700,7 @@ ENTRY(serpent_ecb_dec_8way_avx)
* %rsi: dst
* %rdx: src
*/
+ FRAME_BEGIN
load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
@@ -704,6 +708,7 @@ ENTRY(serpent_ecb_dec_8way_avx)
store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+ FRAME_END
ret;
ENDPROC(serpent_ecb_dec_8way_avx)
@@ -713,6 +718,7 @@ ENTRY(serpent_cbc_dec_8way_avx)
* %rsi: dst
* %rdx: src
*/
+ FRAME_BEGIN
load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
@@ -720,6 +726,7 @@ ENTRY(serpent_cbc_dec_8way_avx)
store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+ FRAME_END
ret;
ENDPROC(serpent_cbc_dec_8way_avx)
@@ -730,6 +737,7 @@ ENTRY(serpent_ctr_8way_avx)
* %rdx: src
* %rcx: iv (little endian, 128bit)
*/
+ FRAME_BEGIN
load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
RD2, RK0, RK1, RK2);
@@ -738,6 +746,7 @@ ENTRY(serpent_ctr_8way_avx)
store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+ FRAME_END
ret;
ENDPROC(serpent_ctr_8way_avx)
@@ -748,6 +757,7 @@ ENTRY(serpent_xts_enc_8way_avx)
* %rdx: src
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
*/
+ FRAME_BEGIN
/* regs <= src, dst <= IVs, regs <= regs xor IVs */
load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
@@ -758,6 +768,7 @@ ENTRY(serpent_xts_enc_8way_avx)
/* dst <= regs xor IVs(in dst) */
store_xts_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+ FRAME_END
ret;
ENDPROC(serpent_xts_enc_8way_avx)
@@ -768,6 +779,7 @@ ENTRY(serpent_xts_dec_8way_avx)
* %rdx: src
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
*/
+ FRAME_BEGIN
/* regs <= src, dst <= IVs, regs <= regs xor IVs */
load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
@@ -778,5 +790,6 @@ ENTRY(serpent_xts_dec_8way_avx)
/* dst <= regs xor IVs(in dst) */
store_xts_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
+ FRAME_END
ret;
ENDPROC(serpent_xts_dec_8way_avx)
diff --git a/arch/x86/crypto/serpent-avx2-asm_64.S b/arch/x86/crypto/serpent-avx2-asm_64.S
index b222085cccac..97c48add33ed 100644
--- a/arch/x86/crypto/serpent-avx2-asm_64.S
+++ b/arch/x86/crypto/serpent-avx2-asm_64.S
@@ -15,6 +15,7 @@
*/
#include <linux/linkage.h>
+#include <asm/frame.h>
#include "glue_helper-asm-avx2.S"
.file "serpent-avx2-asm_64.S"
@@ -673,6 +674,7 @@ ENTRY(serpent_ecb_enc_16way)
* %rsi: dst
* %rdx: src
*/
+ FRAME_BEGIN
vzeroupper;
@@ -684,6 +686,7 @@ ENTRY(serpent_ecb_enc_16way)
vzeroupper;
+ FRAME_END
ret;
ENDPROC(serpent_ecb_enc_16way)
@@ -693,6 +696,7 @@ ENTRY(serpent_ecb_dec_16way)
* %rsi: dst
* %rdx: src
*/
+ FRAME_BEGIN
vzeroupper;
@@ -704,6 +708,7 @@ ENTRY(serpent_ecb_dec_16way)
vzeroupper;
+ FRAME_END
ret;
ENDPROC(serpent_ecb_dec_16way)
@@ -713,6 +718,7 @@ ENTRY(serpent_cbc_dec_16way)
* %rsi: dst
* %rdx: src
*/
+ FRAME_BEGIN
vzeroupper;
@@ -725,6 +731,7 @@ ENTRY(serpent_cbc_dec_16way)
vzeroupper;
+ FRAME_END
ret;
ENDPROC(serpent_cbc_dec_16way)
@@ -735,6 +742,7 @@ ENTRY(serpent_ctr_16way)
* %rdx: src (16 blocks)
* %rcx: iv (little endian, 128bit)
*/
+ FRAME_BEGIN
vzeroupper;
@@ -748,6 +756,7 @@ ENTRY(serpent_ctr_16way)
vzeroupper;
+ FRAME_END
ret;
ENDPROC(serpent_ctr_16way)
@@ -758,6 +767,7 @@ ENTRY(serpent_xts_enc_16way)
* %rdx: src (16 blocks)
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
*/
+ FRAME_BEGIN
vzeroupper;
@@ -772,6 +782,7 @@ ENTRY(serpent_xts_enc_16way)
vzeroupper;
+ FRAME_END
ret;
ENDPROC(serpent_xts_enc_16way)
@@ -782,6 +793,7 @@ ENTRY(serpent_xts_dec_16way)
* %rdx: src (16 blocks)
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
*/
+ FRAME_BEGIN
vzeroupper;
@@ -796,5 +808,6 @@ ENTRY(serpent_xts_dec_16way)
vzeroupper;
+ FRAME_END
ret;
ENDPROC(serpent_xts_dec_16way)
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
index 5dc37026c7ce..6f778d3daa22 100644
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -332,16 +332,11 @@ int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
unsigned int keylen)
{
struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm);
- u32 *flags = &tfm->crt_flags;
int err;
- /* key consists of keys of equal size concatenated, therefore
- * the length must be even
- */
- if (keylen % 2) {
- *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
- return -EINVAL;
- }
+ err = xts_check_key(tfm, key, keylen);
+ if (err)
+ return err;
/* first half of xts-key is for crypt */
err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2);
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index 3643dd508f45..8943407e8917 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -309,16 +309,11 @@ static int xts_serpent_setkey(struct crypto_tfm *tfm, const u8 *key,
unsigned int keylen)
{
struct serpent_xts_ctx *ctx = crypto_tfm_ctx(tfm);
- u32 *flags = &tfm->crt_flags;
int err;
- /* key consists of keys of equal size concatenated, therefore
- * the length must be even
- */
- if (keylen % 2) {
- *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
- return -EINVAL;
- }
+ err = xts_check_key(tfm, key, keylen);
+ if (err)
+ return err;
/* first half of xts-key is for crypt */
err = __serpent_setkey(&ctx->crypt_ctx, key, keylen / 2);
diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c
index a841e9765bd6..a8a0224fa0f8 100644
--- a/arch/x86/crypto/sha-mb/sha1_mb.c
+++ b/arch/x86/crypto/sha-mb/sha1_mb.c
@@ -762,6 +762,38 @@ static int sha1_mb_async_digest(struct ahash_request *req)
return crypto_ahash_digest(mcryptd_req);
}
+static int sha1_mb_async_export(struct ahash_request *req, void *out)
+{
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ return crypto_ahash_export(mcryptd_req, out);
+}
+
+static int sha1_mb_async_import(struct ahash_request *req, const void *in)
+{
+ struct ahash_request *mcryptd_req = ahash_request_ctx(req);
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+ struct sha1_mb_ctx *ctx = crypto_ahash_ctx(tfm);
+ struct mcryptd_ahash *mcryptd_tfm = ctx->mcryptd_tfm;
+ struct crypto_shash *child = mcryptd_ahash_child(mcryptd_tfm);
+ struct mcryptd_hash_request_ctx *rctx;
+ struct shash_desc *desc;
+
+ memcpy(mcryptd_req, req, sizeof(*req));
+ ahash_request_set_tfm(mcryptd_req, &mcryptd_tfm->base);
+ rctx = ahash_request_ctx(mcryptd_req);
+ desc = &rctx->desc;
+ desc->tfm = child;
+ desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+ return crypto_ahash_import(mcryptd_req, in);
+}
+
static int sha1_mb_async_init_tfm(struct crypto_tfm *tfm)
{
struct mcryptd_ahash *mcryptd_tfm;
@@ -796,8 +828,11 @@ static struct ahash_alg sha1_mb_async_alg = {
.final = sha1_mb_async_final,
.finup = sha1_mb_async_finup,
.digest = sha1_mb_async_digest,
+ .export = sha1_mb_async_export,
+ .import = sha1_mb_async_import,
.halg = {
.digestsize = SHA1_DIGEST_SIZE,
+ .statesize = sizeof(struct sha1_hash_ctx),
.base = {
.cra_name = "sha1",
.cra_driver_name = "sha1_mb",
diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S b/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S
index 85c4e1cf7172..96df6a39d7e2 100644
--- a/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S
+++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_flush_avx2.S
@@ -52,6 +52,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <linux/linkage.h>
+#include <asm/frame.h>
#include "sha1_mb_mgr_datastruct.S"
@@ -86,16 +87,6 @@
#define extra_blocks %arg2
#define p %arg2
-
-# STACK_SPACE needs to be an odd multiple of 8
-_XMM_SAVE_SIZE = 10*16
-_GPR_SAVE_SIZE = 8*8
-_ALIGN_SIZE = 8
-
-_XMM_SAVE = 0
-_GPR_SAVE = _XMM_SAVE + _XMM_SAVE_SIZE
-STACK_SPACE = _GPR_SAVE + _GPR_SAVE_SIZE + _ALIGN_SIZE
-
.macro LABEL prefix n
\prefix\n\():
.endm
@@ -113,16 +104,8 @@ offset = \_offset
# JOB* sha1_mb_mgr_flush_avx2(MB_MGR *state)
# arg 1 : rcx : state
ENTRY(sha1_mb_mgr_flush_avx2)
- mov %rsp, %r10
- sub $STACK_SPACE, %rsp
- and $~31, %rsp
- mov %rbx, _GPR_SAVE(%rsp)
- mov %r10, _GPR_SAVE+8*1(%rsp) #save rsp
- mov %rbp, _GPR_SAVE+8*3(%rsp)
- mov %r12, _GPR_SAVE+8*4(%rsp)
- mov %r13, _GPR_SAVE+8*5(%rsp)
- mov %r14, _GPR_SAVE+8*6(%rsp)
- mov %r15, _GPR_SAVE+8*7(%rsp)
+ FRAME_BEGIN
+ push %rbx
# If bit (32+3) is set, then all lanes are empty
mov _unused_lanes(state), unused_lanes
@@ -230,16 +213,8 @@ len_is_0:
mov tmp2_w, offset(job_rax)
return:
-
- mov _GPR_SAVE(%rsp), %rbx
- mov _GPR_SAVE+8*1(%rsp), %r10 #saved rsp
- mov _GPR_SAVE+8*3(%rsp), %rbp
- mov _GPR_SAVE+8*4(%rsp), %r12
- mov _GPR_SAVE+8*5(%rsp), %r13
- mov _GPR_SAVE+8*6(%rsp), %r14
- mov _GPR_SAVE+8*7(%rsp), %r15
- mov %r10, %rsp
-
+ pop %rbx
+ FRAME_END
ret
return_null:
diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S b/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S
index 2ab9560b53c8..63a0d9c8e31f 100644
--- a/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S
+++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_submit_avx2.S
@@ -53,6 +53,7 @@
*/
#include <linux/linkage.h>
+#include <asm/frame.h>
#include "sha1_mb_mgr_datastruct.S"
@@ -86,33 +87,21 @@ job_rax = %rax
len = %rax
DWORD_len = %eax
-lane = %rbp
-tmp3 = %rbp
+lane = %r12
+tmp3 = %r12
tmp = %r9
DWORD_tmp = %r9d
lane_data = %r10
-# STACK_SPACE needs to be an odd multiple of 8
-STACK_SPACE = 8*8 + 16*10 + 8
-
# JOB* submit_mb_mgr_submit_avx2(MB_MGR *state, job_sha1 *job)
# arg 1 : rcx : state
# arg 2 : rdx : job
ENTRY(sha1_mb_mgr_submit_avx2)
-
- mov %rsp, %r10
- sub $STACK_SPACE, %rsp
- and $~31, %rsp
-
- mov %rbx, (%rsp)
- mov %r10, 8*2(%rsp) #save old rsp
- mov %rbp, 8*3(%rsp)
- mov %r12, 8*4(%rsp)
- mov %r13, 8*5(%rsp)
- mov %r14, 8*6(%rsp)
- mov %r15, 8*7(%rsp)
+ FRAME_BEGIN
+ push %rbx
+ push %r12
mov _unused_lanes(state), unused_lanes
mov unused_lanes, lane
@@ -197,22 +186,15 @@ len_is_0:
vpinsrd $1, _args_digest+1*32(state , idx, 4), %xmm0, %xmm0
vpinsrd $2, _args_digest+2*32(state , idx, 4), %xmm0, %xmm0
vpinsrd $3, _args_digest+3*32(state , idx, 4), %xmm0, %xmm0
- movl 4*32(state, idx, 4), DWORD_tmp
+ movl _args_digest+4*32(state, idx, 4), DWORD_tmp
vmovdqu %xmm0, _result_digest(job_rax)
movl DWORD_tmp, _result_digest+1*16(job_rax)
return:
-
- mov (%rsp), %rbx
- mov 8*2(%rsp), %r10 #save old rsp
- mov 8*3(%rsp), %rbp
- mov 8*4(%rsp), %r12
- mov 8*5(%rsp), %r13
- mov 8*6(%rsp), %r14
- mov 8*7(%rsp), %r15
- mov %r10, %rsp
-
+ pop %r12
+ pop %rbx
+ FRAME_END
ret
return_null:
diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index 05058134c443..dc66273e610d 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -24,6 +24,7 @@
*/
#include <linux/linkage.h>
+#include <asm/frame.h>
#include "glue_helper-asm-avx.S"
.file "twofish-avx-x86_64-asm_64.S"
@@ -333,6 +334,7 @@ ENTRY(twofish_ecb_enc_8way)
* %rsi: dst
* %rdx: src
*/
+ FRAME_BEGIN
movq %rsi, %r11;
@@ -342,6 +344,7 @@ ENTRY(twofish_ecb_enc_8way)
store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+ FRAME_END
ret;
ENDPROC(twofish_ecb_enc_8way)
@@ -351,6 +354,7 @@ ENTRY(twofish_ecb_dec_8way)
* %rsi: dst
* %rdx: src
*/
+ FRAME_BEGIN
movq %rsi, %r11;
@@ -360,6 +364,7 @@ ENTRY(twofish_ecb_dec_8way)
store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+ FRAME_END
ret;
ENDPROC(twofish_ecb_dec_8way)
@@ -369,6 +374,7 @@ ENTRY(twofish_cbc_dec_8way)
* %rsi: dst
* %rdx: src
*/
+ FRAME_BEGIN
pushq %r12;
@@ -383,6 +389,7 @@ ENTRY(twofish_cbc_dec_8way)
popq %r12;
+ FRAME_END
ret;
ENDPROC(twofish_cbc_dec_8way)
@@ -393,6 +400,7 @@ ENTRY(twofish_ctr_8way)
* %rdx: src
* %rcx: iv (little endian, 128bit)
*/
+ FRAME_BEGIN
pushq %r12;
@@ -408,6 +416,7 @@ ENTRY(twofish_ctr_8way)
popq %r12;
+ FRAME_END
ret;
ENDPROC(twofish_ctr_8way)
@@ -418,6 +427,7 @@ ENTRY(twofish_xts_enc_8way)
* %rdx: src
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
*/
+ FRAME_BEGIN
movq %rsi, %r11;
@@ -430,6 +440,7 @@ ENTRY(twofish_xts_enc_8way)
/* dst <= regs xor IVs(in dst) */
store_xts_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
+ FRAME_END
ret;
ENDPROC(twofish_xts_enc_8way)
@@ -440,6 +451,7 @@ ENTRY(twofish_xts_dec_8way)
* %rdx: src
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
*/
+ FRAME_BEGIN
movq %rsi, %r11;
@@ -452,5 +464,6 @@ ENTRY(twofish_xts_dec_8way)
/* dst <= regs xor IVs(in dst) */
store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+ FRAME_END
ret;
ENDPROC(twofish_xts_dec_8way)
diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index 56d8a08ee479..2ebb5e9789f3 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -277,13 +277,9 @@ int xts_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
u32 *flags = &tfm->crt_flags;
int err;
- /* key consists of keys of equal size concatenated, therefore
- * the length must be even
- */
- if (keylen % 2) {
- *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
- return -EINVAL;
- }
+ err = xts_check_key(tfm, key, keylen);
+ if (err)
+ return err;
/* first half of xts-key is for crypt */
err = __twofish_setkey(&ctx->crypt_ctx, key, keylen / 2, flags);
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index bd55dedd7614..fe91c25092da 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -1,6 +1,10 @@
#
# Makefile for the x86 low level entry code
#
+
+OBJECT_FILES_NON_STANDARD_entry_$(BITS).o := y
+OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y
+
obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o
obj-y += common.o
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index cb713df81180..b30dd8154cc2 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -384,3 +384,5 @@
375 i386 membarrier sys_membarrier
376 i386 mlock2 sys_mlock2
377 i386 copy_file_range sys_copy_file_range
+378 i386 preadv2 sys_preadv2
+379 i386 pwritev2 sys_pwritev2
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 2e5b565adacc..cac6d17ce5db 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -333,6 +333,8 @@
324 common membarrier sys_membarrier
325 common mlock2 sys_mlock2
326 common copy_file_range sys_copy_file_range
+327 64 preadv2 sys_preadv2
+328 64 pwritev2 sys_pwritev2
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S
index efb2b932b748..98df1fa8825c 100644
--- a/arch/x86/entry/thunk_64.S
+++ b/arch/x86/entry/thunk_64.S
@@ -8,11 +8,14 @@
#include <linux/linkage.h>
#include "calling.h"
#include <asm/asm.h>
+#include <asm/frame.h>
/* rdi: arg1 ... normal C conventions. rax is saved/restored. */
.macro THUNK name, func, put_ret_addr_in_rdi=0
.globl \name
+ .type \name, @function
\name:
+ FRAME_BEGIN
/* this one pushes 9 elems, the next one would be %rIP */
pushq %rdi
@@ -62,6 +65,7 @@ restore:
popq %rdx
popq %rsi
popq %rdi
+ FRAME_END
ret
_ASM_NOKPROBE(restore)
#endif
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index c854541d93ff..6874da5f67fc 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -3,8 +3,12 @@
#
KBUILD_CFLAGS += $(DISABLE_LTO)
-KASAN_SANITIZE := n
-UBSAN_SANITIZE := n
+KASAN_SANITIZE := n
+UBSAN_SANITIZE := n
+OBJECT_FILES_NON_STANDARD := y
+
+# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
+KCOV_INSTRUMENT := n
VDSO64-$(CONFIG_X86_64) := y
VDSOX32-$(CONFIG_X86_X32_ABI) := y
@@ -16,6 +20,7 @@ vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
# files to link into kernel
obj-y += vma.o
+OBJECT_FILES_NON_STANDARD_vma.o := n
# vDSO images to build
vdso_img-$(VDSO64-y) += 64
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index 1a50e09c945b..03c3eb77bfce 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -178,7 +178,7 @@ notrace static cycle_t vread_tsc(void)
/*
* GCC likes to generate cmov here, but this branch is extremely
- * predictable (it's just a funciton of time and the likely is
+ * predictable (it's just a function of time and the likely is
* very likely) and there's a data dependence, so force GCC
* to generate a branch instead. I don't barrier() because
* we don't actually need a barrier, and if this function
diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile
index fdfea1511cc0..f59618a39990 100644
--- a/arch/x86/events/Makefile
+++ b/arch/x86/events/Makefile
@@ -1,6 +1,7 @@
obj-y += core.o
obj-$(CONFIG_CPU_SUP_AMD) += amd/core.o amd/uncore.o
+obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += amd/power.o
obj-$(CONFIG_X86_LOCAL_APIC) += amd/ibs.o msr.o
ifdef CONFIG_AMD_IOMMU
obj-$(CONFIG_CPU_SUP_AMD) += amd/iommu.o
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index 51087c29b2c2..3ea25c3917c0 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -376,7 +376,13 @@ static void perf_ibs_start(struct perf_event *event, int flags)
hwc->state = 0;
perf_ibs_set_period(perf_ibs, hwc, &period);
+ /*
+ * Set STARTED before enabling the hardware, such that
+ * a subsequent NMI must observe it. Then clear STOPPING
+ * such that we don't consume NMIs by accident.
+ */
set_bit(IBS_STARTED, pcpu->state);
+ clear_bit(IBS_STOPPING, pcpu->state);
perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
perf_event_update_userpage(event);
@@ -390,7 +396,7 @@ static void perf_ibs_stop(struct perf_event *event, int flags)
u64 config;
int stopping;
- stopping = test_and_clear_bit(IBS_STARTED, pcpu->state);
+ stopping = test_bit(IBS_STARTED, pcpu->state);
if (!stopping && (hwc->state & PERF_HES_UPTODATE))
return;
@@ -398,8 +404,24 @@ static void perf_ibs_stop(struct perf_event *event, int flags)
rdmsrl(hwc->config_base, config);
if (stopping) {
+ /*
+ * Set STOPPING before disabling the hardware, such that it
+ * must be visible to NMIs the moment we clear the EN bit,
+ * at which point we can generate an !VALID sample which
+ * we need to consume.
+ */
set_bit(IBS_STOPPING, pcpu->state);
perf_ibs_disable_event(perf_ibs, hwc, config);
+ /*
+ * Clear STARTED after disabling the hardware; if it were
+ * cleared before an NMI hitting after the clear but before
+ * clearing the EN bit might think it a spurious NMI and not
+ * handle it.
+ *
+ * Clearing it after, however, creates the problem of the NMI
+ * handler seeing STARTED but not having a valid sample.
+ */
+ clear_bit(IBS_STARTED, pcpu->state);
WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
hwc->state |= PERF_HES_STOPPED;
}
@@ -527,20 +549,24 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
u64 *buf, *config, period;
if (!test_bit(IBS_STARTED, pcpu->state)) {
+fail:
/*
* Catch spurious interrupts after stopping IBS: After
* disabling IBS there could be still incoming NMIs
* with samples that even have the valid bit cleared.
* Mark all this NMIs as handled.
*/
- return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 1 : 0;
+ if (test_and_clear_bit(IBS_STOPPING, pcpu->state))
+ return 1;
+
+ return 0;
}
msr = hwc->config_base;
buf = ibs_data.regs;
rdmsrl(msr, *buf);
if (!(*buf++ & perf_ibs->valid_mask))
- return 0;
+ goto fail;
config = &ibs_data.regs[0];
perf_ibs_event_update(perf_ibs, event, config);
@@ -599,7 +625,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
throttle = perf_event_overflow(event, &data, &regs);
out:
if (throttle)
- perf_ibs_disable_event(perf_ibs, hwc, *config);
+ perf_ibs_stop(event, 0);
else
perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
@@ -611,6 +637,7 @@ out:
static int
perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
{
+ u64 stamp = sched_clock();
int handled = 0;
handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs);
@@ -619,6 +646,8 @@ perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
if (handled)
inc_irq_stat(apic_perf_irqs);
+ perf_sample_event_took(sched_clock() - stamp);
+
return handled;
}
NOKPROBE_SYMBOL(perf_ibs_nmi_handler);
diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c
index 635e5eba0caf..40625ca7a190 100644
--- a/arch/x86/events/amd/iommu.c
+++ b/arch/x86/events/amd/iommu.c
@@ -118,6 +118,11 @@ static struct amd_iommu_event_desc amd_iommu_v2_event_descs[] = {
AMD_IOMMU_EVENT_DESC(cmd_processed, "csource=0x11"),
AMD_IOMMU_EVENT_DESC(cmd_processed_inv, "csource=0x12"),
AMD_IOMMU_EVENT_DESC(tlb_inv, "csource=0x13"),
+ AMD_IOMMU_EVENT_DESC(ign_rd_wr_mmio_1ff8h, "csource=0x14"),
+ AMD_IOMMU_EVENT_DESC(vapic_int_non_guest, "csource=0x15"),
+ AMD_IOMMU_EVENT_DESC(vapic_int_guest, "csource=0x16"),
+ AMD_IOMMU_EVENT_DESC(smi_recv, "csource=0x17"),
+ AMD_IOMMU_EVENT_DESC(smi_blk, "csource=0x18"),
{ /* end: all zeroes */ },
};
diff --git a/arch/x86/events/amd/power.c b/arch/x86/events/amd/power.c
new file mode 100644
index 000000000000..55a3529dbf12
--- /dev/null
+++ b/arch/x86/events/amd/power.c
@@ -0,0 +1,353 @@
+/*
+ * Performance events - AMD Processor Power Reporting Mechanism
+ *
+ * Copyright (C) 2016 Advanced Micro Devices, Inc.
+ *
+ * Author: Huang Rui <ray.huang@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/perf_event.h>
+#include <asm/cpu_device_id.h>
+#include "../perf_event.h"
+
+#define MSR_F15H_CU_PWR_ACCUMULATOR 0xc001007a
+#define MSR_F15H_CU_MAX_PWR_ACCUMULATOR 0xc001007b
+#define MSR_F15H_PTSC 0xc0010280
+
+/* Event code: LSB 8 bits, passed in attr->config any other bit is reserved. */
+#define AMD_POWER_EVENT_MASK 0xFFULL
+
+/*
+ * Accumulated power status counters.
+ */
+#define AMD_POWER_EVENTSEL_PKG 1
+
+/*
+ * The ratio of compute unit power accumulator sample period to the
+ * PTSC period.
+ */
+static unsigned int cpu_pwr_sample_ratio;
+
+/* Maximum accumulated power of a compute unit. */
+static u64 max_cu_acc_power;
+
+static struct pmu pmu_class;
+
+/*
+ * Accumulated power represents the sum of each compute unit's (CU) power
+ * consumption. On any core of each CU we read the total accumulated power from
+ * MSR_F15H_CU_PWR_ACCUMULATOR. cpu_mask represents CPU bit map of all cores
+ * which are picked to measure the power for the CUs they belong to.
+ */
+static cpumask_t cpu_mask;
+
+static void event_update(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ u64 prev_pwr_acc, new_pwr_acc, prev_ptsc, new_ptsc;
+ u64 delta, tdelta;
+
+ prev_pwr_acc = hwc->pwr_acc;
+ prev_ptsc = hwc->ptsc;
+ rdmsrl(MSR_F15H_CU_PWR_ACCUMULATOR, new_pwr_acc);
+ rdmsrl(MSR_F15H_PTSC, new_ptsc);
+
+ /*
+ * Calculate the CU power consumption over a time period, the unit of
+ * final value (delta) is micro-Watts. Then add it to the event count.
+ */
+ if (new_pwr_acc < prev_pwr_acc) {
+ delta = max_cu_acc_power + new_pwr_acc;
+ delta -= prev_pwr_acc;
+ } else
+ delta = new_pwr_acc - prev_pwr_acc;
+
+ delta *= cpu_pwr_sample_ratio * 1000;
+ tdelta = new_ptsc - prev_ptsc;
+
+ do_div(delta, tdelta);
+ local64_add(delta, &event->count);
+}
+
+static void __pmu_event_start(struct perf_event *event)
+{
+ if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
+ return;
+
+ event->hw.state = 0;
+
+ rdmsrl(MSR_F15H_PTSC, event->hw.ptsc);
+ rdmsrl(MSR_F15H_CU_PWR_ACCUMULATOR, event->hw.pwr_acc);
+}
+
+static void pmu_event_start(struct perf_event *event, int mode)
+{
+ __pmu_event_start(event);
+}
+
+static void pmu_event_stop(struct perf_event *event, int mode)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ /* Mark event as deactivated and stopped. */
+ if (!(hwc->state & PERF_HES_STOPPED))
+ hwc->state |= PERF_HES_STOPPED;
+
+ /* Check if software counter update is necessary. */
+ if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
+ /*
+ * Drain the remaining delta count out of an event
+ * that we are disabling:
+ */
+ event_update(event);
+ hwc->state |= PERF_HES_UPTODATE;
+ }
+}
+
+static int pmu_event_add(struct perf_event *event, int mode)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
+
+ if (mode & PERF_EF_START)
+ __pmu_event_start(event);
+
+ return 0;
+}
+
+static void pmu_event_del(struct perf_event *event, int flags)
+{
+ pmu_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int pmu_event_init(struct perf_event *event)
+{
+ u64 cfg = event->attr.config & AMD_POWER_EVENT_MASK;
+
+ /* Only look at AMD power events. */
+ if (event->attr.type != pmu_class.type)
+ return -ENOENT;
+
+ /* Unsupported modes and filters. */
+ if (event->attr.exclude_user ||
+ event->attr.exclude_kernel ||
+ event->attr.exclude_hv ||
+ event->attr.exclude_idle ||
+ event->attr.exclude_host ||
+ event->attr.exclude_guest ||
+ /* no sampling */
+ event->attr.sample_period)
+ return -EINVAL;
+
+ if (cfg != AMD_POWER_EVENTSEL_PKG)
+ return -EINVAL;
+
+ return 0;
+}
+
+static void pmu_event_read(struct perf_event *event)
+{
+ event_update(event);
+}
+
+static ssize_t
+get_attr_cpumask(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ return cpumap_print_to_pagebuf(true, buf, &cpu_mask);
+}
+
+static DEVICE_ATTR(cpumask, S_IRUGO, get_attr_cpumask, NULL);
+
+static struct attribute *pmu_attrs[] = {
+ &dev_attr_cpumask.attr,
+ NULL,
+};
+
+static struct attribute_group pmu_attr_group = {
+ .attrs = pmu_attrs,
+};
+
+/*
+ * Currently it only supports to report the power of each
+ * processor/package.
+ */
+EVENT_ATTR_STR(power-pkg, power_pkg, "event=0x01");
+
+EVENT_ATTR_STR(power-pkg.unit, power_pkg_unit, "mWatts");
+
+/* Convert the count from micro-Watts to milli-Watts. */
+EVENT_ATTR_STR(power-pkg.scale, power_pkg_scale, "1.000000e-3");
+
+static struct attribute *events_attr[] = {
+ EVENT_PTR(power_pkg),
+ EVENT_PTR(power_pkg_unit),
+ EVENT_PTR(power_pkg_scale),
+ NULL,
+};
+
+static struct attribute_group pmu_events_group = {
+ .name = "events",
+ .attrs = events_attr,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-7");
+
+static struct attribute *formats_attr[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+
+static struct attribute_group pmu_format_group = {
+ .name = "format",
+ .attrs = formats_attr,
+};
+
+static const struct attribute_group *attr_groups[] = {
+ &pmu_attr_group,
+ &pmu_format_group,
+ &pmu_events_group,
+ NULL,
+};
+
+static struct pmu pmu_class = {
+ .attr_groups = attr_groups,
+ /* system-wide only */
+ .task_ctx_nr = perf_invalid_context,
+ .event_init = pmu_event_init,
+ .add = pmu_event_add,
+ .del = pmu_event_del,
+ .start = pmu_event_start,
+ .stop = pmu_event_stop,
+ .read = pmu_event_read,
+};
+
+static void power_cpu_exit(int cpu)
+{
+ int target;
+
+ if (!cpumask_test_and_clear_cpu(cpu, &cpu_mask))
+ return;
+
+ /*
+ * Find a new CPU on the same compute unit, if was set in cpumask
+ * and still some CPUs on compute unit. Then migrate event and
+ * context to new CPU.
+ */
+ target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu);
+ if (target < nr_cpumask_bits) {
+ cpumask_set_cpu(target, &cpu_mask);
+ perf_pmu_migrate_context(&pmu_class, cpu, target);
+ }
+}
+
+static void power_cpu_init(int cpu)
+{
+ int target;
+
+ /*
+ * 1) If any CPU is set at cpu_mask in the same compute unit, do
+ * nothing.
+ * 2) If no CPU is set at cpu_mask in the same compute unit,
+ * set current STARTING CPU.
+ *
+ * Note: if there is a CPU aside of the new one already in the
+ * sibling mask, then it is also in cpu_mask.
+ */
+ target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu);
+ if (target >= nr_cpumask_bits)
+ cpumask_set_cpu(cpu, &cpu_mask);
+}
+
+static int
+power_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (long)hcpu;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_DOWN_FAILED:
+ case CPU_STARTING:
+ power_cpu_init(cpu);
+ break;
+ case CPU_DOWN_PREPARE:
+ power_cpu_exit(cpu);
+ break;
+ default:
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block power_cpu_notifier_nb = {
+ .notifier_call = power_cpu_notifier,
+ .priority = CPU_PRI_PERF,
+};
+
+static const struct x86_cpu_id cpu_match[] = {
+ { .vendor = X86_VENDOR_AMD, .family = 0x15 },
+ {},
+};
+
+static int __init amd_power_pmu_init(void)
+{
+ int cpu, target, ret;
+
+ if (!x86_match_cpu(cpu_match))
+ return 0;
+
+ if (!boot_cpu_has(X86_FEATURE_ACC_POWER))
+ return -ENODEV;
+
+ cpu_pwr_sample_ratio = cpuid_ecx(0x80000007);
+
+ if (rdmsrl_safe(MSR_F15H_CU_MAX_PWR_ACCUMULATOR, &max_cu_acc_power)) {
+ pr_err("Failed to read max compute unit power accumulator MSR\n");
+ return -ENODEV;
+ }
+
+ cpu_notifier_register_begin();
+
+ /* Choose one online core of each compute unit. */
+ for_each_online_cpu(cpu) {
+ target = cpumask_first(topology_sibling_cpumask(cpu));
+ if (!cpumask_test_cpu(target, &cpu_mask))
+ cpumask_set_cpu(target, &cpu_mask);
+ }
+
+ ret = perf_pmu_register(&pmu_class, "power", -1);
+ if (WARN_ON(ret)) {
+ pr_warn("AMD Power PMU registration failed\n");
+ goto out;
+ }
+
+ __register_cpu_notifier(&power_cpu_notifier_nb);
+
+ pr_info("AMD Power PMU detected\n");
+
+out:
+ cpu_notifier_register_done();
+
+ return ret;
+}
+module_init(amd_power_pmu_init);
+
+static void __exit amd_power_pmu_exit(void)
+{
+ cpu_notifier_register_begin();
+ __unregister_cpu_notifier(&power_cpu_notifier_nb);
+ cpu_notifier_register_done();
+
+ perf_pmu_unregister(&pmu_class);
+}
+module_exit(amd_power_pmu_exit);
+
+MODULE_AUTHOR("Huang Rui <ray.huang@amd.com>");
+MODULE_DESCRIPTION("AMD Processor Power Reporting Mechanism");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 5e830d0c95c9..041e442a3e28 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1602,8 +1602,7 @@ __init struct attribute **merge_attr(struct attribute **a, struct attribute **b)
return new;
}
-ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
- char *page)
+ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page)
{
struct perf_pmu_events_attr *pmu_attr = \
container_of(attr, struct perf_pmu_events_attr, attr);
@@ -1615,6 +1614,7 @@ ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
return x86_pmu.events_sysfs_show(page, config);
}
+EXPORT_SYMBOL_GPL(events_sysfs_show);
EVENT_ATTR(cpu-cycles, CPU_CYCLES );
EVENT_ATTR(instructions, INSTRUCTIONS );
@@ -2194,11 +2194,11 @@ static int backtrace_stack(void *data, char *name)
return 0;
}
-static void backtrace_address(void *data, unsigned long addr, int reliable)
+static int backtrace_address(void *data, unsigned long addr, int reliable)
{
struct perf_callchain_entry *entry = data;
- perf_callchain_store(entry, addr);
+ return perf_callchain_store(entry, addr);
}
static const struct stacktrace_ops backtrace_ops = {
diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c
index 93cb412a5579..7b5fd811ef45 100644
--- a/arch/x86/events/intel/cqm.c
+++ b/arch/x86/events/intel/cqm.c
@@ -13,8 +13,16 @@
#define MSR_IA32_QM_CTR 0x0c8e
#define MSR_IA32_QM_EVTSEL 0x0c8d
+#define MBM_CNTR_WIDTH 24
+/*
+ * Guaranteed time in ms as per SDM where MBM counters will not overflow.
+ */
+#define MBM_CTR_OVERFLOW_TIME 1000
+
static u32 cqm_max_rmid = -1;
static unsigned int cqm_l3_scale; /* supposedly cacheline size */
+static bool cqm_enabled, mbm_enabled;
+unsigned int mbm_socket_max;
/**
* struct intel_pqr_state - State cache for the PQR MSR
@@ -42,8 +50,37 @@ struct intel_pqr_state {
* interrupts disabled, which is sufficient for the protection.
*/
static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
+static struct hrtimer *mbm_timers;
+/**
+ * struct sample - mbm event's (local or total) data
+ * @total_bytes #bytes since we began monitoring
+ * @prev_msr previous value of MSR
+ */
+struct sample {
+ u64 total_bytes;
+ u64 prev_msr;
+};
/*
+ * samples profiled for total memory bandwidth type events
+ */
+static struct sample *mbm_total;
+/*
+ * samples profiled for local memory bandwidth type events
+ */
+static struct sample *mbm_local;
+
+#define pkg_id topology_physical_package_id(smp_processor_id())
+/*
+ * rmid_2_index returns the index for the rmid in mbm_local/mbm_total array.
+ * mbm_total[] and mbm_local[] are linearly indexed by socket# * max number of
+ * rmids per socket, an example is given below
+ * RMID1 of Socket0: vrmid = 1
+ * RMID1 of Socket1: vrmid = 1 * (cqm_max_rmid + 1) + 1
+ * RMID1 of Socket2: vrmid = 2 * (cqm_max_rmid + 1) + 1
+ */
+#define rmid_2_index(rmid) ((pkg_id * (cqm_max_rmid + 1)) + rmid)
+/*
* Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
* Also protects event->hw.cqm_rmid
*
@@ -65,9 +102,13 @@ static cpumask_t cqm_cpumask;
#define RMID_VAL_ERROR (1ULL << 63)
#define RMID_VAL_UNAVAIL (1ULL << 62)
-#define QOS_L3_OCCUP_EVENT_ID (1 << 0)
-
-#define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID
+/*
+ * Event IDs are used to program IA32_QM_EVTSEL before reading event
+ * counter from IA32_QM_CTR
+ */
+#define QOS_L3_OCCUP_EVENT_ID 0x01
+#define QOS_MBM_TOTAL_EVENT_ID 0x02
+#define QOS_MBM_LOCAL_EVENT_ID 0x03
/*
* This is central to the rotation algorithm in __intel_cqm_rmid_rotate().
@@ -211,6 +252,21 @@ static void __put_rmid(u32 rmid)
list_add_tail(&entry->list, &cqm_rmid_limbo_lru);
}
+static void cqm_cleanup(void)
+{
+ int i;
+
+ if (!cqm_rmid_ptrs)
+ return;
+
+ for (i = 0; i < cqm_max_rmid; i++)
+ kfree(cqm_rmid_ptrs[i]);
+
+ kfree(cqm_rmid_ptrs);
+ cqm_rmid_ptrs = NULL;
+ cqm_enabled = false;
+}
+
static int intel_cqm_setup_rmid_cache(void)
{
struct cqm_rmid_entry *entry;
@@ -218,7 +274,7 @@ static int intel_cqm_setup_rmid_cache(void)
int r = 0;
nr_rmids = cqm_max_rmid + 1;
- cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) *
+ cqm_rmid_ptrs = kzalloc(sizeof(struct cqm_rmid_entry *) *
nr_rmids, GFP_KERNEL);
if (!cqm_rmid_ptrs)
return -ENOMEM;
@@ -249,11 +305,9 @@ static int intel_cqm_setup_rmid_cache(void)
mutex_unlock(&cache_mutex);
return 0;
-fail:
- while (r--)
- kfree(cqm_rmid_ptrs[r]);
- kfree(cqm_rmid_ptrs);
+fail:
+ cqm_cleanup();
return -ENOMEM;
}
@@ -281,9 +335,13 @@ static bool __match_event(struct perf_event *a, struct perf_event *b)
/*
* Events that target same task are placed into the same cache group.
+ * Mark it as a multi event group, so that we update ->count
+ * for every event rather than just the group leader later.
*/
- if (a->hw.target == b->hw.target)
+ if (a->hw.target == b->hw.target) {
+ b->hw.is_group_event = true;
return true;
+ }
/*
* Are we an inherited event?
@@ -392,10 +450,26 @@ static bool __conflict_event(struct perf_event *a, struct perf_event *b)
struct rmid_read {
u32 rmid;
+ u32 evt_type;
atomic64_t value;
};
static void __intel_cqm_event_count(void *info);
+static void init_mbm_sample(u32 rmid, u32 evt_type);
+static void __intel_mbm_event_count(void *info);
+
+static bool is_mbm_event(int e)
+{
+ return (e >= QOS_MBM_TOTAL_EVENT_ID && e <= QOS_MBM_LOCAL_EVENT_ID);
+}
+
+static void cqm_mask_call(struct rmid_read *rr)
+{
+ if (is_mbm_event(rr->evt_type))
+ on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_count, rr, 1);
+ else
+ on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, rr, 1);
+}
/*
* Exchange the RMID of a group of events.
@@ -413,12 +487,12 @@ static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid)
*/
if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) {
struct rmid_read rr = {
- .value = ATOMIC64_INIT(0),
.rmid = old_rmid,
+ .evt_type = group->attr.config,
+ .value = ATOMIC64_INIT(0),
};
- on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count,
- &rr, 1);
+ cqm_mask_call(&rr);
local64_set(&group->count, atomic64_read(&rr.value));
}
@@ -430,6 +504,22 @@ static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid)
raw_spin_unlock_irq(&cache_lock);
+ /*
+ * If the allocation is for mbm, init the mbm stats.
+ * Need to check if each event in the group is mbm event
+ * because there could be multiple type of events in the same group.
+ */
+ if (__rmid_valid(rmid)) {
+ event = group;
+ if (is_mbm_event(event->attr.config))
+ init_mbm_sample(rmid, event->attr.config);
+
+ list_for_each_entry(event, head, hw.cqm_group_entry) {
+ if (is_mbm_event(event->attr.config))
+ init_mbm_sample(rmid, event->attr.config);
+ }
+ }
+
return old_rmid;
}
@@ -837,6 +927,72 @@ static void intel_cqm_rmid_rotate(struct work_struct *work)
schedule_delayed_work(&intel_cqm_rmid_work, delay);
}
+static u64 update_sample(unsigned int rmid, u32 evt_type, int first)
+{
+ struct sample *mbm_current;
+ u32 vrmid = rmid_2_index(rmid);
+ u64 val, bytes, shift;
+ u32 eventid;
+
+ if (evt_type == QOS_MBM_LOCAL_EVENT_ID) {
+ mbm_current = &mbm_local[vrmid];
+ eventid = QOS_MBM_LOCAL_EVENT_ID;
+ } else {
+ mbm_current = &mbm_total[vrmid];
+ eventid = QOS_MBM_TOTAL_EVENT_ID;
+ }
+
+ wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
+ rdmsrl(MSR_IA32_QM_CTR, val);
+ if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+ return mbm_current->total_bytes;
+
+ if (first) {
+ mbm_current->prev_msr = val;
+ mbm_current->total_bytes = 0;
+ return mbm_current->total_bytes;
+ }
+
+ /*
+ * The h/w guarantees that counters will not overflow
+ * so long as we poll them at least once per second.
+ */
+ shift = 64 - MBM_CNTR_WIDTH;
+ bytes = (val << shift) - (mbm_current->prev_msr << shift);
+ bytes >>= shift;
+
+ bytes *= cqm_l3_scale;
+
+ mbm_current->total_bytes += bytes;
+ mbm_current->prev_msr = val;
+
+ return mbm_current->total_bytes;
+}
+
+static u64 rmid_read_mbm(unsigned int rmid, u32 evt_type)
+{
+ return update_sample(rmid, evt_type, 0);
+}
+
+static void __intel_mbm_event_init(void *info)
+{
+ struct rmid_read *rr = info;
+
+ update_sample(rr->rmid, rr->evt_type, 1);
+}
+
+static void init_mbm_sample(u32 rmid, u32 evt_type)
+{
+ struct rmid_read rr = {
+ .rmid = rmid,
+ .evt_type = evt_type,
+ .value = ATOMIC64_INIT(0),
+ };
+
+ /* on each socket, init sample */
+ on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_init, &rr, 1);
+}
+
/*
* Find a group and setup RMID.
*
@@ -849,6 +1005,7 @@ static void intel_cqm_setup_event(struct perf_event *event,
bool conflict = false;
u32 rmid;
+ event->hw.is_group_event = false;
list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
rmid = iter->hw.cqm_rmid;
@@ -856,6 +1013,8 @@ static void intel_cqm_setup_event(struct perf_event *event,
/* All tasks in a group share an RMID */
event->hw.cqm_rmid = rmid;
*group = iter;
+ if (is_mbm_event(event->attr.config) && __rmid_valid(rmid))
+ init_mbm_sample(rmid, event->attr.config);
return;
}
@@ -872,6 +1031,9 @@ static void intel_cqm_setup_event(struct perf_event *event,
else
rmid = __get_rmid();
+ if (is_mbm_event(event->attr.config) && __rmid_valid(rmid))
+ init_mbm_sample(rmid, event->attr.config);
+
event->hw.cqm_rmid = rmid;
}
@@ -893,7 +1055,10 @@ static void intel_cqm_event_read(struct perf_event *event)
if (!__rmid_valid(rmid))
goto out;
- val = __rmid_read(rmid);
+ if (is_mbm_event(event->attr.config))
+ val = rmid_read_mbm(rmid, event->attr.config);
+ else
+ val = __rmid_read(rmid);
/*
* Ignore this reading on error states and do not update the value.
@@ -924,10 +1089,100 @@ static inline bool cqm_group_leader(struct perf_event *event)
return !list_empty(&event->hw.cqm_groups_entry);
}
+static void __intel_mbm_event_count(void *info)
+{
+ struct rmid_read *rr = info;
+ u64 val;
+
+ val = rmid_read_mbm(rr->rmid, rr->evt_type);
+ if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+ return;
+ atomic64_add(val, &rr->value);
+}
+
+static enum hrtimer_restart mbm_hrtimer_handle(struct hrtimer *hrtimer)
+{
+ struct perf_event *iter, *iter1;
+ int ret = HRTIMER_RESTART;
+ struct list_head *head;
+ unsigned long flags;
+ u32 grp_rmid;
+
+ /*
+ * Need to cache_lock as the timer Event Select MSR reads
+ * can race with the mbm/cqm count() and mbm_init() reads.
+ */
+ raw_spin_lock_irqsave(&cache_lock, flags);
+
+ if (list_empty(&cache_groups)) {
+ ret = HRTIMER_NORESTART;
+ goto out;
+ }
+
+ list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
+ grp_rmid = iter->hw.cqm_rmid;
+ if (!__rmid_valid(grp_rmid))
+ continue;
+ if (is_mbm_event(iter->attr.config))
+ update_sample(grp_rmid, iter->attr.config, 0);
+
+ head = &iter->hw.cqm_group_entry;
+ if (list_empty(head))
+ continue;
+ list_for_each_entry(iter1, head, hw.cqm_group_entry) {
+ if (!iter1->hw.is_group_event)
+ break;
+ if (is_mbm_event(iter1->attr.config))
+ update_sample(iter1->hw.cqm_rmid,
+ iter1->attr.config, 0);
+ }
+ }
+
+ hrtimer_forward_now(hrtimer, ms_to_ktime(MBM_CTR_OVERFLOW_TIME));
+out:
+ raw_spin_unlock_irqrestore(&cache_lock, flags);
+
+ return ret;
+}
+
+static void __mbm_start_timer(void *info)
+{
+ hrtimer_start(&mbm_timers[pkg_id], ms_to_ktime(MBM_CTR_OVERFLOW_TIME),
+ HRTIMER_MODE_REL_PINNED);
+}
+
+static void __mbm_stop_timer(void *info)
+{
+ hrtimer_cancel(&mbm_timers[pkg_id]);
+}
+
+static void mbm_start_timers(void)
+{
+ on_each_cpu_mask(&cqm_cpumask, __mbm_start_timer, NULL, 1);
+}
+
+static void mbm_stop_timers(void)
+{
+ on_each_cpu_mask(&cqm_cpumask, __mbm_stop_timer, NULL, 1);
+}
+
+static void mbm_hrtimer_init(void)
+{
+ struct hrtimer *hr;
+ int i;
+
+ for (i = 0; i < mbm_socket_max; i++) {
+ hr = &mbm_timers[i];
+ hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hr->function = mbm_hrtimer_handle;
+ }
+}
+
static u64 intel_cqm_event_count(struct perf_event *event)
{
unsigned long flags;
struct rmid_read rr = {
+ .evt_type = event->attr.config,
.value = ATOMIC64_INIT(0),
};
@@ -940,7 +1195,9 @@ static u64 intel_cqm_event_count(struct perf_event *event)
return __perf_event_count(event);
/*
- * Only the group leader gets to report values. This stops us
+ * Only the group leader gets to report values except in case of
+ * multiple events in the same group, we still need to read the
+ * other events.This stops us
* reporting duplicate values to userspace, and gives us a clear
* rule for which task gets to report the values.
*
@@ -948,7 +1205,7 @@ static u64 intel_cqm_event_count(struct perf_event *event)
* specific packages - we forfeit that ability when we create
* task events.
*/
- if (!cqm_group_leader(event))
+ if (!cqm_group_leader(event) && !event->hw.is_group_event)
return 0;
/*
@@ -975,7 +1232,7 @@ static u64 intel_cqm_event_count(struct perf_event *event)
if (!__rmid_valid(rr.rmid))
goto out;
- on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
+ cqm_mask_call(&rr);
raw_spin_lock_irqsave(&cache_lock, flags);
if (event->hw.cqm_rmid == rr.rmid)
@@ -1046,8 +1303,14 @@ static int intel_cqm_event_add(struct perf_event *event, int mode)
static void intel_cqm_event_destroy(struct perf_event *event)
{
struct perf_event *group_other = NULL;
+ unsigned long flags;
mutex_lock(&cache_mutex);
+ /*
+ * Hold the cache_lock as mbm timer handlers could be
+ * scanning the list of events.
+ */
+ raw_spin_lock_irqsave(&cache_lock, flags);
/*
* If there's another event in this group...
@@ -1079,6 +1342,14 @@ static void intel_cqm_event_destroy(struct perf_event *event)
}
}
+ raw_spin_unlock_irqrestore(&cache_lock, flags);
+
+ /*
+ * Stop the mbm overflow timers when the last event is destroyed.
+ */
+ if (mbm_enabled && list_empty(&cache_groups))
+ mbm_stop_timers();
+
mutex_unlock(&cache_mutex);
}
@@ -1086,11 +1357,13 @@ static int intel_cqm_event_init(struct perf_event *event)
{
struct perf_event *group = NULL;
bool rotate = false;
+ unsigned long flags;
if (event->attr.type != intel_cqm_pmu.type)
return -ENOENT;
- if (event->attr.config & ~QOS_EVENT_MASK)
+ if ((event->attr.config < QOS_L3_OCCUP_EVENT_ID) ||
+ (event->attr.config > QOS_MBM_LOCAL_EVENT_ID))
return -EINVAL;
/* unsupported modes and filters */
@@ -1110,9 +1383,21 @@ static int intel_cqm_event_init(struct perf_event *event)
mutex_lock(&cache_mutex);
+ /*
+ * Start the mbm overflow timers when the first event is created.
+ */
+ if (mbm_enabled && list_empty(&cache_groups))
+ mbm_start_timers();
+
/* Will also set rmid */
intel_cqm_setup_event(event, &group);
+ /*
+ * Hold the cache_lock as mbm timer handlers be
+ * scanning the list of events.
+ */
+ raw_spin_lock_irqsave(&cache_lock, flags);
+
if (group) {
list_add_tail(&event->hw.cqm_group_entry,
&group->hw.cqm_group_entry);
@@ -1131,6 +1416,7 @@ static int intel_cqm_event_init(struct perf_event *event)
rotate = true;
}
+ raw_spin_unlock_irqrestore(&cache_lock, flags);
mutex_unlock(&cache_mutex);
if (rotate)
@@ -1145,6 +1431,16 @@ EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");
+EVENT_ATTR_STR(total_bytes, intel_cqm_total_bytes, "event=0x02");
+EVENT_ATTR_STR(total_bytes.per-pkg, intel_cqm_total_bytes_pkg, "1");
+EVENT_ATTR_STR(total_bytes.unit, intel_cqm_total_bytes_unit, "MB");
+EVENT_ATTR_STR(total_bytes.scale, intel_cqm_total_bytes_scale, "1e-6");
+
+EVENT_ATTR_STR(local_bytes, intel_cqm_local_bytes, "event=0x03");
+EVENT_ATTR_STR(local_bytes.per-pkg, intel_cqm_local_bytes_pkg, "1");
+EVENT_ATTR_STR(local_bytes.unit, intel_cqm_local_bytes_unit, "MB");
+EVENT_ATTR_STR(local_bytes.scale, intel_cqm_local_bytes_scale, "1e-6");
+
static struct attribute *intel_cqm_events_attr[] = {
EVENT_PTR(intel_cqm_llc),
EVENT_PTR(intel_cqm_llc_pkg),
@@ -1154,9 +1450,38 @@ static struct attribute *intel_cqm_events_attr[] = {
NULL,
};
+static struct attribute *intel_mbm_events_attr[] = {
+ EVENT_PTR(intel_cqm_total_bytes),
+ EVENT_PTR(intel_cqm_local_bytes),
+ EVENT_PTR(intel_cqm_total_bytes_pkg),
+ EVENT_PTR(intel_cqm_local_bytes_pkg),
+ EVENT_PTR(intel_cqm_total_bytes_unit),
+ EVENT_PTR(intel_cqm_local_bytes_unit),
+ EVENT_PTR(intel_cqm_total_bytes_scale),
+ EVENT_PTR(intel_cqm_local_bytes_scale),
+ NULL,
+};
+
+static struct attribute *intel_cmt_mbm_events_attr[] = {
+ EVENT_PTR(intel_cqm_llc),
+ EVENT_PTR(intel_cqm_total_bytes),
+ EVENT_PTR(intel_cqm_local_bytes),
+ EVENT_PTR(intel_cqm_llc_pkg),
+ EVENT_PTR(intel_cqm_total_bytes_pkg),
+ EVENT_PTR(intel_cqm_local_bytes_pkg),
+ EVENT_PTR(intel_cqm_llc_unit),
+ EVENT_PTR(intel_cqm_total_bytes_unit),
+ EVENT_PTR(intel_cqm_local_bytes_unit),
+ EVENT_PTR(intel_cqm_llc_scale),
+ EVENT_PTR(intel_cqm_total_bytes_scale),
+ EVENT_PTR(intel_cqm_local_bytes_scale),
+ EVENT_PTR(intel_cqm_llc_snapshot),
+ NULL,
+};
+
static struct attribute_group intel_cqm_events_group = {
.name = "events",
- .attrs = intel_cqm_events_attr,
+ .attrs = NULL,
};
PMU_FORMAT_ATTR(event, "config:0-7");
@@ -1303,12 +1628,70 @@ static const struct x86_cpu_id intel_cqm_match[] = {
{}
};
+static void mbm_cleanup(void)
+{
+ if (!mbm_enabled)
+ return;
+
+ kfree(mbm_local);
+ kfree(mbm_total);
+ mbm_enabled = false;
+}
+
+static const struct x86_cpu_id intel_mbm_local_match[] = {
+ { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_LOCAL },
+ {}
+};
+
+static const struct x86_cpu_id intel_mbm_total_match[] = {
+ { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_TOTAL },
+ {}
+};
+
+static int intel_mbm_init(void)
+{
+ int ret = 0, array_size, maxid = cqm_max_rmid + 1;
+
+ mbm_socket_max = topology_max_packages();
+ array_size = sizeof(struct sample) * maxid * mbm_socket_max;
+ mbm_local = kmalloc(array_size, GFP_KERNEL);
+ if (!mbm_local)
+ return -ENOMEM;
+
+ mbm_total = kmalloc(array_size, GFP_KERNEL);
+ if (!mbm_total) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ array_size = sizeof(struct hrtimer) * mbm_socket_max;
+ mbm_timers = kmalloc(array_size, GFP_KERNEL);
+ if (!mbm_timers) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ mbm_hrtimer_init();
+
+out:
+ if (ret)
+ mbm_cleanup();
+
+ return ret;
+}
+
static int __init intel_cqm_init(void)
{
- char *str, scale[20];
+ char *str = NULL, scale[20];
int i, cpu, ret;
- if (!x86_match_cpu(intel_cqm_match))
+ if (x86_match_cpu(intel_cqm_match))
+ cqm_enabled = true;
+
+ if (x86_match_cpu(intel_mbm_local_match) &&
+ x86_match_cpu(intel_mbm_total_match))
+ mbm_enabled = true;
+
+ if (!cqm_enabled && !mbm_enabled)
return -ENODEV;
cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
@@ -1365,16 +1748,41 @@ static int __init intel_cqm_init(void)
cqm_pick_event_reader(i);
}
- __perf_cpu_notifier(intel_cqm_cpu_notifier);
+ if (mbm_enabled)
+ ret = intel_mbm_init();
+ if (ret && !cqm_enabled)
+ goto out;
+
+ if (cqm_enabled && mbm_enabled)
+ intel_cqm_events_group.attrs = intel_cmt_mbm_events_attr;
+ else if (!cqm_enabled && mbm_enabled)
+ intel_cqm_events_group.attrs = intel_mbm_events_attr;
+ else if (cqm_enabled && !mbm_enabled)
+ intel_cqm_events_group.attrs = intel_cqm_events_attr;
ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
- if (ret)
+ if (ret) {
pr_err("Intel CQM perf registration failed: %d\n", ret);
- else
+ goto out;
+ }
+
+ if (cqm_enabled)
pr_info("Intel CQM monitoring enabled\n");
+ if (mbm_enabled)
+ pr_info("Intel MBM enabled\n");
+ /*
+ * Register the hot cpu notifier once we are sure cqm
+ * is enabled to avoid notifier leak.
+ */
+ __perf_cpu_notifier(intel_cqm_cpu_notifier);
out:
cpu_notifier_register_done();
+ if (ret) {
+ kfree(str);
+ cqm_cleanup();
+ mbm_cleanup();
+ }
return ret;
}
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index ce7211a07c0b..8584b90d8e0b 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -570,11 +570,12 @@ int intel_pmu_drain_bts_buffer(void)
* We will overwrite the from and to address before we output
* the sample.
*/
+ rcu_read_lock();
perf_prepare_sample(&header, &data, event, &regs);
if (perf_output_begin(&handle, event, header.size *
(top - base - skip)))
- return 1;
+ goto unlock;
for (at = base; at < top; at++) {
/* Filter out any records that contain kernel addresses. */
@@ -593,6 +594,8 @@ int intel_pmu_drain_bts_buffer(void)
/* There's new data available. */
event->hw.interrupts++;
event->pending_kill = POLL_IN;
+unlock:
+ rcu_read_unlock();
return 1;
}
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 69dd11887dd1..6c3b7c1780c9 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -649,7 +649,7 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event)
/*
* return the type of control flow change at address "from"
- * intruction is not necessarily a branch (in case of interrupt).
+ * instruction is not necessarily a branch (in case of interrupt).
*
* The branch type returned also includes the priv level of the
* target of the control flow change (X86_BR_USER, X86_BR_KERNEL).
diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c
index b834a3f55a01..70c93f9b03ac 100644
--- a/arch/x86/events/intel/rapl.c
+++ b/arch/x86/events/intel/rapl.c
@@ -711,6 +711,7 @@ static int __init rapl_pmu_init(void)
rapl_pmu_events_group.attrs = rapl_events_cln_attr;
break;
case 63: /* Haswell-Server */
+ case 79: /* Broadwell-Server */
apply_quirk = true;
rapl_cntr_mask = RAPL_IDX_SRV;
rapl_pmu_events_group.attrs = rapl_events_srv_attr;
@@ -718,6 +719,7 @@ static int __init rapl_pmu_init(void)
case 60: /* Haswell */
case 69: /* Haswell-Celeron */
case 61: /* Broadwell */
+ case 71: /* Broadwell-H */
rapl_cntr_mask = RAPL_IDX_HSW;
rapl_pmu_events_group.attrs = rapl_events_hsw_attr;
break;
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 93f6bd9bf761..ab2bcaaebe38 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -46,7 +46,6 @@
(SNBEP_PMON_CTL_EV_SEL_MASK | \
SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
SNBEP_PMON_CTL_EDGE_DET | \
- SNBEP_PMON_CTL_EV_SEL_EXT | \
SNBEP_PMON_CTL_INVERT | \
SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
@@ -148,7 +147,6 @@
/* IVBEP PCU */
#define IVBEP_PCU_MSR_PMON_RAW_EVENT_MASK \
(SNBEP_PMON_CTL_EV_SEL_MASK | \
- SNBEP_PMON_CTL_EV_SEL_EXT | \
SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
SNBEP_PMON_CTL_EDGE_DET | \
SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
@@ -258,7 +256,6 @@
SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
SNBEP_PMON_CTL_EDGE_DET | \
SNBEP_CBO_PMON_CTL_TID_EN | \
- SNBEP_PMON_CTL_EV_SEL_EXT | \
SNBEP_PMON_CTL_INVERT | \
KNL_PCU_MSR_PMON_CTL_TRESH_MASK | \
SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
@@ -472,7 +469,7 @@ static struct attribute *snbep_uncore_cbox_formats_attr[] = {
};
static struct attribute *snbep_uncore_pcu_formats_attr[] = {
- &format_attr_event_ext.attr,
+ &format_attr_event.attr,
&format_attr_occ_sel.attr,
&format_attr_edge.attr,
&format_attr_inv.attr,
@@ -1313,7 +1310,7 @@ static struct attribute *ivbep_uncore_cbox_formats_attr[] = {
};
static struct attribute *ivbep_uncore_pcu_formats_attr[] = {
- &format_attr_event_ext.attr,
+ &format_attr_event.attr,
&format_attr_occ_sel.attr,
&format_attr_edge.attr,
&format_attr_thresh5.attr,
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 68155cafa8a1..ba6ef18528c9 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -272,7 +272,7 @@ struct cpu_hw_events {
* events to select for counter rescheduling.
*
* Care must be taken as the rescheduling algorithm is O(n!) which
- * will increase scheduling cycles for an over-commited system
+ * will increase scheduling cycles for an over-committed system
* dramatically. The number of such EVENT_CONSTRAINT_OVERLAP() macros
* and its counter masks must be kept at a minimum.
*/
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 0899cfc8dfe8..98f25bbafac4 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -643,8 +643,8 @@ static inline void entering_irq(void)
static inline void entering_ack_irq(void)
{
- ack_APIC_irq();
entering_irq();
+ ack_APIC_irq();
}
static inline void ipi_entering_ack_irq(void)
diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h
index f50de6951738..532f85e6651f 100644
--- a/arch/x86/include/asm/checksum_32.h
+++ b/arch/x86/include/asm/checksum_32.h
@@ -112,8 +112,7 @@ static inline __sum16 csum_fold(__wsum sum)
}
static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
- unsigned short len,
- unsigned short proto,
+ __u32 len, __u8 proto,
__wsum sum)
{
asm("addl %1, %0 ;\n"
@@ -131,8 +130,7 @@ static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
* returns a 16-bit checksum, already complemented
*/
static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
- unsigned short len,
- unsigned short proto,
+ __u32 len, __u8 proto,
__wsum sum)
{
return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
@@ -151,8 +149,7 @@ static inline __sum16 ip_compute_csum(const void *buff, int len)
#define _HAVE_ARCH_IPV6_CSUM
static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
const struct in6_addr *daddr,
- __u32 len, unsigned short proto,
- __wsum sum)
+ __u32 len, __u8 proto, __wsum sum)
{
asm("addl 0(%1), %0 ;\n"
"adcl 4(%1), %0 ;\n"
diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h
index cd00e1774491..c020ee75dce7 100644
--- a/arch/x86/include/asm/checksum_64.h
+++ b/arch/x86/include/asm/checksum_64.h
@@ -84,8 +84,8 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
* 32bit unfolded.
*/
static inline __wsum
-csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len,
- unsigned short proto, __wsum sum)
+csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
+ __u8 proto, __wsum sum)
{
asm(" addl %1, %0\n"
" adcl %2, %0\n"
@@ -110,8 +110,8 @@ csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len,
* complemented and ready to be filled in.
*/
static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
- unsigned short len,
- unsigned short proto, __wsum sum)
+ __u32 len, __u8 proto,
+ __wsum sum)
{
return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
}
@@ -177,7 +177,7 @@ struct in6_addr;
#define _HAVE_ARCH_IPV6_CSUM 1
extern __sum16
csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr,
- __u32 len, unsigned short proto, __wsum sum);
+ __u32 len, __u8 proto, __wsum sum);
static inline unsigned add32_with_carry(unsigned a, unsigned b)
{
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index acdee09228b3..ebb102e1bbc7 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -316,9 +316,10 @@ static inline bool is_x32_task(void)
return false;
}
-static inline bool is_compat_task(void)
+static inline bool in_compat_syscall(void)
{
return is_ia32_task() || is_x32_task();
}
+#define in_compat_syscall in_compat_syscall /* override the generic impl */
#endif /* _ASM_X86_COMPAT_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 68e4e8258b84..3636ec06c887 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -26,6 +26,7 @@ enum cpuid_leafs
CPUID_8000_0008_EBX,
CPUID_6_EAX,
CPUID_8000_000A_EDX,
+ CPUID_7_ECX,
};
#ifdef CONFIG_X86_FEATURE_NAMES
@@ -48,28 +49,42 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
test_bit(bit, (unsigned long *)((c)->x86_capability))
#define REQUIRED_MASK_BIT_SET(bit) \
- ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) || \
- (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) || \
- (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) || \
- (((bit)>>5)==3 && (1UL<<((bit)&31) & REQUIRED_MASK3)) || \
- (((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4)) || \
- (((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5)) || \
- (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \
- (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) || \
- (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) || \
- (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) )
+ ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0 )) || \
+ (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1 )) || \
+ (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2 )) || \
+ (((bit)>>5)==3 && (1UL<<((bit)&31) & REQUIRED_MASK3 )) || \
+ (((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4 )) || \
+ (((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5 )) || \
+ (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6 )) || \
+ (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7 )) || \
+ (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8 )) || \
+ (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9 )) || \
+ (((bit)>>5)==10 && (1UL<<((bit)&31) & REQUIRED_MASK10)) || \
+ (((bit)>>5)==11 && (1UL<<((bit)&31) & REQUIRED_MASK11)) || \
+ (((bit)>>5)==12 && (1UL<<((bit)&31) & REQUIRED_MASK12)) || \
+ (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK13)) || \
+ (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK14)) || \
+ (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK15)) || \
+ (((bit)>>5)==14 && (1UL<<((bit)&31) & REQUIRED_MASK16)) )
#define DISABLED_MASK_BIT_SET(bit) \
- ( (((bit)>>5)==0 && (1UL<<((bit)&31) & DISABLED_MASK0)) || \
- (((bit)>>5)==1 && (1UL<<((bit)&31) & DISABLED_MASK1)) || \
- (((bit)>>5)==2 && (1UL<<((bit)&31) & DISABLED_MASK2)) || \
- (((bit)>>5)==3 && (1UL<<((bit)&31) & DISABLED_MASK3)) || \
- (((bit)>>5)==4 && (1UL<<((bit)&31) & DISABLED_MASK4)) || \
- (((bit)>>5)==5 && (1UL<<((bit)&31) & DISABLED_MASK5)) || \
- (((bit)>>5)==6 && (1UL<<((bit)&31) & DISABLED_MASK6)) || \
- (((bit)>>5)==7 && (1UL<<((bit)&31) & DISABLED_MASK7)) || \
- (((bit)>>5)==8 && (1UL<<((bit)&31) & DISABLED_MASK8)) || \
- (((bit)>>5)==9 && (1UL<<((bit)&31) & DISABLED_MASK9)) )
+ ( (((bit)>>5)==0 && (1UL<<((bit)&31) & DISABLED_MASK0 )) || \
+ (((bit)>>5)==1 && (1UL<<((bit)&31) & DISABLED_MASK1 )) || \
+ (((bit)>>5)==2 && (1UL<<((bit)&31) & DISABLED_MASK2 )) || \
+ (((bit)>>5)==3 && (1UL<<((bit)&31) & DISABLED_MASK3 )) || \
+ (((bit)>>5)==4 && (1UL<<((bit)&31) & DISABLED_MASK4 )) || \
+ (((bit)>>5)==5 && (1UL<<((bit)&31) & DISABLED_MASK5 )) || \
+ (((bit)>>5)==6 && (1UL<<((bit)&31) & DISABLED_MASK6 )) || \
+ (((bit)>>5)==7 && (1UL<<((bit)&31) & DISABLED_MASK7 )) || \
+ (((bit)>>5)==8 && (1UL<<((bit)&31) & DISABLED_MASK8 )) || \
+ (((bit)>>5)==9 && (1UL<<((bit)&31) & DISABLED_MASK9 )) || \
+ (((bit)>>5)==10 && (1UL<<((bit)&31) & DISABLED_MASK10)) || \
+ (((bit)>>5)==11 && (1UL<<((bit)&31) & DISABLED_MASK11)) || \
+ (((bit)>>5)==12 && (1UL<<((bit)&31) & DISABLED_MASK12)) || \
+ (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK13)) || \
+ (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK14)) || \
+ (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK15)) || \
+ (((bit)>>5)==14 && (1UL<<((bit)&31) & DISABLED_MASK16)) )
#define cpu_has(c, bit) \
(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 074b7604bd51..8f9afefd2dc5 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -12,7 +12,7 @@
/*
* Defines x86 CPU feature bits
*/
-#define NCAPINTS 16 /* N 32-bit words worth of info */
+#define NCAPINTS 17 /* N 32-bit words worth of info */
#define NBUGINTS 1 /* N 32-bit bug flags */
/*
@@ -94,7 +94,7 @@
#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */
#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */
#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */
-/* free, was #define X86_FEATURE_11AP ( 3*32+19) * "" Bad local APIC aka 11AP */
+#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */
#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */
#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */
#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */
@@ -245,6 +245,8 @@
/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */
#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */
+#define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */
+#define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */
/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */
#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */
@@ -274,6 +276,10 @@
#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */
#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */
+/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */
+#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */
+#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */
+
/*
* BUG word(s)
*/
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index f226df064660..39343be7d4f4 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -28,6 +28,14 @@
# define DISABLE_CENTAUR_MCR 0
#endif /* CONFIG_X86_64 */
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+# define DISABLE_PKU (1<<(X86_FEATURE_PKU))
+# define DISABLE_OSPKE (1<<(X86_FEATURE_OSPKE))
+#else
+# define DISABLE_PKU 0
+# define DISABLE_OSPKE 0
+#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
+
/*
* Make sure to add features to the correct mask
*/
@@ -41,5 +49,12 @@
#define DISABLED_MASK7 0
#define DISABLED_MASK8 0
#define DISABLED_MASK9 (DISABLE_MPX)
+#define DISABLED_MASK10 0
+#define DISABLED_MASK11 0
+#define DISABLED_MASK12 0
+#define DISABLED_MASK13 0
+#define DISABLED_MASK14 0
+#define DISABLED_MASK15 0
+#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE)
#endif /* _ASM_X86_DISABLED_FEATURES_H */
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 0010c78c4998..53748c45e488 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -3,6 +3,7 @@
#include <asm/fpu/api.h>
#include <asm/pgtable.h>
+#include <asm/tlb.h>
/*
* We map the EFI regions needed for runtime services non-contiguously,
@@ -25,6 +26,8 @@
#define EFI32_LOADER_SIGNATURE "EL32"
#define EFI64_LOADER_SIGNATURE "EL64"
+#define MAX_CMDLINE_ADDRESS UINT_MAX
+
#ifdef CONFIG_X86_32
@@ -64,6 +67,17 @@ extern u64 asmlinkage efi_call(void *fp, ...);
#define efi_call_phys(f, args...) efi_call((f), args)
+/*
+ * Scratch space used for switching the pagetable in the EFI stub
+ */
+struct efi_scratch {
+ u64 r15;
+ u64 prev_cr3;
+ pgd_t *efi_pgt;
+ bool use_pgd;
+ u64 phys_stack;
+} __packed;
+
#define efi_call_virt(f, ...) \
({ \
efi_status_t __s; \
@@ -71,7 +85,20 @@ extern u64 asmlinkage efi_call(void *fp, ...);
efi_sync_low_kernel_mappings(); \
preempt_disable(); \
__kernel_fpu_begin(); \
+ \
+ if (efi_scratch.use_pgd) { \
+ efi_scratch.prev_cr3 = read_cr3(); \
+ write_cr3((unsigned long)efi_scratch.efi_pgt); \
+ __flush_tlb_all(); \
+ } \
+ \
__s = efi_call((void *)efi.systab->runtime->f, __VA_ARGS__); \
+ \
+ if (efi_scratch.use_pgd) { \
+ write_cr3(efi_scratch.prev_cr3); \
+ __flush_tlb_all(); \
+ } \
+ \
__kernel_fpu_end(); \
preempt_enable(); \
__s; \
@@ -111,11 +138,12 @@ extern void __init efi_memory_uc(u64 addr, unsigned long size);
extern void __init efi_map_region(efi_memory_desc_t *md);
extern void __init efi_map_region_fixed(efi_memory_desc_t *md);
extern void efi_sync_low_kernel_mappings(void);
+extern int __init efi_alloc_page_tables(void);
extern int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages);
extern void __init efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages);
extern void __init old_map_region(efi_memory_desc_t *md);
extern void __init runtime_code_page_mkexec(void);
-extern void __init efi_runtime_mkexec(void);
+extern void __init efi_runtime_update_mappings(void);
extern void __init efi_dump_pagetable(void);
extern void __init efi_apply_memmap_quirks(void);
extern int __init efi_reuse_config(u64 tables, int nr_tables);
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index a2124343edf5..31ac8e6d9f36 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -25,6 +25,8 @@
extern void fpu__activate_curr(struct fpu *fpu);
extern void fpu__activate_fpstate_read(struct fpu *fpu);
extern void fpu__activate_fpstate_write(struct fpu *fpu);
+extern void fpu__current_fpstate_write_begin(void);
+extern void fpu__current_fpstate_write_end(void);
extern void fpu__save(struct fpu *fpu);
extern void fpu__restore(struct fpu *fpu);
extern int fpu__restore_sig(void __user *buf, int ia32_frame);
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 1c6f6ac52ad0..36b90bbfc69f 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -108,6 +108,8 @@ enum xfeature {
XFEATURE_OPMASK,
XFEATURE_ZMM_Hi256,
XFEATURE_Hi16_ZMM,
+ XFEATURE_PT_UNIMPLEMENTED_SO_FAR,
+ XFEATURE_PKRU,
XFEATURE_MAX,
};
@@ -120,6 +122,7 @@ enum xfeature {
#define XFEATURE_MASK_OPMASK (1 << XFEATURE_OPMASK)
#define XFEATURE_MASK_ZMM_Hi256 (1 << XFEATURE_ZMM_Hi256)
#define XFEATURE_MASK_Hi16_ZMM (1 << XFEATURE_Hi16_ZMM)
+#define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU)
#define XFEATURE_MASK_FPSSE (XFEATURE_MASK_FP | XFEATURE_MASK_SSE)
#define XFEATURE_MASK_AVX512 (XFEATURE_MASK_OPMASK \
@@ -212,6 +215,15 @@ struct avx_512_hi16_state {
struct reg_512_bit hi16_zmm[16];
} __packed;
+/*
+ * State component 9: 32-bit PKRU register. The state is
+ * 8 bytes long but only 4 bytes is used currently.
+ */
+struct pkru_state {
+ u32 pkru;
+ u32 pad;
+} __packed;
+
struct xstate_header {
u64 xfeatures;
u64 xcomp_bv;
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index f23cd8c80b1c..38951b0fcc5a 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -24,7 +24,8 @@
XFEATURE_MASK_YMM | \
XFEATURE_MASK_OPMASK | \
XFEATURE_MASK_ZMM_Hi256 | \
- XFEATURE_MASK_Hi16_ZMM)
+ XFEATURE_MASK_Hi16_ZMM | \
+ XFEATURE_MASK_PKRU)
/* Supported features which require eager state saving */
#define XFEATURE_MASK_EAGER (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 24938852db30..a4820d4df617 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -52,13 +52,13 @@ int ftrace_int3_handler(struct pt_regs *regs);
* this screws up the trace output when tracing a ia32 task.
* Instead of reporting bogus syscalls, just do not trace them.
*
- * If the user realy wants these, then they should use the
+ * If the user really wants these, then they should use the
* raw syscall tracepoints with filtering.
*/
#define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS 1
static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs)
{
- if (is_compat_task())
+ if (in_compat_syscall())
return true;
return false;
}
diff --git a/arch/x86/include/asm/gpio.h b/arch/x86/include/asm/gpio.h
deleted file mode 100644
index b3799d88ffcf..000000000000
--- a/arch/x86/include/asm/gpio.h
+++ /dev/null
@@ -1,4 +0,0 @@
-#ifndef __LINUX_GPIO_H
-#warning Include linux/gpio.h instead of asm/gpio.h
-#include <linux/gpio.h>
-#endif
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 1815b736269d..b90e1053049b 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -141,6 +141,7 @@ struct irq_alloc_info {
struct irq_cfg {
unsigned int dest_apicid;
u8 vector;
+ u8 old_vector;
};
extern struct irq_cfg *irq_cfg(unsigned int irq);
@@ -168,20 +169,6 @@ extern atomic_t irq_mis_count;
extern void elcr_set_level_irq(unsigned int irq);
-/* SMP */
-extern __visible void smp_apic_timer_interrupt(struct pt_regs *);
-extern __visible void smp_spurious_interrupt(struct pt_regs *);
-extern __visible void smp_x86_platform_ipi(struct pt_regs *);
-extern __visible void smp_error_interrupt(struct pt_regs *);
-#ifdef CONFIG_X86_IO_APIC
-extern asmlinkage void smp_irq_move_cleanup_interrupt(void);
-#endif
-#ifdef CONFIG_SMP
-extern __visible void smp_reschedule_interrupt(struct pt_regs *);
-extern __visible void smp_call_function_interrupt(struct pt_regs *);
-extern __visible void smp_call_function_single_interrupt(struct pt_regs *);
-#endif
-
extern char irq_entries_start[];
#ifdef CONFIG_TRACING
#define trace_irq_entries_start irq_entries_start
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 01c8b501cb6d..f62a9f37f79f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -84,7 +84,8 @@
| X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
| X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \
| X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \
- | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE | X86_CR4_SMAP))
+ | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE | X86_CR4_SMAP \
+ | X86_CR4_PKE))
#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
@@ -187,12 +188,14 @@ enum {
#define PFERR_USER_BIT 2
#define PFERR_RSVD_BIT 3
#define PFERR_FETCH_BIT 4
+#define PFERR_PK_BIT 5
#define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT)
#define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT)
#define PFERR_USER_MASK (1U << PFERR_USER_BIT)
#define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
#define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
+#define PFERR_PK_MASK (1U << PFERR_PK_BIT)
/* apic attention bits */
#define KVM_APIC_CHECK_VAPIC 0
@@ -335,6 +338,14 @@ struct kvm_mmu {
*/
u8 permissions[16];
+ /*
+ * The pkru_mask indicates if protection key checks are needed. It
+ * consists of 16 domains indexed by page fault error code bits [4:1],
+ * with PFEC.RSVD replaced by ACC_USER_MASK from the page tables.
+ * Each domain has 2 bits which are ANDed with AD and WD from PKRU.
+ */
+ u32 pkru_mask;
+
u64 *pae_root;
u64 *lm_root;
@@ -874,6 +885,7 @@ struct kvm_x86_ops {
void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
+ u32 (*get_pkru)(struct kvm_vcpu *vcpu);
void (*fpu_activate)(struct kvm_vcpu *vcpu);
void (*fpu_deactivate)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h
index e795f5274217..7e68f9558552 100644
--- a/arch/x86/include/asm/livepatch.h
+++ b/arch/x86/include/asm/livepatch.h
@@ -25,7 +25,6 @@
#include <linux/module.h>
#include <linux/ftrace.h>
-#ifdef CONFIG_LIVEPATCH
static inline int klp_check_compiler_support(void)
{
#ifndef CC_USING_FENTRY
@@ -40,8 +39,5 @@ static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip)
{
regs->ip = ip;
}
-#else
-#error Include linux/livepatch.h, not asm/livepatch.h
-#endif
#endif /* _ASM_X86_LIVEPATCH_H */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index bfd9b2a35a0b..84280029cafd 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -52,15 +52,15 @@ struct ldt_struct {
/*
* Used for LDT copy/destruction.
*/
-int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
-void destroy_context(struct mm_struct *mm);
+int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm);
+void destroy_context_ldt(struct mm_struct *mm);
#else /* CONFIG_MODIFY_LDT_SYSCALL */
-static inline int init_new_context(struct task_struct *tsk,
- struct mm_struct *mm)
+static inline int init_new_context_ldt(struct task_struct *tsk,
+ struct mm_struct *mm)
{
return 0;
}
-static inline void destroy_context(struct mm_struct *mm) {}
+static inline void destroy_context_ldt(struct mm_struct *mm) {}
#endif
static inline void load_mm_ldt(struct mm_struct *mm)
@@ -104,6 +104,17 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
#endif
}
+static inline int init_new_context(struct task_struct *tsk,
+ struct mm_struct *mm)
+{
+ init_new_context_ldt(tsk, mm);
+ return 0;
+}
+static inline void destroy_context(struct mm_struct *mm)
+{
+ destroy_context_ldt(mm);
+}
+
static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
@@ -275,4 +286,68 @@ static inline void arch_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
mpx_notify_unmap(mm, vma, start, end);
}
+static inline int vma_pkey(struct vm_area_struct *vma)
+{
+ u16 pkey = 0;
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+ unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 |
+ VM_PKEY_BIT2 | VM_PKEY_BIT3;
+ pkey = (vma->vm_flags & vma_pkey_mask) >> VM_PKEY_SHIFT;
+#endif
+ return pkey;
+}
+
+static inline bool __pkru_allows_pkey(u16 pkey, bool write)
+{
+ u32 pkru = read_pkru();
+
+ if (!__pkru_allows_read(pkru, pkey))
+ return false;
+ if (write && !__pkru_allows_write(pkru, pkey))
+ return false;
+
+ return true;
+}
+
+/*
+ * We only want to enforce protection keys on the current process
+ * because we effectively have no access to PKRU for other
+ * processes or any way to tell *which * PKRU in a threaded
+ * process we could use.
+ *
+ * So do not enforce things if the VMA is not from the current
+ * mm, or if we are in a kernel thread.
+ */
+static inline bool vma_is_foreign(struct vm_area_struct *vma)
+{
+ if (!current->mm)
+ return true;
+ /*
+ * Should PKRU be enforced on the access to this VMA? If
+ * the VMA is from another process, then PKRU has no
+ * relevance and should not be enforced.
+ */
+ if (current->mm != vma->vm_mm)
+ return true;
+
+ return false;
+}
+
+static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
+ bool write, bool execute, bool foreign)
+{
+ /* pkeys never affect instruction fetches */
+ if (execute)
+ return true;
+ /* allow access if the VMA is not one from this process */
+ if (foreign || vma_is_foreign(vma))
+ return true;
+ return __pkru_allows_pkey(vma_pkey(vma), write);
+}
+
+static inline bool arch_pte_access_permitted(pte_t pte, bool write)
+{
+ return __pkru_allows_pkey(pte_flags_pkey(pte_flags(pte)), write);
+}
+
#endif /* _ASM_X86_MMU_CONTEXT_H */
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 93fb7c1cffda..7a79ee2778b3 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -42,14 +42,6 @@ struct saved_msrs {
struct saved_msr *array;
};
-static inline unsigned long long native_read_tscp(unsigned int *aux)
-{
- unsigned long low, high;
- asm volatile(".byte 0x0f,0x01,0xf9"
- : "=a" (low), "=d" (high), "=c" (*aux));
- return low | ((u64)high << 32);
-}
-
/*
* both i386 and x86_64 returns 64-bit value in edx:eax, but gcc's "A"
* constraint has different meanings. For i386, "A" means exactly
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index f6192502149e..601f1b8f9961 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -13,6 +13,7 @@
#include <linux/bug.h>
#include <linux/types.h>
#include <linux/cpumask.h>
+#include <asm/frame.h>
static inline int paravirt_enabled(void)
{
@@ -756,15 +757,19 @@ static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock,
* call. The return value in rax/eax will not be saved, even for void
* functions.
*/
+#define PV_THUNK_NAME(func) "__raw_callee_save_" #func
#define PV_CALLEE_SAVE_REGS_THUNK(func) \
extern typeof(func) __raw_callee_save_##func; \
\
asm(".pushsection .text;" \
- ".globl __raw_callee_save_" #func " ; " \
- "__raw_callee_save_" #func ": " \
+ ".globl " PV_THUNK_NAME(func) ";" \
+ ".type " PV_THUNK_NAME(func) ", @function;" \
+ PV_THUNK_NAME(func) ":" \
+ FRAME_BEGIN \
PV_SAVE_ALL_CALLER_REGS \
"call " #func ";" \
PV_RESTORE_ALL_CALLER_REGS \
+ FRAME_END \
"ret;" \
".popsection")
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 77db5616a473..e8c2326478c8 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -466,8 +466,9 @@ int paravirt_disable_iospace(void);
* makes sure the incoming and outgoing types are always correct.
*/
#ifdef CONFIG_X86_32
-#define PVOP_VCALL_ARGS \
- unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx
+#define PVOP_VCALL_ARGS \
+ unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx; \
+ register void *__sp asm("esp")
#define PVOP_CALL_ARGS PVOP_VCALL_ARGS
#define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x))
@@ -485,9 +486,10 @@ int paravirt_disable_iospace(void);
#define VEXTRA_CLOBBERS
#else /* CONFIG_X86_64 */
/* [re]ax isn't an arg, but the return val */
-#define PVOP_VCALL_ARGS \
- unsigned long __edi = __edi, __esi = __esi, \
- __edx = __edx, __ecx = __ecx, __eax = __eax
+#define PVOP_VCALL_ARGS \
+ unsigned long __edi = __edi, __esi = __esi, \
+ __edx = __edx, __ecx = __ecx, __eax = __eax; \
+ register void *__sp asm("rsp")
#define PVOP_CALL_ARGS PVOP_VCALL_ARGS
#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x))
@@ -526,7 +528,7 @@ int paravirt_disable_iospace(void);
asm volatile(pre \
paravirt_alt(PARAVIRT_CALL) \
post \
- : call_clbr \
+ : call_clbr, "+r" (__sp) \
: paravirt_type(op), \
paravirt_clobber(clbr), \
##__VA_ARGS__ \
@@ -536,7 +538,7 @@ int paravirt_disable_iospace(void);
asm volatile(pre \
paravirt_alt(PARAVIRT_CALL) \
post \
- : call_clbr \
+ : call_clbr, "+r" (__sp) \
: paravirt_type(op), \
paravirt_clobber(clbr), \
##__VA_ARGS__ \
@@ -563,7 +565,7 @@ int paravirt_disable_iospace(void);
asm volatile(pre \
paravirt_alt(PARAVIRT_CALL) \
post \
- : call_clbr \
+ : call_clbr, "+r" (__sp) \
: paravirt_type(op), \
paravirt_clobber(clbr), \
##__VA_ARGS__ \
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 462594320d39..9ab7507ca1c2 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -20,6 +20,9 @@ struct pci_sysdata {
#ifdef CONFIG_X86_64
void *iommu; /* IOMMU private data */
#endif
+#ifdef CONFIG_PCI_MSI_IRQ_DOMAIN
+ void *fwnode; /* IRQ domain for MSI assignment */
+#endif
};
extern int pci_routeirq;
@@ -32,6 +35,7 @@ extern int noioapicreroute;
static inline int pci_domain_nr(struct pci_bus *bus)
{
struct pci_sysdata *sd = bus->sysdata;
+
return sd->domain;
}
@@ -41,6 +45,17 @@ static inline int pci_proc_domain(struct pci_bus *bus)
}
#endif
+#ifdef CONFIG_PCI_MSI_IRQ_DOMAIN
+static inline void *_pci_root_bus_fwnode(struct pci_bus *bus)
+{
+ struct pci_sysdata *sd = bus->sysdata;
+
+ return sd->fwnode;
+}
+
+#define pci_root_bus_fwnode _pci_root_bus_fwnode
+#endif
+
/* Can be used to override the logic in pci_scan_bus for skipping
already-configured bus numbers - to be used for buggy BIOSes
or architectures with incomplete PCI setup by the loader */
@@ -105,9 +120,6 @@ void native_restore_msi_irqs(struct pci_dev *dev);
#include <asm/pci_64.h>
#endif
-/* implement the pci_ DMA API in terms of the generic device dma_ one */
-#include <asm-generic/pci-dma-compat.h>
-
/* generic pci stuff */
#include <asm-generic/pci.h>
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 0687c4748b8f..97f3242e133c 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -99,6 +99,20 @@ static inline int pte_dirty(pte_t pte)
return pte_flags(pte) & _PAGE_DIRTY;
}
+
+static inline u32 read_pkru(void)
+{
+ if (boot_cpu_has(X86_FEATURE_OSPKE))
+ return __read_pkru();
+ return 0;
+}
+
+static inline void write_pkru(u32 pkru)
+{
+ if (boot_cpu_has(X86_FEATURE_OSPKE))
+ __write_pkru(pkru);
+}
+
static inline int pte_young(pte_t pte)
{
return pte_flags(pte) & _PAGE_ACCESSED;
@@ -911,6 +925,36 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
}
#endif
+#define PKRU_AD_BIT 0x1
+#define PKRU_WD_BIT 0x2
+#define PKRU_BITS_PER_PKEY 2
+
+static inline bool __pkru_allows_read(u32 pkru, u16 pkey)
+{
+ int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY;
+ return !(pkru & (PKRU_AD_BIT << pkru_pkey_bits));
+}
+
+static inline bool __pkru_allows_write(u32 pkru, u16 pkey)
+{
+ int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY;
+ /*
+ * Access-disable disables writes too so we need to check
+ * both bits here.
+ */
+ return !(pkru & ((PKRU_AD_BIT|PKRU_WD_BIT) << pkru_pkey_bits));
+}
+
+static inline u16 pte_flags_pkey(unsigned long pte_flags)
+{
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+ /* ifdef to avoid doing 59-bit shift on 32-bit values */
+ return (pte_flags & _PAGE_PKEY_MASK) >> _PAGE_BIT_PKEY_BIT0;
+#else
+ return 0;
+#endif
+}
+
#include <asm-generic/pgtable.h>
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 4432ab7f407c..7b5efe264eff 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -20,13 +20,18 @@
#define _PAGE_BIT_SOFTW2 10 /* " */
#define _PAGE_BIT_SOFTW3 11 /* " */
#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
+#define _PAGE_BIT_SOFTW4 58 /* available for programmer */
+#define _PAGE_BIT_PKEY_BIT0 59 /* Protection Keys, bit 1/4 */
+#define _PAGE_BIT_PKEY_BIT1 60 /* Protection Keys, bit 2/4 */
+#define _PAGE_BIT_PKEY_BIT2 61 /* Protection Keys, bit 3/4 */
+#define _PAGE_BIT_PKEY_BIT3 62 /* Protection Keys, bit 4/4 */
+#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
+
#define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1
#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
#define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */
#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
-#define _PAGE_BIT_SOFTW4 58 /* available for programmer */
-#define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4
-#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
+#define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4
/* If _PAGE_BIT_PRESENT is clear, we use these: */
/* - if the user mapped it with PROT_NONE; pte_present gives true */
@@ -47,8 +52,24 @@
#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+#define _PAGE_PKEY_BIT0 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT0)
+#define _PAGE_PKEY_BIT1 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT1)
+#define _PAGE_PKEY_BIT2 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT2)
+#define _PAGE_PKEY_BIT3 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT3)
+#else
+#define _PAGE_PKEY_BIT0 (_AT(pteval_t, 0))
+#define _PAGE_PKEY_BIT1 (_AT(pteval_t, 0))
+#define _PAGE_PKEY_BIT2 (_AT(pteval_t, 0))
+#define _PAGE_PKEY_BIT3 (_AT(pteval_t, 0))
+#endif
#define __HAVE_ARCH_PTE_SPECIAL
+#define _PAGE_PKEY_MASK (_PAGE_PKEY_BIT0 | \
+ _PAGE_PKEY_BIT1 | \
+ _PAGE_PKEY_BIT2 | \
+ _PAGE_PKEY_BIT3)
+
#ifdef CONFIG_KMEMCHECK
#define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN)
#else
@@ -99,7 +120,12 @@
#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
_PAGE_DIRTY)
-/* Set of bits not changed in pte_modify */
+/*
+ * Set of bits not changed in pte_modify. The pte's
+ * protection key is treated like _PAGE_RW, for
+ * instance, and is *not* included in this mask since
+ * pte_modify() does modify it.
+ */
#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
_PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \
_PAGE_SOFT_DIRTY)
@@ -215,7 +241,10 @@ enum page_cache_mode {
/* Extracts the PFN from a (pte|pmd|pud|pgd)val_t of a 4KB page */
#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
-/* Extracts the flags from a (pte|pmd|pud|pgd)val_t of a 4KB page */
+/*
+ * Extracts the flags from a (pte|pmd|pud|pgd)val_t
+ * This includes the protection key value.
+ */
#define PTE_FLAGS_MASK (~PTE_PFN_MASK)
typedef struct pgprot { pgprotval_t pgprot; } pgprot_t;
diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h
new file mode 100644
index 000000000000..7b84565c916c
--- /dev/null
+++ b/arch/x86/include/asm/pkeys.h
@@ -0,0 +1,34 @@
+#ifndef _ASM_X86_PKEYS_H
+#define _ASM_X86_PKEYS_H
+
+#define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? 16 : 1)
+
+extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+ unsigned long init_val);
+
+/*
+ * Try to dedicate one of the protection keys to be used as an
+ * execute-only protection key.
+ */
+#define PKEY_DEDICATED_EXECUTE_ONLY 15
+extern int __execute_only_pkey(struct mm_struct *mm);
+static inline int execute_only_pkey(struct mm_struct *mm)
+{
+ if (!boot_cpu_has(X86_FEATURE_OSPKE))
+ return 0;
+
+ return __execute_only_pkey(mm);
+}
+
+extern int __arch_override_mprotect_pkey(struct vm_area_struct *vma,
+ int prot, int pkey);
+static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma,
+ int prot, int pkey)
+{
+ if (!boot_cpu_has(X86_FEATURE_OSPKE))
+ return 0;
+
+ return __arch_override_mprotect_pkey(vma, prot, pkey);
+}
+
+#endif /*_ASM_X86_PKEYS_H */
diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h
index c57fd1ea9689..bf8b35d2035a 100644
--- a/arch/x86/include/asm/pmem.h
+++ b/arch/x86/include/asm/pmem.h
@@ -137,6 +137,11 @@ static inline void arch_clear_pmem(void __pmem *addr, size_t size)
arch_wb_cache_pmem(addr, size);
}
+static inline void arch_invalidate_pmem(void __pmem *addr, size_t size)
+{
+ clflush_cache_range((void __force *) addr, size);
+}
+
static inline bool __arch_has_wmb_pmem(void)
{
/*
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 01bcde84d3e4..d397deb58146 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -94,10 +94,19 @@ static __always_inline bool should_resched(int preempt_offset)
#ifdef CONFIG_PREEMPT
extern asmlinkage void ___preempt_schedule(void);
-# define __preempt_schedule() asm ("call ___preempt_schedule")
+# define __preempt_schedule() \
+({ \
+ register void *__sp asm(_ASM_SP); \
+ asm volatile ("call ___preempt_schedule" : "+r"(__sp)); \
+})
+
extern asmlinkage void preempt_schedule(void);
extern asmlinkage void ___preempt_schedule_notrace(void);
-# define __preempt_schedule_notrace() asm ("call ___preempt_schedule_notrace")
+# define __preempt_schedule_notrace() \
+({ \
+ register void *__sp asm(_ASM_SP); \
+ asm volatile ("call ___preempt_schedule_notrace" : "+r"(__sp)); \
+})
extern asmlinkage void preempt_schedule_notrace(void);
#endif
diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h
index 9f92c180ed2f..9d55f9b6e167 100644
--- a/arch/x86/include/asm/qspinlock_paravirt.h
+++ b/arch/x86/include/asm/qspinlock_paravirt.h
@@ -36,8 +36,10 @@ PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath);
*/
asm (".pushsection .text;"
".globl " PV_UNLOCK ";"
+ ".type " PV_UNLOCK ", @function;"
".align 4,0x90;"
PV_UNLOCK ": "
+ FRAME_BEGIN
"push %rdx;"
"mov $0x1,%eax;"
"xor %edx,%edx;"
@@ -45,6 +47,7 @@ asm (".pushsection .text;"
"cmp $0x1,%al;"
"jne .slowpath;"
"pop %rdx;"
+ FRAME_END
"ret;"
".slowpath: "
"push %rsi;"
@@ -52,6 +55,7 @@ asm (".pushsection .text;"
"call " PV_UNLOCK_SLOWPATH ";"
"pop %rsi;"
"pop %rdx;"
+ FRAME_END
"ret;"
".size " PV_UNLOCK ", .-" PV_UNLOCK ";"
".popsection");
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index 5c6e4fb370f5..4916144e3c42 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -92,5 +92,12 @@
#define REQUIRED_MASK7 0
#define REQUIRED_MASK8 0
#define REQUIRED_MASK9 0
+#define REQUIRED_MASK10 0
+#define REQUIRED_MASK11 0
+#define REQUIRED_MASK12 0
+#define REQUIRED_MASK13 0
+#define REQUIRED_MASK14 0
+#define REQUIRED_MASK15 0
+#define REQUIRED_MASK16 0
#endif /* _ASM_X86_REQUIRED_FEATURES_H */
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index cad82c9c2fde..ceec86eb68e9 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -25,7 +25,7 @@
* This should be totally fair - if anything is waiting, a process that wants a
* lock will go to the back of the queue. When the currently active lock is
* released, if there's a writer at the front of the queue, then that and only
- * that will be woken up; if there's a bunch of consequtive readers at the
+ * that will be woken up; if there's a bunch of consecutive readers at the
* front, then they'll all be woken up, but no other readers will be.
*/
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 2270e41b32fd..d96d04377765 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -98,6 +98,44 @@ static inline void native_write_cr8(unsigned long val)
}
#endif
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+static inline u32 __read_pkru(void)
+{
+ u32 ecx = 0;
+ u32 edx, pkru;
+
+ /*
+ * "rdpkru" instruction. Places PKRU contents in to EAX,
+ * clears EDX and requires that ecx=0.
+ */
+ asm volatile(".byte 0x0f,0x01,0xee\n\t"
+ : "=a" (pkru), "=d" (edx)
+ : "c" (ecx));
+ return pkru;
+}
+
+static inline void __write_pkru(u32 pkru)
+{
+ u32 ecx = 0, edx = 0;
+
+ /*
+ * "wrpkru" instruction. Loads contents in EAX to PKRU,
+ * requires that ecx = edx = 0.
+ */
+ asm volatile(".byte 0x0f,0x01,0xef\n\t"
+ : : "a" (pkru), "c"(ecx), "d"(edx));
+}
+#else
+static inline u32 __read_pkru(void)
+{
+ return 0;
+}
+
+static inline void __write_pkru(u32 pkru)
+{
+}
+#endif
+
static inline void native_wbinvd(void)
{
asm volatile("wbinvd": : :"memory");
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 70bbe39043a9..7c247e7404be 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -37,7 +37,7 @@ print_context_stack_bp(struct thread_info *tinfo,
/* Generic stack tracer with callbacks */
struct stacktrace_ops {
- void (*address)(void *data, unsigned long address, int reliable);
+ int (*address)(void *data, unsigned long address, int reliable);
/* On negative return stop dumping */
int (*stack)(void *data, char *name);
walk_stack_t walk_stack;
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index ca6ba3607705..90dbbd9666d4 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -87,9 +87,9 @@ int strcmp(const char *cs, const char *ct);
*
* Low level memory copy function that catches machine checks
*
- * Return true for success, false for fail
+ * Return 0 for success, -EFAULT for fail
*/
-bool memcpy_mcsafe(void *dst, const void *src, size_t cnt);
+int memcpy_mcsafe(void *dst, const void *src, size_t cnt);
#endif /* __KERNEL__ */
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index c0f27d7ea7ff..a969ae607be8 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -105,9 +105,8 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
struct exception_table_entry {
int insn, fixup, handler;
};
-/* This is not the generic standard exception_table_entry format */
-#define ARCH_HAS_SORT_EXTABLE
-#define ARCH_HAS_SEARCH_EXTABLE
+
+#define ARCH_HAS_RELATIVE_EXTABLE
extern int fixup_exception(struct pt_regs *regs, int trapnr);
extern bool ex_has_fault_handler(unsigned long ip);
@@ -179,10 +178,11 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
({ \
int __ret_gu; \
register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \
+ register void *__sp asm(_ASM_SP); \
__chk_user_ptr(ptr); \
might_fault(); \
- asm volatile("call __get_user_%P3" \
- : "=a" (__ret_gu), "=r" (__val_gu) \
+ asm volatile("call __get_user_%P4" \
+ : "=a" (__ret_gu), "=r" (__val_gu), "+r" (__sp) \
: "0" (ptr), "i" (sizeof(*(ptr)))); \
(x) = (__force __typeof__(*(ptr))) __val_gu; \
__builtin_expect(__ret_gu, 0); \
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 3bcdcc84259d..a12a047184ee 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -110,9 +110,10 @@ extern struct { char _entry[32]; } hypercall_page[];
register unsigned long __arg2 asm(__HYPERCALL_ARG2REG) = __arg2; \
register unsigned long __arg3 asm(__HYPERCALL_ARG3REG) = __arg3; \
register unsigned long __arg4 asm(__HYPERCALL_ARG4REG) = __arg4; \
- register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5;
+ register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5; \
+ register void *__sp asm(_ASM_SP);
-#define __HYPERCALL_0PARAM "=r" (__res)
+#define __HYPERCALL_0PARAM "=r" (__res), "+r" (__sp)
#define __HYPERCALL_1PARAM __HYPERCALL_0PARAM, "+r" (__arg1)
#define __HYPERCALL_2PARAM __HYPERCALL_1PARAM, "+r" (__arg2)
#define __HYPERCALL_3PARAM __HYPERCALL_2PARAM, "+r" (__arg3)
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index 8b2d4bea9962..39171b3646bb 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -62,4 +62,6 @@ void xen_arch_register_cpu(int num);
void xen_arch_unregister_cpu(int num);
#endif
+extern void xen_set_iopl_mask(unsigned mask);
+
#endif /* _ASM_X86_XEN_HYPERVISOR_H */
diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h
index 513b05f15bb4..39bca7fac087 100644
--- a/arch/x86/include/uapi/asm/mman.h
+++ b/arch/x86/include/uapi/asm/mman.h
@@ -6,6 +6,28 @@
#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT)
#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT)
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+/*
+ * Take the 4 protection key bits out of the vma->vm_flags
+ * value and turn them in to the bits that we can put in
+ * to a pte.
+ *
+ * Only override these if Protection Keys are available
+ * (which is only on 64-bit).
+ */
+#define arch_vm_get_page_prot(vm_flags) __pgprot( \
+ ((vm_flags) & VM_PKEY_BIT0 ? _PAGE_PKEY_BIT0 : 0) | \
+ ((vm_flags) & VM_PKEY_BIT1 ? _PAGE_PKEY_BIT1 : 0) | \
+ ((vm_flags) & VM_PKEY_BIT2 ? _PAGE_PKEY_BIT2 : 0) | \
+ ((vm_flags) & VM_PKEY_BIT3 ? _PAGE_PKEY_BIT3 : 0))
+
+#define arch_calc_vm_prot_bits(prot, key) ( \
+ ((key) & 0x1 ? VM_PKEY_BIT0 : 0) | \
+ ((key) & 0x2 ? VM_PKEY_BIT1 : 0) | \
+ ((key) & 0x4 ? VM_PKEY_BIT2 : 0) | \
+ ((key) & 0x8 ? VM_PKEY_BIT3 : 0))
+#endif
+
#include <asm-generic/mman.h>
#endif /* _ASM_X86_MMAN_H */
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index 79887abcb5e1..567de50a4c2a 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -118,6 +118,8 @@
#define X86_CR4_SMEP _BITUL(X86_CR4_SMEP_BIT)
#define X86_CR4_SMAP_BIT 21 /* enable SMAP support */
#define X86_CR4_SMAP _BITUL(X86_CR4_SMAP_BIT)
+#define X86_CR4_PKE_BIT 22 /* enable Protection Keys support */
+#define X86_CR4_PKE _BITUL(X86_CR4_PKE_BIT)
/*
* x86-64 Task Priority Register, CR8
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index b1b78ffe01d0..adaae2c781c1 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -16,9 +16,20 @@ CFLAGS_REMOVE_ftrace.o = -pg
CFLAGS_REMOVE_early_printk.o = -pg
endif
-KASAN_SANITIZE_head$(BITS).o := n
-KASAN_SANITIZE_dumpstack.o := n
-KASAN_SANITIZE_dumpstack_$(BITS).o := n
+KASAN_SANITIZE_head$(BITS).o := n
+KASAN_SANITIZE_dumpstack.o := n
+KASAN_SANITIZE_dumpstack_$(BITS).o := n
+
+OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y
+OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y
+OBJECT_FILES_NON_STANDARD_mcount_$(BITS).o := y
+OBJECT_FILES_NON_STANDARD_test_nx.o := y
+
+# If instrumentation of this dir is enabled, boot hangs during first second.
+# Probably could be more selective here, but note that files related to irqs,
+# boot, dumpstack/stacktrace, etc are either non-interesting or can lead to
+# non-deterministic coverage.
+KCOV_INSTRUMENT := n
CFLAGS_irq.o := -I$(src)/../include/asm/trace
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index e75907601a41..8c2f1ef6ca23 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -956,7 +956,7 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
/*
* Note that the LAPIC address is obtained from the MADT (32-bit value)
- * and (optionally) overriden by a LAPIC_ADDR_OVR entry (64-bit value).
+ * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value).
*/
count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
@@ -984,7 +984,7 @@ static int __init acpi_parse_madt_lapic_entries(void)
/*
* Note that the LAPIC address is obtained from the MADT (32-bit value)
- * and (optionally) overriden by a LAPIC_ADDR_OVR entry (64-bit value).
+ * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value).
*/
count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC_OVERRIDE,
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index 8c35df468104..169963f471bb 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -5,6 +5,7 @@
#include <asm/page_types.h>
#include <asm/msr.h>
#include <asm/asm-offsets.h>
+#include <asm/frame.h>
# Copyright 2003 Pavel Machek <pavel@suse.cz>, distribute under GPLv2
@@ -39,6 +40,7 @@ bogus_64_magic:
jmp bogus_64_magic
ENTRY(do_suspend_lowlevel)
+ FRAME_BEGIN
subq $8, %rsp
xorl %eax, %eax
call save_processor_state
@@ -109,6 +111,7 @@ ENTRY(do_suspend_lowlevel)
xorl %eax, %eax
addq $8, %rsp
+ FRAME_END
jmp restore_processor_state
ENDPROC(do_suspend_lowlevel)
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 222a57076039..cefacbad1531 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -221,7 +221,7 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
unsigned long cpu = (unsigned long)hcpu;
struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu);
- switch (action & 0xf) {
+ switch (action & ~CPU_TASKS_FROZEN) {
case CPU_DEAD:
dw_apb_clockevent_pause(adev->timer);
if (system_state == SYSTEM_RUNNING) {
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 6e85f713641d..0a2bb1f62e72 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -227,19 +227,11 @@ static u32 __init search_agp_bridge(u32 *order, int *valid_agp)
return 0;
}
-static int gart_fix_e820 __initdata = 1;
+static bool gart_fix_e820 __initdata = true;
static int __init parse_gart_mem(char *p)
{
- if (!p)
- return -EINVAL;
-
- if (!strncmp(p, "off", 3))
- gart_fix_e820 = 0;
- else if (!strncmp(p, "on", 2))
- gart_fix_e820 = 1;
-
- return 0;
+ return kstrtobool(p, &gart_fix_e820);
}
early_param("gart_fix_e820", parse_gart_mem);
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 8bb12ddc5db8..8e63ebdcbd0b 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,6 +2,10 @@
# Makefile for local APIC drivers and for the IO-APIC code
#
+# Leads to non-deterministic coverage that is not a function of syscall inputs.
+# In particualr, smp_apic_timer_interrupt() is called in random places.
+KCOV_INSTRUMENT := n
+
obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o ipi.o vector.o
obj-y += hw_nmi.o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 531b9611c51d..d356987a04e9 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1611,7 +1611,7 @@ void __init enable_IR_x2apic(void)
legacy_pic->mask_all();
mask_ioapic_entries();
- /* If irq_remapping_prepare() succeded, try to enable it */
+ /* If irq_remapping_prepare() succeeded, try to enable it */
if (ir_stat >= 0)
ir_stat = try_to_enable_IR();
/* ir_stat contains the remap mode or an error code */
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 3b670df4ba7b..ad59d70bcb1a 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -213,6 +213,7 @@ update:
*/
cpumask_and(d->old_domain, d->old_domain, cpu_online_mask);
d->move_in_progress = !cpumask_empty(d->old_domain);
+ d->cfg.old_vector = d->move_in_progress ? d->cfg.vector : 0;
d->cfg.vector = vector;
cpumask_copy(d->domain, vector_cpumask);
success:
@@ -655,46 +656,97 @@ void irq_complete_move(struct irq_cfg *cfg)
}
/*
- * Called with @desc->lock held and interrupts disabled.
+ * Called from fixup_irqs() with @desc->lock held and interrupts disabled.
*/
void irq_force_complete_move(struct irq_desc *desc)
{
struct irq_data *irqdata = irq_desc_get_irq_data(desc);
struct apic_chip_data *data = apic_chip_data(irqdata);
struct irq_cfg *cfg = data ? &data->cfg : NULL;
+ unsigned int cpu;
if (!cfg)
return;
- __irq_complete_move(cfg, cfg->vector);
-
/*
* This is tricky. If the cleanup of @data->old_domain has not been
* done yet, then the following setaffinity call will fail with
* -EBUSY. This can leave the interrupt in a stale state.
*
- * The cleanup cannot make progress because we hold @desc->lock. So in
- * case @data->old_domain is not yet cleaned up, we need to drop the
- * lock and acquire it again. @desc cannot go away, because the
- * hotplug code holds the sparse irq lock.
+ * All CPUs are stuck in stop machine with interrupts disabled so
+ * calling __irq_complete_move() would be completely pointless.
*/
raw_spin_lock(&vector_lock);
- /* Clean out all offline cpus (including ourself) first. */
+ /*
+ * Clean out all offline cpus (including the outgoing one) from the
+ * old_domain mask.
+ */
cpumask_and(data->old_domain, data->old_domain, cpu_online_mask);
- while (!cpumask_empty(data->old_domain)) {
+
+ /*
+ * If move_in_progress is cleared and the old_domain mask is empty,
+ * then there is nothing to cleanup. fixup_irqs() will take care of
+ * the stale vectors on the outgoing cpu.
+ */
+ if (!data->move_in_progress && cpumask_empty(data->old_domain)) {
raw_spin_unlock(&vector_lock);
- raw_spin_unlock(&desc->lock);
- cpu_relax();
- raw_spin_lock(&desc->lock);
+ return;
+ }
+
+ /*
+ * 1) The interrupt is in move_in_progress state. That means that we
+ * have not seen an interrupt since the io_apic was reprogrammed to
+ * the new vector.
+ *
+ * 2) The interrupt has fired on the new vector, but the cleanup IPIs
+ * have not been processed yet.
+ */
+ if (data->move_in_progress) {
/*
- * Reevaluate apic_chip_data. It might have been cleared after
- * we dropped @desc->lock.
+ * In theory there is a race:
+ *
+ * set_ioapic(new_vector) <-- Interrupt is raised before update
+ * is effective, i.e. it's raised on
+ * the old vector.
+ *
+ * So if the target cpu cannot handle that interrupt before
+ * the old vector is cleaned up, we get a spurious interrupt
+ * and in the worst case the ioapic irq line becomes stale.
+ *
+ * But in case of cpu hotplug this should be a non issue
+ * because if the affinity update happens right before all
+ * cpus rendevouz in stop machine, there is no way that the
+ * interrupt can be blocked on the target cpu because all cpus
+ * loops first with interrupts enabled in stop machine, so the
+ * old vector is not yet cleaned up when the interrupt fires.
+ *
+ * So the only way to run into this issue is if the delivery
+ * of the interrupt on the apic/system bus would be delayed
+ * beyond the point where the target cpu disables interrupts
+ * in stop machine. I doubt that it can happen, but at least
+ * there is a theroretical chance. Virtualization might be
+ * able to expose this, but AFAICT the IOAPIC emulation is not
+ * as stupid as the real hardware.
+ *
+ * Anyway, there is nothing we can do about that at this point
+ * w/o refactoring the whole fixup_irq() business completely.
+ * We print at least the irq number and the old vector number,
+ * so we have the necessary information when a problem in that
+ * area arises.
*/
- data = apic_chip_data(irqdata);
- if (!data)
- return;
- raw_spin_lock(&vector_lock);
+ pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n",
+ irqdata->irq, cfg->old_vector);
}
+ /*
+ * If old_domain is not empty, then other cpus still have the irq
+ * descriptor set in their vector array. Clean it up.
+ */
+ for_each_cpu(cpu, data->old_domain)
+ per_cpu(vector_irq, cpu)[cfg->old_vector] = VECTOR_UNUSED;
+
+ /* Cleanup the left overs of the (half finished) move */
+ cpumask_clear(data->old_domain);
+ data->move_in_progress = 0;
raw_spin_unlock(&vector_lock);
}
#endif
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 624db00583f4..8f4942e2bcbb 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -792,7 +792,8 @@ static int uv_scir_cpu_notify(struct notifier_block *self, unsigned long action,
{
long cpu = (long)hcpu;
- switch (action) {
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_DOWN_FAILED:
case CPU_ONLINE:
uv_heartbeat_enable(cpu);
break;
@@ -860,7 +861,7 @@ int uv_set_vga_state(struct pci_dev *pdev, bool decode,
*/
void uv_cpu_init(void)
{
- /* CPU 0 initilization will be done via uv_system_init. */
+ /* CPU 0 initialization will be done via uv_system_init. */
if (!uv_blade_info)
return;
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 052c9c3026cc..9307f182fe30 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1088,7 +1088,7 @@ static int apm_get_battery_status(u_short which, u_short *status,
* @device: identity of device
* @enable: on/off
*
- * Activate or deactive power management on either a specific device
+ * Activate or deactivate power management on either a specific device
* or the entire system (%APM_DEVICE_ALL).
*/
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 0d373d7affc8..4a8697f7d4ef 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -8,6 +8,10 @@ CFLAGS_REMOVE_common.o = -pg
CFLAGS_REMOVE_perf_event.o = -pg
endif
+# If these files are instrumented, boot hangs during the first second.
+KCOV_INSTRUMENT_common.o := n
+KCOV_INSTRUMENT_perf_event.o := n
+
# Make sure load_percpu_segment has no stackprotector
nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_common.o := $(nostackp)
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 97c59fd60702..6e47e3a916f1 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -75,14 +75,17 @@ static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val)
*/
extern __visible void vide(void);
-__asm__(".globl vide\n\t.align 4\nvide: ret");
+__asm__(".globl vide\n"
+ ".type vide, @function\n"
+ ".align 4\n"
+ "vide: ret\n");
static void init_amd_k5(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_32
/*
* General Systems BIOSen alias the cpu frequency registers
- * of the Elan at 0x000df000. Unfortuantly, one of the Linux
+ * of the Elan at 0x000df000. Unfortunately, one of the Linux
* drivers subsequently pokes it, and changes the CPU speed.
* Workaround : Remove the unneeded alias.
*/
@@ -306,7 +309,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
u32 eax, ebx, ecx, edx;
cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
- nodes_per_socket = ((ecx >> 8) & 7) + 1;
node_id = ecx & 7;
/* get compute unit information */
@@ -317,7 +319,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
u64 value;
rdmsrl(MSR_FAM10H_NODE_ID, value);
- nodes_per_socket = ((value >> 3) & 7) + 1;
node_id = value & 7;
} else
return;
@@ -519,6 +520,18 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
if (cpu_has(c, X86_FEATURE_MWAITX))
use_mwaitx_delay();
+
+ if (boot_cpu_has(X86_FEATURE_TOPOEXT)) {
+ u32 ecx;
+
+ ecx = cpuid_ecx(0x8000001e);
+ nodes_per_socket = ((ecx >> 8) & 7) + 1;
+ } else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) {
+ u64 value;
+
+ rdmsrl(MSR_FAM10H_NODE_ID, value);
+ nodes_per_socket = ((value >> 3) & 7) + 1;
+ }
}
static void early_init_amd(struct cpuinfo_x86 *c)
@@ -536,6 +549,10 @@ static void early_init_amd(struct cpuinfo_x86 *c)
set_sched_clock_stable();
}
+ /* Bit 12 of 8000_0007 edx is accumulated power mechanism. */
+ if (c->x86_power & BIT(12))
+ set_cpu_cap(c, X86_FEATURE_ACC_POWER);
+
#ifdef CONFIG_X86_64
set_cpu_cap(c, X86_FEATURE_SYSCALL32);
#else
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 249461f95851..8394b3d1f94f 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -304,6 +304,48 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
}
/*
+ * Protection Keys are not available in 32-bit mode.
+ */
+static bool pku_disabled;
+
+static __always_inline void setup_pku(struct cpuinfo_x86 *c)
+{
+ if (!cpu_has(c, X86_FEATURE_PKU))
+ return;
+ if (pku_disabled)
+ return;
+
+ cr4_set_bits(X86_CR4_PKE);
+ /*
+ * Seting X86_CR4_PKE will cause the X86_FEATURE_OSPKE
+ * cpuid bit to be set. We need to ensure that we
+ * update that bit in this CPU's "cpu_info".
+ */
+ get_cpu_cap(c);
+}
+
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+static __init int setup_disable_pku(char *arg)
+{
+ /*
+ * Do not clear the X86_FEATURE_PKU bit. All of the
+ * runtime checks are against OSPKE so clearing the
+ * bit does nothing.
+ *
+ * This way, we will see "pku" in cpuinfo, but not
+ * "ospke", which is exactly what we want. It shows
+ * that the CPU has PKU, but the OS has not enabled it.
+ * This happens to be exactly how a system would look
+ * if we disabled the config option.
+ */
+ pr_info("x86: 'nopku' specified, disabling Memory Protection Keys\n");
+ pku_disabled = true;
+ return 1;
+}
+__setup("nopku", setup_disable_pku);
+#endif /* CONFIG_X86_64 */
+
+/*
* Some CPU features depend on higher CPUID levels, which may not always
* be available due to CPUID level capping or broken virtualization
* software. Add those features to this table to auto-disable them.
@@ -625,6 +667,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
c->x86_capability[CPUID_7_0_EBX] = ebx;
c->x86_capability[CPUID_6_EAX] = cpuid_eax(0x00000006);
+ c->x86_capability[CPUID_7_ECX] = ecx;
}
/* Extended state features: level 0x0000000d */
@@ -649,7 +692,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx);
c->x86_capability[CPUID_F_1_EDX] = edx;
- if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) {
+ if ((cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) ||
+ ((cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL)) ||
+ (cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)))) {
c->x86_cache_max_rmid = ecx;
c->x86_cache_occ_scale = ebx;
}
@@ -925,7 +970,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
if (this_cpu->c_identify)
this_cpu->c_identify(c);
- /* Clear/Set all flags overriden by options, after probe */
+ /* Clear/Set all flags overridden by options, after probe */
for (i = 0; i < NCAPINTS; i++) {
c->x86_capability[i] &= ~cpu_caps_cleared[i];
c->x86_capability[i] |= cpu_caps_set[i];
@@ -982,9 +1027,10 @@ static void identify_cpu(struct cpuinfo_x86 *c)
init_hypervisor(c);
x86_init_rdrand(c);
x86_init_cache_qos(c);
+ setup_pku(c);
/*
- * Clear/Set all flags overriden by options, need do it
+ * Clear/Set all flags overridden by options, need do it
* before following smp all cpus cap AND.
*/
for (i = 0; i < NCAPINTS; i++) {
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index fcbcb2f678ca..19f57360dfd2 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -42,7 +42,7 @@ EXPORT_SYMBOL_GPL(mtrr_state);
* "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
* Opteron Processors" (26094 Rev. 3.30 February 2006), section
* "13.2.1.2 SYSCFG Register": "The MtrrFixDramModEn bit should be set
- * to 1 during BIOS initalization of the fixed MTRRs, then cleared to
+ * to 1 during BIOS initialization of the fixed MTRRs, then cleared to
* 0 for operation."
*/
static inline void k8_check_syscfg_dram_mod_en(void)
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 32e5699eadfe..8efa57a5f29e 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -135,7 +135,8 @@ print_context_stack_bp(struct thread_info *tinfo,
if (!__kernel_text_address(addr))
break;
- ops->address(data, addr, 1);
+ if (ops->address(data, addr, 1))
+ break;
frame = frame->next_frame;
ret_addr = &frame->return_address;
print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
@@ -154,10 +155,11 @@ static int print_trace_stack(void *data, char *name)
/*
* Print one address/symbol entries per line.
*/
-static void print_trace_address(void *data, unsigned long addr, int reliable)
+static int print_trace_address(void *data, unsigned long addr, int reliable)
{
touch_nmi_watchdog();
printk_stack_address(addr, reliable, data);
+ return 0;
}
static const struct stacktrace_ops print_trace_ops = {
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 21bf92490a7b..8a121991e5ba 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -287,7 +287,7 @@ static __init void early_pci_serial_init(char *s)
}
/*
- * Lastly, initalize the hardware
+ * Lastly, initialize the hardware
*/
if (*s) {
if (strcmp(s, "nocfg") == 0)
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 0b1b9abd4d5f..8e37cc8a539a 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -354,6 +354,69 @@ void fpu__activate_fpstate_write(struct fpu *fpu)
}
/*
+ * This function must be called before we write the current
+ * task's fpstate.
+ *
+ * This call gets the current FPU register state and moves
+ * it in to the 'fpstate'. Preemption is disabled so that
+ * no writes to the 'fpstate' can occur from context
+ * swiches.
+ *
+ * Must be followed by a fpu__current_fpstate_write_end().
+ */
+void fpu__current_fpstate_write_begin(void)
+{
+ struct fpu *fpu = &current->thread.fpu;
+
+ /*
+ * Ensure that the context-switching code does not write
+ * over the fpstate while we are doing our update.
+ */
+ preempt_disable();
+
+ /*
+ * Move the fpregs in to the fpu's 'fpstate'.
+ */
+ fpu__activate_fpstate_read(fpu);
+
+ /*
+ * The caller is about to write to 'fpu'. Ensure that no
+ * CPU thinks that its fpregs match the fpstate. This
+ * ensures we will not be lazy and skip a XRSTOR in the
+ * future.
+ */
+ fpu->last_cpu = -1;
+}
+
+/*
+ * This function must be paired with fpu__current_fpstate_write_begin()
+ *
+ * This will ensure that the modified fpstate gets placed back in
+ * the fpregs if necessary.
+ *
+ * Note: This function may be called whether or not an _actual_
+ * write to the fpstate occurred.
+ */
+void fpu__current_fpstate_write_end(void)
+{
+ struct fpu *fpu = &current->thread.fpu;
+
+ /*
+ * 'fpu' now has an updated copy of the state, but the
+ * registers may still be out of date. Update them with
+ * an XRSTOR if they are active.
+ */
+ if (fpregs_active())
+ copy_kernel_to_fpregs(&fpu->state);
+
+ /*
+ * Our update is done and the fpregs/fpstate are in sync
+ * if necessary. Context switches can happen again.
+ */
+ preempt_enable();
+}
+
+/*
* 'fpu__restore()' is called to copy FPU registers from
* the FPU fpstate to the live hw registers and to activate
* access to the hardware registers, so that FPU instructions
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index 0bc3490420c5..8bd1c003942a 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -8,7 +8,7 @@
/*
* The xstateregs_active() routine is the same as the regset_fpregs_active() routine,
* as the "regset->n" for the xstate regset will be updated based on the feature
- * capabilites supported by the xsave.
+ * capabilities supported by the xsave.
*/
int regset_fpregs_active(struct task_struct *target, const struct user_regset *regset)
{
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 6e8354f5a593..b48ef35b28d4 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -5,6 +5,7 @@
*/
#include <linux/compat.h>
#include <linux/cpu.h>
+#include <linux/pkeys.h>
#include <asm/fpu/api.h>
#include <asm/fpu/internal.h>
@@ -13,6 +14,11 @@
#include <asm/tlbflush.h>
+/*
+ * Although we spell it out in here, the Processor Trace
+ * xfeature is completely unused. We use other mechanisms
+ * to save/restore PT state in Linux.
+ */
static const char *xfeature_names[] =
{
"x87 floating point registers" ,
@@ -23,6 +29,8 @@ static const char *xfeature_names[] =
"AVX-512 opmask" ,
"AVX-512 Hi256" ,
"AVX-512 ZMM_Hi256" ,
+ "Processor Trace (unused)" ,
+ "Protection Keys User registers",
"unknown xstate feature" ,
};
@@ -56,6 +64,7 @@ void fpu__xstate_clear_all_cpu_caps(void)
setup_clear_cpu_cap(X86_FEATURE_AVX512VL);
setup_clear_cpu_cap(X86_FEATURE_MPX);
setup_clear_cpu_cap(X86_FEATURE_XGETBV1);
+ setup_clear_cpu_cap(X86_FEATURE_PKU);
}
/*
@@ -234,7 +243,7 @@ static void __init print_xstate_feature(u64 xstate_mask)
const char *feature_name;
if (cpu_has_xfeatures(xstate_mask, &feature_name))
- pr_info("x86/fpu: Supporting XSAVE feature 0x%02Lx: '%s'\n", xstate_mask, feature_name);
+ pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", xstate_mask, feature_name);
}
/*
@@ -250,6 +259,7 @@ static void __init print_xstate_features(void)
print_xstate_feature(XFEATURE_MASK_OPMASK);
print_xstate_feature(XFEATURE_MASK_ZMM_Hi256);
print_xstate_feature(XFEATURE_MASK_Hi16_ZMM);
+ print_xstate_feature(XFEATURE_MASK_PKRU);
}
/*
@@ -466,6 +476,7 @@ static void check_xstate_against_struct(int nr)
XCHECK_SZ(sz, nr, XFEATURE_OPMASK, struct avx_512_opmask_state);
XCHECK_SZ(sz, nr, XFEATURE_ZMM_Hi256, struct avx_512_zmm_uppers_state);
XCHECK_SZ(sz, nr, XFEATURE_Hi16_ZMM, struct avx_512_hi16_state);
+ XCHECK_SZ(sz, nr, XFEATURE_PKRU, struct pkru_state);
/*
* Make *SURE* to add any feature numbers in below if
@@ -473,7 +484,8 @@ static void check_xstate_against_struct(int nr)
* numbers.
*/
if ((nr < XFEATURE_YMM) ||
- (nr >= XFEATURE_MAX)) {
+ (nr >= XFEATURE_MAX) ||
+ (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR)) {
WARN_ONCE(1, "no structure for xstate: %d\n", nr);
XSTATE_WARN_ON(1);
}
@@ -671,6 +683,19 @@ void fpu__resume_cpu(void)
}
/*
+ * Given an xstate feature mask, calculate where in the xsave
+ * buffer the state is. Callers should ensure that the buffer
+ * is valid.
+ *
+ * Note: does not work for compacted buffers.
+ */
+void *__raw_xsave_addr(struct xregs_state *xsave, int xstate_feature_mask)
+{
+ int feature_nr = fls64(xstate_feature_mask) - 1;
+
+ return (void *)xsave + xstate_comp_offsets[feature_nr];
+}
+/*
* Given the xsave area and a state inside, this function returns the
* address of the state.
*
@@ -690,7 +715,6 @@ void fpu__resume_cpu(void)
*/
void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature)
{
- int feature_nr = fls64(xstate_feature) - 1;
/*
* Do we even *have* xsave state?
*/
@@ -718,7 +742,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature)
if (!(xsave->header.xfeatures & xstate_feature))
return NULL;
- return (void *)xsave + xstate_comp_offsets[feature_nr];
+ return __raw_xsave_addr(xsave, xstate_feature);
}
EXPORT_SYMBOL_GPL(get_xsave_addr);
@@ -753,3 +777,156 @@ const void *get_xsave_field_ptr(int xsave_state)
return get_xsave_addr(&fpu->state.xsave, xsave_state);
}
+
+
+/*
+ * Set xfeatures (aka XSTATE_BV) bit for a feature that we want
+ * to take out of its "init state". This will ensure that an
+ * XRSTOR actually restores the state.
+ */
+static void fpu__xfeature_set_non_init(struct xregs_state *xsave,
+ int xstate_feature_mask)
+{
+ xsave->header.xfeatures |= xstate_feature_mask;
+}
+
+/*
+ * This function is safe to call whether the FPU is in use or not.
+ *
+ * Note that this only works on the current task.
+ *
+ * Inputs:
+ * @xsave_state: state which is defined in xsave.h (e.g. XFEATURE_MASK_FP,
+ * XFEATURE_MASK_SSE, etc...)
+ * @xsave_state_ptr: a pointer to a copy of the state that you would
+ * like written in to the current task's FPU xsave state. This pointer
+ * must not be located in the current tasks's xsave area.
+ * Output:
+ * address of the state in the xsave area or NULL if the state
+ * is not present or is in its 'init state'.
+ */
+static void fpu__xfeature_set_state(int xstate_feature_mask,
+ void *xstate_feature_src, size_t len)
+{
+ struct xregs_state *xsave = &current->thread.fpu.state.xsave;
+ struct fpu *fpu = &current->thread.fpu;
+ void *dst;
+
+ if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
+ WARN_ONCE(1, "%s() attempted with no xsave support", __func__);
+ return;
+ }
+
+ /*
+ * Tell the FPU code that we need the FPU state to be in
+ * 'fpu' (not in the registers), and that we need it to
+ * be stable while we write to it.
+ */
+ fpu__current_fpstate_write_begin();
+
+ /*
+ * This method *WILL* *NOT* work for compact-format
+ * buffers. If the 'xstate_feature_mask' is unset in
+ * xcomp_bv then we may need to move other feature state
+ * "up" in the buffer.
+ */
+ if (xsave->header.xcomp_bv & xstate_feature_mask) {
+ WARN_ON_ONCE(1);
+ goto out;
+ }
+
+ /* find the location in the xsave buffer of the desired state */
+ dst = __raw_xsave_addr(&fpu->state.xsave, xstate_feature_mask);
+
+ /*
+ * Make sure that the pointer being passed in did not
+ * come from the xsave buffer itself.
+ */
+ WARN_ONCE(xstate_feature_src == dst, "set from xsave buffer itself");
+
+ /* put the caller-provided data in the location */
+ memcpy(dst, xstate_feature_src, len);
+
+ /*
+ * Mark the xfeature so that the CPU knows there is state
+ * in the buffer now.
+ */
+ fpu__xfeature_set_non_init(xsave, xstate_feature_mask);
+out:
+ /*
+ * We are done writing to the 'fpu'. Reenable preeption
+ * and (possibly) move the fpstate back in to the fpregs.
+ */
+ fpu__current_fpstate_write_end();
+}
+
+#define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2)
+#define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1)
+
+/*
+ * This will go out and modify the XSAVE buffer so that PKRU is
+ * set to a particular state for access to 'pkey'.
+ *
+ * PKRU state does affect kernel access to user memory. We do
+ * not modfiy PKRU *itself* here, only the XSAVE state that will
+ * be restored in to PKRU when we return back to userspace.
+ */
+int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+ unsigned long init_val)
+{
+ struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
+ struct pkru_state *old_pkru_state;
+ struct pkru_state new_pkru_state;
+ int pkey_shift = (pkey * PKRU_BITS_PER_PKEY);
+ u32 new_pkru_bits = 0;
+
+ /*
+ * This check implies XSAVE support. OSPKE only gets
+ * set if we enable XSAVE and we enable PKU in XCR0.
+ */
+ if (!boot_cpu_has(X86_FEATURE_OSPKE))
+ return -EINVAL;
+
+ /* Set the bits we need in PKRU */
+ if (init_val & PKEY_DISABLE_ACCESS)
+ new_pkru_bits |= PKRU_AD_BIT;
+ if (init_val & PKEY_DISABLE_WRITE)
+ new_pkru_bits |= PKRU_WD_BIT;
+
+ /* Shift the bits in to the correct place in PKRU for pkey. */
+ new_pkru_bits <<= pkey_shift;
+
+ /* Locate old copy of the state in the xsave buffer */
+ old_pkru_state = get_xsave_addr(xsave, XFEATURE_MASK_PKRU);
+
+ /*
+ * When state is not in the buffer, it is in the init
+ * state, set it manually. Otherwise, copy out the old
+ * state.
+ */
+ if (!old_pkru_state)
+ new_pkru_state.pkru = 0;
+ else
+ new_pkru_state.pkru = old_pkru_state->pkru;
+
+ /* mask off any old bits in place */
+ new_pkru_state.pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);
+ /* Set the newly-requested bits */
+ new_pkru_state.pkru |= new_pkru_bits;
+
+ /*
+ * We could theoretically live without zeroing pkru.pad.
+ * The current XSAVE feature state definition says that
+ * only bytes 0->3 are used. But we do not want to
+ * chance leaking kernel stack out to userspace in case a
+ * memcpy() of the whole xsave buffer was done.
+ *
+ * They're in the same cacheline anyway.
+ */
+ new_pkru_state.pad = 0;
+
+ fpu__xfeature_set_state(XFEATURE_MASK_PKRU, &new_pkru_state,
+ sizeof(new_pkru_state));
+
+ return 0;
+}
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 702547ce33c9..d036cfb4495d 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -1,5 +1,5 @@
/*
- * Code for replacing ftrace calls with jumps.
+ * Dynamic function tracing support.
*
* Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
*
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index be0ebbb6d1d1..a1f0e4a5c47e 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -717,7 +717,7 @@ static int hpet_cpuhp_notify(struct notifier_block *n,
struct hpet_work_struct work;
struct hpet_dev *hdev = per_cpu(cpu_hpet_dev, cpu);
- switch (action & 0xf) {
+ switch (action & ~CPU_TASKS_FROZEN) {
case CPU_ONLINE:
INIT_DELAYED_WORK_ONSTACK(&work.work, hpet_work);
init_completion(&work.complete);
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 37dae792dbbe..589b3193f102 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -96,9 +96,14 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
SYSCALL_DEFINE1(iopl, unsigned int, level)
{
struct pt_regs *regs = current_pt_regs();
- unsigned int old = (regs->flags >> 12) & 3;
struct thread_struct *t = &current->thread;
+ /*
+ * Careful: the IOPL bits in regs->flags are undefined under Xen PV
+ * and changing them has no effect.
+ */
+ unsigned int old = t->iopl >> X86_EFLAGS_IOPL_BIT;
+
if (level > 3)
return -EINVAL;
/* Trying to gain more privileges? */
@@ -106,8 +111,9 @@ SYSCALL_DEFINE1(iopl, unsigned int, level)
if (!capable(CAP_SYS_RAWIO))
return -EPERM;
}
- regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12);
- t->iopl = level << 12;
+ regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) |
+ (level << X86_EFLAGS_IOPL_BIT);
+ t->iopl = level << X86_EFLAGS_IOPL_BIT;
set_iopl_mask(t->iopl);
return 0;
diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c
index 0f8a6bbaaa44..2af478e3fd4e 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -271,7 +271,7 @@ static int bzImage64_probe(const char *buf, unsigned long len)
int ret = -ENOEXEC;
struct setup_header *header;
- /* kernel should be atleast two sectors long */
+ /* kernel should be at least two sectors long */
if (len < 2 * 512) {
pr_err("File is too short to be a bzImage\n");
return ret;
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index ed15cd486d06..2da6ee9ae69b 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -609,9 +609,9 @@ static struct notifier_block kgdb_notifier = {
};
/**
- * kgdb_arch_init - Perform any architecture specific initalization.
+ * kgdb_arch_init - Perform any architecture specific initialization.
*
- * This function will handle the initalization of any architecture
+ * This function will handle the initialization of any architecture
* specific callbacks.
*/
int kgdb_arch_init(void)
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 0f05deeff5ce..ae703acb85c1 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -49,6 +49,7 @@
#include <linux/kdebug.h>
#include <linux/kallsyms.h>
#include <linux/ftrace.h>
+#include <linux/frame.h>
#include <asm/cacheflush.h>
#include <asm/desc.h>
@@ -671,39 +672,39 @@ NOKPROBE_SYMBOL(kprobe_int3_handler);
* When a retprobed function returns, this code saves registers and
* calls trampoline_handler() runs, which calls the kretprobe's handler.
*/
-static void __used kretprobe_trampoline_holder(void)
-{
- asm volatile (
- ".global kretprobe_trampoline\n"
- "kretprobe_trampoline: \n"
+asm(
+ ".global kretprobe_trampoline\n"
+ ".type kretprobe_trampoline, @function\n"
+ "kretprobe_trampoline:\n"
#ifdef CONFIG_X86_64
- /* We don't bother saving the ss register */
- " pushq %rsp\n"
- " pushfq\n"
- SAVE_REGS_STRING
- " movq %rsp, %rdi\n"
- " call trampoline_handler\n"
- /* Replace saved sp with true return address. */
- " movq %rax, 152(%rsp)\n"
- RESTORE_REGS_STRING
- " popfq\n"
+ /* We don't bother saving the ss register */
+ " pushq %rsp\n"
+ " pushfq\n"
+ SAVE_REGS_STRING
+ " movq %rsp, %rdi\n"
+ " call trampoline_handler\n"
+ /* Replace saved sp with true return address. */
+ " movq %rax, 152(%rsp)\n"
+ RESTORE_REGS_STRING
+ " popfq\n"
#else
- " pushf\n"
- SAVE_REGS_STRING
- " movl %esp, %eax\n"
- " call trampoline_handler\n"
- /* Move flags to cs */
- " movl 56(%esp), %edx\n"
- " movl %edx, 52(%esp)\n"
- /* Replace saved flags with true return address. */
- " movl %eax, 56(%esp)\n"
- RESTORE_REGS_STRING
- " popf\n"
+ " pushf\n"
+ SAVE_REGS_STRING
+ " movl %esp, %eax\n"
+ " call trampoline_handler\n"
+ /* Move flags to cs */
+ " movl 56(%esp), %edx\n"
+ " movl %edx, 52(%esp)\n"
+ /* Replace saved flags with true return address. */
+ " movl %eax, 56(%esp)\n"
+ RESTORE_REGS_STRING
+ " popf\n"
#endif
- " ret\n");
-}
-NOKPROBE_SYMBOL(kretprobe_trampoline_holder);
+ " ret\n"
+ ".size kretprobe_trampoline, .-kretprobe_trampoline\n"
+);
NOKPROBE_SYMBOL(kretprobe_trampoline);
+STACK_FRAME_NON_STANDARD(kretprobe_trampoline);
/*
* Called from kretprobe_trampoline
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 47190bd399e7..807950860fb7 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -36,6 +36,7 @@
#include <linux/kprobes.h>
#include <linux/debugfs.h>
#include <linux/nmi.h>
+#include <linux/swait.h>
#include <asm/timer.h>
#include <asm/cpu.h>
#include <asm/traps.h>
@@ -91,14 +92,14 @@ static void kvm_io_delay(void)
struct kvm_task_sleep_node {
struct hlist_node link;
- wait_queue_head_t wq;
+ struct swait_queue_head wq;
u32 token;
int cpu;
bool halted;
};
static struct kvm_task_sleep_head {
- spinlock_t lock;
+ raw_spinlock_t lock;
struct hlist_head list;
} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
@@ -122,17 +123,17 @@ void kvm_async_pf_task_wait(u32 token)
u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
struct kvm_task_sleep_node n, *e;
- DEFINE_WAIT(wait);
+ DECLARE_SWAITQUEUE(wait);
rcu_irq_enter();
- spin_lock(&b->lock);
+ raw_spin_lock(&b->lock);
e = _find_apf_task(b, token);
if (e) {
/* dummy entry exist -> wake up was delivered ahead of PF */
hlist_del(&e->link);
kfree(e);
- spin_unlock(&b->lock);
+ raw_spin_unlock(&b->lock);
rcu_irq_exit();
return;
@@ -141,13 +142,13 @@ void kvm_async_pf_task_wait(u32 token)
n.token = token;
n.cpu = smp_processor_id();
n.halted = is_idle_task(current) || preempt_count() > 1;
- init_waitqueue_head(&n.wq);
+ init_swait_queue_head(&n.wq);
hlist_add_head(&n.link, &b->list);
- spin_unlock(&b->lock);
+ raw_spin_unlock(&b->lock);
for (;;) {
if (!n.halted)
- prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
+ prepare_to_swait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
if (hlist_unhashed(&n.link))
break;
@@ -166,7 +167,7 @@ void kvm_async_pf_task_wait(u32 token)
}
}
if (!n.halted)
- finish_wait(&n.wq, &wait);
+ finish_swait(&n.wq, &wait);
rcu_irq_exit();
return;
@@ -178,8 +179,8 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n)
hlist_del_init(&n->link);
if (n->halted)
smp_send_reschedule(n->cpu);
- else if (waitqueue_active(&n->wq))
- wake_up(&n->wq);
+ else if (swait_active(&n->wq))
+ swake_up(&n->wq);
}
static void apf_task_wake_all(void)
@@ -189,14 +190,14 @@ static void apf_task_wake_all(void)
for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
struct hlist_node *p, *next;
struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
- spin_lock(&b->lock);
+ raw_spin_lock(&b->lock);
hlist_for_each_safe(p, next, &b->list) {
struct kvm_task_sleep_node *n =
hlist_entry(p, typeof(*n), link);
if (n->cpu == smp_processor_id())
apf_task_wake_one(n);
}
- spin_unlock(&b->lock);
+ raw_spin_unlock(&b->lock);
}
}
@@ -212,7 +213,7 @@ void kvm_async_pf_task_wake(u32 token)
}
again:
- spin_lock(&b->lock);
+ raw_spin_lock(&b->lock);
n = _find_apf_task(b, token);
if (!n) {
/*
@@ -225,17 +226,17 @@ again:
* Allocation failed! Busy wait while other cpu
* handles async PF.
*/
- spin_unlock(&b->lock);
+ raw_spin_unlock(&b->lock);
cpu_relax();
goto again;
}
n->token = token;
n->cpu = smp_processor_id();
- init_waitqueue_head(&n->wq);
+ init_swait_queue_head(&n->wq);
hlist_add_head(&n->link, &b->list);
} else
apf_task_wake_one(n);
- spin_unlock(&b->lock);
+ raw_spin_unlock(&b->lock);
return;
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
@@ -486,7 +487,7 @@ void __init kvm_guest_init(void)
paravirt_ops_setup();
register_reboot_notifier(&kvm_pv_reboot_nb);
for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
- spin_lock_init(&async_pf_sleepers[i].lock);
+ raw_spin_lock_init(&async_pf_sleepers[i].lock);
if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
x86_init.irqs.trap_init = kvm_apf_trap_init;
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 72cef58693c7..1d39bfbd26bb 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -226,7 +226,7 @@ static void kvm_setup_secondary_clock(void)
* registered memory location. If the guest happens to shutdown, this memory
* won't be valid. In cases like kexec, in which you install a new kernel, this
* means a random memory location will be kept being written. So before any
- * kind of shutdown from our side, we unregister the clock by writting anything
+ * kind of shutdown from our side, we unregister the clock by writing anything
* that does not have the 'enable' bit set in the msr
*/
#ifdef CONFIG_KEXEC_CORE
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 6acc9dd91f36..6707039b9032 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -103,7 +103,7 @@ static void free_ldt_struct(struct ldt_struct *ldt)
* we do not have to muck with descriptors here, that is
* done in switch_mm() as needed.
*/
-int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
{
struct ldt_struct *new_ldt;
struct mm_struct *old_mm;
@@ -144,7 +144,7 @@ out_unlock:
*
* 64bit: Don't touch the LDT register - we're already in the next thread.
*/
-void destroy_context(struct mm_struct *mm)
+void destroy_context_ldt(struct mm_struct *mm)
{
free_ldt_struct(mm->context.ldt);
mm->context.ldt = NULL;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b9d99e0f82c4..6cbab31ac23a 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -48,6 +48,7 @@
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>
+#include <asm/xen/hypervisor.h>
asmlinkage extern void ret_from_fork(void);
@@ -116,6 +117,8 @@ void __show_regs(struct pt_regs *regs, int all)
printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
+ if (boot_cpu_has(X86_FEATURE_OSPKE))
+ printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru());
}
void release_thread(struct task_struct *dead_task)
@@ -411,6 +414,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
__switch_to_xtra(prev_p, next_p, tss);
+#ifdef CONFIG_XEN
+ /*
+ * On Xen PV, IOPL bits in pt_regs->flags have no effect, and
+ * current_pt_regs()->flags may not match the current task's
+ * intended IOPL. We need to switch it manually.
+ */
+ if (unlikely(static_cpu_has(X86_FEATURE_XENPV) &&
+ prev->iopl != next->iopl))
+ xen_set_iopl_mask(next->iopl);
+#endif
+
if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
/*
* AMD CPUs have a misfeature: SYSRET sets the SS selector but
@@ -476,7 +490,7 @@ void set_personality_ia32(bool x32)
if (current->mm)
current->mm->context.ia32_compat = TIF_X32;
current->personality &= ~READ_IMPLIES_EXEC;
- /* is_compat_task() uses the presence of the x32
+ /* in_compat_syscall() uses the presence of the x32
syscall bit flag to determine compat status */
current_thread_info()->status &= ~TS_COMPAT;
} else {
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index aa52c1009475..2367ae07eb76 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -112,6 +112,7 @@
#include <asm/alternative.h>
#include <asm/prom.h>
#include <asm/microcode.h>
+#include <asm/mmu_context.h>
/*
* max_low_pfn_mapped: highest direct mapped pfn under 4GB
@@ -1282,3 +1283,11 @@ static int __init register_kernel_offset_dumper(void)
return 0;
}
__initcall(register_kernel_offset_dumper);
+
+void arch_show_smap(struct seq_file *m, struct vm_area_struct *vma)
+{
+ if (!boot_cpu_has(X86_FEATURE_OSPKE))
+ return;
+
+ seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
+}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 643dbdccf4bc..b2c99f811c3f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -274,11 +274,6 @@ int topology_update_package_map(unsigned int apicid, unsigned int cpu)
if (test_and_set_bit(pkg, physical_package_map))
goto found;
- if (pkg < __max_logical_packages) {
- set_bit(pkg, logical_package_map);
- physical_to_logical_pkg[pkg] = pkg;
- goto found;
- }
new = find_first_zero_bit(logical_package_map, __max_logical_packages);
if (new >= __max_logical_packages) {
physical_to_logical_pkg[pkg] = -1;
@@ -317,9 +312,27 @@ static void __init smp_init_package_map(void)
/*
* Today neither Intel nor AMD support heterogenous systems. That
* might change in the future....
+ *
+ * While ideally we'd want '* smp_num_siblings' in the below @ncpus
+ * computation, this won't actually work since some Intel BIOSes
+ * report inconsistent HT data when they disable HT.
+ *
+ * In particular, they reduce the APIC-IDs to only include the cores,
+ * but leave the CPUID topology to say there are (2) siblings.
+ * This means we don't know how many threads there will be until
+ * after the APIC enumeration.
+ *
+ * By not including this we'll sometimes over-estimate the number of
+ * logical packages by the amount of !present siblings, but this is
+ * still better than MAX_LOCAL_APIC.
+ *
+ * We use total_cpus not nr_cpu_ids because nr_cpu_ids can be limited
+ * on the command line leading to a similar issue as the HT disable
+ * problem because the hyperthreads are usually enumerated after the
+ * primary cores.
*/
- ncpus = boot_cpu_data.x86_max_cores * smp_num_siblings;
- __max_logical_packages = DIV_ROUND_UP(nr_cpu_ids, ncpus);
+ ncpus = boot_cpu_data.x86_max_cores;
+ __max_logical_packages = DIV_ROUND_UP(total_cpus, ncpus);
/*
* Possibly larger than what we need as the number of apic ids per
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index fdd0c6430e5a..9ee98eefc44d 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -14,30 +14,34 @@ static int save_stack_stack(void *data, char *name)
return 0;
}
-static void
+static int
__save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched)
{
struct stack_trace *trace = data;
#ifdef CONFIG_FRAME_POINTER
if (!reliable)
- return;
+ return 0;
#endif
if (nosched && in_sched_functions(addr))
- return;
+ return 0;
if (trace->skip > 0) {
trace->skip--;
- return;
+ return 0;
}
- if (trace->nr_entries < trace->max_entries)
+ if (trace->nr_entries < trace->max_entries) {
trace->entries[trace->nr_entries++] = addr;
+ return 0;
+ } else {
+ return -1; /* no more room, stop walking the stack */
+ }
}
-static void save_stack_address(void *data, unsigned long addr, int reliable)
+static int save_stack_address(void *data, unsigned long addr, int reliable)
{
return __save_stack_address(data, addr, reliable, false);
}
-static void
+static int
save_stack_address_nosched(void *data, unsigned long addr, int reliable)
{
return __save_stack_address(data, addr, reliable, true);
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 91a4496db434..e72a07f20b05 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -135,7 +135,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
pmd = pmd_alloc(&tboot_mm, pud, vaddr);
if (!pmd)
return -1;
- pte = pte_alloc_map(&tboot_mm, NULL, pmd, vaddr);
+ pte = pte_alloc_map(&tboot_mm, pmd, vaddr);
if (!pte)
return -1;
set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 56380440d862..c9c4c7ce3eb2 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -881,7 +881,7 @@ void tsc_restore_sched_clock_state(void)
local_irq_save(flags);
/*
- * We're comming out of suspend, there's no concurrency yet; don't
+ * We're coming out of suspend, there's no concurrency yet; don't
* bother being nice about the RCU stuff, just write to both
* data fields.
*/
@@ -1306,11 +1306,15 @@ void __init tsc_init(void)
unsigned long calibrate_delay_is_known(void)
{
int sibling, cpu = smp_processor_id();
+ struct cpumask *mask = topology_core_cpumask(cpu);
if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC))
return 0;
- sibling = cpumask_any_but(topology_core_cpumask(cpu), cpu);
+ if (!mask)
+ return 0;
+
+ sibling = cpumask_any_but(mask, cpu);
if (sibling < nr_cpu_ids)
return cpu_data(sibling).loops_per_jiffy;
return 0;
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 5af9958cbdb6..d239639e0c1d 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -81,11 +81,11 @@ PHDRS {
SECTIONS
{
#ifdef CONFIG_X86_32
- . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
- phys_startup_32 = startup_32 - LOAD_OFFSET;
+ . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
+ phys_startup_32 = ABSOLUTE(startup_32 - LOAD_OFFSET);
#else
- . = __START_KERNEL;
- phys_startup_64 = startup_64 - LOAD_OFFSET;
+ . = __START_KERNEL;
+ phys_startup_64 = ABSOLUTE(startup_64 - LOAD_OFFSET);
#endif
/* Text and read-only data */
@@ -333,6 +333,7 @@ SECTIONS
__brk_limit = .;
}
+ . = ALIGN(PAGE_SIZE);
_end = .;
STABS_DEBUG
@@ -340,7 +341,10 @@ SECTIONS
/* Sections to be discarded */
DISCARDS
- /DISCARD/ : { *(.eh_frame) }
+ /DISCARD/ : {
+ *(.eh_frame)
+ *(__func_stack_frame_non_standard)
+ }
}
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 0029644bf09c..8efb839948e5 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -88,6 +88,16 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
apic->lapic_timer.timer_mode_mask = 1 << 17;
}
+ best = kvm_find_cpuid_entry(vcpu, 7, 0);
+ if (best) {
+ /* Update OSPKE bit */
+ if (boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) {
+ best->ecx &= ~F(OSPKE);
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_PKE))
+ best->ecx |= F(OSPKE);
+ }
+ }
+
best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
if (!best) {
vcpu->arch.guest_supported_xcr0 = 0;
@@ -305,7 +315,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0;
/* cpuid 1.edx */
- const u32 kvm_supported_word0_x86_features =
+ const u32 kvm_cpuid_1_edx_x86_features =
F(FPU) | F(VME) | F(DE) | F(PSE) |
F(TSC) | F(MSR) | F(PAE) | F(MCE) |
F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
@@ -315,7 +325,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
0 /* HTT, TM, Reserved, PBE */;
/* cpuid 0x80000001.edx */
- const u32 kvm_supported_word1_x86_features =
+ const u32 kvm_cpuid_8000_0001_edx_x86_features =
F(FPU) | F(VME) | F(DE) | F(PSE) |
F(TSC) | F(MSR) | F(PAE) | F(MCE) |
F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
@@ -325,7 +335,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
/* cpuid 1.ecx */
- const u32 kvm_supported_word4_x86_features =
+ const u32 kvm_cpuid_1_ecx_x86_features =
/* NOTE: MONITOR (and MWAIT) are emulated as NOP,
* but *not* advertised to guests via CPUID ! */
F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
@@ -337,29 +347,32 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
F(F16C) | F(RDRAND);
/* cpuid 0x80000001.ecx */
- const u32 kvm_supported_word6_x86_features =
+ const u32 kvm_cpuid_8000_0001_ecx_x86_features =
F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) |
0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
/* cpuid 0xC0000001.edx */
- const u32 kvm_supported_word5_x86_features =
+ const u32 kvm_cpuid_C000_0001_edx_x86_features =
F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
F(PMM) | F(PMM_EN);
/* cpuid 7.0.ebx */
- const u32 kvm_supported_word9_x86_features =
+ const u32 kvm_cpuid_7_0_ebx_x86_features =
F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(PCOMMIT);
/* cpuid 0xD.1.eax */
- const u32 kvm_supported_word10_x86_features =
+ const u32 kvm_cpuid_D_1_eax_x86_features =
F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves;
+ /* cpuid 7.0.ecx*/
+ const u32 kvm_cpuid_7_0_ecx_x86_features = F(PKU) | 0 /*OSPKE*/;
+
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
@@ -376,10 +389,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->eax = min(entry->eax, (u32)0xd);
break;
case 1:
- entry->edx &= kvm_supported_word0_x86_features;
- cpuid_mask(&entry->edx, 0);
- entry->ecx &= kvm_supported_word4_x86_features;
- cpuid_mask(&entry->ecx, 4);
+ entry->edx &= kvm_cpuid_1_edx_x86_features;
+ cpuid_mask(&entry->edx, CPUID_1_EDX);
+ entry->ecx &= kvm_cpuid_1_ecx_x86_features;
+ cpuid_mask(&entry->ecx, CPUID_1_ECX);
/* we support x2apic emulation even if host does not support
* it since we emulate x2apic in software */
entry->ecx |= F(X2APIC);
@@ -433,14 +446,20 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
/* Mask ebx against host capability word 9 */
if (index == 0) {
- entry->ebx &= kvm_supported_word9_x86_features;
- cpuid_mask(&entry->ebx, 9);
+ entry->ebx &= kvm_cpuid_7_0_ebx_x86_features;
+ cpuid_mask(&entry->ebx, CPUID_7_0_EBX);
// TSC_ADJUST is emulated
entry->ebx |= F(TSC_ADJUST);
- } else
+ entry->ecx &= kvm_cpuid_7_0_ecx_x86_features;
+ cpuid_mask(&entry->ecx, CPUID_7_ECX);
+ /* PKU is not yet implemented for shadow paging. */
+ if (!tdp_enabled)
+ entry->ecx &= ~F(PKU);
+ } else {
entry->ebx = 0;
+ entry->ecx = 0;
+ }
entry->eax = 0;
- entry->ecx = 0;
entry->edx = 0;
break;
}
@@ -514,7 +533,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
do_cpuid_1_ent(&entry[i], function, idx);
if (idx == 1) {
- entry[i].eax &= kvm_supported_word10_x86_features;
+ entry[i].eax &= kvm_cpuid_D_1_eax_x86_features;
entry[i].ebx = 0;
if (entry[i].eax & (F(XSAVES)|F(XSAVEC)))
entry[i].ebx =
@@ -564,10 +583,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->eax = min(entry->eax, 0x8000001a);
break;
case 0x80000001:
- entry->edx &= kvm_supported_word1_x86_features;
- cpuid_mask(&entry->edx, 1);
- entry->ecx &= kvm_supported_word6_x86_features;
- cpuid_mask(&entry->ecx, 6);
+ entry->edx &= kvm_cpuid_8000_0001_edx_x86_features;
+ cpuid_mask(&entry->edx, CPUID_8000_0001_EDX);
+ entry->ecx &= kvm_cpuid_8000_0001_ecx_x86_features;
+ cpuid_mask(&entry->ecx, CPUID_8000_0001_ECX);
break;
case 0x80000007: /* Advanced power management */
/* invariant TSC is CPUID.80000007H:EDX[8] */
@@ -600,8 +619,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->eax = min(entry->eax, 0xC0000004);
break;
case 0xC0000001:
- entry->edx &= kvm_supported_word5_x86_features;
- cpuid_mask(&entry->edx, 5);
+ entry->edx &= kvm_cpuid_C000_0001_edx_x86_features;
+ cpuid_mask(&entry->edx, CPUID_C000_0001_EDX);
break;
case 3: /* Processor serial number */
case 5: /* MONITOR/MWAIT */
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 66a6581724ad..e17a74b1d852 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -80,6 +80,14 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
}
+static inline bool guest_cpuid_has_pku(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 7, 0);
+ return best && (best->ecx & bit(X86_FEATURE_PKU));
+}
+
static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index b9b09fec173b..0f6294376fbd 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -309,23 +309,29 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt)
static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
-#define FOP_ALIGN ".align " __stringify(FASTOP_SIZE) " \n\t"
+#define FOP_FUNC(name) \
+ ".align " __stringify(FASTOP_SIZE) " \n\t" \
+ ".type " name ", @function \n\t" \
+ name ":\n\t"
+
#define FOP_RET "ret \n\t"
#define FOP_START(op) \
extern void em_##op(struct fastop *fake); \
asm(".pushsection .text, \"ax\" \n\t" \
".global em_" #op " \n\t" \
- FOP_ALIGN \
- "em_" #op ": \n\t"
+ FOP_FUNC("em_" #op)
#define FOP_END \
".popsection")
-#define FOPNOP() FOP_ALIGN FOP_RET
+#define FOPNOP() \
+ FOP_FUNC(__stringify(__UNIQUE_ID(nop))) \
+ FOP_RET
#define FOP1E(op, dst) \
- FOP_ALIGN "10: " #op " %" #dst " \n\t" FOP_RET
+ FOP_FUNC(#op "_" #dst) \
+ "10: " #op " %" #dst " \n\t" FOP_RET
#define FOP1EEX(op, dst) \
FOP1E(op, dst) _ASM_EXTABLE(10b, kvm_fastop_exception)
@@ -357,7 +363,8 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
FOP_END
#define FOP2E(op, dst, src) \
- FOP_ALIGN #op " %" #src ", %" #dst " \n\t" FOP_RET
+ FOP_FUNC(#op "_" #dst "_" #src) \
+ #op " %" #src ", %" #dst " \n\t" FOP_RET
#define FASTOP2(op) \
FOP_START(op) \
@@ -395,7 +402,8 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
FOP_END
#define FOP3E(op, dst, src, src2) \
- FOP_ALIGN #op " %" #src2 ", %" #src ", %" #dst " \n\t" FOP_RET
+ FOP_FUNC(#op "_" #dst "_" #src "_" #src2) \
+ #op " %" #src2 ", %" #src ", %" #dst " \n\t" FOP_RET
/* 3-operand, word-only, src2=cl */
#define FASTOP3WCL(op) \
@@ -407,7 +415,12 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
FOP_END
/* Special case for SETcc - 1 instruction per cc */
-#define FOP_SETCC(op) ".align 4; " #op " %al; ret \n\t"
+#define FOP_SETCC(op) \
+ ".align 4 \n\t" \
+ ".type " #op ", @function \n\t" \
+ #op ": \n\t" \
+ #op " %al \n\t" \
+ FOP_RET
asm(".global kvm_fastop_exception \n"
"kvm_fastop_exception: xor %esi, %esi; ret");
@@ -956,7 +969,7 @@ static int em_bsr_c(struct x86_emulate_ctxt *ctxt)
return fastop(ctxt, em_bsr);
}
-static u8 test_cc(unsigned int condition, unsigned long flags)
+static __always_inline u8 test_cc(unsigned int condition, unsigned long flags)
{
u8 rc;
void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf);
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index e1e89ee4af75..762cdf2595f9 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -84,6 +84,11 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
| ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
}
+static inline u32 kvm_read_pkru(struct kvm_vcpu *vcpu)
+{
+ return kvm_x86_ops->get_pkru(vcpu);
+}
+
static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
{
vcpu->arch.hflags |= HF_GUEST_MASK;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c512f095cdac..70e95d097ef1 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -479,7 +479,7 @@ static bool spte_is_locklessly_modifiable(u64 spte)
static bool spte_has_volatile_bits(u64 spte)
{
/*
- * Always atomicly update spte if it can be updated
+ * Always atomically update spte if it can be updated
* out of mmu-lock, it can ensure dirty bit is not lost,
* also, it can help us to get a stable is_writable_pte()
* to ensure tlb flush is not missed.
@@ -550,7 +550,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
/*
* For the spte updated out of mmu-lock is safe, since
- * we always atomicly update it, see the comments in
+ * we always atomically update it, see the comments in
* spte_has_volatile_bits().
*/
if (spte_is_locklessly_modifiable(old_spte) &&
@@ -632,12 +632,12 @@ static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
* kvm_flush_remote_tlbs() IPI to all active vcpus.
*/
local_irq_disable();
- vcpu->mode = READING_SHADOW_PAGE_TABLES;
+
/*
* Make sure a following spte read is not reordered ahead of the write
* to vcpu->mode.
*/
- smp_mb();
+ smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
}
static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
@@ -647,8 +647,7 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
* reads to sptes. If it does, kvm_commit_zap_page() can see us
* OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
*/
- smp_mb();
- vcpu->mode = OUTSIDE_GUEST_MODE;
+ smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
local_irq_enable();
}
@@ -2390,14 +2389,13 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
return;
/*
- * wmb: make sure everyone sees our modifications to the page tables
- * rmb: make sure we see changes to vcpu->mode
- */
- smp_mb();
-
- /*
- * Wait for all vcpus to exit guest mode and/or lockless shadow
- * page table walks.
+ * We need to make sure everyone sees our modifications to
+ * the page tables and see changes to vcpu->mode here. The barrier
+ * in the kvm_flush_remote_tlbs() achieves this. This pairs
+ * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
+ *
+ * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
+ * guest mode and/or lockless shadow page table walks.
*/
kvm_flush_remote_tlbs(kvm);
@@ -3923,6 +3921,81 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu,
}
}
+/*
+* PKU is an additional mechanism by which the paging controls access to
+* user-mode addresses based on the value in the PKRU register. Protection
+* key violations are reported through a bit in the page fault error code.
+* Unlike other bits of the error code, the PK bit is not known at the
+* call site of e.g. gva_to_gpa; it must be computed directly in
+* permission_fault based on two bits of PKRU, on some machine state (CR4,
+* CR0, EFER, CPL), and on other bits of the error code and the page tables.
+*
+* In particular the following conditions come from the error code, the
+* page tables and the machine state:
+* - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
+* - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
+* - PK is always zero if U=0 in the page tables
+* - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
+*
+* The PKRU bitmask caches the result of these four conditions. The error
+* code (minus the P bit) and the page table's U bit form an index into the
+* PKRU bitmask. Two bits of the PKRU bitmask are then extracted and ANDed
+* with the two bits of the PKRU register corresponding to the protection key.
+* For the first three conditions above the bits will be 00, thus masking
+* away both AD and WD. For all reads or if the last condition holds, WD
+* only will be masked away.
+*/
+static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ bool ept)
+{
+ unsigned bit;
+ bool wp;
+
+ if (ept) {
+ mmu->pkru_mask = 0;
+ return;
+ }
+
+ /* PKEY is enabled only if CR4.PKE and EFER.LMA are both set. */
+ if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) {
+ mmu->pkru_mask = 0;
+ return;
+ }
+
+ wp = is_write_protection(vcpu);
+
+ for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
+ unsigned pfec, pkey_bits;
+ bool check_pkey, check_write, ff, uf, wf, pte_user;
+
+ pfec = bit << 1;
+ ff = pfec & PFERR_FETCH_MASK;
+ uf = pfec & PFERR_USER_MASK;
+ wf = pfec & PFERR_WRITE_MASK;
+
+ /* PFEC.RSVD is replaced by ACC_USER_MASK. */
+ pte_user = pfec & PFERR_RSVD_MASK;
+
+ /*
+ * Only need to check the access which is not an
+ * instruction fetch and is to a user page.
+ */
+ check_pkey = (!ff && pte_user);
+ /*
+ * write access is controlled by PKRU if it is a
+ * user access or CR0.WP = 1.
+ */
+ check_write = check_pkey && wf && (uf || wp);
+
+ /* PKRU.AD stops both read and write access. */
+ pkey_bits = !!check_pkey;
+ /* PKRU.WD stops write access. */
+ pkey_bits |= (!!check_write) << 1;
+
+ mmu->pkru_mask |= (pkey_bits & 3) << pfec;
+ }
+}
+
static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
{
unsigned root_level = mmu->root_level;
@@ -3941,6 +4014,7 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
reset_rsvds_bits_mask(vcpu, context);
update_permission_bitmask(vcpu, context, false);
+ update_pkru_bitmask(vcpu, context, false);
update_last_nonleaf_level(vcpu, context);
MMU_WARN_ON(!is_pae(vcpu));
@@ -3968,6 +4042,7 @@ static void paging32_init_context(struct kvm_vcpu *vcpu,
reset_rsvds_bits_mask(vcpu, context);
update_permission_bitmask(vcpu, context, false);
+ update_pkru_bitmask(vcpu, context, false);
update_last_nonleaf_level(vcpu, context);
context->page_fault = paging32_page_fault;
@@ -4026,6 +4101,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
}
update_permission_bitmask(vcpu, context, false);
+ update_pkru_bitmask(vcpu, context, false);
update_last_nonleaf_level(vcpu, context);
reset_tdp_shadow_zero_bits_mask(vcpu, context);
}
@@ -4078,6 +4154,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly)
context->direct_map = false;
update_permission_bitmask(vcpu, context, true);
+ update_pkru_bitmask(vcpu, context, true);
reset_rsvds_bits_mask_ept(vcpu, context, execonly);
reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
}
@@ -4132,6 +4209,7 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
}
update_permission_bitmask(vcpu, g_context, false);
+ update_pkru_bitmask(vcpu, g_context, false);
update_last_nonleaf_level(vcpu, g_context);
}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 58fe98a0a526..b70df72e2b33 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -10,10 +10,11 @@
#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
#define PT_WRITABLE_SHIFT 1
+#define PT_USER_SHIFT 2
#define PT_PRESENT_MASK (1ULL << 0)
#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
-#define PT_USER_MASK (1ULL << 2)
+#define PT_USER_MASK (1ULL << PT_USER_SHIFT)
#define PT_PWT_MASK (1ULL << 3)
#define PT_PCD_MASK (1ULL << 4)
#define PT_ACCESSED_SHIFT 5
@@ -141,11 +142,16 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu)
}
/*
- * Will a fault with a given page-fault error code (pfec) cause a permission
- * fault with the given access (in ACC_* format)?
+ * Check if a given access (described through the I/D, W/R and U/S bits of a
+ * page fault error code pfec) causes a permission fault with the given PTE
+ * access rights (in ACC_* format).
+ *
+ * Return zero if the access does not fault; return the page fault error code
+ * if the access faults.
*/
-static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- unsigned pte_access, unsigned pfec)
+static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ unsigned pte_access, unsigned pte_pkey,
+ unsigned pfec)
{
int cpl = kvm_x86_ops->get_cpl(vcpu);
unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
@@ -166,10 +172,32 @@ static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
unsigned long smap = (cpl - 3) & (rflags & X86_EFLAGS_AC);
int index = (pfec >> 1) +
(smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1));
+ bool fault = (mmu->permissions[index] >> pte_access) & 1;
+
+ WARN_ON(pfec & (PFERR_PK_MASK | PFERR_RSVD_MASK));
+ pfec |= PFERR_PRESENT_MASK;
+
+ if (unlikely(mmu->pkru_mask)) {
+ u32 pkru_bits, offset;
+
+ /*
+ * PKRU defines 32 bits, there are 16 domains and 2
+ * attribute bits per domain in pkru. pte_pkey is the
+ * index of the protection domain, so pte_pkey * 2 is
+ * is the index of the first bit for the domain.
+ */
+ pkru_bits = (kvm_read_pkru(vcpu) >> (pte_pkey * 2)) & 3;
+
+ /* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */
+ offset = pfec - 1 +
+ ((pte_access & PT_USER_MASK) << (PFERR_RSVD_BIT - PT_USER_SHIFT));
- WARN_ON(pfec & PFERR_RSVD_MASK);
+ pkru_bits &= mmu->pkru_mask >> offset;
+ pfec |= -pkru_bits & PFERR_PK_MASK;
+ fault |= (pkru_bits != 0);
+ }
- return (mmu->permissions[index] >> pte_access) & 1;
+ return -(uint32_t)fault & pfec;
}
void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);
diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c
index 11f76436f74f..b431539c3714 100644
--- a/arch/x86/kvm/page_track.c
+++ b/arch/x86/kvm/page_track.c
@@ -142,12 +142,17 @@ void kvm_slot_page_track_remove_page(struct kvm *kvm,
bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn,
enum kvm_page_track_mode mode)
{
- struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
- int index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL);
+ struct kvm_memory_slot *slot;
+ int index;
if (WARN_ON(!page_track_mode_is_valid(mode)))
return false;
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ if (!slot)
+ return false;
+
+ index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL);
return !!ACCESS_ONCE(slot->arch.gfn_track[mode][index]);
}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index e159a8185ad9..1d971c7553c3 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -257,6 +257,17 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
return 0;
}
+static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
+{
+ unsigned pkeys = 0;
+#if PTTYPE == 64
+ pte_t pte = {.pte = gpte};
+
+ pkeys = pte_flags_pkey(pte_flags(pte));
+#endif
+ return pkeys;
+}
+
/*
* Fetch a guest pte for a guest virtual address
*/
@@ -268,7 +279,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
pt_element_t pte;
pt_element_t __user *uninitialized_var(ptep_user);
gfn_t table_gfn;
- unsigned index, pt_access, pte_access, accessed_dirty;
+ unsigned index, pt_access, pte_access, accessed_dirty, pte_pkey;
gpa_t pte_gpa;
int offset;
const int write_fault = access & PFERR_WRITE_MASK;
@@ -359,10 +370,10 @@ retry_walk:
walker->ptes[walker->level - 1] = pte;
} while (!is_last_gpte(mmu, walker->level, pte));
- if (unlikely(permission_fault(vcpu, mmu, pte_access, access))) {
- errcode |= PFERR_PRESENT_MASK;
+ pte_pkey = FNAME(gpte_pkeys)(vcpu, pte);
+ errcode = permission_fault(vcpu, mmu, pte_access, pte_pkey, access);
+ if (unlikely(errcode))
goto error;
- }
gfn = gpte_to_gfn_lvl(pte, walker->level);
gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;
@@ -949,6 +960,12 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
return 0;
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
+ /*
+ * Update spte before increasing tlbs_dirty to make
+ * sure no tlb flush is lost after spte is zapped; see
+ * the comments in kvm_flush_remote_tlbs().
+ */
+ smp_wmb();
vcpu->kvm->tlbs_dirty++;
continue;
}
@@ -964,6 +981,11 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
if (gfn != sp->gfns[i]) {
drop_spte(vcpu->kvm, &sp->spt[i]);
+ /*
+ * The same as above where we are doing
+ * prefetch_invalid_gpte().
+ */
+ smp_wmb();
vcpu->kvm->tlbs_dirty++;
continue;
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 95070386d599..31346a3f20a5 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1280,6 +1280,11 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
to_svm(vcpu)->vmcb->save.rflags = rflags;
}
+static u32 svm_get_pkru(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
switch (reg) {
@@ -4347,6 +4352,9 @@ static struct kvm_x86_ops svm_x86_ops = {
.cache_reg = svm_cache_reg,
.get_rflags = svm_get_rflags,
.set_rflags = svm_set_rflags,
+
+ .get_pkru = svm_get_pkru,
+
.fpu_activate = svm_fpu_activate,
.fpu_deactivate = svm_fpu_deactivate,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5e45c2731a5d..ee1c8a93871c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -598,6 +598,10 @@ struct vcpu_vmx {
struct page *pml_pg;
u64 current_tsc_ratio;
+
+ bool guest_pkru_valid;
+ u32 guest_pkru;
+ u32 host_pkru;
};
enum segment_cache_field {
@@ -2107,6 +2111,7 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
} while (cmpxchg(&pi_desc->control, old.control,
new.control) != old.control);
}
+
/*
* Switches to specified vcpu, until a matching vcpu_put(), but assumes
* vcpu mutex is already taken.
@@ -2167,6 +2172,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
}
vmx_vcpu_pi_load(vcpu, cpu);
+ vmx->host_pkru = read_pkru();
}
static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
@@ -2286,6 +2292,11 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
vmcs_writel(GUEST_RFLAGS, rflags);
}
+static u32 vmx_get_pkru(struct kvm_vcpu *vcpu)
+{
+ return to_vmx(vcpu)->guest_pkru;
+}
+
static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
@@ -2712,8 +2723,15 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
} else
vmx->nested.nested_vmx_ept_caps = 0;
+ /*
+ * Old versions of KVM use the single-context version without
+ * checking for support, so declare that it is supported even
+ * though it is treated as global context. The alternative is
+ * not failing the single-context invvpid, and it is worse.
+ */
if (enable_vpid)
vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
+ VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |
VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
else
vmx->nested.nested_vmx_vpid_caps = 0;
@@ -3886,13 +3904,17 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
if (!enable_unrestricted_guest && !is_paging(vcpu))
/*
- * SMEP/SMAP is disabled if CPU is in non-paging mode in
- * hardware. However KVM always uses paging mode without
- * unrestricted guest.
- * To emulate this behavior, SMEP/SMAP needs to be manually
- * disabled when guest switches to non-paging mode.
+ * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
+ * hardware. To emulate this behavior, SMEP/SMAP/PKU needs
+ * to be manually disabled when guest switches to non-paging
+ * mode.
+ *
+ * If !enable_unrestricted_guest, the CPU is always running
+ * with CR0.PG=1 and CR4 needs to be modified.
+ * If enable_unrestricted_guest, the CPU automatically
+ * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
*/
- hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP);
+ hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
vmcs_writel(CR4_READ_SHADOW, cr4);
vmcs_writel(GUEST_CR4, hw_cr4);
@@ -5506,7 +5528,7 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
return kvm_set_cr4(vcpu, val);
}
-/* called to set cr0 as approriate for clts instruction exit. */
+/* called to set cr0 as appropriate for clts instruction exit. */
static void handle_clts(struct kvm_vcpu *vcpu)
{
if (is_guest_mode(vcpu)) {
@@ -7245,7 +7267,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
/* The value to write might be 32 or 64 bits, depending on L1's long
* mode, and eventually we need to write that into a field of several
* possible lengths. The code below first zero-extends the value to 64
- * bit (field_value), and then copies only the approriate number of
+ * bit (field_value), and then copies only the appropriate number of
* bits into the vmcs12 field.
*/
u64 field_value = 0;
@@ -7399,6 +7421,7 @@ static int handle_invept(struct kvm_vcpu *vcpu)
if (!(types & (1UL << type))) {
nested_vmx_failValid(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ skip_emulated_instruction(vcpu);
return 1;
}
@@ -7457,6 +7480,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
if (!(types & (1UL << type))) {
nested_vmx_failValid(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ skip_emulated_instruction(vcpu);
return 1;
}
@@ -7473,12 +7497,17 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
}
switch (type) {
+ case VMX_VPID_EXTENT_SINGLE_CONTEXT:
+ /*
+ * Old versions of KVM use the single-context version so we
+ * have to support it; just treat it the same as all-context.
+ */
case VMX_VPID_EXTENT_ALL_CONTEXT:
__vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02);
nested_vmx_succeed(vcpu);
break;
default:
- /* Trap single context invalidation invvpid calls */
+ /* Trap individual address invalidation invvpid calls */
BUG_ON(1);
break;
}
@@ -8385,6 +8414,7 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
{
u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+ register void *__sp asm(_ASM_SP);
/*
* If external interrupt exists, IF bit is set in rflags/eflags on the
@@ -8417,8 +8447,9 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
"call *%[entry]\n\t"
:
#ifdef CONFIG_X86_64
- [sp]"=&r"(tmp)
+ [sp]"=&r"(tmp),
#endif
+ "+r"(__sp)
:
[entry]"r"(entry),
[ss]"i"(__KERNEL_DS),
@@ -8619,6 +8650,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
vmx_set_interrupt_shadow(vcpu, 0);
+ if (vmx->guest_pkru_valid)
+ __write_pkru(vmx->guest_pkru);
+
atomic_switch_perf_msrs(vmx);
debugctlmsr = get_debugctlmsr();
@@ -8759,6 +8793,20 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
/*
+ * eager fpu is enabled if PKEY is supported and CR4 is switched
+ * back on host, so it is safe to read guest PKRU from current
+ * XSAVE.
+ */
+ if (boot_cpu_has(X86_FEATURE_OSPKE)) {
+ vmx->guest_pkru = __read_pkru();
+ if (vmx->guest_pkru != vmx->host_pkru) {
+ vmx->guest_pkru_valid = true;
+ __write_pkru(vmx->host_pkru);
+ } else
+ vmx->guest_pkru_valid = false;
+ }
+
+ /*
* the KVM_REQ_EVENT optimization bit is only on for one entry, and if
* we did not inject a still-pending event to L1 now because of
* nested_run_pending, we need to re-enable this bit.
@@ -10882,6 +10930,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
.cache_reg = vmx_cache_reg,
.get_rflags = vmx_get_rflags,
.set_rflags = vmx_set_rflags,
+
+ .get_pkru = vmx_get_pkru,
+
.fpu_activate = vmx_fpu_activate,
.fpu_deactivate = vmx_fpu_deactivate,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7236bd3a4c3d..742d0f7d3556 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -723,7 +723,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
unsigned long old_cr4 = kvm_read_cr4(vcpu);
unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
- X86_CR4_SMEP | X86_CR4_SMAP;
+ X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;
if (cr4 & CR4_RESERVED_BITS)
return 1;
@@ -740,6 +740,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE))
return 1;
+ if (!guest_cpuid_has_pku(vcpu) && (cr4 & X86_CR4_PKE))
+ return 1;
+
if (is_long_mode(vcpu)) {
if (!(cr4 & X86_CR4_PAE))
return 1;
@@ -765,7 +768,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
(!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
kvm_mmu_reset_context(vcpu);
- if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
+ if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
kvm_update_cpuid(vcpu);
return 0;
@@ -1559,7 +1562,7 @@ static cycle_t read_tsc(void)
/*
* GCC likes to generate cmov here, but this branch is extremely
- * predictable (it's just a funciton of time and the likely is
+ * predictable (it's just a function of time and the likely is
* very likely) and there's a data dependence, so force GCC
* to generate a branch instead. I don't barrier() because
* we don't actually need a barrier, and if this function
@@ -4326,9 +4329,14 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
| (write ? PFERR_WRITE_MASK : 0);
+ /*
+ * currently PKRU is only applied to ept enabled guest so
+ * there is no pkey in EPT page table for L1 guest or EPT
+ * shadow page table for L2 guest.
+ */
if (vcpu_match_mmio_gva(vcpu, gva)
&& !permission_fault(vcpu, vcpu->arch.walk_mmu,
- vcpu->arch.access, access)) {
+ vcpu->arch.access, 0, access)) {
*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
(gva & (PAGE_SIZE - 1));
trace_vcpu_match_mmio(gva, *gpa, write, false);
@@ -6588,8 +6596,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
- /* We should set ->mode before check ->requests,
- * see the comment in make_all_cpus_request.
+ /*
+ * We should set ->mode before check ->requests,
+ * Please see the comment in kvm_make_all_cpus_request.
+ * This also orders the write to mode from any reads
+ * to the page tables done while the VCPU is running.
+ * Please see the comment in kvm_flush_remote_tlbs.
*/
smp_mb__after_srcu_read_unlock();
@@ -7123,7 +7135,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
- if (sregs->cr4 & X86_CR4_OSXSAVE)
+ if (sregs->cr4 & (X86_CR4_OSXSAVE | X86_CR4_PKE))
kvm_update_cpuid(vcpu);
idx = srcu_read_lock(&vcpu->kvm->srcu);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 007940faa5c6..7ce3634ab5fe 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -183,7 +183,8 @@ bool kvm_vector_hashing_enabled(void);
#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
- | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512)
+ | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
+ | XFEATURE_MASK_PKRU)
extern u64 host_xcr0;
extern u64 kvm_supported_xcr0(void);
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index a501fa25da41..72a576752a7e 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -2,6 +2,9 @@
# Makefile for x86 specific library files.
#
+# Produces uninteresting flaky coverage.
+KCOV_INSTRUMENT_delay.o := n
+
inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk
inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt
quiet_cmd_inat_tables = GEN $@
diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c
index 1318f75d56e4..28a6654f0d08 100644
--- a/arch/x86/lib/csum-wrappers_64.c
+++ b/arch/x86/lib/csum-wrappers_64.c
@@ -135,7 +135,7 @@ EXPORT_SYMBOL(csum_partial_copy_nocheck);
__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
const struct in6_addr *daddr,
- __u32 len, unsigned short proto, __wsum sum)
+ __u32 len, __u8 proto, __wsum sum)
{
__u64 rest, sum64;
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
index 8f72b334aea0..1a416935bac9 100644
--- a/arch/x86/lib/insn.c
+++ b/arch/x86/lib/insn.c
@@ -374,7 +374,7 @@ void insn_get_displacement(struct insn *insn)
if (mod == 3)
goto out;
if (mod == 1) {
- insn->displacement.value = get_next(char, insn);
+ insn->displacement.value = get_next(signed char, insn);
insn->displacement.nbytes = 1;
} else if (insn->addr_bytes == 2) {
if ((mod == 0 && rm == 6) || mod == 2) {
@@ -532,7 +532,7 @@ void insn_get_immediate(struct insn *insn)
switch (inat_immediate_size(insn->attr)) {
case INAT_IMM_BYTE:
- insn->immediate.value = get_next(char, insn);
+ insn->immediate.value = get_next(signed char, insn);
insn->immediate.nbytes = 1;
break;
case INAT_IMM_WORD:
@@ -566,7 +566,7 @@ void insn_get_immediate(struct insn *insn)
goto err_out;
}
if (inat_has_second_immediate(insn->attr)) {
- insn->immediate2.value = get_next(char, insn);
+ insn->immediate2.value = get_next(signed char, insn);
insn->immediate2.nbytes = 1;
}
done:
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index cbb8ee5830ff..2ec0b0abbfaa 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -1,6 +1,7 @@
/* Copyright 2002 Andi Kleen */
#include <linux/linkage.h>
+#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative-asm.h>
@@ -268,16 +269,16 @@ ENTRY(memcpy_mcsafe)
decl %ecx
jnz .L_copy_trailing_bytes
- /* Copy successful. Return true */
+ /* Copy successful. Return zero */
.L_done_memcpy_trap:
xorq %rax, %rax
ret
ENDPROC(memcpy_mcsafe)
.section .fixup, "ax"
- /* Return false for any failure */
+ /* Return -EFAULT for any failure */
.L_memcpy_mcsafe_fail:
- mov $1, %rax
+ mov $-EFAULT, %rax
ret
.previous
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index c9c81227ea37..e1229ecd2a82 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -9,7 +9,7 @@
/*
* ISO C memset - set a memory block to a byte value. This function uses fast
* string to get better performance than the original function. The code is
- * simpler and shorter than the orignal function as well.
+ * simpler and shorter than the original function as well.
*
* rdi destination
* rsi value (char)
diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S
index 40027db99140..be110efa0096 100644
--- a/arch/x86/lib/rwsem.S
+++ b/arch/x86/lib/rwsem.S
@@ -15,6 +15,7 @@
#include <linux/linkage.h>
#include <asm/alternative-asm.h>
+#include <asm/frame.h>
#define __ASM_HALF_REG(reg) __ASM_SEL(reg, e##reg)
#define __ASM_HALF_SIZE(inst) __ASM_SEL(inst##w, inst##l)
@@ -84,24 +85,29 @@
/* Fix up special calling conventions */
ENTRY(call_rwsem_down_read_failed)
+ FRAME_BEGIN
save_common_regs
__ASM_SIZE(push,) %__ASM_REG(dx)
movq %rax,%rdi
call rwsem_down_read_failed
__ASM_SIZE(pop,) %__ASM_REG(dx)
restore_common_regs
+ FRAME_END
ret
ENDPROC(call_rwsem_down_read_failed)
ENTRY(call_rwsem_down_write_failed)
+ FRAME_BEGIN
save_common_regs
movq %rax,%rdi
call rwsem_down_write_failed
restore_common_regs
+ FRAME_END
ret
ENDPROC(call_rwsem_down_write_failed)
ENTRY(call_rwsem_wake)
+ FRAME_BEGIN
/* do nothing if still outstanding active readers */
__ASM_HALF_SIZE(dec) %__ASM_HALF_REG(dx)
jnz 1f
@@ -109,15 +115,18 @@ ENTRY(call_rwsem_wake)
movq %rax,%rdi
call rwsem_wake
restore_common_regs
-1: ret
+1: FRAME_END
+ ret
ENDPROC(call_rwsem_wake)
ENTRY(call_rwsem_downgrade_wake)
+ FRAME_BEGIN
save_common_regs
__ASM_SIZE(push,) %__ASM_REG(dx)
movq %rax,%rdi
call rwsem_downgrade_wake
__ASM_SIZE(pop,) %__ASM_REG(dx)
restore_common_regs
+ FRAME_END
ret
ENDPROC(call_rwsem_downgrade_wake)
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index f9d38a48e3c8..f98913258c63 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,3 +1,6 @@
+# Kernel does not boot with instrumentation of tlb.c.
+KCOV_INSTRUMENT_tlb.o := n
+
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
pat.o pgtable.o physaddr.o gup.o setup_nx.o
@@ -34,3 +37,5 @@ obj-$(CONFIG_ACPI_NUMA) += srat.o
obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
+obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
+
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 9dd7e4b7fcde..82447b3fba38 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -1,17 +1,10 @@
#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/sort.h>
#include <asm/uaccess.h>
typedef bool (*ex_handler_t)(const struct exception_table_entry *,
struct pt_regs *, int);
static inline unsigned long
-ex_insn_addr(const struct exception_table_entry *x)
-{
- return (unsigned long)&x->insn + x->insn;
-}
-static inline unsigned long
ex_fixup_addr(const struct exception_table_entry *x)
{
return (unsigned long)&x->fixup + x->fixup;
@@ -110,104 +103,3 @@ int __init early_fixup_exception(unsigned long *ip)
*ip = new_ip;
return 1;
}
-
-/*
- * Search one exception table for an entry corresponding to the
- * given instruction address, and return the address of the entry,
- * or NULL if none is found.
- * We use a binary search, and thus we assume that the table is
- * already sorted.
- */
-const struct exception_table_entry *
-search_extable(const struct exception_table_entry *first,
- const struct exception_table_entry *last,
- unsigned long value)
-{
- while (first <= last) {
- const struct exception_table_entry *mid;
- unsigned long addr;
-
- mid = ((last - first) >> 1) + first;
- addr = ex_insn_addr(mid);
- if (addr < value)
- first = mid + 1;
- else if (addr > value)
- last = mid - 1;
- else
- return mid;
- }
- return NULL;
-}
-
-/*
- * The exception table needs to be sorted so that the binary
- * search that we use to find entries in it works properly.
- * This is used both for the kernel exception table and for
- * the exception tables of modules that get loaded.
- *
- */
-static int cmp_ex(const void *a, const void *b)
-{
- const struct exception_table_entry *x = a, *y = b;
-
- /*
- * This value will always end up fittin in an int, because on
- * both i386 and x86-64 the kernel symbol-reachable address
- * space is < 2 GiB.
- *
- * This compare is only valid after normalization.
- */
- return x->insn - y->insn;
-}
-
-void sort_extable(struct exception_table_entry *start,
- struct exception_table_entry *finish)
-{
- struct exception_table_entry *p;
- int i;
-
- /* Convert all entries to being relative to the start of the section */
- i = 0;
- for (p = start; p < finish; p++) {
- p->insn += i;
- i += 4;
- p->fixup += i;
- i += 4;
- p->handler += i;
- i += 4;
- }
-
- sort(start, finish - start, sizeof(struct exception_table_entry),
- cmp_ex, NULL);
-
- /* Denormalize all entries */
- i = 0;
- for (p = start; p < finish; p++) {
- p->insn -= i;
- i += 4;
- p->fixup -= i;
- i += 4;
- p->handler -= i;
- i += 4;
- }
-}
-
-#ifdef CONFIG_MODULES
-/*
- * If the exception table is sorted, any referring to the module init
- * will be at the beginning or the end.
- */
-void trim_init_extable(struct module *m)
-{
- /*trim the beginning*/
- while (m->num_exentries &&
- within_module_init(ex_insn_addr(&m->extable[0]), m)) {
- m->extable++;
- m->num_exentries--;
- }
- /*trim the end*/
- while (m->num_exentries &&
- within_module_init(ex_insn_addr(&m->extable[m->num_exentries-1]), m))
- m->num_exentries--;
-}
-#endif /* CONFIG_MODULES */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 03898aea6e0f..5ce1ed02f7e8 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -15,12 +15,14 @@
#include <linux/context_tracking.h> /* exception_enter(), ... */
#include <linux/uaccess.h> /* faulthandler_disabled() */
+#include <asm/cpufeature.h> /* boot_cpu_has, ... */
#include <asm/traps.h> /* dotraplinkage, ... */
#include <asm/pgalloc.h> /* pgd_*(), ... */
#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
#include <asm/fixmap.h> /* VSYSCALL_ADDR */
#include <asm/vsyscall.h> /* emulate_vsyscall */
#include <asm/vm86.h> /* struct vm86 */
+#include <asm/mmu_context.h> /* vma_pkey() */
#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>
@@ -33,6 +35,7 @@
* bit 2 == 0: kernel-mode access 1: user-mode access
* bit 3 == 1: use of reserved bit detected
* bit 4 == 1: fault was an instruction fetch
+ * bit 5 == 1: protection keys block access
*/
enum x86_pf_error_code {
@@ -41,6 +44,7 @@ enum x86_pf_error_code {
PF_USER = 1 << 2,
PF_RSVD = 1 << 3,
PF_INSTR = 1 << 4,
+ PF_PK = 1 << 5,
};
/*
@@ -167,9 +171,60 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
return prefetch;
}
+/*
+ * A protection key fault means that the PKRU value did not allow
+ * access to some PTE. Userspace can figure out what PKRU was
+ * from the XSAVE state, and this function fills out a field in
+ * siginfo so userspace can discover which protection key was set
+ * on the PTE.
+ *
+ * If we get here, we know that the hardware signaled a PF_PK
+ * fault and that there was a VMA once we got in the fault
+ * handler. It does *not* guarantee that the VMA we find here
+ * was the one that we faulted on.
+ *
+ * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4);
+ * 2. T1 : set PKRU to deny access to pkey=4, touches page
+ * 3. T1 : faults...
+ * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
+ * 5. T1 : enters fault handler, takes mmap_sem, etc...
+ * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really
+ * faulted on a pte with its pkey=4.
+ */
+static void fill_sig_info_pkey(int si_code, siginfo_t *info,
+ struct vm_area_struct *vma)
+{
+ /* This is effectively an #ifdef */
+ if (!boot_cpu_has(X86_FEATURE_OSPKE))
+ return;
+
+ /* Fault not from Protection Keys: nothing to do */
+ if (si_code != SEGV_PKUERR)
+ return;
+ /*
+ * force_sig_info_fault() is called from a number of
+ * contexts, some of which have a VMA and some of which
+ * do not. The PF_PK handing happens after we have a
+ * valid VMA, so we should never reach this without a
+ * valid VMA.
+ */
+ if (!vma) {
+ WARN_ONCE(1, "PKU fault with no VMA passed in");
+ info->si_pkey = 0;
+ return;
+ }
+ /*
+ * si_pkey should be thought of as a strong hint, but not
+ * absolutely guranteed to be 100% accurate because of
+ * the race explained above.
+ */
+ info->si_pkey = vma_pkey(vma);
+}
+
static void
force_sig_info_fault(int si_signo, int si_code, unsigned long address,
- struct task_struct *tsk, int fault)
+ struct task_struct *tsk, struct vm_area_struct *vma,
+ int fault)
{
unsigned lsb = 0;
siginfo_t info;
@@ -184,6 +239,8 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address,
lsb = PAGE_SHIFT;
info.si_addr_lsb = lsb;
+ fill_sig_info_pkey(si_code, &info, vma);
+
force_sig_info(si_signo, &info, tsk);
}
@@ -661,6 +718,8 @@ no_context(struct pt_regs *regs, unsigned long error_code,
struct task_struct *tsk = current;
unsigned long flags;
int sig;
+ /* No context means no VMA to pass down */
+ struct vm_area_struct *vma = NULL;
/* Are we prepared to handle this kernel fault? */
if (fixup_exception(regs, X86_TRAP_PF)) {
@@ -684,7 +743,8 @@ no_context(struct pt_regs *regs, unsigned long error_code,
tsk->thread.cr2 = address;
/* XXX: hwpoison faults will set the wrong code. */
- force_sig_info_fault(signal, si_code, address, tsk, 0);
+ force_sig_info_fault(signal, si_code, address,
+ tsk, vma, 0);
}
/*
@@ -761,7 +821,8 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
static void
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, int si_code)
+ unsigned long address, struct vm_area_struct *vma,
+ int si_code)
{
struct task_struct *tsk = current;
@@ -804,7 +865,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_PF;
- force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
+ force_sig_info_fault(SIGSEGV, si_code, address, tsk, vma, 0);
return;
}
@@ -817,14 +878,14 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
static noinline void
bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
- unsigned long address)
+ unsigned long address, struct vm_area_struct *vma)
{
- __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
+ __bad_area_nosemaphore(regs, error_code, address, vma, SEGV_MAPERR);
}
static void
__bad_area(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, int si_code)
+ unsigned long address, struct vm_area_struct *vma, int si_code)
{
struct mm_struct *mm = current->mm;
@@ -834,25 +895,50 @@ __bad_area(struct pt_regs *regs, unsigned long error_code,
*/
up_read(&mm->mmap_sem);
- __bad_area_nosemaphore(regs, error_code, address, si_code);
+ __bad_area_nosemaphore(regs, error_code, address, vma, si_code);
}
static noinline void
bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
- __bad_area(regs, error_code, address, SEGV_MAPERR);
+ __bad_area(regs, error_code, address, NULL, SEGV_MAPERR);
+}
+
+static inline bool bad_area_access_from_pkeys(unsigned long error_code,
+ struct vm_area_struct *vma)
+{
+ /* This code is always called on the current mm */
+ bool foreign = false;
+
+ if (!boot_cpu_has(X86_FEATURE_OSPKE))
+ return false;
+ if (error_code & PF_PK)
+ return true;
+ /* this checks permission keys on the VMA: */
+ if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE),
+ (error_code & PF_INSTR), foreign))
+ return true;
+ return false;
}
static noinline void
bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
- unsigned long address)
+ unsigned long address, struct vm_area_struct *vma)
{
- __bad_area(regs, error_code, address, SEGV_ACCERR);
+ /*
+ * This OSPKE check is not strictly necessary at runtime.
+ * But, doing it this way allows compiler optimizations
+ * if pkeys are compiled out.
+ */
+ if (bad_area_access_from_pkeys(error_code, vma))
+ __bad_area(regs, error_code, address, vma, SEGV_PKUERR);
+ else
+ __bad_area(regs, error_code, address, vma, SEGV_ACCERR);
}
static void
do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
- unsigned int fault)
+ struct vm_area_struct *vma, unsigned int fault)
{
struct task_struct *tsk = current;
int code = BUS_ADRERR;
@@ -879,12 +965,13 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
code = BUS_MCEERR_AR;
}
#endif
- force_sig_info_fault(SIGBUS, code, address, tsk, fault);
+ force_sig_info_fault(SIGBUS, code, address, tsk, vma, fault);
}
static noinline void
mm_fault_error(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, unsigned int fault)
+ unsigned long address, struct vm_area_struct *vma,
+ unsigned int fault)
{
if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
no_context(regs, error_code, address, 0, 0);
@@ -908,9 +995,9 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
} else {
if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
VM_FAULT_HWPOISON_LARGE))
- do_sigbus(regs, error_code, address, fault);
+ do_sigbus(regs, error_code, address, vma, fault);
else if (fault & VM_FAULT_SIGSEGV)
- bad_area_nosemaphore(regs, error_code, address);
+ bad_area_nosemaphore(regs, error_code, address, vma);
else
BUG();
}
@@ -923,6 +1010,12 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
if ((error_code & PF_INSTR) && !pte_exec(*pte))
return 0;
+ /*
+ * Note: We do not do lazy flushing on protection key
+ * changes, so no spurious fault will ever set PF_PK.
+ */
+ if ((error_code & PF_PK))
+ return 1;
return 1;
}
@@ -1012,6 +1105,17 @@ int show_unhandled_signals = 1;
static inline int
access_error(unsigned long error_code, struct vm_area_struct *vma)
{
+ /* This is only called for the current mm, so: */
+ bool foreign = false;
+ /*
+ * Make sure to check the VMA so that we do not perform
+ * faults just to hit a PF_PK as soon as we fill in a
+ * page.
+ */
+ if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE),
+ (error_code & PF_INSTR), foreign))
+ return 1;
+
if (error_code & PF_WRITE) {
/* write, present and write, not present: */
if (unlikely(!(vma->vm_flags & VM_WRITE)))
@@ -1118,7 +1222,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* Don't take the mm semaphore here. If we fixup a prefetch
* fault we could otherwise deadlock:
*/
- bad_area_nosemaphore(regs, error_code, address);
+ bad_area_nosemaphore(regs, error_code, address, NULL);
return;
}
@@ -1131,7 +1235,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
pgtable_bad(regs, error_code, address);
if (unlikely(smap_violation(error_code, regs))) {
- bad_area_nosemaphore(regs, error_code, address);
+ bad_area_nosemaphore(regs, error_code, address, NULL);
return;
}
@@ -1140,7 +1244,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* in a region with pagefaults disabled then we must not take the fault
*/
if (unlikely(faulthandler_disabled() || !mm)) {
- bad_area_nosemaphore(regs, error_code, address);
+ bad_area_nosemaphore(regs, error_code, address, NULL);
return;
}
@@ -1164,6 +1268,8 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
if (error_code & PF_WRITE)
flags |= FAULT_FLAG_WRITE;
+ if (error_code & PF_INSTR)
+ flags |= FAULT_FLAG_INSTRUCTION;
/*
* When running in the kernel we expect faults to occur only to
@@ -1184,7 +1290,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
if ((error_code & PF_USER) == 0 &&
!search_exception_tables(regs->ip)) {
- bad_area_nosemaphore(regs, error_code, address);
+ bad_area_nosemaphore(regs, error_code, address, NULL);
return;
}
retry:
@@ -1232,7 +1338,7 @@ retry:
*/
good_area:
if (unlikely(access_error(error_code, vma))) {
- bad_area_access_error(regs, error_code, address);
+ bad_area_access_error(regs, error_code, address, vma);
return;
}
@@ -1270,7 +1376,7 @@ good_area:
up_read(&mm->mmap_sem);
if (unlikely(fault & VM_FAULT_ERROR)) {
- mm_fault_error(regs, error_code, address, fault);
+ mm_fault_error(regs, error_code, address, vma, fault);
return;
}
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index d8a798d8bf50..b8b6a60b32cf 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -11,6 +11,7 @@
#include <linux/swap.h>
#include <linux/memremap.h>
+#include <asm/mmu_context.h>
#include <asm/pgtable.h>
static inline pte_t gup_get_pte(pte_t *ptep)
@@ -75,6 +76,28 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
}
/*
+ * 'pteval' can come from a pte, pmd or pud. We only check
+ * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
+ * same value on all 3 types.
+ */
+static inline int pte_allows_gup(unsigned long pteval, int write)
+{
+ unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;
+
+ if (write)
+ need_pte_bits |= _PAGE_RW;
+
+ if ((pteval & need_pte_bits) != need_pte_bits)
+ return 0;
+
+ /* Check memory protection keys permissions. */
+ if (!__pkru_allows_pkey(pte_flags_pkey(pteval), write))
+ return 0;
+
+ return 1;
+}
+
+/*
* The performance critical leaf functions are made noinline otherwise gcc
* inlines everything into a single function which results in too much
* register pressure.
@@ -83,14 +106,9 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
struct dev_pagemap *pgmap = NULL;
- unsigned long mask;
int nr_start = *nr;
pte_t *ptep;
- mask = _PAGE_PRESENT|_PAGE_USER;
- if (write)
- mask |= _PAGE_RW;
-
ptep = pte_offset_map(&pmd, addr);
do {
pte_t pte = gup_get_pte(ptep);
@@ -109,7 +127,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
pte_unmap(ptep);
return 0;
}
- } else if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
+ } else if (!pte_allows_gup(pte_val(pte), write) ||
+ pte_special(pte)) {
pte_unmap(ptep);
return 0;
}
@@ -131,7 +150,7 @@ static inline void get_head_page_multiple(struct page *page, int nr)
{
VM_BUG_ON_PAGE(page != compound_head(page), page);
VM_BUG_ON_PAGE(page_count(page) == 0, page);
- atomic_add(nr, &page->_count);
+ page_ref_add(page, nr);
SetPageReferenced(page);
}
@@ -164,14 +183,10 @@ static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
- unsigned long mask;
struct page *head, *page;
int refs;
- mask = _PAGE_PRESENT|_PAGE_USER;
- if (write)
- mask |= _PAGE_RW;
- if ((pmd_flags(pmd) & mask) != mask)
+ if (!pte_allows_gup(pmd_val(pmd), write))
return 0;
VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
@@ -231,14 +246,10 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
- unsigned long mask;
struct page *head, *page;
int refs;
- mask = _PAGE_PRESENT|_PAGE_USER;
- if (write)
- mask |= _PAGE_RW;
- if ((pud_flags(pud) & mask) != mask)
+ if (!pte_allows_gup(pud_val(pud), write))
return 0;
/* hugepages are never "special" */
VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL);
@@ -422,7 +433,7 @@ slow_irqon:
start += nr << PAGE_SHIFT;
pages += nr;
- ret = get_user_pages_unlocked(current, mm, start,
+ ret = get_user_pages_unlocked(start,
(end - start) >> PAGE_SHIFT,
write, 0, pages);
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index ef05755a1900..80476878eb4c 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -546,8 +546,8 @@ static int mpx_resolve_fault(long __user *addr, int write)
int nr_pages = 1;
int force = 0;
- gup_ret = get_user_pages(current, current->mm, (unsigned long)addr,
- nr_pages, write, force, NULL, NULL);
+ gup_ret = get_user_pages((unsigned long)addr, nr_pages, write,
+ force, NULL, NULL);
/*
* get_user_pages() returns number of pages gotten.
* 0 means we failed to fault in and get anything,
@@ -728,14 +728,14 @@ static inline unsigned long bd_entry_virt_space(struct mm_struct *mm)
/*
* This covers 32-bit emulation as well as 32-bit kernels
- * running on 64-bit harware.
+ * running on 64-bit hardware.
*/
if (!is_64bit_mm(mm))
return (4ULL * GB) / MPX_BD_NR_ENTRIES_32;
/*
* 'x86_virt_bits' returns what the hardware is capable
- * of, and returns the full >32-bit adddress space when
+ * of, and returns the full >32-bit address space when
* running 32-bit kernels on 64-bit hardware.
*/
virt_space = (1ULL << boot_cpu_data.x86_virt_bits);
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 4d0b26253042..01be9ec3bf79 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -909,16 +909,25 @@ static void populate_pte(struct cpa_data *cpa,
pte = pte_offset_kernel(pmd, start);
- while (num_pages-- && start < end) {
+ /*
+ * Set the GLOBAL flags only if the PRESENT flag is
+ * set otherwise pte_present will return true even on
+ * a non present pte. The canon_pgprot will clear
+ * _PAGE_GLOBAL for the ancient hardware that doesn't
+ * support it.
+ */
+ if (pgprot_val(pgprot) & _PAGE_PRESENT)
+ pgprot_val(pgprot) |= _PAGE_GLOBAL;
+ else
+ pgprot_val(pgprot) &= ~_PAGE_GLOBAL;
- /* deal with the NX bit */
- if (!(pgprot_val(pgprot) & _PAGE_NX))
- cpa->pfn &= ~_PAGE_NX;
+ pgprot = canon_pgprot(pgprot);
- set_pte(pte, pfn_pte(cpa->pfn >> PAGE_SHIFT, pgprot));
+ while (num_pages-- && start < end) {
+ set_pte(pte, pfn_pte(cpa->pfn, pgprot));
start += PAGE_SIZE;
- cpa->pfn += PAGE_SIZE;
+ cpa->pfn++;
pte++;
}
}
@@ -974,11 +983,11 @@ static int populate_pmd(struct cpa_data *cpa,
pmd = pmd_offset(pud, start);
- set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE |
+ set_pmd(pmd, __pmd(cpa->pfn << PAGE_SHIFT | _PAGE_PSE |
massage_pgprot(pmd_pgprot)));
start += PMD_SIZE;
- cpa->pfn += PMD_SIZE;
+ cpa->pfn += PMD_SIZE >> PAGE_SHIFT;
cur_pages += PMD_SIZE >> PAGE_SHIFT;
}
@@ -1046,12 +1055,12 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
/*
* Map everything starting from the Gb boundary, possibly with 1G pages
*/
- while (end - start >= PUD_SIZE) {
- set_pud(pud, __pud(cpa->pfn | _PAGE_PSE |
+ while (cpu_has_gbpages && end - start >= PUD_SIZE) {
+ set_pud(pud, __pud(cpa->pfn << PAGE_SHIFT | _PAGE_PSE |
massage_pgprot(pud_pgprot)));
start += PUD_SIZE;
- cpa->pfn += PUD_SIZE;
+ cpa->pfn += PUD_SIZE >> PAGE_SHIFT;
cur_pages += PUD_SIZE >> PAGE_SHIFT;
pud++;
}
@@ -1964,6 +1973,9 @@ int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
if (!(page_flags & _PAGE_NX))
cpa.mask_clr = __pgprot(_PAGE_NX);
+ if (!(page_flags & _PAGE_RW))
+ cpa.mask_clr = __pgprot(_PAGE_RW);
+
cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);
retval = __change_page_attr_set_clr(&cpa, 0);
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 04e2e7144bee..faec01e7a17d 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -149,7 +149,7 @@ enum {
PAT_WT = 4, /* Write Through */
PAT_WP = 5, /* Write Protected */
PAT_WB = 6, /* Write Back (default) */
- PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */
+ PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */
};
#define CM(c) (_PAGE_CACHE_MODE_ ## c)
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
new file mode 100644
index 000000000000..e8c474451928
--- /dev/null
+++ b/arch/x86/mm/pkeys.c
@@ -0,0 +1,101 @@
+/*
+ * Intel Memory Protection Keys management
+ * Copyright (c) 2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+#include <linux/mm_types.h> /* mm_struct, vma, etc... */
+#include <linux/pkeys.h> /* PKEY_* */
+#include <uapi/asm-generic/mman-common.h>
+
+#include <asm/cpufeature.h> /* boot_cpu_has, ... */
+#include <asm/mmu_context.h> /* vma_pkey() */
+#include <asm/fpu/internal.h> /* fpregs_active() */
+
+int __execute_only_pkey(struct mm_struct *mm)
+{
+ int ret;
+
+ /*
+ * We do not want to go through the relatively costly
+ * dance to set PKRU if we do not need to. Check it
+ * first and assume that if the execute-only pkey is
+ * write-disabled that we do not have to set it
+ * ourselves. We need preempt off so that nobody
+ * can make fpregs inactive.
+ */
+ preempt_disable();
+ if (fpregs_active() &&
+ !__pkru_allows_read(read_pkru(), PKEY_DEDICATED_EXECUTE_ONLY)) {
+ preempt_enable();
+ return PKEY_DEDICATED_EXECUTE_ONLY;
+ }
+ preempt_enable();
+ ret = arch_set_user_pkey_access(current, PKEY_DEDICATED_EXECUTE_ONLY,
+ PKEY_DISABLE_ACCESS);
+ /*
+ * If the PKRU-set operation failed somehow, just return
+ * 0 and effectively disable execute-only support.
+ */
+ if (ret)
+ return 0;
+
+ return PKEY_DEDICATED_EXECUTE_ONLY;
+}
+
+static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma)
+{
+ /* Do this check first since the vm_flags should be hot */
+ if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC)
+ return false;
+ if (vma_pkey(vma) != PKEY_DEDICATED_EXECUTE_ONLY)
+ return false;
+
+ return true;
+}
+
+/*
+ * This is only called for *plain* mprotect calls.
+ */
+int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey)
+{
+ /*
+ * Is this an mprotect_pkey() call? If so, never
+ * override the value that came from the user.
+ */
+ if (pkey != -1)
+ return pkey;
+ /*
+ * Look for a protection-key-drive execute-only mapping
+ * which is now being given permissions that are not
+ * execute-only. Move it back to the default pkey.
+ */
+ if (vma_is_pkey_exec_only(vma) &&
+ (prot & (PROT_READ|PROT_WRITE))) {
+ return 0;
+ }
+ /*
+ * The mapping is execute-only. Go try to get the
+ * execute-only protection key. If we fail to do that,
+ * fall through as if we do not have execute-only
+ * support.
+ */
+ if (prot == PROT_EXEC) {
+ pkey = execute_only_pkey(vma->vm_mm);
+ if (pkey > 0)
+ return pkey;
+ }
+ /*
+ * This is a vanilla, non-pkey mprotect (or we failed to
+ * setup execute-only), inherit the pkey from the VMA we
+ * are working on.
+ */
+ return vma_pkey(vma);
+}
diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S
index 4093216b3791..f2a7faf4706e 100644
--- a/arch/x86/net/bpf_jit.S
+++ b/arch/x86/net/bpf_jit.S
@@ -8,6 +8,7 @@
* of the License.
*/
#include <linux/linkage.h>
+#include <asm/frame.h>
/*
* Calling convention :
@@ -22,15 +23,16 @@
32 /* space for rbx,r13,r14,r15 */ + \
8 /* space for skb_copy_bits */)
-sk_load_word:
- .globl sk_load_word
+#define FUNC(name) \
+ .globl name; \
+ .type name, @function; \
+ name:
+FUNC(sk_load_word)
test %esi,%esi
js bpf_slow_path_word_neg
-sk_load_word_positive_offset:
- .globl sk_load_word_positive_offset
-
+FUNC(sk_load_word_positive_offset)
mov %r9d,%eax # hlen
sub %esi,%eax # hlen - offset
cmp $3,%eax
@@ -39,15 +41,11 @@ sk_load_word_positive_offset:
bswap %eax /* ntohl() */
ret
-sk_load_half:
- .globl sk_load_half
-
+FUNC(sk_load_half)
test %esi,%esi
js bpf_slow_path_half_neg
-sk_load_half_positive_offset:
- .globl sk_load_half_positive_offset
-
+FUNC(sk_load_half_positive_offset)
mov %r9d,%eax
sub %esi,%eax # hlen - offset
cmp $1,%eax
@@ -56,15 +54,11 @@ sk_load_half_positive_offset:
rol $8,%ax # ntohs()
ret
-sk_load_byte:
- .globl sk_load_byte
-
+FUNC(sk_load_byte)
test %esi,%esi
js bpf_slow_path_byte_neg
-sk_load_byte_positive_offset:
- .globl sk_load_byte_positive_offset
-
+FUNC(sk_load_byte_positive_offset)
cmp %esi,%r9d /* if (offset >= hlen) goto bpf_slow_path_byte */
jle bpf_slow_path_byte
movzbl (SKBDATA,%rsi),%eax
@@ -72,16 +66,18 @@ sk_load_byte_positive_offset:
/* rsi contains offset and can be scratched */
#define bpf_slow_path_common(LEN) \
+ lea -MAX_BPF_STACK + 32(%rbp), %rdx;\
+ FRAME_BEGIN; \
mov %rbx, %rdi; /* arg1 == skb */ \
push %r9; \
push SKBDATA; \
/* rsi already has offset */ \
mov $LEN,%ecx; /* len */ \
- lea - MAX_BPF_STACK + 32(%rbp),%rdx; \
call skb_copy_bits; \
test %eax,%eax; \
pop SKBDATA; \
- pop %r9;
+ pop %r9; \
+ FRAME_END
bpf_slow_path_word:
@@ -106,6 +102,7 @@ bpf_slow_path_byte:
ret
#define sk_negative_common(SIZE) \
+ FRAME_BEGIN; \
mov %rbx, %rdi; /* arg1 == skb */ \
push %r9; \
push SKBDATA; \
@@ -115,13 +112,14 @@ bpf_slow_path_byte:
test %rax,%rax; \
pop SKBDATA; \
pop %r9; \
+ FRAME_END; \
jz bpf_error
bpf_slow_path_word_neg:
cmp SKF_MAX_NEG_OFF, %esi /* test range */
jl bpf_error /* offset lower -> error */
-sk_load_word_negative_offset:
- .globl sk_load_word_negative_offset
+
+FUNC(sk_load_word_negative_offset)
sk_negative_common(4)
mov (%rax), %eax
bswap %eax
@@ -130,8 +128,8 @@ sk_load_word_negative_offset:
bpf_slow_path_half_neg:
cmp SKF_MAX_NEG_OFF, %esi
jl bpf_error
-sk_load_half_negative_offset:
- .globl sk_load_half_negative_offset
+
+FUNC(sk_load_half_negative_offset)
sk_negative_common(2)
mov (%rax),%ax
rol $8,%ax
@@ -141,8 +139,8 @@ sk_load_half_negative_offset:
bpf_slow_path_byte_neg:
cmp SKF_MAX_NEG_OFF, %esi
jl bpf_error
-sk_load_byte_negative_offset:
- .globl sk_load_byte_negative_offset
+
+FUNC(sk_load_byte_negative_offset)
sk_negative_common(1)
movzbl (%rax), %eax
ret
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index 4e664bdb535a..cb31a4440e58 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -23,12 +23,13 @@ static int backtrace_stack(void *data, char *name)
return 0;
}
-static void backtrace_address(void *data, unsigned long addr, int reliable)
+static int backtrace_address(void *data, unsigned long addr, int reliable)
{
unsigned int *depth = data;
if ((*depth)--)
oprofile_add_trace(addr);
+ return 0;
}
static struct stacktrace_ops backtrace_ops = {
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 1d2e6392f5fa..0e07e0968c3a 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -437,7 +437,8 @@ static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action,
void *data)
{
int cpu = (unsigned long)data;
- switch (action) {
+
+ switch (action & ~CPU_TASKS_FROZEN) {
case CPU_DOWN_FAILED:
case CPU_ONLINE:
smp_call_function_single(cpu, nmi_cpu_up, NULL, 0);
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index d34b5118b4e8..381a43c40bf7 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -12,7 +12,6 @@
#include <linux/dmi.h>
#include <linux/slab.h>
-#include <asm-generic/pci-bridge.h>
#include <asm/acpi.h>
#include <asm/segment.h>
#include <asm/io.h>
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index e58565556703..b7de1929714b 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -297,14 +297,14 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC1, pcie_r
*
* The standard boot ROM sequence for an x86 machine uses the BIOS
* to select an initial video card for boot display. This boot video
- * card will have it's BIOS copied to C0000 in system RAM.
+ * card will have its BIOS copied to 0xC0000 in system RAM.
* IORESOURCE_ROM_SHADOW is used to associate the boot video
* card with this copy. On laptops this copy has to be used since
* the main ROM may be compressed or combined with another image.
* See pci_map_rom() for use of this flag. Before marking the device
* with IORESOURCE_ROM_SHADOW check if a vga_default_device is already set
- * by either arch cde or vga-arbitration, if so only apply the fixup to this
- * already determined primary video card.
+ * by either arch code or vga-arbitration; if so only apply the fixup to this
+ * already-determined primary video card.
*/
static void pci_fixup_video(struct pci_dev *pdev)
@@ -312,6 +312,7 @@ static void pci_fixup_video(struct pci_dev *pdev)
struct pci_dev *bridge;
struct pci_bus *bus;
u16 config;
+ struct resource *res;
/* Is VGA routed to us? */
bus = pdev->bus;
@@ -336,8 +337,18 @@ static void pci_fixup_video(struct pci_dev *pdev)
if (!vga_default_device() || pdev == vga_default_device()) {
pci_read_config_word(pdev, PCI_COMMAND, &config);
if (config & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) {
- pdev->resource[PCI_ROM_RESOURCE].flags |= IORESOURCE_ROM_SHADOW;
- dev_printk(KERN_DEBUG, &pdev->dev, "Video device with shadowed ROM\n");
+ res = &pdev->resource[PCI_ROM_RESOURCE];
+
+ pci_disable_rom(pdev);
+ if (res->parent)
+ release_resource(res);
+
+ res->start = 0xC0000;
+ res->end = res->start + 0x20000 - 1;
+ res->flags = IORESOURCE_MEM | IORESOURCE_ROM_SHADOW |
+ IORESOURCE_PCI_FIXED;
+ dev_info(&pdev->dev, "Video device with shadowed ROM at %pR\n",
+ res);
}
}
}
@@ -540,3 +551,10 @@ static void twinhead_reserve_killing_zone(struct pci_dev *dev)
}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x27B9, twinhead_reserve_killing_zone);
+
+static void pci_bdwep_bar(struct pci_dev *dev)
+{
+ dev->non_compliant_bars = 1;
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fa0, pci_bdwep_bar);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, pci_bdwep_bar);
diff --git a/arch/x86/pci/vmd.c b/arch/x86/pci/vmd.c
index d57e48016f15..7792aba266df 100644
--- a/arch/x86/pci/vmd.c
+++ b/arch/x86/pci/vmd.c
@@ -503,6 +503,18 @@ static struct pci_ops vmd_ops = {
.write = vmd_pci_write,
};
+static void vmd_attach_resources(struct vmd_dev *vmd)
+{
+ vmd->dev->resource[VMD_MEMBAR1].child = &vmd->resources[1];
+ vmd->dev->resource[VMD_MEMBAR2].child = &vmd->resources[2];
+}
+
+static void vmd_detach_resources(struct vmd_dev *vmd)
+{
+ vmd->dev->resource[VMD_MEMBAR1].child = NULL;
+ vmd->dev->resource[VMD_MEMBAR2].child = NULL;
+}
+
/*
* VMD domains start at 0x1000 to not clash with ACPI _SEG domains.
*/
@@ -527,11 +539,28 @@ static int vmd_enable_domain(struct vmd_dev *vmd)
res = &vmd->dev->resource[VMD_CFGBAR];
vmd->resources[0] = (struct resource) {
.name = "VMD CFGBAR",
- .start = res->start,
+ .start = 0,
.end = (resource_size(res) >> 20) - 1,
.flags = IORESOURCE_BUS | IORESOURCE_PCI_FIXED,
};
+ /*
+ * If the window is below 4GB, clear IORESOURCE_MEM_64 so we can
+ * put 32-bit resources in the window.
+ *
+ * There's no hardware reason why a 64-bit window *couldn't*
+ * contain a 32-bit resource, but pbus_size_mem() computes the
+ * bridge window size assuming a 64-bit window will contain no
+ * 32-bit resources. __pci_assign_resource() enforces that
+ * artificial restriction to make sure everything will fit.
+ *
+ * The only way we could use a 64-bit non-prefechable MEMBAR is
+ * if its address is <4GB so that we can convert it to a 32-bit
+ * resource. To be visible to the host OS, all VMD endpoints must
+ * be initially configured by platform BIOS, which includes setting
+ * up these resources. We can assume the device is configured
+ * according to the platform needs.
+ */
res = &vmd->dev->resource[VMD_MEMBAR1];
upper_bits = upper_32_bits(res->end);
flags = res->flags & ~IORESOURCE_SIZEALIGN;
@@ -542,6 +571,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd)
.start = res->start,
.end = res->end,
.flags = flags,
+ .parent = res,
};
res = &vmd->dev->resource[VMD_MEMBAR2];
@@ -554,6 +584,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd)
.start = res->start + 0x2000,
.end = res->end,
.flags = flags,
+ .parent = res,
};
sd->domain = vmd_find_free_domain();
@@ -578,6 +609,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd)
return -ENODEV;
}
+ vmd_attach_resources(vmd);
vmd_setup_dma_ops(vmd);
dev_set_msi_domain(&vmd->bus->dev, vmd->irq_domain);
pci_rescan_bus(vmd->bus);
@@ -674,6 +706,7 @@ static void vmd_remove(struct pci_dev *dev)
{
struct vmd_dev *vmd = pci_get_drvdata(dev);
+ vmd_detach_resources(vmd);
pci_set_drvdata(dev, NULL);
sysfs_remove_link(&vmd->dev->dev.kobj, "domain");
pci_stop_root_bus(vmd->bus);
diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile
index 2846aaab5103..066619b0700c 100644
--- a/arch/x86/platform/efi/Makefile
+++ b/arch/x86/platform/efi/Makefile
@@ -1,3 +1,5 @@
+OBJECT_FILES_NON_STANDARD_efi_thunk_$(BITS).o := y
+
obj-$(CONFIG_EFI) += quirks.o efi.o efi_$(BITS).o efi_stub_$(BITS).o
obj-$(CONFIG_ACPI_BGRT) += efi-bgrt.o
obj-$(CONFIG_EARLY_PRINTK_EFI) += early_printk.o
diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c
index ea48449b2e63..a2433817c987 100644
--- a/arch/x86/platform/efi/efi-bgrt.c
+++ b/arch/x86/platform/efi/efi-bgrt.c
@@ -10,6 +10,9 @@
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/acpi.h>
@@ -28,8 +31,7 @@ struct bmp_header {
void __init efi_bgrt_init(void)
{
acpi_status status;
- void __iomem *image;
- bool ioremapped = false;
+ void *image;
struct bmp_header bmp_header;
if (acpi_disabled)
@@ -55,11 +57,6 @@ void __init efi_bgrt_init(void)
bgrt_tab->status);
return;
}
- if (bgrt_tab->status != 1) {
- pr_debug("Ignoring BGRT: invalid status %u (expected 1)\n",
- bgrt_tab->status);
- return;
- }
if (bgrt_tab->image_type != 0) {
pr_err("Ignoring BGRT: invalid image type %u (expected 0)\n",
bgrt_tab->image_type);
@@ -70,20 +67,19 @@ void __init efi_bgrt_init(void)
return;
}
- image = efi_lookup_mapped_addr(bgrt_tab->image_address);
+ image = memremap(bgrt_tab->image_address, sizeof(bmp_header), MEMREMAP_WB);
if (!image) {
- image = early_ioremap(bgrt_tab->image_address,
- sizeof(bmp_header));
- ioremapped = true;
- if (!image) {
- pr_err("Ignoring BGRT: failed to map image header memory\n");
- return;
- }
+ pr_err("Ignoring BGRT: failed to map image header memory\n");
+ return;
}
- memcpy_fromio(&bmp_header, image, sizeof(bmp_header));
- if (ioremapped)
- early_iounmap(image, sizeof(bmp_header));
+ memcpy(&bmp_header, image, sizeof(bmp_header));
+ memunmap(image);
+ if (bmp_header.id != 0x4d42) {
+ pr_err("Ignoring BGRT: Incorrect BMP magic number 0x%x (expected 0x4d42)\n",
+ bmp_header.id);
+ return;
+ }
bgrt_image_size = bmp_header.size;
bgrt_image = kmalloc(bgrt_image_size, GFP_KERNEL | __GFP_NOWARN);
@@ -93,18 +89,14 @@ void __init efi_bgrt_init(void)
return;
}
- if (ioremapped) {
- image = early_ioremap(bgrt_tab->image_address,
- bmp_header.size);
- if (!image) {
- pr_err("Ignoring BGRT: failed to map image memory\n");
- kfree(bgrt_image);
- bgrt_image = NULL;
- return;
- }
+ image = memremap(bgrt_tab->image_address, bmp_header.size, MEMREMAP_WB);
+ if (!image) {
+ pr_err("Ignoring BGRT: failed to map image memory\n");
+ kfree(bgrt_image);
+ bgrt_image = NULL;
+ return;
}
- memcpy_fromio(bgrt_image, image, bgrt_image_size);
- if (ioremapped)
- early_iounmap(image, bmp_header.size);
+ memcpy(bgrt_image, image, bgrt_image_size);
+ memunmap(image);
}
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index ad285404ea7f..994a7df84a7b 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -235,10 +235,10 @@ void __init efi_print_memmap(void)
char buf[64];
md = p;
- pr_info("mem%02u: %s range=[0x%016llx-0x%016llx) (%lluMB)\n",
+ pr_info("mem%02u: %s range=[0x%016llx-0x%016llx] (%lluMB)\n",
i, efi_md_typeattr_format(buf, sizeof(buf), md),
md->phys_addr,
- md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
+ md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT) - 1,
(md->num_pages >> (20 - EFI_PAGE_SHIFT)));
}
#endif /* EFI_DEBUG */
@@ -815,6 +815,7 @@ static void __init kexec_enter_virtual_mode(void)
{
#ifdef CONFIG_KEXEC_CORE
efi_memory_desc_t *md;
+ unsigned int num_pages;
void *p;
efi.systab = NULL;
@@ -829,6 +830,12 @@ static void __init kexec_enter_virtual_mode(void)
return;
}
+ if (efi_alloc_page_tables()) {
+ pr_err("Failed to allocate EFI page tables\n");
+ clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
+ return;
+ }
+
/*
* Map efi regions which were passed via setup_data. The virt_addr is a
* fixed addr which was used in first kernel of a kexec boot.
@@ -843,6 +850,14 @@ static void __init kexec_enter_virtual_mode(void)
BUG_ON(!efi.systab);
+ num_pages = ALIGN(memmap.nr_map * memmap.desc_size, PAGE_SIZE);
+ num_pages >>= PAGE_SHIFT;
+
+ if (efi_setup_page_tables(memmap.phys_map, num_pages)) {
+ clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
+ return;
+ }
+
efi_sync_low_kernel_mappings();
/*
@@ -869,7 +884,7 @@ static void __init kexec_enter_virtual_mode(void)
* This function will switch the EFI runtime services to virtual mode.
* Essentially, we look through the EFI memmap and map every region that
* has the runtime attribute bit set in its memory descriptor into the
- * ->trampoline_pgd page table using a top-down VA allocation scheme.
+ * efi_pgd page table.
*
* The old method which used to update that memory descriptor with the
* virtual address obtained from ioremap() is still supported when the
@@ -879,8 +894,8 @@ static void __init kexec_enter_virtual_mode(void)
*
* The new method does a pagetable switch in a preemption-safe manner
* so that we're in a different address space when calling a runtime
- * function. For function arguments passing we do copy the PGDs of the
- * kernel page table into ->trampoline_pgd prior to each call.
+ * function. For function arguments passing we do copy the PUDs of the
+ * kernel page table into efi_pgd prior to each call.
*
* Specially for kexec boot, efi runtime maps in previous kernel should
* be passed in via setup_data. In that case runtime ranges will be mapped
@@ -895,6 +910,12 @@ static void __init __efi_enter_virtual_mode(void)
efi.systab = NULL;
+ if (efi_alloc_page_tables()) {
+ pr_err("Failed to allocate EFI page tables\n");
+ clear_bit(EFI_RUNTIME_SERVICES, &efi.flags);
+ return;
+ }
+
efi_merge_regions();
new_memmap = efi_map_regions(&count, &pg_shift);
if (!new_memmap) {
@@ -913,7 +934,6 @@ static void __init __efi_enter_virtual_mode(void)
}
efi_sync_low_kernel_mappings();
- efi_dump_pagetable();
if (efi_is_native()) {
status = phys_efi_set_virtual_address_map(
@@ -951,31 +971,20 @@ static void __init __efi_enter_virtual_mode(void)
efi.set_virtual_address_map = NULL;
- efi_runtime_mkexec();
+ /*
+ * Apply more restrictive page table mapping attributes now that
+ * SVAM() has been called and the firmware has performed all
+ * necessary relocation fixups for the new virtual addresses.
+ */
+ efi_runtime_update_mappings();
+ efi_dump_pagetable();
/*
- * We mapped the descriptor array into the EFI pagetable above but we're
- * not unmapping it here. Here's why:
- *
- * We're copying select PGDs from the kernel page table to the EFI page
- * table and when we do so and make changes to those PGDs like unmapping
- * stuff from them, those changes appear in the kernel page table and we
- * go boom.
- *
- * From setup_real_mode():
- *
- * ...
- * trampoline_pgd[0] = init_level4_pgt[pgd_index(__PAGE_OFFSET)].pgd;
- *
- * In this particular case, our allocation is in PGD 0 of the EFI page
- * table but we've copied that PGD from PGD[272] of the EFI page table:
- *
- * pgd_index(__PAGE_OFFSET = 0xffff880000000000) = 272
- *
- * where the direct memory mapping in kernel space is.
- *
- * new_memmap's VA comes from that direct mapping and thus clearing it,
- * it would get cleared in the kernel page table too.
+ * We mapped the descriptor array into the EFI pagetable above
+ * but we're not unmapping it here because if we're running in
+ * EFI mixed mode we need all of memory to be accessible when
+ * we pass parameters to the EFI runtime services in the
+ * thunking code.
*
* efi_cleanup_page_tables(__pa(new_memmap), 1 << pg_shift);
*/
diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
index ed5b67338294..338402b91d2e 100644
--- a/arch/x86/platform/efi/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
@@ -38,6 +38,11 @@
* say 0 - 3G.
*/
+int __init efi_alloc_page_tables(void)
+{
+ return 0;
+}
+
void efi_sync_low_kernel_mappings(void) {}
void __init efi_dump_pagetable(void) {}
int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
@@ -85,7 +90,7 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd)
__flush_tlb_all();
}
-void __init efi_runtime_mkexec(void)
+void __init efi_runtime_update_mappings(void)
{
if (__supported_pte_mask & _PAGE_NX)
runtime_code_page_mkexec();
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index a0ac0f9c307f..49e4dd4a1f58 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -15,6 +15,8 @@
*
*/
+#define pr_fmt(fmt) "efi: " fmt
+
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/mm.h>
@@ -40,6 +42,7 @@
#include <asm/fixmap.h>
#include <asm/realmode.h>
#include <asm/time.h>
+#include <asm/pgalloc.h>
/*
* We allocate runtime services regions bottom-up, starting from -4G, i.e.
@@ -47,16 +50,7 @@
*/
static u64 efi_va = EFI_VA_START;
-/*
- * Scratch space used for switching the pagetable in the EFI stub
- */
-struct efi_scratch {
- u64 r15;
- u64 prev_cr3;
- pgd_t *efi_pgt;
- bool use_pgd;
- u64 phys_stack;
-} __packed;
+struct efi_scratch efi_scratch;
static void __init early_code_mapping_set_exec(int executable)
{
@@ -83,8 +77,11 @@ pgd_t * __init efi_call_phys_prolog(void)
int pgd;
int n_pgds;
- if (!efi_enabled(EFI_OLD_MEMMAP))
- return NULL;
+ if (!efi_enabled(EFI_OLD_MEMMAP)) {
+ save_pgd = (pgd_t *)read_cr3();
+ write_cr3((unsigned long)efi_scratch.efi_pgt);
+ goto out;
+ }
early_code_mapping_set_exec(1);
@@ -96,6 +93,7 @@ pgd_t * __init efi_call_phys_prolog(void)
vaddress = (unsigned long)__va(pgd * PGDIR_SIZE);
set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), *pgd_offset_k(vaddress));
}
+out:
__flush_tlb_all();
return save_pgd;
@@ -109,8 +107,11 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd)
int pgd_idx;
int nr_pgds;
- if (!save_pgd)
+ if (!efi_enabled(EFI_OLD_MEMMAP)) {
+ write_cr3((unsigned long)save_pgd);
+ __flush_tlb_all();
return;
+ }
nr_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE);
@@ -123,27 +124,98 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd)
early_code_mapping_set_exec(0);
}
+static pgd_t *efi_pgd;
+
+/*
+ * We need our own copy of the higher levels of the page tables
+ * because we want to avoid inserting EFI region mappings (EFI_VA_END
+ * to EFI_VA_START) into the standard kernel page tables. Everything
+ * else can be shared, see efi_sync_low_kernel_mappings().
+ */
+int __init efi_alloc_page_tables(void)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ gfp_t gfp_mask;
+
+ if (efi_enabled(EFI_OLD_MEMMAP))
+ return 0;
+
+ gfp_mask = GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO;
+ efi_pgd = (pgd_t *)__get_free_page(gfp_mask);
+ if (!efi_pgd)
+ return -ENOMEM;
+
+ pgd = efi_pgd + pgd_index(EFI_VA_END);
+
+ pud = pud_alloc_one(NULL, 0);
+ if (!pud) {
+ free_page((unsigned long)efi_pgd);
+ return -ENOMEM;
+ }
+
+ pgd_populate(NULL, pgd, pud);
+
+ return 0;
+}
+
/*
* Add low kernel mappings for passing arguments to EFI functions.
*/
void efi_sync_low_kernel_mappings(void)
{
- unsigned num_pgds;
- pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
+ unsigned num_entries;
+ pgd_t *pgd_k, *pgd_efi;
+ pud_t *pud_k, *pud_efi;
if (efi_enabled(EFI_OLD_MEMMAP))
return;
- num_pgds = pgd_index(MODULES_END - 1) - pgd_index(PAGE_OFFSET);
+ /*
+ * We can share all PGD entries apart from the one entry that
+ * covers the EFI runtime mapping space.
+ *
+ * Make sure the EFI runtime region mappings are guaranteed to
+ * only span a single PGD entry and that the entry also maps
+ * other important kernel regions.
+ */
+ BUILD_BUG_ON(pgd_index(EFI_VA_END) != pgd_index(MODULES_END));
+ BUILD_BUG_ON((EFI_VA_START & PGDIR_MASK) !=
+ (EFI_VA_END & PGDIR_MASK));
+
+ pgd_efi = efi_pgd + pgd_index(PAGE_OFFSET);
+ pgd_k = pgd_offset_k(PAGE_OFFSET);
- memcpy(pgd + pgd_index(PAGE_OFFSET),
- init_mm.pgd + pgd_index(PAGE_OFFSET),
- sizeof(pgd_t) * num_pgds);
+ num_entries = pgd_index(EFI_VA_END) - pgd_index(PAGE_OFFSET);
+ memcpy(pgd_efi, pgd_k, sizeof(pgd_t) * num_entries);
+
+ /*
+ * We share all the PUD entries apart from those that map the
+ * EFI regions. Copy around them.
+ */
+ BUILD_BUG_ON((EFI_VA_START & ~PUD_MASK) != 0);
+ BUILD_BUG_ON((EFI_VA_END & ~PUD_MASK) != 0);
+
+ pgd_efi = efi_pgd + pgd_index(EFI_VA_END);
+ pud_efi = pud_offset(pgd_efi, 0);
+
+ pgd_k = pgd_offset_k(EFI_VA_END);
+ pud_k = pud_offset(pgd_k, 0);
+
+ num_entries = pud_index(EFI_VA_END);
+ memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries);
+
+ pud_efi = pud_offset(pgd_efi, EFI_VA_START);
+ pud_k = pud_offset(pgd_k, EFI_VA_START);
+
+ num_entries = PTRS_PER_PUD - pud_index(EFI_VA_START);
+ memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries);
}
int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
{
- unsigned long text;
+ unsigned long pfn, text;
+ efi_memory_desc_t *md;
struct page *page;
unsigned npages;
pgd_t *pgd;
@@ -151,8 +223,8 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
if (efi_enabled(EFI_OLD_MEMMAP))
return 0;
- efi_scratch.efi_pgt = (pgd_t *)(unsigned long)real_mode_header->trampoline_pgd;
- pgd = __va(efi_scratch.efi_pgt);
+ efi_scratch.efi_pgt = (pgd_t *)__pa(efi_pgd);
+ pgd = efi_pgd;
/*
* It can happen that the physical address of new_memmap lands in memory
@@ -160,7 +232,8 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
* and ident-map those pages containing the map before calling
* phys_efi_set_virtual_address_map().
*/
- if (kernel_map_pages_in_pgd(pgd, pa_memmap, pa_memmap, num_pages, _PAGE_NX)) {
+ pfn = pa_memmap >> PAGE_SHIFT;
+ if (kernel_map_pages_in_pgd(pgd, pfn, pa_memmap, num_pages, _PAGE_NX | _PAGE_RW)) {
pr_err("Error ident-mapping new memmap (0x%lx)!\n", pa_memmap);
return 1;
}
@@ -176,6 +249,25 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
if (!IS_ENABLED(CONFIG_EFI_MIXED))
return 0;
+ /*
+ * Map all of RAM so that we can access arguments in the 1:1
+ * mapping when making EFI runtime calls.
+ */
+ for_each_efi_memory_desc(&memmap, md) {
+ if (md->type != EFI_CONVENTIONAL_MEMORY &&
+ md->type != EFI_LOADER_DATA &&
+ md->type != EFI_LOADER_CODE)
+ continue;
+
+ pfn = md->phys_addr >> PAGE_SHIFT;
+ npages = md->num_pages;
+
+ if (kernel_map_pages_in_pgd(pgd, pfn, md->phys_addr, npages, _PAGE_RW)) {
+ pr_err("Failed to map 1:1 memory\n");
+ return 1;
+ }
+ }
+
page = alloc_page(GFP_KERNEL|__GFP_DMA32);
if (!page)
panic("Unable to allocate EFI runtime stack < 4GB\n");
@@ -183,10 +275,11 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
efi_scratch.phys_stack = virt_to_phys(page_address(page));
efi_scratch.phys_stack += PAGE_SIZE; /* stack grows down */
- npages = (_end - _text) >> PAGE_SHIFT;
+ npages = (_etext - _text) >> PAGE_SHIFT;
text = __pa(_text);
+ pfn = text >> PAGE_SHIFT;
- if (kernel_map_pages_in_pgd(pgd, text >> PAGE_SHIFT, text, npages, 0)) {
+ if (kernel_map_pages_in_pgd(pgd, pfn, text, npages, _PAGE_RW)) {
pr_err("Failed to map kernel text 1:1\n");
return 1;
}
@@ -196,20 +289,20 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
void __init efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages)
{
- pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
-
- kernel_unmap_pages_in_pgd(pgd, pa_memmap, num_pages);
+ kernel_unmap_pages_in_pgd(efi_pgd, pa_memmap, num_pages);
}
static void __init __map_region(efi_memory_desc_t *md, u64 va)
{
- pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
- unsigned long pf = 0;
+ unsigned long flags = _PAGE_RW;
+ unsigned long pfn;
+ pgd_t *pgd = efi_pgd;
if (!(md->attribute & EFI_MEMORY_WB))
- pf |= _PAGE_PCD;
+ flags |= _PAGE_PCD;
- if (kernel_map_pages_in_pgd(pgd, md->phys_addr, va, md->num_pages, pf))
+ pfn = md->phys_addr >> PAGE_SHIFT;
+ if (kernel_map_pages_in_pgd(pgd, pfn, va, md->num_pages, flags))
pr_warn("Error mapping PA 0x%llx -> VA 0x%llx!\n",
md->phys_addr, va);
}
@@ -300,21 +393,56 @@ void __init parse_efi_setup(u64 phys_addr, u32 data_len)
efi_setup = phys_addr + sizeof(struct setup_data);
}
-void __init efi_runtime_mkexec(void)
+void __init efi_runtime_update_mappings(void)
{
- if (!efi_enabled(EFI_OLD_MEMMAP))
+ unsigned long pfn;
+ pgd_t *pgd = efi_pgd;
+ efi_memory_desc_t *md;
+ void *p;
+
+ if (efi_enabled(EFI_OLD_MEMMAP)) {
+ if (__supported_pte_mask & _PAGE_NX)
+ runtime_code_page_mkexec();
return;
+ }
+
+ if (!efi_enabled(EFI_NX_PE_DATA))
+ return;
+
+ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+ unsigned long pf = 0;
+ md = p;
+
+ if (!(md->attribute & EFI_MEMORY_RUNTIME))
+ continue;
- if (__supported_pte_mask & _PAGE_NX)
- runtime_code_page_mkexec();
+ if (!(md->attribute & EFI_MEMORY_WB))
+ pf |= _PAGE_PCD;
+
+ if ((md->attribute & EFI_MEMORY_XP) ||
+ (md->type == EFI_RUNTIME_SERVICES_DATA))
+ pf |= _PAGE_NX;
+
+ if (!(md->attribute & EFI_MEMORY_RO) &&
+ (md->type != EFI_RUNTIME_SERVICES_CODE))
+ pf |= _PAGE_RW;
+
+ /* Update the 1:1 mapping */
+ pfn = md->phys_addr >> PAGE_SHIFT;
+ if (kernel_map_pages_in_pgd(pgd, pfn, md->phys_addr, md->num_pages, pf))
+ pr_warn("Error mapping PA 0x%llx -> VA 0x%llx!\n",
+ md->phys_addr, md->virt_addr);
+
+ if (kernel_map_pages_in_pgd(pgd, pfn, md->virt_addr, md->num_pages, pf))
+ pr_warn("Error mapping PA 0x%llx -> VA 0x%llx!\n",
+ md->phys_addr, md->virt_addr);
+ }
}
void __init efi_dump_pagetable(void)
{
#ifdef CONFIG_EFI_PGT_DUMP
- pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
-
- ptdump_walk_pgd_level(NULL, pgd);
+ ptdump_walk_pgd_level(NULL, efi_pgd);
#endif
}
diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S
index 86d0f9e08dd9..92723aeae0f9 100644
--- a/arch/x86/platform/efi/efi_stub_64.S
+++ b/arch/x86/platform/efi/efi_stub_64.S
@@ -11,6 +11,7 @@
#include <asm/msr.h>
#include <asm/processor-flags.h>
#include <asm/page_types.h>
+#include <asm/frame.h>
#define SAVE_XMM \
mov %rsp, %rax; \
@@ -38,42 +39,8 @@
mov %rsi, %cr0; \
mov (%rsp), %rsp
- /* stolen from gcc */
- .macro FLUSH_TLB_ALL
- movq %r15, efi_scratch(%rip)
- movq %r14, efi_scratch+8(%rip)
- movq %cr4, %r15
- movq %r15, %r14
- andb $0x7f, %r14b
- movq %r14, %cr4
- movq %r15, %cr4
- movq efi_scratch+8(%rip), %r14
- movq efi_scratch(%rip), %r15
- .endm
-
- .macro SWITCH_PGT
- cmpb $0, efi_scratch+24(%rip)
- je 1f
- movq %r15, efi_scratch(%rip) # r15
- # save previous CR3
- movq %cr3, %r15
- movq %r15, efi_scratch+8(%rip) # prev_cr3
- movq efi_scratch+16(%rip), %r15 # EFI pgt
- movq %r15, %cr3
- 1:
- .endm
-
- .macro RESTORE_PGT
- cmpb $0, efi_scratch+24(%rip)
- je 2f
- movq efi_scratch+8(%rip), %r15
- movq %r15, %cr3
- movq efi_scratch(%rip), %r15
- FLUSH_TLB_ALL
- 2:
- .endm
-
ENTRY(efi_call)
+ FRAME_BEGIN
SAVE_XMM
mov (%rsp), %rax
mov 8(%rax), %rax
@@ -83,16 +50,9 @@ ENTRY(efi_call)
mov %r8, %r9
mov %rcx, %r8
mov %rsi, %rcx
- SWITCH_PGT
call *%rdi
- RESTORE_PGT
addq $48, %rsp
RESTORE_XMM
+ FRAME_END
ret
ENDPROC(efi_call)
-
- .data
-ENTRY(efi_scratch)
- .fill 3,8,0
- .byte 0
- .quad 0
diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index ed30e79347e8..ab50ada1d56e 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -1,3 +1,5 @@
+#define pr_fmt(fmt) "efi: " fmt
+
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/string.h>
@@ -55,13 +57,41 @@ void efi_delete_dummy_variable(void)
}
/*
+ * In the nonblocking case we do not attempt to perform garbage
+ * collection if we do not have enough free space. Rather, we do the
+ * bare minimum check and give up immediately if the available space
+ * is below EFI_MIN_RESERVE.
+ *
+ * This function is intended to be small and simple because it is
+ * invoked from crash handler paths.
+ */
+static efi_status_t
+query_variable_store_nonblocking(u32 attributes, unsigned long size)
+{
+ efi_status_t status;
+ u64 storage_size, remaining_size, max_size;
+
+ status = efi.query_variable_info_nonblocking(attributes, &storage_size,
+ &remaining_size,
+ &max_size);
+ if (status != EFI_SUCCESS)
+ return status;
+
+ if (remaining_size - size < EFI_MIN_RESERVE)
+ return EFI_OUT_OF_RESOURCES;
+
+ return EFI_SUCCESS;
+}
+
+/*
* Some firmware implementations refuse to boot if there's insufficient space
* in the variable store. Ensure that we never use more than a safe limit.
*
* Return EFI_SUCCESS if it is safe to write 'size' bytes to the variable
* store.
*/
-efi_status_t efi_query_variable_store(u32 attributes, unsigned long size)
+efi_status_t efi_query_variable_store(u32 attributes, unsigned long size,
+ bool nonblocking)
{
efi_status_t status;
u64 storage_size, remaining_size, max_size;
@@ -69,6 +99,9 @@ efi_status_t efi_query_variable_store(u32 attributes, unsigned long size)
if (!(attributes & EFI_VARIABLE_NON_VOLATILE))
return 0;
+ if (nonblocking)
+ return query_variable_store_nonblocking(attributes, size);
+
status = efi.query_variable_info(attributes, &storage_size,
&remaining_size, &max_size);
if (status != EFI_SUCCESS)
@@ -312,7 +345,7 @@ void __init efi_apply_memmap_quirks(void)
* services.
*/
if (!efi_runtime_supported()) {
- pr_info("efi: Setup done, disabling due to 32/64-bit mismatch\n");
+ pr_info("Setup done, disabling due to 32/64-bit mismatch\n");
efi_unmap_memmap();
}
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_bma023.c b/arch/x86/platform/intel-mid/device_libs/platform_bma023.c
index 0ae7f2ae2296..c26cf393d35a 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_bma023.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_bma023.c
@@ -1,5 +1,5 @@
/*
- * platform_bma023.c: bma023 platform data initilization file
+ * platform_bma023.c: bma023 platform data initialization file
*
* (C) Copyright 2013 Intel Corporation
*
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c b/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c
index 69a783689d21..c259fb6c8f4f 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_emc1403.c
@@ -1,5 +1,5 @@
/*
- * platform_emc1403.c: emc1403 platform data initilization file
+ * platform_emc1403.c: emc1403 platform data initialization file
*
* (C) Copyright 2013 Intel Corporation
* Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c b/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c
index dccae6b0413f..52534ec29765 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_gpio_keys.c
@@ -1,5 +1,5 @@
/*
- * platform_gpio_keys.c: gpio_keys platform data initilization file
+ * platform_gpio_keys.c: gpio_keys platform data initialization file
*
* (C) Copyright 2013 Intel Corporation
* Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_lis331.c b/arch/x86/platform/intel-mid/device_libs/platform_lis331.c
index 54226de7541a..a35cf912de43 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_lis331.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_lis331.c
@@ -1,5 +1,5 @@
/*
- * platform_lis331.c: lis331 platform data initilization file
+ * platform_lis331.c: lis331 platform data initialization file
*
* (C) Copyright 2013 Intel Corporation
* Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_max7315.c b/arch/x86/platform/intel-mid/device_libs/platform_max7315.c
index 2c8acbc1e9ad..6e075afa7877 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_max7315.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_max7315.c
@@ -1,5 +1,5 @@
/*
- * platform_max7315.c: max7315 platform data initilization file
+ * platform_max7315.c: max7315 platform data initialization file
*
* (C) Copyright 2013 Intel Corporation
* Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c b/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c
index cfe9a47a1e87..ee22864bbc2f 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_mpu3050.c
@@ -1,5 +1,5 @@
/*
- * platform_mpu3050.c: mpu3050 platform data initilization file
+ * platform_mpu3050.c: mpu3050 platform data initialization file
*
* (C) Copyright 2013 Intel Corporation
* Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic.c b/arch/x86/platform/intel-mid/device_libs/platform_msic.c
index 9f4a775a69d6..e421106c11cf 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_msic.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_msic.c
@@ -1,5 +1,5 @@
/*
- * platform_msic.c: MSIC platform data initilization file
+ * platform_msic.c: MSIC platform data initialization file
*
* (C) Copyright 2013 Intel Corporation
* Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c
index 29629397d2b3..cb3490ecb341 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_audio.c
@@ -1,5 +1,5 @@
/*
- * platform_msic_audio.c: MSIC audio platform data initilization file
+ * platform_msic_audio.c: MSIC audio platform data initialization file
*
* (C) Copyright 2013 Intel Corporation
* Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c
index f446c33df1a8..4f72193939a6 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_battery.c
@@ -1,5 +1,5 @@
/*
- * platform_msic_battery.c: MSIC battery platform data initilization file
+ * platform_msic_battery.c: MSIC battery platform data initialization file
*
* (C) Copyright 2013 Intel Corporation
* Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c
index 2a4f7b1dd917..70de5b531ba0 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_gpio.c
@@ -1,5 +1,5 @@
/*
- * platform_msic_gpio.c: MSIC GPIO platform data initilization file
+ * platform_msic_gpio.c: MSIC GPIO platform data initialization file
*
* (C) Copyright 2013 Intel Corporation
* Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c
index 6497111ddb54..3d7c2011b6cf 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_ocd.c
@@ -1,5 +1,5 @@
/*
- * platform_msic_ocd.c: MSIC OCD platform data initilization file
+ * platform_msic_ocd.c: MSIC OCD platform data initialization file
*
* (C) Copyright 2013 Intel Corporation
* Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c
index 83a3459bc337..038f618fbc52 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_power_btn.c
@@ -1,5 +1,5 @@
/*
- * platform_msic_power_btn.c: MSIC power btn platform data initilization file
+ * platform_msic_power_btn.c: MSIC power btn platform data initialization file
*
* (C) Copyright 2013 Intel Corporation
* Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c b/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c
index a351878b96bc..114a5755b1e4 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_msic_thermal.c
@@ -1,5 +1,5 @@
/*
- * platform_msic_thermal.c: msic_thermal platform data initilization file
+ * platform_msic_thermal.c: msic_thermal platform data initialization file
*
* (C) Copyright 2013 Intel Corporation
* Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c b/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c
index 65c2a9a19db4..e30cb62e3300 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_pmic_gpio.c
@@ -1,5 +1,5 @@
/*
- * platform_pmic_gpio.c: PMIC GPIO platform data initilization file
+ * platform_pmic_gpio.c: PMIC GPIO platform data initialization file
*
* (C) Copyright 2013 Intel Corporation
* Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c b/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c
index 740fc757050c..b1526b95fd43 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_tc35876x.c
@@ -1,5 +1,5 @@
/*
- * platform_tc35876x.c: tc35876x platform data initilization file
+ * platform_tc35876x.c: tc35876x platform data initialization file
*
* (C) Copyright 2013 Intel Corporation
* Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
diff --git a/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c b/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c
index 33be0b3be6e1..4f41372ce400 100644
--- a/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c
+++ b/arch/x86/platform/intel-mid/device_libs/platform_tca6416.c
@@ -1,5 +1,5 @@
/*
- * platform_tca6416.c: tca6416 platform data initilization file
+ * platform_tca6416.c: tca6416 platform data initialization file
*
* (C) Copyright 2013 Intel Corporation
* Author: Sathyanarayanan Kuppuswamy <sathyanarayanan.kuppuswamy@intel.com>
diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S
index e2386cb4e0c3..4400a43b9e28 100644
--- a/arch/x86/power/hibernate_asm_64.S
+++ b/arch/x86/power/hibernate_asm_64.S
@@ -21,8 +21,10 @@
#include <asm/page_types.h>
#include <asm/asm-offsets.h>
#include <asm/processor-flags.h>
+#include <asm/frame.h>
ENTRY(swsusp_arch_suspend)
+ FRAME_BEGIN
movq $saved_context, %rax
movq %rsp, pt_regs_sp(%rax)
movq %rbp, pt_regs_bp(%rax)
@@ -50,7 +52,9 @@ ENTRY(swsusp_arch_suspend)
movq %rax, restore_cr3(%rip)
call swsusp_save
+ FRAME_END
ret
+ENDPROC(swsusp_arch_suspend)
ENTRY(restore_image)
/* switch to temporary page tables */
@@ -107,6 +111,7 @@ ENTRY(core_restore_code)
*/
ENTRY(restore_registers)
+ FRAME_BEGIN
/* go back to the original page tables */
movq %rbx, %cr3
@@ -147,4 +152,6 @@ ENTRY(restore_registers)
/* tell the hibernation core that we've just restored the memory */
movq %rax, in_suspend(%rip)
+ FRAME_END
ret
+ENDPROC(restore_registers)
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
index 2c835e356349..92e3e1d84c1d 100644
--- a/arch/x86/purgatory/Makefile
+++ b/arch/x86/purgatory/Makefile
@@ -1,3 +1,5 @@
+OBJECT_FILES_NON_STANDARD := y
+
purgatory-y := purgatory.o stack.o setup-x86_$(BITS).o sha256.o entry64.o string.o
targets += $(purgatory-y)
diff --git a/arch/x86/purgatory/stack.S b/arch/x86/purgatory/stack.S
index 3cefba1fefc8..50a4147f91fb 100644
--- a/arch/x86/purgatory/stack.S
+++ b/arch/x86/purgatory/stack.S
@@ -8,7 +8,7 @@
*/
/* A stack for the loaded kernel.
- * Seperate and in the data section so it can be prepopulated.
+ * Separate and in the data section so it can be prepopulated.
*/
.data
.balign 4096
diff --git a/arch/x86/realmode/Makefile b/arch/x86/realmode/Makefile
index e02c2c6c56a5..682c895753d9 100644
--- a/arch/x86/realmode/Makefile
+++ b/arch/x86/realmode/Makefile
@@ -6,7 +6,9 @@
# for more details.
#
#
-KASAN_SANITIZE := n
+KASAN_SANITIZE := n
+OBJECT_FILES_NON_STANDARD := y
+
subdir- := rm
obj-y += init.o
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index 3e75fcf6b836..b95964610ea7 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -6,7 +6,11 @@
# for more details.
#
#
-KASAN_SANITIZE := n
+KASAN_SANITIZE := n
+OBJECT_FILES_NON_STANDARD := y
+
+# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
+KCOV_INSTRUMENT := n
always := realmode.bin realmode.relocs
diff --git a/arch/x86/um/asm/checksum.h b/arch/x86/um/asm/checksum.h
index ee940185e89f..54d96f1e3594 100644
--- a/arch/x86/um/asm/checksum.h
+++ b/arch/x86/um/asm/checksum.h
@@ -87,8 +87,8 @@ static inline __sum16 csum_fold(__wsum sum)
* 32bit unfolded.
*/
static inline __wsum
-csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len,
- unsigned short proto, __wsum sum)
+csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
+ __u8 proto, __wsum sum)
{
asm(" addl %1, %0\n"
" adcl %2, %0\n"
@@ -104,9 +104,8 @@ csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len,
* returns a 16-bit checksum, already complemented
*/
static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
- unsigned short len,
- unsigned short proto,
- __wsum sum)
+ __u32 len, __u8 proto,
+ __wsum sum)
{
return csum_fold(csum_tcpudp_nofold(saddr,daddr,len,proto,sum));
}
diff --git a/arch/x86/um/asm/checksum_32.h b/arch/x86/um/asm/checksum_32.h
index ab77b6f9a4bf..83a75f8a1233 100644
--- a/arch/x86/um/asm/checksum_32.h
+++ b/arch/x86/um/asm/checksum_32.h
@@ -13,7 +13,7 @@ static inline __sum16 ip_compute_csum(const void *buff, int len)
#define _HAVE_ARCH_IPV6_CSUM
static __inline__ __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
const struct in6_addr *daddr,
- __u32 len, unsigned short proto,
+ __u32 len, __u8 proto,
__wsum sum)
{
__asm__(
diff --git a/arch/x86/video/fbdev.c b/arch/x86/video/fbdev.c
index d5644bbe8cba..9fd24846d094 100644
--- a/arch/x86/video/fbdev.c
+++ b/arch/x86/video/fbdev.c
@@ -14,26 +14,24 @@
int fb_is_primary_device(struct fb_info *info)
{
struct device *device = info->device;
- struct pci_dev *pci_dev = NULL;
struct pci_dev *default_device = vga_default_device();
- struct resource *res = NULL;
+ struct pci_dev *pci_dev;
+ struct resource *res;
- if (device)
- pci_dev = to_pci_dev(device);
-
- if (!pci_dev)
+ if (!device || !dev_is_pci(device))
return 0;
+ pci_dev = to_pci_dev(device);
+
if (default_device) {
if (pci_dev == default_device)
return 1;
- else
- return 0;
+ return 0;
}
- res = &pci_dev->resource[PCI_ROM_RESOURCE];
+ res = pci_dev->resource + PCI_ROM_RESOURCE;
- if (res && res->flags & IORESOURCE_ROM_SHADOW)
+ if (res->flags & IORESOURCE_ROM_SHADOW)
return 1;
return 0;
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 2c261082eadf..880862c7d9dd 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -32,6 +32,7 @@
#include <linux/gfp.h>
#include <linux/memblock.h>
#include <linux/edd.h>
+#include <linux/frame.h>
#ifdef CONFIG_KEXEC_CORE
#include <linux/kexec.h>
@@ -351,8 +352,8 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
*cx &= maskecx;
*cx |= setecx;
*dx &= maskedx;
-
}
+STACK_FRAME_NON_STANDARD(xen_cpuid); /* XEN_EMULATE_PREFIX */
static bool __init xen_check_mwait(void)
{
@@ -961,7 +962,7 @@ static void xen_load_sp0(struct tss_struct *tss,
tss->x86_tss.sp0 = thread->sp0;
}
-static void xen_set_iopl_mask(unsigned mask)
+void xen_set_iopl_mask(unsigned mask)
{
struct physdev_set_iopl set_iopl;
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index c913ca4f6958..478a2de543a5 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1256,7 +1256,7 @@ static void __init xen_pagetable_cleanhighmap(void)
xen_cleanhighmap(addr, addr + size);
xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base));
#ifdef DEBUG
- /* This is superflous and is not neccessary, but you know what
+ /* This is superfluous and is not necessary, but you know what
* lets do it. The MODULES_VADDR -> MODULES_END should be clear of
* anything at this stage. */
xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1);
@@ -1474,7 +1474,7 @@ static void xen_write_cr3(unsigned long cr3)
/*
* At the start of the day - when Xen launches a guest, it has already
* built pagetables for the guest. We diligently look over them
- * in xen_setup_kernel_pagetable and graft as appropiate them in the
+ * in xen_setup_kernel_pagetable and graft as appropriate them in the
* init_level4_pgt and its friends. Then when we are happy we load
* the new init_level4_pgt - and continue on.
*
@@ -2792,7 +2792,7 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
struct remap_data *rmd = data;
pte_t pte = pte_mkspecial(mfn_pte(*rmd->mfn, rmd->prot));
- /* If we have a contigious range, just update the mfn itself,
+ /* If we have a contiguous range, just update the mfn itself,
else update pointer to be "next mfn". */
if (rmd->contiguous)
(*rmd->mfn)++;
@@ -2833,7 +2833,7 @@ static int do_remap_gfn(struct vm_area_struct *vma,
rmd.mfn = gfn;
rmd.prot = prot;
- /* We use the err_ptr to indicate if there we are doing a contigious
+ /* We use the err_ptr to indicate if there we are doing a contiguous
* mapping or a discontigious mapping. */
rmd.contiguous = !err_ptr;
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index 3e45aa000718..eff224df813f 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -14,6 +14,7 @@
#include <asm/asm-offsets.h>
#include <asm/percpu.h>
#include <asm/processor-flags.h>
+#include <asm/frame.h>
#include "xen-asm.h"
@@ -23,6 +24,7 @@
* then enter the hypervisor to get them handled.
*/
ENTRY(xen_irq_enable_direct)
+ FRAME_BEGIN
/* Unmask events */
movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
@@ -39,6 +41,7 @@ ENTRY(xen_irq_enable_direct)
2: call check_events
1:
ENDPATCH(xen_irq_enable_direct)
+ FRAME_END
ret
ENDPROC(xen_irq_enable_direct)
RELOC(xen_irq_enable_direct, 2b+1)
@@ -82,6 +85,7 @@ ENDPATCH(xen_save_fl_direct)
* enters the hypervisor to get them delivered if so.
*/
ENTRY(xen_restore_fl_direct)
+ FRAME_BEGIN
#ifdef CONFIG_X86_64
testw $X86_EFLAGS_IF, %di
#else
@@ -100,6 +104,7 @@ ENTRY(xen_restore_fl_direct)
2: call check_events
1:
ENDPATCH(xen_restore_fl_direct)
+ FRAME_END
ret
ENDPROC(xen_restore_fl_direct)
RELOC(xen_restore_fl_direct, 2b+1)
@@ -109,7 +114,8 @@ ENDPATCH(xen_restore_fl_direct)
* Force an event check by making a hypercall, but preserve regs
* before making the call.
*/
-check_events:
+ENTRY(check_events)
+ FRAME_BEGIN
#ifdef CONFIG_X86_32
push %eax
push %ecx
@@ -139,4 +145,6 @@ check_events:
pop %rcx
pop %rax
#endif
+ FRAME_END
ret
+ENDPROC(check_events)
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index cc8acc410ddb..c3df43141e70 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -26,6 +26,7 @@ ENTRY(xen_adjust_exception_frame)
mov 8+0(%rsp), %rcx
mov 8+8(%rsp), %r11
ret $16
+ENDPROC(xen_adjust_exception_frame)
hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
/*
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index b65f59a358a2..7f8d8abf4c1a 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -26,7 +26,7 @@
(1 << XENFEAT_auto_translated_physmap) | \
(1 << XENFEAT_supervisor_mode_kernel) | \
(1 << XENFEAT_hvm_callback_vector))
-/* The XENFEAT_writable_page_tables is not stricly neccessary as we set that
+/* The XENFEAT_writable_page_tables is not stricly necessary as we set that
* up regardless whether this CONFIG option is enabled or not, but it
* clarifies what the right flags need to be.
*/
@@ -38,13 +38,18 @@
__INIT
ENTRY(startup_xen)
cld
-#ifdef CONFIG_X86_32
- mov %esi,xen_start_info
- mov $init_thread_union+THREAD_SIZE,%esp
-#else
- mov %rsi,xen_start_info
- mov $init_thread_union+THREAD_SIZE,%rsp
-#endif
+
+ /* Clear .bss */
+ xor %eax,%eax
+ mov $__bss_start, %_ASM_DI
+ mov $__bss_stop, %_ASM_CX
+ sub %_ASM_DI, %_ASM_CX
+ shr $__ASM_SEL(2, 3), %_ASM_CX
+ rep __ASM_SIZE(stos)
+
+ mov %_ASM_SI, xen_start_info
+ mov $init_thread_union+THREAD_SIZE, %_ASM_SP
+
jmp xen_start_kernel
__FINIT