diff options
Diffstat (limited to 'arch/x86')
105 files changed, 6752 insertions, 1252 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d6e1faa28c58..f2aed8012e9c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -24,7 +24,7 @@ config X86_64 depends on 64BIT # Options that are inherently 64-bit kernel only: select ARCH_HAS_GIGANTIC_PAGE - select ARCH_SUPPORTS_INT128 + select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 select ARCH_USE_CMPXCHG_LOCKREF select HAVE_ARCH_SOFT_DIRTY select MODULES_USE_ELF_RELA @@ -1940,6 +1940,51 @@ config X86_INTEL_MEMORY_PROTECTION_KEYS If unsure, say y. +choice + prompt "TSX enable mode" + depends on CPU_SUP_INTEL + default X86_INTEL_TSX_MODE_OFF + help + Intel's TSX (Transactional Synchronization Extensions) feature + allows to optimize locking protocols through lock elision which + can lead to a noticeable performance boost. + + On the other hand it has been shown that TSX can be exploited + to form side channel attacks (e.g. TAA) and chances are there + will be more of those attacks discovered in the future. + + Therefore TSX is not enabled by default (aka tsx=off). An admin + might override this decision by tsx=on the command line parameter. + Even with TSX enabled, the kernel will attempt to enable the best + possible TAA mitigation setting depending on the microcode available + for the particular machine. + + This option allows to set the default tsx mode between tsx=on, =off + and =auto. See Documentation/admin-guide/kernel-parameters.txt for more + details. + + Say off if not sure, auto if TSX is in use but it should be used on safe + platforms or on if TSX is in use and the security aspect of tsx is not + relevant. + +config X86_INTEL_TSX_MODE_OFF + bool "off" + help + TSX is disabled if possible - equals to tsx=off command line parameter. + +config X86_INTEL_TSX_MODE_ON + bool "on" + help + TSX is always enabled on TSX capable HW - equals the tsx=on command + line parameter. + +config X86_INTEL_TSX_MODE_AUTO + bool "auto" + help + TSX is enabled on TSX capable HW that is believed to be safe against + side channel attacks- equals the tsx=auto command line parameter. +endchoice + config EFI bool "EFI runtime service support" depends on ACPI diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index bf9cd83de777..409c00f74e60 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -316,10 +316,6 @@ config UNWINDER_FRAME_POINTER unwinder, but the kernel text size will grow by ~3% and the kernel's overall performance will degrade by roughly 5-10%. - This option is recommended if you want to use the livepatch - consistency model, as this is currently the only way to get a - reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE). - config UNWINDER_GUESS bool "Guess unwinder" depends on EXPERT diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c index 149795c369f2..25019d42ae93 100644 --- a/arch/x86/boot/compressed/acpi.c +++ b/arch/x86/boot/compressed/acpi.c @@ -21,30 +21,6 @@ struct mem_vector immovable_mem[MAX_NUMNODES*2]; /* - * Max length of 64-bit hex address string is 19, prefix "0x" + 16 hex - * digits, and '\0' for termination. - */ -#define MAX_ADDR_LEN 19 - -static acpi_physical_address get_cmdline_acpi_rsdp(void) -{ - acpi_physical_address addr = 0; - -#ifdef CONFIG_KEXEC - char val[MAX_ADDR_LEN] = { }; - int ret; - - ret = cmdline_find_option("acpi_rsdp", val, MAX_ADDR_LEN); - if (ret < 0) - return 0; - - if (kstrtoull(val, 16, &addr)) - return 0; -#endif - return addr; -} - -/* * Search EFI system tables for RSDP. If both ACPI_20_TABLE_GUID and * ACPI_TABLE_GUID are found, take the former, which has more features. */ @@ -298,6 +274,30 @@ acpi_physical_address get_rsdp_addr(void) } #if defined(CONFIG_RANDOMIZE_BASE) && defined(CONFIG_MEMORY_HOTREMOVE) +/* + * Max length of 64-bit hex address string is 19, prefix "0x" + 16 hex + * digits, and '\0' for termination. + */ +#define MAX_ADDR_LEN 19 + +static acpi_physical_address get_cmdline_acpi_rsdp(void) +{ + acpi_physical_address addr = 0; + +#ifdef CONFIG_KEXEC + char val[MAX_ADDR_LEN] = { }; + int ret; + + ret = cmdline_find_option("acpi_rsdp", val, MAX_ADDR_LEN); + if (ret < 0) + return 0; + + if (kstrtoull(val, 16, &addr)) + return 0; +#endif + return addr; +} + /* Compute SRAT address from RSDP. */ static unsigned long get_acpi_srat_table(void) { diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index d6662fdef300..82bc60c8acb2 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -13,6 +13,7 @@ #include <asm/e820/types.h> #include <asm/setup.h> #include <asm/desc.h> +#include <asm/boot.h> #include "../string.h" #include "eboot.h" @@ -813,7 +814,8 @@ efi_main(struct efi_config *c, struct boot_params *boot_params) status = efi_relocate_kernel(sys_table, &bzimage_addr, hdr->init_size, hdr->init_size, hdr->pref_address, - hdr->kernel_alignment); + hdr->kernel_alignment, + LOAD_PHYSICAL_ADDR); if (status != EFI_SUCCESS) { efi_printk(sys_table, "efi_relocate_kernel() failed!\n"); goto fail; diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 53ac0cb2396d..9652d5c2afda 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -345,6 +345,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, { const unsigned long kernel_total_size = VO__end - VO__text; unsigned long virt_addr = LOAD_PHYSICAL_ADDR; + unsigned long needed_size; /* Retain x86 boot parameters pointer passed from startup_32/64. */ boot_params = rmode; @@ -379,26 +380,38 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, free_mem_ptr = heap; /* Heap */ free_mem_end_ptr = heap + BOOT_HEAP_SIZE; + /* + * The memory hole needed for the kernel is the larger of either + * the entire decompressed kernel plus relocation table, or the + * entire decompressed kernel plus .bss and .brk sections. + * + * On X86_64, the memory is mapped with PMD pages. Round the + * size up so that the full extent of PMD pages mapped is + * included in the check against the valid memory table + * entries. This ensures the full mapped area is usable RAM + * and doesn't include any reserved areas. + */ + needed_size = max(output_len, kernel_total_size); +#ifdef CONFIG_X86_64 + needed_size = ALIGN(needed_size, MIN_KERNEL_ALIGN); +#endif + /* Report initial kernel position details. */ debug_putaddr(input_data); debug_putaddr(input_len); debug_putaddr(output); debug_putaddr(output_len); debug_putaddr(kernel_total_size); + debug_putaddr(needed_size); #ifdef CONFIG_X86_64 /* Report address of 32-bit trampoline */ debug_putaddr(trampoline_32bit); #endif - /* - * The memory hole needed for the kernel is the larger of either - * the entire decompressed kernel plus relocation table, or the - * entire decompressed kernel plus .bss and .brk sections. - */ choose_random_location((unsigned long)input_data, input_len, (unsigned long *)&output, - max(output_len, kernel_total_size), + needed_size, &virt_addr); /* Validate memory location choices. */ diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 759b1a927826..958440eae27e 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -39,6 +39,7 @@ obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2) += aegis128-aesni.o obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o +obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o # These modules require assembler to support AVX. ifeq ($(avx_supported),yes) @@ -48,6 +49,7 @@ ifeq ($(avx_supported),yes) obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o + obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o endif # These modules require assembler to support AVX2. @@ -70,6 +72,7 @@ serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o +blake2s-x86_64-y := blake2s-core.o blake2s-glue.o ifeq ($(avx_supported),yes) camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \ diff --git a/arch/x86/crypto/blake2s-core.S b/arch/x86/crypto/blake2s-core.S new file mode 100644 index 000000000000..8591938eee26 --- /dev/null +++ b/arch/x86/crypto/blake2s-core.S @@ -0,0 +1,258 @@ +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved. + */ + +#include <linux/linkage.h> + +.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32 +.align 32 +IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667 + .octa 0x5BE0CD191F83D9AB9B05688C510E527F +.section .rodata.cst16.ROT16, "aM", @progbits, 16 +.align 16 +ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302 +.section .rodata.cst16.ROR328, "aM", @progbits, 16 +.align 16 +ROR328: .octa 0x0C0F0E0D080B0A090407060500030201 +.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160 +.align 64 +SIGMA: +.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 +.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7 +.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1 +.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0 +.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8 +.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14 +.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2 +.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6 +.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4 +.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12 +#ifdef CONFIG_AS_AVX512 +.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640 +.align 64 +SIGMA2: +.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 +.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7 +.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9 +.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5 +.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12 +.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9 +.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0 +.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10 +.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14 +.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9 +#endif /* CONFIG_AS_AVX512 */ + +.text +#ifdef CONFIG_AS_SSSE3 +ENTRY(blake2s_compress_ssse3) + testq %rdx,%rdx + je .Lendofloop + movdqu (%rdi),%xmm0 + movdqu 0x10(%rdi),%xmm1 + movdqa ROT16(%rip),%xmm12 + movdqa ROR328(%rip),%xmm13 + movdqu 0x20(%rdi),%xmm14 + movq %rcx,%xmm15 + leaq SIGMA+0xa0(%rip),%r8 + jmp .Lbeginofloop + .align 32 +.Lbeginofloop: + movdqa %xmm0,%xmm10 + movdqa %xmm1,%xmm11 + paddq %xmm15,%xmm14 + movdqa IV(%rip),%xmm2 + movdqa %xmm14,%xmm3 + pxor IV+0x10(%rip),%xmm3 + leaq SIGMA(%rip),%rcx +.Lroundloop: + movzbl (%rcx),%eax + movd (%rsi,%rax,4),%xmm4 + movzbl 0x1(%rcx),%eax + movd (%rsi,%rax,4),%xmm5 + movzbl 0x2(%rcx),%eax + movd (%rsi,%rax,4),%xmm6 + movzbl 0x3(%rcx),%eax + movd (%rsi,%rax,4),%xmm7 + punpckldq %xmm5,%xmm4 + punpckldq %xmm7,%xmm6 + punpcklqdq %xmm6,%xmm4 + paddd %xmm4,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm12,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0xc,%xmm1 + pslld $0x14,%xmm8 + por %xmm8,%xmm1 + movzbl 0x4(%rcx),%eax + movd (%rsi,%rax,4),%xmm5 + movzbl 0x5(%rcx),%eax + movd (%rsi,%rax,4),%xmm6 + movzbl 0x6(%rcx),%eax + movd (%rsi,%rax,4),%xmm7 + movzbl 0x7(%rcx),%eax + movd (%rsi,%rax,4),%xmm4 + punpckldq %xmm6,%xmm5 + punpckldq %xmm4,%xmm7 + punpcklqdq %xmm7,%xmm5 + paddd %xmm5,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm13,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0x7,%xmm1 + pslld $0x19,%xmm8 + por %xmm8,%xmm1 + pshufd $0x93,%xmm0,%xmm0 + pshufd $0x4e,%xmm3,%xmm3 + pshufd $0x39,%xmm2,%xmm2 + movzbl 0x8(%rcx),%eax + movd (%rsi,%rax,4),%xmm6 + movzbl 0x9(%rcx),%eax + movd (%rsi,%rax,4),%xmm7 + movzbl 0xa(%rcx),%eax + movd (%rsi,%rax,4),%xmm4 + movzbl 0xb(%rcx),%eax + movd (%rsi,%rax,4),%xmm5 + punpckldq %xmm7,%xmm6 + punpckldq %xmm5,%xmm4 + punpcklqdq %xmm4,%xmm6 + paddd %xmm6,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm12,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0xc,%xmm1 + pslld $0x14,%xmm8 + por %xmm8,%xmm1 + movzbl 0xc(%rcx),%eax + movd (%rsi,%rax,4),%xmm7 + movzbl 0xd(%rcx),%eax + movd (%rsi,%rax,4),%xmm4 + movzbl 0xe(%rcx),%eax + movd (%rsi,%rax,4),%xmm5 + movzbl 0xf(%rcx),%eax + movd (%rsi,%rax,4),%xmm6 + punpckldq %xmm4,%xmm7 + punpckldq %xmm6,%xmm5 + punpcklqdq %xmm5,%xmm7 + paddd %xmm7,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm13,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0x7,%xmm1 + pslld $0x19,%xmm8 + por %xmm8,%xmm1 + pshufd $0x39,%xmm0,%xmm0 + pshufd $0x4e,%xmm3,%xmm3 + pshufd $0x93,%xmm2,%xmm2 + addq $0x10,%rcx + cmpq %r8,%rcx + jnz .Lroundloop + pxor %xmm2,%xmm0 + pxor %xmm3,%xmm1 + pxor %xmm10,%xmm0 + pxor %xmm11,%xmm1 + addq $0x40,%rsi + decq %rdx + jnz .Lbeginofloop + movdqu %xmm0,(%rdi) + movdqu %xmm1,0x10(%rdi) + movdqu %xmm14,0x20(%rdi) +.Lendofloop: + ret +ENDPROC(blake2s_compress_ssse3) +#endif /* CONFIG_AS_SSSE3 */ + +#ifdef CONFIG_AS_AVX512 +ENTRY(blake2s_compress_avx512) + vmovdqu (%rdi),%xmm0 + vmovdqu 0x10(%rdi),%xmm1 + vmovdqu 0x20(%rdi),%xmm4 + vmovq %rcx,%xmm5 + vmovdqa IV(%rip),%xmm14 + vmovdqa IV+16(%rip),%xmm15 + jmp .Lblake2s_compress_avx512_mainloop +.align 32 +.Lblake2s_compress_avx512_mainloop: + vmovdqa %xmm0,%xmm10 + vmovdqa %xmm1,%xmm11 + vpaddq %xmm5,%xmm4,%xmm4 + vmovdqa %xmm14,%xmm2 + vpxor %xmm15,%xmm4,%xmm3 + vmovdqu (%rsi),%ymm6 + vmovdqu 0x20(%rsi),%ymm7 + addq $0x40,%rsi + leaq SIGMA2(%rip),%rax + movb $0xa,%cl +.Lblake2s_compress_avx512_roundloop: + addq $0x40,%rax + vmovdqa -0x40(%rax),%ymm8 + vmovdqa -0x20(%rax),%ymm9 + vpermi2d %ymm7,%ymm6,%ymm8 + vpermi2d %ymm7,%ymm6,%ymm9 + vmovdqa %ymm8,%ymm6 + vmovdqa %ymm9,%ymm7 + vpaddd %xmm8,%xmm0,%xmm0 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + vprord $0x10,%xmm3,%xmm3 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + vprord $0xc,%xmm1,%xmm1 + vextracti128 $0x1,%ymm8,%xmm8 + vpaddd %xmm8,%xmm0,%xmm0 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + vprord $0x8,%xmm3,%xmm3 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + vprord $0x7,%xmm1,%xmm1 + vpshufd $0x93,%xmm0,%xmm0 + vpshufd $0x4e,%xmm3,%xmm3 + vpshufd $0x39,%xmm2,%xmm2 + vpaddd %xmm9,%xmm0,%xmm0 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + vprord $0x10,%xmm3,%xmm3 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + vprord $0xc,%xmm1,%xmm1 + vextracti128 $0x1,%ymm9,%xmm9 + vpaddd %xmm9,%xmm0,%xmm0 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + vprord $0x8,%xmm3,%xmm3 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + vprord $0x7,%xmm1,%xmm1 + vpshufd $0x39,%xmm0,%xmm0 + vpshufd $0x4e,%xmm3,%xmm3 + vpshufd $0x93,%xmm2,%xmm2 + decb %cl + jne .Lblake2s_compress_avx512_roundloop + vpxor %xmm10,%xmm0,%xmm0 + vpxor %xmm11,%xmm1,%xmm1 + vpxor %xmm2,%xmm0,%xmm0 + vpxor %xmm3,%xmm1,%xmm1 + decq %rdx + jne .Lblake2s_compress_avx512_mainloop + vmovdqu %xmm0,(%rdi) + vmovdqu %xmm1,0x10(%rdi) + vmovdqu %xmm4,0x20(%rdi) + vzeroupper + retq +ENDPROC(blake2s_compress_avx512) +#endif /* CONFIG_AS_AVX512 */ diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c new file mode 100644 index 000000000000..4a37ba7cdbe5 --- /dev/null +++ b/arch/x86/crypto/blake2s-glue.c @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + */ + +#include <crypto/internal/blake2s.h> +#include <crypto/internal/simd.h> +#include <crypto/internal/hash.h> + +#include <linux/types.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/module.h> + +#include <asm/cpufeature.h> +#include <asm/fpu/api.h> +#include <asm/processor.h> +#include <asm/simd.h> + +asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state, + const u8 *block, const size_t nblocks, + const u32 inc); +asmlinkage void blake2s_compress_avx512(struct blake2s_state *state, + const u8 *block, const size_t nblocks, + const u32 inc); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512); + +void blake2s_compress_arch(struct blake2s_state *state, + const u8 *block, size_t nblocks, + const u32 inc) +{ + /* SIMD disables preemption, so relax after processing each page. */ + BUILD_BUG_ON(PAGE_SIZE / BLAKE2S_BLOCK_SIZE < 8); + + if (!static_branch_likely(&blake2s_use_ssse3) || !crypto_simd_usable()) { + blake2s_compress_generic(state, block, nblocks, inc); + return; + } + + for (;;) { + const size_t blocks = min_t(size_t, nblocks, + PAGE_SIZE / BLAKE2S_BLOCK_SIZE); + + kernel_fpu_begin(); + if (IS_ENABLED(CONFIG_AS_AVX512) && + static_branch_likely(&blake2s_use_avx512)) + blake2s_compress_avx512(state, block, blocks, inc); + else + blake2s_compress_ssse3(state, block, blocks, inc); + kernel_fpu_end(); + + nblocks -= blocks; + if (!nblocks) + break; + block += blocks * BLAKE2S_BLOCK_SIZE; + } +} +EXPORT_SYMBOL(blake2s_compress_arch); + +static int crypto_blake2s_setkey(struct crypto_shash *tfm, const u8 *key, + unsigned int keylen) +{ + struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm); + + if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE) { + crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + memcpy(tctx->key, key, keylen); + tctx->keylen = keylen; + + return 0; +} + +static int crypto_blake2s_init(struct shash_desc *desc) +{ + struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + struct blake2s_state *state = shash_desc_ctx(desc); + const int outlen = crypto_shash_digestsize(desc->tfm); + + if (tctx->keylen) + blake2s_init_key(state, outlen, tctx->key, tctx->keylen); + else + blake2s_init(state, outlen); + + return 0; +} + +static int crypto_blake2s_update(struct shash_desc *desc, const u8 *in, + unsigned int inlen) +{ + struct blake2s_state *state = shash_desc_ctx(desc); + const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen; + + if (unlikely(!inlen)) + return 0; + if (inlen > fill) { + memcpy(state->buf + state->buflen, in, fill); + blake2s_compress_arch(state, state->buf, 1, BLAKE2S_BLOCK_SIZE); + state->buflen = 0; + in += fill; + inlen -= fill; + } + if (inlen > BLAKE2S_BLOCK_SIZE) { + const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE); + /* Hash one less (full) block than strictly possible */ + blake2s_compress_arch(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE); + in += BLAKE2S_BLOCK_SIZE * (nblocks - 1); + inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1); + } + memcpy(state->buf + state->buflen, in, inlen); + state->buflen += inlen; + + return 0; +} + +static int crypto_blake2s_final(struct shash_desc *desc, u8 *out) +{ + struct blake2s_state *state = shash_desc_ctx(desc); + + blake2s_set_lastblock(state); + memset(state->buf + state->buflen, 0, + BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */ + blake2s_compress_arch(state, state->buf, 1, state->buflen); + cpu_to_le32_array(state->h, ARRAY_SIZE(state->h)); + memcpy(out, state->h, state->outlen); + memzero_explicit(state, sizeof(*state)); + + return 0; +} + +static struct shash_alg blake2s_algs[] = {{ + .base.cra_name = "blake2s-128", + .base.cra_driver_name = "blake2s-128-x86", + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, + .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), + .base.cra_priority = 200, + .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + + .digestsize = BLAKE2S_128_HASH_SIZE, + .setkey = crypto_blake2s_setkey, + .init = crypto_blake2s_init, + .update = crypto_blake2s_update, + .final = crypto_blake2s_final, + .descsize = sizeof(struct blake2s_state), +}, { + .base.cra_name = "blake2s-160", + .base.cra_driver_name = "blake2s-160-x86", + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, + .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), + .base.cra_priority = 200, + .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + + .digestsize = BLAKE2S_160_HASH_SIZE, + .setkey = crypto_blake2s_setkey, + .init = crypto_blake2s_init, + .update = crypto_blake2s_update, + .final = crypto_blake2s_final, + .descsize = sizeof(struct blake2s_state), +}, { + .base.cra_name = "blake2s-224", + .base.cra_driver_name = "blake2s-224-x86", + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, + .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), + .base.cra_priority = 200, + .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + + .digestsize = BLAKE2S_224_HASH_SIZE, + .setkey = crypto_blake2s_setkey, + .init = crypto_blake2s_init, + .update = crypto_blake2s_update, + .final = crypto_blake2s_final, + .descsize = sizeof(struct blake2s_state), +}, { + .base.cra_name = "blake2s-256", + .base.cra_driver_name = "blake2s-256-x86", + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, + .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), + .base.cra_priority = 200, + .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + + .digestsize = BLAKE2S_256_HASH_SIZE, + .setkey = crypto_blake2s_setkey, + .init = crypto_blake2s_init, + .update = crypto_blake2s_update, + .final = crypto_blake2s_final, + .descsize = sizeof(struct blake2s_state), +}}; + +static int __init blake2s_mod_init(void) +{ + if (!boot_cpu_has(X86_FEATURE_SSSE3)) + return 0; + + static_branch_enable(&blake2s_use_ssse3); + + if (IS_ENABLED(CONFIG_AS_AVX512) && + boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512VL) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | + XFEATURE_MASK_AVX512, NULL)) + static_branch_enable(&blake2s_use_avx512); + + return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs)); +} + +static void __exit blake2s_mod_exit(void) +{ + if (boot_cpu_has(X86_FEATURE_SSSE3)) + crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs)); +} + +module_init(blake2s_mod_init); +module_exit(blake2s_mod_exit); + +MODULE_ALIAS_CRYPTO("blake2s-128"); +MODULE_ALIAS_CRYPTO("blake2s-128-x86"); +MODULE_ALIAS_CRYPTO("blake2s-160"); +MODULE_ALIAS_CRYPTO("blake2s-160-x86"); +MODULE_ALIAS_CRYPTO("blake2s-224"); +MODULE_ALIAS_CRYPTO("blake2s-224-x86"); +MODULE_ALIAS_CRYPTO("blake2s-256"); +MODULE_ALIAS_CRYPTO("blake2s-256-x86"); +MODULE_LICENSE("GPL v2"); diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c index 388f95a4ec24..a94e30b6f941 100644 --- a/arch/x86/crypto/chacha_glue.c +++ b/arch/x86/crypto/chacha_glue.c @@ -7,7 +7,7 @@ */ #include <crypto/algapi.h> -#include <crypto/chacha.h> +#include <crypto/internal/chacha.h> #include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <linux/kernel.h> @@ -21,24 +21,24 @@ asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds); -#ifdef CONFIG_AS_AVX2 + asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); -static bool chacha_use_avx2; -#ifdef CONFIG_AS_AVX512 + asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); -static bool chacha_use_avx512vl; -#endif -#endif + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl); static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks) { @@ -49,9 +49,8 @@ static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks) static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, int nrounds) { -#ifdef CONFIG_AS_AVX2 -#ifdef CONFIG_AS_AVX512 - if (chacha_use_avx512vl) { + if (IS_ENABLED(CONFIG_AS_AVX512) && + static_branch_likely(&chacha_use_avx512vl)) { while (bytes >= CHACHA_BLOCK_SIZE * 8) { chacha_8block_xor_avx512vl(state, dst, src, bytes, nrounds); @@ -79,8 +78,9 @@ static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src, return; } } -#endif - if (chacha_use_avx2) { + + if (IS_ENABLED(CONFIG_AS_AVX2) && + static_branch_likely(&chacha_use_avx2)) { while (bytes >= CHACHA_BLOCK_SIZE * 8) { chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); bytes -= CHACHA_BLOCK_SIZE * 8; @@ -104,7 +104,7 @@ static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src, return; } } -#endif + while (bytes >= CHACHA_BLOCK_SIZE * 4) { chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); bytes -= CHACHA_BLOCK_SIZE * 4; @@ -123,37 +123,76 @@ static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src, } } -static int chacha_simd_stream_xor(struct skcipher_walk *walk, +void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds) +{ + state = PTR_ALIGN(state, CHACHA_STATE_ALIGN); + + if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) { + hchacha_block_generic(state, stream, nrounds); + } else { + kernel_fpu_begin(); + hchacha_block_ssse3(state, stream, nrounds); + kernel_fpu_end(); + } +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv) +{ + state = PTR_ALIGN(state, CHACHA_STATE_ALIGN); + + chacha_init_generic(state, key, iv); +} +EXPORT_SYMBOL(chacha_init_arch); + +void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, + int nrounds) +{ + state = PTR_ALIGN(state, CHACHA_STATE_ALIGN); + + if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable() || + bytes <= CHACHA_BLOCK_SIZE) + return chacha_crypt_generic(state, dst, src, bytes, nrounds); + + kernel_fpu_begin(); + chacha_dosimd(state, dst, src, bytes, nrounds); + kernel_fpu_end(); +} +EXPORT_SYMBOL(chacha_crypt_arch); + +static int chacha_simd_stream_xor(struct skcipher_request *req, const struct chacha_ctx *ctx, const u8 *iv) { u32 *state, state_buf[16 + 2] __aligned(8); - int next_yield = 4096; /* bytes until next FPU yield */ - int err = 0; + struct skcipher_walk walk; + int err; + + err = skcipher_walk_virt(&walk, req, false); BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16); state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN); - crypto_chacha_init(state, ctx, iv); - - while (walk->nbytes > 0) { - unsigned int nbytes = walk->nbytes; + chacha_init_generic(state, ctx->key, iv); - if (nbytes < walk->total) { - nbytes = round_down(nbytes, walk->stride); - next_yield -= nbytes; - } + while (walk.nbytes > 0) { + unsigned int nbytes = walk.nbytes; - chacha_dosimd(state, walk->dst.virt.addr, walk->src.virt.addr, - nbytes, ctx->nrounds); + if (nbytes < walk.total) + nbytes = round_down(nbytes, walk.stride); - if (next_yield <= 0) { - /* temporarily allow preemption */ - kernel_fpu_end(); + if (!static_branch_likely(&chacha_use_simd) || + !crypto_simd_usable()) { + chacha_crypt_generic(state, walk.dst.virt.addr, + walk.src.virt.addr, nbytes, + ctx->nrounds); + } else { kernel_fpu_begin(); - next_yield = 4096; + chacha_dosimd(state, walk.dst.virt.addr, + walk.src.virt.addr, nbytes, + ctx->nrounds); + kernel_fpu_end(); } - - err = skcipher_walk_done(walk, walk->nbytes - nbytes); + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } return err; @@ -163,55 +202,34 @@ static int chacha_simd(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; - int err; - - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable()) - return crypto_chacha_crypt(req); - err = skcipher_walk_virt(&walk, req, true); - if (err) - return err; - - kernel_fpu_begin(); - err = chacha_simd_stream_xor(&walk, ctx, req->iv); - kernel_fpu_end(); - return err; + return chacha_simd_stream_xor(req, ctx, req->iv); } static int xchacha_simd(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; - struct chacha_ctx subctx; u32 *state, state_buf[16 + 2] __aligned(8); + struct chacha_ctx subctx; u8 real_iv[16]; - int err; - - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable()) - return crypto_xchacha_crypt(req); - - err = skcipher_walk_virt(&walk, req, true); - if (err) - return err; BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16); state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN); - crypto_chacha_init(state, ctx, req->iv); - - kernel_fpu_begin(); - - hchacha_block_ssse3(state, subctx.key, ctx->nrounds); + chacha_init_generic(state, ctx->key, req->iv); + + if (req->cryptlen > CHACHA_BLOCK_SIZE && crypto_simd_usable()) { + kernel_fpu_begin(); + hchacha_block_ssse3(state, subctx.key, ctx->nrounds); + kernel_fpu_end(); + } else { + hchacha_block_generic(state, subctx.key, ctx->nrounds); + } subctx.nrounds = ctx->nrounds; memcpy(&real_iv[0], req->iv + 24, 8); memcpy(&real_iv[8], req->iv + 16, 8); - err = chacha_simd_stream_xor(&walk, &subctx, real_iv); - - kernel_fpu_end(); - - return err; + return chacha_simd_stream_xor(req, &subctx, real_iv); } static struct skcipher_alg algs[] = { @@ -227,7 +245,7 @@ static struct skcipher_alg algs[] = { .max_keysize = CHACHA_KEY_SIZE, .ivsize = CHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, - .setkey = crypto_chacha20_setkey, + .setkey = chacha20_setkey, .encrypt = chacha_simd, .decrypt = chacha_simd, }, { @@ -242,7 +260,7 @@ static struct skcipher_alg algs[] = { .max_keysize = CHACHA_KEY_SIZE, .ivsize = XCHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, - .setkey = crypto_chacha20_setkey, + .setkey = chacha20_setkey, .encrypt = xchacha_simd, .decrypt = xchacha_simd, }, { @@ -257,7 +275,7 @@ static struct skcipher_alg algs[] = { .max_keysize = CHACHA_KEY_SIZE, .ivsize = XCHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, - .setkey = crypto_chacha12_setkey, + .setkey = chacha12_setkey, .encrypt = xchacha_simd, .decrypt = xchacha_simd, }, @@ -266,24 +284,28 @@ static struct skcipher_alg algs[] = { static int __init chacha_simd_mod_init(void) { if (!boot_cpu_has(X86_FEATURE_SSSE3)) - return -ENODEV; - -#ifdef CONFIG_AS_AVX2 - chacha_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) && - boot_cpu_has(X86_FEATURE_AVX2) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); -#ifdef CONFIG_AS_AVX512 - chacha_use_avx512vl = chacha_use_avx2 && - boot_cpu_has(X86_FEATURE_AVX512VL) && - boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */ -#endif -#endif + return 0; + + static_branch_enable(&chacha_use_simd); + + if (IS_ENABLED(CONFIG_AS_AVX2) && + boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { + static_branch_enable(&chacha_use_avx2); + + if (IS_ENABLED(CONFIG_AS_AVX512) && + boot_cpu_has(X86_FEATURE_AVX512VL) && + boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */ + static_branch_enable(&chacha_use_avx512vl); + } return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); } static void __exit chacha_simd_mod_fini(void) { - crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); + if (boot_cpu_has(X86_FEATURE_SSSE3)) + crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); } module_init(chacha_simd_mod_init); diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c new file mode 100644 index 000000000000..a52a3fb15727 --- /dev/null +++ b/arch/x86/crypto/curve25519-x86_64.c @@ -0,0 +1,2475 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +/* + * Copyright (c) 2017 Armando Faz <armfazh@ic.unicamp.br>. All Rights Reserved. + * Copyright (C) 2018-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + * Copyright (C) 2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved. + */ + +#include <crypto/curve25519.h> +#include <crypto/internal/kpp.h> + +#include <linux/types.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/module.h> + +#include <asm/cpufeature.h> +#include <asm/processor.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_adx); + +enum { NUM_WORDS_ELTFP25519 = 4 }; +typedef __aligned(32) u64 eltfp25519_1w[NUM_WORDS_ELTFP25519]; +typedef __aligned(32) u64 eltfp25519_1w_buffer[2 * NUM_WORDS_ELTFP25519]; + +#define mul_eltfp25519_1w_adx(c, a, b) do { \ + mul_256x256_integer_adx(m.buffer, a, b); \ + red_eltfp25519_1w_adx(c, m.buffer); \ +} while (0) + +#define mul_eltfp25519_1w_bmi2(c, a, b) do { \ + mul_256x256_integer_bmi2(m.buffer, a, b); \ + red_eltfp25519_1w_bmi2(c, m.buffer); \ +} while (0) + +#define sqr_eltfp25519_1w_adx(a) do { \ + sqr_256x256_integer_adx(m.buffer, a); \ + red_eltfp25519_1w_adx(a, m.buffer); \ +} while (0) + +#define sqr_eltfp25519_1w_bmi2(a) do { \ + sqr_256x256_integer_bmi2(m.buffer, a); \ + red_eltfp25519_1w_bmi2(a, m.buffer); \ +} while (0) + +#define mul_eltfp25519_2w_adx(c, a, b) do { \ + mul2_256x256_integer_adx(m.buffer, a, b); \ + red_eltfp25519_2w_adx(c, m.buffer); \ +} while (0) + +#define mul_eltfp25519_2w_bmi2(c, a, b) do { \ + mul2_256x256_integer_bmi2(m.buffer, a, b); \ + red_eltfp25519_2w_bmi2(c, m.buffer); \ +} while (0) + +#define sqr_eltfp25519_2w_adx(a) do { \ + sqr2_256x256_integer_adx(m.buffer, a); \ + red_eltfp25519_2w_adx(a, m.buffer); \ +} while (0) + +#define sqr_eltfp25519_2w_bmi2(a) do { \ + sqr2_256x256_integer_bmi2(m.buffer, a); \ + red_eltfp25519_2w_bmi2(a, m.buffer); \ +} while (0) + +#define sqrn_eltfp25519_1w_adx(a, times) do { \ + int ____counter = (times); \ + while (____counter-- > 0) \ + sqr_eltfp25519_1w_adx(a); \ +} while (0) + +#define sqrn_eltfp25519_1w_bmi2(a, times) do { \ + int ____counter = (times); \ + while (____counter-- > 0) \ + sqr_eltfp25519_1w_bmi2(a); \ +} while (0) + +#define copy_eltfp25519_1w(C, A) do { \ + (C)[0] = (A)[0]; \ + (C)[1] = (A)[1]; \ + (C)[2] = (A)[2]; \ + (C)[3] = (A)[3]; \ +} while (0) + +#define setzero_eltfp25519_1w(C) do { \ + (C)[0] = 0; \ + (C)[1] = 0; \ + (C)[2] = 0; \ + (C)[3] = 0; \ +} while (0) + +__aligned(32) static const u64 table_ladder_8k[252 * NUM_WORDS_ELTFP25519] = { + /* 1 */ 0xfffffffffffffff3UL, 0xffffffffffffffffUL, + 0xffffffffffffffffUL, 0x5fffffffffffffffUL, + /* 2 */ 0x6b8220f416aafe96UL, 0x82ebeb2b4f566a34UL, + 0xd5a9a5b075a5950fUL, 0x5142b2cf4b2488f4UL, + /* 3 */ 0x6aaebc750069680cUL, 0x89cf7820a0f99c41UL, + 0x2a58d9183b56d0f4UL, 0x4b5aca80e36011a4UL, + /* 4 */ 0x329132348c29745dUL, 0xf4a2e616e1642fd7UL, + 0x1e45bb03ff67bc34UL, 0x306912d0f42a9b4aUL, + /* 5 */ 0xff886507e6af7154UL, 0x04f50e13dfeec82fUL, + 0xaa512fe82abab5ceUL, 0x174e251a68d5f222UL, + /* 6 */ 0xcf96700d82028898UL, 0x1743e3370a2c02c5UL, + 0x379eec98b4e86eaaUL, 0x0c59888a51e0482eUL, + /* 7 */ 0xfbcbf1d699b5d189UL, 0xacaef0d58e9fdc84UL, + 0xc1c20d06231f7614UL, 0x2938218da274f972UL, + /* 8 */ 0xf6af49beff1d7f18UL, 0xcc541c22387ac9c2UL, + 0x96fcc9ef4015c56bUL, 0x69c1627c690913a9UL, + /* 9 */ 0x7a86fd2f4733db0eUL, 0xfdb8c4f29e087de9UL, + 0x095e4b1a8ea2a229UL, 0x1ad7a7c829b37a79UL, + /* 10 */ 0x342d89cad17ea0c0UL, 0x67bedda6cced2051UL, + 0x19ca31bf2bb42f74UL, 0x3df7b4c84980acbbUL, + /* 11 */ 0xa8c6444dc80ad883UL, 0xb91e440366e3ab85UL, + 0xc215cda00164f6d8UL, 0x3d867c6ef247e668UL, + /* 12 */ 0xc7dd582bcc3e658cUL, 0xfd2c4748ee0e5528UL, + 0xa0fd9b95cc9f4f71UL, 0x7529d871b0675ddfUL, + /* 13 */ 0xb8f568b42d3cbd78UL, 0x1233011b91f3da82UL, + 0x2dce6ccd4a7c3b62UL, 0x75e7fc8e9e498603UL, + /* 14 */ 0x2f4f13f1fcd0b6ecUL, 0xf1a8ca1f29ff7a45UL, + 0xc249c1a72981e29bUL, 0x6ebe0dbb8c83b56aUL, + /* 15 */ 0x7114fa8d170bb222UL, 0x65a2dcd5bf93935fUL, + 0xbdc41f68b59c979aUL, 0x2f0eef79a2ce9289UL, + /* 16 */ 0x42ecbf0c083c37ceUL, 0x2930bc09ec496322UL, + 0xf294b0c19cfeac0dUL, 0x3780aa4bedfabb80UL, + /* 17 */ 0x56c17d3e7cead929UL, 0xe7cb4beb2e5722c5UL, + 0x0ce931732dbfe15aUL, 0x41b883c7621052f8UL, + /* 18 */ 0xdbf75ca0c3d25350UL, 0x2936be086eb1e351UL, + 0xc936e03cb4a9b212UL, 0x1d45bf82322225aaUL, + /* 19 */ 0xe81ab1036a024cc5UL, 0xe212201c304c9a72UL, + 0xc5d73fba6832b1fcUL, 0x20ffdb5a4d839581UL, + /* 20 */ 0xa283d367be5d0fadUL, 0x6c2b25ca8b164475UL, + 0x9d4935467caaf22eUL, 0x5166408eee85ff49UL, + /* 21 */ 0x3c67baa2fab4e361UL, 0xb3e433c67ef35cefUL, + 0x5259729241159b1cUL, 0x6a621892d5b0ab33UL, + /* 22 */ 0x20b74a387555cdcbUL, 0x532aa10e1208923fUL, + 0xeaa17b7762281dd1UL, 0x61ab3443f05c44bfUL, + /* 23 */ 0x257a6c422324def8UL, 0x131c6c1017e3cf7fUL, + 0x23758739f630a257UL, 0x295a407a01a78580UL, + /* 24 */ 0xf8c443246d5da8d9UL, 0x19d775450c52fa5dUL, + 0x2afcfc92731bf83dUL, 0x7d10c8e81b2b4700UL, + /* 25 */ 0xc8e0271f70baa20bUL, 0x993748867ca63957UL, + 0x5412efb3cb7ed4bbUL, 0x3196d36173e62975UL, + /* 26 */ 0xde5bcad141c7dffcUL, 0x47cc8cd2b395c848UL, + 0xa34cd942e11af3cbUL, 0x0256dbf2d04ecec2UL, + /* 27 */ 0x875ab7e94b0e667fUL, 0xcad4dd83c0850d10UL, + 0x47f12e8f4e72c79fUL, 0x5f1a87bb8c85b19bUL, + /* 28 */ 0x7ae9d0b6437f51b8UL, 0x12c7ce5518879065UL, + 0x2ade09fe5cf77aeeUL, 0x23a05a2f7d2c5627UL, + /* 29 */ 0x5908e128f17c169aUL, 0xf77498dd8ad0852dUL, + 0x74b4c4ceab102f64UL, 0x183abadd10139845UL, + /* 30 */ 0xb165ba8daa92aaacUL, 0xd5c5ef9599386705UL, + 0xbe2f8f0cf8fc40d1UL, 0x2701e635ee204514UL, + /* 31 */ 0x629fa80020156514UL, 0xf223868764a8c1ceUL, + 0x5b894fff0b3f060eUL, 0x60d9944cf708a3faUL, + /* 32 */ 0xaeea001a1c7a201fUL, 0xebf16a633ee2ce63UL, + 0x6f7709594c7a07e1UL, 0x79b958150d0208cbUL, + /* 33 */ 0x24b55e5301d410e7UL, 0xe3a34edff3fdc84dUL, + 0xd88768e4904032d8UL, 0x131384427b3aaeecUL, + /* 34 */ 0x8405e51286234f14UL, 0x14dc4739adb4c529UL, + 0xb8a2b5b250634ffdUL, 0x2fe2a94ad8a7ff93UL, + /* 35 */ 0xec5c57efe843faddUL, 0x2843ce40f0bb9918UL, + 0xa4b561d6cf3d6305UL, 0x743629bde8fb777eUL, + /* 36 */ 0x343edd46bbaf738fUL, 0xed981828b101a651UL, + 0xa401760b882c797aUL, 0x1fc223e28dc88730UL, + /* 37 */ 0x48604e91fc0fba0eUL, 0xb637f78f052c6fa4UL, + 0x91ccac3d09e9239cUL, 0x23f7eed4437a687cUL, + /* 38 */ 0x5173b1118d9bd800UL, 0x29d641b63189d4a7UL, + 0xfdbf177988bbc586UL, 0x2959894fcad81df5UL, + /* 39 */ 0xaebc8ef3b4bbc899UL, 0x4148995ab26992b9UL, + 0x24e20b0134f92cfbUL, 0x40d158894a05dee8UL, + /* 40 */ 0x46b00b1185af76f6UL, 0x26bac77873187a79UL, + 0x3dc0bf95ab8fff5fUL, 0x2a608bd8945524d7UL, + /* 41 */ 0x26449588bd446302UL, 0x7c4bc21c0388439cUL, + 0x8e98a4f383bd11b2UL, 0x26218d7bc9d876b9UL, + /* 42 */ 0xe3081542997c178aUL, 0x3c2d29a86fb6606fUL, + 0x5c217736fa279374UL, 0x7dde05734afeb1faUL, + /* 43 */ 0x3bf10e3906d42babUL, 0xe4f7803e1980649cUL, + 0xe6053bf89595bf7aUL, 0x394faf38da245530UL, + /* 44 */ 0x7a8efb58896928f4UL, 0xfbc778e9cc6a113cUL, + 0x72670ce330af596fUL, 0x48f222a81d3d6cf7UL, + /* 45 */ 0xf01fce410d72caa7UL, 0x5a20ecc7213b5595UL, + 0x7bc21165c1fa1483UL, 0x07f89ae31da8a741UL, + /* 46 */ 0x05d2c2b4c6830ff9UL, 0xd43e330fc6316293UL, + 0xa5a5590a96d3a904UL, 0x705edb91a65333b6UL, + /* 47 */ 0x048ee15e0bb9a5f7UL, 0x3240cfca9e0aaf5dUL, + 0x8f4b71ceedc4a40bUL, 0x621c0da3de544a6dUL, + /* 48 */ 0x92872836a08c4091UL, 0xce8375b010c91445UL, + 0x8a72eb524f276394UL, 0x2667fcfa7ec83635UL, + /* 49 */ 0x7f4c173345e8752aUL, 0x061b47feee7079a5UL, + 0x25dd9afa9f86ff34UL, 0x3780cef5425dc89cUL, + /* 50 */ 0x1a46035a513bb4e9UL, 0x3e1ef379ac575adaUL, + 0xc78c5f1c5fa24b50UL, 0x321a967634fd9f22UL, + /* 51 */ 0x946707b8826e27faUL, 0x3dca84d64c506fd0UL, + 0xc189218075e91436UL, 0x6d9284169b3b8484UL, + /* 52 */ 0x3a67e840383f2ddfUL, 0x33eec9a30c4f9b75UL, + 0x3ec7c86fa783ef47UL, 0x26ec449fbac9fbc4UL, + /* 53 */ 0x5c0f38cba09b9e7dUL, 0x81168cc762a3478cUL, + 0x3e23b0d306fc121cUL, 0x5a238aa0a5efdcddUL, + /* 54 */ 0x1ba26121c4ea43ffUL, 0x36f8c77f7c8832b5UL, + 0x88fbea0b0adcf99aUL, 0x5ca9938ec25bebf9UL, + /* 55 */ 0xd5436a5e51fccda0UL, 0x1dbc4797c2cd893bUL, + 0x19346a65d3224a08UL, 0x0f5034e49b9af466UL, + /* 56 */ 0xf23c3967a1e0b96eUL, 0xe58b08fa867a4d88UL, + 0xfb2fabc6a7341679UL, 0x2a75381eb6026946UL, + /* 57 */ 0xc80a3be4c19420acUL, 0x66b1f6c681f2b6dcUL, + 0x7cf7036761e93388UL, 0x25abbbd8a660a4c4UL, + /* 58 */ 0x91ea12ba14fd5198UL, 0x684950fc4a3cffa9UL, + 0xf826842130f5ad28UL, 0x3ea988f75301a441UL, + /* 59 */ 0xc978109a695f8c6fUL, 0x1746eb4a0530c3f3UL, + 0x444d6d77b4459995UL, 0x75952b8c054e5cc7UL, + /* 60 */ 0xa3703f7915f4d6aaUL, 0x66c346202f2647d8UL, + 0xd01469df811d644bUL, 0x77fea47d81a5d71fUL, + /* 61 */ 0xc5e9529ef57ca381UL, 0x6eeeb4b9ce2f881aUL, + 0xb6e91a28e8009bd6UL, 0x4b80be3e9afc3fecUL, + /* 62 */ 0x7e3773c526aed2c5UL, 0x1b4afcb453c9a49dUL, + 0xa920bdd7baffb24dUL, 0x7c54699f122d400eUL, + /* 63 */ 0xef46c8e14fa94bc8UL, 0xe0b074ce2952ed5eUL, + 0xbea450e1dbd885d5UL, 0x61b68649320f712cUL, + /* 64 */ 0x8a485f7309ccbdd1UL, 0xbd06320d7d4d1a2dUL, + 0x25232973322dbef4UL, 0x445dc4758c17f770UL, + /* 65 */ 0xdb0434177cc8933cUL, 0xed6fe82175ea059fUL, + 0x1efebefdc053db34UL, 0x4adbe867c65daf99UL, + /* 66 */ 0x3acd71a2a90609dfUL, 0xe5e991856dd04050UL, + 0x1ec69b688157c23cUL, 0x697427f6885cfe4dUL, + /* 67 */ 0xd7be7b9b65e1a851UL, 0xa03d28d522c536ddUL, + 0x28399d658fd2b645UL, 0x49e5b7e17c2641e1UL, + /* 68 */ 0x6f8c3a98700457a4UL, 0x5078f0a25ebb6778UL, + 0xd13c3ccbc382960fUL, 0x2e003258a7df84b1UL, + /* 69 */ 0x8ad1f39be6296a1cUL, 0xc1eeaa652a5fbfb2UL, + 0x33ee0673fd26f3cbUL, 0x59256173a69d2cccUL, + /* 70 */ 0x41ea07aa4e18fc41UL, 0xd9fc19527c87a51eUL, + 0xbdaacb805831ca6fUL, 0x445b652dc916694fUL, + /* 71 */ 0xce92a3a7f2172315UL, 0x1edc282de11b9964UL, + 0xa1823aafe04c314aUL, 0x790a2d94437cf586UL, + /* 72 */ 0x71c447fb93f6e009UL, 0x8922a56722845276UL, + 0xbf70903b204f5169UL, 0x2f7a89891ba319feUL, + /* 73 */ 0x02a08eb577e2140cUL, 0xed9a4ed4427bdcf4UL, + 0x5253ec44e4323cd1UL, 0x3e88363c14e9355bUL, + /* 74 */ 0xaa66c14277110b8cUL, 0x1ae0391610a23390UL, + 0x2030bd12c93fc2a2UL, 0x3ee141579555c7abUL, + /* 75 */ 0x9214de3a6d6e7d41UL, 0x3ccdd88607f17efeUL, + 0x674f1288f8e11217UL, 0x5682250f329f93d0UL, + /* 76 */ 0x6cf00b136d2e396eUL, 0x6e4cf86f1014debfUL, + 0x5930b1b5bfcc4e83UL, 0x047069b48aba16b6UL, + /* 77 */ 0x0d4ce4ab69b20793UL, 0xb24db91a97d0fb9eUL, + 0xcdfa50f54e00d01dUL, 0x221b1085368bddb5UL, + /* 78 */ 0xe7e59468b1e3d8d2UL, 0x53c56563bd122f93UL, + 0xeee8a903e0663f09UL, 0x61efa662cbbe3d42UL, + /* 79 */ 0x2cf8ddddde6eab2aUL, 0x9bf80ad51435f231UL, + 0x5deadacec9f04973UL, 0x29275b5d41d29b27UL, + /* 80 */ 0xcfde0f0895ebf14fUL, 0xb9aab96b054905a7UL, + 0xcae80dd9a1c420fdUL, 0x0a63bf2f1673bbc7UL, + /* 81 */ 0x092f6e11958fbc8cUL, 0x672a81e804822fadUL, + 0xcac8351560d52517UL, 0x6f3f7722c8f192f8UL, + /* 82 */ 0xf8ba90ccc2e894b7UL, 0x2c7557a438ff9f0dUL, + 0x894d1d855ae52359UL, 0x68e122157b743d69UL, + /* 83 */ 0xd87e5570cfb919f3UL, 0x3f2cdecd95798db9UL, + 0x2121154710c0a2ceUL, 0x3c66a115246dc5b2UL, + /* 84 */ 0xcbedc562294ecb72UL, 0xba7143c36a280b16UL, + 0x9610c2efd4078b67UL, 0x6144735d946a4b1eUL, + /* 85 */ 0x536f111ed75b3350UL, 0x0211db8c2041d81bUL, + 0xf93cb1000e10413cUL, 0x149dfd3c039e8876UL, + /* 86 */ 0xd479dde46b63155bUL, 0xb66e15e93c837976UL, + 0xdafde43b1f13e038UL, 0x5fafda1a2e4b0b35UL, + /* 87 */ 0x3600bbdf17197581UL, 0x3972050bbe3cd2c2UL, + 0x5938906dbdd5be86UL, 0x34fce5e43f9b860fUL, + /* 88 */ 0x75a8a4cd42d14d02UL, 0x828dabc53441df65UL, + 0x33dcabedd2e131d3UL, 0x3ebad76fb814d25fUL, + /* 89 */ 0xd4906f566f70e10fUL, 0x5d12f7aa51690f5aUL, + 0x45adb16e76cefcf2UL, 0x01f768aead232999UL, + /* 90 */ 0x2b6cc77b6248febdUL, 0x3cd30628ec3aaffdUL, + 0xce1c0b80d4ef486aUL, 0x4c3bff2ea6f66c23UL, + /* 91 */ 0x3f2ec4094aeaeb5fUL, 0x61b19b286e372ca7UL, + 0x5eefa966de2a701dUL, 0x23b20565de55e3efUL, + /* 92 */ 0xe301ca5279d58557UL, 0x07b2d4ce27c2874fUL, + 0xa532cd8a9dcf1d67UL, 0x2a52fee23f2bff56UL, + /* 93 */ 0x8624efb37cd8663dUL, 0xbbc7ac20ffbd7594UL, + 0x57b85e9c82d37445UL, 0x7b3052cb86a6ec66UL, + /* 94 */ 0x3482f0ad2525e91eUL, 0x2cb68043d28edca0UL, + 0xaf4f6d052e1b003aUL, 0x185f8c2529781b0aUL, + /* 95 */ 0xaa41de5bd80ce0d6UL, 0x9407b2416853e9d6UL, + 0x563ec36e357f4c3aUL, 0x4cc4b8dd0e297bceUL, + /* 96 */ 0xa2fc1a52ffb8730eUL, 0x1811f16e67058e37UL, + 0x10f9a366cddf4ee1UL, 0x72f4a0c4a0b9f099UL, + /* 97 */ 0x8c16c06f663f4ea7UL, 0x693b3af74e970fbaUL, + 0x2102e7f1d69ec345UL, 0x0ba53cbc968a8089UL, + /* 98 */ 0xca3d9dc7fea15537UL, 0x4c6824bb51536493UL, + 0xb9886314844006b1UL, 0x40d2a72ab454cc60UL, + /* 99 */ 0x5936a1b712570975UL, 0x91b9d648debda657UL, + 0x3344094bb64330eaUL, 0x006ba10d12ee51d0UL, + /* 100 */ 0x19228468f5de5d58UL, 0x0eb12f4c38cc05b0UL, + 0xa1039f9dd5601990UL, 0x4502d4ce4fff0e0bUL, + /* 101 */ 0xeb2054106837c189UL, 0xd0f6544c6dd3b93cUL, + 0x40727064c416d74fUL, 0x6e15c6114b502ef0UL, + /* 102 */ 0x4df2a398cfb1a76bUL, 0x11256c7419f2f6b1UL, + 0x4a497962066e6043UL, 0x705b3aab41355b44UL, + /* 103 */ 0x365ef536d797b1d8UL, 0x00076bd622ddf0dbUL, + 0x3bbf33b0e0575a88UL, 0x3777aa05c8e4ca4dUL, + /* 104 */ 0x392745c85578db5fUL, 0x6fda4149dbae5ae2UL, + 0xb1f0b00b8adc9867UL, 0x09963437d36f1da3UL, + /* 105 */ 0x7e824e90a5dc3853UL, 0xccb5f6641f135cbdUL, + 0x6736d86c87ce8fccUL, 0x625f3ce26604249fUL, + /* 106 */ 0xaf8ac8059502f63fUL, 0x0c05e70a2e351469UL, + 0x35292e9c764b6305UL, 0x1a394360c7e23ac3UL, + /* 107 */ 0xd5c6d53251183264UL, 0x62065abd43c2b74fUL, + 0xb5fbf5d03b973f9bUL, 0x13a3da3661206e5eUL, + /* 108 */ 0xc6bd5837725d94e5UL, 0x18e30912205016c5UL, + 0x2088ce1570033c68UL, 0x7fba1f495c837987UL, + /* 109 */ 0x5a8c7423f2f9079dUL, 0x1735157b34023fc5UL, + 0xe4f9b49ad2fab351UL, 0x6691ff72c878e33cUL, + /* 110 */ 0x122c2adedc5eff3eUL, 0xf8dd4bf1d8956cf4UL, + 0xeb86205d9e9e5bdaUL, 0x049b92b9d975c743UL, + /* 111 */ 0xa5379730b0f6c05aUL, 0x72a0ffacc6f3a553UL, + 0xb0032c34b20dcd6dUL, 0x470e9dbc88d5164aUL, + /* 112 */ 0xb19cf10ca237c047UL, 0xb65466711f6c81a2UL, + 0xb3321bd16dd80b43UL, 0x48c14f600c5fbe8eUL, + /* 113 */ 0x66451c264aa6c803UL, 0xb66e3904a4fa7da6UL, + 0xd45f19b0b3128395UL, 0x31602627c3c9bc10UL, + /* 114 */ 0x3120dc4832e4e10dUL, 0xeb20c46756c717f7UL, + 0x00f52e3f67280294UL, 0x566d4fc14730c509UL, + /* 115 */ 0x7e3a5d40fd837206UL, 0xc1e926dc7159547aUL, + 0x216730fba68d6095UL, 0x22e8c3843f69cea7UL, + /* 116 */ 0x33d074e8930e4b2bUL, 0xb6e4350e84d15816UL, + 0x5534c26ad6ba2365UL, 0x7773c12f89f1f3f3UL, + /* 117 */ 0x8cba404da57962aaUL, 0x5b9897a81999ce56UL, + 0x508e862f121692fcUL, 0x3a81907fa093c291UL, + /* 118 */ 0x0dded0ff4725a510UL, 0x10d8cc10673fc503UL, + 0x5b9d151c9f1f4e89UL, 0x32a5c1d5cb09a44cUL, + /* 119 */ 0x1e0aa442b90541fbUL, 0x5f85eb7cc1b485dbUL, + 0xbee595ce8a9df2e5UL, 0x25e496c722422236UL, + /* 120 */ 0x5edf3c46cd0fe5b9UL, 0x34e75a7ed2a43388UL, + 0xe488de11d761e352UL, 0x0e878a01a085545cUL, + /* 121 */ 0xba493c77e021bb04UL, 0x2b4d1843c7df899aUL, + 0x9ea37a487ae80d67UL, 0x67a9958011e41794UL, + /* 122 */ 0x4b58051a6697b065UL, 0x47e33f7d8d6ba6d4UL, + 0xbb4da8d483ca46c1UL, 0x68becaa181c2db0dUL, + /* 123 */ 0x8d8980e90b989aa5UL, 0xf95eb14a2c93c99bUL, + 0x51c6c7c4796e73a2UL, 0x6e228363b5efb569UL, + /* 124 */ 0xc6bbc0b02dd624c8UL, 0x777eb47dec8170eeUL, + 0x3cde15a004cfafa9UL, 0x1dc6bc087160bf9bUL, + /* 125 */ 0x2e07e043eec34002UL, 0x18e9fc677a68dc7fUL, + 0xd8da03188bd15b9aUL, 0x48fbc3bb00568253UL, + /* 126 */ 0x57547d4cfb654ce1UL, 0xd3565b82a058e2adUL, + 0xf63eaf0bbf154478UL, 0x47531ef114dfbb18UL, + /* 127 */ 0xe1ec630a4278c587UL, 0x5507d546ca8e83f3UL, + 0x85e135c63adc0c2bUL, 0x0aa7efa85682844eUL, + /* 128 */ 0x72691ba8b3e1f615UL, 0x32b4e9701fbe3ffaUL, + 0x97b6d92e39bb7868UL, 0x2cfe53dea02e39e8UL, + /* 129 */ 0x687392cd85cd52b0UL, 0x27ff66c910e29831UL, + 0x97134556a9832d06UL, 0x269bb0360a84f8a0UL, + /* 130 */ 0x706e55457643f85cUL, 0x3734a48c9b597d1bUL, + 0x7aee91e8c6efa472UL, 0x5cd6abc198a9d9e0UL, + /* 131 */ 0x0e04de06cb3ce41aUL, 0xd8c6eb893402e138UL, + 0x904659bb686e3772UL, 0x7215c371746ba8c8UL, + /* 132 */ 0xfd12a97eeae4a2d9UL, 0x9514b7516394f2c5UL, + 0x266fd5809208f294UL, 0x5c847085619a26b9UL, + /* 133 */ 0x52985410fed694eaUL, 0x3c905b934a2ed254UL, + 0x10bb47692d3be467UL, 0x063b3d2d69e5e9e1UL, + /* 134 */ 0x472726eedda57debUL, 0xefb6c4ae10f41891UL, + 0x2b1641917b307614UL, 0x117c554fc4f45b7cUL, + /* 135 */ 0xc07cf3118f9d8812UL, 0x01dbd82050017939UL, + 0xd7e803f4171b2827UL, 0x1015e87487d225eaUL, + /* 136 */ 0xc58de3fed23acc4dUL, 0x50db91c294a7be2dUL, + 0x0b94d43d1c9cf457UL, 0x6b1640fa6e37524aUL, + /* 137 */ 0x692f346c5fda0d09UL, 0x200b1c59fa4d3151UL, + 0xb8c46f760777a296UL, 0x4b38395f3ffdfbcfUL, + /* 138 */ 0x18d25e00be54d671UL, 0x60d50582bec8aba6UL, + 0x87ad8f263b78b982UL, 0x50fdf64e9cda0432UL, + /* 139 */ 0x90f567aac578dcf0UL, 0xef1e9b0ef2a3133bUL, + 0x0eebba9242d9de71UL, 0x15473c9bf03101c7UL, + /* 140 */ 0x7c77e8ae56b78095UL, 0xb678e7666e6f078eUL, + 0x2da0b9615348ba1fUL, 0x7cf931c1ff733f0bUL, + /* 141 */ 0x26b357f50a0a366cUL, 0xe9708cf42b87d732UL, + 0xc13aeea5f91cb2c0UL, 0x35d90c991143bb4cUL, + /* 142 */ 0x47c1c404a9a0d9dcUL, 0x659e58451972d251UL, + 0x3875a8c473b38c31UL, 0x1fbd9ed379561f24UL, + /* 143 */ 0x11fabc6fd41ec28dUL, 0x7ef8dfe3cd2a2dcaUL, + 0x72e73b5d8c404595UL, 0x6135fa4954b72f27UL, + /* 144 */ 0xccfc32a2de24b69cUL, 0x3f55698c1f095d88UL, + 0xbe3350ed5ac3f929UL, 0x5e9bf806ca477eebUL, + /* 145 */ 0xe9ce8fb63c309f68UL, 0x5376f63565e1f9f4UL, + 0xd1afcfb35a6393f1UL, 0x6632a1ede5623506UL, + /* 146 */ 0x0b7d6c390c2ded4cUL, 0x56cb3281df04cb1fUL, + 0x66305a1249ecc3c7UL, 0x5d588b60a38ca72aUL, + /* 147 */ 0xa6ecbf78e8e5f42dUL, 0x86eeb44b3c8a3eecUL, + 0xec219c48fbd21604UL, 0x1aaf1af517c36731UL, + /* 148 */ 0xc306a2836769bde7UL, 0x208280622b1e2adbUL, + 0x8027f51ffbff94a6UL, 0x76cfa1ce1124f26bUL, + /* 149 */ 0x18eb00562422abb6UL, 0xf377c4d58f8c29c3UL, + 0x4dbbc207f531561aUL, 0x0253b7f082128a27UL, + /* 150 */ 0x3d1f091cb62c17e0UL, 0x4860e1abd64628a9UL, + 0x52d17436309d4253UL, 0x356f97e13efae576UL, + /* 151 */ 0xd351e11aa150535bUL, 0x3e6b45bb1dd878ccUL, + 0x0c776128bed92c98UL, 0x1d34ae93032885b8UL, + /* 152 */ 0x4ba0488ca85ba4c3UL, 0x985348c33c9ce6ceUL, + 0x66124c6f97bda770UL, 0x0f81a0290654124aUL, + /* 153 */ 0x9ed09ca6569b86fdUL, 0x811009fd18af9a2dUL, + 0xff08d03f93d8c20aUL, 0x52a148199faef26bUL, + /* 154 */ 0x3e03f9dc2d8d1b73UL, 0x4205801873961a70UL, + 0xc0d987f041a35970UL, 0x07aa1f15a1c0d549UL, + /* 155 */ 0xdfd46ce08cd27224UL, 0x6d0a024f934e4239UL, + 0x808a7a6399897b59UL, 0x0a4556e9e13d95a2UL, + /* 156 */ 0xd21a991fe9c13045UL, 0x9b0e8548fe7751b8UL, + 0x5da643cb4bf30035UL, 0x77db28d63940f721UL, + /* 157 */ 0xfc5eeb614adc9011UL, 0x5229419ae8c411ebUL, + 0x9ec3e7787d1dcf74UL, 0x340d053e216e4cb5UL, + /* 158 */ 0xcac7af39b48df2b4UL, 0xc0faec2871a10a94UL, + 0x140a69245ca575edUL, 0x0cf1c37134273a4cUL, + /* 159 */ 0xc8ee306ac224b8a5UL, 0x57eaee7ccb4930b0UL, + 0xa1e806bdaacbe74fUL, 0x7d9a62742eeb657dUL, + /* 160 */ 0x9eb6b6ef546c4830UL, 0x885cca1fddb36e2eUL, + 0xe6b9f383ef0d7105UL, 0x58654fef9d2e0412UL, + /* 161 */ 0xa905c4ffbe0e8e26UL, 0x942de5df9b31816eUL, + 0x497d723f802e88e1UL, 0x30684dea602f408dUL, + /* 162 */ 0x21e5a278a3e6cb34UL, 0xaefb6e6f5b151dc4UL, + 0xb30b8e049d77ca15UL, 0x28c3c9cf53b98981UL, + /* 163 */ 0x287fb721556cdd2aUL, 0x0d317ca897022274UL, + 0x7468c7423a543258UL, 0x4a7f11464eb5642fUL, + /* 164 */ 0xa237a4774d193aa6UL, 0xd865986ea92129a1UL, + 0x24c515ecf87c1a88UL, 0x604003575f39f5ebUL, + /* 165 */ 0x47b9f189570a9b27UL, 0x2b98cede465e4b78UL, + 0x026df551dbb85c20UL, 0x74fcd91047e21901UL, + /* 166 */ 0x13e2a90a23c1bfa3UL, 0x0cb0074e478519f6UL, + 0x5ff1cbbe3af6cf44UL, 0x67fe5438be812dbeUL, + /* 167 */ 0xd13cf64fa40f05b0UL, 0x054dfb2f32283787UL, + 0x4173915b7f0d2aeaUL, 0x482f144f1f610d4eUL, + /* 168 */ 0xf6210201b47f8234UL, 0x5d0ae1929e70b990UL, + 0xdcd7f455b049567cUL, 0x7e93d0f1f0916f01UL, + /* 169 */ 0xdd79cbf18a7db4faUL, 0xbe8391bf6f74c62fUL, + 0x027145d14b8291bdUL, 0x585a73ea2cbf1705UL, + /* 170 */ 0x485ca03e928a0db2UL, 0x10fc01a5742857e7UL, + 0x2f482edbd6d551a7UL, 0x0f0433b5048fdb8aUL, + /* 171 */ 0x60da2e8dd7dc6247UL, 0x88b4c9d38cd4819aUL, + 0x13033ac001f66697UL, 0x273b24fe3b367d75UL, + /* 172 */ 0xc6e8f66a31b3b9d4UL, 0x281514a494df49d5UL, + 0xd1726fdfc8b23da7UL, 0x4b3ae7d103dee548UL, + /* 173 */ 0xc6256e19ce4b9d7eUL, 0xff5c5cf186e3c61cUL, + 0xacc63ca34b8ec145UL, 0x74621888fee66574UL, + /* 174 */ 0x956f409645290a1eUL, 0xef0bf8e3263a962eUL, + 0xed6a50eb5ec2647bUL, 0x0694283a9dca7502UL, + /* 175 */ 0x769b963643a2dcd1UL, 0x42b7c8ea09fc5353UL, + 0x4f002aee13397eabUL, 0x63005e2c19b7d63aUL, + /* 176 */ 0xca6736da63023beaUL, 0x966c7f6db12a99b7UL, + 0xace09390c537c5e1UL, 0x0b696063a1aa89eeUL, + /* 177 */ 0xebb03e97288c56e5UL, 0x432a9f9f938c8be8UL, + 0xa6a5a93d5b717f71UL, 0x1a5fb4c3e18f9d97UL, + /* 178 */ 0x1c94e7ad1c60cdceUL, 0xee202a43fc02c4a0UL, + 0x8dafe4d867c46a20UL, 0x0a10263c8ac27b58UL, + /* 179 */ 0xd0dea9dfe4432a4aUL, 0x856af87bbe9277c5UL, + 0xce8472acc212c71aUL, 0x6f151b6d9bbb1e91UL, + /* 180 */ 0x26776c527ceed56aUL, 0x7d211cb7fbf8faecUL, + 0x37ae66a6fd4609ccUL, 0x1f81b702d2770c42UL, + /* 181 */ 0x2fb0b057eac58392UL, 0xe1dd89fe29744e9dUL, + 0xc964f8eb17beb4f8UL, 0x29571073c9a2d41eUL, + /* 182 */ 0xa948a18981c0e254UL, 0x2df6369b65b22830UL, + 0xa33eb2d75fcfd3c6UL, 0x078cd6ec4199a01fUL, + /* 183 */ 0x4a584a41ad900d2fUL, 0x32142b78e2c74c52UL, + 0x68c4e8338431c978UL, 0x7f69ea9008689fc2UL, + /* 184 */ 0x52f2c81e46a38265UL, 0xfd78072d04a832fdUL, + 0x8cd7d5fa25359e94UL, 0x4de71b7454cc29d2UL, + /* 185 */ 0x42eb60ad1eda6ac9UL, 0x0aad37dfdbc09c3aUL, + 0x81004b71e33cc191UL, 0x44e6be345122803cUL, + /* 186 */ 0x03fe8388ba1920dbUL, 0xf5d57c32150db008UL, + 0x49c8c4281af60c29UL, 0x21edb518de701aeeUL, + /* 187 */ 0x7fb63e418f06dc99UL, 0xa4460d99c166d7b8UL, + 0x24dd5248ce520a83UL, 0x5ec3ad712b928358UL, + /* 188 */ 0x15022a5fbd17930fUL, 0xa4f64a77d82570e3UL, + 0x12bc8d6915783712UL, 0x498194c0fc620abbUL, + /* 189 */ 0x38a2d9d255686c82UL, 0x785c6bd9193e21f0UL, + 0xe4d5c81ab24a5484UL, 0x56307860b2e20989UL, + /* 190 */ 0x429d55f78b4d74c4UL, 0x22f1834643350131UL, + 0x1e60c24598c71fffUL, 0x59f2f014979983efUL, + /* 191 */ 0x46a47d56eb494a44UL, 0x3e22a854d636a18eUL, + 0xb346e15274491c3bUL, 0x2ceafd4e5390cde7UL, + /* 192 */ 0xba8a8538be0d6675UL, 0x4b9074bb50818e23UL, + 0xcbdab89085d304c3UL, 0x61a24fe0e56192c4UL, + /* 193 */ 0xcb7615e6db525bcbUL, 0xdd7d8c35a567e4caUL, + 0xe6b4153acafcdd69UL, 0x2d668e097f3c9766UL, + /* 194 */ 0xa57e7e265ce55ef0UL, 0x5d9f4e527cd4b967UL, + 0xfbc83606492fd1e5UL, 0x090d52beb7c3f7aeUL, + /* 195 */ 0x09b9515a1e7b4d7cUL, 0x1f266a2599da44c0UL, + 0xa1c49548e2c55504UL, 0x7ef04287126f15ccUL, + /* 196 */ 0xfed1659dbd30ef15UL, 0x8b4ab9eec4e0277bUL, + 0x884d6236a5df3291UL, 0x1fd96ea6bf5cf788UL, + /* 197 */ 0x42a161981f190d9aUL, 0x61d849507e6052c1UL, + 0x9fe113bf285a2cd5UL, 0x7c22d676dbad85d8UL, + /* 198 */ 0x82e770ed2bfbd27dUL, 0x4c05b2ece996f5a5UL, + 0xcd40a9c2b0900150UL, 0x5895319213d9bf64UL, + /* 199 */ 0xe7cc5d703fea2e08UL, 0xb50c491258e2188cUL, + 0xcce30baa48205bf0UL, 0x537c659ccfa32d62UL, + /* 200 */ 0x37b6623a98cfc088UL, 0xfe9bed1fa4d6aca4UL, + 0x04d29b8e56a8d1b0UL, 0x725f71c40b519575UL, + /* 201 */ 0x28c7f89cd0339ce6UL, 0x8367b14469ddc18bUL, + 0x883ada83a6a1652cUL, 0x585f1974034d6c17UL, + /* 202 */ 0x89cfb266f1b19188UL, 0xe63b4863e7c35217UL, + 0xd88c9da6b4c0526aUL, 0x3e035c9df0954635UL, + /* 203 */ 0xdd9d5412fb45de9dUL, 0xdd684532e4cff40dUL, + 0x4b5c999b151d671cUL, 0x2d8c2cc811e7f690UL, + /* 204 */ 0x7f54be1d90055d40UL, 0xa464c5df464aaf40UL, + 0x33979624f0e917beUL, 0x2c018dc527356b30UL, + /* 205 */ 0xa5415024e330b3d4UL, 0x73ff3d96691652d3UL, + 0x94ec42c4ef9b59f1UL, 0x0747201618d08e5aUL, + /* 206 */ 0x4d6ca48aca411c53UL, 0x66415f2fcfa66119UL, + 0x9c4dd40051e227ffUL, 0x59810bc09a02f7ebUL, + /* 207 */ 0x2a7eb171b3dc101dUL, 0x441c5ab99ffef68eUL, + 0x32025c9b93b359eaUL, 0x5e8ce0a71e9d112fUL, + /* 208 */ 0xbfcccb92429503fdUL, 0xd271ba752f095d55UL, + 0x345ead5e972d091eUL, 0x18c8df11a83103baUL, + /* 209 */ 0x90cd949a9aed0f4cUL, 0xc5d1f4cb6660e37eUL, + 0xb8cac52d56c52e0bUL, 0x6e42e400c5808e0dUL, + /* 210 */ 0xa3b46966eeaefd23UL, 0x0c4f1f0be39ecdcaUL, + 0x189dc8c9d683a51dUL, 0x51f27f054c09351bUL, + /* 211 */ 0x4c487ccd2a320682UL, 0x587ea95bb3df1c96UL, + 0xc8ccf79e555cb8e8UL, 0x547dc829a206d73dUL, + /* 212 */ 0xb822a6cd80c39b06UL, 0xe96d54732000d4c6UL, + 0x28535b6f91463b4dUL, 0x228f4660e2486e1dUL, + /* 213 */ 0x98799538de8d3abfUL, 0x8cd8330045ebca6eUL, + 0x79952a008221e738UL, 0x4322e1a7535cd2bbUL, + /* 214 */ 0xb114c11819d1801cUL, 0x2016e4d84f3f5ec7UL, + 0xdd0e2df409260f4cUL, 0x5ec362c0ae5f7266UL, + /* 215 */ 0xc0462b18b8b2b4eeUL, 0x7cc8d950274d1afbUL, + 0xf25f7105436b02d2UL, 0x43bbf8dcbff9ccd3UL, + /* 216 */ 0xb6ad1767a039e9dfUL, 0xb0714da8f69d3583UL, + 0x5e55fa18b42931f5UL, 0x4ed5558f33c60961UL, + /* 217 */ 0x1fe37901c647a5ddUL, 0x593ddf1f8081d357UL, + 0x0249a4fd813fd7a6UL, 0x69acca274e9caf61UL, + /* 218 */ 0x047ba3ea330721c9UL, 0x83423fc20e7e1ea0UL, + 0x1df4c0af01314a60UL, 0x09a62dab89289527UL, + /* 219 */ 0xa5b325a49cc6cb00UL, 0xe94b5dc654b56cb6UL, + 0x3be28779adc994a0UL, 0x4296e8f8ba3a4aadUL, + /* 220 */ 0x328689761e451eabUL, 0x2e4d598bff59594aUL, + 0x49b96853d7a7084aUL, 0x4980a319601420a8UL, + /* 221 */ 0x9565b9e12f552c42UL, 0x8a5318db7100fe96UL, + 0x05c90b4d43add0d7UL, 0x538b4cd66a5d4edaUL, + /* 222 */ 0xf4e94fc3e89f039fUL, 0x592c9af26f618045UL, + 0x08a36eb5fd4b9550UL, 0x25fffaf6c2ed1419UL, + /* 223 */ 0x34434459cc79d354UL, 0xeeecbfb4b1d5476bUL, + 0xddeb34a061615d99UL, 0x5129cecceb64b773UL, + /* 224 */ 0xee43215894993520UL, 0x772f9c7cf14c0b3bUL, + 0xd2e2fce306bedad5UL, 0x715f42b546f06a97UL, + /* 225 */ 0x434ecdceda5b5f1aUL, 0x0da17115a49741a9UL, + 0x680bd77c73edad2eUL, 0x487c02354edd9041UL, + /* 226 */ 0xb8efeff3a70ed9c4UL, 0x56a32aa3e857e302UL, + 0xdf3a68bd48a2a5a0UL, 0x07f650b73176c444UL, + /* 227 */ 0xe38b9b1626e0ccb1UL, 0x79e053c18b09fb36UL, + 0x56d90319c9f94964UL, 0x1ca941e7ac9ff5c4UL, + /* 228 */ 0x49c4df29162fa0bbUL, 0x8488cf3282b33305UL, + 0x95dfda14cabb437dUL, 0x3391f78264d5ad86UL, + /* 229 */ 0x729ae06ae2b5095dUL, 0xd58a58d73259a946UL, + 0xe9834262d13921edUL, 0x27fedafaa54bb592UL, + /* 230 */ 0xa99dc5b829ad48bbUL, 0x5f025742499ee260UL, + 0x802c8ecd5d7513fdUL, 0x78ceb3ef3f6dd938UL, + /* 231 */ 0xc342f44f8a135d94UL, 0x7b9edb44828cdda3UL, + 0x9436d11a0537cfe7UL, 0x5064b164ec1ab4c8UL, + /* 232 */ 0x7020eccfd37eb2fcUL, 0x1f31ea3ed90d25fcUL, + 0x1b930d7bdfa1bb34UL, 0x5344467a48113044UL, + /* 233 */ 0x70073170f25e6dfbUL, 0xe385dc1a50114cc8UL, + 0x2348698ac8fc4f00UL, 0x2a77a55284dd40d8UL, + /* 234 */ 0xfe06afe0c98c6ce4UL, 0xc235df96dddfd6e4UL, + 0x1428d01e33bf1ed3UL, 0x785768ec9300bdafUL, + /* 235 */ 0x9702e57a91deb63bUL, 0x61bdb8bfe5ce8b80UL, + 0x645b426f3d1d58acUL, 0x4804a82227a557bcUL, + /* 236 */ 0x8e57048ab44d2601UL, 0x68d6501a4b3a6935UL, + 0xc39c9ec3f9e1c293UL, 0x4172f257d4de63e2UL, + /* 237 */ 0xd368b450330c6401UL, 0x040d3017418f2391UL, + 0x2c34bb6090b7d90dUL, 0x16f649228fdfd51fUL, + /* 238 */ 0xbea6818e2b928ef5UL, 0xe28ccf91cdc11e72UL, + 0x594aaa68e77a36cdUL, 0x313034806c7ffd0fUL, + /* 239 */ 0x8a9d27ac2249bd65UL, 0x19a3b464018e9512UL, + 0xc26ccff352b37ec7UL, 0x056f68341d797b21UL, + /* 240 */ 0x5e79d6757efd2327UL, 0xfabdbcb6553afe15UL, + 0xd3e7222c6eaf5a60UL, 0x7046c76d4dae743bUL, + /* 241 */ 0x660be872b18d4a55UL, 0x19992518574e1496UL, + 0xc103053a302bdcbbUL, 0x3ed8e9800b218e8eUL, + /* 242 */ 0x7b0b9239fa75e03eUL, 0xefe9fb684633c083UL, + 0x98a35fbe391a7793UL, 0x6065510fe2d0fe34UL, + /* 243 */ 0x55cb668548abad0cUL, 0xb4584548da87e527UL, + 0x2c43ecea0107c1ddUL, 0x526028809372de35UL, + /* 244 */ 0x3415c56af9213b1fUL, 0x5bee1a4d017e98dbUL, + 0x13f6b105b5cf709bUL, 0x5ff20e3482b29ab6UL, + /* 245 */ 0x0aa29c75cc2e6c90UL, 0xfc7d73ca3a70e206UL, + 0x899fc38fc4b5c515UL, 0x250386b124ffc207UL, + /* 246 */ 0x54ea28d5ae3d2b56UL, 0x9913149dd6de60ceUL, + 0x16694fc58f06d6c1UL, 0x46b23975eb018fc7UL, + /* 247 */ 0x470a6a0fb4b7b4e2UL, 0x5d92475a8f7253deUL, + 0xabeee5b52fbd3adbUL, 0x7fa20801a0806968UL, + /* 248 */ 0x76f3faf19f7714d2UL, 0xb3e840c12f4660c3UL, + 0x0fb4cd8df212744eUL, 0x4b065a251d3a2dd2UL, + /* 249 */ 0x5cebde383d77cd4aUL, 0x6adf39df882c9cb1UL, + 0xa2dd242eb09af759UL, 0x3147c0e50e5f6422UL, + /* 250 */ 0x164ca5101d1350dbUL, 0xf8d13479c33fc962UL, + 0xe640ce4d13e5da08UL, 0x4bdee0c45061f8baUL, + /* 251 */ 0xd7c46dc1a4edb1c9UL, 0x5514d7b6437fd98aUL, + 0x58942f6bb2a1c00bUL, 0x2dffb2ab1d70710eUL, + /* 252 */ 0xccdfcf2fc18b6d68UL, 0xa8ebcba8b7806167UL, + 0x980697f95e2937e3UL, 0x02fbba1cd0126e8cUL +}; + +/* c is two 512-bit products: c0[0:7]=a0[0:3]*b0[0:3] and c1[8:15]=a1[4:7]*b1[4:7] + * a is two 256-bit integers: a0[0:3] and a1[4:7] + * b is two 256-bit integers: b0[0:3] and b1[4:7] + */ +static void mul2_256x256_integer_adx(u64 *const c, const u64 *const a, + const u64 *const b) +{ + asm volatile( + "xorl %%r14d, %%r14d ;" + "movq (%1), %%rdx; " /* A[0] */ + "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */ + "xorl %%r10d, %%r10d ;" + "movq %%r8, (%0) ;" + "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */ + "adox %%r10, %%r15 ;" + "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */ + "adox %%r8, %%rax ;" + "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */ + "adox %%r10, %%rbx ;" + /******************************************/ + "adox %%r14, %%rcx ;" + + "movq 8(%1), %%rdx; " /* A[1] */ + "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ + "adox %%r15, %%r8 ;" + "movq %%r8, 8(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ + "adox %%r10, %%r9 ;" + "adcx %%r9, %%rax ;" + "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */ + "adox %%r8, %%r11 ;" + "adcx %%r11, %%rbx ;" + "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */ + "adox %%r10, %%r13 ;" + "adcx %%r13, %%rcx ;" + /******************************************/ + "adox %%r14, %%r15 ;" + "adcx %%r14, %%r15 ;" + + "movq 16(%1), %%rdx; " /* A[2] */ + "xorl %%r10d, %%r10d ;" + "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ + "adox %%rax, %%r8 ;" + "movq %%r8, 16(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ + "adox %%r10, %%r9 ;" + "adcx %%r9, %%rbx ;" + "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */ + "adox %%r8, %%r11 ;" + "adcx %%r11, %%rcx ;" + "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */ + "adox %%r10, %%r13 ;" + "adcx %%r13, %%r15 ;" + /******************************************/ + "adox %%r14, %%rax ;" + "adcx %%r14, %%rax ;" + + "movq 24(%1), %%rdx; " /* A[3] */ + "xorl %%r10d, %%r10d ;" + "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ + "adox %%rbx, %%r8 ;" + "movq %%r8, 24(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ + "adox %%r10, %%r9 ;" + "adcx %%r9, %%rcx ;" + "movq %%rcx, 32(%0) ;" + "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */ + "adox %%r8, %%r11 ;" + "adcx %%r11, %%r15 ;" + "movq %%r15, 40(%0) ;" + "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */ + "adox %%r10, %%r13 ;" + "adcx %%r13, %%rax ;" + "movq %%rax, 48(%0) ;" + /******************************************/ + "adox %%r14, %%rbx ;" + "adcx %%r14, %%rbx ;" + "movq %%rbx, 56(%0) ;" + + "movq 32(%1), %%rdx; " /* C[0] */ + "mulx 32(%2), %%r8, %%r15; " /* C[0]*D[0] */ + "xorl %%r10d, %%r10d ;" + "movq %%r8, 64(%0);" + "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */ + "adox %%r10, %%r15 ;" + "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */ + "adox %%r8, %%rax ;" + "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */ + "adox %%r10, %%rbx ;" + /******************************************/ + "adox %%r14, %%rcx ;" + + "movq 40(%1), %%rdx; " /* C[1] */ + "xorl %%r10d, %%r10d ;" + "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */ + "adox %%r15, %%r8 ;" + "movq %%r8, 72(%0);" + "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */ + "adox %%r10, %%r9 ;" + "adcx %%r9, %%rax ;" + "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */ + "adox %%r8, %%r11 ;" + "adcx %%r11, %%rbx ;" + "mulx 56(%2), %%r10, %%r15; " /* C[1]*D[3] */ + "adox %%r10, %%r13 ;" + "adcx %%r13, %%rcx ;" + /******************************************/ + "adox %%r14, %%r15 ;" + "adcx %%r14, %%r15 ;" + + "movq 48(%1), %%rdx; " /* C[2] */ + "xorl %%r10d, %%r10d ;" + "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */ + "adox %%rax, %%r8 ;" + "movq %%r8, 80(%0);" + "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */ + "adox %%r10, %%r9 ;" + "adcx %%r9, %%rbx ;" + "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */ + "adox %%r8, %%r11 ;" + "adcx %%r11, %%rcx ;" + "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */ + "adox %%r10, %%r13 ;" + "adcx %%r13, %%r15 ;" + /******************************************/ + "adox %%r14, %%rax ;" + "adcx %%r14, %%rax ;" + + "movq 56(%1), %%rdx; " /* C[3] */ + "xorl %%r10d, %%r10d ;" + "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */ + "adox %%rbx, %%r8 ;" + "movq %%r8, 88(%0);" + "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */ + "adox %%r10, %%r9 ;" + "adcx %%r9, %%rcx ;" + "movq %%rcx, 96(%0) ;" + "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */ + "adox %%r8, %%r11 ;" + "adcx %%r11, %%r15 ;" + "movq %%r15, 104(%0) ;" + "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */ + "adox %%r10, %%r13 ;" + "adcx %%r13, %%rax ;" + "movq %%rax, 112(%0) ;" + /******************************************/ + "adox %%r14, %%rbx ;" + "adcx %%r14, %%rbx ;" + "movq %%rbx, 120(%0) ;" + : + : "r"(c), "r"(a), "r"(b) + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", + "%r10", "%r11", "%r13", "%r14", "%r15"); +} + +static void mul2_256x256_integer_bmi2(u64 *const c, const u64 *const a, + const u64 *const b) +{ + asm volatile( + "movq (%1), %%rdx; " /* A[0] */ + "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */ + "movq %%r8, (%0) ;" + "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */ + "addq %%r10, %%r15 ;" + "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */ + "adcq %%r8, %%rax ;" + "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */ + "adcq %%r10, %%rbx ;" + /******************************************/ + "adcq $0, %%rcx ;" + + "movq 8(%1), %%rdx; " /* A[1] */ + "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ + "addq %%r15, %%r8 ;" + "movq %%r8, 8(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ + "adcq %%r10, %%r9 ;" + "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */ + "adcq %%r8, %%r11 ;" + "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%r15 ;" + + "addq %%r9, %%rax ;" + "adcq %%r11, %%rbx ;" + "adcq %%r13, %%rcx ;" + "adcq $0, %%r15 ;" + + "movq 16(%1), %%rdx; " /* A[2] */ + "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ + "addq %%rax, %%r8 ;" + "movq %%r8, 16(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ + "adcq %%r10, %%r9 ;" + "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */ + "adcq %%r8, %%r11 ;" + "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%rax ;" + + "addq %%r9, %%rbx ;" + "adcq %%r11, %%rcx ;" + "adcq %%r13, %%r15 ;" + "adcq $0, %%rax ;" + + "movq 24(%1), %%rdx; " /* A[3] */ + "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ + "addq %%rbx, %%r8 ;" + "movq %%r8, 24(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ + "adcq %%r10, %%r9 ;" + "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */ + "adcq %%r8, %%r11 ;" + "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%rbx ;" + + "addq %%r9, %%rcx ;" + "movq %%rcx, 32(%0) ;" + "adcq %%r11, %%r15 ;" + "movq %%r15, 40(%0) ;" + "adcq %%r13, %%rax ;" + "movq %%rax, 48(%0) ;" + "adcq $0, %%rbx ;" + "movq %%rbx, 56(%0) ;" + + "movq 32(%1), %%rdx; " /* C[0] */ + "mulx 32(%2), %%r8, %%r15; " /* C[0]*D[0] */ + "movq %%r8, 64(%0) ;" + "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */ + "addq %%r10, %%r15 ;" + "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */ + "adcq %%r8, %%rax ;" + "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */ + "adcq %%r10, %%rbx ;" + /******************************************/ + "adcq $0, %%rcx ;" + + "movq 40(%1), %%rdx; " /* C[1] */ + "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */ + "addq %%r15, %%r8 ;" + "movq %%r8, 72(%0) ;" + "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */ + "adcq %%r10, %%r9 ;" + "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */ + "adcq %%r8, %%r11 ;" + "mulx 56(%2), %%r10, %%r15; " /* C[1]*D[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%r15 ;" + + "addq %%r9, %%rax ;" + "adcq %%r11, %%rbx ;" + "adcq %%r13, %%rcx ;" + "adcq $0, %%r15 ;" + + "movq 48(%1), %%rdx; " /* C[2] */ + "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */ + "addq %%rax, %%r8 ;" + "movq %%r8, 80(%0) ;" + "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */ + "adcq %%r10, %%r9 ;" + "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */ + "adcq %%r8, %%r11 ;" + "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%rax ;" + + "addq %%r9, %%rbx ;" + "adcq %%r11, %%rcx ;" + "adcq %%r13, %%r15 ;" + "adcq $0, %%rax ;" + + "movq 56(%1), %%rdx; " /* C[3] */ + "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */ + "addq %%rbx, %%r8 ;" + "movq %%r8, 88(%0) ;" + "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */ + "adcq %%r10, %%r9 ;" + "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */ + "adcq %%r8, %%r11 ;" + "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%rbx ;" + + "addq %%r9, %%rcx ;" + "movq %%rcx, 96(%0) ;" + "adcq %%r11, %%r15 ;" + "movq %%r15, 104(%0) ;" + "adcq %%r13, %%rax ;" + "movq %%rax, 112(%0) ;" + "adcq $0, %%rbx ;" + "movq %%rbx, 120(%0) ;" + : + : "r"(c), "r"(a), "r"(b) + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", + "%r10", "%r11", "%r13", "%r15"); +} + +static void sqr2_256x256_integer_adx(u64 *const c, const u64 *const a) +{ + asm volatile( + "movq (%1), %%rdx ;" /* A[0] */ + "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */ + "xorl %%r15d, %%r15d;" + "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */ + "adcx %%r14, %%r9 ;" + "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */ + "adcx %%rax, %%r10 ;" + "movq 24(%1), %%rdx ;" /* A[3] */ + "mulx 8(%1), %%r11, %%rbx ;" /* A[1]*A[3] */ + "adcx %%rcx, %%r11 ;" + "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */ + "adcx %%rax, %%rbx ;" + "movq 8(%1), %%rdx ;" /* A[1] */ + "adcx %%r15, %%r13 ;" + "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */ + "movq $0, %%r14 ;" + /******************************************/ + "adcx %%r15, %%r14 ;" + + "xorl %%r15d, %%r15d;" + "adox %%rax, %%r10 ;" + "adcx %%r8, %%r8 ;" + "adox %%rcx, %%r11 ;" + "adcx %%r9, %%r9 ;" + "adox %%r15, %%rbx ;" + "adcx %%r10, %%r10 ;" + "adox %%r15, %%r13 ;" + "adcx %%r11, %%r11 ;" + "adox %%r15, %%r14 ;" + "adcx %%rbx, %%rbx ;" + "adcx %%r13, %%r13 ;" + "adcx %%r14, %%r14 ;" + + "movq (%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */ + /*******************/ + "movq %%rax, 0(%0) ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, 8(%0) ;" + "movq 8(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */ + "adcq %%rax, %%r9 ;" + "movq %%r9, 16(%0) ;" + "adcq %%rcx, %%r10 ;" + "movq %%r10, 24(%0) ;" + "movq 16(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */ + "adcq %%rax, %%r11 ;" + "movq %%r11, 32(%0) ;" + "adcq %%rcx, %%rbx ;" + "movq %%rbx, 40(%0) ;" + "movq 24(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */ + "adcq %%rax, %%r13 ;" + "movq %%r13, 48(%0) ;" + "adcq %%rcx, %%r14 ;" + "movq %%r14, 56(%0) ;" + + + "movq 32(%1), %%rdx ;" /* B[0] */ + "mulx 40(%1), %%r8, %%r14 ;" /* B[1]*B[0] */ + "xorl %%r15d, %%r15d;" + "mulx 48(%1), %%r9, %%r10 ;" /* B[2]*B[0] */ + "adcx %%r14, %%r9 ;" + "mulx 56(%1), %%rax, %%rcx ;" /* B[3]*B[0] */ + "adcx %%rax, %%r10 ;" + "movq 56(%1), %%rdx ;" /* B[3] */ + "mulx 40(%1), %%r11, %%rbx ;" /* B[1]*B[3] */ + "adcx %%rcx, %%r11 ;" + "mulx 48(%1), %%rax, %%r13 ;" /* B[2]*B[3] */ + "adcx %%rax, %%rbx ;" + "movq 40(%1), %%rdx ;" /* B[1] */ + "adcx %%r15, %%r13 ;" + "mulx 48(%1), %%rax, %%rcx ;" /* B[2]*B[1] */ + "movq $0, %%r14 ;" + /******************************************/ + "adcx %%r15, %%r14 ;" + + "xorl %%r15d, %%r15d;" + "adox %%rax, %%r10 ;" + "adcx %%r8, %%r8 ;" + "adox %%rcx, %%r11 ;" + "adcx %%r9, %%r9 ;" + "adox %%r15, %%rbx ;" + "adcx %%r10, %%r10 ;" + "adox %%r15, %%r13 ;" + "adcx %%r11, %%r11 ;" + "adox %%r15, %%r14 ;" + "adcx %%rbx, %%rbx ;" + "adcx %%r13, %%r13 ;" + "adcx %%r14, %%r14 ;" + + "movq 32(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* B[0]^2 */ + /*******************/ + "movq %%rax, 64(%0) ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, 72(%0) ;" + "movq 40(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* B[1]^2 */ + "adcq %%rax, %%r9 ;" + "movq %%r9, 80(%0) ;" + "adcq %%rcx, %%r10 ;" + "movq %%r10, 88(%0) ;" + "movq 48(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* B[2]^2 */ + "adcq %%rax, %%r11 ;" + "movq %%r11, 96(%0) ;" + "adcq %%rcx, %%rbx ;" + "movq %%rbx, 104(%0) ;" + "movq 56(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* B[3]^2 */ + "adcq %%rax, %%r13 ;" + "movq %%r13, 112(%0) ;" + "adcq %%rcx, %%r14 ;" + "movq %%r14, 120(%0) ;" + : + : "r"(c), "r"(a) + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", + "%r10", "%r11", "%r13", "%r14", "%r15"); +} + +static void sqr2_256x256_integer_bmi2(u64 *const c, const u64 *const a) +{ + asm volatile( + "movq 8(%1), %%rdx ;" /* A[1] */ + "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */ + "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */ + "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */ + + "movq 16(%1), %%rdx ;" /* A[2] */ + "mulx 24(%1), %%r15, %%r13 ;" /* A[3]*A[2] */ + "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */ + + "addq %%rax, %%r9 ;" + "adcq %%rdx, %%r10 ;" + "adcq %%rcx, %%r11 ;" + "adcq %%r14, %%r15 ;" + "adcq $0, %%r13 ;" + "movq $0, %%r14 ;" + "adcq $0, %%r14 ;" + + "movq (%1), %%rdx ;" /* A[0] */ + "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */ + + "addq %%rax, %%r10 ;" + "adcq %%rcx, %%r11 ;" + "adcq $0, %%r15 ;" + "adcq $0, %%r13 ;" + "adcq $0, %%r14 ;" + + "shldq $1, %%r13, %%r14 ;" + "shldq $1, %%r15, %%r13 ;" + "shldq $1, %%r11, %%r15 ;" + "shldq $1, %%r10, %%r11 ;" + "shldq $1, %%r9, %%r10 ;" + "shldq $1, %%r8, %%r9 ;" + "shlq $1, %%r8 ;" + + /*******************/ + "mulx %%rdx, %%rax, %%rcx ; " /* A[0]^2 */ + /*******************/ + "movq %%rax, 0(%0) ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, 8(%0) ;" + "movq 8(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ; " /* A[1]^2 */ + "adcq %%rax, %%r9 ;" + "movq %%r9, 16(%0) ;" + "adcq %%rcx, %%r10 ;" + "movq %%r10, 24(%0) ;" + "movq 16(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ; " /* A[2]^2 */ + "adcq %%rax, %%r11 ;" + "movq %%r11, 32(%0) ;" + "adcq %%rcx, %%r15 ;" + "movq %%r15, 40(%0) ;" + "movq 24(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ; " /* A[3]^2 */ + "adcq %%rax, %%r13 ;" + "movq %%r13, 48(%0) ;" + "adcq %%rcx, %%r14 ;" + "movq %%r14, 56(%0) ;" + + "movq 40(%1), %%rdx ;" /* B[1] */ + "mulx 32(%1), %%r8, %%r9 ;" /* B[0]*B[1] */ + "mulx 48(%1), %%r10, %%r11 ;" /* B[2]*B[1] */ + "mulx 56(%1), %%rcx, %%r14 ;" /* B[3]*B[1] */ + + "movq 48(%1), %%rdx ;" /* B[2] */ + "mulx 56(%1), %%r15, %%r13 ;" /* B[3]*B[2] */ + "mulx 32(%1), %%rax, %%rdx ;" /* B[0]*B[2] */ + + "addq %%rax, %%r9 ;" + "adcq %%rdx, %%r10 ;" + "adcq %%rcx, %%r11 ;" + "adcq %%r14, %%r15 ;" + "adcq $0, %%r13 ;" + "movq $0, %%r14 ;" + "adcq $0, %%r14 ;" + + "movq 32(%1), %%rdx ;" /* B[0] */ + "mulx 56(%1), %%rax, %%rcx ;" /* B[0]*B[3] */ + + "addq %%rax, %%r10 ;" + "adcq %%rcx, %%r11 ;" + "adcq $0, %%r15 ;" + "adcq $0, %%r13 ;" + "adcq $0, %%r14 ;" + + "shldq $1, %%r13, %%r14 ;" + "shldq $1, %%r15, %%r13 ;" + "shldq $1, %%r11, %%r15 ;" + "shldq $1, %%r10, %%r11 ;" + "shldq $1, %%r9, %%r10 ;" + "shldq $1, %%r8, %%r9 ;" + "shlq $1, %%r8 ;" + + /*******************/ + "mulx %%rdx, %%rax, %%rcx ; " /* B[0]^2 */ + /*******************/ + "movq %%rax, 64(%0) ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, 72(%0) ;" + "movq 40(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ; " /* B[1]^2 */ + "adcq %%rax, %%r9 ;" + "movq %%r9, 80(%0) ;" + "adcq %%rcx, %%r10 ;" + "movq %%r10, 88(%0) ;" + "movq 48(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ; " /* B[2]^2 */ + "adcq %%rax, %%r11 ;" + "movq %%r11, 96(%0) ;" + "adcq %%rcx, %%r15 ;" + "movq %%r15, 104(%0) ;" + "movq 56(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ; " /* B[3]^2 */ + "adcq %%rax, %%r13 ;" + "movq %%r13, 112(%0) ;" + "adcq %%rcx, %%r14 ;" + "movq %%r14, 120(%0) ;" + : + : "r"(c), "r"(a) + : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", + "%r11", "%r13", "%r14", "%r15"); +} + +static void red_eltfp25519_2w_adx(u64 *const c, const u64 *const a) +{ + asm volatile( + "movl $38, %%edx; " /* 2*c = 38 = 2^256 */ + "mulx 32(%1), %%r8, %%r10; " /* c*C[4] */ + "xorl %%ebx, %%ebx ;" + "adox (%1), %%r8 ;" + "mulx 40(%1), %%r9, %%r11; " /* c*C[5] */ + "adcx %%r10, %%r9 ;" + "adox 8(%1), %%r9 ;" + "mulx 48(%1), %%r10, %%rax; " /* c*C[6] */ + "adcx %%r11, %%r10 ;" + "adox 16(%1), %%r10 ;" + "mulx 56(%1), %%r11, %%rcx; " /* c*C[7] */ + "adcx %%rax, %%r11 ;" + "adox 24(%1), %%r11 ;" + /***************************************/ + "adcx %%rbx, %%rcx ;" + "adox %%rbx, %%rcx ;" + "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */ + "adcx %%rcx, %%r8 ;" + "adcx %%rbx, %%r9 ;" + "movq %%r9, 8(%0) ;" + "adcx %%rbx, %%r10 ;" + "movq %%r10, 16(%0) ;" + "adcx %%rbx, %%r11 ;" + "movq %%r11, 24(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%edx, %%ecx ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, (%0) ;" + + "mulx 96(%1), %%r8, %%r10; " /* c*C[4] */ + "xorl %%ebx, %%ebx ;" + "adox 64(%1), %%r8 ;" + "mulx 104(%1), %%r9, %%r11; " /* c*C[5] */ + "adcx %%r10, %%r9 ;" + "adox 72(%1), %%r9 ;" + "mulx 112(%1), %%r10, %%rax; " /* c*C[6] */ + "adcx %%r11, %%r10 ;" + "adox 80(%1), %%r10 ;" + "mulx 120(%1), %%r11, %%rcx; " /* c*C[7] */ + "adcx %%rax, %%r11 ;" + "adox 88(%1), %%r11 ;" + /****************************************/ + "adcx %%rbx, %%rcx ;" + "adox %%rbx, %%rcx ;" + "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */ + "adcx %%rcx, %%r8 ;" + "adcx %%rbx, %%r9 ;" + "movq %%r9, 40(%0) ;" + "adcx %%rbx, %%r10 ;" + "movq %%r10, 48(%0) ;" + "adcx %%rbx, %%r11 ;" + "movq %%r11, 56(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%edx, %%ecx ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, 32(%0) ;" + : + : "r"(c), "r"(a) + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", + "%r10", "%r11"); +} + +static void red_eltfp25519_2w_bmi2(u64 *const c, const u64 *const a) +{ + asm volatile( + "movl $38, %%edx ; " /* 2*c = 38 = 2^256 */ + "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */ + "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ + "addq %%r10, %%r9 ;" + "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */ + "adcq %%r11, %%r10 ;" + "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ + "adcq %%rax, %%r11 ;" + /***************************************/ + "adcq $0, %%rcx ;" + "addq (%1), %%r8 ;" + "adcq 8(%1), %%r9 ;" + "adcq 16(%1), %%r10 ;" + "adcq 24(%1), %%r11 ;" + "adcq $0, %%rcx ;" + "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */ + "addq %%rcx, %%r8 ;" + "adcq $0, %%r9 ;" + "movq %%r9, 8(%0) ;" + "adcq $0, %%r10 ;" + "movq %%r10, 16(%0) ;" + "adcq $0, %%r11 ;" + "movq %%r11, 24(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%edx, %%ecx ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, (%0) ;" + + "mulx 96(%1), %%r8, %%r10 ;" /* c*C[4] */ + "mulx 104(%1), %%r9, %%r11 ;" /* c*C[5] */ + "addq %%r10, %%r9 ;" + "mulx 112(%1), %%r10, %%rax ;" /* c*C[6] */ + "adcq %%r11, %%r10 ;" + "mulx 120(%1), %%r11, %%rcx ;" /* c*C[7] */ + "adcq %%rax, %%r11 ;" + /****************************************/ + "adcq $0, %%rcx ;" + "addq 64(%1), %%r8 ;" + "adcq 72(%1), %%r9 ;" + "adcq 80(%1), %%r10 ;" + "adcq 88(%1), %%r11 ;" + "adcq $0, %%rcx ;" + "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */ + "addq %%rcx, %%r8 ;" + "adcq $0, %%r9 ;" + "movq %%r9, 40(%0) ;" + "adcq $0, %%r10 ;" + "movq %%r10, 48(%0) ;" + "adcq $0, %%r11 ;" + "movq %%r11, 56(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%edx, %%ecx ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, 32(%0) ;" + : + : "r"(c), "r"(a) + : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", + "%r11"); +} + +static void mul_256x256_integer_adx(u64 *const c, const u64 *const a, + const u64 *const b) +{ + asm volatile( + "movq (%1), %%rdx; " /* A[0] */ + "mulx (%2), %%r8, %%r9; " /* A[0]*B[0] */ + "xorl %%r10d, %%r10d ;" + "movq %%r8, (%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[0]*B[1] */ + "adox %%r9, %%r10 ;" + "movq %%r10, 8(%0) ;" + "mulx 16(%2), %%r15, %%r13; " /* A[0]*B[2] */ + "adox %%r11, %%r15 ;" + "mulx 24(%2), %%r14, %%rdx; " /* A[0]*B[3] */ + "adox %%r13, %%r14 ;" + "movq $0, %%rax ;" + /******************************************/ + "adox %%rdx, %%rax ;" + + "movq 8(%1), %%rdx; " /* A[1] */ + "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ + "xorl %%r10d, %%r10d ;" + "adcx 8(%0), %%r8 ;" + "movq %%r8, 8(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ + "adox %%r9, %%r10 ;" + "adcx %%r15, %%r10 ;" + "movq %%r10, 16(%0) ;" + "mulx 16(%2), %%r15, %%r13; " /* A[1]*B[2] */ + "adox %%r11, %%r15 ;" + "adcx %%r14, %%r15 ;" + "movq $0, %%r8 ;" + "mulx 24(%2), %%r14, %%rdx; " /* A[1]*B[3] */ + "adox %%r13, %%r14 ;" + "adcx %%rax, %%r14 ;" + "movq $0, %%rax ;" + /******************************************/ + "adox %%rdx, %%rax ;" + "adcx %%r8, %%rax ;" + + "movq 16(%1), %%rdx; " /* A[2] */ + "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ + "xorl %%r10d, %%r10d ;" + "adcx 16(%0), %%r8 ;" + "movq %%r8, 16(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ + "adox %%r9, %%r10 ;" + "adcx %%r15, %%r10 ;" + "movq %%r10, 24(%0) ;" + "mulx 16(%2), %%r15, %%r13; " /* A[2]*B[2] */ + "adox %%r11, %%r15 ;" + "adcx %%r14, %%r15 ;" + "movq $0, %%r8 ;" + "mulx 24(%2), %%r14, %%rdx; " /* A[2]*B[3] */ + "adox %%r13, %%r14 ;" + "adcx %%rax, %%r14 ;" + "movq $0, %%rax ;" + /******************************************/ + "adox %%rdx, %%rax ;" + "adcx %%r8, %%rax ;" + + "movq 24(%1), %%rdx; " /* A[3] */ + "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ + "xorl %%r10d, %%r10d ;" + "adcx 24(%0), %%r8 ;" + "movq %%r8, 24(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ + "adox %%r9, %%r10 ;" + "adcx %%r15, %%r10 ;" + "movq %%r10, 32(%0) ;" + "mulx 16(%2), %%r15, %%r13; " /* A[3]*B[2] */ + "adox %%r11, %%r15 ;" + "adcx %%r14, %%r15 ;" + "movq %%r15, 40(%0) ;" + "movq $0, %%r8 ;" + "mulx 24(%2), %%r14, %%rdx; " /* A[3]*B[3] */ + "adox %%r13, %%r14 ;" + "adcx %%rax, %%r14 ;" + "movq %%r14, 48(%0) ;" + "movq $0, %%rax ;" + /******************************************/ + "adox %%rdx, %%rax ;" + "adcx %%r8, %%rax ;" + "movq %%rax, 56(%0) ;" + : + : "r"(c), "r"(a), "r"(b) + : "memory", "cc", "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", + "%r13", "%r14", "%r15"); +} + +static void mul_256x256_integer_bmi2(u64 *const c, const u64 *const a, + const u64 *const b) +{ + asm volatile( + "movq (%1), %%rdx; " /* A[0] */ + "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */ + "movq %%r8, (%0) ;" + "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */ + "addq %%r10, %%r15 ;" + "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */ + "adcq %%r8, %%rax ;" + "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */ + "adcq %%r10, %%rbx ;" + /******************************************/ + "adcq $0, %%rcx ;" + + "movq 8(%1), %%rdx; " /* A[1] */ + "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ + "addq %%r15, %%r8 ;" + "movq %%r8, 8(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ + "adcq %%r10, %%r9 ;" + "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */ + "adcq %%r8, %%r11 ;" + "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%r15 ;" + + "addq %%r9, %%rax ;" + "adcq %%r11, %%rbx ;" + "adcq %%r13, %%rcx ;" + "adcq $0, %%r15 ;" + + "movq 16(%1), %%rdx; " /* A[2] */ + "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ + "addq %%rax, %%r8 ;" + "movq %%r8, 16(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ + "adcq %%r10, %%r9 ;" + "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */ + "adcq %%r8, %%r11 ;" + "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%rax ;" + + "addq %%r9, %%rbx ;" + "adcq %%r11, %%rcx ;" + "adcq %%r13, %%r15 ;" + "adcq $0, %%rax ;" + + "movq 24(%1), %%rdx; " /* A[3] */ + "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ + "addq %%rbx, %%r8 ;" + "movq %%r8, 24(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ + "adcq %%r10, %%r9 ;" + "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */ + "adcq %%r8, %%r11 ;" + "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%rbx ;" + + "addq %%r9, %%rcx ;" + "movq %%rcx, 32(%0) ;" + "adcq %%r11, %%r15 ;" + "movq %%r15, 40(%0) ;" + "adcq %%r13, %%rax ;" + "movq %%rax, 48(%0) ;" + "adcq $0, %%rbx ;" + "movq %%rbx, 56(%0) ;" + : + : "r"(c), "r"(a), "r"(b) + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", + "%r10", "%r11", "%r13", "%r15"); +} + +static void sqr_256x256_integer_adx(u64 *const c, const u64 *const a) +{ + asm volatile( + "movq (%1), %%rdx ;" /* A[0] */ + "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */ + "xorl %%r15d, %%r15d;" + "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */ + "adcx %%r14, %%r9 ;" + "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */ + "adcx %%rax, %%r10 ;" + "movq 24(%1), %%rdx ;" /* A[3] */ + "mulx 8(%1), %%r11, %%rbx ;" /* A[1]*A[3] */ + "adcx %%rcx, %%r11 ;" + "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */ + "adcx %%rax, %%rbx ;" + "movq 8(%1), %%rdx ;" /* A[1] */ + "adcx %%r15, %%r13 ;" + "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */ + "movq $0, %%r14 ;" + /******************************************/ + "adcx %%r15, %%r14 ;" + + "xorl %%r15d, %%r15d;" + "adox %%rax, %%r10 ;" + "adcx %%r8, %%r8 ;" + "adox %%rcx, %%r11 ;" + "adcx %%r9, %%r9 ;" + "adox %%r15, %%rbx ;" + "adcx %%r10, %%r10 ;" + "adox %%r15, %%r13 ;" + "adcx %%r11, %%r11 ;" + "adox %%r15, %%r14 ;" + "adcx %%rbx, %%rbx ;" + "adcx %%r13, %%r13 ;" + "adcx %%r14, %%r14 ;" + + "movq (%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */ + /*******************/ + "movq %%rax, 0(%0) ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, 8(%0) ;" + "movq 8(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */ + "adcq %%rax, %%r9 ;" + "movq %%r9, 16(%0) ;" + "adcq %%rcx, %%r10 ;" + "movq %%r10, 24(%0) ;" + "movq 16(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */ + "adcq %%rax, %%r11 ;" + "movq %%r11, 32(%0) ;" + "adcq %%rcx, %%rbx ;" + "movq %%rbx, 40(%0) ;" + "movq 24(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */ + "adcq %%rax, %%r13 ;" + "movq %%r13, 48(%0) ;" + "adcq %%rcx, %%r14 ;" + "movq %%r14, 56(%0) ;" + : + : "r"(c), "r"(a) + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", + "%r10", "%r11", "%r13", "%r14", "%r15"); +} + +static void sqr_256x256_integer_bmi2(u64 *const c, const u64 *const a) +{ + asm volatile( + "movq 8(%1), %%rdx ;" /* A[1] */ + "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */ + "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */ + "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */ + + "movq 16(%1), %%rdx ;" /* A[2] */ + "mulx 24(%1), %%r15, %%r13 ;" /* A[3]*A[2] */ + "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */ + + "addq %%rax, %%r9 ;" + "adcq %%rdx, %%r10 ;" + "adcq %%rcx, %%r11 ;" + "adcq %%r14, %%r15 ;" + "adcq $0, %%r13 ;" + "movq $0, %%r14 ;" + "adcq $0, %%r14 ;" + + "movq (%1), %%rdx ;" /* A[0] */ + "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */ + + "addq %%rax, %%r10 ;" + "adcq %%rcx, %%r11 ;" + "adcq $0, %%r15 ;" + "adcq $0, %%r13 ;" + "adcq $0, %%r14 ;" + + "shldq $1, %%r13, %%r14 ;" + "shldq $1, %%r15, %%r13 ;" + "shldq $1, %%r11, %%r15 ;" + "shldq $1, %%r10, %%r11 ;" + "shldq $1, %%r9, %%r10 ;" + "shldq $1, %%r8, %%r9 ;" + "shlq $1, %%r8 ;" + + /*******************/ + "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */ + /*******************/ + "movq %%rax, 0(%0) ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, 8(%0) ;" + "movq 8(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */ + "adcq %%rax, %%r9 ;" + "movq %%r9, 16(%0) ;" + "adcq %%rcx, %%r10 ;" + "movq %%r10, 24(%0) ;" + "movq 16(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */ + "adcq %%rax, %%r11 ;" + "movq %%r11, 32(%0) ;" + "adcq %%rcx, %%r15 ;" + "movq %%r15, 40(%0) ;" + "movq 24(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */ + "adcq %%rax, %%r13 ;" + "movq %%r13, 48(%0) ;" + "adcq %%rcx, %%r14 ;" + "movq %%r14, 56(%0) ;" + : + : "r"(c), "r"(a) + : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", + "%r11", "%r13", "%r14", "%r15"); +} + +static void red_eltfp25519_1w_adx(u64 *const c, const u64 *const a) +{ + asm volatile( + "movl $38, %%edx ;" /* 2*c = 38 = 2^256 */ + "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */ + "xorl %%ebx, %%ebx ;" + "adox (%1), %%r8 ;" + "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ + "adcx %%r10, %%r9 ;" + "adox 8(%1), %%r9 ;" + "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */ + "adcx %%r11, %%r10 ;" + "adox 16(%1), %%r10 ;" + "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ + "adcx %%rax, %%r11 ;" + "adox 24(%1), %%r11 ;" + /***************************************/ + "adcx %%rbx, %%rcx ;" + "adox %%rbx, %%rcx ;" + "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */ + "adcx %%rcx, %%r8 ;" + "adcx %%rbx, %%r9 ;" + "movq %%r9, 8(%0) ;" + "adcx %%rbx, %%r10 ;" + "movq %%r10, 16(%0) ;" + "adcx %%rbx, %%r11 ;" + "movq %%r11, 24(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%edx, %%ecx ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, (%0) ;" + : + : "r"(c), "r"(a) + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", + "%r10", "%r11"); +} + +static void red_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a) +{ + asm volatile( + "movl $38, %%edx ;" /* 2*c = 38 = 2^256 */ + "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */ + "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ + "addq %%r10, %%r9 ;" + "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */ + "adcq %%r11, %%r10 ;" + "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ + "adcq %%rax, %%r11 ;" + /***************************************/ + "adcq $0, %%rcx ;" + "addq (%1), %%r8 ;" + "adcq 8(%1), %%r9 ;" + "adcq 16(%1), %%r10 ;" + "adcq 24(%1), %%r11 ;" + "adcq $0, %%rcx ;" + "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */ + "addq %%rcx, %%r8 ;" + "adcq $0, %%r9 ;" + "movq %%r9, 8(%0) ;" + "adcq $0, %%r10 ;" + "movq %%r10, 16(%0) ;" + "adcq $0, %%r11 ;" + "movq %%r11, 24(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%edx, %%ecx ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, (%0) ;" + : + : "r"(c), "r"(a) + : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", + "%r11"); +} + +static __always_inline void +add_eltfp25519_1w_adx(u64 *const c, const u64 *const a, const u64 *const b) +{ + asm volatile( + "mov $38, %%eax ;" + "xorl %%ecx, %%ecx ;" + "movq (%2), %%r8 ;" + "adcx (%1), %%r8 ;" + "movq 8(%2), %%r9 ;" + "adcx 8(%1), %%r9 ;" + "movq 16(%2), %%r10 ;" + "adcx 16(%1), %%r10 ;" + "movq 24(%2), %%r11 ;" + "adcx 24(%1), %%r11 ;" + "cmovc %%eax, %%ecx ;" + "xorl %%eax, %%eax ;" + "adcx %%rcx, %%r8 ;" + "adcx %%rax, %%r9 ;" + "movq %%r9, 8(%0) ;" + "adcx %%rax, %%r10 ;" + "movq %%r10, 16(%0) ;" + "adcx %%rax, %%r11 ;" + "movq %%r11, 24(%0) ;" + "mov $38, %%ecx ;" + "cmovc %%ecx, %%eax ;" + "addq %%rax, %%r8 ;" + "movq %%r8, (%0) ;" + : + : "r"(c), "r"(a), "r"(b) + : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11"); +} + +static __always_inline void +add_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a, const u64 *const b) +{ + asm volatile( + "mov $38, %%eax ;" + "movq (%2), %%r8 ;" + "addq (%1), %%r8 ;" + "movq 8(%2), %%r9 ;" + "adcq 8(%1), %%r9 ;" + "movq 16(%2), %%r10 ;" + "adcq 16(%1), %%r10 ;" + "movq 24(%2), %%r11 ;" + "adcq 24(%1), %%r11 ;" + "mov $0, %%ecx ;" + "cmovc %%eax, %%ecx ;" + "addq %%rcx, %%r8 ;" + "adcq $0, %%r9 ;" + "movq %%r9, 8(%0) ;" + "adcq $0, %%r10 ;" + "movq %%r10, 16(%0) ;" + "adcq $0, %%r11 ;" + "movq %%r11, 24(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%eax, %%ecx ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, (%0) ;" + : + : "r"(c), "r"(a), "r"(b) + : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11"); +} + +static __always_inline void +sub_eltfp25519_1w(u64 *const c, const u64 *const a, const u64 *const b) +{ + asm volatile( + "mov $38, %%eax ;" + "movq (%1), %%r8 ;" + "subq (%2), %%r8 ;" + "movq 8(%1), %%r9 ;" + "sbbq 8(%2), %%r9 ;" + "movq 16(%1), %%r10 ;" + "sbbq 16(%2), %%r10 ;" + "movq 24(%1), %%r11 ;" + "sbbq 24(%2), %%r11 ;" + "mov $0, %%ecx ;" + "cmovc %%eax, %%ecx ;" + "subq %%rcx, %%r8 ;" + "sbbq $0, %%r9 ;" + "movq %%r9, 8(%0) ;" + "sbbq $0, %%r10 ;" + "movq %%r10, 16(%0) ;" + "sbbq $0, %%r11 ;" + "movq %%r11, 24(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%eax, %%ecx ;" + "subq %%rcx, %%r8 ;" + "movq %%r8, (%0) ;" + : + : "r"(c), "r"(a), "r"(b) + : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11"); +} + +/* Multiplication by a24 = (A+2)/4 = (486662+2)/4 = 121666 */ +static __always_inline void +mul_a24_eltfp25519_1w(u64 *const c, const u64 *const a) +{ + const u64 a24 = 121666; + asm volatile( + "movq %2, %%rdx ;" + "mulx (%1), %%r8, %%r10 ;" + "mulx 8(%1), %%r9, %%r11 ;" + "addq %%r10, %%r9 ;" + "mulx 16(%1), %%r10, %%rax ;" + "adcq %%r11, %%r10 ;" + "mulx 24(%1), %%r11, %%rcx ;" + "adcq %%rax, %%r11 ;" + /**************************/ + "adcq $0, %%rcx ;" + "movl $38, %%edx ;" /* 2*c = 38 = 2^256 mod 2^255-19*/ + "imul %%rdx, %%rcx ;" + "addq %%rcx, %%r8 ;" + "adcq $0, %%r9 ;" + "movq %%r9, 8(%0) ;" + "adcq $0, %%r10 ;" + "movq %%r10, 16(%0) ;" + "adcq $0, %%r11 ;" + "movq %%r11, 24(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%edx, %%ecx ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, (%0) ;" + : + : "r"(c), "r"(a), "r"(a24) + : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", + "%r11"); +} + +static void inv_eltfp25519_1w_adx(u64 *const c, const u64 *const a) +{ + struct { + eltfp25519_1w_buffer buffer; + eltfp25519_1w x0, x1, x2; + } __aligned(32) m; + u64 *T[4]; + + T[0] = m.x0; + T[1] = c; /* x^(-1) */ + T[2] = m.x1; + T[3] = m.x2; + + copy_eltfp25519_1w(T[1], a); + sqrn_eltfp25519_1w_adx(T[1], 1); + copy_eltfp25519_1w(T[2], T[1]); + sqrn_eltfp25519_1w_adx(T[2], 2); + mul_eltfp25519_1w_adx(T[0], a, T[2]); + mul_eltfp25519_1w_adx(T[1], T[1], T[0]); + copy_eltfp25519_1w(T[2], T[1]); + sqrn_eltfp25519_1w_adx(T[2], 1); + mul_eltfp25519_1w_adx(T[0], T[0], T[2]); + copy_eltfp25519_1w(T[2], T[0]); + sqrn_eltfp25519_1w_adx(T[2], 5); + mul_eltfp25519_1w_adx(T[0], T[0], T[2]); + copy_eltfp25519_1w(T[2], T[0]); + sqrn_eltfp25519_1w_adx(T[2], 10); + mul_eltfp25519_1w_adx(T[2], T[2], T[0]); + copy_eltfp25519_1w(T[3], T[2]); + sqrn_eltfp25519_1w_adx(T[3], 20); + mul_eltfp25519_1w_adx(T[3], T[3], T[2]); + sqrn_eltfp25519_1w_adx(T[3], 10); + mul_eltfp25519_1w_adx(T[3], T[3], T[0]); + copy_eltfp25519_1w(T[0], T[3]); + sqrn_eltfp25519_1w_adx(T[0], 50); + mul_eltfp25519_1w_adx(T[0], T[0], T[3]); + copy_eltfp25519_1w(T[2], T[0]); + sqrn_eltfp25519_1w_adx(T[2], 100); + mul_eltfp25519_1w_adx(T[2], T[2], T[0]); + sqrn_eltfp25519_1w_adx(T[2], 50); + mul_eltfp25519_1w_adx(T[2], T[2], T[3]); + sqrn_eltfp25519_1w_adx(T[2], 5); + mul_eltfp25519_1w_adx(T[1], T[1], T[2]); + + memzero_explicit(&m, sizeof(m)); +} + +static void inv_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a) +{ + struct { + eltfp25519_1w_buffer buffer; + eltfp25519_1w x0, x1, x2; + } __aligned(32) m; + u64 *T[5]; + + T[0] = m.x0; + T[1] = c; /* x^(-1) */ + T[2] = m.x1; + T[3] = m.x2; + + copy_eltfp25519_1w(T[1], a); + sqrn_eltfp25519_1w_bmi2(T[1], 1); + copy_eltfp25519_1w(T[2], T[1]); + sqrn_eltfp25519_1w_bmi2(T[2], 2); + mul_eltfp25519_1w_bmi2(T[0], a, T[2]); + mul_eltfp25519_1w_bmi2(T[1], T[1], T[0]); + copy_eltfp25519_1w(T[2], T[1]); + sqrn_eltfp25519_1w_bmi2(T[2], 1); + mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]); + copy_eltfp25519_1w(T[2], T[0]); + sqrn_eltfp25519_1w_bmi2(T[2], 5); + mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]); + copy_eltfp25519_1w(T[2], T[0]); + sqrn_eltfp25519_1w_bmi2(T[2], 10); + mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]); + copy_eltfp25519_1w(T[3], T[2]); + sqrn_eltfp25519_1w_bmi2(T[3], 20); + mul_eltfp25519_1w_bmi2(T[3], T[3], T[2]); + sqrn_eltfp25519_1w_bmi2(T[3], 10); + mul_eltfp25519_1w_bmi2(T[3], T[3], T[0]); + copy_eltfp25519_1w(T[0], T[3]); + sqrn_eltfp25519_1w_bmi2(T[0], 50); + mul_eltfp25519_1w_bmi2(T[0], T[0], T[3]); + copy_eltfp25519_1w(T[2], T[0]); + sqrn_eltfp25519_1w_bmi2(T[2], 100); + mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]); + sqrn_eltfp25519_1w_bmi2(T[2], 50); + mul_eltfp25519_1w_bmi2(T[2], T[2], T[3]); + sqrn_eltfp25519_1w_bmi2(T[2], 5); + mul_eltfp25519_1w_bmi2(T[1], T[1], T[2]); + + memzero_explicit(&m, sizeof(m)); +} + +/* Given c, a 256-bit number, fred_eltfp25519_1w updates c + * with a number such that 0 <= C < 2**255-19. + */ +static __always_inline void fred_eltfp25519_1w(u64 *const c) +{ + u64 tmp0 = 38, tmp1 = 19; + asm volatile( + "btrq $63, %3 ;" /* Put bit 255 in carry flag and clear */ + "cmovncl %k5, %k4 ;" /* c[255] ? 38 : 19 */ + + /* Add either 19 or 38 to c */ + "addq %4, %0 ;" + "adcq $0, %1 ;" + "adcq $0, %2 ;" + "adcq $0, %3 ;" + + /* Test for bit 255 again; only triggered on overflow modulo 2^255-19 */ + "movl $0, %k4 ;" + "cmovnsl %k5, %k4 ;" /* c[255] ? 0 : 19 */ + "btrq $63, %3 ;" /* Clear bit 255 */ + + /* Subtract 19 if necessary */ + "subq %4, %0 ;" + "sbbq $0, %1 ;" + "sbbq $0, %2 ;" + "sbbq $0, %3 ;" + + : "+r"(c[0]), "+r"(c[1]), "+r"(c[2]), "+r"(c[3]), "+r"(tmp0), + "+r"(tmp1) + : + : "memory", "cc"); +} + +static __always_inline void cswap(u8 bit, u64 *const px, u64 *const py) +{ + u64 temp; + asm volatile( + "test %9, %9 ;" + "movq %0, %8 ;" + "cmovnzq %4, %0 ;" + "cmovnzq %8, %4 ;" + "movq %1, %8 ;" + "cmovnzq %5, %1 ;" + "cmovnzq %8, %5 ;" + "movq %2, %8 ;" + "cmovnzq %6, %2 ;" + "cmovnzq %8, %6 ;" + "movq %3, %8 ;" + "cmovnzq %7, %3 ;" + "cmovnzq %8, %7 ;" + : "+r"(px[0]), "+r"(px[1]), "+r"(px[2]), "+r"(px[3]), + "+r"(py[0]), "+r"(py[1]), "+r"(py[2]), "+r"(py[3]), + "=r"(temp) + : "r"(bit) + : "cc" + ); +} + +static __always_inline void cselect(u8 bit, u64 *const px, const u64 *const py) +{ + asm volatile( + "test %4, %4 ;" + "cmovnzq %5, %0 ;" + "cmovnzq %6, %1 ;" + "cmovnzq %7, %2 ;" + "cmovnzq %8, %3 ;" + : "+r"(px[0]), "+r"(px[1]), "+r"(px[2]), "+r"(px[3]) + : "r"(bit), "rm"(py[0]), "rm"(py[1]), "rm"(py[2]), "rm"(py[3]) + : "cc" + ); +} + +static void curve25519_adx(u8 shared[CURVE25519_KEY_SIZE], + const u8 private_key[CURVE25519_KEY_SIZE], + const u8 session_key[CURVE25519_KEY_SIZE]) +{ + struct { + u64 buffer[4 * NUM_WORDS_ELTFP25519]; + u64 coordinates[4 * NUM_WORDS_ELTFP25519]; + u64 workspace[6 * NUM_WORDS_ELTFP25519]; + u8 session[CURVE25519_KEY_SIZE]; + u8 private[CURVE25519_KEY_SIZE]; + } __aligned(32) m; + + int i = 0, j = 0; + u64 prev = 0; + u64 *const X1 = (u64 *)m.session; + u64 *const key = (u64 *)m.private; + u64 *const Px = m.coordinates + 0; + u64 *const Pz = m.coordinates + 4; + u64 *const Qx = m.coordinates + 8; + u64 *const Qz = m.coordinates + 12; + u64 *const X2 = Qx; + u64 *const Z2 = Qz; + u64 *const X3 = Px; + u64 *const Z3 = Pz; + u64 *const X2Z2 = Qx; + u64 *const X3Z3 = Px; + + u64 *const A = m.workspace + 0; + u64 *const B = m.workspace + 4; + u64 *const D = m.workspace + 8; + u64 *const C = m.workspace + 12; + u64 *const DA = m.workspace + 16; + u64 *const CB = m.workspace + 20; + u64 *const AB = A; + u64 *const DC = D; + u64 *const DACB = DA; + + memcpy(m.private, private_key, sizeof(m.private)); + memcpy(m.session, session_key, sizeof(m.session)); + + curve25519_clamp_secret(m.private); + + /* As in the draft: + * When receiving such an array, implementations of curve25519 + * MUST mask the most-significant bit in the final byte. This + * is done to preserve compatibility with point formats which + * reserve the sign bit for use in other protocols and to + * increase resistance to implementation fingerprinting + */ + m.session[CURVE25519_KEY_SIZE - 1] &= (1 << (255 % 8)) - 1; + + copy_eltfp25519_1w(Px, X1); + setzero_eltfp25519_1w(Pz); + setzero_eltfp25519_1w(Qx); + setzero_eltfp25519_1w(Qz); + + Pz[0] = 1; + Qx[0] = 1; + + /* main-loop */ + prev = 0; + j = 62; + for (i = 3; i >= 0; --i) { + while (j >= 0) { + u64 bit = (key[i] >> j) & 0x1; + u64 swap = bit ^ prev; + prev = bit; + + add_eltfp25519_1w_adx(A, X2, Z2); /* A = (X2+Z2) */ + sub_eltfp25519_1w(B, X2, Z2); /* B = (X2-Z2) */ + add_eltfp25519_1w_adx(C, X3, Z3); /* C = (X3+Z3) */ + sub_eltfp25519_1w(D, X3, Z3); /* D = (X3-Z3) */ + mul_eltfp25519_2w_adx(DACB, AB, DC); /* [DA|CB] = [A|B]*[D|C] */ + + cselect(swap, A, C); + cselect(swap, B, D); + + sqr_eltfp25519_2w_adx(AB); /* [AA|BB] = [A^2|B^2] */ + add_eltfp25519_1w_adx(X3, DA, CB); /* X3 = (DA+CB) */ + sub_eltfp25519_1w(Z3, DA, CB); /* Z3 = (DA-CB) */ + sqr_eltfp25519_2w_adx(X3Z3); /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */ + + copy_eltfp25519_1w(X2, B); /* X2 = B^2 */ + sub_eltfp25519_1w(Z2, A, B); /* Z2 = E = AA-BB */ + + mul_a24_eltfp25519_1w(B, Z2); /* B = a24*E */ + add_eltfp25519_1w_adx(B, B, X2); /* B = a24*E+B */ + mul_eltfp25519_2w_adx(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */ + mul_eltfp25519_1w_adx(Z3, Z3, X1); /* Z3 = Z3*X1 */ + --j; + } + j = 63; + } + + inv_eltfp25519_1w_adx(A, Qz); + mul_eltfp25519_1w_adx((u64 *)shared, Qx, A); + fred_eltfp25519_1w((u64 *)shared); + + memzero_explicit(&m, sizeof(m)); +} + +static void curve25519_adx_base(u8 session_key[CURVE25519_KEY_SIZE], + const u8 private_key[CURVE25519_KEY_SIZE]) +{ + struct { + u64 buffer[4 * NUM_WORDS_ELTFP25519]; + u64 coordinates[4 * NUM_WORDS_ELTFP25519]; + u64 workspace[4 * NUM_WORDS_ELTFP25519]; + u8 private[CURVE25519_KEY_SIZE]; + } __aligned(32) m; + + const int ite[4] = { 64, 64, 64, 63 }; + const int q = 3; + u64 swap = 1; + + int i = 0, j = 0, k = 0; + u64 *const key = (u64 *)m.private; + u64 *const Ur1 = m.coordinates + 0; + u64 *const Zr1 = m.coordinates + 4; + u64 *const Ur2 = m.coordinates + 8; + u64 *const Zr2 = m.coordinates + 12; + + u64 *const UZr1 = m.coordinates + 0; + u64 *const ZUr2 = m.coordinates + 8; + + u64 *const A = m.workspace + 0; + u64 *const B = m.workspace + 4; + u64 *const C = m.workspace + 8; + u64 *const D = m.workspace + 12; + + u64 *const AB = m.workspace + 0; + u64 *const CD = m.workspace + 8; + + const u64 *const P = table_ladder_8k; + + memcpy(m.private, private_key, sizeof(m.private)); + + curve25519_clamp_secret(m.private); + + setzero_eltfp25519_1w(Ur1); + setzero_eltfp25519_1w(Zr1); + setzero_eltfp25519_1w(Zr2); + Ur1[0] = 1; + Zr1[0] = 1; + Zr2[0] = 1; + + /* G-S */ + Ur2[3] = 0x1eaecdeee27cab34UL; + Ur2[2] = 0xadc7a0b9235d48e2UL; + Ur2[1] = 0xbbf095ae14b2edf8UL; + Ur2[0] = 0x7e94e1fec82faabdUL; + + /* main-loop */ + j = q; + for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) { + while (j < ite[i]) { + u64 bit = (key[i] >> j) & 0x1; + k = (64 * i + j - q); + swap = swap ^ bit; + cswap(swap, Ur1, Ur2); + cswap(swap, Zr1, Zr2); + swap = bit; + /* Addition */ + sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */ + add_eltfp25519_1w_adx(A, Ur1, Zr1); /* A = Ur1+Zr1 */ + mul_eltfp25519_1w_adx(C, &P[4 * k], B); /* C = M0-B */ + sub_eltfp25519_1w(B, A, C); /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */ + add_eltfp25519_1w_adx(A, A, C); /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */ + sqr_eltfp25519_2w_adx(AB); /* A = A^2 | B = B^2 */ + mul_eltfp25519_2w_adx(UZr1, ZUr2, AB); /* Ur1 = Zr2*A | Zr1 = Ur2*B */ + ++j; + } + j = 0; + } + + /* Doubling */ + for (i = 0; i < q; ++i) { + add_eltfp25519_1w_adx(A, Ur1, Zr1); /* A = Ur1+Zr1 */ + sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */ + sqr_eltfp25519_2w_adx(AB); /* A = A**2 B = B**2 */ + copy_eltfp25519_1w(C, B); /* C = B */ + sub_eltfp25519_1w(B, A, B); /* B = A-B */ + mul_a24_eltfp25519_1w(D, B); /* D = my_a24*B */ + add_eltfp25519_1w_adx(D, D, C); /* D = D+C */ + mul_eltfp25519_2w_adx(UZr1, AB, CD); /* Ur1 = A*B Zr1 = Zr1*A */ + } + + /* Convert to affine coordinates */ + inv_eltfp25519_1w_adx(A, Zr1); + mul_eltfp25519_1w_adx((u64 *)session_key, Ur1, A); + fred_eltfp25519_1w((u64 *)session_key); + + memzero_explicit(&m, sizeof(m)); +} + +static void curve25519_bmi2(u8 shared[CURVE25519_KEY_SIZE], + const u8 private_key[CURVE25519_KEY_SIZE], + const u8 session_key[CURVE25519_KEY_SIZE]) +{ + struct { + u64 buffer[4 * NUM_WORDS_ELTFP25519]; + u64 coordinates[4 * NUM_WORDS_ELTFP25519]; + u64 workspace[6 * NUM_WORDS_ELTFP25519]; + u8 session[CURVE25519_KEY_SIZE]; + u8 private[CURVE25519_KEY_SIZE]; + } __aligned(32) m; + + int i = 0, j = 0; + u64 prev = 0; + u64 *const X1 = (u64 *)m.session; + u64 *const key = (u64 *)m.private; + u64 *const Px = m.coordinates + 0; + u64 *const Pz = m.coordinates + 4; + u64 *const Qx = m.coordinates + 8; + u64 *const Qz = m.coordinates + 12; + u64 *const X2 = Qx; + u64 *const Z2 = Qz; + u64 *const X3 = Px; + u64 *const Z3 = Pz; + u64 *const X2Z2 = Qx; + u64 *const X3Z3 = Px; + + u64 *const A = m.workspace + 0; + u64 *const B = m.workspace + 4; + u64 *const D = m.workspace + 8; + u64 *const C = m.workspace + 12; + u64 *const DA = m.workspace + 16; + u64 *const CB = m.workspace + 20; + u64 *const AB = A; + u64 *const DC = D; + u64 *const DACB = DA; + + memcpy(m.private, private_key, sizeof(m.private)); + memcpy(m.session, session_key, sizeof(m.session)); + + curve25519_clamp_secret(m.private); + + /* As in the draft: + * When receiving such an array, implementations of curve25519 + * MUST mask the most-significant bit in the final byte. This + * is done to preserve compatibility with point formats which + * reserve the sign bit for use in other protocols and to + * increase resistance to implementation fingerprinting + */ + m.session[CURVE25519_KEY_SIZE - 1] &= (1 << (255 % 8)) - 1; + + copy_eltfp25519_1w(Px, X1); + setzero_eltfp25519_1w(Pz); + setzero_eltfp25519_1w(Qx); + setzero_eltfp25519_1w(Qz); + + Pz[0] = 1; + Qx[0] = 1; + + /* main-loop */ + prev = 0; + j = 62; + for (i = 3; i >= 0; --i) { + while (j >= 0) { + u64 bit = (key[i] >> j) & 0x1; + u64 swap = bit ^ prev; + prev = bit; + + add_eltfp25519_1w_bmi2(A, X2, Z2); /* A = (X2+Z2) */ + sub_eltfp25519_1w(B, X2, Z2); /* B = (X2-Z2) */ + add_eltfp25519_1w_bmi2(C, X3, Z3); /* C = (X3+Z3) */ + sub_eltfp25519_1w(D, X3, Z3); /* D = (X3-Z3) */ + mul_eltfp25519_2w_bmi2(DACB, AB, DC); /* [DA|CB] = [A|B]*[D|C] */ + + cselect(swap, A, C); + cselect(swap, B, D); + + sqr_eltfp25519_2w_bmi2(AB); /* [AA|BB] = [A^2|B^2] */ + add_eltfp25519_1w_bmi2(X3, DA, CB); /* X3 = (DA+CB) */ + sub_eltfp25519_1w(Z3, DA, CB); /* Z3 = (DA-CB) */ + sqr_eltfp25519_2w_bmi2(X3Z3); /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */ + + copy_eltfp25519_1w(X2, B); /* X2 = B^2 */ + sub_eltfp25519_1w(Z2, A, B); /* Z2 = E = AA-BB */ + + mul_a24_eltfp25519_1w(B, Z2); /* B = a24*E */ + add_eltfp25519_1w_bmi2(B, B, X2); /* B = a24*E+B */ + mul_eltfp25519_2w_bmi2(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */ + mul_eltfp25519_1w_bmi2(Z3, Z3, X1); /* Z3 = Z3*X1 */ + --j; + } + j = 63; + } + + inv_eltfp25519_1w_bmi2(A, Qz); + mul_eltfp25519_1w_bmi2((u64 *)shared, Qx, A); + fred_eltfp25519_1w((u64 *)shared); + + memzero_explicit(&m, sizeof(m)); +} + +static void curve25519_bmi2_base(u8 session_key[CURVE25519_KEY_SIZE], + const u8 private_key[CURVE25519_KEY_SIZE]) +{ + struct { + u64 buffer[4 * NUM_WORDS_ELTFP25519]; + u64 coordinates[4 * NUM_WORDS_ELTFP25519]; + u64 workspace[4 * NUM_WORDS_ELTFP25519]; + u8 private[CURVE25519_KEY_SIZE]; + } __aligned(32) m; + + const int ite[4] = { 64, 64, 64, 63 }; + const int q = 3; + u64 swap = 1; + + int i = 0, j = 0, k = 0; + u64 *const key = (u64 *)m.private; + u64 *const Ur1 = m.coordinates + 0; + u64 *const Zr1 = m.coordinates + 4; + u64 *const Ur2 = m.coordinates + 8; + u64 *const Zr2 = m.coordinates + 12; + + u64 *const UZr1 = m.coordinates + 0; + u64 *const ZUr2 = m.coordinates + 8; + + u64 *const A = m.workspace + 0; + u64 *const B = m.workspace + 4; + u64 *const C = m.workspace + 8; + u64 *const D = m.workspace + 12; + + u64 *const AB = m.workspace + 0; + u64 *const CD = m.workspace + 8; + + const u64 *const P = table_ladder_8k; + + memcpy(m.private, private_key, sizeof(m.private)); + + curve25519_clamp_secret(m.private); + + setzero_eltfp25519_1w(Ur1); + setzero_eltfp25519_1w(Zr1); + setzero_eltfp25519_1w(Zr2); + Ur1[0] = 1; + Zr1[0] = 1; + Zr2[0] = 1; + + /* G-S */ + Ur2[3] = 0x1eaecdeee27cab34UL; + Ur2[2] = 0xadc7a0b9235d48e2UL; + Ur2[1] = 0xbbf095ae14b2edf8UL; + Ur2[0] = 0x7e94e1fec82faabdUL; + + /* main-loop */ + j = q; + for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) { + while (j < ite[i]) { + u64 bit = (key[i] >> j) & 0x1; + k = (64 * i + j - q); + swap = swap ^ bit; + cswap(swap, Ur1, Ur2); + cswap(swap, Zr1, Zr2); + swap = bit; + /* Addition */ + sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */ + add_eltfp25519_1w_bmi2(A, Ur1, Zr1); /* A = Ur1+Zr1 */ + mul_eltfp25519_1w_bmi2(C, &P[4 * k], B);/* C = M0-B */ + sub_eltfp25519_1w(B, A, C); /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */ + add_eltfp25519_1w_bmi2(A, A, C); /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */ + sqr_eltfp25519_2w_bmi2(AB); /* A = A^2 | B = B^2 */ + mul_eltfp25519_2w_bmi2(UZr1, ZUr2, AB); /* Ur1 = Zr2*A | Zr1 = Ur2*B */ + ++j; + } + j = 0; + } + + /* Doubling */ + for (i = 0; i < q; ++i) { + add_eltfp25519_1w_bmi2(A, Ur1, Zr1); /* A = Ur1+Zr1 */ + sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */ + sqr_eltfp25519_2w_bmi2(AB); /* A = A**2 B = B**2 */ + copy_eltfp25519_1w(C, B); /* C = B */ + sub_eltfp25519_1w(B, A, B); /* B = A-B */ + mul_a24_eltfp25519_1w(D, B); /* D = my_a24*B */ + add_eltfp25519_1w_bmi2(D, D, C); /* D = D+C */ + mul_eltfp25519_2w_bmi2(UZr1, AB, CD); /* Ur1 = A*B Zr1 = Zr1*A */ + } + + /* Convert to affine coordinates */ + inv_eltfp25519_1w_bmi2(A, Zr1); + mul_eltfp25519_1w_bmi2((u64 *)session_key, Ur1, A); + fred_eltfp25519_1w((u64 *)session_key); + + memzero_explicit(&m, sizeof(m)); +} + +void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE], + const u8 basepoint[CURVE25519_KEY_SIZE]) +{ + if (static_branch_likely(&curve25519_use_adx)) + curve25519_adx(mypublic, secret, basepoint); + else if (static_branch_likely(&curve25519_use_bmi2)) + curve25519_bmi2(mypublic, secret, basepoint); + else + curve25519_generic(mypublic, secret, basepoint); +} +EXPORT_SYMBOL(curve25519_arch); + +void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE]) +{ + if (static_branch_likely(&curve25519_use_adx)) + curve25519_adx_base(pub, secret); + else if (static_branch_likely(&curve25519_use_bmi2)) + curve25519_bmi2_base(pub, secret); + else + curve25519_generic(pub, secret, curve25519_base_point); +} +EXPORT_SYMBOL(curve25519_base_arch); + +static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf, + unsigned int len) +{ + u8 *secret = kpp_tfm_ctx(tfm); + + if (!len) + curve25519_generate_secret(secret); + else if (len == CURVE25519_KEY_SIZE && + crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE)) + memcpy(secret, buf, CURVE25519_KEY_SIZE); + else + return -EINVAL; + return 0; +} + +static int curve25519_generate_public_key(struct kpp_request *req) +{ + struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); + const u8 *secret = kpp_tfm_ctx(tfm); + u8 buf[CURVE25519_KEY_SIZE]; + int copied, nbytes; + + if (req->src) + return -EINVAL; + + curve25519_base_arch(buf, secret); + + /* might want less than we've got */ + nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len); + copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst, + nbytes), + buf, nbytes); + if (copied != nbytes) + return -EINVAL; + return 0; +} + +static int curve25519_compute_shared_secret(struct kpp_request *req) +{ + struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); + const u8 *secret = kpp_tfm_ctx(tfm); + u8 public_key[CURVE25519_KEY_SIZE]; + u8 buf[CURVE25519_KEY_SIZE]; + int copied, nbytes; + + if (!req->src) + return -EINVAL; + + copied = sg_copy_to_buffer(req->src, + sg_nents_for_len(req->src, + CURVE25519_KEY_SIZE), + public_key, CURVE25519_KEY_SIZE); + if (copied != CURVE25519_KEY_SIZE) + return -EINVAL; + + curve25519_arch(buf, secret, public_key); + + /* might want less than we've got */ + nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len); + copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst, + nbytes), + buf, nbytes); + if (copied != nbytes) + return -EINVAL; + return 0; +} + +static unsigned int curve25519_max_size(struct crypto_kpp *tfm) +{ + return CURVE25519_KEY_SIZE; +} + +static struct kpp_alg curve25519_alg = { + .base.cra_name = "curve25519", + .base.cra_driver_name = "curve25519-x86", + .base.cra_priority = 200, + .base.cra_module = THIS_MODULE, + .base.cra_ctxsize = CURVE25519_KEY_SIZE, + + .set_secret = curve25519_set_secret, + .generate_public_key = curve25519_generate_public_key, + .compute_shared_secret = curve25519_compute_shared_secret, + .max_size = curve25519_max_size, +}; + +static int __init curve25519_mod_init(void) +{ + if (boot_cpu_has(X86_FEATURE_BMI2)) + static_branch_enable(&curve25519_use_bmi2); + else if (boot_cpu_has(X86_FEATURE_ADX)) + static_branch_enable(&curve25519_use_adx); + else + return 0; + return crypto_register_kpp(&curve25519_alg); +} + +static void __exit curve25519_mod_exit(void) +{ + if (boot_cpu_has(X86_FEATURE_BMI2) || + boot_cpu_has(X86_FEATURE_ADX)) + crypto_unregister_kpp(&curve25519_alg); +} + +module_init(curve25519_mod_init); +module_exit(curve25519_mod_exit); + +MODULE_ALIAS_CRYPTO("curve25519"); +MODULE_ALIAS_CRYPTO("curve25519-x86"); +MODULE_LICENSE("GPL v2"); diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index 4a1c05dce950..370cd88068ec 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -7,47 +7,23 @@ #include <crypto/algapi.h> #include <crypto/internal/hash.h> +#include <crypto/internal/poly1305.h> #include <crypto/internal/simd.h> -#include <crypto/poly1305.h> #include <linux/crypto.h> +#include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/module.h> #include <asm/simd.h> -struct poly1305_simd_desc_ctx { - struct poly1305_desc_ctx base; - /* derived key u set? */ - bool uset; -#ifdef CONFIG_AS_AVX2 - /* derived keys r^3, r^4 set? */ - bool wset; -#endif - /* derived Poly1305 key r^2 */ - u32 u[5]; - /* ... silently appended r^3 and r^4 when using AVX2 */ -}; - asmlinkage void poly1305_block_sse2(u32 *h, const u8 *src, const u32 *r, unsigned int blocks); asmlinkage void poly1305_2block_sse2(u32 *h, const u8 *src, const u32 *r, unsigned int blocks, const u32 *u); -#ifdef CONFIG_AS_AVX2 asmlinkage void poly1305_4block_avx2(u32 *h, const u8 *src, const u32 *r, unsigned int blocks, const u32 *u); -static bool poly1305_use_avx2; -#endif - -static int poly1305_simd_init(struct shash_desc *desc) -{ - struct poly1305_simd_desc_ctx *sctx = shash_desc_ctx(desc); - sctx->uset = false; -#ifdef CONFIG_AS_AVX2 - sctx->wset = false; -#endif - - return crypto_poly1305_init(desc); -} +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_simd); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2); static void poly1305_simd_mult(u32 *a, const u32 *b) { @@ -60,72 +36,85 @@ static void poly1305_simd_mult(u32 *a, const u32 *b) poly1305_block_sse2(a, m, b, 1); } +static unsigned int poly1305_scalar_blocks(struct poly1305_desc_ctx *dctx, + const u8 *src, unsigned int srclen) +{ + unsigned int datalen; + + if (unlikely(!dctx->sset)) { + datalen = crypto_poly1305_setdesckey(dctx, src, srclen); + src += srclen - datalen; + srclen = datalen; + } + if (srclen >= POLY1305_BLOCK_SIZE) { + poly1305_core_blocks(&dctx->h, dctx->r, src, + srclen / POLY1305_BLOCK_SIZE, 1); + srclen %= POLY1305_BLOCK_SIZE; + } + return srclen; +} + static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx, const u8 *src, unsigned int srclen) { - struct poly1305_simd_desc_ctx *sctx; unsigned int blocks, datalen; - BUILD_BUG_ON(offsetof(struct poly1305_simd_desc_ctx, base)); - sctx = container_of(dctx, struct poly1305_simd_desc_ctx, base); - if (unlikely(!dctx->sset)) { datalen = crypto_poly1305_setdesckey(dctx, src, srclen); src += srclen - datalen; srclen = datalen; } -#ifdef CONFIG_AS_AVX2 - if (poly1305_use_avx2 && srclen >= POLY1305_BLOCK_SIZE * 4) { - if (unlikely(!sctx->wset)) { - if (!sctx->uset) { - memcpy(sctx->u, dctx->r.r, sizeof(sctx->u)); - poly1305_simd_mult(sctx->u, dctx->r.r); - sctx->uset = true; + if (IS_ENABLED(CONFIG_AS_AVX2) && + static_branch_likely(&poly1305_use_avx2) && + srclen >= POLY1305_BLOCK_SIZE * 4) { + if (unlikely(dctx->rset < 4)) { + if (dctx->rset < 2) { + dctx->r[1] = dctx->r[0]; + poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r); } - memcpy(sctx->u + 5, sctx->u, sizeof(sctx->u)); - poly1305_simd_mult(sctx->u + 5, dctx->r.r); - memcpy(sctx->u + 10, sctx->u + 5, sizeof(sctx->u)); - poly1305_simd_mult(sctx->u + 10, dctx->r.r); - sctx->wset = true; + dctx->r[2] = dctx->r[1]; + poly1305_simd_mult(dctx->r[2].r, dctx->r[0].r); + dctx->r[3] = dctx->r[2]; + poly1305_simd_mult(dctx->r[3].r, dctx->r[0].r); + dctx->rset = 4; } blocks = srclen / (POLY1305_BLOCK_SIZE * 4); - poly1305_4block_avx2(dctx->h.h, src, dctx->r.r, blocks, - sctx->u); + poly1305_4block_avx2(dctx->h.h, src, dctx->r[0].r, blocks, + dctx->r[1].r); src += POLY1305_BLOCK_SIZE * 4 * blocks; srclen -= POLY1305_BLOCK_SIZE * 4 * blocks; } -#endif + if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) { - if (unlikely(!sctx->uset)) { - memcpy(sctx->u, dctx->r.r, sizeof(sctx->u)); - poly1305_simd_mult(sctx->u, dctx->r.r); - sctx->uset = true; + if (unlikely(dctx->rset < 2)) { + dctx->r[1] = dctx->r[0]; + poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r); + dctx->rset = 2; } blocks = srclen / (POLY1305_BLOCK_SIZE * 2); - poly1305_2block_sse2(dctx->h.h, src, dctx->r.r, blocks, - sctx->u); + poly1305_2block_sse2(dctx->h.h, src, dctx->r[0].r, + blocks, dctx->r[1].r); src += POLY1305_BLOCK_SIZE * 2 * blocks; srclen -= POLY1305_BLOCK_SIZE * 2 * blocks; } if (srclen >= POLY1305_BLOCK_SIZE) { - poly1305_block_sse2(dctx->h.h, src, dctx->r.r, 1); + poly1305_block_sse2(dctx->h.h, src, dctx->r[0].r, 1); srclen -= POLY1305_BLOCK_SIZE; } return srclen; } -static int poly1305_simd_update(struct shash_desc *desc, - const u8 *src, unsigned int srclen) +void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key) { - struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); - unsigned int bytes; - - /* kernel_fpu_begin/end is costly, use fallback for small updates */ - if (srclen <= 288 || !crypto_simd_usable()) - return crypto_poly1305_update(desc, src, srclen); + poly1305_init_generic(desc, key); +} +EXPORT_SYMBOL(poly1305_init_arch); - kernel_fpu_begin(); +void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src, + unsigned int srclen) +{ + unsigned int bytes; if (unlikely(dctx->buflen)) { bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen); @@ -135,34 +124,84 @@ static int poly1305_simd_update(struct shash_desc *desc, dctx->buflen += bytes; if (dctx->buflen == POLY1305_BLOCK_SIZE) { - poly1305_simd_blocks(dctx, dctx->buf, - POLY1305_BLOCK_SIZE); + if (static_branch_likely(&poly1305_use_simd) && + likely(crypto_simd_usable())) { + kernel_fpu_begin(); + poly1305_simd_blocks(dctx, dctx->buf, + POLY1305_BLOCK_SIZE); + kernel_fpu_end(); + } else { + poly1305_scalar_blocks(dctx, dctx->buf, + POLY1305_BLOCK_SIZE); + } dctx->buflen = 0; } } if (likely(srclen >= POLY1305_BLOCK_SIZE)) { - bytes = poly1305_simd_blocks(dctx, src, srclen); + if (static_branch_likely(&poly1305_use_simd) && + likely(crypto_simd_usable())) { + kernel_fpu_begin(); + bytes = poly1305_simd_blocks(dctx, src, srclen); + kernel_fpu_end(); + } else { + bytes = poly1305_scalar_blocks(dctx, src, srclen); + } src += srclen - bytes; srclen = bytes; } - kernel_fpu_end(); - if (unlikely(srclen)) { dctx->buflen = srclen; memcpy(dctx->buf, src, srclen); } +} +EXPORT_SYMBOL(poly1305_update_arch); + +void poly1305_final_arch(struct poly1305_desc_ctx *desc, u8 *digest) +{ + poly1305_final_generic(desc, digest); +} +EXPORT_SYMBOL(poly1305_final_arch); + +static int crypto_poly1305_init(struct shash_desc *desc) +{ + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); + + poly1305_core_init(&dctx->h); + dctx->buflen = 0; + dctx->rset = 0; + dctx->sset = false; + + return 0; +} + +static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst) +{ + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); + + if (unlikely(!dctx->sset)) + return -ENOKEY; + + poly1305_final_generic(dctx, dst); + return 0; +} + +static int poly1305_simd_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); + poly1305_update_arch(dctx, src, srclen); return 0; } static struct shash_alg alg = { .digestsize = POLY1305_DIGEST_SIZE, - .init = poly1305_simd_init, + .init = crypto_poly1305_init, .update = poly1305_simd_update, .final = crypto_poly1305_final, - .descsize = sizeof(struct poly1305_simd_desc_ctx), + .descsize = sizeof(struct poly1305_desc_ctx), .base = { .cra_name = "poly1305", .cra_driver_name = "poly1305-simd", @@ -175,16 +214,16 @@ static struct shash_alg alg = { static int __init poly1305_simd_mod_init(void) { if (!boot_cpu_has(X86_FEATURE_XMM2)) - return -ENODEV; - -#ifdef CONFIG_AS_AVX2 - poly1305_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) && - boot_cpu_has(X86_FEATURE_AVX2) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); - alg.descsize = sizeof(struct poly1305_simd_desc_ctx); - if (poly1305_use_avx2) - alg.descsize += 10 * sizeof(u32); -#endif + return 0; + + static_branch_enable(&poly1305_use_simd); + + if (IS_ENABLED(CONFIG_AS_AVX2) && + boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) + static_branch_enable(&poly1305_use_avx2); + return crypto_register_shash(&alg); } diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index e7d35f60d53f..64c3e70b0556 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -5,12 +5,14 @@ #include <linux/init.h> #include <linux/slab.h> #include <linux/delay.h> +#include <linux/jiffies.h> #include <asm/apicdef.h> #include <asm/nmi.h> #include "../perf_event.h" -static DEFINE_PER_CPU(unsigned int, perf_nmi_counter); +static DEFINE_PER_CPU(unsigned long, perf_nmi_tstamp); +static unsigned long perf_nmi_window; static __initconst const u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] @@ -641,11 +643,12 @@ static void amd_pmu_disable_event(struct perf_event *event) * handler when multiple PMCs are active or PMC overflow while handling some * other source of an NMI. * - * Attempt to mitigate this by using the number of active PMCs to determine - * whether to return NMI_HANDLED if the perf NMI handler did not handle/reset - * any PMCs. The per-CPU perf_nmi_counter variable is set to a minimum of the - * number of active PMCs or 2. The value of 2 is used in case an NMI does not - * arrive at the LAPIC in time to be collapsed into an already pending NMI. + * Attempt to mitigate this by creating an NMI window in which un-handled NMIs + * received during this window will be claimed. This prevents extending the + * window past when it is possible that latent NMIs should be received. The + * per-CPU perf_nmi_tstamp will be set to the window end time whenever perf has + * handled a counter. When an un-handled NMI is received, it will be claimed + * only if arriving within that window. */ static int amd_pmu_handle_irq(struct pt_regs *regs) { @@ -663,21 +666,19 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) handled = x86_pmu_handle_irq(regs); /* - * If a counter was handled, record the number of possible remaining - * NMIs that can occur. + * If a counter was handled, record a timestamp such that un-handled + * NMIs will be claimed if arriving within that window. */ if (handled) { - this_cpu_write(perf_nmi_counter, - min_t(unsigned int, 2, active)); + this_cpu_write(perf_nmi_tstamp, + jiffies + perf_nmi_window); return handled; } - if (!this_cpu_read(perf_nmi_counter)) + if (time_after(jiffies, this_cpu_read(perf_nmi_tstamp))) return NMI_DONE; - this_cpu_dec(perf_nmi_counter); - return NMI_HANDLED; } @@ -909,6 +910,9 @@ static int __init amd_core_pmu_init(void) if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) return 0; + /* Avoid calulating the value each time in the NMI handler */ + perf_nmi_window = msecs_to_jiffies(100); + switch (boot_cpu_data.x86) { case 0x15: pr_cont("Fam15h "); diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 5b35b7ea5d72..26c36357c4c9 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -377,7 +377,8 @@ static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs, struct hw_perf_event *hwc, u64 config) { config &= ~perf_ibs->cnt_mask; - wrmsrl(hwc->config_base, config); + if (boot_cpu_data.x86 == 0x10) + wrmsrl(hwc->config_base, config); config &= ~perf_ibs->enable_mask; wrmsrl(hwc->config_base, config); } @@ -553,7 +554,8 @@ static struct perf_ibs perf_ibs_op = { }, .msr = MSR_AMD64_IBSOPCTL, .config_mask = IBS_OP_CONFIG_MASK, - .cnt_mask = IBS_OP_MAX_CNT, + .cnt_mask = IBS_OP_MAX_CNT | IBS_OP_CUR_CNT | + IBS_OP_CUR_CNT_RAND, .enable_mask = IBS_OP_ENABLE, .valid_mask = IBS_OP_VAL, .max_period = IBS_OP_MAX_CNT << 4, @@ -614,7 +616,7 @@ fail: if (event->attr.sample_type & PERF_SAMPLE_RAW) offset_max = perf_ibs->offset_max; else if (check_rip) - offset_max = 2; + offset_max = 3; else offset_max = 1; do { diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 27ee47a7be66..937363b803c1 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3323,8 +3323,19 @@ static int intel_pmu_hw_config(struct perf_event *event) return 0; } +#ifdef CONFIG_RETPOLINE +static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr); +static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr); +#endif + struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr) { +#ifdef CONFIG_RETPOLINE + if (x86_pmu.guest_get_msrs == intel_guest_get_msrs) + return intel_guest_get_msrs(nr); + else if (x86_pmu.guest_get_msrs == core_guest_get_msrs) + return core_guest_get_msrs(nr); +#endif if (x86_pmu.guest_get_msrs) return x86_pmu.guest_get_msrs(nr); *nr = 0; @@ -4983,6 +4994,8 @@ __init int intel_pmu_init(void) case INTEL_FAM6_SKYLAKE: case INTEL_FAM6_KABYLAKE_L: case INTEL_FAM6_KABYLAKE: + case INTEL_FAM6_COMETLAKE_L: + case INTEL_FAM6_COMETLAKE: x86_add_quirk(intel_pebs_isolation_quirk); x86_pmu.late_ack = true; memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -5031,6 +5044,8 @@ __init int intel_pmu_init(void) /* fall through */ case INTEL_FAM6_ICELAKE_L: case INTEL_FAM6_ICELAKE: + case INTEL_FAM6_TIGERLAKE_L: + case INTEL_FAM6_TIGERLAKE: x86_pmu.late_ack = true; memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 9f2f39003d96..e1daf4151e11 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -45,46 +45,49 @@ * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter * perf code: 0x01 * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,GLM, - CNL + * CNL,KBL,CML * Scope: Core * MSR_CORE_C6_RESIDENCY: CORE C6 Residency Counter * perf code: 0x02 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, - * SKL,KNL,GLM,CNL + * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL * Scope: Core * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter * perf code: 0x03 - * Available model: SNB,IVB,HSW,BDW,SKL,CNL + * Available model: SNB,IVB,HSW,BDW,SKL,CNL,KBL,CML, + * ICL,TGL * Scope: Core * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter. * perf code: 0x00 - * Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL + * Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL, + * KBL,CML,ICL,TGL * Scope: Package (physical package) * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. * perf code: 0x01 * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL, - * GLM,CNL + * GLM,CNL,KBL,CML,ICL,TGL * Scope: Package (physical package) * MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter. * perf code: 0x02 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW - * SKL,KNL,GLM,CNL + * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL * Scope: Package (physical package) * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. * perf code: 0x03 - * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,CNL + * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,CNL, + * KBL,CML,ICL,TGL * Scope: Package (physical package) * MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter. * perf code: 0x04 - * Available model: HSW ULT,KBL,CNL + * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL * Scope: Package (physical package) * MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter. * perf code: 0x05 - * Available model: HSW ULT,KBL,CNL + * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL * Scope: Package (physical package) * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter. * perf code: 0x06 - * Available model: HSW ULT,KBL,GLM,CNL + * Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL * Scope: Package (physical package) * */ @@ -544,6 +547,19 @@ static const struct cstate_model cnl_cstates __initconst = { BIT(PERF_CSTATE_PKG_C10_RES), }; +static const struct cstate_model icl_cstates __initconst = { + .core_events = BIT(PERF_CSTATE_CORE_C6_RES) | + BIT(PERF_CSTATE_CORE_C7_RES), + + .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) | + BIT(PERF_CSTATE_PKG_C3_RES) | + BIT(PERF_CSTATE_PKG_C6_RES) | + BIT(PERF_CSTATE_PKG_C7_RES) | + BIT(PERF_CSTATE_PKG_C8_RES) | + BIT(PERF_CSTATE_PKG_C9_RES) | + BIT(PERF_CSTATE_PKG_C10_RES), +}; + static const struct cstate_model slm_cstates __initconst = { .core_events = BIT(PERF_CSTATE_CORE_C1_RES) | BIT(PERF_CSTATE_CORE_C6_RES), @@ -614,6 +630,8 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_L, hswult_cstates), X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE, hswult_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_COMETLAKE_L, hswult_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_COMETLAKE, hswult_cstates), X86_CSTATES_MODEL(INTEL_FAM6_CANNONLAKE_L, cnl_cstates), @@ -625,8 +643,10 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_PLUS, glm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE_L, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE_L, icl_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE, icl_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_TIGERLAKE_L, icl_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_TIGERLAKE, icl_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 74e80ed9c6c4..05e43d0f430b 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -627,7 +627,7 @@ static struct topa *topa_alloc(int cpu, gfp_t gfp) * link as the 2nd entry in the table */ if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) { - TOPA_ENTRY(&tp->topa, 1)->base = page_to_phys(p); + TOPA_ENTRY(&tp->topa, 1)->base = page_to_phys(p) >> TOPA_SHIFT; TOPA_ENTRY(&tp->topa, 1)->end = 1; } diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 6fc2e06ab4c6..86467f85c383 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -502,10 +502,8 @@ void uncore_pmu_event_start(struct perf_event *event, int flags) local64_set(&event->hw.prev_count, uncore_read_counter(box, event)); uncore_enable_event(box, event); - if (box->n_active == 1) { - uncore_enable_box(box); + if (box->n_active == 1) uncore_pmu_start_hrtimer(box); - } } void uncore_pmu_event_stop(struct perf_event *event, int flags) @@ -529,10 +527,8 @@ void uncore_pmu_event_stop(struct perf_event *event, int flags) WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); hwc->state |= PERF_HES_STOPPED; - if (box->n_active == 0) { - uncore_disable_box(box); + if (box->n_active == 0) uncore_pmu_cancel_hrtimer(box); - } } if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { @@ -778,6 +774,40 @@ static int uncore_pmu_event_init(struct perf_event *event) return ret; } +static void uncore_pmu_enable(struct pmu *pmu) +{ + struct intel_uncore_pmu *uncore_pmu; + struct intel_uncore_box *box; + + uncore_pmu = container_of(pmu, struct intel_uncore_pmu, pmu); + if (!uncore_pmu) + return; + + box = uncore_pmu_to_box(uncore_pmu, smp_processor_id()); + if (!box) + return; + + if (uncore_pmu->type->ops->enable_box) + uncore_pmu->type->ops->enable_box(box); +} + +static void uncore_pmu_disable(struct pmu *pmu) +{ + struct intel_uncore_pmu *uncore_pmu; + struct intel_uncore_box *box; + + uncore_pmu = container_of(pmu, struct intel_uncore_pmu, pmu); + if (!uncore_pmu) + return; + + box = uncore_pmu_to_box(uncore_pmu, smp_processor_id()); + if (!box) + return; + + if (uncore_pmu->type->ops->disable_box) + uncore_pmu->type->ops->disable_box(box); +} + static ssize_t uncore_get_attr_cpumask(struct device *dev, struct device_attribute *attr, char *buf) { @@ -803,6 +833,8 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu) pmu->pmu = (struct pmu) { .attr_groups = pmu->type->attr_groups, .task_ctx_nr = perf_invalid_context, + .pmu_enable = uncore_pmu_enable, + .pmu_disable = uncore_pmu_disable, .event_init = uncore_pmu_event_init, .add = uncore_pmu_event_add, .del = uncore_pmu_event_del, diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index f36f7bebbc1b..bbfdaa720b45 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -441,18 +441,6 @@ static inline int uncore_freerunning_hw_config(struct intel_uncore_box *box, return -EINVAL; } -static inline void uncore_disable_box(struct intel_uncore_box *box) -{ - if (box->pmu->type->ops->disable_box) - box->pmu->type->ops->disable_box(box); -} - -static inline void uncore_enable_box(struct intel_uncore_box *box) -{ - if (box->pmu->type->ops->enable_box) - box->pmu->type->ops->enable_box(box); -} - static inline void uncore_disable_event(struct intel_uncore_box *box, struct perf_event *event) { diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index b1afc77f0704..6f86650b3f77 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -89,7 +89,14 @@ static bool test_intel(int idx, void *data) case INTEL_FAM6_SKYLAKE_X: case INTEL_FAM6_KABYLAKE_L: case INTEL_FAM6_KABYLAKE: + case INTEL_FAM6_COMETLAKE_L: + case INTEL_FAM6_COMETLAKE: case INTEL_FAM6_ICELAKE_L: + case INTEL_FAM6_ICELAKE: + case INTEL_FAM6_ICELAKE_X: + case INTEL_FAM6_ICELAKE_D: + case INTEL_FAM6_TIGERLAKE_L: + case INTEL_FAM6_TIGERLAKE: if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF) return true; break; diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index 5c056b8aebef..e01078e93dd3 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -260,11 +260,21 @@ void __init hv_apic_init(void) } if (ms_hyperv.hints & HV_X64_APIC_ACCESS_RECOMMENDED) { - pr_info("Hyper-V: Using MSR based APIC access\n"); + pr_info("Hyper-V: Using enlightened APIC (%s mode)", + x2apic_enabled() ? "x2apic" : "xapic"); + /* + * With x2apic, architectural x2apic MSRs are equivalent to the + * respective synthetic MSRs, so there's no need to override + * the apic accessors. The only exception is + * hv_apic_eoi_write, because it benefits from lazy EOI when + * available, but it works for both xapic and x2apic modes. + */ apic_set_eoi_write(hv_apic_eoi_write); - apic->read = hv_apic_read; - apic->write = hv_apic_write; - apic->icr_write = hv_apic_icr_write; - apic->icr_read = hv_apic_icr_read; + if (!x2apic_enabled()) { + apic->read = hv_apic_read; + apic->write = hv_apic_write; + apic->icr_write = hv_apic_icr_write; + apic->icr_read = hv_apic_icr_read; + } } } diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index cff3f3f3bfe0..8348f7d69fd5 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_CPU_ENTRY_AREA_H #define _ASM_X86_CPU_ENTRY_AREA_H diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 0652d3eed9bd..c4fbe379cc0b 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -399,5 +399,7 @@ #define X86_BUG_MDS X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */ #define X86_BUG_MSBDS_ONLY X86_BUG(20) /* CPU is only affected by the MSDBS variant of BUG_MDS */ #define X86_BUG_SWAPGS X86_BUG(21) /* CPU is affected by speculation through SWAPGS */ +#define X86_BUG_TAA X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */ +#define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index f04622500da3..c606c0b70738 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -83,6 +83,9 @@ #define INTEL_FAM6_TIGERLAKE_L 0x8C #define INTEL_FAM6_TIGERLAKE 0x8D +#define INTEL_FAM6_COMETLAKE 0xA5 +#define INTEL_FAM6_COMETLAKE_L 0xA6 + /* "Small Core" Processors (Atom) */ #define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 23edf56cf577..b79cd6aa4075 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -156,10 +156,8 @@ enum kvm_reg { VCPU_REGS_R15 = __VCPU_REGS_R15, #endif VCPU_REGS_RIP, - NR_VCPU_REGS -}; + NR_VCPU_REGS, -enum kvm_reg_ex { VCPU_EXREG_PDPTR = NR_VCPU_REGS, VCPU_EXREG_CR3, VCPU_EXREG_RFLAGS, @@ -219,13 +217,6 @@ enum { PFERR_WRITE_MASK | \ PFERR_PRESENT_MASK) -/* - * The mask used to denote special SPTEs, which can be either MMIO SPTEs or - * Access Tracking SPTEs. We use bit 62 instead of bit 63 to avoid conflicting - * with the SVE bit in EPT PTEs. - */ -#define SPTE_SPECIAL_MASK (1ULL << 62) - /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 /* @@ -319,9 +310,12 @@ struct kvm_rmap_head { struct kvm_mmu_page { struct list_head link; struct hlist_node hash_link; + struct list_head lpage_disallowed_link; + bool unsync; u8 mmu_valid_gen; bool mmio_cached; + bool lpage_disallowed; /* Can't be replaced by an equiv large page */ /* * The following two entries are used to key the shadow page in the @@ -458,6 +452,11 @@ struct kvm_pmc { u64 eventsel; struct perf_event *perf_event; struct kvm_vcpu *vcpu; + /* + * eventsel value for general purpose counters, + * ctrl value for fixed counters. + */ + u64 current_config; }; struct kvm_pmu { @@ -476,7 +475,21 @@ struct kvm_pmu { struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC]; struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED]; struct irq_work irq_work; - u64 reprogram_pmi; + DECLARE_BITMAP(reprogram_pmi, X86_PMC_IDX_MAX); + DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX); + DECLARE_BITMAP(pmc_in_use, X86_PMC_IDX_MAX); + + /* + * The gate to release perf_events not marked in + * pmc_in_use only once in a vcpu time slice. + */ + bool need_cleanup; + + /* + * The total number of programmed perf_events and it helps to avoid + * redundant check before cleanup if guest don't use vPMU at all. + */ + u8 event_count; }; struct kvm_pmu_ops; @@ -569,6 +582,7 @@ struct kvm_vcpu_arch { u64 smbase; u64 smi_count; bool tpr_access_reporting; + bool xsaves_enabled; u64 ia32_xss; u64 microcode_version; u64 arch_capabilities; @@ -866,6 +880,7 @@ struct kvm_arch { */ struct list_head active_mmu_pages; struct list_head zapped_obsolete_pages; + struct list_head lpage_disallowed_mmu_pages; struct kvm_page_track_notifier_node mmu_sp_tracker; struct kvm_page_track_notifier_head track_notifier_head; @@ -940,6 +955,7 @@ struct kvm_arch { bool exception_payload_enabled; struct kvm_pmu_event_filter *pmu_event_filter; + struct task_struct *nx_lpage_recovery_thread; }; struct kvm_vm_stat { @@ -953,6 +969,7 @@ struct kvm_vm_stat { ulong mmu_unsync; ulong remote_tlb_flush; ulong lpages; + ulong nx_lpage_splits; ulong max_mmu_page_hash_collisions; }; @@ -1042,7 +1059,6 @@ struct kvm_x86_ops { struct kvm_segment *var, int seg); void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu); - void (*decache_cr3)(struct kvm_vcpu *vcpu); void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); @@ -1091,7 +1107,7 @@ struct kvm_x86_ops { void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); - bool (*get_enable_apicv)(struct kvm_vcpu *vcpu); + bool (*get_enable_apicv)(struct kvm *kvm); void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); @@ -1196,7 +1212,7 @@ struct kvm_x86_ops { int (*set_nested_state)(struct kvm_vcpu *vcpu, struct kvm_nested_state __user *user_kvm_nested_state, struct kvm_nested_state *kvm_state); - void (*get_vmcs12_pages)(struct kvm_vcpu *vcpu); + bool (*get_vmcs12_pages)(struct kvm_vcpu *vcpu); int (*smi_allowed)(struct kvm_vcpu *vcpu); int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate); @@ -1358,6 +1374,7 @@ int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, void kvm_enable_efer_bits(u64); bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); +int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated); int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data); int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data); int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu); @@ -1578,6 +1595,8 @@ bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); void kvm_make_mclock_inprogress_request(struct kvm *kvm); void kvm_make_scan_ioapic_request(struct kvm *kvm); +void kvm_make_scan_ioapic_request_mask(struct kvm *kvm, + unsigned long *vcpu_bitmap); void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 20ce682a2540..084e98da04a7 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -93,6 +93,18 @@ * Microarchitectural Data * Sampling (MDS) vulnerabilities. */ +#define ARCH_CAP_PSCHANGE_MC_NO BIT(6) /* + * The processor is not susceptible to a + * machine check error due to modifying the + * code page size along with either the + * physical address or cache type + * without TLB invalidation. + */ +#define ARCH_CAP_TSX_CTRL_MSR BIT(7) /* MSR for TSX control is available. */ +#define ARCH_CAP_TAA_NO BIT(8) /* + * Not susceptible to + * TSX Async Abort (TAA) vulnerabilities. + */ #define MSR_IA32_FLUSH_CMD 0x0000010b #define L1D_FLUSH BIT(0) /* @@ -103,6 +115,10 @@ #define MSR_IA32_BBL_CR_CTL 0x00000119 #define MSR_IA32_BBL_CR_CTL3 0x0000011e +#define MSR_IA32_TSX_CTRL 0x00000122 +#define TSX_CTRL_RTM_DISABLE BIT(0) /* Disable RTM feature */ +#define TSX_CTRL_CPUID_CLEAR BIT(1) /* Disable TSX enumeration */ + #define MSR_IA32_SYSENTER_CS 0x00000174 #define MSR_IA32_SYSENTER_ESP 0x00000175 #define MSR_IA32_SYSENTER_EIP 0x00000176 @@ -393,6 +409,8 @@ #define MSR_AMD_PSTATE_DEF_BASE 0xc0010064 #define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 #define MSR_AMD64_OSVW_STATUS 0xc0010141 +#define MSR_AMD_PPIN_CTL 0xc00102f0 +#define MSR_AMD_PPIN 0xc00102f1 #define MSR_AMD64_LS_CFG 0xc0011020 #define MSR_AMD64_DC_CFG 0xc0011022 #define MSR_AMD64_BU_CFG2 0xc001102a diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index e28f8b723b5c..9d5252c9685c 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -21,7 +21,7 @@ #define MWAIT_ECX_INTERRUPT_BREAK 0x1 #define MWAITX_ECX_TIMER_ENABLE BIT(1) #define MWAITX_MAX_LOOPS ((u32)-1) -#define MWAITX_DISABLE_CSTATES 0xf +#define MWAITX_DISABLE_CSTATES 0xf0 static inline void __monitor(const void *eax, unsigned long ecx, unsigned long edx) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 80bc209c0708..5c24a7b35166 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -314,7 +314,7 @@ DECLARE_STATIC_KEY_FALSE(mds_idle_clear); #include <asm/segment.h> /** - * mds_clear_cpu_buffers - Mitigation for MDS vulnerability + * mds_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability * * This uses the otherwise unused and obsolete VERW instruction in * combination with microcode which triggers a CPU buffer flush when the @@ -337,7 +337,7 @@ static inline void mds_clear_cpu_buffers(void) } /** - * mds_user_clear_cpu_buffers - Mitigation for MDS vulnerability + * mds_user_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability * * Clear CPU buffers if the corresponding static key is enabled */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 0bc530c4eb13..ad97dc155195 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1463,6 +1463,12 @@ static inline bool arch_has_pfn_modify_check(void) return boot_cpu_has_bug(X86_BUG_L1TF); } +#define arch_faults_on_old_pte arch_faults_on_old_pte +static inline bool arch_faults_on_old_pte(void) +{ + return false; +} + #include <asm-generic/pgtable.h> #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 6e0a3b43d027..54f5d54280f6 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -988,4 +988,11 @@ enum mds_mitigations { MDS_MITIGATION_VMWERV, }; +enum taa_mitigations { + TAA_MITIGATION_OFF, + TAA_MITIGATION_UCODE_NEEDED, + TAA_MITIGATION_VERW, + TAA_MITIGATION_TSX_DISABLED, +}; + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h index 5df09a0b80b8..07375b476c4f 100644 --- a/arch/x86/include/asm/pti.h +++ b/arch/x86/include/asm/pti.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PTI_H #define _ASM_X86_PTI_H #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 5e8319bb207a..23c626a742e8 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -26,10 +26,11 @@ static inline void apply_paravirt(struct paravirt_patch_site *start, #define POKE_MAX_OPCODE_SIZE 5 struct text_poke_loc { - void *detour; void *addr; - size_t len; - const char opcode[POKE_MAX_OPCODE_SIZE]; + int len; + s32 rel32; + u8 opcode; + const u8 text[POKE_MAX_OPCODE_SIZE]; }; extern void text_poke_early(void *addr, const void *opcode, size_t len); @@ -51,8 +52,10 @@ extern void text_poke_early(void *addr, const void *opcode, size_t len); extern void *text_poke(void *addr, const void *opcode, size_t len); extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); extern int poke_int3_handler(struct pt_regs *regs); -extern void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); +extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate); extern void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries); +extern void text_poke_loc_init(struct text_poke_loc *tp, void *addr, + const void *opcode, size_t len, const void *emulate); extern int after_bootmem; extern __ro_after_init struct mm_struct *poking_mm; extern __ro_after_init unsigned long poking_addr; @@ -63,8 +66,17 @@ static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip) regs->ip = ip; } -#define INT3_INSN_SIZE 1 -#define CALL_INSN_SIZE 5 +#define INT3_INSN_SIZE 1 +#define INT3_INSN_OPCODE 0xCC + +#define CALL_INSN_SIZE 5 +#define CALL_INSN_OPCODE 0xE8 + +#define JMP32_INSN_SIZE 5 +#define JMP32_INSN_OPCODE 0xE9 + +#define JMP8_INSN_SIZE 2 +#define JMP8_INSN_OPCODE 0xEB static inline void int3_emulate_push(struct pt_regs *regs, unsigned long val) { diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 35c225ede0e4..61d93f062a36 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -734,5 +734,28 @@ do { \ if (unlikely(__gu_err)) goto err_label; \ } while (0) +/* + * We want the unsafe accessors to always be inlined and use + * the error labels - thus the macro games. + */ +#define unsafe_copy_loop(dst, src, len, type, label) \ + while (len >= sizeof(type)) { \ + unsafe_put_user(*(type *)src,(type __user *)dst,label); \ + dst += sizeof(type); \ + src += sizeof(type); \ + len -= sizeof(type); \ + } + +#define unsafe_copy_to_user(_dst,_src,_len,label) \ +do { \ + char __user *__ucu_dst = (_dst); \ + const char *__ucu_src = (_src); \ + size_t __ucu_len = (_len); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u64, label); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u32, label); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u16, label); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, label); \ +} while (0) + #endif /* _ASM_X86_UACCESS_H */ diff --git a/arch/x86/include/asm/vmware.h b/arch/x86/include/asm/vmware.h index e00c9e875933..ac9fc51e2b18 100644 --- a/arch/x86/include/asm/vmware.h +++ b/arch/x86/include/asm/vmware.h @@ -4,6 +4,7 @@ #include <asm/cpufeatures.h> #include <asm/alternative.h> +#include <linux/stringify.h> /* * The hypercall definitions differ in the low word of the %edx argument @@ -20,8 +21,8 @@ */ /* Old port-based version */ -#define VMWARE_HYPERVISOR_PORT "0x5658" -#define VMWARE_HYPERVISOR_PORT_HB "0x5659" +#define VMWARE_HYPERVISOR_PORT 0x5658 +#define VMWARE_HYPERVISOR_PORT_HB 0x5659 /* Current vmcall / vmmcall version */ #define VMWARE_HYPERVISOR_HB BIT(0) @@ -29,7 +30,8 @@ /* The low bandwidth call. The low word of edx is presumed clear. */ #define VMWARE_HYPERCALL \ - ALTERNATIVE_2("movw $" VMWARE_HYPERVISOR_PORT ", %%dx; inl (%%dx)", \ + ALTERNATIVE_2("movw $" __stringify(VMWARE_HYPERVISOR_PORT) ", %%dx; " \ + "inl (%%dx), %%eax", \ "vmcall", X86_FEATURE_VMCALL, \ "vmmcall", X86_FEATURE_VMW_VMMCALL) @@ -38,7 +40,8 @@ * HB and OUT bits set. */ #define VMWARE_HYPERCALL_HB_OUT \ - ALTERNATIVE_2("movw $" VMWARE_HYPERVISOR_PORT_HB ", %%dx; rep outsb", \ + ALTERNATIVE_2("movw $" __stringify(VMWARE_HYPERVISOR_PORT_HB) ", %%dx; " \ + "rep outsb", \ "vmcall", X86_FEATURE_VMCALL, \ "vmmcall", X86_FEATURE_VMW_VMMCALL) @@ -47,7 +50,8 @@ * HB bit set. */ #define VMWARE_HYPERCALL_HB_IN \ - ALTERNATIVE_2("movw $" VMWARE_HYPERVISOR_PORT_HB ", %%dx; rep insb", \ + ALTERNATIVE_2("movw $" __stringify(VMWARE_HYPERVISOR_PORT_HB) ", %%dx; " \ + "rep insb", \ "vmcall", X86_FEATURE_VMCALL, \ "vmmcall", X86_FEATURE_VMW_VMMCALL) #endif diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 9d3a971ea364..9ec463fe96f2 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -956,16 +956,15 @@ NOKPROBE_SYMBOL(patch_cmp); int poke_int3_handler(struct pt_regs *regs) { struct text_poke_loc *tp; - unsigned char int3 = 0xcc; void *ip; /* * Having observed our INT3 instruction, we now must observe * bp_patching.nr_entries. * - * nr_entries != 0 INT3 - * WMB RMB - * write INT3 if (nr_entries) + * nr_entries != 0 INT3 + * WMB RMB + * write INT3 if (nr_entries) * * Idem for other elements in bp_patching. */ @@ -978,9 +977,9 @@ int poke_int3_handler(struct pt_regs *regs) return 0; /* - * Discount the sizeof(int3). See text_poke_bp_batch(). + * Discount the INT3. See text_poke_bp_batch(). */ - ip = (void *) regs->ip - sizeof(int3); + ip = (void *) regs->ip - INT3_INSN_SIZE; /* * Skip the binary search if there is a single member in the vector. @@ -997,8 +996,28 @@ int poke_int3_handler(struct pt_regs *regs) return 0; } - /* set up the specified breakpoint detour */ - regs->ip = (unsigned long) tp->detour; + ip += tp->len; + + switch (tp->opcode) { + case INT3_INSN_OPCODE: + /* + * Someone poked an explicit INT3, they'll want to handle it, + * do not consume. + */ + return 0; + + case CALL_INSN_OPCODE: + int3_emulate_call(regs, (long)ip + tp->rel32); + break; + + case JMP32_INSN_OPCODE: + case JMP8_INSN_OPCODE: + int3_emulate_jmp(regs, (long)ip + tp->rel32); + break; + + default: + BUG(); + } return 1; } @@ -1014,7 +1033,7 @@ NOKPROBE_SYMBOL(poke_int3_handler); * synchronization using int3 breakpoint. * * The way it is done: - * - For each entry in the vector: + * - For each entry in the vector: * - add a int3 trap to the address that will be patched * - sync cores * - For each entry in the vector: @@ -1027,9 +1046,9 @@ NOKPROBE_SYMBOL(poke_int3_handler); */ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) { - int patched_all_but_first = 0; - unsigned char int3 = 0xcc; + unsigned char int3 = INT3_INSN_OPCODE; unsigned int i; + int do_sync; lockdep_assert_held(&text_mutex); @@ -1053,16 +1072,16 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) /* * Second step: update all but the first byte of the patched range. */ - for (i = 0; i < nr_entries; i++) { + for (do_sync = 0, i = 0; i < nr_entries; i++) { if (tp[i].len - sizeof(int3) > 0) { text_poke((char *)tp[i].addr + sizeof(int3), - (const char *)tp[i].opcode + sizeof(int3), + (const char *)tp[i].text + sizeof(int3), tp[i].len - sizeof(int3)); - patched_all_but_first++; + do_sync++; } } - if (patched_all_but_first) { + if (do_sync) { /* * According to Intel, this core syncing is very likely * not necessary and we'd be safe even without it. But @@ -1075,10 +1094,17 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) * Third step: replace the first byte (int3) by the first byte of * replacing opcode. */ - for (i = 0; i < nr_entries; i++) - text_poke(tp[i].addr, tp[i].opcode, sizeof(int3)); + for (do_sync = 0, i = 0; i < nr_entries; i++) { + if (tp[i].text[0] == INT3_INSN_OPCODE) + continue; + + text_poke(tp[i].addr, tp[i].text, sizeof(int3)); + do_sync++; + } + + if (do_sync) + on_each_cpu(do_sync_core, NULL, 1); - on_each_cpu(do_sync_core, NULL, 1); /* * sync_core() implies an smp_mb() and orders this store against * the writing of the new instruction. @@ -1087,6 +1113,60 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) bp_patching.nr_entries = 0; } +void text_poke_loc_init(struct text_poke_loc *tp, void *addr, + const void *opcode, size_t len, const void *emulate) +{ + struct insn insn; + + if (!opcode) + opcode = (void *)tp->text; + else + memcpy((void *)tp->text, opcode, len); + + if (!emulate) + emulate = opcode; + + kernel_insn_init(&insn, emulate, MAX_INSN_SIZE); + insn_get_length(&insn); + + BUG_ON(!insn_complete(&insn)); + BUG_ON(len != insn.length); + + tp->addr = addr; + tp->len = len; + tp->opcode = insn.opcode.bytes[0]; + + switch (tp->opcode) { + case INT3_INSN_OPCODE: + break; + + case CALL_INSN_OPCODE: + case JMP32_INSN_OPCODE: + case JMP8_INSN_OPCODE: + tp->rel32 = insn.immediate.value; + break; + + default: /* assume NOP */ + switch (len) { + case 2: /* NOP2 -- emulate as JMP8+0 */ + BUG_ON(memcmp(emulate, ideal_nops[len], len)); + tp->opcode = JMP8_INSN_OPCODE; + tp->rel32 = 0; + break; + + case 5: /* NOP5 -- emulate as JMP32+0 */ + BUG_ON(memcmp(emulate, ideal_nops[NOP_ATOMIC5], len)); + tp->opcode = JMP32_INSN_OPCODE; + tp->rel32 = 0; + break; + + default: /* unknown instruction */ + BUG(); + } + break; + } +} + /** * text_poke_bp() -- update instructions on live kernel on SMP * @addr: address to patch @@ -1098,20 +1178,10 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) * dynamically allocated memory. This function should be used when it is * not possible to allocate memory. */ -void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) +void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate) { - struct text_poke_loc tp = { - .detour = handler, - .addr = addr, - .len = len, - }; - - if (len > POKE_MAX_OPCODE_SIZE) { - WARN_ONCE(1, "len is larger than %d\n", POKE_MAX_OPCODE_SIZE); - return; - } - - memcpy((void *)tp.opcode, opcode, len); + struct text_poke_loc tp; + text_poke_loc_init(&tp, addr, opcode, len, emulate); text_poke_bp_batch(&tp, 1); } diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index a6ac3712db8b..4bbccb9d16dc 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -510,10 +510,9 @@ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) iommu_size -= round_up(a, PMD_PAGE_SIZE) - a; if (iommu_size < 64*1024*1024) { - pr_warning( - "PCI-DMA: Warning: Small IOMMU %luMB." + pr_warn("PCI-DMA: Warning: Small IOMMU %luMB." " Consider increasing the AGP aperture in BIOS\n", - iommu_size >> 20); + iommu_size >> 20); } return iommu_size; @@ -665,8 +664,7 @@ static __init int init_amd_gatt(struct agp_kern_info *info) nommu: /* Should not happen anymore */ - pr_warning("PCI-DMA: More than 4GB of RAM and no IOMMU\n" - "falling back to iommu=soft.\n"); + pr_warn("PCI-DMA: More than 4GB of RAM and no IOMMU - falling back to iommu=soft.\n"); return -1; } @@ -733,8 +731,8 @@ int __init gart_iommu_init(void) !gart_iommu_aperture || (no_agp && init_amd_gatt(&info) < 0)) { if (max_pfn > MAX_DMA32_PFN) { - pr_warning("More than 4GB of memory but GART IOMMU not available.\n"); - pr_warning("falling back to iommu=soft.\n"); + pr_warn("More than 4GB of memory but GART IOMMU not available.\n"); + pr_warn("falling back to iommu=soft.\n"); } return 0; } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 9e2dd2b296cd..05feaec5d3d7 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -780,8 +780,8 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc) res = (((u64)deltapm) * mult) >> 22; do_div(res, 1000000); - pr_warning("APIC calibration not consistent " - "with PM-Timer: %ldms instead of 100ms\n",(long)res); + pr_warn("APIC calibration not consistent " + "with PM-Timer: %ldms instead of 100ms\n", (long)res); /* Correct the lapic counter value */ res = (((u64)(*delta)) * pm_100ms); @@ -977,7 +977,7 @@ static int __init calibrate_APIC_clock(void) */ if (lapic_timer_period < (1000000 / HZ)) { local_irq_enable(); - pr_warning("APIC frequency too slow, disabling apic timer\n"); + pr_warn("APIC frequency too slow, disabling apic timer\n"); return -1; } @@ -1021,7 +1021,7 @@ static int __init calibrate_APIC_clock(void) local_irq_enable(); if (levt->features & CLOCK_EVT_FEAT_DUMMY) { - pr_warning("APIC timer disabled due to verification failure\n"); + pr_warn("APIC timer disabled due to verification failure\n"); return -1; } @@ -1095,8 +1095,8 @@ static void local_apic_timer_interrupt(void) * spurious. */ if (!evt->event_handler) { - pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", - smp_processor_id()); + pr_warn("Spurious LAPIC timer interrupt on cpu %d\n", + smp_processor_id()); /* Switch it off */ lapic_timer_shutdown(evt); return; @@ -1586,9 +1586,6 @@ static void setup_local_APIC(void) { int cpu = smp_processor_id(); unsigned int value; -#ifdef CONFIG_X86_32 - int logical_apicid, ldr_apicid; -#endif if (disable_apic) { disable_ioapic_support(); @@ -1626,16 +1623,21 @@ static void setup_local_APIC(void) apic->init_apic_ldr(); #ifdef CONFIG_X86_32 - /* - * APIC LDR is initialized. If logical_apicid mapping was - * initialized during get_smp_config(), make sure it matches the - * actual value. - */ - logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); - ldr_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); - WARN_ON(logical_apicid != BAD_APICID && logical_apicid != ldr_apicid); - /* always use the value from LDR */ - early_per_cpu(x86_cpu_to_logical_apicid, cpu) = ldr_apicid; + if (apic->dest_logical) { + int logical_apicid, ldr_apicid; + + /* + * APIC LDR is initialized. If logical_apicid mapping was + * initialized during get_smp_config(), make sure it matches + * the actual value. + */ + logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); + ldr_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); + if (logical_apicid != BAD_APICID) + WARN_ON(logical_apicid != ldr_apicid); + /* Always use the value from LDR. */ + early_per_cpu(x86_cpu_to_logical_apicid, cpu) = ldr_apicid; + } #endif /* @@ -1809,11 +1811,11 @@ static int __init setup_nox2apic(char *str) int apicid = native_apic_msr_read(APIC_ID); if (apicid >= 255) { - pr_warning("Apicid: %08x, cannot enforce nox2apic\n", - apicid); + pr_warn("Apicid: %08x, cannot enforce nox2apic\n", + apicid); return 0; } - pr_warning("x2apic already enabled.\n"); + pr_warn("x2apic already enabled.\n"); __x2apic_disable(); } setup_clear_cpu_cap(X86_FEATURE_X2APIC); @@ -1981,7 +1983,7 @@ static int __init apic_verify(void) */ features = cpuid_edx(1); if (!(features & (1 << X86_FEATURE_APIC))) { - pr_warning("Could not enable APIC!\n"); + pr_warn("Could not enable APIC!\n"); return -1; } set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); @@ -2408,9 +2410,8 @@ int generic_processor_info(int apicid, int version) disabled_cpu_apicid == apicid) { int thiscpu = num_processors + disabled_cpus; - pr_warning("APIC: Disabling requested cpu." - " Processor %d/0x%x ignored.\n", - thiscpu, apicid); + pr_warn("APIC: Disabling requested cpu." + " Processor %d/0x%x ignored.\n", thiscpu, apicid); disabled_cpus++; return -ENODEV; @@ -2424,8 +2425,7 @@ int generic_processor_info(int apicid, int version) apicid != boot_cpu_physical_apicid) { int thiscpu = max + disabled_cpus - 1; - pr_warning( - "APIC: NR_CPUS/possible_cpus limit of %i almost" + pr_warn("APIC: NR_CPUS/possible_cpus limit of %i almost" " reached. Keeping one slot for boot cpu." " Processor %d/0x%x ignored.\n", max, thiscpu, apicid); @@ -2436,9 +2436,8 @@ int generic_processor_info(int apicid, int version) if (num_processors >= nr_cpu_ids) { int thiscpu = max + disabled_cpus; - pr_warning("APIC: NR_CPUS/possible_cpus limit of %i " - "reached. Processor %d/0x%x ignored.\n", - max, thiscpu, apicid); + pr_warn("APIC: NR_CPUS/possible_cpus limit of %i reached. " + "Processor %d/0x%x ignored.\n", max, thiscpu, apicid); disabled_cpus++; return -EINVAL; @@ -2468,13 +2467,13 @@ int generic_processor_info(int apicid, int version) * Validate version */ if (version == 0x0) { - pr_warning("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n", - cpu, apicid); + pr_warn("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n", + cpu, apicid); version = 0x10; } if (version != boot_cpu_apic_version) { - pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n", + pr_warn("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n", boot_cpu_apic_version, cpu, version); } @@ -2843,7 +2842,7 @@ static int __init apic_set_verbosity(char *arg) apic_verbosity = APIC_VERBOSE; #ifdef CONFIG_X86_64 else { - pr_warning("APIC Verbosity level %s not recognised" + pr_warn("APIC Verbosity level %s not recognised" " use apic=verbose or apic=debug\n", arg); return -EINVAL; } diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 45e92cba92f5..b0889c48a2ac 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -156,7 +156,8 @@ static int x2apic_dead_cpu(unsigned int dead_cpu) { struct cluster_mask *cmsk = per_cpu(cluster_masks, dead_cpu); - cpumask_clear_cpu(dead_cpu, &cmsk->mask); + if (cmsk) + cpumask_clear_cpu(dead_cpu, &cmsk->mask); free_cpumask_var(per_cpu(ipi_mask, dead_cpu)); return 0; } diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index d7a1e5a9331c..890f60083eca 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -30,7 +30,7 @@ obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o ifdef CONFIG_CPU_SUP_INTEL -obj-y += intel.o intel_pconfig.o +obj-y += intel.o intel_pconfig.o tsx.o obj-$(CONFIG_PM) += intel_epb.o endif obj-$(CONFIG_CPU_SUP_AMD) += amd.o diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 91c2561b905f..4c7b0fa15a19 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -39,6 +39,7 @@ static void __init spectre_v2_select_mitigation(void); static void __init ssb_select_mitigation(void); static void __init l1tf_select_mitigation(void); static void __init mds_select_mitigation(void); +static void __init taa_select_mitigation(void); /* The base value of the SPEC_CTRL MSR that always has to be preserved. */ u64 x86_spec_ctrl_base; @@ -105,6 +106,7 @@ void __init check_bugs(void) ssb_select_mitigation(); l1tf_select_mitigation(); mds_select_mitigation(); + taa_select_mitigation(); arch_smt_update(); @@ -269,6 +271,100 @@ static int __init mds_cmdline(char *str) early_param("mds", mds_cmdline); #undef pr_fmt +#define pr_fmt(fmt) "TAA: " fmt + +/* Default mitigation for TAA-affected CPUs */ +static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW; +static bool taa_nosmt __ro_after_init; + +static const char * const taa_strings[] = { + [TAA_MITIGATION_OFF] = "Vulnerable", + [TAA_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode", + [TAA_MITIGATION_VERW] = "Mitigation: Clear CPU buffers", + [TAA_MITIGATION_TSX_DISABLED] = "Mitigation: TSX disabled", +}; + +static void __init taa_select_mitigation(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has_bug(X86_BUG_TAA)) { + taa_mitigation = TAA_MITIGATION_OFF; + return; + } + + /* TSX previously disabled by tsx=off */ + if (!boot_cpu_has(X86_FEATURE_RTM)) { + taa_mitigation = TAA_MITIGATION_TSX_DISABLED; + goto out; + } + + if (cpu_mitigations_off()) { + taa_mitigation = TAA_MITIGATION_OFF; + return; + } + + /* TAA mitigation is turned off on the cmdline (tsx_async_abort=off) */ + if (taa_mitigation == TAA_MITIGATION_OFF) + goto out; + + if (boot_cpu_has(X86_FEATURE_MD_CLEAR)) + taa_mitigation = TAA_MITIGATION_VERW; + else + taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; + + /* + * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1. + * A microcode update fixes this behavior to clear CPU buffers. It also + * adds support for MSR_IA32_TSX_CTRL which is enumerated by the + * ARCH_CAP_TSX_CTRL_MSR bit. + * + * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode + * update is required. + */ + ia32_cap = x86_read_arch_cap_msr(); + if ( (ia32_cap & ARCH_CAP_MDS_NO) && + !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR)) + taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; + + /* + * TSX is enabled, select alternate mitigation for TAA which is + * the same as MDS. Enable MDS static branch to clear CPU buffers. + * + * For guests that can't determine whether the correct microcode is + * present on host, enable the mitigation for UCODE_NEEDED as well. + */ + static_branch_enable(&mds_user_clear); + + if (taa_nosmt || cpu_mitigations_auto_nosmt()) + cpu_smt_disable(false); + +out: + pr_info("%s\n", taa_strings[taa_mitigation]); +} + +static int __init tsx_async_abort_parse_cmdline(char *str) +{ + if (!boot_cpu_has_bug(X86_BUG_TAA)) + return 0; + + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) { + taa_mitigation = TAA_MITIGATION_OFF; + } else if (!strcmp(str, "full")) { + taa_mitigation = TAA_MITIGATION_VERW; + } else if (!strcmp(str, "full,nosmt")) { + taa_mitigation = TAA_MITIGATION_VERW; + taa_nosmt = true; + } + + return 0; +} +early_param("tsx_async_abort", tsx_async_abort_parse_cmdline); + +#undef pr_fmt #define pr_fmt(fmt) "Spectre V1 : " fmt enum spectre_v1_mitigation { @@ -786,13 +882,10 @@ static void update_mds_branch_idle(void) } #define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" +#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n" void cpu_bugs_smt_update(void) { - /* Enhanced IBRS implies STIBP. No update required. */ - if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) - return; - mutex_lock(&spec_ctrl_mutex); switch (spectre_v2_user) { @@ -819,6 +912,17 @@ void cpu_bugs_smt_update(void) break; } + switch (taa_mitigation) { + case TAA_MITIGATION_VERW: + case TAA_MITIGATION_UCODE_NEEDED: + if (sched_smt_active()) + pr_warn_once(TAA_MSG_SMT); + break; + case TAA_MITIGATION_TSX_DISABLED: + case TAA_MITIGATION_OFF: + break; + } + mutex_unlock(&spec_ctrl_mutex); } @@ -1149,6 +1253,9 @@ void x86_spec_ctrl_setup_ap(void) x86_amd_ssb_disable(); } +bool itlb_multihit_kvm_mitigation; +EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation); + #undef pr_fmt #define pr_fmt(fmt) "L1TF: " fmt @@ -1304,11 +1411,24 @@ static ssize_t l1tf_show_state(char *buf) l1tf_vmx_states[l1tf_vmx_mitigation], sched_smt_active() ? "vulnerable" : "disabled"); } + +static ssize_t itlb_multihit_show_state(char *buf) +{ + if (itlb_multihit_kvm_mitigation) + return sprintf(buf, "KVM: Mitigation: Split huge pages\n"); + else + return sprintf(buf, "KVM: Vulnerable\n"); +} #else static ssize_t l1tf_show_state(char *buf) { return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG); } + +static ssize_t itlb_multihit_show_state(char *buf) +{ + return sprintf(buf, "Processor vulnerable\n"); +} #endif static ssize_t mds_show_state(char *buf) @@ -1328,6 +1448,21 @@ static ssize_t mds_show_state(char *buf) sched_smt_active() ? "vulnerable" : "disabled"); } +static ssize_t tsx_async_abort_show_state(char *buf) +{ + if ((taa_mitigation == TAA_MITIGATION_TSX_DISABLED) || + (taa_mitigation == TAA_MITIGATION_OFF)) + return sprintf(buf, "%s\n", taa_strings[taa_mitigation]); + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + return sprintf(buf, "%s; SMT Host state unknown\n", + taa_strings[taa_mitigation]); + } + + return sprintf(buf, "%s; SMT %s\n", taa_strings[taa_mitigation], + sched_smt_active() ? "vulnerable" : "disabled"); +} + static char *stibp_state(void) { if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) @@ -1398,6 +1533,12 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_MDS: return mds_show_state(buf); + case X86_BUG_TAA: + return tsx_async_abort_show_state(buf); + + case X86_BUG_ITLB_MULTIHIT: + return itlb_multihit_show_state(buf); + default: break; } @@ -1434,4 +1575,14 @@ ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *bu { return cpu_show_common(dev, attr, buf, X86_BUG_MDS); } + +ssize_t cpu_show_tsx_async_abort(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_TAA); +} + +ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT); +} #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 9ae7d1bcd4f4..fffe21945374 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1016,13 +1016,14 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #endif } -#define NO_SPECULATION BIT(0) -#define NO_MELTDOWN BIT(1) -#define NO_SSB BIT(2) -#define NO_L1TF BIT(3) -#define NO_MDS BIT(4) -#define MSBDS_ONLY BIT(5) -#define NO_SWAPGS BIT(6) +#define NO_SPECULATION BIT(0) +#define NO_MELTDOWN BIT(1) +#define NO_SSB BIT(2) +#define NO_L1TF BIT(3) +#define NO_MDS BIT(4) +#define MSBDS_ONLY BIT(5) +#define NO_SWAPGS BIT(6) +#define NO_ITLB_MULTIHIT BIT(7) #define VULNWL(_vendor, _family, _model, _whitelist) \ { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist } @@ -1043,27 +1044,27 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION), /* Intel Family 6 */ - VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION), - VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION), - VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION), - VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION), - VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION), - - VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), + VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), + + VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), VULNWL_INTEL(CORE_YONAH, NO_SSB), - VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS), + VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS), - VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS), - VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS), + VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), /* * Technically, swapgs isn't serializing on AMD (despite it previously @@ -1073,15 +1074,17 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { * good enough for our purposes. */ + VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT), + /* AMD Family 0xf - 0x12 */ - VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), - VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), - VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), - VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), + VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ - VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS), - VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS), + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), {} }; @@ -1092,19 +1095,30 @@ static bool __init cpu_matches(unsigned long which) return m && !!(m->driver_data & which); } -static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) +u64 x86_read_arch_cap_msr(void) { u64 ia32_cap = 0; + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); + + return ia32_cap; +} + +static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) +{ + u64 ia32_cap = x86_read_arch_cap_msr(); + + /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */ + if (!cpu_matches(NO_ITLB_MULTIHIT) && !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) + setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT); + if (cpu_matches(NO_SPECULATION)) return; setup_force_cpu_bug(X86_BUG_SPECTRE_V1); setup_force_cpu_bug(X86_BUG_SPECTRE_V2); - if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES)) - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); - if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) && !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); @@ -1121,6 +1135,21 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) if (!cpu_matches(NO_SWAPGS)) setup_force_cpu_bug(X86_BUG_SWAPGS); + /* + * When the CPU is not mitigated for TAA (TAA_NO=0) set TAA bug when: + * - TSX is supported or + * - TSX_CTRL is present + * + * TSX_CTRL check is needed for cases when TSX could be disabled before + * the kernel boot e.g. kexec. + * TSX_CTRL check alone is not sufficient for cases when the microcode + * update is not present or running as guest that don't get TSX_CTRL. + */ + if (!(ia32_cap & ARCH_CAP_TAA_NO) && + (cpu_has(c, X86_FEATURE_RTM) || + (ia32_cap & ARCH_CAP_TSX_CTRL_MSR))) + setup_force_cpu_bug(X86_BUG_TAA); + if (cpu_matches(NO_MELTDOWN)) return; @@ -1554,6 +1583,8 @@ void __init identify_boot_cpu(void) #endif cpu_detect_tlb(&boot_cpu_data); setup_cr_pinning(); + + tsx_init(); } void identify_secondary_cpu(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index c0e2407abdd6..38ab6e115eac 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -44,6 +44,22 @@ struct _tlb_table { extern const struct cpu_dev *const __x86_cpu_dev_start[], *const __x86_cpu_dev_end[]; +#ifdef CONFIG_CPU_SUP_INTEL +enum tsx_ctrl_states { + TSX_CTRL_ENABLE, + TSX_CTRL_DISABLE, + TSX_CTRL_NOT_SUPPORTED, +}; + +extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state; + +extern void __init tsx_init(void); +extern void tsx_enable(void); +extern void tsx_disable(void); +#else +static inline void tsx_init(void) { } +#endif /* CONFIG_CPU_SUP_INTEL */ + extern void get_cpu_cap(struct cpuinfo_x86 *c); extern void get_cpu_address_sizes(struct cpuinfo_x86 *c); extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); @@ -62,4 +78,6 @@ unsigned int aperfmperf_get_khz(int cpu); extern void x86_spec_ctrl_setup_ap(void); +extern u64 x86_read_arch_cap_msr(void); + #endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index c2fdc00df163..11d5c5950e2d 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -762,6 +762,11 @@ static void init_intel(struct cpuinfo_x86 *c) detect_tme(c); init_intel_misc_features(c); + + if (tsx_ctrl_state == TSX_CTRL_ENABLE) + tsx_enable(); + if (tsx_ctrl_state == TSX_CTRL_DISABLE) + tsx_disable(); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 6ea7fdc82f3c..5167bd2bb6b1 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -583,7 +583,7 @@ bool amd_filter_mce(struct mce *m) * - Prevent possible spurious interrupts from the IF bank on Family 0x17 * Models 0x10-0x2F due to Erratum #1114. */ -void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) +static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) { int i, num_msrs; u64 hwcr; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 743370ee4983..5f42f25bac8f 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -488,8 +488,9 @@ int mce_usable_address(struct mce *m) if (!(m->status & MCI_STATUS_ADDRV)) return 0; - /* Checks after this one are Intel-specific: */ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + /* Checks after this one are Intel/Zhaoxin-specific: */ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL && + boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN) return 1; if (!(m->status & MCI_STATUS_MISCV)) @@ -507,10 +508,13 @@ EXPORT_SYMBOL_GPL(mce_usable_address); bool mce_is_memory_error(struct mce *m) { - if (m->cpuvendor == X86_VENDOR_AMD || - m->cpuvendor == X86_VENDOR_HYGON) { + switch (m->cpuvendor) { + case X86_VENDOR_AMD: + case X86_VENDOR_HYGON: return amd_mce_is_memory_error(m); - } else if (m->cpuvendor == X86_VENDOR_INTEL) { + + case X86_VENDOR_INTEL: + case X86_VENDOR_ZHAOXIN: /* * Intel SDM Volume 3B - 15.9.2 Compound Error Codes * @@ -527,9 +531,10 @@ bool mce_is_memory_error(struct mce *m) return (m->status & 0xef80) == BIT(7) || (m->status & 0xef00) == BIT(8) || (m->status & 0xeffc) == 0xc; - } - return false; + default: + return false; + } } EXPORT_SYMBOL_GPL(mce_is_memory_error); @@ -1127,6 +1132,12 @@ static bool __mc_check_crashing_cpu(int cpu) u64 mcgstatus; mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + + if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) { + if (mcgstatus & MCG_STATUS_LMCES) + return false; + } + if (mcgstatus & MCG_STATUS_RIPV) { mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); return true; @@ -1277,9 +1288,10 @@ void do_machine_check(struct pt_regs *regs, long error_code) /* * Check if this MCE is signaled to only this logical processor, - * on Intel only. + * on Intel, Zhaoxin only. */ - if (m.cpuvendor == X86_VENDOR_INTEL) + if (m.cpuvendor == X86_VENDOR_INTEL || + m.cpuvendor == X86_VENDOR_ZHAOXIN) lmce = m.mcgstatus & MCG_STATUS_LMCES; /* @@ -1697,6 +1709,18 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) if (c->x86 == 6 && c->x86_model == 45) quirk_no_way_out = quirk_sandybridge_ifu; } + + if (c->x86_vendor == X86_VENDOR_ZHAOXIN) { + /* + * All newer Zhaoxin CPUs support MCE broadcasting. Enable + * synchronization with a one second timeout. + */ + if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) { + if (cfg->monarch_timeout < 0) + cfg->monarch_timeout = USEC_PER_SEC; + } + } + if (cfg->monarch_timeout < 0) cfg->monarch_timeout = 0; if (cfg->bootlog != 0) @@ -1760,6 +1784,35 @@ static void mce_centaur_feature_init(struct cpuinfo_x86 *c) } } +static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + + /* + * These CPUs have MCA bank 8 which reports only one error type called + * SVAD (System View Address Decoder). The reporting of that error is + * controlled by IA32_MC8.CTL.0. + * + * If enabled, prefetching on these CPUs will cause SVAD MCE when + * virtual machines start and result in a system panic. Always disable + * bank 8 SVAD error by default. + */ + if ((c->x86 == 7 && c->x86_model == 0x1b) || + (c->x86_model == 0x19 || c->x86_model == 0x1f)) { + if (this_cpu_read(mce_num_banks) > 8) + mce_banks[8].ctl = 0; + } + + intel_init_cmci(); + intel_init_lmce(); + mce_adjust_timer = cmci_intel_adjust_timer; +} + +static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c) +{ + intel_clear_lmce(); +} + static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) { switch (c->x86_vendor) { @@ -1781,6 +1834,10 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) mce_centaur_feature_init(c); break; + case X86_VENDOR_ZHAOXIN: + mce_zhaoxin_feature_init(c); + break; + default: break; } @@ -1792,6 +1849,11 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c) case X86_VENDOR_INTEL: mce_intel_feature_clear(c); break; + + case X86_VENDOR_ZHAOXIN: + mce_zhaoxin_feature_clear(c); + break; + default: break; } @@ -2014,15 +2076,16 @@ static void mce_disable_error_reporting(void) static void vendor_disable_error_reporting(void) { /* - * Don't clear on Intel or AMD or Hygon CPUs. Some of these MSRs - * are socket-wide. - * Disabling them for just a single offlined CPU is bad, since it will - * inhibit reporting for all shared resources on the socket like the - * last level cache (LLC), the integrated memory controller (iMC), etc. + * Don't clear on Intel or AMD or Hygon or Zhaoxin CPUs. Some of these + * MSRs are socket-wide. Disabling them for just a single offlined CPU + * is bad, since it will inhibit reporting for all shared resources on + * the socket like the last level cache (LLC), the integrated memory + * controller (iMC), etc. */ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON || - boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) return; mce_disable_error_reporting(); diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index 88cd9598fa57..e270d0770134 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -85,8 +85,10 @@ static int cmci_supported(int *banks) * initialization is vendor keyed and this * makes sure none of the backdoors are entered otherwise. */ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL && + boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN) return 0; + if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6) return 0; rdmsrl(MSR_IA32_MCG_CAP, cap); @@ -423,7 +425,7 @@ void cmci_disable_bank(int bank) raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); } -static void intel_init_cmci(void) +void intel_init_cmci(void) { int banks; @@ -442,7 +444,7 @@ static void intel_init_cmci(void) cmci_recheck(); } -static void intel_init_lmce(void) +void intel_init_lmce(void) { u64 val; @@ -455,7 +457,7 @@ static void intel_init_lmce(void) wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN); } -static void intel_clear_lmce(void) +void intel_clear_lmce(void) { u64 val; @@ -482,6 +484,7 @@ static void intel_ppin_init(struct cpuinfo_x86 *c) case INTEL_FAM6_BROADWELL_D: case INTEL_FAM6_BROADWELL_X: case INTEL_FAM6_SKYLAKE_X: + case INTEL_FAM6_ICELAKE_X: case INTEL_FAM6_XEON_PHI_KNL: case INTEL_FAM6_XEON_PHI_KNM: diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 43031db429d2..842b273bce31 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -45,11 +45,17 @@ unsigned long cmci_intel_adjust_timer(unsigned long interval); bool mce_intel_cmci_poll(void); void mce_intel_hcpu_update(unsigned long cpu); void cmci_disable_bank(int bank); +void intel_init_cmci(void); +void intel_init_lmce(void); +void intel_clear_lmce(void); #else # define cmci_intel_adjust_timer mce_adjust_timer_default static inline bool mce_intel_cmci_poll(void) { return false; } static inline void mce_intel_hcpu_update(unsigned long cpu) { } static inline void cmci_disable_bank(int bank) { } +static inline void intel_init_cmci(void) { } +static inline void intel_init_lmce(void) { } +static inline void intel_clear_lmce(void) { } #endif void mce_timer_kick(unsigned long interval); diff --git a/arch/x86/kernel/cpu/mce/therm_throt.c b/arch/x86/kernel/cpu/mce/therm_throt.c index 6e2becf547c5..d01e0da0163a 100644 --- a/arch/x86/kernel/cpu/mce/therm_throt.c +++ b/arch/x86/kernel/cpu/mce/therm_throt.c @@ -40,15 +40,58 @@ #define THERMAL_THROTTLING_EVENT 0 #define POWER_LIMIT_EVENT 1 -/* - * Current thermal event state: +/** + * struct _thermal_state - Represent the current thermal event state + * @next_check: Stores the next timestamp, when it is allowed + * to log the next warning message. + * @last_interrupt_time: Stores the timestamp for the last threshold + * high event. + * @therm_work: Delayed workqueue structure + * @count: Stores the current running count for thermal + * or power threshold interrupts. + * @last_count: Stores the previous running count for thermal + * or power threshold interrupts. + * @max_time_ms: This shows the maximum amount of time CPU was + * in throttled state for a single thermal + * threshold high to low state. + * @total_time_ms: This is a cumulative time during which CPU was + * in the throttled state. + * @rate_control_active: Set when a throttling message is logged. + * This is used for the purpose of rate-control. + * @new_event: Stores the last high/low status of the + * THERM_STATUS_PROCHOT or + * THERM_STATUS_POWER_LIMIT. + * @level: Stores whether this _thermal_state instance is + * for a CORE level or for PACKAGE level. + * @sample_index: Index for storing the next sample in the buffer + * temp_samples[]. + * @sample_count: Total number of samples collected in the buffer + * temp_samples[]. + * @average: The last moving average of temperature samples + * @baseline_temp: Temperature at which thermal threshold high + * interrupt was generated. + * @temp_samples: Storage for temperature samples to calculate + * moving average. + * + * This structure is used to represent data related to thermal state for a CPU. + * There is a separate storage for core and package level for each CPU. */ struct _thermal_state { - bool new_event; - int event; u64 next_check; + u64 last_interrupt_time; + struct delayed_work therm_work; unsigned long count; unsigned long last_count; + unsigned long max_time_ms; + unsigned long total_time_ms; + bool rate_control_active; + bool new_event; + u8 level; + u8 sample_index; + u8 sample_count; + u8 average; + u8 baseline_temp; + u8 temp_samples[3]; }; struct thermal_state { @@ -121,8 +164,22 @@ define_therm_throt_device_one_ro(package_throttle_count); define_therm_throt_device_show_func(package_power_limit, count); define_therm_throt_device_one_ro(package_power_limit_count); +define_therm_throt_device_show_func(core_throttle, max_time_ms); +define_therm_throt_device_one_ro(core_throttle_max_time_ms); + +define_therm_throt_device_show_func(package_throttle, max_time_ms); +define_therm_throt_device_one_ro(package_throttle_max_time_ms); + +define_therm_throt_device_show_func(core_throttle, total_time_ms); +define_therm_throt_device_one_ro(core_throttle_total_time_ms); + +define_therm_throt_device_show_func(package_throttle, total_time_ms); +define_therm_throt_device_one_ro(package_throttle_total_time_ms); + static struct attribute *thermal_throttle_attrs[] = { &dev_attr_core_throttle_count.attr, + &dev_attr_core_throttle_max_time_ms.attr, + &dev_attr_core_throttle_total_time_ms.attr, NULL }; @@ -135,6 +192,105 @@ static const struct attribute_group thermal_attr_group = { #define CORE_LEVEL 0 #define PACKAGE_LEVEL 1 +#define THERM_THROT_POLL_INTERVAL HZ +#define THERM_STATUS_PROCHOT_LOG BIT(1) + +static void clear_therm_status_log(int level) +{ + int msr; + u64 msr_val; + + if (level == CORE_LEVEL) + msr = MSR_IA32_THERM_STATUS; + else + msr = MSR_IA32_PACKAGE_THERM_STATUS; + + rdmsrl(msr, msr_val); + wrmsrl(msr, msr_val & ~THERM_STATUS_PROCHOT_LOG); +} + +static void get_therm_status(int level, bool *proc_hot, u8 *temp) +{ + int msr; + u64 msr_val; + + if (level == CORE_LEVEL) + msr = MSR_IA32_THERM_STATUS; + else + msr = MSR_IA32_PACKAGE_THERM_STATUS; + + rdmsrl(msr, msr_val); + if (msr_val & THERM_STATUS_PROCHOT_LOG) + *proc_hot = true; + else + *proc_hot = false; + + *temp = (msr_val >> 16) & 0x7F; +} + +static void throttle_active_work(struct work_struct *work) +{ + struct _thermal_state *state = container_of(to_delayed_work(work), + struct _thermal_state, therm_work); + unsigned int i, avg, this_cpu = smp_processor_id(); + u64 now = get_jiffies_64(); + bool hot; + u8 temp; + + get_therm_status(state->level, &hot, &temp); + /* temperature value is offset from the max so lesser means hotter */ + if (!hot && temp > state->baseline_temp) { + if (state->rate_control_active) + pr_info("CPU%d: %s temperature/speed normal (total events = %lu)\n", + this_cpu, + state->level == CORE_LEVEL ? "Core" : "Package", + state->count); + + state->rate_control_active = false; + return; + } + + if (time_before64(now, state->next_check) && + state->rate_control_active) + goto re_arm; + + state->next_check = now + CHECK_INTERVAL; + + if (state->count != state->last_count) { + /* There was one new thermal interrupt */ + state->last_count = state->count; + state->average = 0; + state->sample_count = 0; + state->sample_index = 0; + } + + state->temp_samples[state->sample_index] = temp; + state->sample_count++; + state->sample_index = (state->sample_index + 1) % ARRAY_SIZE(state->temp_samples); + if (state->sample_count < ARRAY_SIZE(state->temp_samples)) + goto re_arm; + + avg = 0; + for (i = 0; i < ARRAY_SIZE(state->temp_samples); ++i) + avg += state->temp_samples[i]; + + avg /= ARRAY_SIZE(state->temp_samples); + + if (state->average > avg) { + pr_warn("CPU%d: %s temperature is above threshold, cpu clock is throttled (total events = %lu)\n", + this_cpu, + state->level == CORE_LEVEL ? "Core" : "Package", + state->count); + state->rate_control_active = true; + } + + state->average = avg; + +re_arm: + clear_therm_status_log(state->level); + schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL); +} + /*** * therm_throt_process - Process thermal throttling event from interrupt * @curr: Whether the condition is current or not (boolean), since the @@ -178,27 +334,33 @@ static void therm_throt_process(bool new_event, int event, int level) if (new_event) state->count++; - if (time_before64(now, state->next_check) && - state->count != state->last_count) + if (event != THERMAL_THROTTLING_EVENT) return; - state->next_check = now + CHECK_INTERVAL; - state->last_count = state->count; + if (new_event && !state->last_interrupt_time) { + bool hot; + u8 temp; + + get_therm_status(state->level, &hot, &temp); + /* + * Ignore short temperature spike as the system is not close + * to PROCHOT. 10C offset is large enough to ignore. It is + * already dropped from the high threshold temperature. + */ + if (temp > 10) + return; - /* if we just entered the thermal event */ - if (new_event) { - if (event == THERMAL_THROTTLING_EVENT) - pr_crit("CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n", - this_cpu, - level == CORE_LEVEL ? "Core" : "Package", - state->count); - return; - } - if (old_event) { - if (event == THERMAL_THROTTLING_EVENT) - pr_info("CPU%d: %s temperature/speed normal\n", this_cpu, - level == CORE_LEVEL ? "Core" : "Package"); - return; + state->baseline_temp = temp; + state->last_interrupt_time = now; + schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL); + } else if (old_event && state->last_interrupt_time) { + unsigned long throttle_time; + + throttle_time = jiffies_delta_to_msecs(now - state->last_interrupt_time); + if (throttle_time > state->max_time_ms) + state->max_time_ms = throttle_time; + state->total_time_ms += throttle_time; + state->last_interrupt_time = 0; } } @@ -244,20 +406,47 @@ static int thermal_throttle_add_dev(struct device *dev, unsigned int cpu) if (err) return err; - if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) + if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) { err = sysfs_add_file_to_group(&dev->kobj, &dev_attr_core_power_limit_count.attr, thermal_attr_group.name); + if (err) + goto del_group; + } + if (cpu_has(c, X86_FEATURE_PTS)) { err = sysfs_add_file_to_group(&dev->kobj, &dev_attr_package_throttle_count.attr, thermal_attr_group.name); - if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) + if (err) + goto del_group; + + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_package_throttle_max_time_ms.attr, + thermal_attr_group.name); + if (err) + goto del_group; + + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_package_throttle_total_time_ms.attr, + thermal_attr_group.name); + if (err) + goto del_group; + + if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) { err = sysfs_add_file_to_group(&dev->kobj, &dev_attr_package_power_limit_count.attr, thermal_attr_group.name); + if (err) + goto del_group; + } } + return 0; + +del_group: + sysfs_remove_group(&dev->kobj, &thermal_attr_group); + return err; } @@ -269,15 +458,29 @@ static void thermal_throttle_remove_dev(struct device *dev) /* Get notified when a cpu comes on/off. Be hotplug friendly. */ static int thermal_throttle_online(unsigned int cpu) { + struct thermal_state *state = &per_cpu(thermal_state, cpu); struct device *dev = get_cpu_device(cpu); + state->package_throttle.level = PACKAGE_LEVEL; + state->core_throttle.level = CORE_LEVEL; + + INIT_DELAYED_WORK(&state->package_throttle.therm_work, throttle_active_work); + INIT_DELAYED_WORK(&state->core_throttle.therm_work, throttle_active_work); + return thermal_throttle_add_dev(dev, cpu); } static int thermal_throttle_offline(unsigned int cpu) { + struct thermal_state *state = &per_cpu(thermal_state, cpu); struct device *dev = get_cpu_device(cpu); + cancel_delayed_work(&state->package_throttle.therm_work); + cancel_delayed_work(&state->core_throttle.therm_work); + + state->package_throttle.rate_control_active = false; + state->core_throttle.rate_control_active = false; + thermal_throttle_remove_dev(dev); return 0; } diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index a0e52bd00ecc..3f6b137ef4e6 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -567,7 +567,7 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax) void reload_ucode_amd(void) { struct microcode_amd *mc; - u32 rev, dummy; + u32 rev, dummy __always_unused; mc = (struct microcode_amd *)amd_ucode_patch; @@ -673,7 +673,7 @@ static enum ucode_state apply_microcode_amd(int cpu) struct ucode_cpu_info *uci; struct ucode_patch *p; enum ucode_state ret; - u32 rev, dummy; + u32 rev, dummy __always_unused; BUG_ON(raw_smp_processor_id() != cpu); diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index cb0fdcaf1415..7019d4b2df0c 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -63,11 +63,6 @@ LIST_HEAD(microcode_cache); */ static DEFINE_MUTEX(microcode_mutex); -/* - * Serialize late loading so that CPUs get updated one-by-one. - */ -static DEFINE_RAW_SPINLOCK(update_lock); - struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; struct cpu_info_ctx { @@ -566,11 +561,18 @@ static int __reload_late(void *info) if (__wait_for_cpus(&late_cpus_in, NSEC_PER_SEC)) return -1; - raw_spin_lock(&update_lock); - apply_microcode_local(&err); - raw_spin_unlock(&update_lock); + /* + * On an SMT system, it suffices to load the microcode on one sibling of + * the core because the microcode engine is shared between the threads. + * Synchronization still needs to take place so that no concurrent + * loading attempts happen on multiple threads of an SMT core. See + * below. + */ + if (cpumask_first(topology_sibling_cpumask(cpu)) == cpu) + apply_microcode_local(&err); + else + goto wait_for_siblings; - /* siblings return UCODE_OK because their engine got updated already */ if (err > UCODE_NFOUND) { pr_warn("Error reloading microcode on CPU %d\n", cpu); ret = -1; @@ -578,14 +580,18 @@ static int __reload_late(void *info) ret = 1; } +wait_for_siblings: + if (__wait_for_cpus(&late_cpus_out, NSEC_PER_SEC)) + panic("Timeout during microcode update!\n"); + /* - * Increase the wait timeout to a safe value here since we're - * serializing the microcode update and that could take a while on a - * large number of CPUs. And that is fine as the *actual* timeout will - * be determined by the last CPU finished updating and thus cut short. + * At least one thread has completed update on each core. + * For others, simply call the update to make sure the + * per-cpu cpuinfo can be updated with right microcode + * revision. */ - if (__wait_for_cpus(&late_cpus_out, NSEC_PER_SEC * num_online_cpus())) - panic("Timeout during microcode update!\n"); + if (cpumask_first(topology_sibling_cpumask(cpu)) != cpu) + apply_microcode_local(&err); return ret; } diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index ce799cfe9434..6a99535d7f37 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -791,6 +791,7 @@ static enum ucode_state apply_microcode_intel(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; struct cpuinfo_x86 *c = &cpu_data(cpu); + bool bsp = c->cpu_index == boot_cpu_data.cpu_index; struct microcode_intel *mc; enum ucode_state ret; static int prev_rev; @@ -836,7 +837,7 @@ static enum ucode_state apply_microcode_intel(int cpu) return UCODE_ERROR; } - if (rev != prev_rev) { + if (bsp && rev != prev_rev) { pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n", rev, mc->hdr.date & 0xffff, @@ -852,7 +853,7 @@ out: c->microcode = rev; /* Update boot_cpu_data's revision too, if we're on the BSP: */ - if (c->cpu_index == boot_cpu_data.cpu_index) + if (bsp) boot_cpu_data.microcode = rev; return ret; diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 267daad8c036..c656d92cd708 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -216,6 +216,10 @@ static void __init ms_hyperv_init_platform(void) int hv_host_info_ecx; int hv_host_info_edx; +#ifdef CONFIG_PARAVIRT + pv_info.name = "Hyper-V"; +#endif + /* * Extract the features and hints */ diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index efbd54cc4e69..055c8613b531 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -522,6 +522,10 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) int ret = 0; rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + ret = -ENOENT; + goto out; + } md.priv = of->kn->priv; resid = md.u.rid; diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index a46dee8e78db..2e3b06d6bbc6 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -461,10 +461,8 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, } rdtgrp = rdtgroup_kn_lock_live(of->kn); - rdt_last_cmd_clear(); if (!rdtgrp) { ret = -ENOENT; - rdt_last_cmd_puts("Directory was removed\n"); goto unlock; } @@ -2648,10 +2646,8 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, int ret; prdtgrp = rdtgroup_kn_lock_live(prgrp_kn); - rdt_last_cmd_clear(); if (!prdtgrp) { ret = -ENODEV; - rdt_last_cmd_puts("Directory was removed\n"); goto out_unlock; } diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c new file mode 100644 index 000000000000..3e20d322bc98 --- /dev/null +++ b/arch/x86/kernel/cpu/tsx.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Transactional Synchronization Extensions (TSX) control. + * + * Copyright (C) 2019 Intel Corporation + * + * Author: + * Pawan Gupta <pawan.kumar.gupta@linux.intel.com> + */ + +#include <linux/cpufeature.h> + +#include <asm/cmdline.h> + +#include "cpu.h" + +enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED; + +void tsx_disable(void) +{ + u64 tsx; + + rdmsrl(MSR_IA32_TSX_CTRL, tsx); + + /* Force all transactions to immediately abort */ + tsx |= TSX_CTRL_RTM_DISABLE; + + /* + * Ensure TSX support is not enumerated in CPUID. + * This is visible to userspace and will ensure they + * do not waste resources trying TSX transactions that + * will always abort. + */ + tsx |= TSX_CTRL_CPUID_CLEAR; + + wrmsrl(MSR_IA32_TSX_CTRL, tsx); +} + +void tsx_enable(void) +{ + u64 tsx; + + rdmsrl(MSR_IA32_TSX_CTRL, tsx); + + /* Enable the RTM feature in the cpu */ + tsx &= ~TSX_CTRL_RTM_DISABLE; + + /* + * Ensure TSX support is enumerated in CPUID. + * This is visible to userspace and will ensure they + * can enumerate and use the TSX feature. + */ + tsx &= ~TSX_CTRL_CPUID_CLEAR; + + wrmsrl(MSR_IA32_TSX_CTRL, tsx); +} + +static bool __init tsx_ctrl_is_supported(void) +{ + u64 ia32_cap = x86_read_arch_cap_msr(); + + /* + * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this + * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES. + * + * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a + * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES + * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get + * MSR_IA32_TSX_CTRL support even after a microcode update. Thus, + * tsx= cmdline requests will do nothing on CPUs without + * MSR_IA32_TSX_CTRL support. + */ + return !!(ia32_cap & ARCH_CAP_TSX_CTRL_MSR); +} + +static enum tsx_ctrl_states x86_get_tsx_auto_mode(void) +{ + if (boot_cpu_has_bug(X86_BUG_TAA)) + return TSX_CTRL_DISABLE; + + return TSX_CTRL_ENABLE; +} + +void __init tsx_init(void) +{ + char arg[5] = {}; + int ret; + + if (!tsx_ctrl_is_supported()) + return; + + ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg)); + if (ret >= 0) { + if (!strcmp(arg, "on")) { + tsx_ctrl_state = TSX_CTRL_ENABLE; + } else if (!strcmp(arg, "off")) { + tsx_ctrl_state = TSX_CTRL_DISABLE; + } else if (!strcmp(arg, "auto")) { + tsx_ctrl_state = x86_get_tsx_auto_mode(); + } else { + tsx_ctrl_state = TSX_CTRL_DISABLE; + pr_err("tsx: invalid option, defaulting to off\n"); + } + } else { + /* tsx= not provided */ + if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO)) + tsx_ctrl_state = x86_get_tsx_auto_mode(); + else if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF)) + tsx_ctrl_state = TSX_CTRL_DISABLE; + else + tsx_ctrl_state = TSX_CTRL_ENABLE; + } + + if (tsx_ctrl_state == TSX_CTRL_DISABLE) { + tsx_disable(); + + /* + * tsx_disable() will change the state of the + * RTM CPUID bit. Clear it here since it is now + * expected to be not set. + */ + setup_clear_cpu_cap(X86_FEATURE_RTM); + } else if (tsx_ctrl_state == TSX_CTRL_ENABLE) { + + /* + * HW defaults TSX to be enabled at bootup. + * We may still need the TSX enable support + * during init for special cases like + * kexec after TSX is disabled. + */ + tsx_enable(); + + /* + * tsx_enable() will change the state of the + * RTM CPUID bit. Force it here since it is now + * expected to be set. + */ + setup_force_cpu_cap(X86_FEATURE_RTM); + } +} diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 9735139cfdf8..46d732696c1c 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -49,7 +49,7 @@ #define VMWARE_CMD_VCPU_RESERVED 31 #define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \ - __asm__("inl (%%dx)" : \ + __asm__("inl (%%dx), %%eax" : \ "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \ "a"(VMWARE_HYPERVISOR_MAGIC), \ "c"(VMWARE_CMD_##cmd), \ diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 753b8cfe8b8a..87b97897a881 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -94,6 +94,13 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info) BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); begin = (unsigned long)__this_cpu_read(cea_exception_stacks); + /* + * Handle the case where stack trace is collected _before_ + * cea_exception_stacks had been initialized. + */ + if (!begin) + return false; + end = begin + sizeof(struct cea_exception_stacks); /* Bail if @stack is outside the exception stack area. */ if (stk < begin || stk >= end) diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 6f6b1d04dadf..4cba91ec8049 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -710,6 +710,8 @@ static struct chipset early_qrk[] __initdata = { */ { PCI_VENDOR_ID_INTEL, 0x0f00, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, + { PCI_VENDOR_ID_INTEL, 0x3ec4, + PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, { PCI_VENDOR_ID_BROADCOM, 0x4331, PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset}, {} diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 29ffa495bd1c..206a4b6144c2 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -222,13 +222,31 @@ unsigned long __head __startup_64(unsigned long physaddr, * we might write invalid pmds, when the kernel is relocated * cleanup_highmap() fixes this up along with the mappings * beyond _end. + * + * Only the region occupied by the kernel image has so far + * been checked against the table of usable memory regions + * provided by the firmware, so invalidate pages outside that + * region. A page table entry that maps to a reserved area of + * memory would allow processor speculation into that area, + * and on some hardware (particularly the UV platform) even + * speculative access to some reserved areas is caught as an + * error, causing the BIOS to halt the system. */ pmd = fixup_pointer(level2_kernel_pgt, physaddr); - for (i = 0; i < PTRS_PER_PMD; i++) { + + /* invalidate pages before the kernel image */ + for (i = 0; i < pmd_index((unsigned long)_text); i++) + pmd[i] &= ~_PAGE_PRESENT; + + /* fixup pages that are part of the kernel image */ + for (; i <= pmd_index((unsigned long)_end); i++) if (pmd[i] & _PAGE_PRESENT) pmd[i] += load_delta; - } + + /* invalidate pages after the kernel image */ + for (; i < PTRS_PER_PMD; i++) + pmd[i] &= ~_PAGE_PRESENT; /* * Fixup phys_base - remove the memory encryption mask to obtain diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 044053235302..c1a8b9e71408 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -89,8 +89,7 @@ static void __ref __jump_label_transform(struct jump_entry *entry, return; } - text_poke_bp((void *)jump_entry_code(entry), &code, JUMP_LABEL_NOP_SIZE, - (void *)jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE); + text_poke_bp((void *)jump_entry_code(entry), &code, JUMP_LABEL_NOP_SIZE, NULL); } void arch_jump_label_transform(struct jump_entry *entry, @@ -147,11 +146,9 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry, } __jump_label_set_jump_code(entry, type, - (union jump_code_union *) &tp->opcode, 0); + (union jump_code_union *)&tp->text, 0); - tp->addr = entry_code; - tp->detour = entry_code + JUMP_LABEL_NOP_SIZE; - tp->len = JUMP_LABEL_NOP_SIZE; + text_poke_loc_init(tp, entry_code, NULL, JUMP_LABEL_NOP_SIZE, NULL); tp_vec_nr++; diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index b348dd506d58..8900329c28a7 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -437,8 +437,7 @@ void arch_optimize_kprobes(struct list_head *oplist) insn_buff[0] = RELATIVEJUMP_OPCODE; *(s32 *)(&insn_buff[1]) = rel; - text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE, - op->optinsn.insn); + text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE, NULL); list_del_init(&op->list); } @@ -448,12 +447,18 @@ void arch_optimize_kprobes(struct list_head *oplist) void arch_unoptimize_kprobe(struct optimized_kprobe *op) { u8 insn_buff[RELATIVEJUMP_SIZE]; + u8 emulate_buff[RELATIVEJUMP_SIZE]; /* Set int3 to first byte for kprobes */ insn_buff[0] = BREAKPOINT_INSTRUCTION; memcpy(insn_buff + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); + + emulate_buff[0] = RELATIVEJUMP_OPCODE; + *(s32 *)(&emulate_buff[1]) = (s32)((long)op->optinsn.insn - + ((long)op->kp.addr + RELATIVEJUMP_SIZE)); + text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE, - op->optinsn.insn); + emulate_buff); } /* diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index e820568ed4d5..32ef1ee733b7 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -33,6 +33,7 @@ #include <asm/apicdef.h> #include <asm/hypervisor.h> #include <asm/tlb.h> +#include <asm/cpuidle_haltpoll.h> static int kvmapf = 1; diff --git a/arch/x86/kernel/process.h b/arch/x86/kernel/process.h index 320ab978fb1f..1d0797b2338a 100644 --- a/arch/x86/kernel/process.h +++ b/arch/x86/kernel/process.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // // Code shared between 32 and 64 bit diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 86663874ef04..e6d7894ad127 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -207,8 +207,8 @@ void __init setup_per_cpu_areas(void) pcpu_cpu_distance, pcpu_fc_alloc, pcpu_fc_free); if (rc < 0) - pr_warning("%s allocator failed (%d), falling back to page size\n", - pcpu_fc_names[pcpu_chosen_fc], rc); + pr_warn("%s allocator failed (%d), falling back to page size\n", + pcpu_fc_names[pcpu_chosen_fc], rc); } if (rc < 0) rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index a49fe1dcb47e..4c61f0713832 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -57,7 +57,7 @@ void __init tboot_probe(void) */ if (!e820__mapped_any(boot_params.tboot_addr, boot_params.tboot_addr, E820_TYPE_RESERVED)) { - pr_warning("non-0 tboot_addr but it is not of type E820_TYPE_RESERVED\n"); + pr_warn("non-0 tboot_addr but it is not of type E820_TYPE_RESERVED\n"); return; } @@ -65,13 +65,12 @@ void __init tboot_probe(void) set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr); tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE); if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) { - pr_warning("tboot at 0x%llx is invalid\n", - boot_params.tboot_addr); + pr_warn("tboot at 0x%llx is invalid\n", boot_params.tboot_addr); tboot = NULL; return; } if (tboot->version < 5) { - pr_warning("tboot version is invalid: %u\n", tboot->version); + pr_warn("tboot version is invalid: %u\n", tboot->version); tboot = NULL; return; } @@ -289,7 +288,7 @@ static int tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control) if (sleep_state >= ACPI_S_STATE_COUNT || acpi_shutdown_map[sleep_state] == -1) { - pr_warning("unsupported sleep state 0x%x\n", sleep_state); + pr_warn("unsupported sleep state 0x%x\n", sleep_state); return -1; } @@ -302,7 +301,7 @@ static int tboot_extended_sleep(u8 sleep_state, u32 val_a, u32 val_b) if (!tboot_enabled()) return 0; - pr_warning("tboot is not able to suspend on platforms with reduced hardware sleep (ACPIv5)"); + pr_warn("tboot is not able to suspend on platforms with reduced hardware sleep (ACPIv5)"); return -ENODEV; } @@ -320,7 +319,7 @@ static int tboot_wait_for_aps(int num_aps) } if (timeout) - pr_warning("tboot wait for APs timeout\n"); + pr_warn("tboot wait for APs timeout\n"); return !(atomic_read((atomic_t *)&tboot->num_in_wfs) == num_aps); } @@ -516,7 +515,7 @@ int tboot_force_iommu(void) return 1; if (no_iommu || swiotlb || dmar_disabled) - pr_warning("Forcing Intel-IOMMU to enabled\n"); + pr_warn("Forcing Intel-IOMMU to enabled\n"); dmar_disabled = 0; #ifdef CONFIG_SWIOTLB diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index c59454c382fd..7e322e2daaf5 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1505,6 +1505,9 @@ void __init tsc_init(void) return; } + if (tsc_clocksource_reliable || no_tsc_watchdog) + clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY; + clocksource_register_khz(&clocksource_tsc_early, tsc_khz); detect_art(); } diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index ec534f978867..b8acf639abd1 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -364,12 +364,12 @@ retry: /* Force it to 0 if random warps brought us here */ atomic_set(&test_runs, 0); - pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n", + pr_warn("TSC synchronization [CPU#%d -> CPU#%d]:\n", smp_processor_id(), cpu); - pr_warning("Measured %Ld cycles TSC warp between CPUs, " - "turning off TSC clock.\n", max_warp); + pr_warn("Measured %Ld cycles TSC warp between CPUs, " + "turning off TSC clock.\n", max_warp); if (random_warps) - pr_warning("TSC warped randomly between CPUs\n"); + pr_warn("TSC warped randomly between CPUs\n"); mark_tsc_unstable("check_tsc_sync_source failed"); } diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index 548fefed71ee..b4a304893189 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -91,7 +91,7 @@ const char * const umip_insns[5] = { #define umip_pr_err(regs, fmt, ...) \ umip_printk(regs, KERN_ERR, fmt, ##__VA_ARGS__) -#define umip_pr_warning(regs, fmt, ...) \ +#define umip_pr_warn(regs, fmt, ...) \ umip_printk(regs, KERN_WARNING, fmt, ##__VA_ARGS__) /** @@ -380,14 +380,14 @@ bool fixup_umip_exception(struct pt_regs *regs) if (umip_inst < 0) return false; - umip_pr_warning(regs, "%s instruction cannot be used by applications.\n", + umip_pr_warn(regs, "%s instruction cannot be used by applications.\n", umip_insns[umip_inst]); /* Do not emulate (spoof) SLDT or STR. */ if (umip_inst == UMIP_INST_STR || umip_inst == UMIP_INST_SLDT) return false; - umip_pr_warning(regs, "For now, expensive software emulation returns the result.\n"); + umip_pr_warn(regs, "For now, expensive software emulation returns the result.\n"); if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size, user_64bit_mode(regs))) diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 31ecf7a76d5a..b19ef421084d 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -8,9 +8,9 @@ kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o -kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ +kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ - hyperv.o page_track.o debugfs.o + hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o vmx/evmcs.o vmx/nested.o kvm-amd-y += svm.o pmu_amd.o diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 63316036f85a..c0aa07487eb8 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -363,7 +363,7 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index) /* cpuid 7.0.ecx*/ const u32 kvm_cpuid_7_0_ecx_x86_features = - F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | + F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) | F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/; @@ -485,6 +485,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 0x80000008.ebx */ const u32 kvm_cpuid_8000_0008_ebx_x86_features = + F(CLZERO) | F(XSAVEERPTR) | F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON); @@ -618,16 +619,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, */ case 0x1f: case 0xb: { - int i, level_type; + int i; - /* read more entries until level_type is zero */ - for (i = 1; ; ++i) { + /* + * We filled in entry[0] for CPUID(EAX=<function>, + * ECX=00H) above. If its level type (ECX[15:8]) is + * zero, then the leaf is unimplemented, and we're + * done. Otherwise, continue to populate entries + * until the level type (ECX[15:8]) of the previously + * added entry is zero. + */ + for (i = 1; entry[i - 1].ecx & 0xff00; ++i) { if (*nent >= maxnent) goto out; - level_type = entry[i - 1].ecx & 0xff00; - if (!level_type) - break; do_host_cpuid(&entry[i], function, i); ++*nent; } @@ -811,8 +816,6 @@ static int do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 func, return __do_cpuid_func(entry, func, nent, maxnent); } -#undef F - struct kvm_cpuid_param { u32 func; bool (*qualifier)(const struct kvm_cpuid_param *param); @@ -969,53 +972,72 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); /* - * If no match is found, check whether we exceed the vCPU's limit - * and return the content of the highest valid _standard_ leaf instead. - * This is to satisfy the CPUID specification. + * If the basic or extended CPUID leaf requested is higher than the + * maximum supported basic or extended leaf, respectively, then it is + * out of range. */ -static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu, - u32 function, u32 index) +static bool cpuid_function_in_range(struct kvm_vcpu *vcpu, u32 function) { - struct kvm_cpuid_entry2 *maxlevel; - - maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); - if (!maxlevel || maxlevel->eax >= function) - return NULL; - if (function & 0x80000000) { - maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0); - if (!maxlevel) - return NULL; - } - return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index); + struct kvm_cpuid_entry2 *max; + + max = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); + return max && function <= max->eax; } bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit) { u32 function = *eax, index = *ecx; - struct kvm_cpuid_entry2 *best; - bool entry_found = true; - - best = kvm_find_cpuid_entry(vcpu, function, index); - - if (!best) { - entry_found = false; - if (!check_limit) - goto out; + struct kvm_cpuid_entry2 *entry; + struct kvm_cpuid_entry2 *max; + bool found; - best = check_cpuid_limit(vcpu, function, index); + entry = kvm_find_cpuid_entry(vcpu, function, index); + found = entry; + /* + * Intel CPUID semantics treats any query for an out-of-range + * leaf as if the highest basic leaf (i.e. CPUID.0H:EAX) were + * requested. AMD CPUID semantics returns all zeroes for any + * undefined leaf, whether or not the leaf is in range. + */ + if (!entry && check_limit && !guest_cpuid_is_amd(vcpu) && + !cpuid_function_in_range(vcpu, function)) { + max = kvm_find_cpuid_entry(vcpu, 0, 0); + if (max) { + function = max->eax; + entry = kvm_find_cpuid_entry(vcpu, function, index); + } } - -out: - if (best) { - *eax = best->eax; - *ebx = best->ebx; - *ecx = best->ecx; - *edx = best->edx; - } else + if (entry) { + *eax = entry->eax; + *ebx = entry->ebx; + *ecx = entry->ecx; + *edx = entry->edx; + if (function == 7 && index == 0) { + u64 data; + if (!__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) && + (data & TSX_CTRL_CPUID_CLEAR)) + *ebx &= ~(F(RTM) | F(HLE)); + } + } else { *eax = *ebx = *ecx = *edx = 0; - trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, entry_found); - return entry_found; + /* + * When leaf 0BH or 1FH is defined, CL is pass-through + * and EDX is always the x2APIC ID, even for undefined + * subleaves. Index 1 will exist iff the leaf is + * implemented, so we pass through CL iff leaf 1 + * exists. EDX can be copied from any existing index. + */ + if (function == 0xb || function == 0x1f) { + entry = kvm_find_cpuid_entry(vcpu, function, 1); + if (entry) { + *ecx = index & 0xff; + *edx = entry->edx; + } + } + } + trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, found); + return found; } EXPORT_SYMBOL_GPL(kvm_cpuid); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 698efb8c3897..952d1a4f4d7e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2770,11 +2770,10 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) return emulate_ud(ctxt); ops->get_msr(ctxt, MSR_EFER, &efer); - setup_syscalls_segments(ctxt, &cs, &ss); - if (!(efer & EFER_SCE)) return emulate_ud(ctxt); + setup_syscalls_segments(ctxt, &cs, &ss); ops->get_msr(ctxt, MSR_STAR, &msr_data); msr_data >>= 32; cs_sel = (u16)(msr_data & 0xfffc); @@ -2838,12 +2837,11 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) if (ctxt->mode == X86EMUL_MODE_PROT64) return X86EMUL_UNHANDLEABLE; - setup_syscalls_segments(ctxt, &cs, &ss); - ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data); if ((msr_data & 0xfffc) == 0x0) return emulate_gp(ctxt, 0); + setup_syscalls_segments(ctxt, &cs, &ss); ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF); cs_sel = (u16)msr_data & ~SEGMENT_RPL_MASK; ss_sel = cs_sel + 8; diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index d859ae8890d0..9fd2dd89a1c5 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -271,8 +271,9 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) { unsigned index; bool mask_before, mask_after; - int old_remote_irr, old_delivery_status; union kvm_ioapic_redirect_entry *e; + unsigned long vcpu_bitmap; + int old_remote_irr, old_delivery_status, old_dest_id, old_dest_mode; switch (ioapic->ioregsel) { case IOAPIC_REG_VERSION: @@ -296,6 +297,8 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) /* Preserve read-only fields */ old_remote_irr = e->fields.remote_irr; old_delivery_status = e->fields.delivery_status; + old_dest_id = e->fields.dest_id; + old_dest_mode = e->fields.dest_mode; if (ioapic->ioregsel & 1) { e->bits &= 0xffffffff; e->bits |= (u64) val << 32; @@ -321,7 +324,34 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG && ioapic->irr & (1 << index)) ioapic_service(ioapic, index, false); - kvm_make_scan_ioapic_request(ioapic->kvm); + if (e->fields.delivery_mode == APIC_DM_FIXED) { + struct kvm_lapic_irq irq; + + irq.shorthand = 0; + irq.vector = e->fields.vector; + irq.delivery_mode = e->fields.delivery_mode << 8; + irq.dest_id = e->fields.dest_id; + irq.dest_mode = e->fields.dest_mode; + bitmap_zero(&vcpu_bitmap, 16); + kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq, + &vcpu_bitmap); + if (old_dest_mode != e->fields.dest_mode || + old_dest_id != e->fields.dest_id) { + /* + * Update vcpu_bitmap with vcpus specified in + * the previous request as well. This is done to + * keep ioapic_handled_vectors synchronized. + */ + irq.dest_id = old_dest_id; + irq.dest_mode = old_dest_mode; + kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq, + &vcpu_bitmap); + } + kvm_make_scan_ioapic_request_mask(ioapic->kvm, + &vcpu_bitmap); + } else { + kvm_make_scan_ioapic_request(ioapic->kvm); + } break; } } diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 1cc6c47dc77e..58767020de41 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -37,22 +37,50 @@ BUILD_KVM_GPR_ACCESSORS(r14, R14) BUILD_KVM_GPR_ACCESSORS(r15, R15) #endif -static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, - enum kvm_reg reg) +static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu, + enum kvm_reg reg) { - if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail)) + return test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); +} + +static inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu, + enum kvm_reg reg) +{ + return test_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); +} + +static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu, + enum kvm_reg reg) +{ + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); +} + +static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu, + enum kvm_reg reg) +{ + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); +} + +static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, int reg) +{ + if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS)) + return 0; + + if (!kvm_register_is_available(vcpu, reg)) kvm_x86_ops->cache_reg(vcpu, reg); return vcpu->arch.regs[reg]; } -static inline void kvm_register_write(struct kvm_vcpu *vcpu, - enum kvm_reg reg, +static inline void kvm_register_write(struct kvm_vcpu *vcpu, int reg, unsigned long val) { + if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS)) + return; + vcpu->arch.regs[reg] = val; - __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); - __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); + kvm_register_mark_dirty(vcpu, reg); } static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu) @@ -79,9 +107,8 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) { might_sleep(); /* on svm */ - if (!test_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_avail)) - kvm_x86_ops->cache_reg(vcpu, (enum kvm_reg)VCPU_EXREG_PDPTR); + if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR)) + kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); return vcpu->arch.walk_mmu->pdptrs[index]; } @@ -109,8 +136,8 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu) { - if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) - kvm_x86_ops->decache_cr3(vcpu); + if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) + kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_CR3); return vcpu->arch.cr3; } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3a3a6854dcca..cf9177b4a07f 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -66,9 +66,10 @@ #define X2APIC_BROADCAST 0xFFFFFFFFul static bool lapic_timer_advance_dynamic __read_mostly; -#define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100 -#define LAPIC_TIMER_ADVANCE_ADJUST_MAX 5000 -#define LAPIC_TIMER_ADVANCE_ADJUST_INIT 1000 +#define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100 /* clock cycles */ +#define LAPIC_TIMER_ADVANCE_ADJUST_MAX 10000 /* clock cycles */ +#define LAPIC_TIMER_ADVANCE_NS_INIT 1000 +#define LAPIC_TIMER_ADVANCE_NS_MAX 5000 /* step-by-step approximation to mitigate fluctuation */ #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 @@ -110,11 +111,6 @@ static inline int apic_enabled(struct kvm_lapic *apic) (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) -static inline u8 kvm_xapic_id(struct kvm_lapic *apic) -{ - return kvm_lapic_get_reg(apic, APIC_ID) >> 24; -} - static inline u32 kvm_x2apic_id(struct kvm_lapic *apic) { return apic->vcpu->vcpu_id; @@ -561,60 +557,53 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, irq->level, irq->trig_mode, dest_map); } +static int __pv_send_ipi(unsigned long *ipi_bitmap, struct kvm_apic_map *map, + struct kvm_lapic_irq *irq, u32 min) +{ + int i, count = 0; + struct kvm_vcpu *vcpu; + + if (min > map->max_apic_id) + return 0; + + for_each_set_bit(i, ipi_bitmap, + min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) { + if (map->phys_map[min + i]) { + vcpu = map->phys_map[min + i]->vcpu; + count += kvm_apic_set_irq(vcpu, irq, NULL); + } + } + + return count; +} + int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, unsigned long ipi_bitmap_high, u32 min, unsigned long icr, int op_64_bit) { - int i; struct kvm_apic_map *map; - struct kvm_vcpu *vcpu; struct kvm_lapic_irq irq = {0}; int cluster_size = op_64_bit ? 64 : 32; - int count = 0; + int count; + + if (icr & (APIC_DEST_MASK | APIC_SHORT_MASK)) + return -KVM_EINVAL; irq.vector = icr & APIC_VECTOR_MASK; irq.delivery_mode = icr & APIC_MODE_MASK; irq.level = (icr & APIC_INT_ASSERT) != 0; irq.trig_mode = icr & APIC_INT_LEVELTRIG; - if (icr & APIC_DEST_MASK) - return -KVM_EINVAL; - if (icr & APIC_SHORT_MASK) - return -KVM_EINVAL; - rcu_read_lock(); map = rcu_dereference(kvm->arch.apic_map); - if (unlikely(!map)) { - count = -EOPNOTSUPP; - goto out; - } - - if (min > map->max_apic_id) - goto out; - /* Bits above cluster_size are masked in the caller. */ - for_each_set_bit(i, &ipi_bitmap_low, - min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) { - if (map->phys_map[min + i]) { - vcpu = map->phys_map[min + i]->vcpu; - count += kvm_apic_set_irq(vcpu, &irq, NULL); - } - } - - min += cluster_size; - - if (min > map->max_apic_id) - goto out; - - for_each_set_bit(i, &ipi_bitmap_high, - min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) { - if (map->phys_map[min + i]) { - vcpu = map->phys_map[min + i]->vcpu; - count += kvm_apic_set_irq(vcpu, &irq, NULL); - } + count = -EOPNOTSUPP; + if (likely(map)) { + count = __pv_send_ipi(&ipi_bitmap_low, map, &irq, min); + min += cluster_size; + count += __pv_send_ipi(&ipi_bitmap_high, map, &irq, min); } -out: rcu_read_unlock(); return count; } @@ -1128,6 +1117,50 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, return result; } +/* + * This routine identifies the destination vcpus mask meant to receive the + * IOAPIC interrupts. It either uses kvm_apic_map_get_dest_lapic() to find + * out the destination vcpus array and set the bitmap or it traverses to + * each available vcpu to identify the same. + */ +void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq, + unsigned long *vcpu_bitmap) +{ + struct kvm_lapic **dest_vcpu = NULL; + struct kvm_lapic *src = NULL; + struct kvm_apic_map *map; + struct kvm_vcpu *vcpu; + unsigned long bitmap; + int i, vcpu_idx; + bool ret; + + rcu_read_lock(); + map = rcu_dereference(kvm->arch.apic_map); + + ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dest_vcpu, + &bitmap); + if (ret) { + for_each_set_bit(i, &bitmap, 16) { + if (!dest_vcpu[i]) + continue; + vcpu_idx = dest_vcpu[i]->vcpu->vcpu_idx; + __set_bit(vcpu_idx, vcpu_bitmap); + } + } else { + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!kvm_apic_present(vcpu)) + continue; + if (!kvm_apic_match_dest(vcpu, NULL, + irq->delivery_mode, + irq->dest_id, + irq->dest_mode)) + continue; + __set_bit(i, vcpu_bitmap); + } + } + rcu_read_unlock(); +} + int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) { return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; @@ -1504,8 +1537,8 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu, timer_advance_ns += ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP; } - if (unlikely(timer_advance_ns > LAPIC_TIMER_ADVANCE_ADJUST_MAX)) - timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT; + if (unlikely(timer_advance_ns > LAPIC_TIMER_ADVANCE_NS_MAX)) + timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT; apic->lapic_timer.timer_advance_ns = timer_advance_ns; } @@ -2302,7 +2335,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) HRTIMER_MODE_ABS_HARD); apic->lapic_timer.timer.function = apic_timer_fn; if (timer_advance_ns == -1) { - apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT; + apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT; lapic_timer_advance_dynamic = true; } else { apic->lapic_timer.timer_advance_ns = timer_advance_ns; @@ -2713,7 +2746,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) * KVM_MP_STATE_INIT_RECEIVED state), just eat SIPIs * and leave the INIT pending. */ - if (is_smm(vcpu) || kvm_x86_ops->apic_init_signal_blocked(vcpu)) { + if (kvm_vcpu_latch_init(vcpu)) { WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED); if (test_bit(KVM_APIC_SIPI, &apic->pending_events)) clear_bit(KVM_APIC_SIPI, &apic->pending_events); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 2aad7e226fc0..39925afdfcdc 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -226,6 +226,9 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu); +void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq, + unsigned long *vcpu_bitmap); + bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu); int kvm_vector_to_index(u32 vector, u32 dest_vcpus, @@ -242,4 +245,9 @@ static inline enum lapic_mode kvm_apic_mode(u64 apic_base) return apic_base & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE); } +static inline u8 kvm_xapic_id(struct kvm_lapic *apic) +{ + return kvm_lapic_get_reg(apic, APIC_ID) >> 24; +} + #endif diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 11f8ec89433b..d55674f44a18 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -210,4 +210,8 @@ void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn); int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); + +int kvm_mmu_post_init_vm(struct kvm *kvm); +void kvm_mmu_pre_destroy_vm(struct kvm *kvm); + #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu/mmu.c index 5269aa057dfa..6f92b40d798c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -37,6 +37,7 @@ #include <linux/uaccess.h> #include <linux/hash.h> #include <linux/kern_levels.h> +#include <linux/kthread.h> #include <asm/page.h> #include <asm/pat.h> @@ -47,6 +48,35 @@ #include <asm/kvm_page_track.h> #include "trace.h" +extern bool itlb_multihit_kvm_mitigation; + +static int __read_mostly nx_huge_pages = -1; +#ifdef CONFIG_PREEMPT_RT +/* Recovery can cause latency spikes, disable it for PREEMPT_RT. */ +static uint __read_mostly nx_huge_pages_recovery_ratio = 0; +#else +static uint __read_mostly nx_huge_pages_recovery_ratio = 60; +#endif + +static int set_nx_huge_pages(const char *val, const struct kernel_param *kp); +static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp); + +static struct kernel_param_ops nx_huge_pages_ops = { + .set = set_nx_huge_pages, + .get = param_get_bool, +}; + +static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = { + .set = set_nx_huge_pages_recovery_ratio, + .get = param_get_uint, +}; + +module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644); +__MODULE_PARM_TYPE(nx_huge_pages, "bool"); +module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops, + &nx_huge_pages_recovery_ratio, 0644); +__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint"); + /* * When setting this variable to true it enables Two-Dimensional-Paging * where the hardware walks 2 page tables: @@ -83,7 +113,17 @@ module_param(dbg, bool, 0644); #define PTE_PREFETCH_NUM 8 #define PT_FIRST_AVAIL_BITS_SHIFT 10 -#define PT64_SECOND_AVAIL_BITS_SHIFT 52 +#define PT64_SECOND_AVAIL_BITS_SHIFT 54 + +/* + * The mask used to denote special SPTEs, which can be either MMIO SPTEs or + * Access Tracking SPTEs. + */ +#define SPTE_SPECIAL_MASK (3ULL << 52) +#define SPTE_AD_ENABLED_MASK (0ULL << 52) +#define SPTE_AD_DISABLED_MASK (1ULL << 52) +#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52) +#define SPTE_MMIO_MASK (3ULL << 52) #define PT64_LEVEL_BITS 9 @@ -219,12 +259,11 @@ static u64 __read_mostly shadow_present_mask; static u64 __read_mostly shadow_me_mask; /* - * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value. - * Non-present SPTEs with shadow_acc_track_value set are in place for access - * tracking. + * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK; + * shadow_acc_track_mask is the set of bits to be cleared in non-accessed + * pages. */ static u64 __read_mostly shadow_acc_track_mask; -static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK; /* * The mask/shift to use for saving the original R/X bits when marking the PTE @@ -304,7 +343,7 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask) { BUG_ON((u64)(unsigned)access_mask != access_mask); BUG_ON((mmio_mask & mmio_value) != mmio_value); - shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK; + shadow_mmio_value = mmio_value | SPTE_MMIO_MASK; shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK; shadow_mmio_access_mask = access_mask; } @@ -320,10 +359,32 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) return sp->role.ad_disabled; } +static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) +{ + /* + * When using the EPT page-modification log, the GPAs in the log + * would come from L2 rather than L1. Therefore, we need to rely + * on write protection to record dirty pages. This also bypasses + * PML, since writes now result in a vmexit. + */ + return vcpu->arch.mmu == &vcpu->arch.guest_mmu; +} + static inline bool spte_ad_enabled(u64 spte) { MMU_WARN_ON(is_mmio_spte(spte)); - return !(spte & shadow_acc_track_value); + return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK; +} + +static inline bool spte_ad_need_write_protect(u64 spte) +{ + MMU_WARN_ON(is_mmio_spte(spte)); + return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK; +} + +static bool is_nx_huge_page_enabled(void) +{ + return READ_ONCE(nx_huge_pages); } static inline u64 spte_shadow_accessed_mask(u64 spte) @@ -461,7 +522,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, { BUG_ON(!dirty_mask != !accessed_mask); BUG_ON(!accessed_mask && !acc_track_mask); - BUG_ON(acc_track_mask & shadow_acc_track_value); + BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK); shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; @@ -1164,6 +1225,17 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_disallow_lpage(slot, gfn); } +static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + if (sp->lpage_disallowed) + return; + + ++kvm->stat.nx_lpage_splits; + list_add_tail(&sp->lpage_disallowed_link, + &kvm->arch.lpage_disallowed_mmu_pages); + sp->lpage_disallowed = true; +} + static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { struct kvm_memslots *slots; @@ -1181,6 +1253,13 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_allow_lpage(slot, gfn); } +static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + --kvm->stat.nx_lpage_splits; + sp->lpage_disallowed = false; + list_del(&sp->lpage_disallowed_link); +} + static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level, struct kvm_memory_slot *slot) { @@ -1589,16 +1668,16 @@ static bool spte_clear_dirty(u64 *sptep) rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep); + MMU_WARN_ON(!spte_ad_enabled(spte)); spte &= ~shadow_dirty_mask; - return mmu_spte_update(sptep, spte); } -static bool wrprot_ad_disabled_spte(u64 *sptep) +static bool spte_wrprot_for_clear_dirty(u64 *sptep) { bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT, (unsigned long *)sptep); - if (was_writable) + if (was_writable && !spte_ad_enabled(*sptep)) kvm_set_pfn_dirty(spte_to_pfn(*sptep)); return was_writable; @@ -1617,10 +1696,10 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) bool flush = false; for_each_rmap_spte(rmap_head, &iter, sptep) - if (spte_ad_enabled(*sptep)) - flush |= spte_clear_dirty(sptep); + if (spte_ad_need_write_protect(*sptep)) + flush |= spte_wrprot_for_clear_dirty(sptep); else - flush |= wrprot_ad_disabled_spte(sptep); + flush |= spte_clear_dirty(sptep); return flush; } @@ -1631,6 +1710,11 @@ static bool spte_set_dirty(u64 *sptep) rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep); + /* + * Similar to the !kvm_x86_ops->slot_disable_log_dirty case, + * do not bother adding back write access to pages marked + * SPTE_AD_WRPROT_ONLY_MASK. + */ spte |= shadow_dirty_mask; return mmu_spte_update(sptep, spte); @@ -2622,7 +2706,7 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, shadow_user_mask | shadow_x_mask | shadow_me_mask; if (sp_ad_disabled(sp)) - spte |= shadow_acc_track_value; + spte |= SPTE_AD_DISABLED_MASK; else spte |= shadow_accessed_mask; @@ -2761,6 +2845,9 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, kvm_reload_remote_mmus(kvm); } + if (sp->lpage_disallowed) + unaccount_huge_nx_page(kvm, sp); + sp->role.invalid = 1; return list_unstable; } @@ -2968,7 +3055,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, sp = page_header(__pa(sptep)); if (sp_ad_disabled(sp)) - spte |= shadow_acc_track_value; + spte |= SPTE_AD_DISABLED_MASK; + else if (kvm_vcpu_ad_need_write_protect(vcpu)) + spte |= SPTE_AD_WRPROT_ONLY_MASK; /* * For the EPT case, shadow_present_mask is 0 if hardware @@ -2980,6 +3069,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (!speculative) spte |= spte_shadow_accessed_mask(spte); + if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) && + is_nx_huge_page_enabled()) { + pte_access &= ~ACC_EXEC_MASK; + } + if (pte_access & ACC_EXEC_MASK) spte |= shadow_x_mask; else @@ -3200,9 +3294,32 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) __direct_pte_prefetch(vcpu, sp, sptep); } +static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, + gfn_t gfn, kvm_pfn_t *pfnp, int *levelp) +{ + int level = *levelp; + u64 spte = *it.sptep; + + if (it.level == level && level > PT_PAGE_TABLE_LEVEL && + is_nx_huge_page_enabled() && + is_shadow_present_pte(spte) && + !is_large_pte(spte)) { + /* + * A small SPTE exists for this pfn, but FNAME(fetch) + * and __direct_map would like to create a large PTE + * instead: just force them to go down another level, + * patching back for them into pfn the next 9 bits of + * the address. + */ + u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1); + *pfnp |= gfn & page_mask; + (*levelp)--; + } +} + static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, int map_writable, int level, kvm_pfn_t pfn, - bool prefault) + bool prefault, bool lpage_disallowed) { struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; @@ -3215,6 +3332,12 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, trace_kvm_mmu_spte_requested(gpa, level, pfn); for_each_shadow_entry(vcpu, gpa, it) { + /* + * We cannot overwrite existing page tables with an NX + * large page, as the leaf could be executable. + */ + disallowed_hugepage_adjust(it, gfn, &pfn, &level); + base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); if (it.level == level) break; @@ -3225,6 +3348,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, it.level - 1, true, ACC_ALL); link_shadow_page(vcpu, it.sptep, sp); + if (lpage_disallowed) + account_huge_nx_page(vcpu->kvm, sp); } } @@ -3273,7 +3398,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, * here. */ if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && - level == PT_PAGE_TABLE_LEVEL && + !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && PageTransCompoundMap(pfn_to_page(pfn)) && !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) { unsigned long mask; @@ -3517,11 +3642,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, { int r; int level; - bool force_pt_level = false; + bool force_pt_level; kvm_pfn_t pfn; unsigned long mmu_seq; bool map_writable, write = error_code & PFERR_WRITE_MASK; + bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && + is_nx_huge_page_enabled(); + force_pt_level = lpage_disallowed; level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { /* @@ -3555,7 +3683,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, goto out_unlock; if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); - r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault); + r = __direct_map(vcpu, v, write, map_writable, level, pfn, + prefault, false); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); @@ -4141,6 +4270,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, unsigned long mmu_seq; int write = error_code & PFERR_WRITE_MASK; bool map_writable; + bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && + is_nx_huge_page_enabled(); MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)); @@ -4151,8 +4282,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (r) return r; - force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn, - PT_DIRECTORY_LEVEL); + force_pt_level = + lpage_disallowed || + !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL); level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { if (level > PT_DIRECTORY_LEVEL && @@ -4181,7 +4313,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, goto out_unlock; if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); - r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault); + r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, + prefault, lpage_disallowed); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); @@ -4262,7 +4395,7 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, kvm_make_request(KVM_REQ_LOAD_CR3, vcpu); if (!skip_tlb_flush) { kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); - kvm_x86_ops->tlb_flush(vcpu, true); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } /* @@ -5881,9 +6014,9 @@ restart: * the guest, and the guest page table is using 4K page size * mapping if the indirect sp has level = 1. */ - if (sp->role.direct && - !kvm_is_reserved_pfn(pfn) && - PageTransCompoundMap(pfn_to_page(pfn))) { + if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && + !kvm_is_zone_device_pfn(pfn) && + PageTransCompoundMap(pfn_to_page(pfn))) { pte_list_remove(rmap_head, sptep); if (kvm_available_flush_tlb_with_range()) @@ -6122,10 +6255,59 @@ static void kvm_set_mmio_spte_mask(void) kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK); } +static bool get_nx_auto_mode(void) +{ + /* Return true when CPU has the bug, and mitigations are ON */ + return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off(); +} + +static void __set_nx_huge_pages(bool val) +{ + nx_huge_pages = itlb_multihit_kvm_mitigation = val; +} + +static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) +{ + bool old_val = nx_huge_pages; + bool new_val; + + /* In "auto" mode deploy workaround only if CPU has the bug. */ + if (sysfs_streq(val, "off")) + new_val = 0; + else if (sysfs_streq(val, "force")) + new_val = 1; + else if (sysfs_streq(val, "auto")) + new_val = get_nx_auto_mode(); + else if (strtobool(val, &new_val) < 0) + return -EINVAL; + + __set_nx_huge_pages(new_val); + + if (new_val != old_val) { + struct kvm *kvm; + + mutex_lock(&kvm_lock); + + list_for_each_entry(kvm, &vm_list, vm_list) { + mutex_lock(&kvm->slots_lock); + kvm_mmu_zap_all_fast(kvm); + mutex_unlock(&kvm->slots_lock); + + wake_up_process(kvm->arch.nx_lpage_recovery_thread); + } + mutex_unlock(&kvm_lock); + } + + return 0; +} + int kvm_mmu_module_init(void) { int ret = -ENOMEM; + if (nx_huge_pages == -1) + __set_nx_huge_pages(get_nx_auto_mode()); + /* * MMU roles use union aliasing which is, generally speaking, an * undefined behavior. However, we supposedly know how compilers behave @@ -6205,3 +6387,116 @@ void kvm_mmu_module_exit(void) unregister_shrinker(&mmu_shrinker); mmu_audit_disable(); } + +static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp) +{ + unsigned int old_val; + int err; + + old_val = nx_huge_pages_recovery_ratio; + err = param_set_uint(val, kp); + if (err) + return err; + + if (READ_ONCE(nx_huge_pages) && + !old_val && nx_huge_pages_recovery_ratio) { + struct kvm *kvm; + + mutex_lock(&kvm_lock); + + list_for_each_entry(kvm, &vm_list, vm_list) + wake_up_process(kvm->arch.nx_lpage_recovery_thread); + + mutex_unlock(&kvm_lock); + } + + return err; +} + +static void kvm_recover_nx_lpages(struct kvm *kvm) +{ + int rcu_idx; + struct kvm_mmu_page *sp; + unsigned int ratio; + LIST_HEAD(invalid_list); + ulong to_zap; + + rcu_idx = srcu_read_lock(&kvm->srcu); + spin_lock(&kvm->mmu_lock); + + ratio = READ_ONCE(nx_huge_pages_recovery_ratio); + to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0; + while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) { + /* + * We use a separate list instead of just using active_mmu_pages + * because the number of lpage_disallowed pages is expected to + * be relatively small compared to the total. + */ + sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages, + struct kvm_mmu_page, + lpage_disallowed_link); + WARN_ON_ONCE(!sp->lpage_disallowed); + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + WARN_ON_ONCE(sp->lpage_disallowed); + + if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) { + kvm_mmu_commit_zap_page(kvm, &invalid_list); + if (to_zap) + cond_resched_lock(&kvm->mmu_lock); + } + } + + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, rcu_idx); +} + +static long get_nx_lpage_recovery_timeout(u64 start_time) +{ + return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio) + ? start_time + 60 * HZ - get_jiffies_64() + : MAX_SCHEDULE_TIMEOUT; +} + +static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data) +{ + u64 start_time; + long remaining_time; + + while (true) { + start_time = get_jiffies_64(); + remaining_time = get_nx_lpage_recovery_timeout(start_time); + + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop() && remaining_time > 0) { + schedule_timeout(remaining_time); + remaining_time = get_nx_lpage_recovery_timeout(start_time); + set_current_state(TASK_INTERRUPTIBLE); + } + + set_current_state(TASK_RUNNING); + + if (kthread_should_stop()) + return 0; + + kvm_recover_nx_lpages(kvm); + } +} + +int kvm_mmu_post_init_vm(struct kvm *kvm) +{ + int err; + + err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0, + "kvm-nx-lpage-recovery", + &kvm->arch.nx_lpage_recovery_thread); + if (!err) + kthread_unpark(kvm->arch.nx_lpage_recovery_thread); + + return err; +} + +void kvm_mmu_pre_destroy_vm(struct kvm *kvm) +{ + if (kvm->arch.nx_lpage_recovery_thread) + kthread_stop(kvm->arch.nx_lpage_recovery_thread); +} diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/mmu/page_track.c index 3521e2d176f2..3521e2d176f2 100644 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 7d5cdb3af594..97b21e7fd013 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -614,13 +614,14 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct guest_walker *gw, int write_fault, int hlevel, - kvm_pfn_t pfn, bool map_writable, bool prefault) + kvm_pfn_t pfn, bool map_writable, bool prefault, + bool lpage_disallowed) { struct kvm_mmu_page *sp = NULL; struct kvm_shadow_walk_iterator it; unsigned direct_access, access = gw->pt_access; int top_level, ret; - gfn_t base_gfn; + gfn_t gfn, base_gfn; direct_access = gw->pte_access; @@ -665,13 +666,25 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, link_shadow_page(vcpu, it.sptep, sp); } - base_gfn = gw->gfn; + /* + * FNAME(page_fault) might have clobbered the bottom bits of + * gw->gfn, restore them from the virtual address. + */ + gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT); + base_gfn = gfn; trace_kvm_mmu_spte_requested(addr, gw->level, pfn); for (; shadow_walk_okay(&it); shadow_walk_next(&it)) { clear_sp_write_flooding_count(it.sptep); - base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + + /* + * We cannot overwrite existing page tables with an NX + * large page, as the leaf could be executable. + */ + disallowed_hugepage_adjust(it, gfn, &pfn, &hlevel); + + base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); if (it.level == hlevel) break; @@ -683,6 +696,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, sp = kvm_mmu_get_page(vcpu, base_gfn, addr, it.level - 1, true, direct_access); link_shadow_page(vcpu, it.sptep, sp); + if (lpage_disallowed) + account_huge_nx_page(vcpu->kvm, sp); } } @@ -759,9 +774,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int r; kvm_pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; - bool force_pt_level = false; unsigned long mmu_seq; bool map_writable, is_self_change_mapping; + bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && + is_nx_huge_page_enabled(); + bool force_pt_level = lpage_disallowed; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); @@ -851,7 +868,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, if (!force_pt_level) transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level); r = FNAME(fetch)(vcpu, addr, &walker, write_fault, - level, pfn, map_writable, prefault); + level, pfn, map_writable, prefault, lpage_disallowed); kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); out_unlock: diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 46875bbd0419..d5e6d5b3f06f 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -62,8 +62,7 @@ static void kvm_perf_overflow(struct perf_event *perf_event, struct kvm_pmc *pmc = perf_event->overflow_handler_context; struct kvm_pmu *pmu = pmc_to_pmu(pmc); - if (!test_and_set_bit(pmc->idx, - (unsigned long *)&pmu->reprogram_pmi)) { + if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) { __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); kvm_make_request(KVM_REQ_PMU, pmc->vcpu); } @@ -76,8 +75,7 @@ static void kvm_perf_overflow_intr(struct perf_event *perf_event, struct kvm_pmc *pmc = perf_event->overflow_handler_context; struct kvm_pmu *pmu = pmc_to_pmu(pmc); - if (!test_and_set_bit(pmc->idx, - (unsigned long *)&pmu->reprogram_pmi)) { + if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) { __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); kvm_make_request(KVM_REQ_PMU, pmc->vcpu); @@ -137,7 +135,37 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, } pmc->perf_event = event; - clear_bit(pmc->idx, (unsigned long*)&pmc_to_pmu(pmc)->reprogram_pmi); + pmc_to_pmu(pmc)->event_count++; + clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); +} + +static void pmc_pause_counter(struct kvm_pmc *pmc) +{ + u64 counter = pmc->counter; + + if (!pmc->perf_event) + return; + + /* update counter, reset event value to avoid redundant accumulation */ + counter += perf_event_pause(pmc->perf_event, true); + pmc->counter = counter & pmc_bitmask(pmc); +} + +static bool pmc_resume_counter(struct kvm_pmc *pmc) +{ + if (!pmc->perf_event) + return false; + + /* recalibrate sample period and check if it's accepted by perf core */ + if (perf_event_period(pmc->perf_event, + (-pmc->counter) & pmc_bitmask(pmc))) + return false; + + /* reuse perf_event to serve as pmc_reprogram_counter() does*/ + perf_event_enable(pmc->perf_event); + + clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi); + return true; } void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) @@ -154,7 +182,7 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) pmc->eventsel = eventsel; - pmc_stop_counter(pmc); + pmc_pause_counter(pmc); if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc)) return; @@ -193,6 +221,12 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) if (type == PERF_TYPE_RAW) config = eventsel & X86_RAW_EVENT_MASK; + if (pmc->current_config == eventsel && pmc_resume_counter(pmc)) + return; + + pmc_release_perf_event(pmc); + + pmc->current_config = eventsel; pmc_reprogram_counter(pmc, type, config, !(eventsel & ARCH_PERFMON_EVENTSEL_USR), !(eventsel & ARCH_PERFMON_EVENTSEL_OS), @@ -209,7 +243,7 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) struct kvm_pmu_event_filter *filter; struct kvm *kvm = pmc->vcpu->kvm; - pmc_stop_counter(pmc); + pmc_pause_counter(pmc); if (!en_field || !pmc_is_enabled(pmc)) return; @@ -224,6 +258,12 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) return; } + if (pmc->current_config == (u64)ctrl && pmc_resume_counter(pmc)) + return; + + pmc_release_perf_event(pmc); + + pmc->current_config = (u64)ctrl; pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE, kvm_x86_ops->pmu_ops->find_fixed_event(idx), !(en_field & 0x2), /* exclude user */ @@ -253,27 +293,32 @@ EXPORT_SYMBOL_GPL(reprogram_counter); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - u64 bitmask; int bit; - bitmask = pmu->reprogram_pmi; - - for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) { + for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) { struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, bit); if (unlikely(!pmc || !pmc->perf_event)) { - clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi); + clear_bit(bit, pmu->reprogram_pmi); continue; } reprogram_counter(pmu, bit); } + + /* + * Unused perf_events are only released if the corresponding MSRs + * weren't accessed during the last vCPU time slice. kvm_arch_sched_in + * triggers KVM_REQ_PMU if cleanup is needed. + */ + if (unlikely(pmu->need_cleanup)) + kvm_pmu_cleanup(vcpu); } /* check if idx is a valid index to access PMU */ -int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx) +int kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { - return kvm_x86_ops->pmu_ops->is_valid_msr_idx(vcpu, idx); + return kvm_x86_ops->pmu_ops->is_valid_rdpmc_ecx(vcpu, idx); } bool is_vmware_backdoor_pmc(u32 pmc_idx) @@ -323,7 +368,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) if (is_vmware_backdoor_pmc(idx)) return kvm_pmu_rdpmc_vmware(vcpu, idx, data); - pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, idx, &mask); + pmc = kvm_x86_ops->pmu_ops->rdpmc_ecx_to_pmc(vcpu, idx, &mask); if (!pmc) return 1; @@ -339,7 +384,17 @@ void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { - return kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, msr); + return kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, msr) || + kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, msr); +} + +static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, msr); + + if (pmc) + __set_bit(pmc->idx, pmu->pmc_in_use); } int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) @@ -349,6 +404,7 @@ int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { + kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index); return kvm_x86_ops->pmu_ops->set_msr(vcpu, msr_info); } @@ -376,9 +432,45 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu) memset(pmu, 0, sizeof(*pmu)); kvm_x86_ops->pmu_ops->init(vcpu); init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn); + pmu->event_count = 0; + pmu->need_cleanup = false; kvm_pmu_refresh(vcpu); } +static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) +{ + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + + if (pmc_is_fixed(pmc)) + return fixed_ctrl_field(pmu->fixed_ctr_ctrl, + pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3; + + return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE; +} + +/* Release perf_events for vPMCs that have been unused for a full time slice. */ +void kvm_pmu_cleanup(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc = NULL; + DECLARE_BITMAP(bitmask, X86_PMC_IDX_MAX); + int i; + + pmu->need_cleanup = false; + + bitmap_andnot(bitmask, pmu->all_valid_pmc_idx, + pmu->pmc_in_use, X86_PMC_IDX_MAX); + + for_each_set_bit(i, bitmask, X86_PMC_IDX_MAX) { + pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, i); + + if (pmc && pmc->perf_event && !pmc_speculative_in_use(pmc)) + pmc_stop_counter(pmc); + } + + bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX); +} + void kvm_pmu_destroy(struct kvm_vcpu *vcpu) { kvm_pmu_reset(vcpu); diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 58265f761c3b..7ebb62326c14 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -25,9 +25,10 @@ struct kvm_pmu_ops { unsigned (*find_fixed_event)(int idx); bool (*pmc_is_enabled)(struct kvm_pmc *pmc); struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx); - struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, unsigned idx, - u64 *mask); - int (*is_valid_msr_idx)(struct kvm_vcpu *vcpu, unsigned idx); + struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu, + unsigned int idx, u64 *mask); + struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, u32 msr); + int (*is_valid_rdpmc_ecx)(struct kvm_vcpu *vcpu, unsigned int idx); bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr); int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr, u64 *data); int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info); @@ -55,12 +56,21 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc) return counter & pmc_bitmask(pmc); } -static inline void pmc_stop_counter(struct kvm_pmc *pmc) +static inline void pmc_release_perf_event(struct kvm_pmc *pmc) { if (pmc->perf_event) { - pmc->counter = pmc_read_counter(pmc); perf_event_release_kernel(pmc->perf_event); pmc->perf_event = NULL; + pmc->current_config = 0; + pmc_to_pmu(pmc)->event_count--; + } +} + +static inline void pmc_stop_counter(struct kvm_pmc *pmc) +{ + if (pmc->perf_event) { + pmc->counter = pmc_read_counter(pmc); + pmc_release_perf_event(pmc); } } @@ -79,6 +89,12 @@ static inline bool pmc_is_enabled(struct kvm_pmc *pmc) return kvm_x86_ops->pmu_ops->pmc_is_enabled(pmc); } +static inline bool kvm_valid_perf_global_ctrl(struct kvm_pmu *pmu, + u64 data) +{ + return !(pmu->global_ctrl_mask & data); +} + /* returns general purpose PMC with the specified MSR. Note that it can be * used for both PERFCTRn and EVNTSELn; that is why it accepts base as a * paramenter to tell them apart. @@ -110,13 +126,14 @@ void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx); void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); -int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx); +int kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx); bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr); int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data); int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); void kvm_pmu_refresh(struct kvm_vcpu *vcpu); void kvm_pmu_reset(struct kvm_vcpu *vcpu); void kvm_pmu_init(struct kvm_vcpu *vcpu); +void kvm_pmu_cleanup(struct kvm_vcpu *vcpu); void kvm_pmu_destroy(struct kvm_vcpu *vcpu); int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp); diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c index c8388389a3b0..ce0b10fe5e2b 100644 --- a/arch/x86/kvm/pmu_amd.c +++ b/arch/x86/kvm/pmu_amd.c @@ -174,7 +174,7 @@ static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) } /* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */ -static int amd_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx) +static int amd_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -184,7 +184,8 @@ static int amd_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx) } /* idx is the ECX register of RDPMC instruction */ -static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *mask) +static struct kvm_pmc *amd_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, + unsigned int idx, u64 *mask) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct kvm_pmc *counters; @@ -199,13 +200,19 @@ static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, unsigned idx, u static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { + /* All MSRs refer to exactly one PMC, so msr_idx_to_pmc is enough. */ + return false; +} + +static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr) +{ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - int ret = false; + struct kvm_pmc *pmc; - ret = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER) || - get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL); + pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER); + pmc = pmc ? pmc : get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL); - return ret; + return pmc; } static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) @@ -272,6 +279,7 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu) pmu->counter_bitmask[KVM_PMC_FIXED] = 0; pmu->nr_arch_fixed_counters = 0; pmu->global_status = 0; + bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters); } static void amd_pmu_init(struct kvm_vcpu *vcpu) @@ -285,6 +293,7 @@ static void amd_pmu_init(struct kvm_vcpu *vcpu) pmu->gp_counters[i].type = KVM_PMC_GP; pmu->gp_counters[i].vcpu = vcpu; pmu->gp_counters[i].idx = i; + pmu->gp_counters[i].current_config = 0; } } @@ -306,8 +315,9 @@ struct kvm_pmu_ops amd_pmu_ops = { .find_fixed_event = amd_find_fixed_event, .pmc_is_enabled = amd_pmc_is_enabled, .pmc_idx_to_pmc = amd_pmc_idx_to_pmc, + .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc, .msr_idx_to_pmc = amd_msr_idx_to_pmc, - .is_valid_msr_idx = amd_is_valid_msr_idx, + .is_valid_rdpmc_ecx = amd_is_valid_rdpmc_ecx, .is_valid_msr = amd_is_valid_msr, .get_msr = amd_pmu_get_msr, .set_msr = amd_pmu_set_msr, diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f8ecb6df5106..362e874297e4 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -38,6 +38,7 @@ #include <linux/file.h> #include <linux/pagemap.h> #include <linux/swap.h> +#include <linux/rwsem.h> #include <asm/apic.h> #include <asm/perf_event.h> @@ -418,9 +419,13 @@ enum { #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL +static int sev_flush_asids(void); +static DECLARE_RWSEM(sev_deactivate_lock); +static DEFINE_MUTEX(sev_bitmap_lock); static unsigned int max_sev_asid; static unsigned int min_sev_asid; static unsigned long *sev_asid_bitmap; +static unsigned long *sev_reclaim_asid_bitmap; #define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT) struct enc_region { @@ -734,8 +739,14 @@ static int get_npt_level(struct kvm_vcpu *vcpu) static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) { vcpu->arch.efer = efer; - if (!npt_enabled && !(efer & EFER_LMA)) - efer &= ~EFER_LME; + + if (!npt_enabled) { + /* Shadow paging assumes NX to be available. */ + efer |= EFER_NX; + + if (!(efer & EFER_LMA)) + efer &= ~EFER_LME; + } to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); @@ -1229,11 +1240,15 @@ static __init int sev_hardware_setup(void) /* Minimum ASID value that should be used for SEV guest */ min_sev_asid = cpuid_edx(0x8000001F); - /* Initialize SEV ASID bitmap */ + /* Initialize SEV ASID bitmaps */ sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL); if (!sev_asid_bitmap) return 1; + sev_reclaim_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL); + if (!sev_reclaim_asid_bitmap) + return 1; + status = kmalloc(sizeof(*status), GFP_KERNEL); if (!status) return 1; @@ -1412,8 +1427,12 @@ static __exit void svm_hardware_unsetup(void) { int cpu; - if (svm_sev_enabled()) + if (svm_sev_enabled()) { bitmap_free(sev_asid_bitmap); + bitmap_free(sev_reclaim_asid_bitmap); + + sev_flush_asids(); + } for_each_possible_cpu(cpu) svm_cpu_uninit(cpu); @@ -1723,25 +1742,22 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) return 0; } -static void __sev_asid_free(int asid) +static void sev_asid_free(int asid) { struct svm_cpu_data *sd; int cpu, pos; + mutex_lock(&sev_bitmap_lock); + pos = asid - 1; - clear_bit(pos, sev_asid_bitmap); + __set_bit(pos, sev_reclaim_asid_bitmap); for_each_possible_cpu(cpu) { sd = per_cpu(svm_data, cpu); sd->sev_vmcbs[pos] = NULL; } -} -static void sev_asid_free(struct kvm *kvm) -{ - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - - __sev_asid_free(sev->asid); + mutex_unlock(&sev_bitmap_lock); } static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) @@ -1758,10 +1774,12 @@ static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) /* deactivate handle */ data->handle = handle; + + /* Guard DEACTIVATE against WBINVD/DF_FLUSH used in ASID recycling */ + down_read(&sev_deactivate_lock); sev_guest_deactivate(data, NULL); + up_read(&sev_deactivate_lock); - wbinvd_on_all_cpus(); - sev_guest_df_flush(NULL); kfree(data); decommission = kzalloc(sizeof(*decommission), GFP_KERNEL); @@ -1910,7 +1928,7 @@ static void sev_vm_destroy(struct kvm *kvm) mutex_unlock(&kvm->lock); sev_unbind_asid(kvm, sev->handle); - sev_asid_free(kvm); + sev_asid_free(sev->asid); } static void avic_vm_destroy(struct kvm *kvm) @@ -2364,7 +2382,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); break; default: - BUG(); + WARN_ON_ONCE(1); } } @@ -2517,10 +2535,6 @@ static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) { } -static void svm_decache_cr3(struct kvm_vcpu *vcpu) -{ -} - static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { } @@ -4591,6 +4605,7 @@ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) int ret = 0; struct vcpu_svm *svm = to_svm(vcpu); u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR); + u32 id = kvm_xapic_id(vcpu->arch.apic); if (ldr == svm->ldr_reg) return 0; @@ -4598,7 +4613,7 @@ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) avic_invalidate_logical_id_entry(vcpu); if (ldr) - ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr); + ret = avic_ldr_write(vcpu, id, ldr); if (!ret) svm->ldr_reg = ldr; @@ -4610,8 +4625,7 @@ static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu) { u64 *old, *new; struct vcpu_svm *svm = to_svm(vcpu); - u32 apic_id_reg = kvm_lapic_get_reg(vcpu->arch.apic, APIC_ID); - u32 id = (apic_id_reg >> 24) & 0xff; + u32 id = kvm_xapic_id(vcpu->arch.apic); if (vcpu->vcpu_id == id) return 0; @@ -4991,6 +5005,18 @@ static int handle_exit(struct kvm_vcpu *vcpu) return 0; } +#ifdef CONFIG_RETPOLINE + if (exit_code == SVM_EXIT_MSR) + return msr_interception(svm); + else if (exit_code == SVM_EXIT_VINTR) + return interrupt_window_interception(svm); + else if (exit_code == SVM_EXIT_INTR) + return intr_interception(svm); + else if (exit_code == SVM_EXIT_HLT) + return halt_interception(svm); + else if (exit_code == SVM_EXIT_NPF) + return npf_interception(svm); +#endif return svm_exit_handlers[exit_code](svm); } @@ -5086,8 +5112,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vcpu_svm *svm = to_svm(vcpu); - if (svm_nested_virtualize_tpr(vcpu) || - kvm_vcpu_apicv_active(vcpu)) + if (svm_nested_virtualize_tpr(vcpu)) return; clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); @@ -5104,9 +5129,9 @@ static void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu) return; } -static bool svm_get_enable_apicv(struct kvm_vcpu *vcpu) +static bool svm_get_enable_apicv(struct kvm *kvm) { - return avic && irqchip_split(vcpu->kvm); + return avic && irqchip_split(kvm); } static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) @@ -5628,7 +5653,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) svm->vmcb->save.cr2 = vcpu->arch.cr2; clgi(); - kvm_load_guest_xcr0(vcpu); + kvm_load_guest_xsave_state(vcpu); if (lapic_in_kernel(vcpu) && vcpu->arch.apic->lapic_timer.timer_advance_ns) @@ -5778,7 +5803,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_before_interrupt(&svm->vcpu); - kvm_put_guest_xcr0(vcpu); + kvm_load_host_xsave_state(vcpu); stgi(); /* Any pending NMI will happen here */ @@ -5887,6 +5912,9 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && + boot_cpu_has(X86_FEATURE_XSAVES); + /* Update nrips enabled cache */ svm->nrips_enabled = !!guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS); @@ -5962,7 +5990,7 @@ static bool svm_mpx_supported(void) static bool svm_xsaves_supported(void) { - return false; + return boot_cpu_has(X86_FEATURE_XSAVES); } static bool svm_umip_emulated(void) @@ -6264,18 +6292,73 @@ static int enable_smi_window(struct kvm_vcpu *vcpu) return 0; } +static int sev_flush_asids(void) +{ + int ret, error; + + /* + * DEACTIVATE will clear the WBINVD indicator causing DF_FLUSH to fail, + * so it must be guarded. + */ + down_write(&sev_deactivate_lock); + + wbinvd_on_all_cpus(); + ret = sev_guest_df_flush(&error); + + up_write(&sev_deactivate_lock); + + if (ret) + pr_err("SEV: DF_FLUSH failed, ret=%d, error=%#x\n", ret, error); + + return ret; +} + +/* Must be called with the sev_bitmap_lock held */ +static bool __sev_recycle_asids(void) +{ + int pos; + + /* Check if there are any ASIDs to reclaim before performing a flush */ + pos = find_next_bit(sev_reclaim_asid_bitmap, + max_sev_asid, min_sev_asid - 1); + if (pos >= max_sev_asid) + return false; + + if (sev_flush_asids()) + return false; + + bitmap_xor(sev_asid_bitmap, sev_asid_bitmap, sev_reclaim_asid_bitmap, + max_sev_asid); + bitmap_zero(sev_reclaim_asid_bitmap, max_sev_asid); + + return true; +} + static int sev_asid_new(void) { + bool retry = true; int pos; + mutex_lock(&sev_bitmap_lock); + /* * SEV-enabled guest must use asid from min_sev_asid to max_sev_asid. */ +again: pos = find_next_zero_bit(sev_asid_bitmap, max_sev_asid, min_sev_asid - 1); - if (pos >= max_sev_asid) + if (pos >= max_sev_asid) { + if (retry && __sev_recycle_asids()) { + retry = false; + goto again; + } + mutex_unlock(&sev_bitmap_lock); return -EBUSY; + } + + __set_bit(pos, sev_asid_bitmap); + + mutex_unlock(&sev_bitmap_lock); - set_bit(pos, sev_asid_bitmap); return pos + 1; } @@ -6303,7 +6386,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) return 0; e_free: - __sev_asid_free(asid); + sev_asid_free(asid); return ret; } @@ -6313,12 +6396,6 @@ static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error) int asid = sev_get_asid(kvm); int ret; - wbinvd_on_all_cpus(); - - ret = sev_guest_df_flush(error); - if (ret) - return ret; - data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); if (!data) return -ENOMEM; @@ -7208,7 +7285,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .get_cpl = svm_get_cpl, .get_cs_db_l_bits = kvm_get_cs_db_l_bits, .decache_cr0_guest_bits = svm_decache_cr0_guest_bits, - .decache_cr3 = svm_decache_cr3, .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, .set_cr0 = svm_set_cr0, .set_cr3 = svm_set_cr3, diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 41abc62c9a8a..4aea7d304beb 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -10,6 +10,7 @@ #include "hyperv.h" #include "mmu.h" #include "nested.h" +#include "pmu.h" #include "trace.h" #include "x86.h" @@ -27,6 +28,16 @@ module_param(nested_early_check, bool, S_IRUGO); failed; \ }) +#define SET_MSR_OR_WARN(vcpu, idx, data) \ +({ \ + bool failed = kvm_set_msr(vcpu, idx, data); \ + if (failed) \ + pr_warn_ratelimited( \ + "%s cannot write MSR (0x%x, 0x%llx)\n", \ + __func__, idx, data); \ + failed; \ +}) + /* * Hyper-V requires all of these, so mark them as supported even though * they are just treated the same as all-context. @@ -257,7 +268,7 @@ static void free_nested(struct kvm_vcpu *vcpu) vmx->nested.cached_shadow_vmcs12 = NULL; /* Unpin physical memory we referred to in the vmcs02 */ if (vmx->nested.apic_access_page) { - kvm_release_page_dirty(vmx->nested.apic_access_page); + kvm_release_page_clean(vmx->nested.apic_access_page); vmx->nested.apic_access_page = NULL; } kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); @@ -929,6 +940,57 @@ fail: return i + 1; } +static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu, + u32 msr_index, + u64 *data) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * If the L0 hypervisor stored a more accurate value for the TSC that + * does not include the time taken for emulation of the L2->L1 + * VM-exit in L0, use the more accurate value. + */ + if (msr_index == MSR_IA32_TSC) { + int index = vmx_find_msr_index(&vmx->msr_autostore.guest, + MSR_IA32_TSC); + + if (index >= 0) { + u64 val = vmx->msr_autostore.guest.val[index].value; + + *data = kvm_read_l1_tsc(vcpu, val); + return true; + } + } + + if (kvm_get_msr(vcpu, msr_index, data)) { + pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__, + msr_index); + return false; + } + return true; +} + +static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i, + struct vmx_msr_entry *e) +{ + if (kvm_vcpu_read_guest(vcpu, + gpa + i * sizeof(*e), + e, 2 * sizeof(u32))) { + pr_debug_ratelimited( + "%s cannot read MSR entry (%u, 0x%08llx)\n", + __func__, i, gpa + i * sizeof(*e)); + return false; + } + if (nested_vmx_store_msr_check(vcpu, e)) { + pr_debug_ratelimited( + "%s check failed (%u, 0x%x, 0x%x)\n", + __func__, i, e->index, e->reserved); + return false; + } + return true; +} + static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) { u64 data; @@ -940,26 +1002,12 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) if (unlikely(i >= max_msr_list_size)) return -EINVAL; - if (kvm_vcpu_read_guest(vcpu, - gpa + i * sizeof(e), - &e, 2 * sizeof(u32))) { - pr_debug_ratelimited( - "%s cannot read MSR entry (%u, 0x%08llx)\n", - __func__, i, gpa + i * sizeof(e)); + if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) return -EINVAL; - } - if (nested_vmx_store_msr_check(vcpu, &e)) { - pr_debug_ratelimited( - "%s check failed (%u, 0x%x, 0x%x)\n", - __func__, i, e.index, e.reserved); - return -EINVAL; - } - if (kvm_get_msr(vcpu, e.index, &data)) { - pr_debug_ratelimited( - "%s cannot read MSR (%u, 0x%x)\n", - __func__, i, e.index); + + if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data)) return -EINVAL; - } + if (kvm_vcpu_write_guest(vcpu, gpa + i * sizeof(e) + offsetof(struct vmx_msr_entry, value), @@ -973,6 +1021,60 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) return 0; } +static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + u32 count = vmcs12->vm_exit_msr_store_count; + u64 gpa = vmcs12->vm_exit_msr_store_addr; + struct vmx_msr_entry e; + u32 i; + + for (i = 0; i < count; i++) { + if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) + return false; + + if (e.index == msr_index) + return true; + } + return false; +} + +static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, + u32 msr_index) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmx_msrs *autostore = &vmx->msr_autostore.guest; + bool in_vmcs12_store_list; + int msr_autostore_index; + bool in_autostore_list; + int last; + + msr_autostore_index = vmx_find_msr_index(autostore, msr_index); + in_autostore_list = msr_autostore_index >= 0; + in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index); + + if (in_vmcs12_store_list && !in_autostore_list) { + if (autostore->nr == NR_LOADSTORE_MSRS) { + /* + * Emulated VMEntry does not fail here. Instead a less + * accurate value will be returned by + * nested_vmx_get_vmexit_msr_value() using kvm_get_msr() + * instead of reading the value from the vmcs02 VMExit + * MSR-store area. + */ + pr_warn_ratelimited( + "Not enough msr entries in msr_autostore. Can't add msr %x\n", + msr_index); + return; + } + last = autostore->nr++; + autostore->val[last].index = msr_index; + } else if (!in_vmcs12_store_list && in_autostore_list) { + last = --autostore->nr; + autostore->val[msr_autostore_index] = autostore->val[last]; + } +} + static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val) { unsigned long invalid_mask; @@ -1012,7 +1114,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne kvm_mmu_new_cr3(vcpu, cr3, false); vcpu->arch.cr3 = cr3; - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); kvm_init_mmu(vcpu, false); @@ -1024,7 +1126,9 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne * populated by L2 differently than TLB entries populated * by L1. * - * If L1 uses EPT, then TLB entries are tagged with different EPTP. + * If L0 uses EPT, L1 and L2 run with different EPTP because + * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries + * are tagged with different EPTP. * * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged * with different VPID (L1 entries are tagged with vmx->vpid @@ -1034,7 +1138,7 @@ static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - return nested_cpu_has_ept(vmcs12) || + return enable_ept || (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02); } @@ -2018,7 +2122,7 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) * addresses are constant (for vmcs02), the counts can change based * on L2's behavior, e.g. switching to/from long mode. */ - vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); + vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val)); vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); @@ -2073,6 +2177,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) exec_control &= ~CPU_BASED_TPR_SHADOW; exec_control |= vmcs12->cpu_based_vm_exec_control; + vmx->nested.l1_tpr_threshold = -1; if (exec_control & CPU_BASED_TPR_SHADOW) vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); #ifdef CONFIG_X86_64 @@ -2285,6 +2390,13 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); } + /* + * Make sure the msr_autostore list is up to date before we set the + * count in the vmcs02. + */ + prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC); + + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr); vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); @@ -2381,9 +2493,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if (nested_cpu_has_ept(vmcs12)) nested_ept_init_mmu_context(vcpu); - else if (nested_cpu_has2(vmcs12, - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) - vmx_flush_tlb(vcpu, true); /* * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those @@ -2418,6 +2527,16 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, entry_failure_code)) return -EINVAL; + /* + * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12 + * on nested VM-Exit, which can occur without actually running L2 and + * thus without hitting vmx_set_cr3(), e.g. if L1 is entering L2 with + * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the + * transition to HLT instead of running L2. + */ + if (enable_ept) + vmcs_writel(GUEST_CR3, vmcs12->guest_cr3); + /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */ if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { @@ -2430,6 +2549,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if (!enable_ept) vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && + SET_MSR_OR_WARN(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, + vmcs12->guest_ia32_perf_global_ctrl)) + return -EINVAL; + kvm_rsp_write(vcpu, vmcs12->guest_rsp); kvm_rip_write(vcpu, vmcs12->guest_rip); return 0; @@ -2610,7 +2734,7 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, /* VM-entry exception error code */ if (CC(has_error_code && - vmcs12->vm_entry_exception_error_code & GENMASK(31, 15))) + vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) return -EINVAL; /* VM-entry interruption-info field: reserved bits */ @@ -2664,6 +2788,11 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) return -EINVAL; + if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && + CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), + vmcs12->host_ia32_perf_global_ctrl))) + return -EINVAL; + #ifdef CONFIG_X86_64 ia32e = !!(vcpu->arch.efer & EFER_LMA); #else @@ -2779,6 +2908,11 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, return -EINVAL; } + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && + CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), + vmcs12->guest_ia32_perf_global_ctrl))) + return -EINVAL; + /* * If the load IA32_EFER VM-entry control is 1, the following checks * are performed on the field for the IA32_EFER MSR: @@ -2917,7 +3051,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12); -static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) +static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -2933,23 +3067,22 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) * to it so we can release it later. */ if (vmx->nested.apic_access_page) { /* shouldn't happen */ - kvm_release_page_dirty(vmx->nested.apic_access_page); + kvm_release_page_clean(vmx->nested.apic_access_page); vmx->nested.apic_access_page = NULL; } page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr); - /* - * If translation failed, no matter: This feature asks - * to exit when accessing the given address, and if it - * can never be accessed, this feature won't do - * anything anyway. - */ if (!is_error_page(page)) { vmx->nested.apic_access_page = page; hpa = page_to_phys(vmx->nested.apic_access_page); vmcs_write64(APIC_ACCESS_ADDR, hpa); } else { - secondary_exec_controls_clearbit(vmx, - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); + pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n", + __func__); + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = + KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return false; } } @@ -2994,6 +3127,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); else exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); + return true; } /* @@ -3032,13 +3166,15 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, /* * If from_vmentry is false, this is being called from state restore (either RSM * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. -+ * -+ * Returns: -+ * 0 - success, i.e. proceed with actual VMEnter -+ * 1 - consistency check VMExit -+ * -1 - consistency check VMFail + * + * Returns: + * NVMX_ENTRY_SUCCESS: Entered VMX non-root mode + * NVMX_ENTRY_VMFAIL: Consistency check VMFail + * NVMX_ENTRY_VMEXIT: Consistency check VMExit + * NVMX_ENTRY_KVM_INTERNAL_ERROR: KVM internal error */ -int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) +enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, + bool from_vmentry) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); @@ -3081,11 +3217,12 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) prepare_vmcs02_early(vmx, vmcs12); if (from_vmentry) { - nested_get_vmcs12_pages(vcpu); + if (unlikely(!nested_get_vmcs12_pages(vcpu))) + return NVMX_VMENTRY_KVM_INTERNAL_ERROR; if (nested_vmx_check_vmentry_hw(vcpu)) { vmx_switch_vmcs(vcpu, &vmx->vmcs01); - return -1; + return NVMX_VMENTRY_VMFAIL; } if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual)) @@ -3149,7 +3286,7 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) * returned as far as L1 is concerned. It will only return (and set * the success flag) when L2 exits (see nested_vmx_vmexit()). */ - return 0; + return NVMX_VMENTRY_SUCCESS; /* * A failed consistency check that leads to a VMExit during L1's @@ -3165,14 +3302,14 @@ vmentry_fail_vmexit: vmx_switch_vmcs(vcpu, &vmx->vmcs01); if (!from_vmentry) - return 1; + return NVMX_VMENTRY_VMEXIT; load_vmcs12_host_state(vcpu, vmcs12); vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY; vmcs12->exit_qualification = exit_qual; if (enable_shadow_vmcs || vmx->nested.hv_evmcs) vmx->nested.need_vmcs12_to_shadow_sync = true; - return 1; + return NVMX_VMENTRY_VMEXIT; } /* @@ -3182,9 +3319,9 @@ vmentry_fail_vmexit: static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) { struct vmcs12 *vmcs12; + enum nvmx_vmentry_status status; struct vcpu_vmx *vmx = to_vmx(vcpu); u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); - int ret; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -3244,13 +3381,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * the nested entry. */ vmx->nested.nested_run_pending = 1; - ret = nested_vmx_enter_non_root_mode(vcpu, true); - vmx->nested.nested_run_pending = !ret; - if (ret > 0) - return 1; - else if (ret) - return nested_vmx_failValid(vcpu, - VMXERR_ENTRY_INVALID_CONTROL_FIELD); + status = nested_vmx_enter_non_root_mode(vcpu, true); + if (unlikely(status != NVMX_VMENTRY_SUCCESS)) + goto vmentry_failed; /* Hide L1D cache contents from the nested guest. */ vmx->vcpu.arch.l1tf_flush_l1d = true; @@ -3281,6 +3414,15 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return kvm_vcpu_halt(vcpu); } return 1; + +vmentry_failed: + vmx->nested.nested_run_pending = 0; + if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR) + return 0; + if (status == NVMX_VMENTRY_VMEXIT) + return 1; + WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL); + return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); } /* @@ -3453,6 +3595,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) test_bit(KVM_APIC_INIT, &apic->pending_events)) { if (block_nested_events) return -EBUSY; + clear_bit(KVM_APIC_INIT, &apic->pending_events); nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); return 0; } @@ -3856,8 +3999,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vcpu->arch.pat = vmcs12->host_ia32_pat; } if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) - vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, - vmcs12->host_ia32_perf_global_ctrl); + SET_MSR_OR_WARN(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, + vmcs12->host_ia32_perf_global_ctrl); /* Set L1 segment info according to Intel SDM 27.5.2 Loading Host Segment and Descriptor-Table Registers */ @@ -3976,7 +4119,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) nested_ept_uninit_mmu_context(vcpu); vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); /* * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs @@ -4104,6 +4247,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); + if (vmx->nested.l1_tpr_threshold != -1) + vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold); if (kvm_has_tsc_control) decache_tsc_multiplier(vmx); @@ -4111,15 +4256,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, if (vmx->nested.change_vmcs01_virtual_apic_mode) { vmx->nested.change_vmcs01_virtual_apic_mode = false; vmx_set_virtual_apic_mode(vcpu); - } else if (!nested_cpu_has_ept(vmcs12) && - nested_cpu_has2(vmcs12, - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { - vmx_flush_tlb(vcpu, true); } /* Unpin physical memory we referred to in vmcs02 */ if (vmx->nested.apic_access_page) { - kvm_release_page_dirty(vmx->nested.apic_access_page); + kvm_release_page_clean(vmx->nested.apic_access_page); vmx->nested.apic_access_page = NULL; } kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); @@ -4319,6 +4460,27 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, return 0; } +void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx; + + if (!nested_vmx_allowed(vcpu)) + return; + + vmx = to_vmx(vcpu); + if (kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) { + vmx->nested.msrs.entry_ctls_high |= + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + vmx->nested.msrs.exit_ctls_high |= + VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; + } else { + vmx->nested.msrs.entry_ctls_high &= + ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + vmx->nested.msrs.exit_ctls_high &= + ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + } +} + static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer) { gva_t gva; @@ -5758,7 +5920,7 @@ error_guest_mode: return ret; } -void nested_vmx_vcpu_setup(void) +void nested_vmx_set_vmcs_shadowing_bitmap(void) { if (enable_shadow_vmcs) { vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); @@ -6039,23 +6201,23 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) init_vmcs_shadow_fields(); } - exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear, - exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch, - exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld, - exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst, - exit_handlers[EXIT_REASON_VMREAD] = handle_vmread, - exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume, - exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite, - exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff, - exit_handlers[EXIT_REASON_VMON] = handle_vmon, - exit_handlers[EXIT_REASON_INVEPT] = handle_invept, - exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid, - exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc, + exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear; + exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch; + exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld; + exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst; + exit_handlers[EXIT_REASON_VMREAD] = handle_vmread; + exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume; + exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite; + exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff; + exit_handlers[EXIT_REASON_VMON] = handle_vmon; + exit_handlers[EXIT_REASON_INVEPT] = handle_invept; + exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; + exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; kvm_x86_ops->check_nested_events = vmx_check_nested_events; kvm_x86_ops->get_nested_state = vmx_get_nested_state; kvm_x86_ops->set_nested_state = vmx_set_nested_state; - kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages, + kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages; kvm_x86_ops->nested_enable_evmcs = nested_enable_evmcs; kvm_x86_ops->nested_get_evmcs_version = nested_get_evmcs_version; diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 187d39bf0bf1..fc874d4ead0f 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -6,14 +6,25 @@ #include "vmcs12.h" #include "vmx.h" +/* + * Status returned by nested_vmx_enter_non_root_mode(): + */ +enum nvmx_vmentry_status { + NVMX_VMENTRY_SUCCESS, /* Entered VMX non-root mode */ + NVMX_VMENTRY_VMFAIL, /* Consistency check VMFail */ + NVMX_VMENTRY_VMEXIT, /* Consistency check VMExit */ + NVMX_VMENTRY_KVM_INTERNAL_ERROR,/* KVM internal error */ +}; + void vmx_leave_nested(struct kvm_vcpu *vcpu); void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps, bool apicv); void nested_vmx_hardware_unsetup(void); __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)); -void nested_vmx_vcpu_setup(void); +void nested_vmx_set_vmcs_shadowing_bitmap(void); void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu); -int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry); +enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, + bool from_vmentry); bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason); void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, u32 exit_intr_info, unsigned long exit_qualification); @@ -22,6 +33,7 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata); int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, u32 vmx_instruction_info, bool wr, int len, gva_t *ret); +void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu); static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) { @@ -245,7 +257,7 @@ static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1) return ((val & fixed1) | fixed0) == val; } -static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) +static inline bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) { u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0; u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1; @@ -259,7 +271,7 @@ static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) return fixed_bits_valid(val, fixed0, fixed1); } -static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) +static inline bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) { u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0; u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1; @@ -267,7 +279,7 @@ static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) return fixed_bits_valid(val, fixed0, fixed1); } -static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val) +static inline bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val) { u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0; u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 4dea0e0e7e39..7023138b1cb0 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -15,6 +15,7 @@ #include "x86.h" #include "cpuid.h" #include "lapic.h" +#include "nested.h" #include "pmu.h" static struct kvm_event_hw_type_mapping intel_arch_events[] = { @@ -46,6 +47,7 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) if (old_ctrl == new_ctrl) continue; + __set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use); reprogram_fixed_counter(pmc, new_ctrl, i); } @@ -111,7 +113,7 @@ static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) } /* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */ -static int intel_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx) +static int intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); bool fixed = idx & (1u << 30); @@ -122,8 +124,8 @@ static int intel_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx) (fixed && idx >= pmu->nr_arch_fixed_counters); } -static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, - unsigned idx, u64 *mask) +static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, + unsigned int idx, u64 *mask) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); bool fixed = idx & (1u << 30); @@ -162,6 +164,18 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) return ret; } +static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + + pmc = get_fixed_pmc(pmu, msr); + pmc = pmc ? pmc : get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0); + pmc = pmc ? pmc : get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0); + + return pmc; +} + static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -223,7 +237,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_CORE_PERF_GLOBAL_CTRL: if (pmu->global_ctrl == data) return 0; - if (!(data & pmu->global_ctrl_mask)) { + if (kvm_valid_perf_global_ctrl(pmu, data)) { global_ctrl_changed(pmu, data); return 0; } @@ -262,6 +276,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static void intel_pmu_refresh(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct x86_pmu_capability x86_pmu; struct kvm_cpuid_entry2 *entry; union cpuid10_eax eax; union cpuid10_edx edx; @@ -283,8 +298,10 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) if (!pmu->version) return; + perf_get_x86_pmu_capability(&x86_pmu); + pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters, - INTEL_PMC_MAX_GENERIC); + x86_pmu.num_counters_gp); pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1; pmu->available_event_types = ~entry->ebx & ((1ull << eax.split.mask_length) - 1); @@ -294,7 +311,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) } else { pmu->nr_arch_fixed_counters = min_t(int, edx.split.num_counters_fixed, - INTEL_PMC_MAX_FIXED); + x86_pmu.num_counters_fixed); pmu->counter_bitmask[KVM_PMC_FIXED] = ((u64)1 << edx.split.bit_width_fixed) - 1; } @@ -314,6 +331,13 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED; + + bitmap_set(pmu->all_valid_pmc_idx, + 0, pmu->nr_arch_gp_counters); + bitmap_set(pmu->all_valid_pmc_idx, + INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters); + + nested_vmx_pmu_entry_exit_ctls_update(vcpu); } static void intel_pmu_init(struct kvm_vcpu *vcpu) @@ -325,12 +349,14 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu) pmu->gp_counters[i].type = KVM_PMC_GP; pmu->gp_counters[i].vcpu = vcpu; pmu->gp_counters[i].idx = i; + pmu->gp_counters[i].current_config = 0; } for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) { pmu->fixed_counters[i].type = KVM_PMC_FIXED; pmu->fixed_counters[i].vcpu = vcpu; pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED; + pmu->fixed_counters[i].current_config = 0; } } @@ -363,8 +389,9 @@ struct kvm_pmu_ops intel_pmu_ops = { .find_fixed_event = intel_find_fixed_event, .pmc_is_enabled = intel_pmc_is_enabled, .pmc_idx_to_pmc = intel_pmc_idx_to_pmc, + .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc, .msr_idx_to_pmc = intel_msr_idx_to_pmc, - .is_valid_msr_idx = intel_is_valid_msr_idx, + .is_valid_rdpmc_ecx = intel_is_valid_rdpmc_ecx, .is_valid_msr = intel_is_valid_msr, .get_msr = intel_pmu_get_msr, .set_msr = intel_pmu_set_msr, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d4575ffb3cec..d175429c91b0 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -106,8 +106,6 @@ module_param(enable_apicv, bool, S_IRUGO); static bool __read_mostly nested = 1; module_param(nested, bool, S_IRUGO); -static u64 __read_mostly host_xss; - bool __read_mostly enable_pml = 1; module_param_named(pml, enable_pml, bool, S_IRUGO); @@ -209,6 +207,11 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) struct page *page; unsigned int i; + if (!boot_cpu_has_bug(X86_BUG_L1TF)) { + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; + return 0; + } + if (!enable_ept) { l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED; return 0; @@ -445,6 +448,7 @@ const u32 vmx_msr_index[] = { MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, #endif MSR_EFER, MSR_TSC_AUX, MSR_STAR, + MSR_IA32_TSX_CTRL, }; #if IS_ENABLED(CONFIG_HYPERV) @@ -633,6 +637,23 @@ struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) return NULL; } +static int vmx_set_guest_msr(struct vcpu_vmx *vmx, struct shared_msr_entry *msr, u64 data) +{ + int ret = 0; + + u64 old_msr_data = msr->data; + msr->data = data; + if (msr - vmx->guest_msrs < vmx->save_nmsrs) { + preempt_disable(); + ret = kvm_set_shared_msr(msr->index, msr->data, + msr->mask); + preempt_enable(); + if (ret) + msr->data = old_msr_data; + } + return ret; +} + void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) { vmcs_clear(loaded_vmcs->vmcs); @@ -721,8 +742,8 @@ static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg, bool ret; u32 mask = 1 << (seg * SEG_FIELD_NR + field); - if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) { - vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS); + if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) { + kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS); vmx->segment_cache.bitmask = 0; } ret = vmx->segment_cache.bitmask & mask; @@ -830,7 +851,7 @@ static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, vm_exit_controls_clearbit(vmx, exit); } -static int find_msr(struct vmx_msrs *m, unsigned int msr) +int vmx_find_msr_index(struct vmx_msrs *m, u32 msr) { unsigned int i; @@ -864,7 +885,7 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) } break; } - i = find_msr(&m->guest, msr); + i = vmx_find_msr_index(&m->guest, msr); if (i < 0) goto skip_guest; --m->guest.nr; @@ -872,7 +893,7 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); skip_guest: - i = find_msr(&m->host, msr); + i = vmx_find_msr_index(&m->host, msr); if (i < 0) return; @@ -931,12 +952,12 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, wrmsrl(MSR_IA32_PEBS_ENABLE, 0); } - i = find_msr(&m->guest, msr); + i = vmx_find_msr_index(&m->guest, msr); if (!entry_only) - j = find_msr(&m->host, msr); + j = vmx_find_msr_index(&m->host, msr); - if ((i < 0 && m->guest.nr == NR_AUTOLOAD_MSRS) || - (j < 0 && m->host.nr == NR_AUTOLOAD_MSRS)) { + if ((i < 0 && m->guest.nr == NR_LOADSTORE_MSRS) || + (j < 0 && m->host.nr == NR_LOADSTORE_MSRS)) { printk_once(KERN_WARNING "Not enough msr switch entries. " "Can't add msr %x\n", msr); return; @@ -964,17 +985,9 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) u64 guest_efer = vmx->vcpu.arch.efer; u64 ignore_bits = 0; - if (!enable_ept) { - /* - * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing - * host CPUID is more efficient than testing guest CPUID - * or CR4. Host SMEP is anyway a requirement for guest SMEP. - */ - if (boot_cpu_has(X86_FEATURE_SMEP)) - guest_efer |= EFER_NX; - else if (!(guest_efer & EFER_NX)) - ignore_bits |= EFER_NX; - } + /* Shadow paging assumes NX to be available. */ + if (!enable_ept) + guest_efer |= EFER_NX; /* * LMA and LME handled by hardware; SCE meaningless outside long mode. @@ -1271,6 +1284,18 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) return; + /* + * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change + * PI.NDST: pi_post_block is the one expected to change PID.NDST and the + * wakeup handler expects the vCPU to be on the blocked_vcpu_list that + * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up + * correctly. + */ + if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) { + pi_clear_sn(pi_desc); + goto after_clear_sn; + } + /* The full case. */ do { old.control = new.control = pi_desc->control; @@ -1286,6 +1311,8 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) } while (cmpxchg64(&pi_desc->control, old.control, new.control) != old.control); +after_clear_sn: + /* * Clear SN before reading the bitmap. The VT-d firmware * writes the bitmap and reads SN atomically (5.2.3 in the @@ -1294,7 +1321,7 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) */ smp_mb__after_atomic(); - if (!bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS)) + if (!pi_is_pir_empty(pi_desc)) pi_set_on(pi_desc); } @@ -1407,35 +1434,44 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long rflags, save_rflags; - if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) { - __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); + if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) { + kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); rflags = vmcs_readl(GUEST_RFLAGS); - if (to_vmx(vcpu)->rmode.vm86_active) { + if (vmx->rmode.vm86_active) { rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; - save_rflags = to_vmx(vcpu)->rmode.save_rflags; + save_rflags = vmx->rmode.save_rflags; rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; } - to_vmx(vcpu)->rflags = rflags; + vmx->rflags = rflags; } - return to_vmx(vcpu)->rflags; + return vmx->rflags; } void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { - unsigned long old_rflags = vmx_get_rflags(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long old_rflags; - __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); - to_vmx(vcpu)->rflags = rflags; - if (to_vmx(vcpu)->rmode.vm86_active) { - to_vmx(vcpu)->rmode.save_rflags = rflags; + if (enable_unrestricted_guest) { + kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); + vmx->rflags = rflags; + vmcs_writel(GUEST_RFLAGS, rflags); + return; + } + + old_rflags = vmx_get_rflags(vcpu); + vmx->rflags = rflags; + if (vmx->rmode.vm86_active) { + vmx->rmode.save_rflags = rflags; rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; } vmcs_writel(GUEST_RFLAGS, rflags); - if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM) - to_vmx(vcpu)->emulation_required = emulation_required(vcpu); + if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM) + vmx->emulation_required = emulation_required(vcpu); } u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) @@ -1672,6 +1708,9 @@ static void setup_msrs(struct vcpu_vmx *vmx) index = __find_msr_index(vmx, MSR_TSC_AUX); if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP)) move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_IA32_TSX_CTRL); + if (index >= 0) + move_msr_up(vmx, index, save_nmsrs++); vmx->save_nmsrs = save_nmsrs; vmx->guest_msrs_ready = false; @@ -1771,6 +1810,11 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) #endif case MSR_EFER: return kvm_get_msr_common(vcpu, msr_info); + case MSR_IA32_TSX_CTRL: + if (!msr_info->host_initiated && + !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) + return 1; + goto find_shared_msr; case MSR_IA32_UMWAIT_CONTROL: if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) return 1; @@ -1815,14 +1859,6 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, &msr_info->data); - case MSR_IA32_XSS: - if (!vmx_xsaves_supported() || - (!msr_info->host_initiated && - !(guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && - guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)))) - return 1; - msr_info->data = vcpu->arch.ia32_xss; - break; case MSR_IA32_RTIT_CTL: if (pt_mode != PT_MODE_HOST_GUEST) return 1; @@ -1873,8 +1909,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) return 1; - /* Else, falls through */ + goto find_shared_msr; default: + find_shared_msr: msr = find_msr_entry(vmx, msr_info->index); if (msr) { msr_info->data = msr->data; @@ -1990,6 +2027,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); break; + case MSR_IA32_TSX_CTRL: + if (!msr_info->host_initiated && + !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) + return 1; + if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR)) + return 1; + goto find_shared_msr; case MSR_IA32_PRED_CMD: if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) @@ -2058,25 +2102,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!nested_vmx_allowed(vcpu)) return 1; return vmx_set_vmx_msr(vcpu, msr_index, data); - case MSR_IA32_XSS: - if (!vmx_xsaves_supported() || - (!msr_info->host_initiated && - !(guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && - guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)))) - return 1; - /* - * The only supported bit as of Skylake is bit 8, but - * it is not supported on KVM. - */ - if (data != 0) - return 1; - vcpu->arch.ia32_xss = data; - if (vcpu->arch.ia32_xss != host_xss) - add_atomic_switch_msr(vmx, MSR_IA32_XSS, - vcpu->arch.ia32_xss, host_xss, false); - else - clear_atomic_switch_msr(vmx, MSR_IA32_XSS); - break; case MSR_IA32_RTIT_CTL: if ((pt_mode != PT_MODE_HOST_GUEST) || vmx_rtit_ctl_check(vcpu, data) || @@ -2141,23 +2166,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* Check reserved bit, higher 32 bits should be zero */ if ((data >> 32) != 0) return 1; - /* Else, falls through */ + goto find_shared_msr; + default: + find_shared_msr: msr = find_msr_entry(vmx, msr_index); - if (msr) { - u64 old_msr_data = msr->data; - msr->data = data; - if (msr - vmx->guest_msrs < vmx->save_nmsrs) { - preempt_disable(); - ret = kvm_set_shared_msr(msr->index, msr->data, - msr->mask); - preempt_enable(); - if (ret) - msr->data = old_msr_data; - } - break; - } - ret = kvm_set_msr_common(vcpu, msr_info); + if (msr) + ret = vmx_set_guest_msr(vmx, msr, data); + else + ret = kvm_set_msr_common(vcpu, msr_info); } return ret; @@ -2165,7 +2182,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { - __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); + kvm_register_mark_available(vcpu, reg); + switch (reg) { case VCPU_REGS_RSP: vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); @@ -2177,7 +2195,12 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) if (enable_ept) ept_save_pdptrs(vcpu); break; + case VCPU_EXREG_CR3: + if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu))) + vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); + break; default: + WARN_ON_ONCE(1); break; } } @@ -2848,13 +2871,6 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; } -static void vmx_decache_cr3(struct kvm_vcpu *vcpu) -{ - if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu))) - vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); -} - static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; @@ -2867,8 +2883,7 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - if (!test_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_dirty)) + if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR)) return; if (is_pae_paging(vcpu)) { @@ -2890,10 +2905,7 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu) mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); } - __set_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_avail); - __set_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_dirty); + kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); } static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, @@ -2902,8 +2914,8 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) - vmx_decache_cr3(vcpu); + if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) + vmx_cache_reg(vcpu, VCPU_EXREG_CR3); if (!(cr0 & X86_CR0_PG)) { /* From paging/starting to nonpaging */ exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING | @@ -2984,6 +2996,7 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { struct kvm *kvm = vcpu->kvm; + bool update_guest_cr3 = true; unsigned long guest_cr3; u64 eptp; @@ -3000,15 +3013,20 @@ void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); } - if (enable_unrestricted_guest || is_paging(vcpu) || - is_guest_mode(vcpu)) - guest_cr3 = kvm_read_cr3(vcpu); - else + /* Loading vmcs02.GUEST_CR3 is handled by nested VM-Enter. */ + if (is_guest_mode(vcpu)) + update_guest_cr3 = false; + else if (!enable_unrestricted_guest && !is_paging(vcpu)) guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; + else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) + guest_cr3 = vcpu->arch.cr3; + else /* vmcs01.GUEST_CR3 is already up-to-date. */ + update_guest_cr3 = false; ept_load_pdptrs(vcpu); } - vmcs_writel(GUEST_CR3, guest_cr3); + if (update_guest_cr3) + vmcs_writel(GUEST_CR3, guest_cr3); } int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -3742,7 +3760,7 @@ void pt_update_intercept_for_msr(struct vcpu_vmx *vmx) } } -static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu) +static bool vmx_get_enable_apicv(struct kvm *kvm) { return enable_apicv; } @@ -4035,6 +4053,8 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && guest_cpuid_has(vcpu, X86_FEATURE_XSAVES); + vcpu->arch.xsaves_enabled = xsaves_enabled; + if (!xsaves_enabled) exec_control &= ~SECONDARY_EXEC_XSAVES; @@ -4147,14 +4167,13 @@ static void ept_set_mmio_spte_mask(void) #define VMX_XSS_EXIT_BITMAP 0 /* - * Sets up the vmcs for emulated real mode. + * Noting that the initialization of Guest-state Area of VMCS is in + * vmx_vcpu_reset(). */ -static void vmx_vcpu_setup(struct vcpu_vmx *vmx) +static void init_vmcs(struct vcpu_vmx *vmx) { - int i; - if (nested) - nested_vmx_vcpu_setup(); + nested_vmx_set_vmcs_shadowing_bitmap(); if (cpu_has_vmx_msr_bitmap()) vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap)); @@ -4163,7 +4182,6 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) /* Control */ pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); - vmx->hv_deadline_tsc = -1; exec_controls_set(vmx, vmx_exec_control(vmx)); @@ -4212,21 +4230,6 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); - for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) { - u32 index = vmx_msr_index[i]; - u32 data_low, data_high; - int j = vmx->nmsrs; - - if (rdmsr_safe(index, &data_low, &data_high) < 0) - continue; - if (wrmsr_safe(index, data_low, data_high) < 0) - continue; - vmx->guest_msrs[j].index = i; - vmx->guest_msrs[j].data = 0; - vmx->guest_msrs[j].mask = -1ull; - ++vmx->nmsrs; - } - vm_exit_controls_set(vmx, vmx_vmexit_ctrl()); /* 22.2.1, 20.8.1 */ @@ -4237,6 +4240,9 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) set_cr4_guest_host_mask(vmx); + if (vmx->vpid != 0) + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); + if (vmx_xsaves_supported()) vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); @@ -4339,9 +4345,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - if (vmx->vpid != 0) - vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); - cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; vmx->vcpu.arch.cr0 = cr0; vmx_set_cr0(vcpu, cr0); /* enter rmode */ @@ -4696,7 +4699,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) return 0; } -static int handle_external_interrupt(struct kvm_vcpu *vcpu) +static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu) { ++vcpu->stat.irq_exits; return 1; @@ -4968,21 +4971,6 @@ static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) vmcs_writel(GUEST_DR7, val); } -static int handle_cpuid(struct kvm_vcpu *vcpu) -{ - return kvm_emulate_cpuid(vcpu); -} - -static int handle_rdmsr(struct kvm_vcpu *vcpu) -{ - return kvm_emulate_rdmsr(vcpu); -} - -static int handle_wrmsr(struct kvm_vcpu *vcpu) -{ - return kvm_emulate_wrmsr(vcpu); -} - static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) { kvm_apic_update_ppr(vcpu); @@ -4999,11 +4987,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu) return 1; } -static int handle_halt(struct kvm_vcpu *vcpu) -{ - return kvm_emulate_halt(vcpu); -} - static int handle_vmcall(struct kvm_vcpu *vcpu) { return kvm_emulate_hypercall(vcpu); @@ -5538,14 +5521,6 @@ static int handle_encls(struct kvm_vcpu *vcpu) return 1; } -static int handle_unexpected_vmexit(struct kvm_vcpu *vcpu) -{ - kvm_skip_emulated_instruction(vcpu); - WARN_ONCE(1, "Unexpected VM-Exit Reason = 0x%x", - vmcs_read32(VM_EXIT_REASON)); - return 1; -} - /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -5559,11 +5534,11 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_IO_INSTRUCTION] = handle_io, [EXIT_REASON_CR_ACCESS] = handle_cr, [EXIT_REASON_DR_ACCESS] = handle_dr, - [EXIT_REASON_CPUID] = handle_cpuid, - [EXIT_REASON_MSR_READ] = handle_rdmsr, - [EXIT_REASON_MSR_WRITE] = handle_wrmsr, + [EXIT_REASON_CPUID] = kvm_emulate_cpuid, + [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr, + [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr, [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, - [EXIT_REASON_HLT] = handle_halt, + [EXIT_REASON_HLT] = kvm_emulate_halt, [EXIT_REASON_INVD] = handle_invd, [EXIT_REASON_INVLPG] = handle_invlpg, [EXIT_REASON_RDPMC] = handle_rdpmc, @@ -5597,15 +5572,11 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_INVVPID] = handle_vmx_instruction, [EXIT_REASON_RDRAND] = handle_invalid_op, [EXIT_REASON_RDSEED] = handle_invalid_op, - [EXIT_REASON_XSAVES] = handle_unexpected_vmexit, - [EXIT_REASON_XRSTORS] = handle_unexpected_vmexit, [EXIT_REASON_PML_FULL] = handle_pml_full, [EXIT_REASON_INVPCID] = handle_invpcid, [EXIT_REASON_VMFUNC] = handle_vmx_instruction, [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, [EXIT_REASON_ENCLS] = handle_encls, - [EXIT_REASON_UMWAIT] = handle_unexpected_vmexit, - [EXIT_REASON_TPAUSE] = handle_unexpected_vmexit, }; static const int kvm_vmx_max_exit_handlers = @@ -5940,9 +5911,23 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) } if (exit_reason < kvm_vmx_max_exit_handlers - && kvm_vmx_exit_handlers[exit_reason]) + && kvm_vmx_exit_handlers[exit_reason]) { +#ifdef CONFIG_RETPOLINE + if (exit_reason == EXIT_REASON_MSR_WRITE) + return kvm_emulate_wrmsr(vcpu); + else if (exit_reason == EXIT_REASON_PREEMPTION_TIMER) + return handle_preemption_timer(vcpu); + else if (exit_reason == EXIT_REASON_PENDING_INTERRUPT) + return handle_interrupt_window(vcpu); + else if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) + return handle_external_interrupt(vcpu); + else if (exit_reason == EXIT_REASON_HLT) + return kvm_emulate_halt(vcpu); + else if (exit_reason == EXIT_REASON_EPT_MISCONFIG) + return handle_ept_misconfig(vcpu); +#endif return kvm_vmx_exit_handlers[exit_reason](vcpu); - else { + } else { vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", exit_reason); dump_vmcs(); @@ -6028,17 +6013,17 @@ static void vmx_l1d_flush(struct kvm_vcpu *vcpu) static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + int tpr_threshold; if (is_guest_mode(vcpu) && nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) return; - if (irr == -1 || tpr < irr) { - vmcs_write32(TPR_THRESHOLD, 0); - return; - } - - vmcs_write32(TPR_THRESHOLD, irr); + tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr; + if (is_guest_mode(vcpu)) + to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold; + else + vmcs_write32(TPR_THRESHOLD, tpr_threshold); } void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) @@ -6152,7 +6137,7 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) if (pi_test_on(&vmx->pi_desc)) { pi_clear_on(&vmx->pi_desc); /* - * IOMMU can write to PIR.ON, so the barrier matters even on UP. + * IOMMU can write to PID.ON, so the barrier matters even on UP. * But on x86 this is just a compiler barrier anyway. */ smp_mb__after_atomic(); @@ -6182,7 +6167,10 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu) { - return pi_test_on(vcpu_to_pi_desc(vcpu)); + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + return pi_test_on(pi_desc) || + (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc)); } static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) @@ -6512,9 +6500,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vmx->nested.need_vmcs12_to_shadow_sync) nested_sync_vmcs12_to_shadow(vcpu); - if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) + if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP)) vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); - if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) + if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP)) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); cr3 = __get_current_cr3_fast(); @@ -6537,7 +6525,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) vmx_set_interrupt_shadow(vcpu, 0); - kvm_load_guest_xcr0(vcpu); + kvm_load_guest_xsave_state(vcpu); if (static_cpu_has(X86_FEATURE_PKU) && kvm_read_cr4_bits(vcpu, X86_CR4_PKE) && @@ -6644,7 +6632,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) __write_pkru(vmx->host_pkru); } - kvm_put_guest_xcr0(vcpu); + kvm_load_host_xsave_state(vcpu); vmx->nested.nested_run_pending = 0; vmx->idt_vectoring_info = 0; @@ -6698,7 +6686,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) int err; struct vcpu_vmx *vmx; unsigned long *msr_bitmap; - int cpu; + int i, cpu; BUILD_BUG_ON_MSG(offsetof(struct vcpu_vmx, vcpu) != 0, "struct kvm_vcpu must be at offset 0 for arch usercopy region"); @@ -6750,6 +6738,34 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (!vmx->guest_msrs) goto free_pml; + for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) { + u32 index = vmx_msr_index[i]; + u32 data_low, data_high; + int j = vmx->nmsrs; + + if (rdmsr_safe(index, &data_low, &data_high) < 0) + continue; + if (wrmsr_safe(index, data_low, data_high) < 0) + continue; + + vmx->guest_msrs[j].index = i; + vmx->guest_msrs[j].data = 0; + switch (index) { + case MSR_IA32_TSX_CTRL: + /* + * No need to pass TSX_CTRL_CPUID_CLEAR through, so + * let's avoid changing CPUID bits under the host + * kernel's feet. + */ + vmx->guest_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR; + break; + default: + vmx->guest_msrs[j].mask = -1ull; + break; + } + ++vmx->nmsrs; + } + err = alloc_loaded_vmcs(&vmx->vmcs01); if (err < 0) goto free_msrs; @@ -6774,7 +6790,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) cpu = get_cpu(); vmx_vcpu_load(&vmx->vcpu, cpu); vmx->vcpu.cpu = cpu; - vmx_vcpu_setup(vmx); + init_vmcs(vmx); vmx_vcpu_put(&vmx->vcpu); put_cpu(); if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { @@ -6994,6 +7010,7 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP)); cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU)); cr4_fixed1_update(X86_CR4_UMIP, ecx, bit(X86_FEATURE_UMIP)); + cr4_fixed1_update(X86_CR4_LA57, ecx, bit(X86_FEATURE_LA57)); #undef cr4_fixed1_update } @@ -7088,6 +7105,9 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */ + vcpu->arch.xsaves_enabled = false; + if (cpu_has_secondary_exec_ctrls()) { vmx_compute_secondary_exec_control(vmx); vmcs_set_secondary_exec_control(vmx); @@ -7095,10 +7115,12 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) if (nested_vmx_allowed(vcpu)) to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= + FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; else to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= - ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; + ~(FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX | + FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX); if (nested_vmx_allowed(vcpu)) { nested_vmx_cr_fixed1_bits_update(vcpu); @@ -7108,6 +7130,15 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) if (boot_cpu_has(X86_FEATURE_INTEL_PT) && guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT)) update_intel_pt_cfg(vcpu); + + if (boot_cpu_has(X86_FEATURE_RTM)) { + struct shared_msr_entry *msr; + msr = find_msr_entry(vmx, MSR_IA32_TSX_CTRL); + if (msr) { + bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM); + vmx_set_guest_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE); + } + } } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) @@ -7596,9 +7627,6 @@ static __init int hardware_setup(void) WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost"); } - if (boot_cpu_has(X86_FEATURE_XSAVES)) - rdmsrl(MSR_IA32_XSS, host_xss); - if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) enable_vpid = 0; @@ -7779,7 +7807,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .get_cpl = vmx_get_cpl, .get_cs_db_l_bits = vmx_get_cs_db_l_bits, .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, - .decache_cr3 = vmx_decache_cr3, .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, .set_cr0 = vmx_set_cr0, .set_cr3 = vmx_set_cr3, @@ -7995,12 +8022,10 @@ static int __init vmx_init(void) * contain 'auto' which will be turned into the default 'cond' * mitigation mode. */ - if (boot_cpu_has(X86_BUG_L1TF)) { - r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); - if (r) { - vmx_exit(); - return r; - } + r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); + if (r) { + vmx_exit(); + return r; } #ifdef CONFIG_KEXEC_CORE diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index bee16687dc0b..7c1b978b2df4 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -22,11 +22,11 @@ extern u32 get_umwait_control_msr(void); #define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) -#define NR_AUTOLOAD_MSRS 8 +#define NR_LOADSTORE_MSRS 8 struct vmx_msrs { unsigned int nr; - struct vmx_msr_entry val[NR_AUTOLOAD_MSRS]; + struct vmx_msr_entry val[NR_LOADSTORE_MSRS]; }; struct shared_msr_entry { @@ -167,6 +167,9 @@ struct nested_vmx { u64 vmcs01_debugctl; u64 vmcs01_guest_bndcfgs; + /* to migrate it to L1 if L2 writes to L1's CR8 directly */ + int l1_tpr_threshold; + u16 vpid02; u16 last_vpid; @@ -230,6 +233,10 @@ struct vcpu_vmx { struct vmx_msrs host; } msr_autoload; + struct msr_autostore { + struct vmx_msrs guest; + } msr_autostore; + struct { int vm86_active; ulong save_rflags; @@ -334,6 +341,7 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu); struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr); void pt_update_intercept_for_msr(struct vcpu_vmx *vmx); void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp); +int vmx_find_msr_index(struct vmx_msrs *m, u32 msr); #define POSTED_INTR_ON 0 #define POSTED_INTR_SN 1 @@ -355,6 +363,11 @@ static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); } +static inline bool pi_is_pir_empty(struct pi_desc *pi_desc) +{ + return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS); +} + static inline void pi_set_sn(struct pi_desc *pi_desc) { set_bit(POSTED_INTR_SN, @@ -373,6 +386,12 @@ static inline void pi_clear_on(struct pi_desc *pi_desc) (unsigned long *)&pi_desc->control); } +static inline void pi_clear_sn(struct pi_desc *pi_desc) +{ + clear_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + static inline int pi_test_on(struct pi_desc *pi_desc) { return test_bit(POSTED_INTR_ON, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0ed07d8d2caa..3ed167e039e5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -92,8 +92,8 @@ u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #endif -#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM -#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU +#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ +#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) @@ -176,6 +176,8 @@ struct kvm_shared_msrs { static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; static struct kvm_shared_msrs __percpu *shared_msrs; +static u64 __read_mostly host_xss; + struct kvm_stats_debugfs_item debugfs_entries[] = { { "pf_fixed", VCPU_STAT(pf_fixed) }, { "pf_guest", VCPU_STAT(pf_guest) }, @@ -212,7 +214,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, { "mmu_unsync", VM_STAT(mmu_unsync) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, - { "largepages", VM_STAT(lpages) }, + { "largepages", VM_STAT(lpages, .mode = 0444) }, + { "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) }, { "max_mmu_page_hash_collisions", VM_STAT(max_mmu_page_hash_collisions) }, { NULL } @@ -259,23 +262,6 @@ static void kvm_on_user_return(struct user_return_notifier *urn) } } -static void shared_msr_update(unsigned slot, u32 msr) -{ - u64 value; - unsigned int cpu = smp_processor_id(); - struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); - - /* only read, and nobody should modify it at this time, - * so don't need lock */ - if (slot >= shared_msrs_global.nr) { - printk(KERN_ERR "kvm: invalid MSR slot!"); - return; - } - rdmsrl_safe(msr, &value); - smsr->values[slot].host = value; - smsr->values[slot].curr = value; -} - void kvm_define_shared_msr(unsigned slot, u32 msr) { BUG_ON(slot >= KVM_NR_SHARED_MSRS); @@ -287,10 +273,16 @@ EXPORT_SYMBOL_GPL(kvm_define_shared_msr); static void kvm_shared_msr_cpu_online(void) { - unsigned i; + unsigned int cpu = smp_processor_id(); + struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); + u64 value; + int i; - for (i = 0; i < shared_msrs_global.nr; ++i) - shared_msr_update(i, shared_msrs_global.msrs[i]); + for (i = 0; i < shared_msrs_global.nr; ++i) { + rdmsrl_safe(shared_msrs_global.msrs[i], &value); + smsr->values[i].host = value; + smsr->values[i].curr = value; + } } int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) @@ -299,13 +291,14 @@ int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); int err; - if (((value ^ smsr->values[slot].curr) & mask) == 0) + value = (value & mask) | (smsr->values[slot].host & ~mask); + if (value == smsr->values[slot].curr) return 0; - smsr->values[slot].curr = value; err = wrmsrl_safe(shared_msrs_global.msrs[slot], value); if (err) return 1; + smsr->values[slot].curr = value; if (!smsr->registered) { smsr->urn.on_user_return = kvm_on_user_return; user_return_notifier_register(&smsr->urn); @@ -360,8 +353,7 @@ EXPORT_SYMBOL_GPL(kvm_set_apic_base); asmlinkage __visible void kvm_spurious_fault(void) { /* Fault while not rebooting. We want the trace. */ - if (!kvm_rebooting) - BUG(); + BUG_ON(!kvm_rebooting); } EXPORT_SYMBOL_GPL(kvm_spurious_fault); @@ -709,10 +701,8 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) ret = 1; memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); - __set_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_avail); - __set_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_dirty); + kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); + out: return ret; @@ -722,7 +712,6 @@ EXPORT_SYMBOL_GPL(load_pdptrs); bool pdptrs_changed(struct kvm_vcpu *vcpu) { u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)]; - bool changed = true; int offset; gfn_t gfn; int r; @@ -730,8 +719,7 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu) if (!is_pae_paging(vcpu)) return false; - if (!test_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_avail)) + if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR)) return true; gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT; @@ -739,11 +727,9 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu) r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), PFERR_USER_MASK | PFERR_WRITE_MASK); if (r < 0) - goto out; - changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0; -out: + return true; - return changed; + return memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0; } EXPORT_SYMBOL_GPL(pdptrs_changed); @@ -812,27 +798,34 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) } EXPORT_SYMBOL_GPL(kvm_lmsw); -void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) +void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) { - if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && - !vcpu->guest_xcr0_loaded) { - /* kvm_set_xcr() also depends on this */ + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { + if (vcpu->arch.xcr0 != host_xcr0) xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); - vcpu->guest_xcr0_loaded = 1; + + if (vcpu->arch.xsaves_enabled && + vcpu->arch.ia32_xss != host_xss) + wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); } } -EXPORT_SYMBOL_GPL(kvm_load_guest_xcr0); +EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state); -void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) +void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) { - if (vcpu->guest_xcr0_loaded) { + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { + if (vcpu->arch.xcr0 != host_xcr0) xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); - vcpu->guest_xcr0_loaded = 0; + + if (vcpu->arch.xsaves_enabled && + vcpu->arch.ia32_xss != host_xss) + wrmsrl(MSR_IA32_XSS, host_xss); } + } -EXPORT_SYMBOL_GPL(kvm_put_guest_xcr0); +EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state); static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { @@ -885,34 +878,42 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) } EXPORT_SYMBOL_GPL(kvm_set_xcr); -int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - unsigned long old_cr4 = kvm_read_cr4(vcpu); - unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | - X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE; - if (cr4 & CR4_RESERVED_BITS) - return 1; + return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE)) - return 1; + return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP)) - return 1; + return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP)) - return 1; + return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE)) - return 1; + return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE)) - return 1; + return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57)) - return 1; + return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP)) + return -EINVAL; + + return 0; +} + +int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + unsigned long old_cr4 = kvm_read_cr4(vcpu); + unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | + X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE; + + if (kvm_valid_cr4(vcpu, cr4)) return 1; if (is_long_mode(vcpu)) { @@ -976,7 +977,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush); vcpu->arch.cr3 = cr3; - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); return 0; } @@ -1125,13 +1126,15 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc); * List of msr numbers which we expose to userspace through KVM_GET_MSRS * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. * - * This list is modified at module load time to reflect the + * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) + * extract the supported MSRs from the related const lists. + * msrs_to_save is selected from the msrs_to_save_all to reflect the * capabilities of the host cpu. This capabilities test skips MSRs that are - * kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs + * kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs * may depend on host virtualization features rather than host cpu features. */ -static u32 msrs_to_save[] = { +static const u32 msrs_to_save_all[] = { MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_STAR, #ifdef CONFIG_X86_64 @@ -1161,13 +1164,6 @@ static u32 msrs_to_save[] = { MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13, MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15, MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17, - MSR_ARCH_PERFMON_PERFCTR0 + 18, MSR_ARCH_PERFMON_PERFCTR0 + 19, - MSR_ARCH_PERFMON_PERFCTR0 + 20, MSR_ARCH_PERFMON_PERFCTR0 + 21, - MSR_ARCH_PERFMON_PERFCTR0 + 22, MSR_ARCH_PERFMON_PERFCTR0 + 23, - MSR_ARCH_PERFMON_PERFCTR0 + 24, MSR_ARCH_PERFMON_PERFCTR0 + 25, - MSR_ARCH_PERFMON_PERFCTR0 + 26, MSR_ARCH_PERFMON_PERFCTR0 + 27, - MSR_ARCH_PERFMON_PERFCTR0 + 28, MSR_ARCH_PERFMON_PERFCTR0 + 29, - MSR_ARCH_PERFMON_PERFCTR0 + 30, MSR_ARCH_PERFMON_PERFCTR0 + 31, MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, @@ -1177,18 +1173,12 @@ static u32 msrs_to_save[] = { MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, - MSR_ARCH_PERFMON_EVENTSEL0 + 18, MSR_ARCH_PERFMON_EVENTSEL0 + 19, - MSR_ARCH_PERFMON_EVENTSEL0 + 20, MSR_ARCH_PERFMON_EVENTSEL0 + 21, - MSR_ARCH_PERFMON_EVENTSEL0 + 22, MSR_ARCH_PERFMON_EVENTSEL0 + 23, - MSR_ARCH_PERFMON_EVENTSEL0 + 24, MSR_ARCH_PERFMON_EVENTSEL0 + 25, - MSR_ARCH_PERFMON_EVENTSEL0 + 26, MSR_ARCH_PERFMON_EVENTSEL0 + 27, - MSR_ARCH_PERFMON_EVENTSEL0 + 28, MSR_ARCH_PERFMON_EVENTSEL0 + 29, - MSR_ARCH_PERFMON_EVENTSEL0 + 30, MSR_ARCH_PERFMON_EVENTSEL0 + 31, }; +static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)]; static unsigned num_msrs_to_save; -static u32 emulated_msrs[] = { +static const u32 emulated_msrs_all[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, @@ -1227,7 +1217,7 @@ static u32 emulated_msrs[] = { * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs. * We always support the "true" VMX control MSRs, even if the host * processor does not, so I am putting these registers here rather - * than in msrs_to_save. + * than in msrs_to_save_all. */ MSR_IA32_VMX_BASIC, MSR_IA32_VMX_TRUE_PINBASED_CTLS, @@ -1246,13 +1236,14 @@ static u32 emulated_msrs[] = { MSR_KVM_POLL_CONTROL, }; +static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)]; static unsigned num_emulated_msrs; /* * List of msr numbers which are used to expose MSR-based features that * can be used by a hypervisor to validate requested CPU features. */ -static u32 msr_based_features[] = { +static const u32 msr_based_features_all[] = { MSR_IA32_VMX_BASIC, MSR_IA32_VMX_TRUE_PINBASED_CTLS, MSR_IA32_VMX_PINBASED_CTLS, @@ -1277,6 +1268,7 @@ static u32 msr_based_features[] = { MSR_IA32_ARCH_CAPABILITIES, }; +static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)]; static unsigned int num_msr_based_features; static u64 kvm_get_arch_capabilities(void) @@ -1287,6 +1279,14 @@ static u64 kvm_get_arch_capabilities(void) rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data); /* + * If nx_huge_pages is enabled, KVM's shadow paging will ensure that + * the nested hypervisor runs with NX huge pages. If it is not, + * L1 is anyway vulnerable to ITLB_MULTIHIT explots from other + * L1 guests, so it need not worry about its own (L2) guests. + */ + data |= ARCH_CAP_PSCHANGE_MC_NO; + + /* * If we're doing cache flushes (either "always" or "cond") * we will do one whenever the guest does a vmlaunch/vmresume. * If an outer hypervisor is doing the cache flush for us @@ -1305,6 +1305,17 @@ static u64 kvm_get_arch_capabilities(void) if (!boot_cpu_has_bug(X86_BUG_MDS)) data |= ARCH_CAP_MDS_NO; + /* + * On TAA affected systems: + * - nothing to do if TSX is disabled on the host. + * - we emulate TSX_CTRL if present on the host. + * This lets the guest use VERW to clear CPU buffers. + */ + if (!boot_cpu_has(X86_FEATURE_RTM)) + data &= ~(ARCH_CAP_TAA_NO | ARCH_CAP_TSX_CTRL_MSR); + else if (!boot_cpu_has_bug(X86_BUG_TAA)) + data |= ARCH_CAP_TAA_NO; + return data; } @@ -1451,8 +1462,8 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ -static int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, - bool host_initiated) +int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, + bool host_initiated) { struct msr_data msr; int ret; @@ -1527,20 +1538,25 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) } #ifdef CONFIG_X86_64 +struct pvclock_clock { + int vclock_mode; + u64 cycle_last; + u64 mask; + u32 mult; + u32 shift; +}; + struct pvclock_gtod_data { seqcount_t seq; - struct { /* extract of a clocksource struct */ - int vclock_mode; - u64 cycle_last; - u64 mask; - u32 mult; - u32 shift; - } clock; + struct pvclock_clock clock; /* extract of a clocksource struct */ + struct pvclock_clock raw_clock; /* extract of a clocksource struct */ + u64 boot_ns_raw; u64 boot_ns; u64 nsec_base; u64 wall_time_sec; + u64 monotonic_raw_nsec; }; static struct pvclock_gtod_data pvclock_gtod_data; @@ -1548,9 +1564,10 @@ static struct pvclock_gtod_data pvclock_gtod_data; static void update_pvclock_gtod(struct timekeeper *tk) { struct pvclock_gtod_data *vdata = &pvclock_gtod_data; - u64 boot_ns; + u64 boot_ns, boot_ns_raw; boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot)); + boot_ns_raw = ktime_to_ns(ktime_add(tk->tkr_raw.base, tk->offs_boot)); write_seqcount_begin(&vdata->seq); @@ -1561,11 +1578,20 @@ static void update_pvclock_gtod(struct timekeeper *tk) vdata->clock.mult = tk->tkr_mono.mult; vdata->clock.shift = tk->tkr_mono.shift; + vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->archdata.vclock_mode; + vdata->raw_clock.cycle_last = tk->tkr_raw.cycle_last; + vdata->raw_clock.mask = tk->tkr_raw.mask; + vdata->raw_clock.mult = tk->tkr_raw.mult; + vdata->raw_clock.shift = tk->tkr_raw.shift; + vdata->boot_ns = boot_ns; vdata->nsec_base = tk->tkr_mono.xtime_nsec; vdata->wall_time_sec = tk->xtime_sec; + vdata->boot_ns_raw = boot_ns_raw; + vdata->monotonic_raw_nsec = tk->tkr_raw.xtime_nsec; + write_seqcount_end(&vdata->seq); } #endif @@ -1989,21 +2015,21 @@ static u64 read_tsc(void) return last; } -static inline u64 vgettsc(u64 *tsc_timestamp, int *mode) +static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp, + int *mode) { long v; - struct pvclock_gtod_data *gtod = &pvclock_gtod_data; u64 tsc_pg_val; - switch (gtod->clock.vclock_mode) { + switch (clock->vclock_mode) { case VCLOCK_HVCLOCK: tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(), tsc_timestamp); if (tsc_pg_val != U64_MAX) { /* TSC page valid */ *mode = VCLOCK_HVCLOCK; - v = (tsc_pg_val - gtod->clock.cycle_last) & - gtod->clock.mask; + v = (tsc_pg_val - clock->cycle_last) & + clock->mask; } else { /* TSC page invalid */ *mode = VCLOCK_NONE; @@ -2012,8 +2038,8 @@ static inline u64 vgettsc(u64 *tsc_timestamp, int *mode) case VCLOCK_TSC: *mode = VCLOCK_TSC; *tsc_timestamp = read_tsc(); - v = (*tsc_timestamp - gtod->clock.cycle_last) & - gtod->clock.mask; + v = (*tsc_timestamp - clock->cycle_last) & + clock->mask; break; default: *mode = VCLOCK_NONE; @@ -2022,10 +2048,10 @@ static inline u64 vgettsc(u64 *tsc_timestamp, int *mode) if (*mode == VCLOCK_NONE) *tsc_timestamp = v = 0; - return v * gtod->clock.mult; + return v * clock->mult; } -static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp) +static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp) { struct pvclock_gtod_data *gtod = &pvclock_gtod_data; unsigned long seq; @@ -2034,10 +2060,10 @@ static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp) do { seq = read_seqcount_begin(>od->seq); - ns = gtod->nsec_base; - ns += vgettsc(tsc_timestamp, &mode); + ns = gtod->monotonic_raw_nsec; + ns += vgettsc(>od->raw_clock, tsc_timestamp, &mode); ns >>= gtod->clock.shift; - ns += gtod->boot_ns; + ns += gtod->boot_ns_raw; } while (unlikely(read_seqcount_retry(>od->seq, seq))); *t = ns; @@ -2055,7 +2081,7 @@ static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp) seq = read_seqcount_begin(>od->seq); ts->tv_sec = gtod->wall_time_sec; ns = gtod->nsec_base; - ns += vgettsc(tsc_timestamp, &mode); + ns += vgettsc(>od->clock, tsc_timestamp, &mode); ns >>= gtod->clock.shift; } while (unlikely(read_seqcount_retry(>od->seq, seq))); @@ -2072,7 +2098,7 @@ static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp) if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode)) return false; - return gtod_is_based_on_tsc(do_monotonic_boot(kernel_ns, + return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns, tsc_timestamp)); } @@ -2543,6 +2569,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) static void kvmclock_reset(struct kvm_vcpu *vcpu) { vcpu->arch.pv_time_enabled = false; + vcpu->arch.time = 0; } static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) @@ -2694,6 +2721,20 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC: kvm_write_tsc(vcpu, msr_info); break; + case MSR_IA32_XSS: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) + return 1; + /* + * We do support PT if kvm_x86_ops->pt_supported(), but we do + * not support IA32_XSS[bit 8]. Guests will have to use + * RDMSR/WRMSR rather than XSAVES/XRSTORS to save/restore PT + * MSRs. + */ + if (data != 0) + return 1; + vcpu->arch.ia32_xss = data; + break; case MSR_SMI_COUNT: if (!msr_info->host_initiated) return 1; @@ -2708,8 +2749,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_KVM_SYSTEM_TIME: { struct kvm_arch *ka = &vcpu->kvm->arch; - kvmclock_reset(vcpu); - if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) { bool tmp = (msr == MSR_KVM_SYSTEM_TIME); @@ -2723,14 +2762,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); /* we verify if the enable bit is set... */ + vcpu->arch.pv_time_enabled = false; if (!(data & 1)) break; - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, + if (!kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_time, data & ~1ULL, sizeof(struct pvclock_vcpu_time_info))) - vcpu->arch.pv_time_enabled = false; - else vcpu->arch.pv_time_enabled = true; break; @@ -3024,6 +3062,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: return get_msr_mce(vcpu, msr_info->index, &msr_info->data, msr_info->host_initiated); + case MSR_IA32_XSS: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) + return 1; + msr_info->data = vcpu->arch.ia32_xss; + break; case MSR_K7_CLK_CTL: /* * Provide expected ramp-up count for K7. All other @@ -3801,12 +3845,13 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; else vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK; - if (lapic_in_kernel(vcpu)) { - if (events->smi.latched_init) - set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); - else - clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); - } + } + + if (lapic_in_kernel(vcpu)) { + if (events->smi.latched_init) + set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); + else + clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); } } @@ -4397,6 +4442,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, case KVM_SET_NESTED_STATE: { struct kvm_nested_state __user *user_kvm_nested_state = argp; struct kvm_nested_state kvm_state; + int idx; r = -EINVAL; if (!kvm_x86_ops->set_nested_state) @@ -4420,7 +4466,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, && !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE)) break; + idx = srcu_read_lock(&vcpu->kvm->srcu); r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state); + srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } case KVM_GET_SUPPORTED_HV_CPUID: { @@ -4922,9 +4970,6 @@ set_identity_unlock: if (!irqchip_kernel(kvm)) goto set_irqchip_out; r = kvm_vm_ioctl_set_irqchip(kvm, chip); - if (r) - goto set_irqchip_out; - r = 0; set_irqchip_out: kfree(chip); break; @@ -5097,23 +5142,28 @@ out: static void kvm_init_msr_list(void) { + struct x86_pmu_capability x86_pmu; u32 dummy[2]; - unsigned i, j; + unsigned i; BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4, - "Please update the fixed PMCs in msrs_to_save[]"); - BUILD_BUG_ON_MSG(INTEL_PMC_MAX_GENERIC != 32, - "Please update the generic perfctr/eventsel MSRs in msrs_to_save[]"); + "Please update the fixed PMCs in msrs_to_saved_all[]"); + + perf_get_x86_pmu_capability(&x86_pmu); - for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { - if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) + num_msrs_to_save = 0; + num_emulated_msrs = 0; + num_msr_based_features = 0; + + for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) { + if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0) continue; /* * Even MSRs that are valid in the host may not be exposed * to the guests in some cases. */ - switch (msrs_to_save[i]) { + switch (msrs_to_save_all[i]) { case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported()) continue; @@ -5141,43 +5191,43 @@ static void kvm_init_msr_list(void) break; case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: { if (!kvm_x86_ops->pt_supported() || - msrs_to_save[i] - MSR_IA32_RTIT_ADDR0_A >= + msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >= intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2) continue; break; + case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17: + if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >= + min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) + continue; + break; + case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17: + if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= + min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) + continue; } default: break; } - if (j < i) - msrs_to_save[j] = msrs_to_save[i]; - j++; + msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i]; } - num_msrs_to_save = j; - for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) { - if (!kvm_x86_ops->has_emulated_msr(emulated_msrs[i])) + for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) { + if (!kvm_x86_ops->has_emulated_msr(emulated_msrs_all[i])) continue; - if (j < i) - emulated_msrs[j] = emulated_msrs[i]; - j++; + emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i]; } - num_emulated_msrs = j; - for (i = j = 0; i < ARRAY_SIZE(msr_based_features); i++) { + for (i = 0; i < ARRAY_SIZE(msr_based_features_all); i++) { struct kvm_msr_entry msr; - msr.index = msr_based_features[i]; + msr.index = msr_based_features_all[i]; if (kvm_get_msr_feature(&msr)) continue; - if (j < i) - msr_based_features[j] = msr_based_features[i]; - j++; + msr_based_features[num_msr_based_features++] = msr_based_features_all[i]; } - num_msr_based_features = j; } static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, @@ -6107,7 +6157,7 @@ static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase) static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt, u32 pmc) { - return kvm_pmu_is_valid_msr_idx(emul_to_vcpu(ctxt), pmc); + return kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc); } static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, @@ -7837,6 +7887,19 @@ static void process_smi(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); } +void kvm_make_scan_ioapic_request_mask(struct kvm *kvm, + unsigned long *vcpu_bitmap) +{ + cpumask_var_t cpus; + + zalloc_cpumask_var(&cpus, GFP_ATOMIC); + + kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC, + vcpu_bitmap, cpus); + + free_cpumask_var(cpus); +} + void kvm_make_scan_ioapic_request(struct kvm *kvm) { kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC); @@ -7914,7 +7977,6 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) */ put_page(page); } -EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page); void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu) { @@ -7937,8 +7999,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_immediate_exit = false; if (kvm_request_pending(vcpu)) { - if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) - kvm_x86_ops->get_vmcs12_pages(vcpu); + if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) { + if (unlikely(!kvm_x86_ops->get_vmcs12_pages(vcpu))) { + r = 0; + goto out; + } + } if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) kvm_mmu_unload(vcpu); if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) @@ -8669,8 +8735,12 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, mp_state->mp_state != KVM_MP_STATE_RUNNABLE) goto out; - /* INITs are latched while in SMM */ - if ((is_smm(vcpu) || vcpu->arch.smi_pending) && + /* + * KVM_MP_STATE_INIT_RECEIVED means the processor is in + * INIT state; latched init should be reported using + * KVM_SET_VCPU_EVENTS, so reject it here. + */ + if ((kvm_vcpu_latch_init(vcpu) || vcpu->arch.smi_pending) && (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED || mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED)) goto out; @@ -8714,10 +8784,6 @@ EXPORT_SYMBOL_GPL(kvm_task_switch); static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { - if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && - (sregs->cr4 & X86_CR4_OSXSAVE)) - return -EINVAL; - if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) { /* * When EFER.LME and CR0.PG are set, the processor is in @@ -8736,7 +8802,7 @@ static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) return -EINVAL; } - return 0; + return kvm_valid_cr4(vcpu, sregs->cr4); } static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) @@ -8766,7 +8832,7 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) vcpu->arch.cr2 = sregs->cr2; mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; vcpu->arch.cr3 = sregs->cr3; - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); kvm_set_cr8(vcpu, sregs->cr8); @@ -9293,6 +9359,9 @@ int kvm_arch_hardware_setup(void) kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits; } + if (boot_cpu_has(X86_FEATURE_XSAVES)) + rdmsrl(MSR_IA32_XSS, host_xss); + kvm_init_msr_list(); return 0; } @@ -9346,7 +9415,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) goto fail_free_pio_data; if (irqchip_in_kernel(vcpu->kvm)) { - vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu); + vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu->kvm); r = kvm_create_lapic(vcpu, lapic_timer_advance_ns); if (r < 0) goto fail_mmu_destroy; @@ -9415,7 +9484,13 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) { + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + vcpu->arch.l1tf_flush_l1d = true; + if (pmu->version && unlikely(pmu->event_count)) { + pmu->need_cleanup = true; + kvm_make_request(KVM_REQ_PMU, vcpu); + } kvm_x86_ops->sched_in(vcpu, cpu); } @@ -9427,6 +9502,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); + INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); atomic_set(&kvm->arch.noncoherent_dma_count, 0); @@ -9455,6 +9531,11 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) return kvm_x86_ops->vm_init(kvm); } +int kvm_arch_post_init_vm(struct kvm *kvm) +{ + return kvm_mmu_post_init_vm(kvm); +} + static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) { vcpu_load(vcpu); @@ -9556,6 +9637,11 @@ int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) } EXPORT_SYMBOL_GPL(x86_set_memory_region); +void kvm_arch_pre_destroy_vm(struct kvm *kvm) +{ + kvm_mmu_pre_destroy_vm(kvm); +} + void kvm_arch_destroy_vm(struct kvm *kvm) { if (current->mm == kvm->mm) { diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index dbf7442a822b..29391af8871d 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -238,8 +238,7 @@ static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) return false; } -static inline unsigned long kvm_register_readl(struct kvm_vcpu *vcpu, - enum kvm_reg reg) +static inline unsigned long kvm_register_readl(struct kvm_vcpu *vcpu, int reg) { unsigned long val = kvm_register_read(vcpu, reg); @@ -247,8 +246,7 @@ static inline unsigned long kvm_register_readl(struct kvm_vcpu *vcpu, } static inline void kvm_register_writel(struct kvm_vcpu *vcpu, - enum kvm_reg reg, - unsigned long val) + int reg, unsigned long val) { if (!is_64_bit_mode(vcpu)) val = (u32)val; @@ -260,6 +258,11 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk) return !(kvm->arch.disabled_quirks & quirk); } +static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu) +{ + return is_smm(vcpu) || kvm_x86_ops->apic_init_signal_blocked(vcpu); +} + void kvm_set_pending_timer(struct kvm_vcpu *vcpu); void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); @@ -366,7 +369,7 @@ static inline bool kvm_pat_valid(u64 data) return (data | ((data & 0x0202020202020202ull) << 1)) == data; } -void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu); -void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu); +void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu); +void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); #endif diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index b7375dc6898f..c126571e5e2e 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -113,8 +113,8 @@ static void delay_mwaitx(unsigned long __loops) __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0); /* - * AMD, like Intel, supports the EAX hint and EAX=0xf - * means, do not enter any deep C-state and we use it + * AMD, like Intel's MWAIT version, supports the EAX hint and + * EAX=0xf0 means, do not enter any deep C-state and we use it * here in delay() to minimize wakeup latency. */ __mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE); diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 84373dc9b341..bbc68a54795e 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -13,7 +13,7 @@ CFLAGS_REMOVE_mem_encrypt_identity.o = -pg endif obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ - pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o + pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o maccess.o # Make sure __phys_addr has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 79eb55ce69a9..49d7814b59a9 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -193,8 +193,8 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f) int ret; WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n")); if (f->armed) { - pr_warning("double-arm: addr 0x%08lx, ref %d, old %d\n", - f->addr, f->count, !!f->old_presence); + pr_warn("double-arm: addr 0x%08lx, ref %d, old %d\n", + f->addr, f->count, !!f->old_presence); } ret = clear_page_presence(f, true); WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming at 0x%08lx failed.\n"), @@ -341,8 +341,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) * something external causing them (f.e. using a debugger while * mmio tracing enabled), or erroneous behaviour */ - pr_warning("unexpected debug trap on CPU %d.\n", - smp_processor_id()); + pr_warn("unexpected debug trap on CPU %d.\n", smp_processor_id()); goto out; } diff --git a/arch/x86/mm/maccess.c b/arch/x86/mm/maccess.c new file mode 100644 index 000000000000..f5b85bdc0535 --- /dev/null +++ b/arch/x86/mm/maccess.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/uaccess.h> +#include <linux/kernel.h> + +#ifdef CONFIG_X86_64 +static __always_inline u64 canonical_address(u64 vaddr, u8 vaddr_bits) +{ + return ((s64)vaddr << (64 - vaddr_bits)) >> (64 - vaddr_bits); +} + +static __always_inline bool invalid_probe_range(u64 vaddr) +{ + /* + * Range covering the highest possible canonical userspace address + * as well as non-canonical address range. For the canonical range + * we also need to include the userspace guard page. + */ + return vaddr < TASK_SIZE_MAX + PAGE_SIZE || + canonical_address(vaddr, boot_cpu_data.x86_virt_bits) != vaddr; +} +#else +static __always_inline bool invalid_probe_range(u64 vaddr) +{ + return vaddr < TASK_SIZE_MAX; +} +#endif + +long probe_kernel_read_strict(void *dst, const void *src, size_t size) +{ + if (unlikely(invalid_probe_range((unsigned long)src))) + return -EFAULT; + + return __probe_kernel_read(dst, src, size); +} + +long strncpy_from_unsafe_strict(char *dst, const void *unsafe_addr, long count) +{ + if (unlikely(invalid_probe_range((unsigned long)unsafe_addr))) + return -EFAULT; + + return __strncpy_from_unsafe(dst, unsafe_addr, count); +} diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index b8ef8557d4b3..673de6063345 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -394,7 +394,7 @@ static void enter_uniprocessor(void) } out: if (num_online_cpus() > 1) - pr_warning("multiple CPUs still online, may miss events.\n"); + pr_warn("multiple CPUs still online, may miss events.\n"); } static void leave_uniprocessor(void) @@ -418,8 +418,8 @@ static void leave_uniprocessor(void) static void enter_uniprocessor(void) { if (num_online_cpus() > 1) - pr_warning("multiple CPUs are online, may miss events. " - "Suggest booting with maxcpus=1 kernel argument.\n"); + pr_warn("multiple CPUs are online, may miss events. " + "Suggest booting with maxcpus=1 kernel argument.\n"); } static void leave_uniprocessor(void) diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index abffa0be80da..7f1d2034df1e 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -438,7 +438,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) goto no_emu; if (numa_cleanup_meminfo(&ei) < 0) { - pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n"); + pr_warn("NUMA: Warning: constructed meminfo invalid, disabling emulation\n"); goto no_emu; } @@ -449,7 +449,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), phys_size, PAGE_SIZE); if (!phys) { - pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); + pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); goto no_emu; } memblock_reserve(phys, phys_size); diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c index a8bd952e136d..92153d054d6c 100644 --- a/arch/x86/mm/testmmiotrace.c +++ b/arch/x86/mm/testmmiotrace.c @@ -127,9 +127,9 @@ static int __init init(void) return -ENXIO; } - pr_warning("WARNING: mapping %lu kB @ 0x%08lx in PCI address space, " - "and writing 16 kB of rubbish in there.\n", - size >> 10, mmio_address); + pr_warn("WARNING: mapping %lu kB @ 0x%08lx in PCI address space, " + "and writing 16 kB of rubbish in there.\n", + size >> 10, mmio_address); do_test(size); do_test_bulk_ioremapping(); pr_info("All done.\n"); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 991549a1c5f3..b8be18427277 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -9,9 +9,11 @@ #include <linux/filter.h> #include <linux/if_vlan.h> #include <linux/bpf.h> - +#include <linux/memory.h> +#include <asm/extable.h> #include <asm/set_memory.h> #include <asm/nospec-branch.h> +#include <asm/text-patching.h> static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) { @@ -96,6 +98,7 @@ static int bpf_size_to_x86_bytes(int bpf_size) /* Pick a register outside of BPF range for JIT internal work */ #define AUX_REG (MAX_BPF_JIT_REG + 1) +#define X86_REG_R9 (MAX_BPF_JIT_REG + 2) /* * The following table maps BPF registers to x86-64 registers. @@ -104,8 +107,8 @@ static int bpf_size_to_x86_bytes(int bpf_size) * register in load/store instructions, it always needs an * extra byte of encoding and is callee saved. * - * Also x86-64 register R9 is unused. x86-64 register R10 is - * used for blinding (if enabled). + * x86-64 register R9 is not used by BPF programs, but can be used by BPF + * trampoline. x86-64 register R10 is used for blinding (if enabled). */ static const int reg2hex[] = { [BPF_REG_0] = 0, /* RAX */ @@ -121,6 +124,20 @@ static const int reg2hex[] = { [BPF_REG_FP] = 5, /* RBP readonly */ [BPF_REG_AX] = 2, /* R10 temp register */ [AUX_REG] = 3, /* R11 temp register */ + [X86_REG_R9] = 1, /* R9 register, 6th function argument */ +}; + +static const int reg2pt_regs[] = { + [BPF_REG_0] = offsetof(struct pt_regs, ax), + [BPF_REG_1] = offsetof(struct pt_regs, di), + [BPF_REG_2] = offsetof(struct pt_regs, si), + [BPF_REG_3] = offsetof(struct pt_regs, dx), + [BPF_REG_4] = offsetof(struct pt_regs, cx), + [BPF_REG_5] = offsetof(struct pt_regs, r8), + [BPF_REG_6] = offsetof(struct pt_regs, bx), + [BPF_REG_7] = offsetof(struct pt_regs, r13), + [BPF_REG_8] = offsetof(struct pt_regs, r14), + [BPF_REG_9] = offsetof(struct pt_regs, r15), }; /* @@ -135,6 +152,7 @@ static bool is_ereg(u32 reg) BIT(BPF_REG_7) | BIT(BPF_REG_8) | BIT(BPF_REG_9) | + BIT(X86_REG_R9) | BIT(BPF_REG_AX)); } @@ -186,7 +204,10 @@ struct jit_context { #define BPF_MAX_INSN_SIZE 128 #define BPF_INSN_SAFETY 64 -#define PROLOGUE_SIZE 20 +/* Number of bytes emit_patch() needs to generate instructions */ +#define X86_PATCH_SIZE 5 + +#define PROLOGUE_SIZE 25 /* * Emit x86-64 prologue code for BPF program and check its size. @@ -195,8 +216,13 @@ struct jit_context { static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) { u8 *prog = *pprog; - int cnt = 0; + int cnt = X86_PATCH_SIZE; + /* BPF trampoline can be made to work without these nops, + * but let's waste 5 bytes for now and optimize later + */ + memcpy(prog, ideal_nops[NOP_ATOMIC5], cnt); + prog += cnt; EMIT1(0x55); /* push rbp */ EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */ /* sub rsp, rounded_stack_depth */ @@ -213,6 +239,89 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) *pprog = prog; } +static int emit_patch(u8 **pprog, void *func, void *ip, u8 opcode) +{ + u8 *prog = *pprog; + int cnt = 0; + s64 offset; + + offset = func - (ip + X86_PATCH_SIZE); + if (!is_simm32(offset)) { + pr_err("Target call %p is out of range\n", func); + return -ERANGE; + } + EMIT1_off32(opcode, offset); + *pprog = prog; + return 0; +} + +static int emit_call(u8 **pprog, void *func, void *ip) +{ + return emit_patch(pprog, func, ip, 0xE8); +} + +static int emit_jump(u8 **pprog, void *func, void *ip) +{ + return emit_patch(pprog, func, ip, 0xE9); +} + +static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, + void *old_addr, void *new_addr, + const bool text_live) +{ + const u8 *nop_insn = ideal_nops[NOP_ATOMIC5]; + u8 old_insn[X86_PATCH_SIZE]; + u8 new_insn[X86_PATCH_SIZE]; + u8 *prog; + int ret; + + memcpy(old_insn, nop_insn, X86_PATCH_SIZE); + if (old_addr) { + prog = old_insn; + ret = t == BPF_MOD_CALL ? + emit_call(&prog, old_addr, ip) : + emit_jump(&prog, old_addr, ip); + if (ret) + return ret; + } + + memcpy(new_insn, nop_insn, X86_PATCH_SIZE); + if (new_addr) { + prog = new_insn; + ret = t == BPF_MOD_CALL ? + emit_call(&prog, new_addr, ip) : + emit_jump(&prog, new_addr, ip); + if (ret) + return ret; + } + + ret = -EBUSY; + mutex_lock(&text_mutex); + if (memcmp(ip, old_insn, X86_PATCH_SIZE)) + goto out; + if (memcmp(ip, new_insn, X86_PATCH_SIZE)) { + if (text_live) + text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL); + else + memcpy(ip, new_insn, X86_PATCH_SIZE); + } + ret = 0; +out: + mutex_unlock(&text_mutex); + return ret; +} + +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, + void *old_addr, void *new_addr) +{ + if (!is_kernel_text((long)ip) && + !is_bpf_text_address((long)ip)) + /* BPF poking in modules is not supported */ + return -EINVAL; + + return __bpf_arch_text_poke(ip, t, old_addr, new_addr, true); +} + /* * Generate the following code: * @@ -227,7 +336,7 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) * goto *(prog->bpf_func + prologue_size); * out: */ -static void emit_bpf_tail_call(u8 **pprog) +static void emit_bpf_tail_call_indirect(u8 **pprog) { u8 *prog = *pprog; int label1, label2, label3; @@ -294,6 +403,68 @@ static void emit_bpf_tail_call(u8 **pprog) *pprog = prog; } +static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke, + u8 **pprog, int addr, u8 *image) +{ + u8 *prog = *pprog; + int cnt = 0; + + /* + * if (tail_call_cnt > MAX_TAIL_CALL_CNT) + * goto out; + */ + EMIT2_off32(0x8B, 0x85, -36 - MAX_BPF_STACK); /* mov eax, dword ptr [rbp - 548] */ + EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ + EMIT2(X86_JA, 14); /* ja out */ + EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ + EMIT2_off32(0x89, 0x85, -36 - MAX_BPF_STACK); /* mov dword ptr [rbp -548], eax */ + + poke->ip = image + (addr - X86_PATCH_SIZE); + poke->adj_off = PROLOGUE_SIZE; + + memcpy(prog, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE); + prog += X86_PATCH_SIZE; + /* out: */ + + *pprog = prog; +} + +static void bpf_tail_call_direct_fixup(struct bpf_prog *prog) +{ + struct bpf_jit_poke_descriptor *poke; + struct bpf_array *array; + struct bpf_prog *target; + int i, ret; + + for (i = 0; i < prog->aux->size_poke_tab; i++) { + poke = &prog->aux->poke_tab[i]; + WARN_ON_ONCE(READ_ONCE(poke->ip_stable)); + + if (poke->reason != BPF_POKE_REASON_TAIL_CALL) + continue; + + array = container_of(poke->tail_call.map, struct bpf_array, map); + mutex_lock(&array->aux->poke_mutex); + target = array->ptrs[poke->tail_call.key]; + if (target) { + /* Plain memcpy is used when image is not live yet + * and still not locked as read-only. Once poke + * location is active (poke->ip_stable), any parallel + * bpf_arch_text_poke() might occur still on the + * read-write image until we finally locked it as + * read-only. Both modifications on the given image + * are under text_mutex to avoid interference. + */ + ret = __bpf_arch_text_poke(poke->ip, BPF_MOD_JUMP, NULL, + (u8 *)target->bpf_func + + poke->adj_off, false); + BUG_ON(ret < 0); + } + WRITE_ONCE(poke->ip_stable, true); + mutex_unlock(&array->aux->poke_mutex); + } +} + static void emit_mov_imm32(u8 **pprog, bool sign_propagate, u32 dst_reg, const u32 imm32) { @@ -377,6 +548,96 @@ static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg) *pprog = prog; } +/* LDX: dst_reg = *(u8*)(src_reg + off) */ +static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) +{ + u8 *prog = *pprog; + int cnt = 0; + + switch (size) { + case BPF_B: + /* Emit 'movzx rax, byte ptr [rax + off]' */ + EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB6); + break; + case BPF_H: + /* Emit 'movzx rax, word ptr [rax + off]' */ + EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB7); + break; + case BPF_W: + /* Emit 'mov eax, dword ptr [rax+0x14]' */ + if (is_ereg(dst_reg) || is_ereg(src_reg)) + EMIT2(add_2mod(0x40, src_reg, dst_reg), 0x8B); + else + EMIT1(0x8B); + break; + case BPF_DW: + /* Emit 'mov rax, qword ptr [rax+0x14]' */ + EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B); + break; + } + /* + * If insn->off == 0 we can save one extra byte, but + * special case of x86 R13 which always needs an offset + * is not worth the hassle + */ + if (is_imm8(off)) + EMIT2(add_2reg(0x40, src_reg, dst_reg), off); + else + EMIT1_off32(add_2reg(0x80, src_reg, dst_reg), off); + *pprog = prog; +} + +/* STX: *(u8*)(dst_reg + off) = src_reg */ +static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) +{ + u8 *prog = *pprog; + int cnt = 0; + + switch (size) { + case BPF_B: + /* Emit 'mov byte ptr [rax + off], al' */ + if (is_ereg(dst_reg) || is_ereg(src_reg) || + /* We have to add extra byte for x86 SIL, DIL regs */ + src_reg == BPF_REG_1 || src_reg == BPF_REG_2) + EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x88); + else + EMIT1(0x88); + break; + case BPF_H: + if (is_ereg(dst_reg) || is_ereg(src_reg)) + EMIT3(0x66, add_2mod(0x40, dst_reg, src_reg), 0x89); + else + EMIT2(0x66, 0x89); + break; + case BPF_W: + if (is_ereg(dst_reg) || is_ereg(src_reg)) + EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x89); + else + EMIT1(0x89); + break; + case BPF_DW: + EMIT2(add_2mod(0x48, dst_reg, src_reg), 0x89); + break; + } + if (is_imm8(off)) + EMIT2(add_2reg(0x40, dst_reg, src_reg), off); + else + EMIT1_off32(add_2reg(0x80, dst_reg, src_reg), off); + *pprog = prog; +} + +static bool ex_handler_bpf(const struct exception_table_entry *x, + struct pt_regs *regs, int trapnr, + unsigned long error_code, unsigned long fault_addr) +{ + u32 reg = x->fixup >> 8; + + /* jump over faulting load and clear dest register */ + *(unsigned long *)((void *)regs + reg) = 0; + regs->ip += x->fixup & 0xff; + return true; +} + static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int oldproglen, struct jit_context *ctx) { @@ -384,7 +645,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int insn_cnt = bpf_prog->len; bool seen_exit = false; u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; - int i, cnt = 0; + int i, cnt = 0, excnt = 0; int proglen = 0; u8 *prog = temp; @@ -747,64 +1008,64 @@ st: if (is_imm8(insn->off)) /* STX: *(u8*)(dst_reg + off) = src_reg */ case BPF_STX | BPF_MEM | BPF_B: - /* Emit 'mov byte ptr [rax + off], al' */ - if (is_ereg(dst_reg) || is_ereg(src_reg) || - /* We have to add extra byte for x86 SIL, DIL regs */ - src_reg == BPF_REG_1 || src_reg == BPF_REG_2) - EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x88); - else - EMIT1(0x88); - goto stx; case BPF_STX | BPF_MEM | BPF_H: - if (is_ereg(dst_reg) || is_ereg(src_reg)) - EMIT3(0x66, add_2mod(0x40, dst_reg, src_reg), 0x89); - else - EMIT2(0x66, 0x89); - goto stx; case BPF_STX | BPF_MEM | BPF_W: - if (is_ereg(dst_reg) || is_ereg(src_reg)) - EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x89); - else - EMIT1(0x89); - goto stx; case BPF_STX | BPF_MEM | BPF_DW: - EMIT2(add_2mod(0x48, dst_reg, src_reg), 0x89); -stx: if (is_imm8(insn->off)) - EMIT2(add_2reg(0x40, dst_reg, src_reg), insn->off); - else - EMIT1_off32(add_2reg(0x80, dst_reg, src_reg), - insn->off); + emit_stx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); break; /* LDX: dst_reg = *(u8*)(src_reg + off) */ case BPF_LDX | BPF_MEM | BPF_B: - /* Emit 'movzx rax, byte ptr [rax + off]' */ - EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB6); - goto ldx; + case BPF_LDX | BPF_PROBE_MEM | BPF_B: case BPF_LDX | BPF_MEM | BPF_H: - /* Emit 'movzx rax, word ptr [rax + off]' */ - EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB7); - goto ldx; + case BPF_LDX | BPF_PROBE_MEM | BPF_H: case BPF_LDX | BPF_MEM | BPF_W: - /* Emit 'mov eax, dword ptr [rax+0x14]' */ - if (is_ereg(dst_reg) || is_ereg(src_reg)) - EMIT2(add_2mod(0x40, src_reg, dst_reg), 0x8B); - else - EMIT1(0x8B); - goto ldx; + case BPF_LDX | BPF_PROBE_MEM | BPF_W: case BPF_LDX | BPF_MEM | BPF_DW: - /* Emit 'mov rax, qword ptr [rax+0x14]' */ - EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B); -ldx: /* - * If insn->off == 0 we can save one extra byte, but - * special case of x86 R13 which always needs an offset - * is not worth the hassle - */ - if (is_imm8(insn->off)) - EMIT2(add_2reg(0x40, src_reg, dst_reg), insn->off); - else - EMIT1_off32(add_2reg(0x80, src_reg, dst_reg), - insn->off); + case BPF_LDX | BPF_PROBE_MEM | BPF_DW: + emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); + if (BPF_MODE(insn->code) == BPF_PROBE_MEM) { + struct exception_table_entry *ex; + u8 *_insn = image + proglen; + s64 delta; + + if (!bpf_prog->aux->extable) + break; + + if (excnt >= bpf_prog->aux->num_exentries) { + pr_err("ex gen bug\n"); + return -EFAULT; + } + ex = &bpf_prog->aux->extable[excnt++]; + + delta = _insn - (u8 *)&ex->insn; + if (!is_simm32(delta)) { + pr_err("extable->insn doesn't fit into 32-bit\n"); + return -EFAULT; + } + ex->insn = delta; + + delta = (u8 *)ex_handler_bpf - (u8 *)&ex->handler; + if (!is_simm32(delta)) { + pr_err("extable->handler doesn't fit into 32-bit\n"); + return -EFAULT; + } + ex->handler = delta; + + if (dst_reg > BPF_REG_9) { + pr_err("verifier error\n"); + return -EFAULT; + } + /* + * Compute size of x86 insn and its target dest x86 register. + * ex_handler_bpf() will use lower 8 bits to adjust + * pt_regs->ip to jump over this x86 instruction + * and upper bits to figure out which pt_regs to zero out. + * End result: x86 insn "mov rbx, qword ptr [rax+0x14]" + * of 4 bytes will be ignored and rbx will be zero inited. + */ + ex->fixup = (prog - temp) | (reg2pt_regs[dst_reg] << 8); + } break; /* STX XADD: lock *(u32*)(dst_reg + off) += src_reg */ @@ -827,17 +1088,16 @@ xadd: if (is_imm8(insn->off)) /* call */ case BPF_JMP | BPF_CALL: func = (u8 *) __bpf_call_base + imm32; - jmp_offset = func - (image + addrs[i]); - if (!imm32 || !is_simm32(jmp_offset)) { - pr_err("unsupported BPF func %d addr %p image %p\n", - imm32, func, image); + if (!imm32 || emit_call(&prog, func, image + addrs[i - 1])) return -EINVAL; - } - EMIT1_off32(0xE8, jmp_offset); break; case BPF_JMP | BPF_TAIL_CALL: - emit_bpf_tail_call(&prog); + if (imm32) + emit_bpf_tail_call_direct(&bpf_prog->aux->poke_tab[imm32 - 1], + &prog, addrs[i], image); + else + emit_bpf_tail_call_indirect(&prog); break; /* cond jump */ @@ -909,6 +1169,16 @@ xadd: if (is_imm8(insn->off)) case BPF_JMP32 | BPF_JSLT | BPF_K: case BPF_JMP32 | BPF_JSGE | BPF_K: case BPF_JMP32 | BPF_JSLE | BPF_K: + /* test dst_reg, dst_reg to save one extra byte */ + if (imm32 == 0) { + if (BPF_CLASS(insn->code) == BPF_JMP) + EMIT1(add_2mod(0x48, dst_reg, dst_reg)); + else if (is_ereg(dst_reg)) + EMIT1(add_2mod(0x40, dst_reg, dst_reg)); + EMIT2(0x85, add_2reg(0xC0, dst_reg, dst_reg)); + goto emit_cond_jmp; + } + /* cmp dst_reg, imm8/32 */ if (BPF_CLASS(insn->code) == BPF_JMP) EMIT1(add_1mod(0x48, dst_reg)); @@ -1048,9 +1318,218 @@ emit_jmp: addrs[i] = proglen; prog = temp; } + + if (image && excnt != bpf_prog->aux->num_exentries) { + pr_err("extable is not populated\n"); + return -EFAULT; + } return proglen; } +static void save_regs(struct btf_func_model *m, u8 **prog, int nr_args, + int stack_size) +{ + int i; + /* Store function arguments to stack. + * For a function that accepts two pointers the sequence will be: + * mov QWORD PTR [rbp-0x10],rdi + * mov QWORD PTR [rbp-0x8],rsi + */ + for (i = 0; i < min(nr_args, 6); i++) + emit_stx(prog, bytes_to_bpf_size(m->arg_size[i]), + BPF_REG_FP, + i == 5 ? X86_REG_R9 : BPF_REG_1 + i, + -(stack_size - i * 8)); +} + +static void restore_regs(struct btf_func_model *m, u8 **prog, int nr_args, + int stack_size) +{ + int i; + + /* Restore function arguments from stack. + * For a function that accepts two pointers the sequence will be: + * EMIT4(0x48, 0x8B, 0x7D, 0xF0); mov rdi,QWORD PTR [rbp-0x10] + * EMIT4(0x48, 0x8B, 0x75, 0xF8); mov rsi,QWORD PTR [rbp-0x8] + */ + for (i = 0; i < min(nr_args, 6); i++) + emit_ldx(prog, bytes_to_bpf_size(m->arg_size[i]), + i == 5 ? X86_REG_R9 : BPF_REG_1 + i, + BPF_REG_FP, + -(stack_size - i * 8)); +} + +static int invoke_bpf(struct btf_func_model *m, u8 **pprog, + struct bpf_prog **progs, int prog_cnt, int stack_size) +{ + u8 *prog = *pprog; + int cnt = 0, i; + + for (i = 0; i < prog_cnt; i++) { + if (emit_call(&prog, __bpf_prog_enter, prog)) + return -EINVAL; + /* remember prog start time returned by __bpf_prog_enter */ + emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); + + /* arg1: lea rdi, [rbp - stack_size] */ + EMIT4(0x48, 0x8D, 0x7D, -stack_size); + /* arg2: progs[i]->insnsi for interpreter */ + if (!progs[i]->jited) + emit_mov_imm64(&prog, BPF_REG_2, + (long) progs[i]->insnsi >> 32, + (u32) (long) progs[i]->insnsi); + /* call JITed bpf program or interpreter */ + if (emit_call(&prog, progs[i]->bpf_func, prog)) + return -EINVAL; + + /* arg1: mov rdi, progs[i] */ + emit_mov_imm64(&prog, BPF_REG_1, (long) progs[i] >> 32, + (u32) (long) progs[i]); + /* arg2: mov rsi, rbx <- start time in nsec */ + emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); + if (emit_call(&prog, __bpf_prog_exit, prog)) + return -EINVAL; + } + *pprog = prog; + return 0; +} + +/* Example: + * __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev); + * its 'struct btf_func_model' will be nr_args=2 + * The assembly code when eth_type_trans is executing after trampoline: + * + * push rbp + * mov rbp, rsp + * sub rsp, 16 // space for skb and dev + * push rbx // temp regs to pass start time + * mov qword ptr [rbp - 16], rdi // save skb pointer to stack + * mov qword ptr [rbp - 8], rsi // save dev pointer to stack + * call __bpf_prog_enter // rcu_read_lock and preempt_disable + * mov rbx, rax // remember start time in bpf stats are enabled + * lea rdi, [rbp - 16] // R1==ctx of bpf prog + * call addr_of_jited_FENTRY_prog + * movabsq rdi, 64bit_addr_of_struct_bpf_prog // unused if bpf stats are off + * mov rsi, rbx // prog start time + * call __bpf_prog_exit // rcu_read_unlock, preempt_enable and stats math + * mov rdi, qword ptr [rbp - 16] // restore skb pointer from stack + * mov rsi, qword ptr [rbp - 8] // restore dev pointer from stack + * pop rbx + * leave + * ret + * + * eth_type_trans has 5 byte nop at the beginning. These 5 bytes will be + * replaced with 'call generated_bpf_trampoline'. When it returns + * eth_type_trans will continue executing with original skb and dev pointers. + * + * The assembly code when eth_type_trans is called from trampoline: + * + * push rbp + * mov rbp, rsp + * sub rsp, 24 // space for skb, dev, return value + * push rbx // temp regs to pass start time + * mov qword ptr [rbp - 24], rdi // save skb pointer to stack + * mov qword ptr [rbp - 16], rsi // save dev pointer to stack + * call __bpf_prog_enter // rcu_read_lock and preempt_disable + * mov rbx, rax // remember start time if bpf stats are enabled + * lea rdi, [rbp - 24] // R1==ctx of bpf prog + * call addr_of_jited_FENTRY_prog // bpf prog can access skb and dev + * movabsq rdi, 64bit_addr_of_struct_bpf_prog // unused if bpf stats are off + * mov rsi, rbx // prog start time + * call __bpf_prog_exit // rcu_read_unlock, preempt_enable and stats math + * mov rdi, qword ptr [rbp - 24] // restore skb pointer from stack + * mov rsi, qword ptr [rbp - 16] // restore dev pointer from stack + * call eth_type_trans+5 // execute body of eth_type_trans + * mov qword ptr [rbp - 8], rax // save return value + * call __bpf_prog_enter // rcu_read_lock and preempt_disable + * mov rbx, rax // remember start time in bpf stats are enabled + * lea rdi, [rbp - 24] // R1==ctx of bpf prog + * call addr_of_jited_FEXIT_prog // bpf prog can access skb, dev, return value + * movabsq rdi, 64bit_addr_of_struct_bpf_prog // unused if bpf stats are off + * mov rsi, rbx // prog start time + * call __bpf_prog_exit // rcu_read_unlock, preempt_enable and stats math + * mov rax, qword ptr [rbp - 8] // restore eth_type_trans's return value + * pop rbx + * leave + * add rsp, 8 // skip eth_type_trans's frame + * ret // return to its caller + */ +int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags, + struct bpf_prog **fentry_progs, int fentry_cnt, + struct bpf_prog **fexit_progs, int fexit_cnt, + void *orig_call) +{ + int cnt = 0, nr_args = m->nr_args; + int stack_size = nr_args * 8; + u8 *prog; + + /* x86-64 supports up to 6 arguments. 7+ can be added in the future */ + if (nr_args > 6) + return -ENOTSUPP; + + if ((flags & BPF_TRAMP_F_RESTORE_REGS) && + (flags & BPF_TRAMP_F_SKIP_FRAME)) + return -EINVAL; + + if (flags & BPF_TRAMP_F_CALL_ORIG) + stack_size += 8; /* room for return value of orig_call */ + + if (flags & BPF_TRAMP_F_SKIP_FRAME) + /* skip patched call instruction and point orig_call to actual + * body of the kernel function. + */ + orig_call += X86_PATCH_SIZE; + + prog = image; + + EMIT1(0x55); /* push rbp */ + EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */ + EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */ + EMIT1(0x53); /* push rbx */ + + save_regs(m, &prog, nr_args, stack_size); + + if (fentry_cnt) + if (invoke_bpf(m, &prog, fentry_progs, fentry_cnt, stack_size)) + return -EINVAL; + + if (flags & BPF_TRAMP_F_CALL_ORIG) { + if (fentry_cnt) + restore_regs(m, &prog, nr_args, stack_size); + + /* call original function */ + if (emit_call(&prog, orig_call, prog)) + return -EINVAL; + /* remember return value in a stack for bpf prog to access */ + emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); + } + + if (fexit_cnt) + if (invoke_bpf(m, &prog, fexit_progs, fexit_cnt, stack_size)) + return -EINVAL; + + if (flags & BPF_TRAMP_F_RESTORE_REGS) + restore_regs(m, &prog, nr_args, stack_size); + + if (flags & BPF_TRAMP_F_CALL_ORIG) + /* restore original return value back into RAX */ + emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); + + EMIT1(0x5B); /* pop rbx */ + EMIT1(0xC9); /* leave */ + if (flags & BPF_TRAMP_F_SKIP_FRAME) + /* skip our return address and return to parent */ + EMIT4(0x48, 0x83, 0xC4, 8); /* add rsp, 8 */ + EMIT1(0xC3); /* ret */ + /* One half of the page has active running trampoline. + * Another half is an area for next trampoline. + * Make sure the trampoline generation logic doesn't overflow. + */ + if (WARN_ON_ONCE(prog - (u8 *)image > PAGE_SIZE / 2 - BPF_INSN_SAFETY)) + return -EFAULT; + return 0; +} + struct x64_jit_data { struct bpf_binary_header *header; int *addrs; @@ -1148,12 +1627,24 @@ out_image: break; } if (proglen == oldproglen) { - header = bpf_jit_binary_alloc(proglen, &image, - 1, jit_fill_hole); + /* + * The number of entries in extable is the number of BPF_LDX + * insns that access kernel memory via "pointer to BTF type". + * The verifier changed their opcode from LDX|MEM|size + * to LDX|PROBE_MEM|size to make JITing easier. + */ + u32 align = __alignof__(struct exception_table_entry); + u32 extable_size = prog->aux->num_exentries * + sizeof(struct exception_table_entry); + + /* allocate module memory for x86 insns and extable */ + header = bpf_jit_binary_alloc(roundup(proglen, align) + extable_size, + &image, align, jit_fill_hole); if (!header) { prog = orig_prog; goto out_addrs; } + prog->aux->extable = (void *) image + roundup(proglen, align); } oldproglen = proglen; cond_resched(); @@ -1164,6 +1655,7 @@ out_image: if (image) { if (!prog->is_func || extra_pass) { + bpf_tail_call_direct_fixup(prog); bpf_jit_binary_lock_ro(header); } else { jit_data->addrs = addrs; diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 71e8a67337e2..276cf79b5d24 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -67,13 +67,13 @@ static inline void op_x86_warn_in_use(int counter) * cannot be monitored by any other counter, contact your * hardware or BIOS vendor. */ - pr_warning("oprofile: counter #%d on cpu #%d may already be used\n", - counter, smp_processor_id()); + pr_warn("oprofile: counter #%d on cpu #%d may already be used\n", + counter, smp_processor_id()); } static inline void op_x86_warn_reserved(int counter) { - pr_warning("oprofile: counter #%d is already reserved\n", counter); + pr_warn("oprofile: counter #%d is already reserved\n", counter); } extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index c202e1b07e29..425e025341db 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -917,9 +917,6 @@ static void __init kexec_enter_virtual_mode(void) if (efi_enabled(EFI_OLD_MEMMAP) && (__supported_pte_mask & _PAGE_NX)) runtime_code_page_mkexec(); - - /* clean DUMMY object */ - efi_delete_dummy_variable(); #endif } diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c index 6d193bb36021..089413cd944e 100644 --- a/arch/x86/platform/olpc/olpc-xo15-sci.c +++ b/arch/x86/platform/olpc/olpc-xo15-sci.c @@ -39,7 +39,7 @@ static int set_lid_wake_behavior(bool wake_on_close) status = acpi_execute_simple_method(NULL, "\\_SB.PCI0.LID.LIDW", wake_on_close); if (ACPI_FAILURE(status)) { - pr_warning(PFX "failed to set lid behavior\n"); + pr_warn(PFX "failed to set lid behavior\n"); return 1; } diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c index bf6016f8db4e..6259563760f9 100644 --- a/arch/x86/platform/sfi/sfi.c +++ b/arch/x86/platform/sfi/sfi.c @@ -26,8 +26,7 @@ static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; static void __init mp_sfi_register_lapic(u8 id) { if (MAX_LOCAL_APIC - id <= 0) { - pr_warning("Processor #%d invalid (max %d)\n", - id, MAX_LOCAL_APIC); + pr_warn("Processor #%d invalid (max %d)\n", id, MAX_LOCAL_APIC); return; } diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c index 0d3365cb64de..a04551ee5568 100644 --- a/arch/x86/xen/efi.c +++ b/arch/x86/xen/efi.c @@ -57,19 +57,7 @@ static efi_system_table_t __init *xen_efi_probe(void) return NULL; /* Here we know that Xen runs on EFI platform. */ - - efi.get_time = xen_efi_get_time; - efi.set_time = xen_efi_set_time; - efi.get_wakeup_time = xen_efi_get_wakeup_time; - efi.set_wakeup_time = xen_efi_set_wakeup_time; - efi.get_variable = xen_efi_get_variable; - efi.get_next_variable = xen_efi_get_next_variable; - efi.set_variable = xen_efi_set_variable; - efi.query_variable_info = xen_efi_query_variable_info; - efi.update_capsule = xen_efi_update_capsule; - efi.query_capsule_caps = xen_efi_query_capsule_caps; - efi.get_next_high_mono_count = xen_efi_get_next_high_mono_count; - efi.reset_system = xen_efi_reset_system; + xen_efi_runtime_setup(); efi_systab_xen.tables = info->cfg.addr; efi_systab_xen.nr_tables = info->cfg.nent; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 750f46ad018a..205b1176084f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -269,19 +269,41 @@ void xen_reboot(int reason) BUG(); } +static int reboot_reason = SHUTDOWN_reboot; +static bool xen_legacy_crash; void xen_emergency_restart(void) { - xen_reboot(SHUTDOWN_reboot); + xen_reboot(reboot_reason); } static int xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr) { - if (!kexec_crash_loaded()) - xen_reboot(SHUTDOWN_crash); + if (!kexec_crash_loaded()) { + if (xen_legacy_crash) + xen_reboot(SHUTDOWN_crash); + + reboot_reason = SHUTDOWN_crash; + + /* + * If panic_timeout==0 then we are supposed to wait forever. + * However, to preserve original dom0 behavior we have to drop + * into hypervisor. (domU behavior is controlled by its + * config file) + */ + if (panic_timeout == 0) + panic_timeout = -1; + } return NOTIFY_DONE; } +static int __init parse_xen_legacy_crash(char *arg) +{ + xen_legacy_crash = true; + return 0; +} +early_param("xen_legacy_crash", parse_xen_legacy_crash); + static struct notifier_block xen_panic_block = { .notifier_call = xen_panic_event, .priority = INT_MIN diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 58f79ab32358..5bfea374a160 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -117,6 +117,14 @@ static void __init xen_banner(void) printk(KERN_INFO "Xen version: %d.%d%s%s\n", version >> 16, version & 0xffff, extra.extraversion, xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); + +#ifdef CONFIG_X86_32 + pr_warn("WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!\n" + "Support for running as 32-bit PV-guest under Xen will soon be removed\n" + "from the Linux kernel!\n" + "Please use either a 64-bit kernel or switch to HVM or PVH mode!\n" + "WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!\n"); +#endif } static void __init xen_pv_init_platform(void) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 548d1e0a5ba1..33b0e20df7fc 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -412,7 +412,7 @@ static unsigned long __init xen_set_identity_and_remap_chunk( remap_range_size = xen_find_pfn_range(&remap_pfn); if (!remap_range_size) { - pr_warning("Unable to find available pfn range, not remapping identity pages\n"); + pr_warn("Unable to find available pfn range, not remapping identity pages\n"); xen_set_identity_and_release_chunk(cur_pfn, cur_pfn + left, nr_pages); break; |