diff options
Diffstat (limited to 'arch/x86')
178 files changed, 7971 insertions, 4109 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d6e1faa28c58..b89eb1f0c0d2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -24,7 +24,7 @@ config X86_64 depends on 64BIT # Options that are inherently 64-bit kernel only: select ARCH_HAS_GIGANTIC_PAGE - select ARCH_SUPPORTS_INT128 + select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 select ARCH_USE_CMPXCHG_LOCKREF select HAVE_ARCH_SOFT_DIRTY select MODULES_USE_ELF_RELA @@ -932,36 +932,6 @@ config GART_IOMMU If unsure, say Y. -config CALGARY_IOMMU - bool "IBM Calgary IOMMU support" - select IOMMU_HELPER - select SWIOTLB - depends on X86_64 && PCI - ---help--- - Support for hardware IOMMUs in IBM's xSeries x366 and x460 - systems. Needed to run systems with more than 3GB of memory - properly with 32-bit PCI devices that do not support DAC - (Double Address Cycle). Calgary also supports bus level - isolation, where all DMAs pass through the IOMMU. This - prevents them from going anywhere except their intended - destination. This catches hard-to-find kernel bugs and - mis-behaving drivers and devices that do not use the DMA-API - properly to set up their DMA buffers. The IOMMU can be - turned off at boot time with the iommu=off parameter. - Normally the kernel will make the right choice by itself. - If unsure, say Y. - -config CALGARY_IOMMU_ENABLED_BY_DEFAULT - def_bool y - prompt "Should Calgary be enabled by default?" - depends on CALGARY_IOMMU - ---help--- - Should Calgary be enabled by default? if you choose 'y', Calgary - will be used (if it exists). If you choose 'n', Calgary will not be - used even if it exists. If you choose 'n' and would like to use - Calgary anyway, pass 'iommu=calgary' on the kernel command line. - If unsure, say Y. - config MAXSMP bool "Enable Maximum number of SMP Processors and NUMA Nodes" depends on X86_64 && SMP && DEBUG_KERNEL @@ -1000,8 +970,8 @@ config NR_CPUS_RANGE_END config NR_CPUS_RANGE_END int depends on X86_64 - default 8192 if SMP && ( MAXSMP || CPUMASK_OFFSTACK) - default 512 if SMP && (!MAXSMP && !CPUMASK_OFFSTACK) + default 8192 if SMP && CPUMASK_OFFSTACK + default 512 if SMP && !CPUMASK_OFFSTACK default 1 if !SMP config NR_CPUS_DEFAULT @@ -1492,6 +1462,7 @@ config X86_PAE config X86_5LEVEL bool "Enable 5-level page tables support" + default y select DYNAMIC_MEMORY_LAYOUT select SPARSEMEM_VMEMMAP depends on X86_64 @@ -1751,7 +1722,7 @@ config X86_RESERVE_LOW config MATH_EMULATION bool depends on MODIFY_LDT_SYSCALL - prompt "Math emulation" if X86_32 + prompt "Math emulation" if X86_32 && (M486SX || MELAN) ---help--- Linux can emulate a math coprocessor (used for floating point operations) if you don't have one. 486DX and Pentium processors have @@ -1880,16 +1851,16 @@ config X86_SMAP If unsure, say Y. -config X86_INTEL_UMIP +config X86_UMIP def_bool y - depends on CPU_SUP_INTEL - prompt "Intel User Mode Instruction Prevention" if EXPERT + depends on CPU_SUP_INTEL || CPU_SUP_AMD + prompt "User Mode Instruction Prevention" if EXPERT ---help--- - The User Mode Instruction Prevention (UMIP) is a security - feature in newer Intel processors. If enabled, a general - protection fault is issued if the SGDT, SLDT, SIDT, SMSW - or STR instructions are executed in user mode. These instructions - unnecessarily expose information about the hardware state. + User Mode Instruction Prevention (UMIP) is a security feature in + some x86 processors. If enabled, a general protection fault is + issued if the SGDT, SLDT, SIDT, SMSW or STR instructions are + executed in user mode. These instructions unnecessarily expose + information about the hardware state. The vast majority of applications do not use these instructions. For the very few that do, software emulation is provided in @@ -1940,6 +1911,51 @@ config X86_INTEL_MEMORY_PROTECTION_KEYS If unsure, say y. +choice + prompt "TSX enable mode" + depends on CPU_SUP_INTEL + default X86_INTEL_TSX_MODE_OFF + help + Intel's TSX (Transactional Synchronization Extensions) feature + allows to optimize locking protocols through lock elision which + can lead to a noticeable performance boost. + + On the other hand it has been shown that TSX can be exploited + to form side channel attacks (e.g. TAA) and chances are there + will be more of those attacks discovered in the future. + + Therefore TSX is not enabled by default (aka tsx=off). An admin + might override this decision by tsx=on the command line parameter. + Even with TSX enabled, the kernel will attempt to enable the best + possible TAA mitigation setting depending on the microcode available + for the particular machine. + + This option allows to set the default tsx mode between tsx=on, =off + and =auto. See Documentation/admin-guide/kernel-parameters.txt for more + details. + + Say off if not sure, auto if TSX is in use but it should be used on safe + platforms or on if TSX is in use and the security aspect of tsx is not + relevant. + +config X86_INTEL_TSX_MODE_OFF + bool "off" + help + TSX is disabled if possible - equals to tsx=off command line parameter. + +config X86_INTEL_TSX_MODE_ON + bool "on" + help + TSX is always enabled on TSX capable HW - equals the tsx=on command + line parameter. + +config X86_INTEL_TSX_MODE_AUTO + bool "auto" + help + TSX is enabled on TSX capable HW that is believed to be safe against + side channel attacks- equals the tsx=auto command line parameter. +endchoice + config EFI bool "EFI runtime service support" depends on ACPI diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 8e29c991ba3e..af9c967782f6 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -50,12 +50,19 @@ choice See each option's help text for additional details. If you don't know what to do, choose "486". +config M486SX + bool "486SX" + depends on X86_32 + ---help--- + Select this for an 486-class CPU without an FPU such as + AMD/Cyrix/IBM/Intel SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5S. + config M486 - bool "486" + bool "486DX" depends on X86_32 ---help--- Select this for an 486-class CPU such as AMD/Cyrix/IBM/Intel - 486DX/DX2/DX4 or SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or U5S. + 486DX/DX2/DX4 and UMC U5D. config M586 bool "586/K5/5x86/6x86/6x86MX" @@ -312,20 +319,20 @@ config X86_L1_CACHE_SHIFT int default "7" if MPENTIUM4 || MPSC default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU - default "4" if MELAN || M486 || MGEODEGX1 + default "4" if MELAN || M486SX || M486 || MGEODEGX1 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX config X86_F00F_BUG def_bool y - depends on M586MMX || M586TSC || M586 || M486 + depends on M586MMX || M586TSC || M586 || M486SX || M486 config X86_INVD_BUG def_bool y - depends on M486 + depends on M486SX || M486 config X86_ALIGNMENT_16 def_bool y - depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 + depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC || M586 || M486SX || M486 || MVIAC3_2 || MGEODEGX1 config X86_INTEL_USERCOPY def_bool y @@ -378,7 +385,7 @@ config X86_MINIMUM_CPU_FAMILY config X86_DEBUGCTLMSR def_bool y - depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486) && !UML + depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486SX || M486) && !UML menuconfig PROCESSOR_SELECT bool "Supported processor vendors" if EXPERT @@ -402,7 +409,7 @@ config CPU_SUP_INTEL config CPU_SUP_CYRIX_32 default y bool "Support Cyrix processors" if PROCESSOR_SELECT - depends on M486 || M586 || M586TSC || M586MMX || (EXPERT && !64BIT) + depends on M486SX || M486 || M586 || M586TSC || M586MMX || (EXPERT && !64BIT) ---help--- This enables detection, tunings and quirks for Cyrix processors @@ -470,7 +477,7 @@ config CPU_SUP_TRANSMETA_32 config CPU_SUP_UMC_32 default y bool "Support UMC processors" if PROCESSOR_SELECT - depends on M486 || (EXPERT && !64BIT) + depends on M486SX || M486 || (EXPERT && !64BIT) ---help--- This enables detection, tunings and quirks for UMC processors diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index bf9cd83de777..409c00f74e60 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -316,10 +316,6 @@ config UNWINDER_FRAME_POINTER unwinder, but the kernel text size will grow by ~3% and the kernel's overall performance will degrade by roughly 5-10%. - This option is recommended if you want to use the livepatch - consistency model, as this is currently the only way to get a - reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE). - config UNWINDER_GUESS bool "Guess unwinder" depends on EXPERT diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu index 1f5faf8606b4..cd3056759880 100644 --- a/arch/x86/Makefile_32.cpu +++ b/arch/x86/Makefile_32.cpu @@ -10,6 +10,7 @@ else tune = $(call cc-option,-mcpu=$(1),$(2)) endif +cflags-$(CONFIG_M486SX) += -march=i486 cflags-$(CONFIG_M486) += -march=i486 cflags-$(CONFIG_M586) += -march=i586 cflags-$(CONFIG_M586TSC) += -march=i586 diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 3e9f5544820a..95410d6ee2ff 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -88,7 +88,7 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE SETUP_OBJS = $(addprefix $(obj)/,$(setup-y)) -sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|_end\|_ehead\|_text\|z_.*\)$$/\#define ZO_\2 0x\1/p' +sed-zoffset := -e 's/^\([0-9a-fA-F]*\) [ABCDGRSTVW] \(startup_32\|startup_64\|efi32_stub_entry\|efi64_stub_entry\|efi_pe_entry\|input_data\|kernel_info\|_end\|_ehead\|_text\|z_.*\)$$/\#define ZO_\2 0x\1/p' quiet_cmd_zoffset = ZOFFSET $@ cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@ diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index b246f18c5857..aa976adb7094 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -73,8 +73,8 @@ $(obj)/../voffset.h: vmlinux FORCE $(obj)/misc.o: $(obj)/../voffset.h -vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \ - $(obj)/string.o $(obj)/cmdline.o $(obj)/error.o \ +vmlinux-objs-y := $(obj)/vmlinux.lds $(obj)/kernel_info.o $(obj)/head_$(BITS).o \ + $(obj)/misc.o $(obj)/string.o $(obj)/cmdline.o $(obj)/error.o \ $(obj)/piggy.o $(obj)/cpuflags.o vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c index 149795c369f2..25019d42ae93 100644 --- a/arch/x86/boot/compressed/acpi.c +++ b/arch/x86/boot/compressed/acpi.c @@ -21,30 +21,6 @@ struct mem_vector immovable_mem[MAX_NUMNODES*2]; /* - * Max length of 64-bit hex address string is 19, prefix "0x" + 16 hex - * digits, and '\0' for termination. - */ -#define MAX_ADDR_LEN 19 - -static acpi_physical_address get_cmdline_acpi_rsdp(void) -{ - acpi_physical_address addr = 0; - -#ifdef CONFIG_KEXEC - char val[MAX_ADDR_LEN] = { }; - int ret; - - ret = cmdline_find_option("acpi_rsdp", val, MAX_ADDR_LEN); - if (ret < 0) - return 0; - - if (kstrtoull(val, 16, &addr)) - return 0; -#endif - return addr; -} - -/* * Search EFI system tables for RSDP. If both ACPI_20_TABLE_GUID and * ACPI_TABLE_GUID are found, take the former, which has more features. */ @@ -298,6 +274,30 @@ acpi_physical_address get_rsdp_addr(void) } #if defined(CONFIG_RANDOMIZE_BASE) && defined(CONFIG_MEMORY_HOTREMOVE) +/* + * Max length of 64-bit hex address string is 19, prefix "0x" + 16 hex + * digits, and '\0' for termination. + */ +#define MAX_ADDR_LEN 19 + +static acpi_physical_address get_cmdline_acpi_rsdp(void) +{ + acpi_physical_address addr = 0; + +#ifdef CONFIG_KEXEC + char val[MAX_ADDR_LEN] = { }; + int ret; + + ret = cmdline_find_option("acpi_rsdp", val, MAX_ADDR_LEN); + if (ret < 0) + return 0; + + if (kstrtoull(val, 16, &addr)) + return 0; +#endif + return addr; +} + /* Compute SRAT address from RSDP. */ static unsigned long get_acpi_srat_table(void) { diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index d6662fdef300..82bc60c8acb2 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -13,6 +13,7 @@ #include <asm/e820/types.h> #include <asm/setup.h> #include <asm/desc.h> +#include <asm/boot.h> #include "../string.h" #include "eboot.h" @@ -813,7 +814,8 @@ efi_main(struct efi_config *c, struct boot_params *boot_params) status = efi_relocate_kernel(sys_table, &bzimage_addr, hdr->init_size, hdr->init_size, hdr->pref_address, - hdr->kernel_alignment); + hdr->kernel_alignment, + LOAD_PHYSICAL_ADDR); if (status != EFI_SUCCESS) { efi_printk(sys_table, "efi_relocate_kernel() failed!\n"); goto fail; diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c index 2e53c056ba20..bb9bfef174ae 100644 --- a/arch/x86/boot/compressed/kaslr.c +++ b/arch/x86/boot/compressed/kaslr.c @@ -459,6 +459,18 @@ static bool mem_avoid_overlap(struct mem_vector *img, is_overlapping = true; } + if (ptr->type == SETUP_INDIRECT && + ((struct setup_indirect *)ptr->data)->type != SETUP_INDIRECT) { + avoid.start = ((struct setup_indirect *)ptr->data)->addr; + avoid.size = ((struct setup_indirect *)ptr->data)->len; + + if (mem_overlaps(img, &avoid) && (avoid.start < earliest)) { + *overlap = avoid; + earliest = overlap->start; + is_overlapping = true; + } + } + ptr = (struct setup_data *)(unsigned long)ptr->next; } diff --git a/arch/x86/boot/compressed/kernel_info.S b/arch/x86/boot/compressed/kernel_info.S new file mode 100644 index 000000000000..f818ee8fba38 --- /dev/null +++ b/arch/x86/boot/compressed/kernel_info.S @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <asm/bootparam.h> + + .section ".rodata.kernel_info", "a" + + .global kernel_info + +kernel_info: + /* Header, Linux top (structure). */ + .ascii "LToP" + /* Size. */ + .long kernel_info_var_len_data - kernel_info + /* Size total. */ + .long kernel_info_end - kernel_info + + /* Maximal allowed type for setup_data and setup_indirect structs. */ + .long SETUP_TYPE_MAX + +kernel_info_var_len_data: + /* Empty for time being... */ +kernel_info_end: diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 53ac0cb2396d..9652d5c2afda 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -345,6 +345,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, { const unsigned long kernel_total_size = VO__end - VO__text; unsigned long virt_addr = LOAD_PHYSICAL_ADDR; + unsigned long needed_size; /* Retain x86 boot parameters pointer passed from startup_32/64. */ boot_params = rmode; @@ -379,26 +380,38 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, free_mem_ptr = heap; /* Heap */ free_mem_end_ptr = heap + BOOT_HEAP_SIZE; + /* + * The memory hole needed for the kernel is the larger of either + * the entire decompressed kernel plus relocation table, or the + * entire decompressed kernel plus .bss and .brk sections. + * + * On X86_64, the memory is mapped with PMD pages. Round the + * size up so that the full extent of PMD pages mapped is + * included in the check against the valid memory table + * entries. This ensures the full mapped area is usable RAM + * and doesn't include any reserved areas. + */ + needed_size = max(output_len, kernel_total_size); +#ifdef CONFIG_X86_64 + needed_size = ALIGN(needed_size, MIN_KERNEL_ALIGN); +#endif + /* Report initial kernel position details. */ debug_putaddr(input_data); debug_putaddr(input_len); debug_putaddr(output); debug_putaddr(output_len); debug_putaddr(kernel_total_size); + debug_putaddr(needed_size); #ifdef CONFIG_X86_64 /* Report address of 32-bit trampoline */ debug_putaddr(trampoline_32bit); #endif - /* - * The memory hole needed for the kernel is the larger of either - * the entire decompressed kernel plus relocation table, or the - * entire decompressed kernel plus .bss and .brk sections. - */ choose_random_location((unsigned long)input_data, input_len, (unsigned long *)&output, - max(output_len, kernel_total_size), + needed_size, &virt_addr); /* Validate memory location choices. */ diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 2c11c0f45d49..97d9b6d6c1af 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -300,7 +300,7 @@ _start: # Part 2 of the header, from the old setup.S .ascii "HdrS" # header signature - .word 0x020d # header version number (>= 0x0105) + .word 0x020f # header version number (>= 0x0105) # or else old loadlin-1.5 will fail) .globl realmode_swtch realmode_swtch: .word 0, 0 # default_switch, SETUPSEG @@ -567,6 +567,7 @@ pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr init_size: .long INIT_SIZE # kernel initialization size handover_offset: .long 0 # Filled in by build.c +kernel_info_offset: .long 0 # Filled in by build.c # End of setup header ##################################################### diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c index a93d44e58f9c..55e669d29e54 100644 --- a/arch/x86/boot/tools/build.c +++ b/arch/x86/boot/tools/build.c @@ -56,6 +56,7 @@ u8 buf[SETUP_SECT_MAX*512]; unsigned long efi32_stub_entry; unsigned long efi64_stub_entry; unsigned long efi_pe_entry; +unsigned long kernel_info; unsigned long startup_64; /*----------------------------------------------------------------------*/ @@ -321,6 +322,7 @@ static void parse_zoffset(char *fname) PARSE_ZOFS(p, efi32_stub_entry); PARSE_ZOFS(p, efi64_stub_entry); PARSE_ZOFS(p, efi_pe_entry); + PARSE_ZOFS(p, kernel_info); PARSE_ZOFS(p, startup_64); p = strchr(p, '\n'); @@ -410,6 +412,9 @@ int main(int argc, char ** argv) efi_stub_entry_update(); + /* Update kernel_info offset. */ + put_unaligned_le32(kernel_info, &buf[0x268]); + crc = partial_crc32(buf, i, crc); if (fwrite(buf, 1, i, dest) != i) die("Writing setup failed"); diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index d0a5ffeae8df..0b9654c7a05c 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -25,7 +25,6 @@ CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y CONFIG_SMP=y -CONFIG_CALGARY_IOMMU=y CONFIG_NR_CPUS=64 CONFIG_SCHED_SMT=y CONFIG_PREEMPT_VOLUNTARY=y diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 759b1a927826..958440eae27e 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -39,6 +39,7 @@ obj-$(CONFIG_CRYPTO_AEGIS128_AESNI_SSE2) += aegis128-aesni.o obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o +obj-$(CONFIG_CRYPTO_CURVE25519_X86) += curve25519-x86_64.o # These modules require assembler to support AVX. ifeq ($(avx_supported),yes) @@ -48,6 +49,7 @@ ifeq ($(avx_supported),yes) obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_AVX_X86_64) += twofish-avx-x86_64.o obj-$(CONFIG_CRYPTO_SERPENT_AVX_X86_64) += serpent-avx-x86_64.o + obj-$(CONFIG_CRYPTO_BLAKE2S_X86) += blake2s-x86_64.o endif # These modules require assembler to support AVX2. @@ -70,6 +72,7 @@ serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o +blake2s-x86_64-y := blake2s-core.o blake2s-glue.o ifeq ($(avx_supported),yes) camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \ diff --git a/arch/x86/crypto/blake2s-core.S b/arch/x86/crypto/blake2s-core.S new file mode 100644 index 000000000000..24910b766bdd --- /dev/null +++ b/arch/x86/crypto/blake2s-core.S @@ -0,0 +1,258 @@ +/* SPDX-License-Identifier: GPL-2.0 OR MIT */ +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved. + */ + +#include <linux/linkage.h> + +.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32 +.align 32 +IV: .octa 0xA54FF53A3C6EF372BB67AE856A09E667 + .octa 0x5BE0CD191F83D9AB9B05688C510E527F +.section .rodata.cst16.ROT16, "aM", @progbits, 16 +.align 16 +ROT16: .octa 0x0D0C0F0E09080B0A0504070601000302 +.section .rodata.cst16.ROR328, "aM", @progbits, 16 +.align 16 +ROR328: .octa 0x0C0F0E0D080B0A090407060500030201 +.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160 +.align 64 +SIGMA: +.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 +.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7 +.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1 +.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0 +.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8 +.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14 +.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2 +.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6 +.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4 +.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12 +#ifdef CONFIG_AS_AVX512 +.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640 +.align 64 +SIGMA2: +.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 +.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7 +.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9 +.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5 +.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12 +.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9 +.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0 +.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10 +.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14 +.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9 +#endif /* CONFIG_AS_AVX512 */ + +.text +#ifdef CONFIG_AS_SSSE3 +SYM_FUNC_START(blake2s_compress_ssse3) + testq %rdx,%rdx + je .Lendofloop + movdqu (%rdi),%xmm0 + movdqu 0x10(%rdi),%xmm1 + movdqa ROT16(%rip),%xmm12 + movdqa ROR328(%rip),%xmm13 + movdqu 0x20(%rdi),%xmm14 + movq %rcx,%xmm15 + leaq SIGMA+0xa0(%rip),%r8 + jmp .Lbeginofloop + .align 32 +.Lbeginofloop: + movdqa %xmm0,%xmm10 + movdqa %xmm1,%xmm11 + paddq %xmm15,%xmm14 + movdqa IV(%rip),%xmm2 + movdqa %xmm14,%xmm3 + pxor IV+0x10(%rip),%xmm3 + leaq SIGMA(%rip),%rcx +.Lroundloop: + movzbl (%rcx),%eax + movd (%rsi,%rax,4),%xmm4 + movzbl 0x1(%rcx),%eax + movd (%rsi,%rax,4),%xmm5 + movzbl 0x2(%rcx),%eax + movd (%rsi,%rax,4),%xmm6 + movzbl 0x3(%rcx),%eax + movd (%rsi,%rax,4),%xmm7 + punpckldq %xmm5,%xmm4 + punpckldq %xmm7,%xmm6 + punpcklqdq %xmm6,%xmm4 + paddd %xmm4,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm12,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0xc,%xmm1 + pslld $0x14,%xmm8 + por %xmm8,%xmm1 + movzbl 0x4(%rcx),%eax + movd (%rsi,%rax,4),%xmm5 + movzbl 0x5(%rcx),%eax + movd (%rsi,%rax,4),%xmm6 + movzbl 0x6(%rcx),%eax + movd (%rsi,%rax,4),%xmm7 + movzbl 0x7(%rcx),%eax + movd (%rsi,%rax,4),%xmm4 + punpckldq %xmm6,%xmm5 + punpckldq %xmm4,%xmm7 + punpcklqdq %xmm7,%xmm5 + paddd %xmm5,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm13,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0x7,%xmm1 + pslld $0x19,%xmm8 + por %xmm8,%xmm1 + pshufd $0x93,%xmm0,%xmm0 + pshufd $0x4e,%xmm3,%xmm3 + pshufd $0x39,%xmm2,%xmm2 + movzbl 0x8(%rcx),%eax + movd (%rsi,%rax,4),%xmm6 + movzbl 0x9(%rcx),%eax + movd (%rsi,%rax,4),%xmm7 + movzbl 0xa(%rcx),%eax + movd (%rsi,%rax,4),%xmm4 + movzbl 0xb(%rcx),%eax + movd (%rsi,%rax,4),%xmm5 + punpckldq %xmm7,%xmm6 + punpckldq %xmm5,%xmm4 + punpcklqdq %xmm4,%xmm6 + paddd %xmm6,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm12,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0xc,%xmm1 + pslld $0x14,%xmm8 + por %xmm8,%xmm1 + movzbl 0xc(%rcx),%eax + movd (%rsi,%rax,4),%xmm7 + movzbl 0xd(%rcx),%eax + movd (%rsi,%rax,4),%xmm4 + movzbl 0xe(%rcx),%eax + movd (%rsi,%rax,4),%xmm5 + movzbl 0xf(%rcx),%eax + movd (%rsi,%rax,4),%xmm6 + punpckldq %xmm4,%xmm7 + punpckldq %xmm6,%xmm5 + punpcklqdq %xmm5,%xmm7 + paddd %xmm7,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm13,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0x7,%xmm1 + pslld $0x19,%xmm8 + por %xmm8,%xmm1 + pshufd $0x39,%xmm0,%xmm0 + pshufd $0x4e,%xmm3,%xmm3 + pshufd $0x93,%xmm2,%xmm2 + addq $0x10,%rcx + cmpq %r8,%rcx + jnz .Lroundloop + pxor %xmm2,%xmm0 + pxor %xmm3,%xmm1 + pxor %xmm10,%xmm0 + pxor %xmm11,%xmm1 + addq $0x40,%rsi + decq %rdx + jnz .Lbeginofloop + movdqu %xmm0,(%rdi) + movdqu %xmm1,0x10(%rdi) + movdqu %xmm14,0x20(%rdi) +.Lendofloop: + ret +SYM_FUNC_END(blake2s_compress_ssse3) +#endif /* CONFIG_AS_SSSE3 */ + +#ifdef CONFIG_AS_AVX512 +SYM_FUNC_START(blake2s_compress_avx512) + vmovdqu (%rdi),%xmm0 + vmovdqu 0x10(%rdi),%xmm1 + vmovdqu 0x20(%rdi),%xmm4 + vmovq %rcx,%xmm5 + vmovdqa IV(%rip),%xmm14 + vmovdqa IV+16(%rip),%xmm15 + jmp .Lblake2s_compress_avx512_mainloop +.align 32 +.Lblake2s_compress_avx512_mainloop: + vmovdqa %xmm0,%xmm10 + vmovdqa %xmm1,%xmm11 + vpaddq %xmm5,%xmm4,%xmm4 + vmovdqa %xmm14,%xmm2 + vpxor %xmm15,%xmm4,%xmm3 + vmovdqu (%rsi),%ymm6 + vmovdqu 0x20(%rsi),%ymm7 + addq $0x40,%rsi + leaq SIGMA2(%rip),%rax + movb $0xa,%cl +.Lblake2s_compress_avx512_roundloop: + addq $0x40,%rax + vmovdqa -0x40(%rax),%ymm8 + vmovdqa -0x20(%rax),%ymm9 + vpermi2d %ymm7,%ymm6,%ymm8 + vpermi2d %ymm7,%ymm6,%ymm9 + vmovdqa %ymm8,%ymm6 + vmovdqa %ymm9,%ymm7 + vpaddd %xmm8,%xmm0,%xmm0 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + vprord $0x10,%xmm3,%xmm3 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + vprord $0xc,%xmm1,%xmm1 + vextracti128 $0x1,%ymm8,%xmm8 + vpaddd %xmm8,%xmm0,%xmm0 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + vprord $0x8,%xmm3,%xmm3 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + vprord $0x7,%xmm1,%xmm1 + vpshufd $0x93,%xmm0,%xmm0 + vpshufd $0x4e,%xmm3,%xmm3 + vpshufd $0x39,%xmm2,%xmm2 + vpaddd %xmm9,%xmm0,%xmm0 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + vprord $0x10,%xmm3,%xmm3 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + vprord $0xc,%xmm1,%xmm1 + vextracti128 $0x1,%ymm9,%xmm9 + vpaddd %xmm9,%xmm0,%xmm0 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + vprord $0x8,%xmm3,%xmm3 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + vprord $0x7,%xmm1,%xmm1 + vpshufd $0x39,%xmm0,%xmm0 + vpshufd $0x4e,%xmm3,%xmm3 + vpshufd $0x93,%xmm2,%xmm2 + decb %cl + jne .Lblake2s_compress_avx512_roundloop + vpxor %xmm10,%xmm0,%xmm0 + vpxor %xmm11,%xmm1,%xmm1 + vpxor %xmm2,%xmm0,%xmm0 + vpxor %xmm3,%xmm1,%xmm1 + decq %rdx + jne .Lblake2s_compress_avx512_mainloop + vmovdqu %xmm0,(%rdi) + vmovdqu %xmm1,0x10(%rdi) + vmovdqu %xmm4,0x20(%rdi) + vzeroupper + retq +SYM_FUNC_END(blake2s_compress_avx512) +#endif /* CONFIG_AS_AVX512 */ diff --git a/arch/x86/crypto/blake2s-glue.c b/arch/x86/crypto/blake2s-glue.c new file mode 100644 index 000000000000..4a37ba7cdbe5 --- /dev/null +++ b/arch/x86/crypto/blake2s-glue.c @@ -0,0 +1,233 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT +/* + * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + */ + +#include <crypto/internal/blake2s.h> +#include <crypto/internal/simd.h> +#include <crypto/internal/hash.h> + +#include <linux/types.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/module.h> + +#include <asm/cpufeature.h> +#include <asm/fpu/api.h> +#include <asm/processor.h> +#include <asm/simd.h> + +asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state, + const u8 *block, const size_t nblocks, + const u32 inc); +asmlinkage void blake2s_compress_avx512(struct blake2s_state *state, + const u8 *block, const size_t nblocks, + const u32 inc); + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_ssse3); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(blake2s_use_avx512); + +void blake2s_compress_arch(struct blake2s_state *state, + const u8 *block, size_t nblocks, + const u32 inc) +{ + /* SIMD disables preemption, so relax after processing each page. */ + BUILD_BUG_ON(PAGE_SIZE / BLAKE2S_BLOCK_SIZE < 8); + + if (!static_branch_likely(&blake2s_use_ssse3) || !crypto_simd_usable()) { + blake2s_compress_generic(state, block, nblocks, inc); + return; + } + + for (;;) { + const size_t blocks = min_t(size_t, nblocks, + PAGE_SIZE / BLAKE2S_BLOCK_SIZE); + + kernel_fpu_begin(); + if (IS_ENABLED(CONFIG_AS_AVX512) && + static_branch_likely(&blake2s_use_avx512)) + blake2s_compress_avx512(state, block, blocks, inc); + else + blake2s_compress_ssse3(state, block, blocks, inc); + kernel_fpu_end(); + + nblocks -= blocks; + if (!nblocks) + break; + block += blocks * BLAKE2S_BLOCK_SIZE; + } +} +EXPORT_SYMBOL(blake2s_compress_arch); + +static int crypto_blake2s_setkey(struct crypto_shash *tfm, const u8 *key, + unsigned int keylen) +{ + struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(tfm); + + if (keylen == 0 || keylen > BLAKE2S_KEY_SIZE) { + crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + memcpy(tctx->key, key, keylen); + tctx->keylen = keylen; + + return 0; +} + +static int crypto_blake2s_init(struct shash_desc *desc) +{ + struct blake2s_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); + struct blake2s_state *state = shash_desc_ctx(desc); + const int outlen = crypto_shash_digestsize(desc->tfm); + + if (tctx->keylen) + blake2s_init_key(state, outlen, tctx->key, tctx->keylen); + else + blake2s_init(state, outlen); + + return 0; +} + +static int crypto_blake2s_update(struct shash_desc *desc, const u8 *in, + unsigned int inlen) +{ + struct blake2s_state *state = shash_desc_ctx(desc); + const size_t fill = BLAKE2S_BLOCK_SIZE - state->buflen; + + if (unlikely(!inlen)) + return 0; + if (inlen > fill) { + memcpy(state->buf + state->buflen, in, fill); + blake2s_compress_arch(state, state->buf, 1, BLAKE2S_BLOCK_SIZE); + state->buflen = 0; + in += fill; + inlen -= fill; + } + if (inlen > BLAKE2S_BLOCK_SIZE) { + const size_t nblocks = DIV_ROUND_UP(inlen, BLAKE2S_BLOCK_SIZE); + /* Hash one less (full) block than strictly possible */ + blake2s_compress_arch(state, in, nblocks - 1, BLAKE2S_BLOCK_SIZE); + in += BLAKE2S_BLOCK_SIZE * (nblocks - 1); + inlen -= BLAKE2S_BLOCK_SIZE * (nblocks - 1); + } + memcpy(state->buf + state->buflen, in, inlen); + state->buflen += inlen; + + return 0; +} + +static int crypto_blake2s_final(struct shash_desc *desc, u8 *out) +{ + struct blake2s_state *state = shash_desc_ctx(desc); + + blake2s_set_lastblock(state); + memset(state->buf + state->buflen, 0, + BLAKE2S_BLOCK_SIZE - state->buflen); /* Padding */ + blake2s_compress_arch(state, state->buf, 1, state->buflen); + cpu_to_le32_array(state->h, ARRAY_SIZE(state->h)); + memcpy(out, state->h, state->outlen); + memzero_explicit(state, sizeof(*state)); + + return 0; +} + +static struct shash_alg blake2s_algs[] = {{ + .base.cra_name = "blake2s-128", + .base.cra_driver_name = "blake2s-128-x86", + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, + .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), + .base.cra_priority = 200, + .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + + .digestsize = BLAKE2S_128_HASH_SIZE, + .setkey = crypto_blake2s_setkey, + .init = crypto_blake2s_init, + .update = crypto_blake2s_update, + .final = crypto_blake2s_final, + .descsize = sizeof(struct blake2s_state), +}, { + .base.cra_name = "blake2s-160", + .base.cra_driver_name = "blake2s-160-x86", + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, + .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), + .base.cra_priority = 200, + .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + + .digestsize = BLAKE2S_160_HASH_SIZE, + .setkey = crypto_blake2s_setkey, + .init = crypto_blake2s_init, + .update = crypto_blake2s_update, + .final = crypto_blake2s_final, + .descsize = sizeof(struct blake2s_state), +}, { + .base.cra_name = "blake2s-224", + .base.cra_driver_name = "blake2s-224-x86", + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, + .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), + .base.cra_priority = 200, + .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + + .digestsize = BLAKE2S_224_HASH_SIZE, + .setkey = crypto_blake2s_setkey, + .init = crypto_blake2s_init, + .update = crypto_blake2s_update, + .final = crypto_blake2s_final, + .descsize = sizeof(struct blake2s_state), +}, { + .base.cra_name = "blake2s-256", + .base.cra_driver_name = "blake2s-256-x86", + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, + .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), + .base.cra_priority = 200, + .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, + .base.cra_module = THIS_MODULE, + + .digestsize = BLAKE2S_256_HASH_SIZE, + .setkey = crypto_blake2s_setkey, + .init = crypto_blake2s_init, + .update = crypto_blake2s_update, + .final = crypto_blake2s_final, + .descsize = sizeof(struct blake2s_state), +}}; + +static int __init blake2s_mod_init(void) +{ + if (!boot_cpu_has(X86_FEATURE_SSSE3)) + return 0; + + static_branch_enable(&blake2s_use_ssse3); + + if (IS_ENABLED(CONFIG_AS_AVX512) && + boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && + boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512VL) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | + XFEATURE_MASK_AVX512, NULL)) + static_branch_enable(&blake2s_use_avx512); + + return crypto_register_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs)); +} + +static void __exit blake2s_mod_exit(void) +{ + if (boot_cpu_has(X86_FEATURE_SSSE3)) + crypto_unregister_shashes(blake2s_algs, ARRAY_SIZE(blake2s_algs)); +} + +module_init(blake2s_mod_init); +module_exit(blake2s_mod_exit); + +MODULE_ALIAS_CRYPTO("blake2s-128"); +MODULE_ALIAS_CRYPTO("blake2s-128-x86"); +MODULE_ALIAS_CRYPTO("blake2s-160"); +MODULE_ALIAS_CRYPTO("blake2s-160-x86"); +MODULE_ALIAS_CRYPTO("blake2s-224"); +MODULE_ALIAS_CRYPTO("blake2s-224-x86"); +MODULE_ALIAS_CRYPTO("blake2s-256"); +MODULE_ALIAS_CRYPTO("blake2s-256-x86"); +MODULE_LICENSE("GPL v2"); diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c index 388f95a4ec24..a94e30b6f941 100644 --- a/arch/x86/crypto/chacha_glue.c +++ b/arch/x86/crypto/chacha_glue.c @@ -7,7 +7,7 @@ */ #include <crypto/algapi.h> -#include <crypto/chacha.h> +#include <crypto/internal/chacha.h> #include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <linux/kernel.h> @@ -21,24 +21,24 @@ asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds); -#ifdef CONFIG_AS_AVX2 + asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); -static bool chacha_use_avx2; -#ifdef CONFIG_AS_AVX512 + asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src, unsigned int len, int nrounds); -static bool chacha_use_avx512vl; -#endif -#endif + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_simd); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx2); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(chacha_use_avx512vl); static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks) { @@ -49,9 +49,8 @@ static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks) static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, int nrounds) { -#ifdef CONFIG_AS_AVX2 -#ifdef CONFIG_AS_AVX512 - if (chacha_use_avx512vl) { + if (IS_ENABLED(CONFIG_AS_AVX512) && + static_branch_likely(&chacha_use_avx512vl)) { while (bytes >= CHACHA_BLOCK_SIZE * 8) { chacha_8block_xor_avx512vl(state, dst, src, bytes, nrounds); @@ -79,8 +78,9 @@ static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src, return; } } -#endif - if (chacha_use_avx2) { + + if (IS_ENABLED(CONFIG_AS_AVX2) && + static_branch_likely(&chacha_use_avx2)) { while (bytes >= CHACHA_BLOCK_SIZE * 8) { chacha_8block_xor_avx2(state, dst, src, bytes, nrounds); bytes -= CHACHA_BLOCK_SIZE * 8; @@ -104,7 +104,7 @@ static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src, return; } } -#endif + while (bytes >= CHACHA_BLOCK_SIZE * 4) { chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds); bytes -= CHACHA_BLOCK_SIZE * 4; @@ -123,37 +123,76 @@ static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src, } } -static int chacha_simd_stream_xor(struct skcipher_walk *walk, +void hchacha_block_arch(const u32 *state, u32 *stream, int nrounds) +{ + state = PTR_ALIGN(state, CHACHA_STATE_ALIGN); + + if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable()) { + hchacha_block_generic(state, stream, nrounds); + } else { + kernel_fpu_begin(); + hchacha_block_ssse3(state, stream, nrounds); + kernel_fpu_end(); + } +} +EXPORT_SYMBOL(hchacha_block_arch); + +void chacha_init_arch(u32 *state, const u32 *key, const u8 *iv) +{ + state = PTR_ALIGN(state, CHACHA_STATE_ALIGN); + + chacha_init_generic(state, key, iv); +} +EXPORT_SYMBOL(chacha_init_arch); + +void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, + int nrounds) +{ + state = PTR_ALIGN(state, CHACHA_STATE_ALIGN); + + if (!static_branch_likely(&chacha_use_simd) || !crypto_simd_usable() || + bytes <= CHACHA_BLOCK_SIZE) + return chacha_crypt_generic(state, dst, src, bytes, nrounds); + + kernel_fpu_begin(); + chacha_dosimd(state, dst, src, bytes, nrounds); + kernel_fpu_end(); +} +EXPORT_SYMBOL(chacha_crypt_arch); + +static int chacha_simd_stream_xor(struct skcipher_request *req, const struct chacha_ctx *ctx, const u8 *iv) { u32 *state, state_buf[16 + 2] __aligned(8); - int next_yield = 4096; /* bytes until next FPU yield */ - int err = 0; + struct skcipher_walk walk; + int err; + + err = skcipher_walk_virt(&walk, req, false); BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16); state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN); - crypto_chacha_init(state, ctx, iv); - - while (walk->nbytes > 0) { - unsigned int nbytes = walk->nbytes; + chacha_init_generic(state, ctx->key, iv); - if (nbytes < walk->total) { - nbytes = round_down(nbytes, walk->stride); - next_yield -= nbytes; - } + while (walk.nbytes > 0) { + unsigned int nbytes = walk.nbytes; - chacha_dosimd(state, walk->dst.virt.addr, walk->src.virt.addr, - nbytes, ctx->nrounds); + if (nbytes < walk.total) + nbytes = round_down(nbytes, walk.stride); - if (next_yield <= 0) { - /* temporarily allow preemption */ - kernel_fpu_end(); + if (!static_branch_likely(&chacha_use_simd) || + !crypto_simd_usable()) { + chacha_crypt_generic(state, walk.dst.virt.addr, + walk.src.virt.addr, nbytes, + ctx->nrounds); + } else { kernel_fpu_begin(); - next_yield = 4096; + chacha_dosimd(state, walk.dst.virt.addr, + walk.src.virt.addr, nbytes, + ctx->nrounds); + kernel_fpu_end(); } - - err = skcipher_walk_done(walk, walk->nbytes - nbytes); + err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } return err; @@ -163,55 +202,34 @@ static int chacha_simd(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; - int err; - - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable()) - return crypto_chacha_crypt(req); - err = skcipher_walk_virt(&walk, req, true); - if (err) - return err; - - kernel_fpu_begin(); - err = chacha_simd_stream_xor(&walk, ctx, req->iv); - kernel_fpu_end(); - return err; + return chacha_simd_stream_xor(req, ctx, req->iv); } static int xchacha_simd(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - struct skcipher_walk walk; - struct chacha_ctx subctx; u32 *state, state_buf[16 + 2] __aligned(8); + struct chacha_ctx subctx; u8 real_iv[16]; - int err; - - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable()) - return crypto_xchacha_crypt(req); - - err = skcipher_walk_virt(&walk, req, true); - if (err) - return err; BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16); state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN); - crypto_chacha_init(state, ctx, req->iv); - - kernel_fpu_begin(); - - hchacha_block_ssse3(state, subctx.key, ctx->nrounds); + chacha_init_generic(state, ctx->key, req->iv); + + if (req->cryptlen > CHACHA_BLOCK_SIZE && crypto_simd_usable()) { + kernel_fpu_begin(); + hchacha_block_ssse3(state, subctx.key, ctx->nrounds); + kernel_fpu_end(); + } else { + hchacha_block_generic(state, subctx.key, ctx->nrounds); + } subctx.nrounds = ctx->nrounds; memcpy(&real_iv[0], req->iv + 24, 8); memcpy(&real_iv[8], req->iv + 16, 8); - err = chacha_simd_stream_xor(&walk, &subctx, real_iv); - - kernel_fpu_end(); - - return err; + return chacha_simd_stream_xor(req, &subctx, real_iv); } static struct skcipher_alg algs[] = { @@ -227,7 +245,7 @@ static struct skcipher_alg algs[] = { .max_keysize = CHACHA_KEY_SIZE, .ivsize = CHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, - .setkey = crypto_chacha20_setkey, + .setkey = chacha20_setkey, .encrypt = chacha_simd, .decrypt = chacha_simd, }, { @@ -242,7 +260,7 @@ static struct skcipher_alg algs[] = { .max_keysize = CHACHA_KEY_SIZE, .ivsize = XCHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, - .setkey = crypto_chacha20_setkey, + .setkey = chacha20_setkey, .encrypt = xchacha_simd, .decrypt = xchacha_simd, }, { @@ -257,7 +275,7 @@ static struct skcipher_alg algs[] = { .max_keysize = CHACHA_KEY_SIZE, .ivsize = XCHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, - .setkey = crypto_chacha12_setkey, + .setkey = chacha12_setkey, .encrypt = xchacha_simd, .decrypt = xchacha_simd, }, @@ -266,24 +284,28 @@ static struct skcipher_alg algs[] = { static int __init chacha_simd_mod_init(void) { if (!boot_cpu_has(X86_FEATURE_SSSE3)) - return -ENODEV; - -#ifdef CONFIG_AS_AVX2 - chacha_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) && - boot_cpu_has(X86_FEATURE_AVX2) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); -#ifdef CONFIG_AS_AVX512 - chacha_use_avx512vl = chacha_use_avx2 && - boot_cpu_has(X86_FEATURE_AVX512VL) && - boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */ -#endif -#endif + return 0; + + static_branch_enable(&chacha_use_simd); + + if (IS_ENABLED(CONFIG_AS_AVX2) && + boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) { + static_branch_enable(&chacha_use_avx2); + + if (IS_ENABLED(CONFIG_AS_AVX512) && + boot_cpu_has(X86_FEATURE_AVX512VL) && + boot_cpu_has(X86_FEATURE_AVX512BW)) /* kmovq */ + static_branch_enable(&chacha_use_avx512vl); + } return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); } static void __exit chacha_simd_mod_fini(void) { - crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); + if (boot_cpu_has(X86_FEATURE_SSSE3)) + crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); } module_init(chacha_simd_mod_init); diff --git a/arch/x86/crypto/curve25519-x86_64.c b/arch/x86/crypto/curve25519-x86_64.c new file mode 100644 index 000000000000..a52a3fb15727 --- /dev/null +++ b/arch/x86/crypto/curve25519-x86_64.c @@ -0,0 +1,2475 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +/* + * Copyright (c) 2017 Armando Faz <armfazh@ic.unicamp.br>. All Rights Reserved. + * Copyright (C) 2018-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. + * Copyright (C) 2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved. + */ + +#include <crypto/curve25519.h> +#include <crypto/internal/kpp.h> + +#include <linux/types.h> +#include <linux/jump_label.h> +#include <linux/kernel.h> +#include <linux/module.h> + +#include <asm/cpufeature.h> +#include <asm/processor.h> + +static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_bmi2); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(curve25519_use_adx); + +enum { NUM_WORDS_ELTFP25519 = 4 }; +typedef __aligned(32) u64 eltfp25519_1w[NUM_WORDS_ELTFP25519]; +typedef __aligned(32) u64 eltfp25519_1w_buffer[2 * NUM_WORDS_ELTFP25519]; + +#define mul_eltfp25519_1w_adx(c, a, b) do { \ + mul_256x256_integer_adx(m.buffer, a, b); \ + red_eltfp25519_1w_adx(c, m.buffer); \ +} while (0) + +#define mul_eltfp25519_1w_bmi2(c, a, b) do { \ + mul_256x256_integer_bmi2(m.buffer, a, b); \ + red_eltfp25519_1w_bmi2(c, m.buffer); \ +} while (0) + +#define sqr_eltfp25519_1w_adx(a) do { \ + sqr_256x256_integer_adx(m.buffer, a); \ + red_eltfp25519_1w_adx(a, m.buffer); \ +} while (0) + +#define sqr_eltfp25519_1w_bmi2(a) do { \ + sqr_256x256_integer_bmi2(m.buffer, a); \ + red_eltfp25519_1w_bmi2(a, m.buffer); \ +} while (0) + +#define mul_eltfp25519_2w_adx(c, a, b) do { \ + mul2_256x256_integer_adx(m.buffer, a, b); \ + red_eltfp25519_2w_adx(c, m.buffer); \ +} while (0) + +#define mul_eltfp25519_2w_bmi2(c, a, b) do { \ + mul2_256x256_integer_bmi2(m.buffer, a, b); \ + red_eltfp25519_2w_bmi2(c, m.buffer); \ +} while (0) + +#define sqr_eltfp25519_2w_adx(a) do { \ + sqr2_256x256_integer_adx(m.buffer, a); \ + red_eltfp25519_2w_adx(a, m.buffer); \ +} while (0) + +#define sqr_eltfp25519_2w_bmi2(a) do { \ + sqr2_256x256_integer_bmi2(m.buffer, a); \ + red_eltfp25519_2w_bmi2(a, m.buffer); \ +} while (0) + +#define sqrn_eltfp25519_1w_adx(a, times) do { \ + int ____counter = (times); \ + while (____counter-- > 0) \ + sqr_eltfp25519_1w_adx(a); \ +} while (0) + +#define sqrn_eltfp25519_1w_bmi2(a, times) do { \ + int ____counter = (times); \ + while (____counter-- > 0) \ + sqr_eltfp25519_1w_bmi2(a); \ +} while (0) + +#define copy_eltfp25519_1w(C, A) do { \ + (C)[0] = (A)[0]; \ + (C)[1] = (A)[1]; \ + (C)[2] = (A)[2]; \ + (C)[3] = (A)[3]; \ +} while (0) + +#define setzero_eltfp25519_1w(C) do { \ + (C)[0] = 0; \ + (C)[1] = 0; \ + (C)[2] = 0; \ + (C)[3] = 0; \ +} while (0) + +__aligned(32) static const u64 table_ladder_8k[252 * NUM_WORDS_ELTFP25519] = { + /* 1 */ 0xfffffffffffffff3UL, 0xffffffffffffffffUL, + 0xffffffffffffffffUL, 0x5fffffffffffffffUL, + /* 2 */ 0x6b8220f416aafe96UL, 0x82ebeb2b4f566a34UL, + 0xd5a9a5b075a5950fUL, 0x5142b2cf4b2488f4UL, + /* 3 */ 0x6aaebc750069680cUL, 0x89cf7820a0f99c41UL, + 0x2a58d9183b56d0f4UL, 0x4b5aca80e36011a4UL, + /* 4 */ 0x329132348c29745dUL, 0xf4a2e616e1642fd7UL, + 0x1e45bb03ff67bc34UL, 0x306912d0f42a9b4aUL, + /* 5 */ 0xff886507e6af7154UL, 0x04f50e13dfeec82fUL, + 0xaa512fe82abab5ceUL, 0x174e251a68d5f222UL, + /* 6 */ 0xcf96700d82028898UL, 0x1743e3370a2c02c5UL, + 0x379eec98b4e86eaaUL, 0x0c59888a51e0482eUL, + /* 7 */ 0xfbcbf1d699b5d189UL, 0xacaef0d58e9fdc84UL, + 0xc1c20d06231f7614UL, 0x2938218da274f972UL, + /* 8 */ 0xf6af49beff1d7f18UL, 0xcc541c22387ac9c2UL, + 0x96fcc9ef4015c56bUL, 0x69c1627c690913a9UL, + /* 9 */ 0x7a86fd2f4733db0eUL, 0xfdb8c4f29e087de9UL, + 0x095e4b1a8ea2a229UL, 0x1ad7a7c829b37a79UL, + /* 10 */ 0x342d89cad17ea0c0UL, 0x67bedda6cced2051UL, + 0x19ca31bf2bb42f74UL, 0x3df7b4c84980acbbUL, + /* 11 */ 0xa8c6444dc80ad883UL, 0xb91e440366e3ab85UL, + 0xc215cda00164f6d8UL, 0x3d867c6ef247e668UL, + /* 12 */ 0xc7dd582bcc3e658cUL, 0xfd2c4748ee0e5528UL, + 0xa0fd9b95cc9f4f71UL, 0x7529d871b0675ddfUL, + /* 13 */ 0xb8f568b42d3cbd78UL, 0x1233011b91f3da82UL, + 0x2dce6ccd4a7c3b62UL, 0x75e7fc8e9e498603UL, + /* 14 */ 0x2f4f13f1fcd0b6ecUL, 0xf1a8ca1f29ff7a45UL, + 0xc249c1a72981e29bUL, 0x6ebe0dbb8c83b56aUL, + /* 15 */ 0x7114fa8d170bb222UL, 0x65a2dcd5bf93935fUL, + 0xbdc41f68b59c979aUL, 0x2f0eef79a2ce9289UL, + /* 16 */ 0x42ecbf0c083c37ceUL, 0x2930bc09ec496322UL, + 0xf294b0c19cfeac0dUL, 0x3780aa4bedfabb80UL, + /* 17 */ 0x56c17d3e7cead929UL, 0xe7cb4beb2e5722c5UL, + 0x0ce931732dbfe15aUL, 0x41b883c7621052f8UL, + /* 18 */ 0xdbf75ca0c3d25350UL, 0x2936be086eb1e351UL, + 0xc936e03cb4a9b212UL, 0x1d45bf82322225aaUL, + /* 19 */ 0xe81ab1036a024cc5UL, 0xe212201c304c9a72UL, + 0xc5d73fba6832b1fcUL, 0x20ffdb5a4d839581UL, + /* 20 */ 0xa283d367be5d0fadUL, 0x6c2b25ca8b164475UL, + 0x9d4935467caaf22eUL, 0x5166408eee85ff49UL, + /* 21 */ 0x3c67baa2fab4e361UL, 0xb3e433c67ef35cefUL, + 0x5259729241159b1cUL, 0x6a621892d5b0ab33UL, + /* 22 */ 0x20b74a387555cdcbUL, 0x532aa10e1208923fUL, + 0xeaa17b7762281dd1UL, 0x61ab3443f05c44bfUL, + /* 23 */ 0x257a6c422324def8UL, 0x131c6c1017e3cf7fUL, + 0x23758739f630a257UL, 0x295a407a01a78580UL, + /* 24 */ 0xf8c443246d5da8d9UL, 0x19d775450c52fa5dUL, + 0x2afcfc92731bf83dUL, 0x7d10c8e81b2b4700UL, + /* 25 */ 0xc8e0271f70baa20bUL, 0x993748867ca63957UL, + 0x5412efb3cb7ed4bbUL, 0x3196d36173e62975UL, + /* 26 */ 0xde5bcad141c7dffcUL, 0x47cc8cd2b395c848UL, + 0xa34cd942e11af3cbUL, 0x0256dbf2d04ecec2UL, + /* 27 */ 0x875ab7e94b0e667fUL, 0xcad4dd83c0850d10UL, + 0x47f12e8f4e72c79fUL, 0x5f1a87bb8c85b19bUL, + /* 28 */ 0x7ae9d0b6437f51b8UL, 0x12c7ce5518879065UL, + 0x2ade09fe5cf77aeeUL, 0x23a05a2f7d2c5627UL, + /* 29 */ 0x5908e128f17c169aUL, 0xf77498dd8ad0852dUL, + 0x74b4c4ceab102f64UL, 0x183abadd10139845UL, + /* 30 */ 0xb165ba8daa92aaacUL, 0xd5c5ef9599386705UL, + 0xbe2f8f0cf8fc40d1UL, 0x2701e635ee204514UL, + /* 31 */ 0x629fa80020156514UL, 0xf223868764a8c1ceUL, + 0x5b894fff0b3f060eUL, 0x60d9944cf708a3faUL, + /* 32 */ 0xaeea001a1c7a201fUL, 0xebf16a633ee2ce63UL, + 0x6f7709594c7a07e1UL, 0x79b958150d0208cbUL, + /* 33 */ 0x24b55e5301d410e7UL, 0xe3a34edff3fdc84dUL, + 0xd88768e4904032d8UL, 0x131384427b3aaeecUL, + /* 34 */ 0x8405e51286234f14UL, 0x14dc4739adb4c529UL, + 0xb8a2b5b250634ffdUL, 0x2fe2a94ad8a7ff93UL, + /* 35 */ 0xec5c57efe843faddUL, 0x2843ce40f0bb9918UL, + 0xa4b561d6cf3d6305UL, 0x743629bde8fb777eUL, + /* 36 */ 0x343edd46bbaf738fUL, 0xed981828b101a651UL, + 0xa401760b882c797aUL, 0x1fc223e28dc88730UL, + /* 37 */ 0x48604e91fc0fba0eUL, 0xb637f78f052c6fa4UL, + 0x91ccac3d09e9239cUL, 0x23f7eed4437a687cUL, + /* 38 */ 0x5173b1118d9bd800UL, 0x29d641b63189d4a7UL, + 0xfdbf177988bbc586UL, 0x2959894fcad81df5UL, + /* 39 */ 0xaebc8ef3b4bbc899UL, 0x4148995ab26992b9UL, + 0x24e20b0134f92cfbUL, 0x40d158894a05dee8UL, + /* 40 */ 0x46b00b1185af76f6UL, 0x26bac77873187a79UL, + 0x3dc0bf95ab8fff5fUL, 0x2a608bd8945524d7UL, + /* 41 */ 0x26449588bd446302UL, 0x7c4bc21c0388439cUL, + 0x8e98a4f383bd11b2UL, 0x26218d7bc9d876b9UL, + /* 42 */ 0xe3081542997c178aUL, 0x3c2d29a86fb6606fUL, + 0x5c217736fa279374UL, 0x7dde05734afeb1faUL, + /* 43 */ 0x3bf10e3906d42babUL, 0xe4f7803e1980649cUL, + 0xe6053bf89595bf7aUL, 0x394faf38da245530UL, + /* 44 */ 0x7a8efb58896928f4UL, 0xfbc778e9cc6a113cUL, + 0x72670ce330af596fUL, 0x48f222a81d3d6cf7UL, + /* 45 */ 0xf01fce410d72caa7UL, 0x5a20ecc7213b5595UL, + 0x7bc21165c1fa1483UL, 0x07f89ae31da8a741UL, + /* 46 */ 0x05d2c2b4c6830ff9UL, 0xd43e330fc6316293UL, + 0xa5a5590a96d3a904UL, 0x705edb91a65333b6UL, + /* 47 */ 0x048ee15e0bb9a5f7UL, 0x3240cfca9e0aaf5dUL, + 0x8f4b71ceedc4a40bUL, 0x621c0da3de544a6dUL, + /* 48 */ 0x92872836a08c4091UL, 0xce8375b010c91445UL, + 0x8a72eb524f276394UL, 0x2667fcfa7ec83635UL, + /* 49 */ 0x7f4c173345e8752aUL, 0x061b47feee7079a5UL, + 0x25dd9afa9f86ff34UL, 0x3780cef5425dc89cUL, + /* 50 */ 0x1a46035a513bb4e9UL, 0x3e1ef379ac575adaUL, + 0xc78c5f1c5fa24b50UL, 0x321a967634fd9f22UL, + /* 51 */ 0x946707b8826e27faUL, 0x3dca84d64c506fd0UL, + 0xc189218075e91436UL, 0x6d9284169b3b8484UL, + /* 52 */ 0x3a67e840383f2ddfUL, 0x33eec9a30c4f9b75UL, + 0x3ec7c86fa783ef47UL, 0x26ec449fbac9fbc4UL, + /* 53 */ 0x5c0f38cba09b9e7dUL, 0x81168cc762a3478cUL, + 0x3e23b0d306fc121cUL, 0x5a238aa0a5efdcddUL, + /* 54 */ 0x1ba26121c4ea43ffUL, 0x36f8c77f7c8832b5UL, + 0x88fbea0b0adcf99aUL, 0x5ca9938ec25bebf9UL, + /* 55 */ 0xd5436a5e51fccda0UL, 0x1dbc4797c2cd893bUL, + 0x19346a65d3224a08UL, 0x0f5034e49b9af466UL, + /* 56 */ 0xf23c3967a1e0b96eUL, 0xe58b08fa867a4d88UL, + 0xfb2fabc6a7341679UL, 0x2a75381eb6026946UL, + /* 57 */ 0xc80a3be4c19420acUL, 0x66b1f6c681f2b6dcUL, + 0x7cf7036761e93388UL, 0x25abbbd8a660a4c4UL, + /* 58 */ 0x91ea12ba14fd5198UL, 0x684950fc4a3cffa9UL, + 0xf826842130f5ad28UL, 0x3ea988f75301a441UL, + /* 59 */ 0xc978109a695f8c6fUL, 0x1746eb4a0530c3f3UL, + 0x444d6d77b4459995UL, 0x75952b8c054e5cc7UL, + /* 60 */ 0xa3703f7915f4d6aaUL, 0x66c346202f2647d8UL, + 0xd01469df811d644bUL, 0x77fea47d81a5d71fUL, + /* 61 */ 0xc5e9529ef57ca381UL, 0x6eeeb4b9ce2f881aUL, + 0xb6e91a28e8009bd6UL, 0x4b80be3e9afc3fecUL, + /* 62 */ 0x7e3773c526aed2c5UL, 0x1b4afcb453c9a49dUL, + 0xa920bdd7baffb24dUL, 0x7c54699f122d400eUL, + /* 63 */ 0xef46c8e14fa94bc8UL, 0xe0b074ce2952ed5eUL, + 0xbea450e1dbd885d5UL, 0x61b68649320f712cUL, + /* 64 */ 0x8a485f7309ccbdd1UL, 0xbd06320d7d4d1a2dUL, + 0x25232973322dbef4UL, 0x445dc4758c17f770UL, + /* 65 */ 0xdb0434177cc8933cUL, 0xed6fe82175ea059fUL, + 0x1efebefdc053db34UL, 0x4adbe867c65daf99UL, + /* 66 */ 0x3acd71a2a90609dfUL, 0xe5e991856dd04050UL, + 0x1ec69b688157c23cUL, 0x697427f6885cfe4dUL, + /* 67 */ 0xd7be7b9b65e1a851UL, 0xa03d28d522c536ddUL, + 0x28399d658fd2b645UL, 0x49e5b7e17c2641e1UL, + /* 68 */ 0x6f8c3a98700457a4UL, 0x5078f0a25ebb6778UL, + 0xd13c3ccbc382960fUL, 0x2e003258a7df84b1UL, + /* 69 */ 0x8ad1f39be6296a1cUL, 0xc1eeaa652a5fbfb2UL, + 0x33ee0673fd26f3cbUL, 0x59256173a69d2cccUL, + /* 70 */ 0x41ea07aa4e18fc41UL, 0xd9fc19527c87a51eUL, + 0xbdaacb805831ca6fUL, 0x445b652dc916694fUL, + /* 71 */ 0xce92a3a7f2172315UL, 0x1edc282de11b9964UL, + 0xa1823aafe04c314aUL, 0x790a2d94437cf586UL, + /* 72 */ 0x71c447fb93f6e009UL, 0x8922a56722845276UL, + 0xbf70903b204f5169UL, 0x2f7a89891ba319feUL, + /* 73 */ 0x02a08eb577e2140cUL, 0xed9a4ed4427bdcf4UL, + 0x5253ec44e4323cd1UL, 0x3e88363c14e9355bUL, + /* 74 */ 0xaa66c14277110b8cUL, 0x1ae0391610a23390UL, + 0x2030bd12c93fc2a2UL, 0x3ee141579555c7abUL, + /* 75 */ 0x9214de3a6d6e7d41UL, 0x3ccdd88607f17efeUL, + 0x674f1288f8e11217UL, 0x5682250f329f93d0UL, + /* 76 */ 0x6cf00b136d2e396eUL, 0x6e4cf86f1014debfUL, + 0x5930b1b5bfcc4e83UL, 0x047069b48aba16b6UL, + /* 77 */ 0x0d4ce4ab69b20793UL, 0xb24db91a97d0fb9eUL, + 0xcdfa50f54e00d01dUL, 0x221b1085368bddb5UL, + /* 78 */ 0xe7e59468b1e3d8d2UL, 0x53c56563bd122f93UL, + 0xeee8a903e0663f09UL, 0x61efa662cbbe3d42UL, + /* 79 */ 0x2cf8ddddde6eab2aUL, 0x9bf80ad51435f231UL, + 0x5deadacec9f04973UL, 0x29275b5d41d29b27UL, + /* 80 */ 0xcfde0f0895ebf14fUL, 0xb9aab96b054905a7UL, + 0xcae80dd9a1c420fdUL, 0x0a63bf2f1673bbc7UL, + /* 81 */ 0x092f6e11958fbc8cUL, 0x672a81e804822fadUL, + 0xcac8351560d52517UL, 0x6f3f7722c8f192f8UL, + /* 82 */ 0xf8ba90ccc2e894b7UL, 0x2c7557a438ff9f0dUL, + 0x894d1d855ae52359UL, 0x68e122157b743d69UL, + /* 83 */ 0xd87e5570cfb919f3UL, 0x3f2cdecd95798db9UL, + 0x2121154710c0a2ceUL, 0x3c66a115246dc5b2UL, + /* 84 */ 0xcbedc562294ecb72UL, 0xba7143c36a280b16UL, + 0x9610c2efd4078b67UL, 0x6144735d946a4b1eUL, + /* 85 */ 0x536f111ed75b3350UL, 0x0211db8c2041d81bUL, + 0xf93cb1000e10413cUL, 0x149dfd3c039e8876UL, + /* 86 */ 0xd479dde46b63155bUL, 0xb66e15e93c837976UL, + 0xdafde43b1f13e038UL, 0x5fafda1a2e4b0b35UL, + /* 87 */ 0x3600bbdf17197581UL, 0x3972050bbe3cd2c2UL, + 0x5938906dbdd5be86UL, 0x34fce5e43f9b860fUL, + /* 88 */ 0x75a8a4cd42d14d02UL, 0x828dabc53441df65UL, + 0x33dcabedd2e131d3UL, 0x3ebad76fb814d25fUL, + /* 89 */ 0xd4906f566f70e10fUL, 0x5d12f7aa51690f5aUL, + 0x45adb16e76cefcf2UL, 0x01f768aead232999UL, + /* 90 */ 0x2b6cc77b6248febdUL, 0x3cd30628ec3aaffdUL, + 0xce1c0b80d4ef486aUL, 0x4c3bff2ea6f66c23UL, + /* 91 */ 0x3f2ec4094aeaeb5fUL, 0x61b19b286e372ca7UL, + 0x5eefa966de2a701dUL, 0x23b20565de55e3efUL, + /* 92 */ 0xe301ca5279d58557UL, 0x07b2d4ce27c2874fUL, + 0xa532cd8a9dcf1d67UL, 0x2a52fee23f2bff56UL, + /* 93 */ 0x8624efb37cd8663dUL, 0xbbc7ac20ffbd7594UL, + 0x57b85e9c82d37445UL, 0x7b3052cb86a6ec66UL, + /* 94 */ 0x3482f0ad2525e91eUL, 0x2cb68043d28edca0UL, + 0xaf4f6d052e1b003aUL, 0x185f8c2529781b0aUL, + /* 95 */ 0xaa41de5bd80ce0d6UL, 0x9407b2416853e9d6UL, + 0x563ec36e357f4c3aUL, 0x4cc4b8dd0e297bceUL, + /* 96 */ 0xa2fc1a52ffb8730eUL, 0x1811f16e67058e37UL, + 0x10f9a366cddf4ee1UL, 0x72f4a0c4a0b9f099UL, + /* 97 */ 0x8c16c06f663f4ea7UL, 0x693b3af74e970fbaUL, + 0x2102e7f1d69ec345UL, 0x0ba53cbc968a8089UL, + /* 98 */ 0xca3d9dc7fea15537UL, 0x4c6824bb51536493UL, + 0xb9886314844006b1UL, 0x40d2a72ab454cc60UL, + /* 99 */ 0x5936a1b712570975UL, 0x91b9d648debda657UL, + 0x3344094bb64330eaUL, 0x006ba10d12ee51d0UL, + /* 100 */ 0x19228468f5de5d58UL, 0x0eb12f4c38cc05b0UL, + 0xa1039f9dd5601990UL, 0x4502d4ce4fff0e0bUL, + /* 101 */ 0xeb2054106837c189UL, 0xd0f6544c6dd3b93cUL, + 0x40727064c416d74fUL, 0x6e15c6114b502ef0UL, + /* 102 */ 0x4df2a398cfb1a76bUL, 0x11256c7419f2f6b1UL, + 0x4a497962066e6043UL, 0x705b3aab41355b44UL, + /* 103 */ 0x365ef536d797b1d8UL, 0x00076bd622ddf0dbUL, + 0x3bbf33b0e0575a88UL, 0x3777aa05c8e4ca4dUL, + /* 104 */ 0x392745c85578db5fUL, 0x6fda4149dbae5ae2UL, + 0xb1f0b00b8adc9867UL, 0x09963437d36f1da3UL, + /* 105 */ 0x7e824e90a5dc3853UL, 0xccb5f6641f135cbdUL, + 0x6736d86c87ce8fccUL, 0x625f3ce26604249fUL, + /* 106 */ 0xaf8ac8059502f63fUL, 0x0c05e70a2e351469UL, + 0x35292e9c764b6305UL, 0x1a394360c7e23ac3UL, + /* 107 */ 0xd5c6d53251183264UL, 0x62065abd43c2b74fUL, + 0xb5fbf5d03b973f9bUL, 0x13a3da3661206e5eUL, + /* 108 */ 0xc6bd5837725d94e5UL, 0x18e30912205016c5UL, + 0x2088ce1570033c68UL, 0x7fba1f495c837987UL, + /* 109 */ 0x5a8c7423f2f9079dUL, 0x1735157b34023fc5UL, + 0xe4f9b49ad2fab351UL, 0x6691ff72c878e33cUL, + /* 110 */ 0x122c2adedc5eff3eUL, 0xf8dd4bf1d8956cf4UL, + 0xeb86205d9e9e5bdaUL, 0x049b92b9d975c743UL, + /* 111 */ 0xa5379730b0f6c05aUL, 0x72a0ffacc6f3a553UL, + 0xb0032c34b20dcd6dUL, 0x470e9dbc88d5164aUL, + /* 112 */ 0xb19cf10ca237c047UL, 0xb65466711f6c81a2UL, + 0xb3321bd16dd80b43UL, 0x48c14f600c5fbe8eUL, + /* 113 */ 0x66451c264aa6c803UL, 0xb66e3904a4fa7da6UL, + 0xd45f19b0b3128395UL, 0x31602627c3c9bc10UL, + /* 114 */ 0x3120dc4832e4e10dUL, 0xeb20c46756c717f7UL, + 0x00f52e3f67280294UL, 0x566d4fc14730c509UL, + /* 115 */ 0x7e3a5d40fd837206UL, 0xc1e926dc7159547aUL, + 0x216730fba68d6095UL, 0x22e8c3843f69cea7UL, + /* 116 */ 0x33d074e8930e4b2bUL, 0xb6e4350e84d15816UL, + 0x5534c26ad6ba2365UL, 0x7773c12f89f1f3f3UL, + /* 117 */ 0x8cba404da57962aaUL, 0x5b9897a81999ce56UL, + 0x508e862f121692fcUL, 0x3a81907fa093c291UL, + /* 118 */ 0x0dded0ff4725a510UL, 0x10d8cc10673fc503UL, + 0x5b9d151c9f1f4e89UL, 0x32a5c1d5cb09a44cUL, + /* 119 */ 0x1e0aa442b90541fbUL, 0x5f85eb7cc1b485dbUL, + 0xbee595ce8a9df2e5UL, 0x25e496c722422236UL, + /* 120 */ 0x5edf3c46cd0fe5b9UL, 0x34e75a7ed2a43388UL, + 0xe488de11d761e352UL, 0x0e878a01a085545cUL, + /* 121 */ 0xba493c77e021bb04UL, 0x2b4d1843c7df899aUL, + 0x9ea37a487ae80d67UL, 0x67a9958011e41794UL, + /* 122 */ 0x4b58051a6697b065UL, 0x47e33f7d8d6ba6d4UL, + 0xbb4da8d483ca46c1UL, 0x68becaa181c2db0dUL, + /* 123 */ 0x8d8980e90b989aa5UL, 0xf95eb14a2c93c99bUL, + 0x51c6c7c4796e73a2UL, 0x6e228363b5efb569UL, + /* 124 */ 0xc6bbc0b02dd624c8UL, 0x777eb47dec8170eeUL, + 0x3cde15a004cfafa9UL, 0x1dc6bc087160bf9bUL, + /* 125 */ 0x2e07e043eec34002UL, 0x18e9fc677a68dc7fUL, + 0xd8da03188bd15b9aUL, 0x48fbc3bb00568253UL, + /* 126 */ 0x57547d4cfb654ce1UL, 0xd3565b82a058e2adUL, + 0xf63eaf0bbf154478UL, 0x47531ef114dfbb18UL, + /* 127 */ 0xe1ec630a4278c587UL, 0x5507d546ca8e83f3UL, + 0x85e135c63adc0c2bUL, 0x0aa7efa85682844eUL, + /* 128 */ 0x72691ba8b3e1f615UL, 0x32b4e9701fbe3ffaUL, + 0x97b6d92e39bb7868UL, 0x2cfe53dea02e39e8UL, + /* 129 */ 0x687392cd85cd52b0UL, 0x27ff66c910e29831UL, + 0x97134556a9832d06UL, 0x269bb0360a84f8a0UL, + /* 130 */ 0x706e55457643f85cUL, 0x3734a48c9b597d1bUL, + 0x7aee91e8c6efa472UL, 0x5cd6abc198a9d9e0UL, + /* 131 */ 0x0e04de06cb3ce41aUL, 0xd8c6eb893402e138UL, + 0x904659bb686e3772UL, 0x7215c371746ba8c8UL, + /* 132 */ 0xfd12a97eeae4a2d9UL, 0x9514b7516394f2c5UL, + 0x266fd5809208f294UL, 0x5c847085619a26b9UL, + /* 133 */ 0x52985410fed694eaUL, 0x3c905b934a2ed254UL, + 0x10bb47692d3be467UL, 0x063b3d2d69e5e9e1UL, + /* 134 */ 0x472726eedda57debUL, 0xefb6c4ae10f41891UL, + 0x2b1641917b307614UL, 0x117c554fc4f45b7cUL, + /* 135 */ 0xc07cf3118f9d8812UL, 0x01dbd82050017939UL, + 0xd7e803f4171b2827UL, 0x1015e87487d225eaUL, + /* 136 */ 0xc58de3fed23acc4dUL, 0x50db91c294a7be2dUL, + 0x0b94d43d1c9cf457UL, 0x6b1640fa6e37524aUL, + /* 137 */ 0x692f346c5fda0d09UL, 0x200b1c59fa4d3151UL, + 0xb8c46f760777a296UL, 0x4b38395f3ffdfbcfUL, + /* 138 */ 0x18d25e00be54d671UL, 0x60d50582bec8aba6UL, + 0x87ad8f263b78b982UL, 0x50fdf64e9cda0432UL, + /* 139 */ 0x90f567aac578dcf0UL, 0xef1e9b0ef2a3133bUL, + 0x0eebba9242d9de71UL, 0x15473c9bf03101c7UL, + /* 140 */ 0x7c77e8ae56b78095UL, 0xb678e7666e6f078eUL, + 0x2da0b9615348ba1fUL, 0x7cf931c1ff733f0bUL, + /* 141 */ 0x26b357f50a0a366cUL, 0xe9708cf42b87d732UL, + 0xc13aeea5f91cb2c0UL, 0x35d90c991143bb4cUL, + /* 142 */ 0x47c1c404a9a0d9dcUL, 0x659e58451972d251UL, + 0x3875a8c473b38c31UL, 0x1fbd9ed379561f24UL, + /* 143 */ 0x11fabc6fd41ec28dUL, 0x7ef8dfe3cd2a2dcaUL, + 0x72e73b5d8c404595UL, 0x6135fa4954b72f27UL, + /* 144 */ 0xccfc32a2de24b69cUL, 0x3f55698c1f095d88UL, + 0xbe3350ed5ac3f929UL, 0x5e9bf806ca477eebUL, + /* 145 */ 0xe9ce8fb63c309f68UL, 0x5376f63565e1f9f4UL, + 0xd1afcfb35a6393f1UL, 0x6632a1ede5623506UL, + /* 146 */ 0x0b7d6c390c2ded4cUL, 0x56cb3281df04cb1fUL, + 0x66305a1249ecc3c7UL, 0x5d588b60a38ca72aUL, + /* 147 */ 0xa6ecbf78e8e5f42dUL, 0x86eeb44b3c8a3eecUL, + 0xec219c48fbd21604UL, 0x1aaf1af517c36731UL, + /* 148 */ 0xc306a2836769bde7UL, 0x208280622b1e2adbUL, + 0x8027f51ffbff94a6UL, 0x76cfa1ce1124f26bUL, + /* 149 */ 0x18eb00562422abb6UL, 0xf377c4d58f8c29c3UL, + 0x4dbbc207f531561aUL, 0x0253b7f082128a27UL, + /* 150 */ 0x3d1f091cb62c17e0UL, 0x4860e1abd64628a9UL, + 0x52d17436309d4253UL, 0x356f97e13efae576UL, + /* 151 */ 0xd351e11aa150535bUL, 0x3e6b45bb1dd878ccUL, + 0x0c776128bed92c98UL, 0x1d34ae93032885b8UL, + /* 152 */ 0x4ba0488ca85ba4c3UL, 0x985348c33c9ce6ceUL, + 0x66124c6f97bda770UL, 0x0f81a0290654124aUL, + /* 153 */ 0x9ed09ca6569b86fdUL, 0x811009fd18af9a2dUL, + 0xff08d03f93d8c20aUL, 0x52a148199faef26bUL, + /* 154 */ 0x3e03f9dc2d8d1b73UL, 0x4205801873961a70UL, + 0xc0d987f041a35970UL, 0x07aa1f15a1c0d549UL, + /* 155 */ 0xdfd46ce08cd27224UL, 0x6d0a024f934e4239UL, + 0x808a7a6399897b59UL, 0x0a4556e9e13d95a2UL, + /* 156 */ 0xd21a991fe9c13045UL, 0x9b0e8548fe7751b8UL, + 0x5da643cb4bf30035UL, 0x77db28d63940f721UL, + /* 157 */ 0xfc5eeb614adc9011UL, 0x5229419ae8c411ebUL, + 0x9ec3e7787d1dcf74UL, 0x340d053e216e4cb5UL, + /* 158 */ 0xcac7af39b48df2b4UL, 0xc0faec2871a10a94UL, + 0x140a69245ca575edUL, 0x0cf1c37134273a4cUL, + /* 159 */ 0xc8ee306ac224b8a5UL, 0x57eaee7ccb4930b0UL, + 0xa1e806bdaacbe74fUL, 0x7d9a62742eeb657dUL, + /* 160 */ 0x9eb6b6ef546c4830UL, 0x885cca1fddb36e2eUL, + 0xe6b9f383ef0d7105UL, 0x58654fef9d2e0412UL, + /* 161 */ 0xa905c4ffbe0e8e26UL, 0x942de5df9b31816eUL, + 0x497d723f802e88e1UL, 0x30684dea602f408dUL, + /* 162 */ 0x21e5a278a3e6cb34UL, 0xaefb6e6f5b151dc4UL, + 0xb30b8e049d77ca15UL, 0x28c3c9cf53b98981UL, + /* 163 */ 0x287fb721556cdd2aUL, 0x0d317ca897022274UL, + 0x7468c7423a543258UL, 0x4a7f11464eb5642fUL, + /* 164 */ 0xa237a4774d193aa6UL, 0xd865986ea92129a1UL, + 0x24c515ecf87c1a88UL, 0x604003575f39f5ebUL, + /* 165 */ 0x47b9f189570a9b27UL, 0x2b98cede465e4b78UL, + 0x026df551dbb85c20UL, 0x74fcd91047e21901UL, + /* 166 */ 0x13e2a90a23c1bfa3UL, 0x0cb0074e478519f6UL, + 0x5ff1cbbe3af6cf44UL, 0x67fe5438be812dbeUL, + /* 167 */ 0xd13cf64fa40f05b0UL, 0x054dfb2f32283787UL, + 0x4173915b7f0d2aeaUL, 0x482f144f1f610d4eUL, + /* 168 */ 0xf6210201b47f8234UL, 0x5d0ae1929e70b990UL, + 0xdcd7f455b049567cUL, 0x7e93d0f1f0916f01UL, + /* 169 */ 0xdd79cbf18a7db4faUL, 0xbe8391bf6f74c62fUL, + 0x027145d14b8291bdUL, 0x585a73ea2cbf1705UL, + /* 170 */ 0x485ca03e928a0db2UL, 0x10fc01a5742857e7UL, + 0x2f482edbd6d551a7UL, 0x0f0433b5048fdb8aUL, + /* 171 */ 0x60da2e8dd7dc6247UL, 0x88b4c9d38cd4819aUL, + 0x13033ac001f66697UL, 0x273b24fe3b367d75UL, + /* 172 */ 0xc6e8f66a31b3b9d4UL, 0x281514a494df49d5UL, + 0xd1726fdfc8b23da7UL, 0x4b3ae7d103dee548UL, + /* 173 */ 0xc6256e19ce4b9d7eUL, 0xff5c5cf186e3c61cUL, + 0xacc63ca34b8ec145UL, 0x74621888fee66574UL, + /* 174 */ 0x956f409645290a1eUL, 0xef0bf8e3263a962eUL, + 0xed6a50eb5ec2647bUL, 0x0694283a9dca7502UL, + /* 175 */ 0x769b963643a2dcd1UL, 0x42b7c8ea09fc5353UL, + 0x4f002aee13397eabUL, 0x63005e2c19b7d63aUL, + /* 176 */ 0xca6736da63023beaUL, 0x966c7f6db12a99b7UL, + 0xace09390c537c5e1UL, 0x0b696063a1aa89eeUL, + /* 177 */ 0xebb03e97288c56e5UL, 0x432a9f9f938c8be8UL, + 0xa6a5a93d5b717f71UL, 0x1a5fb4c3e18f9d97UL, + /* 178 */ 0x1c94e7ad1c60cdceUL, 0xee202a43fc02c4a0UL, + 0x8dafe4d867c46a20UL, 0x0a10263c8ac27b58UL, + /* 179 */ 0xd0dea9dfe4432a4aUL, 0x856af87bbe9277c5UL, + 0xce8472acc212c71aUL, 0x6f151b6d9bbb1e91UL, + /* 180 */ 0x26776c527ceed56aUL, 0x7d211cb7fbf8faecUL, + 0x37ae66a6fd4609ccUL, 0x1f81b702d2770c42UL, + /* 181 */ 0x2fb0b057eac58392UL, 0xe1dd89fe29744e9dUL, + 0xc964f8eb17beb4f8UL, 0x29571073c9a2d41eUL, + /* 182 */ 0xa948a18981c0e254UL, 0x2df6369b65b22830UL, + 0xa33eb2d75fcfd3c6UL, 0x078cd6ec4199a01fUL, + /* 183 */ 0x4a584a41ad900d2fUL, 0x32142b78e2c74c52UL, + 0x68c4e8338431c978UL, 0x7f69ea9008689fc2UL, + /* 184 */ 0x52f2c81e46a38265UL, 0xfd78072d04a832fdUL, + 0x8cd7d5fa25359e94UL, 0x4de71b7454cc29d2UL, + /* 185 */ 0x42eb60ad1eda6ac9UL, 0x0aad37dfdbc09c3aUL, + 0x81004b71e33cc191UL, 0x44e6be345122803cUL, + /* 186 */ 0x03fe8388ba1920dbUL, 0xf5d57c32150db008UL, + 0x49c8c4281af60c29UL, 0x21edb518de701aeeUL, + /* 187 */ 0x7fb63e418f06dc99UL, 0xa4460d99c166d7b8UL, + 0x24dd5248ce520a83UL, 0x5ec3ad712b928358UL, + /* 188 */ 0x15022a5fbd17930fUL, 0xa4f64a77d82570e3UL, + 0x12bc8d6915783712UL, 0x498194c0fc620abbUL, + /* 189 */ 0x38a2d9d255686c82UL, 0x785c6bd9193e21f0UL, + 0xe4d5c81ab24a5484UL, 0x56307860b2e20989UL, + /* 190 */ 0x429d55f78b4d74c4UL, 0x22f1834643350131UL, + 0x1e60c24598c71fffUL, 0x59f2f014979983efUL, + /* 191 */ 0x46a47d56eb494a44UL, 0x3e22a854d636a18eUL, + 0xb346e15274491c3bUL, 0x2ceafd4e5390cde7UL, + /* 192 */ 0xba8a8538be0d6675UL, 0x4b9074bb50818e23UL, + 0xcbdab89085d304c3UL, 0x61a24fe0e56192c4UL, + /* 193 */ 0xcb7615e6db525bcbUL, 0xdd7d8c35a567e4caUL, + 0xe6b4153acafcdd69UL, 0x2d668e097f3c9766UL, + /* 194 */ 0xa57e7e265ce55ef0UL, 0x5d9f4e527cd4b967UL, + 0xfbc83606492fd1e5UL, 0x090d52beb7c3f7aeUL, + /* 195 */ 0x09b9515a1e7b4d7cUL, 0x1f266a2599da44c0UL, + 0xa1c49548e2c55504UL, 0x7ef04287126f15ccUL, + /* 196 */ 0xfed1659dbd30ef15UL, 0x8b4ab9eec4e0277bUL, + 0x884d6236a5df3291UL, 0x1fd96ea6bf5cf788UL, + /* 197 */ 0x42a161981f190d9aUL, 0x61d849507e6052c1UL, + 0x9fe113bf285a2cd5UL, 0x7c22d676dbad85d8UL, + /* 198 */ 0x82e770ed2bfbd27dUL, 0x4c05b2ece996f5a5UL, + 0xcd40a9c2b0900150UL, 0x5895319213d9bf64UL, + /* 199 */ 0xe7cc5d703fea2e08UL, 0xb50c491258e2188cUL, + 0xcce30baa48205bf0UL, 0x537c659ccfa32d62UL, + /* 200 */ 0x37b6623a98cfc088UL, 0xfe9bed1fa4d6aca4UL, + 0x04d29b8e56a8d1b0UL, 0x725f71c40b519575UL, + /* 201 */ 0x28c7f89cd0339ce6UL, 0x8367b14469ddc18bUL, + 0x883ada83a6a1652cUL, 0x585f1974034d6c17UL, + /* 202 */ 0x89cfb266f1b19188UL, 0xe63b4863e7c35217UL, + 0xd88c9da6b4c0526aUL, 0x3e035c9df0954635UL, + /* 203 */ 0xdd9d5412fb45de9dUL, 0xdd684532e4cff40dUL, + 0x4b5c999b151d671cUL, 0x2d8c2cc811e7f690UL, + /* 204 */ 0x7f54be1d90055d40UL, 0xa464c5df464aaf40UL, + 0x33979624f0e917beUL, 0x2c018dc527356b30UL, + /* 205 */ 0xa5415024e330b3d4UL, 0x73ff3d96691652d3UL, + 0x94ec42c4ef9b59f1UL, 0x0747201618d08e5aUL, + /* 206 */ 0x4d6ca48aca411c53UL, 0x66415f2fcfa66119UL, + 0x9c4dd40051e227ffUL, 0x59810bc09a02f7ebUL, + /* 207 */ 0x2a7eb171b3dc101dUL, 0x441c5ab99ffef68eUL, + 0x32025c9b93b359eaUL, 0x5e8ce0a71e9d112fUL, + /* 208 */ 0xbfcccb92429503fdUL, 0xd271ba752f095d55UL, + 0x345ead5e972d091eUL, 0x18c8df11a83103baUL, + /* 209 */ 0x90cd949a9aed0f4cUL, 0xc5d1f4cb6660e37eUL, + 0xb8cac52d56c52e0bUL, 0x6e42e400c5808e0dUL, + /* 210 */ 0xa3b46966eeaefd23UL, 0x0c4f1f0be39ecdcaUL, + 0x189dc8c9d683a51dUL, 0x51f27f054c09351bUL, + /* 211 */ 0x4c487ccd2a320682UL, 0x587ea95bb3df1c96UL, + 0xc8ccf79e555cb8e8UL, 0x547dc829a206d73dUL, + /* 212 */ 0xb822a6cd80c39b06UL, 0xe96d54732000d4c6UL, + 0x28535b6f91463b4dUL, 0x228f4660e2486e1dUL, + /* 213 */ 0x98799538de8d3abfUL, 0x8cd8330045ebca6eUL, + 0x79952a008221e738UL, 0x4322e1a7535cd2bbUL, + /* 214 */ 0xb114c11819d1801cUL, 0x2016e4d84f3f5ec7UL, + 0xdd0e2df409260f4cUL, 0x5ec362c0ae5f7266UL, + /* 215 */ 0xc0462b18b8b2b4eeUL, 0x7cc8d950274d1afbUL, + 0xf25f7105436b02d2UL, 0x43bbf8dcbff9ccd3UL, + /* 216 */ 0xb6ad1767a039e9dfUL, 0xb0714da8f69d3583UL, + 0x5e55fa18b42931f5UL, 0x4ed5558f33c60961UL, + /* 217 */ 0x1fe37901c647a5ddUL, 0x593ddf1f8081d357UL, + 0x0249a4fd813fd7a6UL, 0x69acca274e9caf61UL, + /* 218 */ 0x047ba3ea330721c9UL, 0x83423fc20e7e1ea0UL, + 0x1df4c0af01314a60UL, 0x09a62dab89289527UL, + /* 219 */ 0xa5b325a49cc6cb00UL, 0xe94b5dc654b56cb6UL, + 0x3be28779adc994a0UL, 0x4296e8f8ba3a4aadUL, + /* 220 */ 0x328689761e451eabUL, 0x2e4d598bff59594aUL, + 0x49b96853d7a7084aUL, 0x4980a319601420a8UL, + /* 221 */ 0x9565b9e12f552c42UL, 0x8a5318db7100fe96UL, + 0x05c90b4d43add0d7UL, 0x538b4cd66a5d4edaUL, + /* 222 */ 0xf4e94fc3e89f039fUL, 0x592c9af26f618045UL, + 0x08a36eb5fd4b9550UL, 0x25fffaf6c2ed1419UL, + /* 223 */ 0x34434459cc79d354UL, 0xeeecbfb4b1d5476bUL, + 0xddeb34a061615d99UL, 0x5129cecceb64b773UL, + /* 224 */ 0xee43215894993520UL, 0x772f9c7cf14c0b3bUL, + 0xd2e2fce306bedad5UL, 0x715f42b546f06a97UL, + /* 225 */ 0x434ecdceda5b5f1aUL, 0x0da17115a49741a9UL, + 0x680bd77c73edad2eUL, 0x487c02354edd9041UL, + /* 226 */ 0xb8efeff3a70ed9c4UL, 0x56a32aa3e857e302UL, + 0xdf3a68bd48a2a5a0UL, 0x07f650b73176c444UL, + /* 227 */ 0xe38b9b1626e0ccb1UL, 0x79e053c18b09fb36UL, + 0x56d90319c9f94964UL, 0x1ca941e7ac9ff5c4UL, + /* 228 */ 0x49c4df29162fa0bbUL, 0x8488cf3282b33305UL, + 0x95dfda14cabb437dUL, 0x3391f78264d5ad86UL, + /* 229 */ 0x729ae06ae2b5095dUL, 0xd58a58d73259a946UL, + 0xe9834262d13921edUL, 0x27fedafaa54bb592UL, + /* 230 */ 0xa99dc5b829ad48bbUL, 0x5f025742499ee260UL, + 0x802c8ecd5d7513fdUL, 0x78ceb3ef3f6dd938UL, + /* 231 */ 0xc342f44f8a135d94UL, 0x7b9edb44828cdda3UL, + 0x9436d11a0537cfe7UL, 0x5064b164ec1ab4c8UL, + /* 232 */ 0x7020eccfd37eb2fcUL, 0x1f31ea3ed90d25fcUL, + 0x1b930d7bdfa1bb34UL, 0x5344467a48113044UL, + /* 233 */ 0x70073170f25e6dfbUL, 0xe385dc1a50114cc8UL, + 0x2348698ac8fc4f00UL, 0x2a77a55284dd40d8UL, + /* 234 */ 0xfe06afe0c98c6ce4UL, 0xc235df96dddfd6e4UL, + 0x1428d01e33bf1ed3UL, 0x785768ec9300bdafUL, + /* 235 */ 0x9702e57a91deb63bUL, 0x61bdb8bfe5ce8b80UL, + 0x645b426f3d1d58acUL, 0x4804a82227a557bcUL, + /* 236 */ 0x8e57048ab44d2601UL, 0x68d6501a4b3a6935UL, + 0xc39c9ec3f9e1c293UL, 0x4172f257d4de63e2UL, + /* 237 */ 0xd368b450330c6401UL, 0x040d3017418f2391UL, + 0x2c34bb6090b7d90dUL, 0x16f649228fdfd51fUL, + /* 238 */ 0xbea6818e2b928ef5UL, 0xe28ccf91cdc11e72UL, + 0x594aaa68e77a36cdUL, 0x313034806c7ffd0fUL, + /* 239 */ 0x8a9d27ac2249bd65UL, 0x19a3b464018e9512UL, + 0xc26ccff352b37ec7UL, 0x056f68341d797b21UL, + /* 240 */ 0x5e79d6757efd2327UL, 0xfabdbcb6553afe15UL, + 0xd3e7222c6eaf5a60UL, 0x7046c76d4dae743bUL, + /* 241 */ 0x660be872b18d4a55UL, 0x19992518574e1496UL, + 0xc103053a302bdcbbUL, 0x3ed8e9800b218e8eUL, + /* 242 */ 0x7b0b9239fa75e03eUL, 0xefe9fb684633c083UL, + 0x98a35fbe391a7793UL, 0x6065510fe2d0fe34UL, + /* 243 */ 0x55cb668548abad0cUL, 0xb4584548da87e527UL, + 0x2c43ecea0107c1ddUL, 0x526028809372de35UL, + /* 244 */ 0x3415c56af9213b1fUL, 0x5bee1a4d017e98dbUL, + 0x13f6b105b5cf709bUL, 0x5ff20e3482b29ab6UL, + /* 245 */ 0x0aa29c75cc2e6c90UL, 0xfc7d73ca3a70e206UL, + 0x899fc38fc4b5c515UL, 0x250386b124ffc207UL, + /* 246 */ 0x54ea28d5ae3d2b56UL, 0x9913149dd6de60ceUL, + 0x16694fc58f06d6c1UL, 0x46b23975eb018fc7UL, + /* 247 */ 0x470a6a0fb4b7b4e2UL, 0x5d92475a8f7253deUL, + 0xabeee5b52fbd3adbUL, 0x7fa20801a0806968UL, + /* 248 */ 0x76f3faf19f7714d2UL, 0xb3e840c12f4660c3UL, + 0x0fb4cd8df212744eUL, 0x4b065a251d3a2dd2UL, + /* 249 */ 0x5cebde383d77cd4aUL, 0x6adf39df882c9cb1UL, + 0xa2dd242eb09af759UL, 0x3147c0e50e5f6422UL, + /* 250 */ 0x164ca5101d1350dbUL, 0xf8d13479c33fc962UL, + 0xe640ce4d13e5da08UL, 0x4bdee0c45061f8baUL, + /* 251 */ 0xd7c46dc1a4edb1c9UL, 0x5514d7b6437fd98aUL, + 0x58942f6bb2a1c00bUL, 0x2dffb2ab1d70710eUL, + /* 252 */ 0xccdfcf2fc18b6d68UL, 0xa8ebcba8b7806167UL, + 0x980697f95e2937e3UL, 0x02fbba1cd0126e8cUL +}; + +/* c is two 512-bit products: c0[0:7]=a0[0:3]*b0[0:3] and c1[8:15]=a1[4:7]*b1[4:7] + * a is two 256-bit integers: a0[0:3] and a1[4:7] + * b is two 256-bit integers: b0[0:3] and b1[4:7] + */ +static void mul2_256x256_integer_adx(u64 *const c, const u64 *const a, + const u64 *const b) +{ + asm volatile( + "xorl %%r14d, %%r14d ;" + "movq (%1), %%rdx; " /* A[0] */ + "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */ + "xorl %%r10d, %%r10d ;" + "movq %%r8, (%0) ;" + "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */ + "adox %%r10, %%r15 ;" + "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */ + "adox %%r8, %%rax ;" + "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */ + "adox %%r10, %%rbx ;" + /******************************************/ + "adox %%r14, %%rcx ;" + + "movq 8(%1), %%rdx; " /* A[1] */ + "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ + "adox %%r15, %%r8 ;" + "movq %%r8, 8(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ + "adox %%r10, %%r9 ;" + "adcx %%r9, %%rax ;" + "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */ + "adox %%r8, %%r11 ;" + "adcx %%r11, %%rbx ;" + "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */ + "adox %%r10, %%r13 ;" + "adcx %%r13, %%rcx ;" + /******************************************/ + "adox %%r14, %%r15 ;" + "adcx %%r14, %%r15 ;" + + "movq 16(%1), %%rdx; " /* A[2] */ + "xorl %%r10d, %%r10d ;" + "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ + "adox %%rax, %%r8 ;" + "movq %%r8, 16(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ + "adox %%r10, %%r9 ;" + "adcx %%r9, %%rbx ;" + "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */ + "adox %%r8, %%r11 ;" + "adcx %%r11, %%rcx ;" + "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */ + "adox %%r10, %%r13 ;" + "adcx %%r13, %%r15 ;" + /******************************************/ + "adox %%r14, %%rax ;" + "adcx %%r14, %%rax ;" + + "movq 24(%1), %%rdx; " /* A[3] */ + "xorl %%r10d, %%r10d ;" + "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ + "adox %%rbx, %%r8 ;" + "movq %%r8, 24(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ + "adox %%r10, %%r9 ;" + "adcx %%r9, %%rcx ;" + "movq %%rcx, 32(%0) ;" + "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */ + "adox %%r8, %%r11 ;" + "adcx %%r11, %%r15 ;" + "movq %%r15, 40(%0) ;" + "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */ + "adox %%r10, %%r13 ;" + "adcx %%r13, %%rax ;" + "movq %%rax, 48(%0) ;" + /******************************************/ + "adox %%r14, %%rbx ;" + "adcx %%r14, %%rbx ;" + "movq %%rbx, 56(%0) ;" + + "movq 32(%1), %%rdx; " /* C[0] */ + "mulx 32(%2), %%r8, %%r15; " /* C[0]*D[0] */ + "xorl %%r10d, %%r10d ;" + "movq %%r8, 64(%0);" + "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */ + "adox %%r10, %%r15 ;" + "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */ + "adox %%r8, %%rax ;" + "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */ + "adox %%r10, %%rbx ;" + /******************************************/ + "adox %%r14, %%rcx ;" + + "movq 40(%1), %%rdx; " /* C[1] */ + "xorl %%r10d, %%r10d ;" + "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */ + "adox %%r15, %%r8 ;" + "movq %%r8, 72(%0);" + "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */ + "adox %%r10, %%r9 ;" + "adcx %%r9, %%rax ;" + "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */ + "adox %%r8, %%r11 ;" + "adcx %%r11, %%rbx ;" + "mulx 56(%2), %%r10, %%r15; " /* C[1]*D[3] */ + "adox %%r10, %%r13 ;" + "adcx %%r13, %%rcx ;" + /******************************************/ + "adox %%r14, %%r15 ;" + "adcx %%r14, %%r15 ;" + + "movq 48(%1), %%rdx; " /* C[2] */ + "xorl %%r10d, %%r10d ;" + "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */ + "adox %%rax, %%r8 ;" + "movq %%r8, 80(%0);" + "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */ + "adox %%r10, %%r9 ;" + "adcx %%r9, %%rbx ;" + "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */ + "adox %%r8, %%r11 ;" + "adcx %%r11, %%rcx ;" + "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */ + "adox %%r10, %%r13 ;" + "adcx %%r13, %%r15 ;" + /******************************************/ + "adox %%r14, %%rax ;" + "adcx %%r14, %%rax ;" + + "movq 56(%1), %%rdx; " /* C[3] */ + "xorl %%r10d, %%r10d ;" + "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */ + "adox %%rbx, %%r8 ;" + "movq %%r8, 88(%0);" + "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */ + "adox %%r10, %%r9 ;" + "adcx %%r9, %%rcx ;" + "movq %%rcx, 96(%0) ;" + "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */ + "adox %%r8, %%r11 ;" + "adcx %%r11, %%r15 ;" + "movq %%r15, 104(%0) ;" + "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */ + "adox %%r10, %%r13 ;" + "adcx %%r13, %%rax ;" + "movq %%rax, 112(%0) ;" + /******************************************/ + "adox %%r14, %%rbx ;" + "adcx %%r14, %%rbx ;" + "movq %%rbx, 120(%0) ;" + : + : "r"(c), "r"(a), "r"(b) + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", + "%r10", "%r11", "%r13", "%r14", "%r15"); +} + +static void mul2_256x256_integer_bmi2(u64 *const c, const u64 *const a, + const u64 *const b) +{ + asm volatile( + "movq (%1), %%rdx; " /* A[0] */ + "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */ + "movq %%r8, (%0) ;" + "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */ + "addq %%r10, %%r15 ;" + "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */ + "adcq %%r8, %%rax ;" + "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */ + "adcq %%r10, %%rbx ;" + /******************************************/ + "adcq $0, %%rcx ;" + + "movq 8(%1), %%rdx; " /* A[1] */ + "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ + "addq %%r15, %%r8 ;" + "movq %%r8, 8(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ + "adcq %%r10, %%r9 ;" + "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */ + "adcq %%r8, %%r11 ;" + "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%r15 ;" + + "addq %%r9, %%rax ;" + "adcq %%r11, %%rbx ;" + "adcq %%r13, %%rcx ;" + "adcq $0, %%r15 ;" + + "movq 16(%1), %%rdx; " /* A[2] */ + "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ + "addq %%rax, %%r8 ;" + "movq %%r8, 16(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ + "adcq %%r10, %%r9 ;" + "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */ + "adcq %%r8, %%r11 ;" + "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%rax ;" + + "addq %%r9, %%rbx ;" + "adcq %%r11, %%rcx ;" + "adcq %%r13, %%r15 ;" + "adcq $0, %%rax ;" + + "movq 24(%1), %%rdx; " /* A[3] */ + "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ + "addq %%rbx, %%r8 ;" + "movq %%r8, 24(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ + "adcq %%r10, %%r9 ;" + "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */ + "adcq %%r8, %%r11 ;" + "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%rbx ;" + + "addq %%r9, %%rcx ;" + "movq %%rcx, 32(%0) ;" + "adcq %%r11, %%r15 ;" + "movq %%r15, 40(%0) ;" + "adcq %%r13, %%rax ;" + "movq %%rax, 48(%0) ;" + "adcq $0, %%rbx ;" + "movq %%rbx, 56(%0) ;" + + "movq 32(%1), %%rdx; " /* C[0] */ + "mulx 32(%2), %%r8, %%r15; " /* C[0]*D[0] */ + "movq %%r8, 64(%0) ;" + "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */ + "addq %%r10, %%r15 ;" + "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */ + "adcq %%r8, %%rax ;" + "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */ + "adcq %%r10, %%rbx ;" + /******************************************/ + "adcq $0, %%rcx ;" + + "movq 40(%1), %%rdx; " /* C[1] */ + "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */ + "addq %%r15, %%r8 ;" + "movq %%r8, 72(%0) ;" + "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */ + "adcq %%r10, %%r9 ;" + "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */ + "adcq %%r8, %%r11 ;" + "mulx 56(%2), %%r10, %%r15; " /* C[1]*D[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%r15 ;" + + "addq %%r9, %%rax ;" + "adcq %%r11, %%rbx ;" + "adcq %%r13, %%rcx ;" + "adcq $0, %%r15 ;" + + "movq 48(%1), %%rdx; " /* C[2] */ + "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */ + "addq %%rax, %%r8 ;" + "movq %%r8, 80(%0) ;" + "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */ + "adcq %%r10, %%r9 ;" + "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */ + "adcq %%r8, %%r11 ;" + "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%rax ;" + + "addq %%r9, %%rbx ;" + "adcq %%r11, %%rcx ;" + "adcq %%r13, %%r15 ;" + "adcq $0, %%rax ;" + + "movq 56(%1), %%rdx; " /* C[3] */ + "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */ + "addq %%rbx, %%r8 ;" + "movq %%r8, 88(%0) ;" + "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */ + "adcq %%r10, %%r9 ;" + "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */ + "adcq %%r8, %%r11 ;" + "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%rbx ;" + + "addq %%r9, %%rcx ;" + "movq %%rcx, 96(%0) ;" + "adcq %%r11, %%r15 ;" + "movq %%r15, 104(%0) ;" + "adcq %%r13, %%rax ;" + "movq %%rax, 112(%0) ;" + "adcq $0, %%rbx ;" + "movq %%rbx, 120(%0) ;" + : + : "r"(c), "r"(a), "r"(b) + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", + "%r10", "%r11", "%r13", "%r15"); +} + +static void sqr2_256x256_integer_adx(u64 *const c, const u64 *const a) +{ + asm volatile( + "movq (%1), %%rdx ;" /* A[0] */ + "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */ + "xorl %%r15d, %%r15d;" + "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */ + "adcx %%r14, %%r9 ;" + "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */ + "adcx %%rax, %%r10 ;" + "movq 24(%1), %%rdx ;" /* A[3] */ + "mulx 8(%1), %%r11, %%rbx ;" /* A[1]*A[3] */ + "adcx %%rcx, %%r11 ;" + "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */ + "adcx %%rax, %%rbx ;" + "movq 8(%1), %%rdx ;" /* A[1] */ + "adcx %%r15, %%r13 ;" + "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */ + "movq $0, %%r14 ;" + /******************************************/ + "adcx %%r15, %%r14 ;" + + "xorl %%r15d, %%r15d;" + "adox %%rax, %%r10 ;" + "adcx %%r8, %%r8 ;" + "adox %%rcx, %%r11 ;" + "adcx %%r9, %%r9 ;" + "adox %%r15, %%rbx ;" + "adcx %%r10, %%r10 ;" + "adox %%r15, %%r13 ;" + "adcx %%r11, %%r11 ;" + "adox %%r15, %%r14 ;" + "adcx %%rbx, %%rbx ;" + "adcx %%r13, %%r13 ;" + "adcx %%r14, %%r14 ;" + + "movq (%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */ + /*******************/ + "movq %%rax, 0(%0) ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, 8(%0) ;" + "movq 8(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */ + "adcq %%rax, %%r9 ;" + "movq %%r9, 16(%0) ;" + "adcq %%rcx, %%r10 ;" + "movq %%r10, 24(%0) ;" + "movq 16(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */ + "adcq %%rax, %%r11 ;" + "movq %%r11, 32(%0) ;" + "adcq %%rcx, %%rbx ;" + "movq %%rbx, 40(%0) ;" + "movq 24(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */ + "adcq %%rax, %%r13 ;" + "movq %%r13, 48(%0) ;" + "adcq %%rcx, %%r14 ;" + "movq %%r14, 56(%0) ;" + + + "movq 32(%1), %%rdx ;" /* B[0] */ + "mulx 40(%1), %%r8, %%r14 ;" /* B[1]*B[0] */ + "xorl %%r15d, %%r15d;" + "mulx 48(%1), %%r9, %%r10 ;" /* B[2]*B[0] */ + "adcx %%r14, %%r9 ;" + "mulx 56(%1), %%rax, %%rcx ;" /* B[3]*B[0] */ + "adcx %%rax, %%r10 ;" + "movq 56(%1), %%rdx ;" /* B[3] */ + "mulx 40(%1), %%r11, %%rbx ;" /* B[1]*B[3] */ + "adcx %%rcx, %%r11 ;" + "mulx 48(%1), %%rax, %%r13 ;" /* B[2]*B[3] */ + "adcx %%rax, %%rbx ;" + "movq 40(%1), %%rdx ;" /* B[1] */ + "adcx %%r15, %%r13 ;" + "mulx 48(%1), %%rax, %%rcx ;" /* B[2]*B[1] */ + "movq $0, %%r14 ;" + /******************************************/ + "adcx %%r15, %%r14 ;" + + "xorl %%r15d, %%r15d;" + "adox %%rax, %%r10 ;" + "adcx %%r8, %%r8 ;" + "adox %%rcx, %%r11 ;" + "adcx %%r9, %%r9 ;" + "adox %%r15, %%rbx ;" + "adcx %%r10, %%r10 ;" + "adox %%r15, %%r13 ;" + "adcx %%r11, %%r11 ;" + "adox %%r15, %%r14 ;" + "adcx %%rbx, %%rbx ;" + "adcx %%r13, %%r13 ;" + "adcx %%r14, %%r14 ;" + + "movq 32(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* B[0]^2 */ + /*******************/ + "movq %%rax, 64(%0) ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, 72(%0) ;" + "movq 40(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* B[1]^2 */ + "adcq %%rax, %%r9 ;" + "movq %%r9, 80(%0) ;" + "adcq %%rcx, %%r10 ;" + "movq %%r10, 88(%0) ;" + "movq 48(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* B[2]^2 */ + "adcq %%rax, %%r11 ;" + "movq %%r11, 96(%0) ;" + "adcq %%rcx, %%rbx ;" + "movq %%rbx, 104(%0) ;" + "movq 56(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* B[3]^2 */ + "adcq %%rax, %%r13 ;" + "movq %%r13, 112(%0) ;" + "adcq %%rcx, %%r14 ;" + "movq %%r14, 120(%0) ;" + : + : "r"(c), "r"(a) + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", + "%r10", "%r11", "%r13", "%r14", "%r15"); +} + +static void sqr2_256x256_integer_bmi2(u64 *const c, const u64 *const a) +{ + asm volatile( + "movq 8(%1), %%rdx ;" /* A[1] */ + "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */ + "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */ + "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */ + + "movq 16(%1), %%rdx ;" /* A[2] */ + "mulx 24(%1), %%r15, %%r13 ;" /* A[3]*A[2] */ + "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */ + + "addq %%rax, %%r9 ;" + "adcq %%rdx, %%r10 ;" + "adcq %%rcx, %%r11 ;" + "adcq %%r14, %%r15 ;" + "adcq $0, %%r13 ;" + "movq $0, %%r14 ;" + "adcq $0, %%r14 ;" + + "movq (%1), %%rdx ;" /* A[0] */ + "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */ + + "addq %%rax, %%r10 ;" + "adcq %%rcx, %%r11 ;" + "adcq $0, %%r15 ;" + "adcq $0, %%r13 ;" + "adcq $0, %%r14 ;" + + "shldq $1, %%r13, %%r14 ;" + "shldq $1, %%r15, %%r13 ;" + "shldq $1, %%r11, %%r15 ;" + "shldq $1, %%r10, %%r11 ;" + "shldq $1, %%r9, %%r10 ;" + "shldq $1, %%r8, %%r9 ;" + "shlq $1, %%r8 ;" + + /*******************/ + "mulx %%rdx, %%rax, %%rcx ; " /* A[0]^2 */ + /*******************/ + "movq %%rax, 0(%0) ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, 8(%0) ;" + "movq 8(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ; " /* A[1]^2 */ + "adcq %%rax, %%r9 ;" + "movq %%r9, 16(%0) ;" + "adcq %%rcx, %%r10 ;" + "movq %%r10, 24(%0) ;" + "movq 16(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ; " /* A[2]^2 */ + "adcq %%rax, %%r11 ;" + "movq %%r11, 32(%0) ;" + "adcq %%rcx, %%r15 ;" + "movq %%r15, 40(%0) ;" + "movq 24(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ; " /* A[3]^2 */ + "adcq %%rax, %%r13 ;" + "movq %%r13, 48(%0) ;" + "adcq %%rcx, %%r14 ;" + "movq %%r14, 56(%0) ;" + + "movq 40(%1), %%rdx ;" /* B[1] */ + "mulx 32(%1), %%r8, %%r9 ;" /* B[0]*B[1] */ + "mulx 48(%1), %%r10, %%r11 ;" /* B[2]*B[1] */ + "mulx 56(%1), %%rcx, %%r14 ;" /* B[3]*B[1] */ + + "movq 48(%1), %%rdx ;" /* B[2] */ + "mulx 56(%1), %%r15, %%r13 ;" /* B[3]*B[2] */ + "mulx 32(%1), %%rax, %%rdx ;" /* B[0]*B[2] */ + + "addq %%rax, %%r9 ;" + "adcq %%rdx, %%r10 ;" + "adcq %%rcx, %%r11 ;" + "adcq %%r14, %%r15 ;" + "adcq $0, %%r13 ;" + "movq $0, %%r14 ;" + "adcq $0, %%r14 ;" + + "movq 32(%1), %%rdx ;" /* B[0] */ + "mulx 56(%1), %%rax, %%rcx ;" /* B[0]*B[3] */ + + "addq %%rax, %%r10 ;" + "adcq %%rcx, %%r11 ;" + "adcq $0, %%r15 ;" + "adcq $0, %%r13 ;" + "adcq $0, %%r14 ;" + + "shldq $1, %%r13, %%r14 ;" + "shldq $1, %%r15, %%r13 ;" + "shldq $1, %%r11, %%r15 ;" + "shldq $1, %%r10, %%r11 ;" + "shldq $1, %%r9, %%r10 ;" + "shldq $1, %%r8, %%r9 ;" + "shlq $1, %%r8 ;" + + /*******************/ + "mulx %%rdx, %%rax, %%rcx ; " /* B[0]^2 */ + /*******************/ + "movq %%rax, 64(%0) ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, 72(%0) ;" + "movq 40(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ; " /* B[1]^2 */ + "adcq %%rax, %%r9 ;" + "movq %%r9, 80(%0) ;" + "adcq %%rcx, %%r10 ;" + "movq %%r10, 88(%0) ;" + "movq 48(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ; " /* B[2]^2 */ + "adcq %%rax, %%r11 ;" + "movq %%r11, 96(%0) ;" + "adcq %%rcx, %%r15 ;" + "movq %%r15, 104(%0) ;" + "movq 56(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ; " /* B[3]^2 */ + "adcq %%rax, %%r13 ;" + "movq %%r13, 112(%0) ;" + "adcq %%rcx, %%r14 ;" + "movq %%r14, 120(%0) ;" + : + : "r"(c), "r"(a) + : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", + "%r11", "%r13", "%r14", "%r15"); +} + +static void red_eltfp25519_2w_adx(u64 *const c, const u64 *const a) +{ + asm volatile( + "movl $38, %%edx; " /* 2*c = 38 = 2^256 */ + "mulx 32(%1), %%r8, %%r10; " /* c*C[4] */ + "xorl %%ebx, %%ebx ;" + "adox (%1), %%r8 ;" + "mulx 40(%1), %%r9, %%r11; " /* c*C[5] */ + "adcx %%r10, %%r9 ;" + "adox 8(%1), %%r9 ;" + "mulx 48(%1), %%r10, %%rax; " /* c*C[6] */ + "adcx %%r11, %%r10 ;" + "adox 16(%1), %%r10 ;" + "mulx 56(%1), %%r11, %%rcx; " /* c*C[7] */ + "adcx %%rax, %%r11 ;" + "adox 24(%1), %%r11 ;" + /***************************************/ + "adcx %%rbx, %%rcx ;" + "adox %%rbx, %%rcx ;" + "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */ + "adcx %%rcx, %%r8 ;" + "adcx %%rbx, %%r9 ;" + "movq %%r9, 8(%0) ;" + "adcx %%rbx, %%r10 ;" + "movq %%r10, 16(%0) ;" + "adcx %%rbx, %%r11 ;" + "movq %%r11, 24(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%edx, %%ecx ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, (%0) ;" + + "mulx 96(%1), %%r8, %%r10; " /* c*C[4] */ + "xorl %%ebx, %%ebx ;" + "adox 64(%1), %%r8 ;" + "mulx 104(%1), %%r9, %%r11; " /* c*C[5] */ + "adcx %%r10, %%r9 ;" + "adox 72(%1), %%r9 ;" + "mulx 112(%1), %%r10, %%rax; " /* c*C[6] */ + "adcx %%r11, %%r10 ;" + "adox 80(%1), %%r10 ;" + "mulx 120(%1), %%r11, %%rcx; " /* c*C[7] */ + "adcx %%rax, %%r11 ;" + "adox 88(%1), %%r11 ;" + /****************************************/ + "adcx %%rbx, %%rcx ;" + "adox %%rbx, %%rcx ;" + "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */ + "adcx %%rcx, %%r8 ;" + "adcx %%rbx, %%r9 ;" + "movq %%r9, 40(%0) ;" + "adcx %%rbx, %%r10 ;" + "movq %%r10, 48(%0) ;" + "adcx %%rbx, %%r11 ;" + "movq %%r11, 56(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%edx, %%ecx ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, 32(%0) ;" + : + : "r"(c), "r"(a) + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", + "%r10", "%r11"); +} + +static void red_eltfp25519_2w_bmi2(u64 *const c, const u64 *const a) +{ + asm volatile( + "movl $38, %%edx ; " /* 2*c = 38 = 2^256 */ + "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */ + "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ + "addq %%r10, %%r9 ;" + "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */ + "adcq %%r11, %%r10 ;" + "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ + "adcq %%rax, %%r11 ;" + /***************************************/ + "adcq $0, %%rcx ;" + "addq (%1), %%r8 ;" + "adcq 8(%1), %%r9 ;" + "adcq 16(%1), %%r10 ;" + "adcq 24(%1), %%r11 ;" + "adcq $0, %%rcx ;" + "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */ + "addq %%rcx, %%r8 ;" + "adcq $0, %%r9 ;" + "movq %%r9, 8(%0) ;" + "adcq $0, %%r10 ;" + "movq %%r10, 16(%0) ;" + "adcq $0, %%r11 ;" + "movq %%r11, 24(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%edx, %%ecx ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, (%0) ;" + + "mulx 96(%1), %%r8, %%r10 ;" /* c*C[4] */ + "mulx 104(%1), %%r9, %%r11 ;" /* c*C[5] */ + "addq %%r10, %%r9 ;" + "mulx 112(%1), %%r10, %%rax ;" /* c*C[6] */ + "adcq %%r11, %%r10 ;" + "mulx 120(%1), %%r11, %%rcx ;" /* c*C[7] */ + "adcq %%rax, %%r11 ;" + /****************************************/ + "adcq $0, %%rcx ;" + "addq 64(%1), %%r8 ;" + "adcq 72(%1), %%r9 ;" + "adcq 80(%1), %%r10 ;" + "adcq 88(%1), %%r11 ;" + "adcq $0, %%rcx ;" + "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */ + "addq %%rcx, %%r8 ;" + "adcq $0, %%r9 ;" + "movq %%r9, 40(%0) ;" + "adcq $0, %%r10 ;" + "movq %%r10, 48(%0) ;" + "adcq $0, %%r11 ;" + "movq %%r11, 56(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%edx, %%ecx ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, 32(%0) ;" + : + : "r"(c), "r"(a) + : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", + "%r11"); +} + +static void mul_256x256_integer_adx(u64 *const c, const u64 *const a, + const u64 *const b) +{ + asm volatile( + "movq (%1), %%rdx; " /* A[0] */ + "mulx (%2), %%r8, %%r9; " /* A[0]*B[0] */ + "xorl %%r10d, %%r10d ;" + "movq %%r8, (%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[0]*B[1] */ + "adox %%r9, %%r10 ;" + "movq %%r10, 8(%0) ;" + "mulx 16(%2), %%r15, %%r13; " /* A[0]*B[2] */ + "adox %%r11, %%r15 ;" + "mulx 24(%2), %%r14, %%rdx; " /* A[0]*B[3] */ + "adox %%r13, %%r14 ;" + "movq $0, %%rax ;" + /******************************************/ + "adox %%rdx, %%rax ;" + + "movq 8(%1), %%rdx; " /* A[1] */ + "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ + "xorl %%r10d, %%r10d ;" + "adcx 8(%0), %%r8 ;" + "movq %%r8, 8(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ + "adox %%r9, %%r10 ;" + "adcx %%r15, %%r10 ;" + "movq %%r10, 16(%0) ;" + "mulx 16(%2), %%r15, %%r13; " /* A[1]*B[2] */ + "adox %%r11, %%r15 ;" + "adcx %%r14, %%r15 ;" + "movq $0, %%r8 ;" + "mulx 24(%2), %%r14, %%rdx; " /* A[1]*B[3] */ + "adox %%r13, %%r14 ;" + "adcx %%rax, %%r14 ;" + "movq $0, %%rax ;" + /******************************************/ + "adox %%rdx, %%rax ;" + "adcx %%r8, %%rax ;" + + "movq 16(%1), %%rdx; " /* A[2] */ + "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ + "xorl %%r10d, %%r10d ;" + "adcx 16(%0), %%r8 ;" + "movq %%r8, 16(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ + "adox %%r9, %%r10 ;" + "adcx %%r15, %%r10 ;" + "movq %%r10, 24(%0) ;" + "mulx 16(%2), %%r15, %%r13; " /* A[2]*B[2] */ + "adox %%r11, %%r15 ;" + "adcx %%r14, %%r15 ;" + "movq $0, %%r8 ;" + "mulx 24(%2), %%r14, %%rdx; " /* A[2]*B[3] */ + "adox %%r13, %%r14 ;" + "adcx %%rax, %%r14 ;" + "movq $0, %%rax ;" + /******************************************/ + "adox %%rdx, %%rax ;" + "adcx %%r8, %%rax ;" + + "movq 24(%1), %%rdx; " /* A[3] */ + "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ + "xorl %%r10d, %%r10d ;" + "adcx 24(%0), %%r8 ;" + "movq %%r8, 24(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ + "adox %%r9, %%r10 ;" + "adcx %%r15, %%r10 ;" + "movq %%r10, 32(%0) ;" + "mulx 16(%2), %%r15, %%r13; " /* A[3]*B[2] */ + "adox %%r11, %%r15 ;" + "adcx %%r14, %%r15 ;" + "movq %%r15, 40(%0) ;" + "movq $0, %%r8 ;" + "mulx 24(%2), %%r14, %%rdx; " /* A[3]*B[3] */ + "adox %%r13, %%r14 ;" + "adcx %%rax, %%r14 ;" + "movq %%r14, 48(%0) ;" + "movq $0, %%rax ;" + /******************************************/ + "adox %%rdx, %%rax ;" + "adcx %%r8, %%rax ;" + "movq %%rax, 56(%0) ;" + : + : "r"(c), "r"(a), "r"(b) + : "memory", "cc", "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", + "%r13", "%r14", "%r15"); +} + +static void mul_256x256_integer_bmi2(u64 *const c, const u64 *const a, + const u64 *const b) +{ + asm volatile( + "movq (%1), %%rdx; " /* A[0] */ + "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */ + "movq %%r8, (%0) ;" + "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */ + "addq %%r10, %%r15 ;" + "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */ + "adcq %%r8, %%rax ;" + "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */ + "adcq %%r10, %%rbx ;" + /******************************************/ + "adcq $0, %%rcx ;" + + "movq 8(%1), %%rdx; " /* A[1] */ + "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ + "addq %%r15, %%r8 ;" + "movq %%r8, 8(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ + "adcq %%r10, %%r9 ;" + "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */ + "adcq %%r8, %%r11 ;" + "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%r15 ;" + + "addq %%r9, %%rax ;" + "adcq %%r11, %%rbx ;" + "adcq %%r13, %%rcx ;" + "adcq $0, %%r15 ;" + + "movq 16(%1), %%rdx; " /* A[2] */ + "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ + "addq %%rax, %%r8 ;" + "movq %%r8, 16(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ + "adcq %%r10, %%r9 ;" + "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */ + "adcq %%r8, %%r11 ;" + "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%rax ;" + + "addq %%r9, %%rbx ;" + "adcq %%r11, %%rcx ;" + "adcq %%r13, %%r15 ;" + "adcq $0, %%rax ;" + + "movq 24(%1), %%rdx; " /* A[3] */ + "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ + "addq %%rbx, %%r8 ;" + "movq %%r8, 24(%0) ;" + "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ + "adcq %%r10, %%r9 ;" + "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */ + "adcq %%r8, %%r11 ;" + "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */ + "adcq %%r10, %%r13 ;" + /******************************************/ + "adcq $0, %%rbx ;" + + "addq %%r9, %%rcx ;" + "movq %%rcx, 32(%0) ;" + "adcq %%r11, %%r15 ;" + "movq %%r15, 40(%0) ;" + "adcq %%r13, %%rax ;" + "movq %%rax, 48(%0) ;" + "adcq $0, %%rbx ;" + "movq %%rbx, 56(%0) ;" + : + : "r"(c), "r"(a), "r"(b) + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", + "%r10", "%r11", "%r13", "%r15"); +} + +static void sqr_256x256_integer_adx(u64 *const c, const u64 *const a) +{ + asm volatile( + "movq (%1), %%rdx ;" /* A[0] */ + "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */ + "xorl %%r15d, %%r15d;" + "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */ + "adcx %%r14, %%r9 ;" + "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */ + "adcx %%rax, %%r10 ;" + "movq 24(%1), %%rdx ;" /* A[3] */ + "mulx 8(%1), %%r11, %%rbx ;" /* A[1]*A[3] */ + "adcx %%rcx, %%r11 ;" + "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */ + "adcx %%rax, %%rbx ;" + "movq 8(%1), %%rdx ;" /* A[1] */ + "adcx %%r15, %%r13 ;" + "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */ + "movq $0, %%r14 ;" + /******************************************/ + "adcx %%r15, %%r14 ;" + + "xorl %%r15d, %%r15d;" + "adox %%rax, %%r10 ;" + "adcx %%r8, %%r8 ;" + "adox %%rcx, %%r11 ;" + "adcx %%r9, %%r9 ;" + "adox %%r15, %%rbx ;" + "adcx %%r10, %%r10 ;" + "adox %%r15, %%r13 ;" + "adcx %%r11, %%r11 ;" + "adox %%r15, %%r14 ;" + "adcx %%rbx, %%rbx ;" + "adcx %%r13, %%r13 ;" + "adcx %%r14, %%r14 ;" + + "movq (%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */ + /*******************/ + "movq %%rax, 0(%0) ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, 8(%0) ;" + "movq 8(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */ + "adcq %%rax, %%r9 ;" + "movq %%r9, 16(%0) ;" + "adcq %%rcx, %%r10 ;" + "movq %%r10, 24(%0) ;" + "movq 16(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */ + "adcq %%rax, %%r11 ;" + "movq %%r11, 32(%0) ;" + "adcq %%rcx, %%rbx ;" + "movq %%rbx, 40(%0) ;" + "movq 24(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */ + "adcq %%rax, %%r13 ;" + "movq %%r13, 48(%0) ;" + "adcq %%rcx, %%r14 ;" + "movq %%r14, 56(%0) ;" + : + : "r"(c), "r"(a) + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", + "%r10", "%r11", "%r13", "%r14", "%r15"); +} + +static void sqr_256x256_integer_bmi2(u64 *const c, const u64 *const a) +{ + asm volatile( + "movq 8(%1), %%rdx ;" /* A[1] */ + "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */ + "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */ + "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */ + + "movq 16(%1), %%rdx ;" /* A[2] */ + "mulx 24(%1), %%r15, %%r13 ;" /* A[3]*A[2] */ + "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */ + + "addq %%rax, %%r9 ;" + "adcq %%rdx, %%r10 ;" + "adcq %%rcx, %%r11 ;" + "adcq %%r14, %%r15 ;" + "adcq $0, %%r13 ;" + "movq $0, %%r14 ;" + "adcq $0, %%r14 ;" + + "movq (%1), %%rdx ;" /* A[0] */ + "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */ + + "addq %%rax, %%r10 ;" + "adcq %%rcx, %%r11 ;" + "adcq $0, %%r15 ;" + "adcq $0, %%r13 ;" + "adcq $0, %%r14 ;" + + "shldq $1, %%r13, %%r14 ;" + "shldq $1, %%r15, %%r13 ;" + "shldq $1, %%r11, %%r15 ;" + "shldq $1, %%r10, %%r11 ;" + "shldq $1, %%r9, %%r10 ;" + "shldq $1, %%r8, %%r9 ;" + "shlq $1, %%r8 ;" + + /*******************/ + "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */ + /*******************/ + "movq %%rax, 0(%0) ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, 8(%0) ;" + "movq 8(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */ + "adcq %%rax, %%r9 ;" + "movq %%r9, 16(%0) ;" + "adcq %%rcx, %%r10 ;" + "movq %%r10, 24(%0) ;" + "movq 16(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */ + "adcq %%rax, %%r11 ;" + "movq %%r11, 32(%0) ;" + "adcq %%rcx, %%r15 ;" + "movq %%r15, 40(%0) ;" + "movq 24(%1), %%rdx ;" + "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */ + "adcq %%rax, %%r13 ;" + "movq %%r13, 48(%0) ;" + "adcq %%rcx, %%r14 ;" + "movq %%r14, 56(%0) ;" + : + : "r"(c), "r"(a) + : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", + "%r11", "%r13", "%r14", "%r15"); +} + +static void red_eltfp25519_1w_adx(u64 *const c, const u64 *const a) +{ + asm volatile( + "movl $38, %%edx ;" /* 2*c = 38 = 2^256 */ + "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */ + "xorl %%ebx, %%ebx ;" + "adox (%1), %%r8 ;" + "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ + "adcx %%r10, %%r9 ;" + "adox 8(%1), %%r9 ;" + "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */ + "adcx %%r11, %%r10 ;" + "adox 16(%1), %%r10 ;" + "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ + "adcx %%rax, %%r11 ;" + "adox 24(%1), %%r11 ;" + /***************************************/ + "adcx %%rbx, %%rcx ;" + "adox %%rbx, %%rcx ;" + "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */ + "adcx %%rcx, %%r8 ;" + "adcx %%rbx, %%r9 ;" + "movq %%r9, 8(%0) ;" + "adcx %%rbx, %%r10 ;" + "movq %%r10, 16(%0) ;" + "adcx %%rbx, %%r11 ;" + "movq %%r11, 24(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%edx, %%ecx ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, (%0) ;" + : + : "r"(c), "r"(a) + : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", + "%r10", "%r11"); +} + +static void red_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a) +{ + asm volatile( + "movl $38, %%edx ;" /* 2*c = 38 = 2^256 */ + "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */ + "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ + "addq %%r10, %%r9 ;" + "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */ + "adcq %%r11, %%r10 ;" + "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ + "adcq %%rax, %%r11 ;" + /***************************************/ + "adcq $0, %%rcx ;" + "addq (%1), %%r8 ;" + "adcq 8(%1), %%r9 ;" + "adcq 16(%1), %%r10 ;" + "adcq 24(%1), %%r11 ;" + "adcq $0, %%rcx ;" + "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */ + "addq %%rcx, %%r8 ;" + "adcq $0, %%r9 ;" + "movq %%r9, 8(%0) ;" + "adcq $0, %%r10 ;" + "movq %%r10, 16(%0) ;" + "adcq $0, %%r11 ;" + "movq %%r11, 24(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%edx, %%ecx ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, (%0) ;" + : + : "r"(c), "r"(a) + : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", + "%r11"); +} + +static __always_inline void +add_eltfp25519_1w_adx(u64 *const c, const u64 *const a, const u64 *const b) +{ + asm volatile( + "mov $38, %%eax ;" + "xorl %%ecx, %%ecx ;" + "movq (%2), %%r8 ;" + "adcx (%1), %%r8 ;" + "movq 8(%2), %%r9 ;" + "adcx 8(%1), %%r9 ;" + "movq 16(%2), %%r10 ;" + "adcx 16(%1), %%r10 ;" + "movq 24(%2), %%r11 ;" + "adcx 24(%1), %%r11 ;" + "cmovc %%eax, %%ecx ;" + "xorl %%eax, %%eax ;" + "adcx %%rcx, %%r8 ;" + "adcx %%rax, %%r9 ;" + "movq %%r9, 8(%0) ;" + "adcx %%rax, %%r10 ;" + "movq %%r10, 16(%0) ;" + "adcx %%rax, %%r11 ;" + "movq %%r11, 24(%0) ;" + "mov $38, %%ecx ;" + "cmovc %%ecx, %%eax ;" + "addq %%rax, %%r8 ;" + "movq %%r8, (%0) ;" + : + : "r"(c), "r"(a), "r"(b) + : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11"); +} + +static __always_inline void +add_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a, const u64 *const b) +{ + asm volatile( + "mov $38, %%eax ;" + "movq (%2), %%r8 ;" + "addq (%1), %%r8 ;" + "movq 8(%2), %%r9 ;" + "adcq 8(%1), %%r9 ;" + "movq 16(%2), %%r10 ;" + "adcq 16(%1), %%r10 ;" + "movq 24(%2), %%r11 ;" + "adcq 24(%1), %%r11 ;" + "mov $0, %%ecx ;" + "cmovc %%eax, %%ecx ;" + "addq %%rcx, %%r8 ;" + "adcq $0, %%r9 ;" + "movq %%r9, 8(%0) ;" + "adcq $0, %%r10 ;" + "movq %%r10, 16(%0) ;" + "adcq $0, %%r11 ;" + "movq %%r11, 24(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%eax, %%ecx ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, (%0) ;" + : + : "r"(c), "r"(a), "r"(b) + : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11"); +} + +static __always_inline void +sub_eltfp25519_1w(u64 *const c, const u64 *const a, const u64 *const b) +{ + asm volatile( + "mov $38, %%eax ;" + "movq (%1), %%r8 ;" + "subq (%2), %%r8 ;" + "movq 8(%1), %%r9 ;" + "sbbq 8(%2), %%r9 ;" + "movq 16(%1), %%r10 ;" + "sbbq 16(%2), %%r10 ;" + "movq 24(%1), %%r11 ;" + "sbbq 24(%2), %%r11 ;" + "mov $0, %%ecx ;" + "cmovc %%eax, %%ecx ;" + "subq %%rcx, %%r8 ;" + "sbbq $0, %%r9 ;" + "movq %%r9, 8(%0) ;" + "sbbq $0, %%r10 ;" + "movq %%r10, 16(%0) ;" + "sbbq $0, %%r11 ;" + "movq %%r11, 24(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%eax, %%ecx ;" + "subq %%rcx, %%r8 ;" + "movq %%r8, (%0) ;" + : + : "r"(c), "r"(a), "r"(b) + : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11"); +} + +/* Multiplication by a24 = (A+2)/4 = (486662+2)/4 = 121666 */ +static __always_inline void +mul_a24_eltfp25519_1w(u64 *const c, const u64 *const a) +{ + const u64 a24 = 121666; + asm volatile( + "movq %2, %%rdx ;" + "mulx (%1), %%r8, %%r10 ;" + "mulx 8(%1), %%r9, %%r11 ;" + "addq %%r10, %%r9 ;" + "mulx 16(%1), %%r10, %%rax ;" + "adcq %%r11, %%r10 ;" + "mulx 24(%1), %%r11, %%rcx ;" + "adcq %%rax, %%r11 ;" + /**************************/ + "adcq $0, %%rcx ;" + "movl $38, %%edx ;" /* 2*c = 38 = 2^256 mod 2^255-19*/ + "imul %%rdx, %%rcx ;" + "addq %%rcx, %%r8 ;" + "adcq $0, %%r9 ;" + "movq %%r9, 8(%0) ;" + "adcq $0, %%r10 ;" + "movq %%r10, 16(%0) ;" + "adcq $0, %%r11 ;" + "movq %%r11, 24(%0) ;" + "mov $0, %%ecx ;" + "cmovc %%edx, %%ecx ;" + "addq %%rcx, %%r8 ;" + "movq %%r8, (%0) ;" + : + : "r"(c), "r"(a), "r"(a24) + : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", + "%r11"); +} + +static void inv_eltfp25519_1w_adx(u64 *const c, const u64 *const a) +{ + struct { + eltfp25519_1w_buffer buffer; + eltfp25519_1w x0, x1, x2; + } __aligned(32) m; + u64 *T[4]; + + T[0] = m.x0; + T[1] = c; /* x^(-1) */ + T[2] = m.x1; + T[3] = m.x2; + + copy_eltfp25519_1w(T[1], a); + sqrn_eltfp25519_1w_adx(T[1], 1); + copy_eltfp25519_1w(T[2], T[1]); + sqrn_eltfp25519_1w_adx(T[2], 2); + mul_eltfp25519_1w_adx(T[0], a, T[2]); + mul_eltfp25519_1w_adx(T[1], T[1], T[0]); + copy_eltfp25519_1w(T[2], T[1]); + sqrn_eltfp25519_1w_adx(T[2], 1); + mul_eltfp25519_1w_adx(T[0], T[0], T[2]); + copy_eltfp25519_1w(T[2], T[0]); + sqrn_eltfp25519_1w_adx(T[2], 5); + mul_eltfp25519_1w_adx(T[0], T[0], T[2]); + copy_eltfp25519_1w(T[2], T[0]); + sqrn_eltfp25519_1w_adx(T[2], 10); + mul_eltfp25519_1w_adx(T[2], T[2], T[0]); + copy_eltfp25519_1w(T[3], T[2]); + sqrn_eltfp25519_1w_adx(T[3], 20); + mul_eltfp25519_1w_adx(T[3], T[3], T[2]); + sqrn_eltfp25519_1w_adx(T[3], 10); + mul_eltfp25519_1w_adx(T[3], T[3], T[0]); + copy_eltfp25519_1w(T[0], T[3]); + sqrn_eltfp25519_1w_adx(T[0], 50); + mul_eltfp25519_1w_adx(T[0], T[0], T[3]); + copy_eltfp25519_1w(T[2], T[0]); + sqrn_eltfp25519_1w_adx(T[2], 100); + mul_eltfp25519_1w_adx(T[2], T[2], T[0]); + sqrn_eltfp25519_1w_adx(T[2], 50); + mul_eltfp25519_1w_adx(T[2], T[2], T[3]); + sqrn_eltfp25519_1w_adx(T[2], 5); + mul_eltfp25519_1w_adx(T[1], T[1], T[2]); + + memzero_explicit(&m, sizeof(m)); +} + +static void inv_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a) +{ + struct { + eltfp25519_1w_buffer buffer; + eltfp25519_1w x0, x1, x2; + } __aligned(32) m; + u64 *T[5]; + + T[0] = m.x0; + T[1] = c; /* x^(-1) */ + T[2] = m.x1; + T[3] = m.x2; + + copy_eltfp25519_1w(T[1], a); + sqrn_eltfp25519_1w_bmi2(T[1], 1); + copy_eltfp25519_1w(T[2], T[1]); + sqrn_eltfp25519_1w_bmi2(T[2], 2); + mul_eltfp25519_1w_bmi2(T[0], a, T[2]); + mul_eltfp25519_1w_bmi2(T[1], T[1], T[0]); + copy_eltfp25519_1w(T[2], T[1]); + sqrn_eltfp25519_1w_bmi2(T[2], 1); + mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]); + copy_eltfp25519_1w(T[2], T[0]); + sqrn_eltfp25519_1w_bmi2(T[2], 5); + mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]); + copy_eltfp25519_1w(T[2], T[0]); + sqrn_eltfp25519_1w_bmi2(T[2], 10); + mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]); + copy_eltfp25519_1w(T[3], T[2]); + sqrn_eltfp25519_1w_bmi2(T[3], 20); + mul_eltfp25519_1w_bmi2(T[3], T[3], T[2]); + sqrn_eltfp25519_1w_bmi2(T[3], 10); + mul_eltfp25519_1w_bmi2(T[3], T[3], T[0]); + copy_eltfp25519_1w(T[0], T[3]); + sqrn_eltfp25519_1w_bmi2(T[0], 50); + mul_eltfp25519_1w_bmi2(T[0], T[0], T[3]); + copy_eltfp25519_1w(T[2], T[0]); + sqrn_eltfp25519_1w_bmi2(T[2], 100); + mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]); + sqrn_eltfp25519_1w_bmi2(T[2], 50); + mul_eltfp25519_1w_bmi2(T[2], T[2], T[3]); + sqrn_eltfp25519_1w_bmi2(T[2], 5); + mul_eltfp25519_1w_bmi2(T[1], T[1], T[2]); + + memzero_explicit(&m, sizeof(m)); +} + +/* Given c, a 256-bit number, fred_eltfp25519_1w updates c + * with a number such that 0 <= C < 2**255-19. + */ +static __always_inline void fred_eltfp25519_1w(u64 *const c) +{ + u64 tmp0 = 38, tmp1 = 19; + asm volatile( + "btrq $63, %3 ;" /* Put bit 255 in carry flag and clear */ + "cmovncl %k5, %k4 ;" /* c[255] ? 38 : 19 */ + + /* Add either 19 or 38 to c */ + "addq %4, %0 ;" + "adcq $0, %1 ;" + "adcq $0, %2 ;" + "adcq $0, %3 ;" + + /* Test for bit 255 again; only triggered on overflow modulo 2^255-19 */ + "movl $0, %k4 ;" + "cmovnsl %k5, %k4 ;" /* c[255] ? 0 : 19 */ + "btrq $63, %3 ;" /* Clear bit 255 */ + + /* Subtract 19 if necessary */ + "subq %4, %0 ;" + "sbbq $0, %1 ;" + "sbbq $0, %2 ;" + "sbbq $0, %3 ;" + + : "+r"(c[0]), "+r"(c[1]), "+r"(c[2]), "+r"(c[3]), "+r"(tmp0), + "+r"(tmp1) + : + : "memory", "cc"); +} + +static __always_inline void cswap(u8 bit, u64 *const px, u64 *const py) +{ + u64 temp; + asm volatile( + "test %9, %9 ;" + "movq %0, %8 ;" + "cmovnzq %4, %0 ;" + "cmovnzq %8, %4 ;" + "movq %1, %8 ;" + "cmovnzq %5, %1 ;" + "cmovnzq %8, %5 ;" + "movq %2, %8 ;" + "cmovnzq %6, %2 ;" + "cmovnzq %8, %6 ;" + "movq %3, %8 ;" + "cmovnzq %7, %3 ;" + "cmovnzq %8, %7 ;" + : "+r"(px[0]), "+r"(px[1]), "+r"(px[2]), "+r"(px[3]), + "+r"(py[0]), "+r"(py[1]), "+r"(py[2]), "+r"(py[3]), + "=r"(temp) + : "r"(bit) + : "cc" + ); +} + +static __always_inline void cselect(u8 bit, u64 *const px, const u64 *const py) +{ + asm volatile( + "test %4, %4 ;" + "cmovnzq %5, %0 ;" + "cmovnzq %6, %1 ;" + "cmovnzq %7, %2 ;" + "cmovnzq %8, %3 ;" + : "+r"(px[0]), "+r"(px[1]), "+r"(px[2]), "+r"(px[3]) + : "r"(bit), "rm"(py[0]), "rm"(py[1]), "rm"(py[2]), "rm"(py[3]) + : "cc" + ); +} + +static void curve25519_adx(u8 shared[CURVE25519_KEY_SIZE], + const u8 private_key[CURVE25519_KEY_SIZE], + const u8 session_key[CURVE25519_KEY_SIZE]) +{ + struct { + u64 buffer[4 * NUM_WORDS_ELTFP25519]; + u64 coordinates[4 * NUM_WORDS_ELTFP25519]; + u64 workspace[6 * NUM_WORDS_ELTFP25519]; + u8 session[CURVE25519_KEY_SIZE]; + u8 private[CURVE25519_KEY_SIZE]; + } __aligned(32) m; + + int i = 0, j = 0; + u64 prev = 0; + u64 *const X1 = (u64 *)m.session; + u64 *const key = (u64 *)m.private; + u64 *const Px = m.coordinates + 0; + u64 *const Pz = m.coordinates + 4; + u64 *const Qx = m.coordinates + 8; + u64 *const Qz = m.coordinates + 12; + u64 *const X2 = Qx; + u64 *const Z2 = Qz; + u64 *const X3 = Px; + u64 *const Z3 = Pz; + u64 *const X2Z2 = Qx; + u64 *const X3Z3 = Px; + + u64 *const A = m.workspace + 0; + u64 *const B = m.workspace + 4; + u64 *const D = m.workspace + 8; + u64 *const C = m.workspace + 12; + u64 *const DA = m.workspace + 16; + u64 *const CB = m.workspace + 20; + u64 *const AB = A; + u64 *const DC = D; + u64 *const DACB = DA; + + memcpy(m.private, private_key, sizeof(m.private)); + memcpy(m.session, session_key, sizeof(m.session)); + + curve25519_clamp_secret(m.private); + + /* As in the draft: + * When receiving such an array, implementations of curve25519 + * MUST mask the most-significant bit in the final byte. This + * is done to preserve compatibility with point formats which + * reserve the sign bit for use in other protocols and to + * increase resistance to implementation fingerprinting + */ + m.session[CURVE25519_KEY_SIZE - 1] &= (1 << (255 % 8)) - 1; + + copy_eltfp25519_1w(Px, X1); + setzero_eltfp25519_1w(Pz); + setzero_eltfp25519_1w(Qx); + setzero_eltfp25519_1w(Qz); + + Pz[0] = 1; + Qx[0] = 1; + + /* main-loop */ + prev = 0; + j = 62; + for (i = 3; i >= 0; --i) { + while (j >= 0) { + u64 bit = (key[i] >> j) & 0x1; + u64 swap = bit ^ prev; + prev = bit; + + add_eltfp25519_1w_adx(A, X2, Z2); /* A = (X2+Z2) */ + sub_eltfp25519_1w(B, X2, Z2); /* B = (X2-Z2) */ + add_eltfp25519_1w_adx(C, X3, Z3); /* C = (X3+Z3) */ + sub_eltfp25519_1w(D, X3, Z3); /* D = (X3-Z3) */ + mul_eltfp25519_2w_adx(DACB, AB, DC); /* [DA|CB] = [A|B]*[D|C] */ + + cselect(swap, A, C); + cselect(swap, B, D); + + sqr_eltfp25519_2w_adx(AB); /* [AA|BB] = [A^2|B^2] */ + add_eltfp25519_1w_adx(X3, DA, CB); /* X3 = (DA+CB) */ + sub_eltfp25519_1w(Z3, DA, CB); /* Z3 = (DA-CB) */ + sqr_eltfp25519_2w_adx(X3Z3); /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */ + + copy_eltfp25519_1w(X2, B); /* X2 = B^2 */ + sub_eltfp25519_1w(Z2, A, B); /* Z2 = E = AA-BB */ + + mul_a24_eltfp25519_1w(B, Z2); /* B = a24*E */ + add_eltfp25519_1w_adx(B, B, X2); /* B = a24*E+B */ + mul_eltfp25519_2w_adx(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */ + mul_eltfp25519_1w_adx(Z3, Z3, X1); /* Z3 = Z3*X1 */ + --j; + } + j = 63; + } + + inv_eltfp25519_1w_adx(A, Qz); + mul_eltfp25519_1w_adx((u64 *)shared, Qx, A); + fred_eltfp25519_1w((u64 *)shared); + + memzero_explicit(&m, sizeof(m)); +} + +static void curve25519_adx_base(u8 session_key[CURVE25519_KEY_SIZE], + const u8 private_key[CURVE25519_KEY_SIZE]) +{ + struct { + u64 buffer[4 * NUM_WORDS_ELTFP25519]; + u64 coordinates[4 * NUM_WORDS_ELTFP25519]; + u64 workspace[4 * NUM_WORDS_ELTFP25519]; + u8 private[CURVE25519_KEY_SIZE]; + } __aligned(32) m; + + const int ite[4] = { 64, 64, 64, 63 }; + const int q = 3; + u64 swap = 1; + + int i = 0, j = 0, k = 0; + u64 *const key = (u64 *)m.private; + u64 *const Ur1 = m.coordinates + 0; + u64 *const Zr1 = m.coordinates + 4; + u64 *const Ur2 = m.coordinates + 8; + u64 *const Zr2 = m.coordinates + 12; + + u64 *const UZr1 = m.coordinates + 0; + u64 *const ZUr2 = m.coordinates + 8; + + u64 *const A = m.workspace + 0; + u64 *const B = m.workspace + 4; + u64 *const C = m.workspace + 8; + u64 *const D = m.workspace + 12; + + u64 *const AB = m.workspace + 0; + u64 *const CD = m.workspace + 8; + + const u64 *const P = table_ladder_8k; + + memcpy(m.private, private_key, sizeof(m.private)); + + curve25519_clamp_secret(m.private); + + setzero_eltfp25519_1w(Ur1); + setzero_eltfp25519_1w(Zr1); + setzero_eltfp25519_1w(Zr2); + Ur1[0] = 1; + Zr1[0] = 1; + Zr2[0] = 1; + + /* G-S */ + Ur2[3] = 0x1eaecdeee27cab34UL; + Ur2[2] = 0xadc7a0b9235d48e2UL; + Ur2[1] = 0xbbf095ae14b2edf8UL; + Ur2[0] = 0x7e94e1fec82faabdUL; + + /* main-loop */ + j = q; + for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) { + while (j < ite[i]) { + u64 bit = (key[i] >> j) & 0x1; + k = (64 * i + j - q); + swap = swap ^ bit; + cswap(swap, Ur1, Ur2); + cswap(swap, Zr1, Zr2); + swap = bit; + /* Addition */ + sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */ + add_eltfp25519_1w_adx(A, Ur1, Zr1); /* A = Ur1+Zr1 */ + mul_eltfp25519_1w_adx(C, &P[4 * k], B); /* C = M0-B */ + sub_eltfp25519_1w(B, A, C); /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */ + add_eltfp25519_1w_adx(A, A, C); /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */ + sqr_eltfp25519_2w_adx(AB); /* A = A^2 | B = B^2 */ + mul_eltfp25519_2w_adx(UZr1, ZUr2, AB); /* Ur1 = Zr2*A | Zr1 = Ur2*B */ + ++j; + } + j = 0; + } + + /* Doubling */ + for (i = 0; i < q; ++i) { + add_eltfp25519_1w_adx(A, Ur1, Zr1); /* A = Ur1+Zr1 */ + sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */ + sqr_eltfp25519_2w_adx(AB); /* A = A**2 B = B**2 */ + copy_eltfp25519_1w(C, B); /* C = B */ + sub_eltfp25519_1w(B, A, B); /* B = A-B */ + mul_a24_eltfp25519_1w(D, B); /* D = my_a24*B */ + add_eltfp25519_1w_adx(D, D, C); /* D = D+C */ + mul_eltfp25519_2w_adx(UZr1, AB, CD); /* Ur1 = A*B Zr1 = Zr1*A */ + } + + /* Convert to affine coordinates */ + inv_eltfp25519_1w_adx(A, Zr1); + mul_eltfp25519_1w_adx((u64 *)session_key, Ur1, A); + fred_eltfp25519_1w((u64 *)session_key); + + memzero_explicit(&m, sizeof(m)); +} + +static void curve25519_bmi2(u8 shared[CURVE25519_KEY_SIZE], + const u8 private_key[CURVE25519_KEY_SIZE], + const u8 session_key[CURVE25519_KEY_SIZE]) +{ + struct { + u64 buffer[4 * NUM_WORDS_ELTFP25519]; + u64 coordinates[4 * NUM_WORDS_ELTFP25519]; + u64 workspace[6 * NUM_WORDS_ELTFP25519]; + u8 session[CURVE25519_KEY_SIZE]; + u8 private[CURVE25519_KEY_SIZE]; + } __aligned(32) m; + + int i = 0, j = 0; + u64 prev = 0; + u64 *const X1 = (u64 *)m.session; + u64 *const key = (u64 *)m.private; + u64 *const Px = m.coordinates + 0; + u64 *const Pz = m.coordinates + 4; + u64 *const Qx = m.coordinates + 8; + u64 *const Qz = m.coordinates + 12; + u64 *const X2 = Qx; + u64 *const Z2 = Qz; + u64 *const X3 = Px; + u64 *const Z3 = Pz; + u64 *const X2Z2 = Qx; + u64 *const X3Z3 = Px; + + u64 *const A = m.workspace + 0; + u64 *const B = m.workspace + 4; + u64 *const D = m.workspace + 8; + u64 *const C = m.workspace + 12; + u64 *const DA = m.workspace + 16; + u64 *const CB = m.workspace + 20; + u64 *const AB = A; + u64 *const DC = D; + u64 *const DACB = DA; + + memcpy(m.private, private_key, sizeof(m.private)); + memcpy(m.session, session_key, sizeof(m.session)); + + curve25519_clamp_secret(m.private); + + /* As in the draft: + * When receiving such an array, implementations of curve25519 + * MUST mask the most-significant bit in the final byte. This + * is done to preserve compatibility with point formats which + * reserve the sign bit for use in other protocols and to + * increase resistance to implementation fingerprinting + */ + m.session[CURVE25519_KEY_SIZE - 1] &= (1 << (255 % 8)) - 1; + + copy_eltfp25519_1w(Px, X1); + setzero_eltfp25519_1w(Pz); + setzero_eltfp25519_1w(Qx); + setzero_eltfp25519_1w(Qz); + + Pz[0] = 1; + Qx[0] = 1; + + /* main-loop */ + prev = 0; + j = 62; + for (i = 3; i >= 0; --i) { + while (j >= 0) { + u64 bit = (key[i] >> j) & 0x1; + u64 swap = bit ^ prev; + prev = bit; + + add_eltfp25519_1w_bmi2(A, X2, Z2); /* A = (X2+Z2) */ + sub_eltfp25519_1w(B, X2, Z2); /* B = (X2-Z2) */ + add_eltfp25519_1w_bmi2(C, X3, Z3); /* C = (X3+Z3) */ + sub_eltfp25519_1w(D, X3, Z3); /* D = (X3-Z3) */ + mul_eltfp25519_2w_bmi2(DACB, AB, DC); /* [DA|CB] = [A|B]*[D|C] */ + + cselect(swap, A, C); + cselect(swap, B, D); + + sqr_eltfp25519_2w_bmi2(AB); /* [AA|BB] = [A^2|B^2] */ + add_eltfp25519_1w_bmi2(X3, DA, CB); /* X3 = (DA+CB) */ + sub_eltfp25519_1w(Z3, DA, CB); /* Z3 = (DA-CB) */ + sqr_eltfp25519_2w_bmi2(X3Z3); /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */ + + copy_eltfp25519_1w(X2, B); /* X2 = B^2 */ + sub_eltfp25519_1w(Z2, A, B); /* Z2 = E = AA-BB */ + + mul_a24_eltfp25519_1w(B, Z2); /* B = a24*E */ + add_eltfp25519_1w_bmi2(B, B, X2); /* B = a24*E+B */ + mul_eltfp25519_2w_bmi2(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */ + mul_eltfp25519_1w_bmi2(Z3, Z3, X1); /* Z3 = Z3*X1 */ + --j; + } + j = 63; + } + + inv_eltfp25519_1w_bmi2(A, Qz); + mul_eltfp25519_1w_bmi2((u64 *)shared, Qx, A); + fred_eltfp25519_1w((u64 *)shared); + + memzero_explicit(&m, sizeof(m)); +} + +static void curve25519_bmi2_base(u8 session_key[CURVE25519_KEY_SIZE], + const u8 private_key[CURVE25519_KEY_SIZE]) +{ + struct { + u64 buffer[4 * NUM_WORDS_ELTFP25519]; + u64 coordinates[4 * NUM_WORDS_ELTFP25519]; + u64 workspace[4 * NUM_WORDS_ELTFP25519]; + u8 private[CURVE25519_KEY_SIZE]; + } __aligned(32) m; + + const int ite[4] = { 64, 64, 64, 63 }; + const int q = 3; + u64 swap = 1; + + int i = 0, j = 0, k = 0; + u64 *const key = (u64 *)m.private; + u64 *const Ur1 = m.coordinates + 0; + u64 *const Zr1 = m.coordinates + 4; + u64 *const Ur2 = m.coordinates + 8; + u64 *const Zr2 = m.coordinates + 12; + + u64 *const UZr1 = m.coordinates + 0; + u64 *const ZUr2 = m.coordinates + 8; + + u64 *const A = m.workspace + 0; + u64 *const B = m.workspace + 4; + u64 *const C = m.workspace + 8; + u64 *const D = m.workspace + 12; + + u64 *const AB = m.workspace + 0; + u64 *const CD = m.workspace + 8; + + const u64 *const P = table_ladder_8k; + + memcpy(m.private, private_key, sizeof(m.private)); + + curve25519_clamp_secret(m.private); + + setzero_eltfp25519_1w(Ur1); + setzero_eltfp25519_1w(Zr1); + setzero_eltfp25519_1w(Zr2); + Ur1[0] = 1; + Zr1[0] = 1; + Zr2[0] = 1; + + /* G-S */ + Ur2[3] = 0x1eaecdeee27cab34UL; + Ur2[2] = 0xadc7a0b9235d48e2UL; + Ur2[1] = 0xbbf095ae14b2edf8UL; + Ur2[0] = 0x7e94e1fec82faabdUL; + + /* main-loop */ + j = q; + for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) { + while (j < ite[i]) { + u64 bit = (key[i] >> j) & 0x1; + k = (64 * i + j - q); + swap = swap ^ bit; + cswap(swap, Ur1, Ur2); + cswap(swap, Zr1, Zr2); + swap = bit; + /* Addition */ + sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */ + add_eltfp25519_1w_bmi2(A, Ur1, Zr1); /* A = Ur1+Zr1 */ + mul_eltfp25519_1w_bmi2(C, &P[4 * k], B);/* C = M0-B */ + sub_eltfp25519_1w(B, A, C); /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */ + add_eltfp25519_1w_bmi2(A, A, C); /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */ + sqr_eltfp25519_2w_bmi2(AB); /* A = A^2 | B = B^2 */ + mul_eltfp25519_2w_bmi2(UZr1, ZUr2, AB); /* Ur1 = Zr2*A | Zr1 = Ur2*B */ + ++j; + } + j = 0; + } + + /* Doubling */ + for (i = 0; i < q; ++i) { + add_eltfp25519_1w_bmi2(A, Ur1, Zr1); /* A = Ur1+Zr1 */ + sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */ + sqr_eltfp25519_2w_bmi2(AB); /* A = A**2 B = B**2 */ + copy_eltfp25519_1w(C, B); /* C = B */ + sub_eltfp25519_1w(B, A, B); /* B = A-B */ + mul_a24_eltfp25519_1w(D, B); /* D = my_a24*B */ + add_eltfp25519_1w_bmi2(D, D, C); /* D = D+C */ + mul_eltfp25519_2w_bmi2(UZr1, AB, CD); /* Ur1 = A*B Zr1 = Zr1*A */ + } + + /* Convert to affine coordinates */ + inv_eltfp25519_1w_bmi2(A, Zr1); + mul_eltfp25519_1w_bmi2((u64 *)session_key, Ur1, A); + fred_eltfp25519_1w((u64 *)session_key); + + memzero_explicit(&m, sizeof(m)); +} + +void curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE], + const u8 basepoint[CURVE25519_KEY_SIZE]) +{ + if (static_branch_likely(&curve25519_use_adx)) + curve25519_adx(mypublic, secret, basepoint); + else if (static_branch_likely(&curve25519_use_bmi2)) + curve25519_bmi2(mypublic, secret, basepoint); + else + curve25519_generic(mypublic, secret, basepoint); +} +EXPORT_SYMBOL(curve25519_arch); + +void curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE], + const u8 secret[CURVE25519_KEY_SIZE]) +{ + if (static_branch_likely(&curve25519_use_adx)) + curve25519_adx_base(pub, secret); + else if (static_branch_likely(&curve25519_use_bmi2)) + curve25519_bmi2_base(pub, secret); + else + curve25519_generic(pub, secret, curve25519_base_point); +} +EXPORT_SYMBOL(curve25519_base_arch); + +static int curve25519_set_secret(struct crypto_kpp *tfm, const void *buf, + unsigned int len) +{ + u8 *secret = kpp_tfm_ctx(tfm); + + if (!len) + curve25519_generate_secret(secret); + else if (len == CURVE25519_KEY_SIZE && + crypto_memneq(buf, curve25519_null_point, CURVE25519_KEY_SIZE)) + memcpy(secret, buf, CURVE25519_KEY_SIZE); + else + return -EINVAL; + return 0; +} + +static int curve25519_generate_public_key(struct kpp_request *req) +{ + struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); + const u8 *secret = kpp_tfm_ctx(tfm); + u8 buf[CURVE25519_KEY_SIZE]; + int copied, nbytes; + + if (req->src) + return -EINVAL; + + curve25519_base_arch(buf, secret); + + /* might want less than we've got */ + nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len); + copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst, + nbytes), + buf, nbytes); + if (copied != nbytes) + return -EINVAL; + return 0; +} + +static int curve25519_compute_shared_secret(struct kpp_request *req) +{ + struct crypto_kpp *tfm = crypto_kpp_reqtfm(req); + const u8 *secret = kpp_tfm_ctx(tfm); + u8 public_key[CURVE25519_KEY_SIZE]; + u8 buf[CURVE25519_KEY_SIZE]; + int copied, nbytes; + + if (!req->src) + return -EINVAL; + + copied = sg_copy_to_buffer(req->src, + sg_nents_for_len(req->src, + CURVE25519_KEY_SIZE), + public_key, CURVE25519_KEY_SIZE); + if (copied != CURVE25519_KEY_SIZE) + return -EINVAL; + + curve25519_arch(buf, secret, public_key); + + /* might want less than we've got */ + nbytes = min_t(size_t, CURVE25519_KEY_SIZE, req->dst_len); + copied = sg_copy_from_buffer(req->dst, sg_nents_for_len(req->dst, + nbytes), + buf, nbytes); + if (copied != nbytes) + return -EINVAL; + return 0; +} + +static unsigned int curve25519_max_size(struct crypto_kpp *tfm) +{ + return CURVE25519_KEY_SIZE; +} + +static struct kpp_alg curve25519_alg = { + .base.cra_name = "curve25519", + .base.cra_driver_name = "curve25519-x86", + .base.cra_priority = 200, + .base.cra_module = THIS_MODULE, + .base.cra_ctxsize = CURVE25519_KEY_SIZE, + + .set_secret = curve25519_set_secret, + .generate_public_key = curve25519_generate_public_key, + .compute_shared_secret = curve25519_compute_shared_secret, + .max_size = curve25519_max_size, +}; + +static int __init curve25519_mod_init(void) +{ + if (boot_cpu_has(X86_FEATURE_BMI2)) + static_branch_enable(&curve25519_use_bmi2); + else if (boot_cpu_has(X86_FEATURE_ADX)) + static_branch_enable(&curve25519_use_adx); + else + return 0; + return crypto_register_kpp(&curve25519_alg); +} + +static void __exit curve25519_mod_exit(void) +{ + if (boot_cpu_has(X86_FEATURE_BMI2) || + boot_cpu_has(X86_FEATURE_ADX)) + crypto_unregister_kpp(&curve25519_alg); +} + +module_init(curve25519_mod_init); +module_exit(curve25519_mod_exit); + +MODULE_ALIAS_CRYPTO("curve25519"); +MODULE_ALIAS_CRYPTO("curve25519-x86"); +MODULE_LICENSE("GPL v2"); diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index 4a1c05dce950..370cd88068ec 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -7,47 +7,23 @@ #include <crypto/algapi.h> #include <crypto/internal/hash.h> +#include <crypto/internal/poly1305.h> #include <crypto/internal/simd.h> -#include <crypto/poly1305.h> #include <linux/crypto.h> +#include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/module.h> #include <asm/simd.h> -struct poly1305_simd_desc_ctx { - struct poly1305_desc_ctx base; - /* derived key u set? */ - bool uset; -#ifdef CONFIG_AS_AVX2 - /* derived keys r^3, r^4 set? */ - bool wset; -#endif - /* derived Poly1305 key r^2 */ - u32 u[5]; - /* ... silently appended r^3 and r^4 when using AVX2 */ -}; - asmlinkage void poly1305_block_sse2(u32 *h, const u8 *src, const u32 *r, unsigned int blocks); asmlinkage void poly1305_2block_sse2(u32 *h, const u8 *src, const u32 *r, unsigned int blocks, const u32 *u); -#ifdef CONFIG_AS_AVX2 asmlinkage void poly1305_4block_avx2(u32 *h, const u8 *src, const u32 *r, unsigned int blocks, const u32 *u); -static bool poly1305_use_avx2; -#endif - -static int poly1305_simd_init(struct shash_desc *desc) -{ - struct poly1305_simd_desc_ctx *sctx = shash_desc_ctx(desc); - sctx->uset = false; -#ifdef CONFIG_AS_AVX2 - sctx->wset = false; -#endif - - return crypto_poly1305_init(desc); -} +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_simd); +static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2); static void poly1305_simd_mult(u32 *a, const u32 *b) { @@ -60,72 +36,85 @@ static void poly1305_simd_mult(u32 *a, const u32 *b) poly1305_block_sse2(a, m, b, 1); } +static unsigned int poly1305_scalar_blocks(struct poly1305_desc_ctx *dctx, + const u8 *src, unsigned int srclen) +{ + unsigned int datalen; + + if (unlikely(!dctx->sset)) { + datalen = crypto_poly1305_setdesckey(dctx, src, srclen); + src += srclen - datalen; + srclen = datalen; + } + if (srclen >= POLY1305_BLOCK_SIZE) { + poly1305_core_blocks(&dctx->h, dctx->r, src, + srclen / POLY1305_BLOCK_SIZE, 1); + srclen %= POLY1305_BLOCK_SIZE; + } + return srclen; +} + static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx, const u8 *src, unsigned int srclen) { - struct poly1305_simd_desc_ctx *sctx; unsigned int blocks, datalen; - BUILD_BUG_ON(offsetof(struct poly1305_simd_desc_ctx, base)); - sctx = container_of(dctx, struct poly1305_simd_desc_ctx, base); - if (unlikely(!dctx->sset)) { datalen = crypto_poly1305_setdesckey(dctx, src, srclen); src += srclen - datalen; srclen = datalen; } -#ifdef CONFIG_AS_AVX2 - if (poly1305_use_avx2 && srclen >= POLY1305_BLOCK_SIZE * 4) { - if (unlikely(!sctx->wset)) { - if (!sctx->uset) { - memcpy(sctx->u, dctx->r.r, sizeof(sctx->u)); - poly1305_simd_mult(sctx->u, dctx->r.r); - sctx->uset = true; + if (IS_ENABLED(CONFIG_AS_AVX2) && + static_branch_likely(&poly1305_use_avx2) && + srclen >= POLY1305_BLOCK_SIZE * 4) { + if (unlikely(dctx->rset < 4)) { + if (dctx->rset < 2) { + dctx->r[1] = dctx->r[0]; + poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r); } - memcpy(sctx->u + 5, sctx->u, sizeof(sctx->u)); - poly1305_simd_mult(sctx->u + 5, dctx->r.r); - memcpy(sctx->u + 10, sctx->u + 5, sizeof(sctx->u)); - poly1305_simd_mult(sctx->u + 10, dctx->r.r); - sctx->wset = true; + dctx->r[2] = dctx->r[1]; + poly1305_simd_mult(dctx->r[2].r, dctx->r[0].r); + dctx->r[3] = dctx->r[2]; + poly1305_simd_mult(dctx->r[3].r, dctx->r[0].r); + dctx->rset = 4; } blocks = srclen / (POLY1305_BLOCK_SIZE * 4); - poly1305_4block_avx2(dctx->h.h, src, dctx->r.r, blocks, - sctx->u); + poly1305_4block_avx2(dctx->h.h, src, dctx->r[0].r, blocks, + dctx->r[1].r); src += POLY1305_BLOCK_SIZE * 4 * blocks; srclen -= POLY1305_BLOCK_SIZE * 4 * blocks; } -#endif + if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) { - if (unlikely(!sctx->uset)) { - memcpy(sctx->u, dctx->r.r, sizeof(sctx->u)); - poly1305_simd_mult(sctx->u, dctx->r.r); - sctx->uset = true; + if (unlikely(dctx->rset < 2)) { + dctx->r[1] = dctx->r[0]; + poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r); + dctx->rset = 2; } blocks = srclen / (POLY1305_BLOCK_SIZE * 2); - poly1305_2block_sse2(dctx->h.h, src, dctx->r.r, blocks, - sctx->u); + poly1305_2block_sse2(dctx->h.h, src, dctx->r[0].r, + blocks, dctx->r[1].r); src += POLY1305_BLOCK_SIZE * 2 * blocks; srclen -= POLY1305_BLOCK_SIZE * 2 * blocks; } if (srclen >= POLY1305_BLOCK_SIZE) { - poly1305_block_sse2(dctx->h.h, src, dctx->r.r, 1); + poly1305_block_sse2(dctx->h.h, src, dctx->r[0].r, 1); srclen -= POLY1305_BLOCK_SIZE; } return srclen; } -static int poly1305_simd_update(struct shash_desc *desc, - const u8 *src, unsigned int srclen) +void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key) { - struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); - unsigned int bytes; - - /* kernel_fpu_begin/end is costly, use fallback for small updates */ - if (srclen <= 288 || !crypto_simd_usable()) - return crypto_poly1305_update(desc, src, srclen); + poly1305_init_generic(desc, key); +} +EXPORT_SYMBOL(poly1305_init_arch); - kernel_fpu_begin(); +void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src, + unsigned int srclen) +{ + unsigned int bytes; if (unlikely(dctx->buflen)) { bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen); @@ -135,34 +124,84 @@ static int poly1305_simd_update(struct shash_desc *desc, dctx->buflen += bytes; if (dctx->buflen == POLY1305_BLOCK_SIZE) { - poly1305_simd_blocks(dctx, dctx->buf, - POLY1305_BLOCK_SIZE); + if (static_branch_likely(&poly1305_use_simd) && + likely(crypto_simd_usable())) { + kernel_fpu_begin(); + poly1305_simd_blocks(dctx, dctx->buf, + POLY1305_BLOCK_SIZE); + kernel_fpu_end(); + } else { + poly1305_scalar_blocks(dctx, dctx->buf, + POLY1305_BLOCK_SIZE); + } dctx->buflen = 0; } } if (likely(srclen >= POLY1305_BLOCK_SIZE)) { - bytes = poly1305_simd_blocks(dctx, src, srclen); + if (static_branch_likely(&poly1305_use_simd) && + likely(crypto_simd_usable())) { + kernel_fpu_begin(); + bytes = poly1305_simd_blocks(dctx, src, srclen); + kernel_fpu_end(); + } else { + bytes = poly1305_scalar_blocks(dctx, src, srclen); + } src += srclen - bytes; srclen = bytes; } - kernel_fpu_end(); - if (unlikely(srclen)) { dctx->buflen = srclen; memcpy(dctx->buf, src, srclen); } +} +EXPORT_SYMBOL(poly1305_update_arch); + +void poly1305_final_arch(struct poly1305_desc_ctx *desc, u8 *digest) +{ + poly1305_final_generic(desc, digest); +} +EXPORT_SYMBOL(poly1305_final_arch); + +static int crypto_poly1305_init(struct shash_desc *desc) +{ + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); + + poly1305_core_init(&dctx->h); + dctx->buflen = 0; + dctx->rset = 0; + dctx->sset = false; + + return 0; +} + +static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst) +{ + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); + + if (unlikely(!dctx->sset)) + return -ENOKEY; + + poly1305_final_generic(dctx, dst); + return 0; +} + +static int poly1305_simd_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc); + poly1305_update_arch(dctx, src, srclen); return 0; } static struct shash_alg alg = { .digestsize = POLY1305_DIGEST_SIZE, - .init = poly1305_simd_init, + .init = crypto_poly1305_init, .update = poly1305_simd_update, .final = crypto_poly1305_final, - .descsize = sizeof(struct poly1305_simd_desc_ctx), + .descsize = sizeof(struct poly1305_desc_ctx), .base = { .cra_name = "poly1305", .cra_driver_name = "poly1305-simd", @@ -175,16 +214,16 @@ static struct shash_alg alg = { static int __init poly1305_simd_mod_init(void) { if (!boot_cpu_has(X86_FEATURE_XMM2)) - return -ENODEV; - -#ifdef CONFIG_AS_AVX2 - poly1305_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) && - boot_cpu_has(X86_FEATURE_AVX2) && - cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL); - alg.descsize = sizeof(struct poly1305_simd_desc_ctx); - if (poly1305_use_avx2) - alg.descsize += 10 * sizeof(u32); -#endif + return 0; + + static_branch_enable(&poly1305_use_simd); + + if (IS_ENABLED(CONFIG_AS_AVX2) && + boot_cpu_has(X86_FEATURE_AVX) && + boot_cpu_has(X86_FEATURE_AVX2) && + cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) + static_branch_enable(&poly1305_use_avx2); + return crypto_register_shash(&alg); } diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 4bbcc5e64969..09fe5606a118 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -172,7 +172,7 @@ ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI .if \no_user_check == 0 /* coming from usermode? */ - testl $SEGMENT_RPL_MASK, PT_CS(%esp) + testl $USER_SEGMENT_RPL_MASK, PT_CS(%esp) jz .Lend_\@ .endif /* On user-cr3? */ @@ -205,64 +205,76 @@ #define CS_FROM_ENTRY_STACK (1 << 31) #define CS_FROM_USER_CR3 (1 << 30) #define CS_FROM_KERNEL (1 << 29) +#define CS_FROM_ESPFIX (1 << 28) .macro FIXUP_FRAME /* * The high bits of the CS dword (__csh) are used for CS_FROM_*. * Clear them in case hardware didn't do this for us. */ - andl $0x0000ffff, 3*4(%esp) + andl $0x0000ffff, 4*4(%esp) #ifdef CONFIG_VM86 - testl $X86_EFLAGS_VM, 4*4(%esp) + testl $X86_EFLAGS_VM, 5*4(%esp) jnz .Lfrom_usermode_no_fixup_\@ #endif - testl $SEGMENT_RPL_MASK, 3*4(%esp) + testl $USER_SEGMENT_RPL_MASK, 4*4(%esp) jnz .Lfrom_usermode_no_fixup_\@ - orl $CS_FROM_KERNEL, 3*4(%esp) + orl $CS_FROM_KERNEL, 4*4(%esp) /* * When we're here from kernel mode; the (exception) stack looks like: * - * 5*4(%esp) - <previous context> - * 4*4(%esp) - flags - * 3*4(%esp) - cs - * 2*4(%esp) - ip - * 1*4(%esp) - orig_eax - * 0*4(%esp) - gs / function + * 6*4(%esp) - <previous context> + * 5*4(%esp) - flags + * 4*4(%esp) - cs + * 3*4(%esp) - ip + * 2*4(%esp) - orig_eax + * 1*4(%esp) - gs / function + * 0*4(%esp) - fs * * Lets build a 5 entry IRET frame after that, such that struct pt_regs * is complete and in particular regs->sp is correct. This gives us - * the original 5 enties as gap: + * the original 6 enties as gap: * - * 12*4(%esp) - <previous context> - * 11*4(%esp) - gap / flags - * 10*4(%esp) - gap / cs - * 9*4(%esp) - gap / ip - * 8*4(%esp) - gap / orig_eax - * 7*4(%esp) - gap / gs / function - * 6*4(%esp) - ss - * 5*4(%esp) - sp - * 4*4(%esp) - flags - * 3*4(%esp) - cs - * 2*4(%esp) - ip - * 1*4(%esp) - orig_eax - * 0*4(%esp) - gs / function + * 14*4(%esp) - <previous context> + * 13*4(%esp) - gap / flags + * 12*4(%esp) - gap / cs + * 11*4(%esp) - gap / ip + * 10*4(%esp) - gap / orig_eax + * 9*4(%esp) - gap / gs / function + * 8*4(%esp) - gap / fs + * 7*4(%esp) - ss + * 6*4(%esp) - sp + * 5*4(%esp) - flags + * 4*4(%esp) - cs + * 3*4(%esp) - ip + * 2*4(%esp) - orig_eax + * 1*4(%esp) - gs / function + * 0*4(%esp) - fs */ pushl %ss # ss pushl %esp # sp (points at ss) - addl $6*4, (%esp) # point sp back at the previous context - pushl 6*4(%esp) # flags - pushl 6*4(%esp) # cs - pushl 6*4(%esp) # ip - pushl 6*4(%esp) # orig_eax - pushl 6*4(%esp) # gs / function + addl $7*4, (%esp) # point sp back at the previous context + pushl 7*4(%esp) # flags + pushl 7*4(%esp) # cs + pushl 7*4(%esp) # ip + pushl 7*4(%esp) # orig_eax + pushl 7*4(%esp) # gs / function + pushl 7*4(%esp) # fs .Lfrom_usermode_no_fixup_\@: .endm .macro IRET_FRAME + /* + * We're called with %ds, %es, %fs, and %gs from the interrupted + * frame, so we shouldn't use them. Also, we may be in ESPFIX + * mode and therefore have a nonzero SS base and an offset ESP, + * so any attempt to access the stack needs to use SS. (except for + * accesses through %esp, which automatically use SS.) + */ testl $CS_FROM_KERNEL, 1*4(%esp) jz .Lfinished_frame_\@ @@ -276,31 +288,40 @@ movl 5*4(%esp), %eax # (modified) regs->sp movl 4*4(%esp), %ecx # flags - movl %ecx, -4(%eax) + movl %ecx, %ss:-1*4(%eax) movl 3*4(%esp), %ecx # cs andl $0x0000ffff, %ecx - movl %ecx, -8(%eax) + movl %ecx, %ss:-2*4(%eax) movl 2*4(%esp), %ecx # ip - movl %ecx, -12(%eax) + movl %ecx, %ss:-3*4(%eax) movl 1*4(%esp), %ecx # eax - movl %ecx, -16(%eax) + movl %ecx, %ss:-4*4(%eax) popl %ecx - lea -16(%eax), %esp + lea -4*4(%eax), %esp popl %eax .Lfinished_frame_\@: .endm -.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0 +.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0 unwind_espfix=0 cld .if \skip_gs == 0 PUSH_GS .endif - FIXUP_FRAME pushl %fs + + pushl %eax + movl $(__KERNEL_PERCPU), %eax + movl %eax, %fs +.if \unwind_espfix > 0 + UNWIND_ESPFIX_STACK +.endif + popl %eax + + FIXUP_FRAME pushl %es pushl %ds pushl \pt_regs_ax @@ -313,8 +334,6 @@ movl $(__USER_DS), %edx movl %edx, %ds movl %edx, %es - movl $(__KERNEL_PERCPU), %edx - movl %edx, %fs .if \skip_gs == 0 SET_KERNEL_GS %edx .endif @@ -324,8 +343,8 @@ .endif .endm -.macro SAVE_ALL_NMI cr3_reg:req - SAVE_ALL +.macro SAVE_ALL_NMI cr3_reg:req unwind_espfix=0 + SAVE_ALL unwind_espfix=\unwind_espfix BUG_IF_WRONG_CR3 @@ -357,6 +376,7 @@ 2: popl %es 3: popl %fs POP_GS \pop + IRET_FRAME .pushsection .fixup, "ax" 4: movl $0, (%esp) jmp 1b @@ -395,7 +415,8 @@ .macro CHECK_AND_APPLY_ESPFIX #ifdef CONFIG_X86_ESPFIX32 -#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) +#define GDT_ESPFIX_OFFSET (GDT_ENTRY_ESPFIX_SS * 8) +#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + GDT_ESPFIX_OFFSET ALTERNATIVE "jmp .Lend_\@", "", X86_BUG_ESPFIX @@ -1074,7 +1095,6 @@ restore_all: /* Restore user state */ RESTORE_REGS pop=4 # skip orig_eax/error_code .Lirq_return: - IRET_FRAME /* * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization * when returning from IPI handler and when returning from @@ -1128,30 +1148,43 @@ SYM_FUNC_END(entry_INT80_32) * We can't call C functions using the ESPFIX stack. This code reads * the high word of the segment base from the GDT and swiches to the * normal stack and adjusts ESP with the matching offset. + * + * We might be on user CR3 here, so percpu data is not mapped and we can't + * access the GDT through the percpu segment. Instead, use SGDT to find + * the cpu_entry_area alias of the GDT. */ #ifdef CONFIG_X86_ESPFIX32 /* fixup the stack */ - mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ - mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ + pushl %ecx + subl $2*4, %esp + sgdt (%esp) + movl 2(%esp), %ecx /* GDT address */ + /* + * Careful: ECX is a linear pointer, so we need to force base + * zero. %cs is the only known-linear segment we have right now. + */ + mov %cs:GDT_ESPFIX_OFFSET + 4(%ecx), %al /* bits 16..23 */ + mov %cs:GDT_ESPFIX_OFFSET + 7(%ecx), %ah /* bits 24..31 */ shl $16, %eax + addl $2*4, %esp + popl %ecx addl %esp, %eax /* the adjusted stack pointer */ pushl $__KERNEL_DS pushl %eax lss (%esp), %esp /* switch to the normal stack segment */ #endif .endm + .macro UNWIND_ESPFIX_STACK + /* It's safe to clobber %eax, all other regs need to be preserved */ #ifdef CONFIG_X86_ESPFIX32 movl %ss, %eax /* see if on espfix stack */ cmpw $__ESPFIX_SS, %ax - jne 27f - movl $__KERNEL_DS, %eax - movl %eax, %ds - movl %eax, %es + jne .Lno_fixup_\@ /* switch to normal stack */ FIXUP_ESPFIX_STACK -27: +.Lno_fixup_\@: #endif .endm @@ -1341,11 +1374,6 @@ SYM_CODE_END(spurious_interrupt_bug) #ifdef CONFIG_XEN_PV SYM_FUNC_START(xen_hypervisor_callback) - pushl $-1 /* orig_ax = -1 => not a system call */ - SAVE_ALL - ENCODE_FRAME_POINTER - TRACE_IRQS_OFF - /* * Check to see if we got the event in the critical * region in xen_iret_direct, after we've reenabled @@ -1353,16 +1381,17 @@ SYM_FUNC_START(xen_hypervisor_callback) * iret instruction's behaviour where it delivers a * pending interrupt when enabling interrupts: */ - movl PT_EIP(%esp), %eax - cmpl $xen_iret_start_crit, %eax + cmpl $xen_iret_start_crit, (%esp) jb 1f - cmpl $xen_iret_end_crit, %eax + cmpl $xen_iret_end_crit, (%esp) jae 1f - - jmp xen_iret_crit_fixup - -SYM_INNER_LABEL_ALIGN(xen_do_upcall, SYM_L_GLOBAL) -1: mov %esp, %eax + call xen_iret_crit_fixup +1: + pushl $-1 /* orig_ax = -1 => not a system call */ + SAVE_ALL + ENCODE_FRAME_POINTER + TRACE_IRQS_OFF + mov %esp, %eax call xen_evtchn_do_upcall #ifndef CONFIG_PREEMPTION call xen_maybe_preempt_hcall @@ -1449,10 +1478,9 @@ SYM_CODE_END(page_fault) SYM_CODE_START_LOCAL_NOALIGN(common_exception_read_cr2) /* the function address is in %gs's slot on the stack */ - SAVE_ALL switch_stacks=1 skip_gs=1 + SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1 ENCODE_FRAME_POINTER - UNWIND_ESPFIX_STACK /* fixup %gs */ GS_TO_REG %ecx @@ -1474,9 +1502,8 @@ SYM_CODE_END(common_exception_read_cr2) SYM_CODE_START_LOCAL_NOALIGN(common_exception) /* the function address is in %gs's slot on the stack */ - SAVE_ALL switch_stacks=1 skip_gs=1 + SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1 ENCODE_FRAME_POINTER - UNWIND_ESPFIX_STACK /* fixup %gs */ GS_TO_REG %ecx @@ -1515,6 +1542,10 @@ SYM_CODE_START(nmi) ASM_CLAC #ifdef CONFIG_X86_ESPFIX32 + /* + * ESPFIX_SS is only ever set on the return to user path + * after we've switched to the entry stack. + */ pushl %eax movl %ss, %eax cmpw $__ESPFIX_SS, %ax @@ -1550,6 +1581,11 @@ SYM_CODE_START(nmi) movl %ebx, %esp .Lnmi_return: +#ifdef CONFIG_X86_ESPFIX32 + testl $CS_FROM_ESPFIX, PT_CS(%esp) + jnz .Lnmi_from_espfix +#endif + CHECK_AND_APPLY_ESPFIX RESTORE_ALL_NMI cr3_reg=%edi pop=4 jmp .Lirq_return @@ -1557,23 +1593,42 @@ SYM_CODE_START(nmi) #ifdef CONFIG_X86_ESPFIX32 .Lnmi_espfix_stack: /* - * create the pointer to lss back + * Create the pointer to LSS back */ pushl %ss pushl %esp addl $4, (%esp) - /* copy the iret frame of 12 bytes */ - .rept 3 - pushl 16(%esp) - .endr - pushl %eax - SAVE_ALL_NMI cr3_reg=%edi + + /* Copy the (short) IRET frame */ + pushl 4*4(%esp) # flags + pushl 4*4(%esp) # cs + pushl 4*4(%esp) # ip + + pushl %eax # orig_ax + + SAVE_ALL_NMI cr3_reg=%edi unwind_espfix=1 ENCODE_FRAME_POINTER - FIXUP_ESPFIX_STACK # %eax == %esp + + /* clear CS_FROM_KERNEL, set CS_FROM_ESPFIX */ + xorl $(CS_FROM_ESPFIX | CS_FROM_KERNEL), PT_CS(%esp) + xorl %edx, %edx # zero error code - call do_nmi + movl %esp, %eax # pt_regs pointer + jmp .Lnmi_from_sysenter_stack + +.Lnmi_from_espfix: RESTORE_ALL_NMI cr3_reg=%edi - lss 12+4(%esp), %esp # back to espfix stack + /* + * Because we cleared CS_FROM_KERNEL, IRET_FRAME 'forgot' to + * fix up the gap and long frame: + * + * 3 - original frame (exception) + * 2 - ESPFIX block (above) + * 6 - gap (FIXUP_FRAME) + * 5 - long frame (FIXUP_FRAME) + * 1 - orig_ax + */ + lss (1+5+6)*4(%esp), %esp # back to espfix stack jmp .Lirq_return #endif SYM_CODE_END(nmi) diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index aa3336a7cb15..7d17b3addbbb 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -10,13 +10,11 @@ #ifdef CONFIG_IA32_EMULATION /* On X86_64, we use struct pt_regs * to pass parameters to syscalls */ #define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(const struct pt_regs *); - -/* this is a lie, but it does not hurt as sys_ni_syscall just returns -EINVAL */ -extern asmlinkage long sys_ni_syscall(const struct pt_regs *); - +#define __sys_ni_syscall __ia32_sys_ni_syscall #else /* CONFIG_IA32_EMULATION */ #define __SYSCALL_I386(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); +#define __sys_ni_syscall sys_ni_syscall #endif /* CONFIG_IA32_EMULATION */ #include <asm/syscalls_32.h> @@ -29,6 +27,6 @@ __visible const sys_call_ptr_t ia32_sys_call_table[__NR_syscall_compat_max+1] = * Smells like a compiler bug -- it doesn't work * when the & below is removed. */ - [0 ... __NR_syscall_compat_max] = &sys_ni_syscall, + [0 ... __NR_syscall_compat_max] = &__sys_ni_syscall, #include <asm/syscalls_32.h> }; diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index b1bf31713374..adf619a856e8 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -4,11 +4,17 @@ #include <linux/linkage.h> #include <linux/sys.h> #include <linux/cache.h> +#include <linux/syscalls.h> #include <asm/asm-offsets.h> #include <asm/syscall.h> -/* this is a lie, but it does not hurt as sys_ni_syscall just returns -EINVAL */ -extern asmlinkage long sys_ni_syscall(const struct pt_regs *); +extern asmlinkage long sys_ni_syscall(void); + +SYSCALL_DEFINE0(ni_syscall) +{ + return sys_ni_syscall(); +} + #define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(const struct pt_regs *); #define __SYSCALL_X32(nr, sym, qual) __SYSCALL_64(nr, sym, qual) #include <asm/syscalls_64.h> @@ -23,7 +29,7 @@ asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { * Smells like a compiler bug -- it doesn't work * when the & below is removed. */ - [0 ... __NR_syscall_max] = &sys_ni_syscall, + [0 ... __NR_syscall_max] = &__x64_sys_ni_syscall, #include <asm/syscalls_64.h> }; @@ -40,7 +46,7 @@ asmlinkage const sys_call_ptr_t x32_sys_call_table[__NR_syscall_x32_max+1] = { * Smells like a compiler bug -- it doesn't work * when the & below is removed. */ - [0 ... __NR_syscall_x32_max] = &sys_ni_syscall, + [0 ... __NR_syscall_x32_max] = &__x64_sys_ni_syscall, #include <asm/syscalls_64.h> }; diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 3fe02546aed3..15908eb9b17e 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -124,13 +124,13 @@ 110 i386 iopl sys_iopl __ia32_sys_iopl 111 i386 vhangup sys_vhangup __ia32_sys_vhangup 112 i386 idle -113 i386 vm86old sys_vm86old sys_ni_syscall +113 i386 vm86old sys_vm86old __ia32_sys_ni_syscall 114 i386 wait4 sys_wait4 __ia32_compat_sys_wait4 115 i386 swapoff sys_swapoff __ia32_sys_swapoff 116 i386 sysinfo sys_sysinfo __ia32_compat_sys_sysinfo 117 i386 ipc sys_ipc __ia32_compat_sys_ipc 118 i386 fsync sys_fsync __ia32_sys_fsync -119 i386 sigreturn sys_sigreturn sys32_sigreturn +119 i386 sigreturn sys_sigreturn __ia32_compat_sys_sigreturn 120 i386 clone sys_clone __ia32_compat_sys_x86_clone 121 i386 setdomainname sys_setdomainname __ia32_sys_setdomainname 122 i386 uname sys_newuname __ia32_sys_newuname @@ -177,14 +177,14 @@ 163 i386 mremap sys_mremap __ia32_sys_mremap 164 i386 setresuid sys_setresuid16 __ia32_sys_setresuid16 165 i386 getresuid sys_getresuid16 __ia32_sys_getresuid16 -166 i386 vm86 sys_vm86 sys_ni_syscall +166 i386 vm86 sys_vm86 __ia32_sys_ni_syscall 167 i386 query_module 168 i386 poll sys_poll __ia32_sys_poll 169 i386 nfsservctl 170 i386 setresgid sys_setresgid16 __ia32_sys_setresgid16 171 i386 getresgid sys_getresgid16 __ia32_sys_getresgid16 172 i386 prctl sys_prctl __ia32_sys_prctl -173 i386 rt_sigreturn sys_rt_sigreturn sys32_rt_sigreturn +173 i386 rt_sigreturn sys_rt_sigreturn __ia32_compat_sys_rt_sigreturn 174 i386 rt_sigaction sys_rt_sigaction __ia32_compat_sys_rt_sigaction 175 i386 rt_sigprocmask sys_rt_sigprocmask __ia32_compat_sys_rt_sigprocmask 176 i386 rt_sigpending sys_rt_sigpending __ia32_compat_sys_rt_sigpending diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index e7d35f60d53f..64c3e70b0556 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -5,12 +5,14 @@ #include <linux/init.h> #include <linux/slab.h> #include <linux/delay.h> +#include <linux/jiffies.h> #include <asm/apicdef.h> #include <asm/nmi.h> #include "../perf_event.h" -static DEFINE_PER_CPU(unsigned int, perf_nmi_counter); +static DEFINE_PER_CPU(unsigned long, perf_nmi_tstamp); +static unsigned long perf_nmi_window; static __initconst const u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] @@ -641,11 +643,12 @@ static void amd_pmu_disable_event(struct perf_event *event) * handler when multiple PMCs are active or PMC overflow while handling some * other source of an NMI. * - * Attempt to mitigate this by using the number of active PMCs to determine - * whether to return NMI_HANDLED if the perf NMI handler did not handle/reset - * any PMCs. The per-CPU perf_nmi_counter variable is set to a minimum of the - * number of active PMCs or 2. The value of 2 is used in case an NMI does not - * arrive at the LAPIC in time to be collapsed into an already pending NMI. + * Attempt to mitigate this by creating an NMI window in which un-handled NMIs + * received during this window will be claimed. This prevents extending the + * window past when it is possible that latent NMIs should be received. The + * per-CPU perf_nmi_tstamp will be set to the window end time whenever perf has + * handled a counter. When an un-handled NMI is received, it will be claimed + * only if arriving within that window. */ static int amd_pmu_handle_irq(struct pt_regs *regs) { @@ -663,21 +666,19 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) handled = x86_pmu_handle_irq(regs); /* - * If a counter was handled, record the number of possible remaining - * NMIs that can occur. + * If a counter was handled, record a timestamp such that un-handled + * NMIs will be claimed if arriving within that window. */ if (handled) { - this_cpu_write(perf_nmi_counter, - min_t(unsigned int, 2, active)); + this_cpu_write(perf_nmi_tstamp, + jiffies + perf_nmi_window); return handled; } - if (!this_cpu_read(perf_nmi_counter)) + if (time_after(jiffies, this_cpu_read(perf_nmi_tstamp))) return NMI_DONE; - this_cpu_dec(perf_nmi_counter); - return NMI_HANDLED; } @@ -909,6 +910,9 @@ static int __init amd_core_pmu_init(void) if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) return 0; + /* Avoid calulating the value each time in the NMI handler */ + perf_nmi_window = msecs_to_jiffies(100); + switch (boot_cpu_data.x86) { case 0x15: pr_cont("Fam15h "); diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 5b35b7ea5d72..26c36357c4c9 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -377,7 +377,8 @@ static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs, struct hw_perf_event *hwc, u64 config) { config &= ~perf_ibs->cnt_mask; - wrmsrl(hwc->config_base, config); + if (boot_cpu_data.x86 == 0x10) + wrmsrl(hwc->config_base, config); config &= ~perf_ibs->enable_mask; wrmsrl(hwc->config_base, config); } @@ -553,7 +554,8 @@ static struct perf_ibs perf_ibs_op = { }, .msr = MSR_AMD64_IBSOPCTL, .config_mask = IBS_OP_CONFIG_MASK, - .cnt_mask = IBS_OP_MAX_CNT, + .cnt_mask = IBS_OP_MAX_CNT | IBS_OP_CUR_CNT | + IBS_OP_CUR_CNT_RAND, .enable_mask = IBS_OP_ENABLE, .valid_mask = IBS_OP_VAL, .max_period = IBS_OP_MAX_CNT << 4, @@ -614,7 +616,7 @@ fail: if (event->attr.sample_type & PERF_SAMPLE_RAW) offset_max = perf_ibs->offset_max; else if (check_rip) - offset_max = 2; + offset_max = 3; else offset_max = 1; do { diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 27ee47a7be66..937363b803c1 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3323,8 +3323,19 @@ static int intel_pmu_hw_config(struct perf_event *event) return 0; } +#ifdef CONFIG_RETPOLINE +static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr); +static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr); +#endif + struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr) { +#ifdef CONFIG_RETPOLINE + if (x86_pmu.guest_get_msrs == intel_guest_get_msrs) + return intel_guest_get_msrs(nr); + else if (x86_pmu.guest_get_msrs == core_guest_get_msrs) + return core_guest_get_msrs(nr); +#endif if (x86_pmu.guest_get_msrs) return x86_pmu.guest_get_msrs(nr); *nr = 0; @@ -4983,6 +4994,8 @@ __init int intel_pmu_init(void) case INTEL_FAM6_SKYLAKE: case INTEL_FAM6_KABYLAKE_L: case INTEL_FAM6_KABYLAKE: + case INTEL_FAM6_COMETLAKE_L: + case INTEL_FAM6_COMETLAKE: x86_add_quirk(intel_pebs_isolation_quirk); x86_pmu.late_ack = true; memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -5031,6 +5044,8 @@ __init int intel_pmu_init(void) /* fall through */ case INTEL_FAM6_ICELAKE_L: case INTEL_FAM6_ICELAKE: + case INTEL_FAM6_TIGERLAKE_L: + case INTEL_FAM6_TIGERLAKE: x86_pmu.late_ack = true; memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 9f2f39003d96..e1daf4151e11 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -45,46 +45,49 @@ * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter * perf code: 0x01 * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,GLM, - CNL + * CNL,KBL,CML * Scope: Core * MSR_CORE_C6_RESIDENCY: CORE C6 Residency Counter * perf code: 0x02 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, - * SKL,KNL,GLM,CNL + * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL * Scope: Core * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter * perf code: 0x03 - * Available model: SNB,IVB,HSW,BDW,SKL,CNL + * Available model: SNB,IVB,HSW,BDW,SKL,CNL,KBL,CML, + * ICL,TGL * Scope: Core * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter. * perf code: 0x00 - * Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL + * Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL, + * KBL,CML,ICL,TGL * Scope: Package (physical package) * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. * perf code: 0x01 * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL, - * GLM,CNL + * GLM,CNL,KBL,CML,ICL,TGL * Scope: Package (physical package) * MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter. * perf code: 0x02 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW - * SKL,KNL,GLM,CNL + * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL * Scope: Package (physical package) * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. * perf code: 0x03 - * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,CNL + * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,CNL, + * KBL,CML,ICL,TGL * Scope: Package (physical package) * MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter. * perf code: 0x04 - * Available model: HSW ULT,KBL,CNL + * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL * Scope: Package (physical package) * MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter. * perf code: 0x05 - * Available model: HSW ULT,KBL,CNL + * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL * Scope: Package (physical package) * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter. * perf code: 0x06 - * Available model: HSW ULT,KBL,GLM,CNL + * Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL * Scope: Package (physical package) * */ @@ -544,6 +547,19 @@ static const struct cstate_model cnl_cstates __initconst = { BIT(PERF_CSTATE_PKG_C10_RES), }; +static const struct cstate_model icl_cstates __initconst = { + .core_events = BIT(PERF_CSTATE_CORE_C6_RES) | + BIT(PERF_CSTATE_CORE_C7_RES), + + .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) | + BIT(PERF_CSTATE_PKG_C3_RES) | + BIT(PERF_CSTATE_PKG_C6_RES) | + BIT(PERF_CSTATE_PKG_C7_RES) | + BIT(PERF_CSTATE_PKG_C8_RES) | + BIT(PERF_CSTATE_PKG_C9_RES) | + BIT(PERF_CSTATE_PKG_C10_RES), +}; + static const struct cstate_model slm_cstates __initconst = { .core_events = BIT(PERF_CSTATE_CORE_C1_RES) | BIT(PERF_CSTATE_CORE_C6_RES), @@ -614,6 +630,8 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_L, hswult_cstates), X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE, hswult_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_COMETLAKE_L, hswult_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_COMETLAKE, hswult_cstates), X86_CSTATES_MODEL(INTEL_FAM6_CANNONLAKE_L, cnl_cstates), @@ -625,8 +643,10 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_PLUS, glm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE_L, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE_L, icl_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE, icl_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_TIGERLAKE_L, icl_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_TIGERLAKE, icl_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 74e80ed9c6c4..05e43d0f430b 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -627,7 +627,7 @@ static struct topa *topa_alloc(int cpu, gfp_t gfp) * link as the 2nd entry in the table */ if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) { - TOPA_ENTRY(&tp->topa, 1)->base = page_to_phys(p); + TOPA_ENTRY(&tp->topa, 1)->base = page_to_phys(p) >> TOPA_SHIFT; TOPA_ENTRY(&tp->topa, 1)->end = 1; } diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 6fc2e06ab4c6..86467f85c383 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -502,10 +502,8 @@ void uncore_pmu_event_start(struct perf_event *event, int flags) local64_set(&event->hw.prev_count, uncore_read_counter(box, event)); uncore_enable_event(box, event); - if (box->n_active == 1) { - uncore_enable_box(box); + if (box->n_active == 1) uncore_pmu_start_hrtimer(box); - } } void uncore_pmu_event_stop(struct perf_event *event, int flags) @@ -529,10 +527,8 @@ void uncore_pmu_event_stop(struct perf_event *event, int flags) WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); hwc->state |= PERF_HES_STOPPED; - if (box->n_active == 0) { - uncore_disable_box(box); + if (box->n_active == 0) uncore_pmu_cancel_hrtimer(box); - } } if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { @@ -778,6 +774,40 @@ static int uncore_pmu_event_init(struct perf_event *event) return ret; } +static void uncore_pmu_enable(struct pmu *pmu) +{ + struct intel_uncore_pmu *uncore_pmu; + struct intel_uncore_box *box; + + uncore_pmu = container_of(pmu, struct intel_uncore_pmu, pmu); + if (!uncore_pmu) + return; + + box = uncore_pmu_to_box(uncore_pmu, smp_processor_id()); + if (!box) + return; + + if (uncore_pmu->type->ops->enable_box) + uncore_pmu->type->ops->enable_box(box); +} + +static void uncore_pmu_disable(struct pmu *pmu) +{ + struct intel_uncore_pmu *uncore_pmu; + struct intel_uncore_box *box; + + uncore_pmu = container_of(pmu, struct intel_uncore_pmu, pmu); + if (!uncore_pmu) + return; + + box = uncore_pmu_to_box(uncore_pmu, smp_processor_id()); + if (!box) + return; + + if (uncore_pmu->type->ops->disable_box) + uncore_pmu->type->ops->disable_box(box); +} + static ssize_t uncore_get_attr_cpumask(struct device *dev, struct device_attribute *attr, char *buf) { @@ -803,6 +833,8 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu) pmu->pmu = (struct pmu) { .attr_groups = pmu->type->attr_groups, .task_ctx_nr = perf_invalid_context, + .pmu_enable = uncore_pmu_enable, + .pmu_disable = uncore_pmu_disable, .event_init = uncore_pmu_event_init, .add = uncore_pmu_event_add, .del = uncore_pmu_event_del, diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index f36f7bebbc1b..bbfdaa720b45 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -441,18 +441,6 @@ static inline int uncore_freerunning_hw_config(struct intel_uncore_box *box, return -EINVAL; } -static inline void uncore_disable_box(struct intel_uncore_box *box) -{ - if (box->pmu->type->ops->disable_box) - box->pmu->type->ops->disable_box(box); -} - -static inline void uncore_enable_box(struct intel_uncore_box *box) -{ - if (box->pmu->type->ops->enable_box) - box->pmu->type->ops->enable_box(box); -} - static inline void uncore_disable_event(struct intel_uncore_box *box, struct perf_event *event) { diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index b1afc77f0704..6f86650b3f77 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -89,7 +89,14 @@ static bool test_intel(int idx, void *data) case INTEL_FAM6_SKYLAKE_X: case INTEL_FAM6_KABYLAKE_L: case INTEL_FAM6_KABYLAKE: + case INTEL_FAM6_COMETLAKE_L: + case INTEL_FAM6_COMETLAKE: case INTEL_FAM6_ICELAKE_L: + case INTEL_FAM6_ICELAKE: + case INTEL_FAM6_ICELAKE_X: + case INTEL_FAM6_ICELAKE_D: + case INTEL_FAM6_TIGERLAKE_L: + case INTEL_FAM6_TIGERLAKE: if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF) return true; break; diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index 5c056b8aebef..40e0e322161d 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -194,10 +194,20 @@ do_ex_hypercall: static bool __send_ipi_one(int cpu, int vector) { - struct cpumask mask = CPU_MASK_NONE; + int vp = hv_cpu_number_to_vp_number(cpu); - cpumask_set_cpu(cpu, &mask); - return __send_ipi_mask(&mask, vector); + trace_hyperv_send_ipi_one(cpu, vector); + + if (!hv_hypercall_pg || (vp == VP_INVAL)) + return false; + + if ((vector < HV_IPI_LOW_VECTOR) || (vector > HV_IPI_HIGH_VECTOR)) + return false; + + if (vp >= 64) + return __send_ipi_mask_ex(cpumask_of(cpu), vector); + + return !hv_do_fast_hypercall16(HVCALL_SEND_IPI, vector, BIT_ULL(vp)); } static void hv_send_ipi(int cpu, int vector) @@ -260,11 +270,21 @@ void __init hv_apic_init(void) } if (ms_hyperv.hints & HV_X64_APIC_ACCESS_RECOMMENDED) { - pr_info("Hyper-V: Using MSR based APIC access\n"); + pr_info("Hyper-V: Using enlightened APIC (%s mode)", + x2apic_enabled() ? "x2apic" : "xapic"); + /* + * With x2apic, architectural x2apic MSRs are equivalent to the + * respective synthetic MSRs, so there's no need to override + * the apic accessors. The only exception is + * hv_apic_eoi_write, because it benefits from lazy EOI when + * available, but it works for both xapic and x2apic modes. + */ apic_set_eoi_write(hv_apic_eoi_write); - apic->read = hv_apic_read; - apic->write = hv_apic_write; - apic->icr_write = hv_apic_icr_write; - apic->icr_read = hv_apic_icr_read; + if (!x2apic_enabled()) { + apic->read = hv_apic_read; + apic->write = hv_apic_write; + apic->icr_write = hv_apic_icr_write; + apic->icr_read = hv_apic_icr_read; + } } } diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 2db3972c0e0f..50ff030d9224 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -311,6 +311,12 @@ void __init hyperv_init(void) hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg); wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + /* + * Ignore any errors in setting up stimer clockevents + * as we can run with the LAPIC timer as a fallback. + */ + (void)hv_stimer_alloc(); + hv_apic_init(); x86_init.pci.arch_init = hv_pci_init; diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 1cee10091b9f..30416d7f19d4 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -21,6 +21,7 @@ #include <linux/personality.h> #include <linux/compat.h> #include <linux/binfmts.h> +#include <linux/syscalls.h> #include <asm/ucontext.h> #include <linux/uaccess.h> #include <asm/fpu/internal.h> @@ -118,7 +119,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, return err; } -asmlinkage long sys32_sigreturn(void) +COMPAT_SYSCALL_DEFINE0(sigreturn) { struct pt_regs *regs = current_pt_regs(); struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8); @@ -144,7 +145,7 @@ badframe: return 0; } -asmlinkage long sys32_rt_sigreturn(void) +COMPAT_SYSCALL_DEFINE0(rt_sigreturn) { struct pt_regs *regs = current_pt_regs(); struct rt_sigframe_ia32 __user *frame; diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h deleted file mode 100644 index facd374a1bf7..000000000000 --- a/arch/x86/include/asm/calgary.h +++ /dev/null @@ -1,57 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Derived from include/asm-powerpc/iommu.h - * - * Copyright IBM Corporation, 2006-2007 - * - * Author: Jon Mason <jdmason@us.ibm.com> - * Author: Muli Ben-Yehuda <muli@il.ibm.com> - */ - -#ifndef _ASM_X86_CALGARY_H -#define _ASM_X86_CALGARY_H - -#include <linux/spinlock.h> -#include <linux/device.h> -#include <linux/dma-mapping.h> -#include <linux/timer.h> -#include <asm/types.h> - -struct iommu_table { - const struct cal_chipset_ops *chip_ops; /* chipset specific funcs */ - unsigned long it_base; /* mapped address of tce table */ - unsigned long it_hint; /* Hint for next alloc */ - unsigned long *it_map; /* A simple allocation bitmap for now */ - void __iomem *bbar; /* Bridge BAR */ - u64 tar_val; /* Table Address Register */ - struct timer_list watchdog_timer; - spinlock_t it_lock; /* Protects it_map */ - unsigned int it_size; /* Size of iommu table in entries */ - unsigned char it_busno; /* Bus number this table belongs to */ -}; - -struct cal_chipset_ops { - void (*handle_quirks)(struct iommu_table *tbl, struct pci_dev *dev); - void (*tce_cache_blast)(struct iommu_table *tbl); - void (*dump_error_regs)(struct iommu_table *tbl); -}; - -#define TCE_TABLE_SIZE_UNSPECIFIED ~0 -#define TCE_TABLE_SIZE_64K 0 -#define TCE_TABLE_SIZE_128K 1 -#define TCE_TABLE_SIZE_256K 2 -#define TCE_TABLE_SIZE_512K 3 -#define TCE_TABLE_SIZE_1M 4 -#define TCE_TABLE_SIZE_2M 5 -#define TCE_TABLE_SIZE_4M 6 -#define TCE_TABLE_SIZE_8M 7 - -extern int use_calgary; - -#ifdef CONFIG_CALGARY_IOMMU -extern int detect_calgary(void); -#else -static inline int detect_calgary(void) { return -ENODEV; } -#endif - -#endif /* _ASM_X86_CALGARY_H */ diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index cff3f3f3bfe0..ea866c7bf31d 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_CPU_ENTRY_AREA_H #define _ASM_X86_CPU_ENTRY_AREA_H @@ -78,8 +78,12 @@ struct cpu_entry_area { /* * The GDT is just below entry_stack and thus serves (on x86_64) as - * a a read-only guard page. + * a read-only guard page. On 32-bit the GDT must be writeable, so + * it needs an extra guard page. */ +#ifdef CONFIG_X86_32 + char guard_entry_stack[PAGE_SIZE]; +#endif struct entry_stack_page entry_stack_page; /* @@ -94,7 +98,6 @@ struct cpu_entry_area { */ struct cea_exception_stacks estacks; #endif -#ifdef CONFIG_CPU_SUP_INTEL /* * Per CPU debug store for Intel performance monitoring. Wastes a * full page at the moment. @@ -105,11 +108,13 @@ struct cpu_entry_area { * Reserve enough fixmap PTEs. */ struct debug_store_buffers cpu_debug_buffers; -#endif }; -#define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area)) -#define CPU_ENTRY_AREA_TOT_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS) +#define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area)) +#define CPU_ENTRY_AREA_ARRAY_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS) + +/* Total size includes the readonly IDT mapping page as well: */ +#define CPU_ENTRY_AREA_TOTAL_SIZE (CPU_ENTRY_AREA_ARRAY_SIZE + PAGE_SIZE) DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); DECLARE_PER_CPU(struct cea_exception_stacks *, cea_exception_stacks); @@ -117,13 +122,14 @@ DECLARE_PER_CPU(struct cea_exception_stacks *, cea_exception_stacks); extern void setup_cpu_entry_areas(void); extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags); +/* Single page reserved for the readonly IDT mapping: */ #define CPU_ENTRY_AREA_RO_IDT CPU_ENTRY_AREA_BASE #define CPU_ENTRY_AREA_PER_CPU (CPU_ENTRY_AREA_RO_IDT + PAGE_SIZE) #define CPU_ENTRY_AREA_RO_IDT_VADDR ((void *)CPU_ENTRY_AREA_RO_IDT) #define CPU_ENTRY_AREA_MAP_SIZE \ - (CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_TOT_SIZE - CPU_ENTRY_AREA_BASE) + (CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_ARRAY_SIZE - CPU_ENTRY_AREA_BASE) extern struct cpu_entry_area *get_cpu_entry_area(int cpu); diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 0652d3eed9bd..e9b62498fe75 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -292,6 +292,7 @@ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ #define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ #define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */ +#define X86_FEATURE_RDPRU (13*32+ 4) /* Read processor register at user level */ #define X86_FEATURE_WBNOINVD (13*32+ 9) /* WBNOINVD instruction */ #define X86_FEATURE_AMD_IBPB (13*32+12) /* "" Indirect Branch Prediction Barrier */ #define X86_FEATURE_AMD_IBRS (13*32+14) /* "" Indirect Branch Restricted Speculation */ @@ -399,5 +400,7 @@ #define X86_BUG_MDS X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */ #define X86_BUG_MSBDS_ONLY X86_BUG(20) /* CPU is only affected by the MSDBS variant of BUG_MDS */ #define X86_BUG_SWAPGS X86_BUG(21) /* CPU is affected by speculation through SWAPGS */ +#define X86_BUG_TAA X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */ +#define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/crash.h b/arch/x86/include/asm/crash.h index 0acf5ee45a21..f58de66091e5 100644 --- a/arch/x86/include/asm/crash.h +++ b/arch/x86/include/asm/crash.h @@ -2,10 +2,17 @@ #ifndef _ASM_X86_CRASH_H #define _ASM_X86_CRASH_H +struct kimage; + int crash_load_segments(struct kimage *image); -int crash_copy_backup_region(struct kimage *image); int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params); void crash_smp_send_stop(void); +#ifdef CONFIG_KEXEC_CORE +void __init crash_reserve_low_1M(void); +#else +static inline void __init crash_reserve_low_1M(void) { } +#endif + #endif /* _ASM_X86_CRASH_H */ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index a5ea841cc6d2..8e1d0bb46361 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -22,7 +22,7 @@ # define DISABLE_SMAP (1<<(X86_FEATURE_SMAP & 31)) #endif -#ifdef CONFIG_X86_INTEL_UMIP +#ifdef CONFIG_X86_UMIP # define DISABLE_UMIP 0 #else # define DISABLE_UMIP (1<<(X86_FEATURE_UMIP & 31)) diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 0c47aa82e2e2..28183ee3cc42 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -156,7 +156,7 @@ extern pte_t *kmap_pte; extern pte_t *pkmap_page_table; void __native_set_fixmap(enum fixed_addresses idx, pte_t pte); -void native_set_fixmap(enum fixed_addresses idx, +void native_set_fixmap(unsigned /* enum fixed_addresses */ idx, phys_addr_t phys, pgprot_t flags); #ifndef CONFIG_PARAVIRT_XXL diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 7741e211f7f5..5f10f7f2098d 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -86,6 +86,8 @@ #define HV_X64_ACCESS_FREQUENCY_MSRS BIT(11) /* AccessReenlightenmentControls privilege */ #define HV_X64_ACCESS_REENLIGHTENMENT BIT(13) +/* AccessTscInvariantControls privilege */ +#define HV_X64_ACCESS_TSC_INVARIANT BIT(15) /* * Feature identification: indicates which flags were specified at partition @@ -278,6 +280,9 @@ #define HV_X64_MSR_TSC_EMULATION_CONTROL 0x40000107 #define HV_X64_MSR_TSC_EMULATION_STATUS 0x40000108 +/* TSC invariant control */ +#define HV_X64_MSR_TSC_INVARIANT_CONTROL 0x40000118 + /* * Declare the MSR used to setup pages used to communicate with the hypervisor. */ diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h index f04622500da3..c606c0b70738 100644 --- a/arch/x86/include/asm/intel-family.h +++ b/arch/x86/include/asm/intel-family.h @@ -83,6 +83,9 @@ #define INTEL_FAM6_TIGERLAKE_L 0x8C #define INTEL_FAM6_TIGERLAKE 0x8D +#define INTEL_FAM6_COMETLAKE 0xA5 +#define INTEL_FAM6_COMETLAKE_L 0xA6 + /* "Small Core" Processors (Atom) */ #define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 5e7d6b46de97..6802c59e8252 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -66,10 +66,6 @@ struct kimage; # define KEXEC_ARCH KEXEC_ARCH_X86_64 #endif -/* Memory to backup during crash kdump */ -#define KEXEC_BACKUP_SRC_START (0UL) -#define KEXEC_BACKUP_SRC_END (640 * 1024UL - 1) /* 640K */ - /* * This function is responsible for capturing register states if coming * via panic otherwise just fix up the ss and sp if coming via kernel @@ -154,12 +150,6 @@ struct kimage_arch { pud_t *pud; pmd_t *pmd; pte_t *pte; - /* Details of backup region */ - unsigned long backup_src_start; - unsigned long backup_src_sz; - - /* Physical address of backup segment */ - unsigned long backup_load_addr; /* Core ELF header buffer */ void *elf_headers; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 23edf56cf577..b79cd6aa4075 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -156,10 +156,8 @@ enum kvm_reg { VCPU_REGS_R15 = __VCPU_REGS_R15, #endif VCPU_REGS_RIP, - NR_VCPU_REGS -}; + NR_VCPU_REGS, -enum kvm_reg_ex { VCPU_EXREG_PDPTR = NR_VCPU_REGS, VCPU_EXREG_CR3, VCPU_EXREG_RFLAGS, @@ -219,13 +217,6 @@ enum { PFERR_WRITE_MASK | \ PFERR_PRESENT_MASK) -/* - * The mask used to denote special SPTEs, which can be either MMIO SPTEs or - * Access Tracking SPTEs. We use bit 62 instead of bit 63 to avoid conflicting - * with the SVE bit in EPT PTEs. - */ -#define SPTE_SPECIAL_MASK (1ULL << 62) - /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 /* @@ -319,9 +310,12 @@ struct kvm_rmap_head { struct kvm_mmu_page { struct list_head link; struct hlist_node hash_link; + struct list_head lpage_disallowed_link; + bool unsync; u8 mmu_valid_gen; bool mmio_cached; + bool lpage_disallowed; /* Can't be replaced by an equiv large page */ /* * The following two entries are used to key the shadow page in the @@ -458,6 +452,11 @@ struct kvm_pmc { u64 eventsel; struct perf_event *perf_event; struct kvm_vcpu *vcpu; + /* + * eventsel value for general purpose counters, + * ctrl value for fixed counters. + */ + u64 current_config; }; struct kvm_pmu { @@ -476,7 +475,21 @@ struct kvm_pmu { struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC]; struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED]; struct irq_work irq_work; - u64 reprogram_pmi; + DECLARE_BITMAP(reprogram_pmi, X86_PMC_IDX_MAX); + DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX); + DECLARE_BITMAP(pmc_in_use, X86_PMC_IDX_MAX); + + /* + * The gate to release perf_events not marked in + * pmc_in_use only once in a vcpu time slice. + */ + bool need_cleanup; + + /* + * The total number of programmed perf_events and it helps to avoid + * redundant check before cleanup if guest don't use vPMU at all. + */ + u8 event_count; }; struct kvm_pmu_ops; @@ -569,6 +582,7 @@ struct kvm_vcpu_arch { u64 smbase; u64 smi_count; bool tpr_access_reporting; + bool xsaves_enabled; u64 ia32_xss; u64 microcode_version; u64 arch_capabilities; @@ -866,6 +880,7 @@ struct kvm_arch { */ struct list_head active_mmu_pages; struct list_head zapped_obsolete_pages; + struct list_head lpage_disallowed_mmu_pages; struct kvm_page_track_notifier_node mmu_sp_tracker; struct kvm_page_track_notifier_head track_notifier_head; @@ -940,6 +955,7 @@ struct kvm_arch { bool exception_payload_enabled; struct kvm_pmu_event_filter *pmu_event_filter; + struct task_struct *nx_lpage_recovery_thread; }; struct kvm_vm_stat { @@ -953,6 +969,7 @@ struct kvm_vm_stat { ulong mmu_unsync; ulong remote_tlb_flush; ulong lpages; + ulong nx_lpage_splits; ulong max_mmu_page_hash_collisions; }; @@ -1042,7 +1059,6 @@ struct kvm_x86_ops { struct kvm_segment *var, int seg); void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu); - void (*decache_cr3)(struct kvm_vcpu *vcpu); void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); @@ -1091,7 +1107,7 @@ struct kvm_x86_ops { void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); - bool (*get_enable_apicv)(struct kvm_vcpu *vcpu); + bool (*get_enable_apicv)(struct kvm *kvm); void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); @@ -1196,7 +1212,7 @@ struct kvm_x86_ops { int (*set_nested_state)(struct kvm_vcpu *vcpu, struct kvm_nested_state __user *user_kvm_nested_state, struct kvm_nested_state *kvm_state); - void (*get_vmcs12_pages)(struct kvm_vcpu *vcpu); + bool (*get_vmcs12_pages)(struct kvm_vcpu *vcpu); int (*smi_allowed)(struct kvm_vcpu *vcpu); int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate); @@ -1358,6 +1374,7 @@ int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, void kvm_enable_efer_bits(u64); bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); +int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated); int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data); int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data); int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu); @@ -1578,6 +1595,8 @@ bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); void kvm_make_mclock_inprogress_request(struct kvm *kvm); void kvm_make_scan_ioapic_request(struct kvm *kvm); +void kvm_make_scan_ioapic_request_mask(struct kvm *kvm, + unsigned long *vcpu_bitmap); void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index 7948a17febb4..c215d2762488 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -15,6 +15,8 @@ struct mod_arch_specific { #ifdef CONFIG_X86_64 /* X86_64 does not define MODULE_PROC_FAMILY */ +#elif defined CONFIG_M486SX +#define MODULE_PROC_FAMILY "486SX " #elif defined CONFIG_M486 #define MODULE_PROC_FAMILY "486 " #elif defined CONFIG_M586 diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 20ce682a2540..084e98da04a7 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -93,6 +93,18 @@ * Microarchitectural Data * Sampling (MDS) vulnerabilities. */ +#define ARCH_CAP_PSCHANGE_MC_NO BIT(6) /* + * The processor is not susceptible to a + * machine check error due to modifying the + * code page size along with either the + * physical address or cache type + * without TLB invalidation. + */ +#define ARCH_CAP_TSX_CTRL_MSR BIT(7) /* MSR for TSX control is available. */ +#define ARCH_CAP_TAA_NO BIT(8) /* + * Not susceptible to + * TSX Async Abort (TAA) vulnerabilities. + */ #define MSR_IA32_FLUSH_CMD 0x0000010b #define L1D_FLUSH BIT(0) /* @@ -103,6 +115,10 @@ #define MSR_IA32_BBL_CR_CTL 0x00000119 #define MSR_IA32_BBL_CR_CTL3 0x0000011e +#define MSR_IA32_TSX_CTRL 0x00000122 +#define TSX_CTRL_RTM_DISABLE BIT(0) /* Disable RTM feature */ +#define TSX_CTRL_CPUID_CLEAR BIT(1) /* Disable TSX enumeration */ + #define MSR_IA32_SYSENTER_CS 0x00000174 #define MSR_IA32_SYSENTER_ESP 0x00000175 #define MSR_IA32_SYSENTER_EIP 0x00000176 @@ -393,6 +409,8 @@ #define MSR_AMD_PSTATE_DEF_BASE 0xc0010064 #define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 #define MSR_AMD64_OSVW_STATUS 0xc0010141 +#define MSR_AMD_PPIN_CTL 0xc00102f0 +#define MSR_AMD_PPIN 0xc00102f1 #define MSR_AMD64_LS_CFG 0xc0011020 #define MSR_AMD64_DC_CFG 0xc0011022 #define MSR_AMD64_BU_CFG2 0xc001102a diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index e28f8b723b5c..9d5252c9685c 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -21,7 +21,7 @@ #define MWAIT_ECX_INTERRUPT_BREAK 0x1 #define MWAITX_ECX_TIMER_ENABLE BIT(1) #define MWAITX_MAX_LOOPS ((u32)-1) -#define MWAITX_DISABLE_CSTATES 0xf +#define MWAITX_DISABLE_CSTATES 0xf0 static inline void __monitor(const void *eax, unsigned long ecx, unsigned long edx) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 80bc209c0708..5c24a7b35166 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -314,7 +314,7 @@ DECLARE_STATIC_KEY_FALSE(mds_idle_clear); #include <asm/segment.h> /** - * mds_clear_cpu_buffers - Mitigation for MDS vulnerability + * mds_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability * * This uses the otherwise unused and obsolete VERW instruction in * combination with microcode which triggers a CPU buffer flush when the @@ -337,7 +337,7 @@ static inline void mds_clear_cpu_buffers(void) } /** - * mds_user_clear_cpu_buffers - Mitigation for MDS vulnerability + * mds_user_clear_cpu_buffers - Mitigation for MDS and TAA vulnerability * * Clear CPU buffers if the corresponding static key is enabled */ diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index e662f987dfa2..90d0731fdcb6 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -12,8 +12,6 @@ #include <asm/pat.h> #include <asm/x86_init.h> -#ifdef __KERNEL__ - struct pci_sysdata { int domain; /* PCI domain */ int node; /* NUMA node */ @@ -118,11 +116,6 @@ void native_restore_msi_irqs(struct pci_dev *dev); #define native_setup_msi_irqs NULL #define native_teardown_msi_irq NULL #endif -#endif /* __KERNEL__ */ - -#ifdef CONFIG_X86_64 -#include <asm/pci_64.h> -#endif /* generic pci stuff */ #include <asm-generic/pci.h> diff --git a/arch/x86/include/asm/pci_64.h b/arch/x86/include/asm/pci_64.h deleted file mode 100644 index f5411de0ae11..000000000000 --- a/arch/x86/include/asm/pci_64.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_PCI_64_H -#define _ASM_X86_PCI_64_H - -#ifdef __KERNEL__ - -#ifdef CONFIG_CALGARY_IOMMU -static inline void *pci_iommu(struct pci_bus *bus) -{ - struct pci_sysdata *sd = bus->sysdata; - return sd->iommu; -} - -static inline void set_pci_iommu(struct pci_bus *bus, void *val) -{ - struct pci_sysdata *sd = bus->sysdata; - sd->iommu = val; -} -#endif /* CONFIG_CALGARY_IOMMU */ - -extern int (*pci_config_read)(int seg, int bus, int dev, int fn, - int reg, int len, u32 *value); -extern int (*pci_config_write)(int seg, int bus, int dev, int fn, - int reg, int len, u32 value); - -#endif /* __KERNEL__ */ - -#endif /* _ASM_X86_PCI_64_H */ diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index e3633795fb22..5afb5e0fe903 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -36,39 +36,41 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte) #define pmd_read_atomic pmd_read_atomic /* - * pte_offset_map_lock on 32bit PAE kernels was reading the pmd_t with - * a "*pmdp" dereference done by gcc. Problem is, in certain places - * where pte_offset_map_lock is called, concurrent page faults are + * pte_offset_map_lock() on 32-bit PAE kernels was reading the pmd_t with + * a "*pmdp" dereference done by GCC. Problem is, in certain places + * where pte_offset_map_lock() is called, concurrent page faults are * allowed, if the mmap_sem is hold for reading. An example is mincore * vs page faults vs MADV_DONTNEED. On the page fault side - * pmd_populate rightfully does a set_64bit, but if we're reading the + * pmd_populate() rightfully does a set_64bit(), but if we're reading the * pmd_t with a "*pmdp" on the mincore side, a SMP race can happen - * because gcc will not read the 64bit of the pmd atomically. To fix - * this all places running pmd_offset_map_lock() while holding the + * because GCC will not read the 64-bit value of the pmd atomically. + * + * To fix this all places running pte_offset_map_lock() while holding the * mmap_sem in read mode, shall read the pmdp pointer using this - * function to know if the pmd is null nor not, and in turn to know if - * they can run pmd_offset_map_lock or pmd_trans_huge or other pmd + * function to know if the pmd is null or not, and in turn to know if + * they can run pte_offset_map_lock() or pmd_trans_huge() or other pmd * operations. * - * Without THP if the mmap_sem is hold for reading, the pmd can only - * transition from null to not null while pmd_read_atomic runs. So + * Without THP if the mmap_sem is held for reading, the pmd can only + * transition from null to not null while pmd_read_atomic() runs. So * we can always return atomic pmd values with this function. * - * With THP if the mmap_sem is hold for reading, the pmd can become + * With THP if the mmap_sem is held for reading, the pmd can become * trans_huge or none or point to a pte (and in turn become "stable") - * at any time under pmd_read_atomic. We could read it really - * atomically here with a atomic64_read for the THP enabled case (and + * at any time under pmd_read_atomic(). We could read it truly + * atomically here with an atomic64_read() for the THP enabled case (and * it would be a whole lot simpler), but to avoid using cmpxchg8b we * only return an atomic pmdval if the low part of the pmdval is later - * found stable (i.e. pointing to a pte). And we're returning a none - * pmdval if the low part of the pmd is none. In some cases the high - * and low part of the pmdval returned may not be consistent if THP is - * enabled (the low part may point to previously mapped hugepage, - * while the high part may point to a more recently mapped hugepage), - * but pmd_none_or_trans_huge_or_clear_bad() only needs the low part - * of the pmd to be read atomically to decide if the pmd is unstable - * or not, with the only exception of when the low part of the pmd is - * zero in which case we return a none pmd. + * found to be stable (i.e. pointing to a pte). We are also returning a + * 'none' (zero) pmdval if the low part of the pmd is zero. + * + * In some cases the high and low part of the pmdval returned may not be + * consistent if THP is enabled (the low part may point to previously + * mapped hugepage, while the high part may point to a more recently + * mapped hugepage), but pmd_none_or_trans_huge_or_clear_bad() only + * needs the low part of the pmd to be read atomically to decide if the + * pmd is unstable or not, with the only exception when the low part + * of the pmd is zero, in which case we return a 'none' pmd. */ static inline pmd_t pmd_read_atomic(pmd_t *pmdp) { diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 0bc530c4eb13..ad97dc155195 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1463,6 +1463,12 @@ static inline bool arch_has_pfn_modify_check(void) return boot_cpu_has_bug(X86_BUG_L1TF); } +#define arch_faults_on_old_pte arch_faults_on_old_pte +static inline bool arch_faults_on_old_pte(void) +{ + return false; +} + #include <asm-generic/pgtable.h> #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h index b0bc0fff5f1f..1636eb8e5a5b 100644 --- a/arch/x86/include/asm/pgtable_32_types.h +++ b/arch/x86/include/asm/pgtable_32_types.h @@ -44,11 +44,11 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */ * Define this here and validate with BUILD_BUG_ON() in pgtable_32.c * to avoid include recursion hell */ -#define CPU_ENTRY_AREA_PAGES (NR_CPUS * 40) +#define CPU_ENTRY_AREA_PAGES (NR_CPUS * 39) -#define CPU_ENTRY_AREA_BASE \ - ((FIXADDR_TOT_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) \ - & PMD_MASK) +/* The +1 is for the readonly IDT page: */ +#define CPU_ENTRY_AREA_BASE \ + ((FIXADDR_TOT_START - PAGE_SIZE*(CPU_ENTRY_AREA_PAGES+1)) & PMD_MASK) #define LDT_BASE_ADDR \ ((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 790f250d39a8..94227da69da1 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -93,7 +93,15 @@ struct cpuinfo_x86 { __u32 extended_cpuid_level; /* Maximum supported CPUID level, -1=no CPUID: */ int cpuid_level; - __u32 x86_capability[NCAPINTS + NBUGINTS]; + /* + * Align to size of unsigned long because the x86_capability array + * is passed to bitops which require the alignment. Use unnamed + * union to enforce the array is aligned to size of unsigned long. + */ + union { + __u32 x86_capability[NCAPINTS + NBUGINTS]; + unsigned long x86_capability_alignment; + }; char x86_vendor_id[16]; char x86_model_id[64]; /* in KB - valid for CPUS which support this call: */ @@ -988,4 +996,11 @@ enum mds_mitigations { MDS_MITIGATION_VMWERV, }; +enum taa_mitigations { + TAA_MITIGATION_OFF, + TAA_MITIGATION_UCODE_NEEDED, + TAA_MITIGATION_VERW, + TAA_MITIGATION_TSX_DISABLED, +}; + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h index 5df09a0b80b8..07375b476c4f 100644 --- a/arch/x86/include/asm/pti.h +++ b/arch/x86/include/asm/pti.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PTI_H #define _ASM_X86_PTI_H #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/purgatory.h b/arch/x86/include/asm/purgatory.h index 92c34e517da1..5528e9325049 100644 --- a/arch/x86/include/asm/purgatory.h +++ b/arch/x86/include/asm/purgatory.h @@ -6,16 +6,6 @@ #include <linux/purgatory.h> extern void purgatory(void); -/* - * These forward declarations serve two purposes: - * - * 1) Make sparse happy when checking arch/purgatory - * 2) Document that these are required to be global so the symbol - * lookup in kexec works - */ -extern unsigned long purgatory_backup_dest; -extern unsigned long purgatory_backup_src; -extern unsigned long purgatory_backup_sz; #endif /* __ASSEMBLY__ */ #endif /* _ASM_PURGATORY_H */ diff --git a/arch/x86/include/asm/rio.h b/arch/x86/include/asm/rio.h deleted file mode 100644 index 0a21986d2238..000000000000 --- a/arch/x86/include/asm/rio.h +++ /dev/null @@ -1,64 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Derived from include/asm-x86/mach-summit/mach_mpparse.h - * and include/asm-x86/mach-default/bios_ebda.h - * - * Author: Laurent Vivier <Laurent.Vivier@bull.net> - */ - -#ifndef _ASM_X86_RIO_H -#define _ASM_X86_RIO_H - -#define RIO_TABLE_VERSION 3 - -struct rio_table_hdr { - u8 version; /* Version number of this data structure */ - u8 num_scal_dev; /* # of Scalability devices */ - u8 num_rio_dev; /* # of RIO I/O devices */ -} __attribute__((packed)); - -struct scal_detail { - u8 node_id; /* Scalability Node ID */ - u32 CBAR; /* Address of 1MB register space */ - u8 port0node; /* Node ID port connected to: 0xFF=None */ - u8 port0port; /* Port num port connected to: 0,1,2, or */ - /* 0xFF=None */ - u8 port1node; /* Node ID port connected to: 0xFF = None */ - u8 port1port; /* Port num port connected to: 0,1,2, or */ - /* 0xFF=None */ - u8 port2node; /* Node ID port connected to: 0xFF = None */ - u8 port2port; /* Port num port connected to: 0,1,2, or */ - /* 0xFF=None */ - u8 chassis_num; /* 1 based Chassis number (1 = boot node) */ -} __attribute__((packed)); - -struct rio_detail { - u8 node_id; /* RIO Node ID */ - u32 BBAR; /* Address of 1MB register space */ - u8 type; /* Type of device */ - u8 owner_id; /* Node ID of Hurricane that owns this */ - /* node */ - u8 port0node; /* Node ID port connected to: 0xFF=None */ - u8 port0port; /* Port num port connected to: 0,1,2, or */ - /* 0xFF=None */ - u8 port1node; /* Node ID port connected to: 0xFF=None */ - u8 port1port; /* Port num port connected to: 0,1,2, or */ - /* 0xFF=None */ - u8 first_slot; /* Lowest slot number below this Calgary */ - u8 status; /* Bit 0 = 1 : the XAPIC is used */ - /* = 0 : the XAPIC is not used, ie: */ - /* ints fwded to another XAPIC */ - /* Bits1:7 Reserved */ - u8 WP_index; /* instance index - lower ones have */ - /* lower slot numbers/PCI bus numbers */ - u8 chassis_num; /* 1 based Chassis number */ -} __attribute__((packed)); - -enum { - HURR_SCALABILTY = 0, /* Hurricane Scalability info */ - HURR_RIOIB = 2, /* Hurricane RIOIB info */ - COMPAT_CALGARY = 4, /* Compatibility Calgary */ - ALT_CALGARY = 5, /* Second Planar Calgary */ -}; - -#endif /* _ASM_X86_RIO_H */ diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index ac3892920419..6669164abadc 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -31,6 +31,18 @@ */ #define SEGMENT_RPL_MASK 0x3 +/* + * When running on Xen PV, the actual privilege level of the kernel is 1, + * not 0. Testing the Requested Privilege Level in a segment selector to + * determine whether the context is user mode or kernel mode with + * SEGMENT_RPL_MASK is wrong because the PV kernel's privilege level + * matches the 0x3 mask. + * + * Testing with USER_SEGMENT_RPL_MASK is valid for both native and Xen PV + * kernels because privilege level 2 is never used. + */ +#define USER_SEGMENT_RPL_MASK 0x2 + /* User mode is privilege level 3: */ #define USER_RPL 0x3 diff --git a/arch/x86/include/asm/syscall_wrapper.h b/arch/x86/include/asm/syscall_wrapper.h index e046a405743d..e2389ce9bf58 100644 --- a/arch/x86/include/asm/syscall_wrapper.h +++ b/arch/x86/include/asm/syscall_wrapper.h @@ -6,6 +6,8 @@ #ifndef _ASM_X86_SYSCALL_WRAPPER_H #define _ASM_X86_SYSCALL_WRAPPER_H +struct pt_regs; + /* Mapping of registers to parameters for syscalls on x86-64 and x32 */ #define SC_X86_64_REGS_TO_ARGS(x, ...) \ __MAP(x,__SC_ARGS \ @@ -28,13 +30,21 @@ * kernel/sys_ni.c and SYS_NI in kernel/time/posix-stubs.c to cover this * case as well. */ +#define __IA32_COMPAT_SYS_STUB0(x, name) \ + asmlinkage long __ia32_compat_sys_##name(const struct pt_regs *regs);\ + ALLOW_ERROR_INJECTION(__ia32_compat_sys_##name, ERRNO); \ + asmlinkage long __ia32_compat_sys_##name(const struct pt_regs *regs)\ + { \ + return __se_compat_sys_##name(); \ + } + #define __IA32_COMPAT_SYS_STUBx(x, name, ...) \ asmlinkage long __ia32_compat_sys##name(const struct pt_regs *regs);\ ALLOW_ERROR_INJECTION(__ia32_compat_sys##name, ERRNO); \ asmlinkage long __ia32_compat_sys##name(const struct pt_regs *regs)\ { \ return __se_compat_sys##name(SC_IA32_REGS_TO_ARGS(x,__VA_ARGS__));\ - } \ + } #define __IA32_SYS_STUBx(x, name, ...) \ asmlinkage long __ia32_sys##name(const struct pt_regs *regs); \ @@ -48,16 +58,23 @@ * To keep the naming coherent, re-define SYSCALL_DEFINE0 to create an alias * named __ia32_sys_*() */ -#define SYSCALL_DEFINE0(sname) \ - SYSCALL_METADATA(_##sname, 0); \ - asmlinkage long __x64_sys_##sname(void); \ - ALLOW_ERROR_INJECTION(__x64_sys_##sname, ERRNO); \ - SYSCALL_ALIAS(__ia32_sys_##sname, __x64_sys_##sname); \ - asmlinkage long __x64_sys_##sname(void) -#define COND_SYSCALL(name) \ - cond_syscall(__x64_sys_##name); \ - cond_syscall(__ia32_sys_##name) +#define SYSCALL_DEFINE0(sname) \ + SYSCALL_METADATA(_##sname, 0); \ + asmlinkage long __x64_sys_##sname(const struct pt_regs *__unused);\ + ALLOW_ERROR_INJECTION(__x64_sys_##sname, ERRNO); \ + SYSCALL_ALIAS(__ia32_sys_##sname, __x64_sys_##sname); \ + asmlinkage long __x64_sys_##sname(const struct pt_regs *__unused) + +#define COND_SYSCALL(name) \ + asmlinkage __weak long __x64_sys_##name(const struct pt_regs *__unused) \ + { \ + return sys_ni_syscall(); \ + } \ + asmlinkage __weak long __ia32_sys_##name(const struct pt_regs *__unused)\ + { \ + return sys_ni_syscall(); \ + } #define SYS_NI(name) \ SYSCALL_ALIAS(__x64_sys_##name, sys_ni_posix_timers); \ @@ -75,15 +92,24 @@ * of the x86-64-style parameter ordering of x32 syscalls. The syscalls common * with x86_64 obviously do not need such care. */ +#define __X32_COMPAT_SYS_STUB0(x, name, ...) \ + asmlinkage long __x32_compat_sys_##name(const struct pt_regs *regs);\ + ALLOW_ERROR_INJECTION(__x32_compat_sys_##name, ERRNO); \ + asmlinkage long __x32_compat_sys_##name(const struct pt_regs *regs)\ + { \ + return __se_compat_sys_##name();\ + } + #define __X32_COMPAT_SYS_STUBx(x, name, ...) \ asmlinkage long __x32_compat_sys##name(const struct pt_regs *regs);\ ALLOW_ERROR_INJECTION(__x32_compat_sys##name, ERRNO); \ asmlinkage long __x32_compat_sys##name(const struct pt_regs *regs)\ { \ return __se_compat_sys##name(SC_X86_64_REGS_TO_ARGS(x,__VA_ARGS__));\ - } \ + } #else /* CONFIG_X86_X32 */ +#define __X32_COMPAT_SYS_STUB0(x, name) #define __X32_COMPAT_SYS_STUBx(x, name, ...) #endif /* CONFIG_X86_X32 */ @@ -94,6 +120,17 @@ * mapping of registers to parameters, we need to generate stubs for each * of them. */ +#define COMPAT_SYSCALL_DEFINE0(name) \ + static long __se_compat_sys_##name(void); \ + static inline long __do_compat_sys_##name(void); \ + __IA32_COMPAT_SYS_STUB0(x, name) \ + __X32_COMPAT_SYS_STUB0(x, name) \ + static long __se_compat_sys_##name(void) \ + { \ + return __do_compat_sys_##name(); \ + } \ + static inline long __do_compat_sys_##name(void) + #define COMPAT_SYSCALL_DEFINEx(x, name, ...) \ static long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ @@ -181,15 +218,19 @@ * macros to work correctly. */ #ifndef SYSCALL_DEFINE0 -#define SYSCALL_DEFINE0(sname) \ - SYSCALL_METADATA(_##sname, 0); \ - asmlinkage long __x64_sys_##sname(void); \ - ALLOW_ERROR_INJECTION(__x64_sys_##sname, ERRNO); \ - asmlinkage long __x64_sys_##sname(void) +#define SYSCALL_DEFINE0(sname) \ + SYSCALL_METADATA(_##sname, 0); \ + asmlinkage long __x64_sys_##sname(const struct pt_regs *__unused);\ + ALLOW_ERROR_INJECTION(__x64_sys_##sname, ERRNO); \ + asmlinkage long __x64_sys_##sname(const struct pt_regs *__unused) #endif #ifndef COND_SYSCALL -#define COND_SYSCALL(name) cond_syscall(__x64_sys_##name) +#define COND_SYSCALL(name) \ + asmlinkage __weak long __x64_sys_##name(const struct pt_regs *__unused) \ + { \ + return sys_ni_syscall(); \ + } #endif #ifndef SYS_NI @@ -201,7 +242,6 @@ * For VSYSCALLS, we need to declare these three syscalls with the new * pt_regs-based calling convention for in-kernel use. */ -struct pt_regs; asmlinkage long __x64_sys_getcpu(const struct pt_regs *regs); asmlinkage long __x64_sys_gettimeofday(const struct pt_regs *regs); asmlinkage long __x64_sys_time(const struct pt_regs *regs); diff --git a/arch/x86/include/asm/tce.h b/arch/x86/include/asm/tce.h deleted file mode 100644 index 6ed2deacf1d0..000000000000 --- a/arch/x86/include/asm/tce.h +++ /dev/null @@ -1,35 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * This file is derived from asm-powerpc/tce.h. - * - * Copyright (C) IBM Corporation, 2006 - * - * Author: Muli Ben-Yehuda <muli@il.ibm.com> - * Author: Jon Mason <jdmason@us.ibm.com> - */ - -#ifndef _ASM_X86_TCE_H -#define _ASM_X86_TCE_H - -extern unsigned int specified_table_size; -struct iommu_table; - -#define TCE_ENTRY_SIZE 8 /* in bytes */ - -#define TCE_READ_SHIFT 0 -#define TCE_WRITE_SHIFT 1 -#define TCE_HUBID_SHIFT 2 /* unused */ -#define TCE_RSVD_SHIFT 8 /* unused */ -#define TCE_RPN_SHIFT 12 -#define TCE_UNUSED_SHIFT 48 /* unused */ - -#define TCE_RPN_MASK 0x0000fffffffff000ULL - -extern void tce_build(struct iommu_table *tbl, unsigned long index, - unsigned int npages, unsigned long uaddr, int direction); -extern void tce_free(struct iommu_table *tbl, long index, unsigned int npages); -extern void * __init alloc_tce_table(void); -extern void __init free_tce_table(void *tbl); -extern int __init build_tce_table(struct pci_dev *dev, void __iomem *bbar); - -#endif /* _ASM_X86_TCE_H */ diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 5e8319bb207a..23c626a742e8 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -26,10 +26,11 @@ static inline void apply_paravirt(struct paravirt_patch_site *start, #define POKE_MAX_OPCODE_SIZE 5 struct text_poke_loc { - void *detour; void *addr; - size_t len; - const char opcode[POKE_MAX_OPCODE_SIZE]; + int len; + s32 rel32; + u8 opcode; + const u8 text[POKE_MAX_OPCODE_SIZE]; }; extern void text_poke_early(void *addr, const void *opcode, size_t len); @@ -51,8 +52,10 @@ extern void text_poke_early(void *addr, const void *opcode, size_t len); extern void *text_poke(void *addr, const void *opcode, size_t len); extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); extern int poke_int3_handler(struct pt_regs *regs); -extern void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); +extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate); extern void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries); +extern void text_poke_loc_init(struct text_poke_loc *tp, void *addr, + const void *opcode, size_t len, const void *emulate); extern int after_bootmem; extern __ro_after_init struct mm_struct *poking_mm; extern __ro_after_init unsigned long poking_addr; @@ -63,8 +66,17 @@ static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip) regs->ip = ip; } -#define INT3_INSN_SIZE 1 -#define CALL_INSN_SIZE 5 +#define INT3_INSN_SIZE 1 +#define INT3_INSN_OPCODE 0xCC + +#define CALL_INSN_SIZE 5 +#define CALL_INSN_OPCODE 0xE8 + +#define JMP32_INSN_SIZE 5 +#define JMP32_INSN_OPCODE 0xE9 + +#define JMP8_INSN_SIZE 2 +#define JMP8_INSN_OPCODE 0xEB static inline void int3_emulate_push(struct pt_regs *regs, unsigned long val) { diff --git a/arch/x86/include/asm/trace/hyperv.h b/arch/x86/include/asm/trace/hyperv.h index ace464f09681..4d705cb4d63b 100644 --- a/arch/x86/include/asm/trace/hyperv.h +++ b/arch/x86/include/asm/trace/hyperv.h @@ -71,6 +71,21 @@ TRACE_EVENT(hyperv_send_ipi_mask, __entry->ncpus, __entry->vector) ); +TRACE_EVENT(hyperv_send_ipi_one, + TP_PROTO(int cpu, + int vector), + TP_ARGS(cpu, vector), + TP_STRUCT__entry( + __field(int, cpu) + __field(int, vector) + ), + TP_fast_assign(__entry->cpu = cpu; + __entry->vector = vector; + ), + TP_printk("cpu %d vector %x", + __entry->cpu, __entry->vector) + ); + #endif /* CONFIG_HYPERV */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 35c225ede0e4..61d93f062a36 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -734,5 +734,28 @@ do { \ if (unlikely(__gu_err)) goto err_label; \ } while (0) +/* + * We want the unsafe accessors to always be inlined and use + * the error labels - thus the macro games. + */ +#define unsafe_copy_loop(dst, src, len, type, label) \ + while (len >= sizeof(type)) { \ + unsafe_put_user(*(type *)src,(type __user *)dst,label); \ + dst += sizeof(type); \ + src += sizeof(type); \ + len -= sizeof(type); \ + } + +#define unsafe_copy_to_user(_dst,_src,_len,label) \ +do { \ + char __user *__ucu_dst = (_dst); \ + const char *__ucu_src = (_src); \ + size_t __ucu_len = (_len); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u64, label); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u32, label); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u16, label); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, label); \ +} while (0) + #endif /* _ASM_X86_UACCESS_H */ diff --git a/arch/x86/include/asm/umip.h b/arch/x86/include/asm/umip.h index db43f2a0d92c..aeed98c3c9e1 100644 --- a/arch/x86/include/asm/umip.h +++ b/arch/x86/include/asm/umip.h @@ -4,9 +4,9 @@ #include <linux/types.h> #include <asm/ptrace.h> -#ifdef CONFIG_X86_INTEL_UMIP +#ifdef CONFIG_X86_UMIP bool fixup_umip_exception(struct pt_regs *regs); #else static inline bool fixup_umip_exception(struct pt_regs *regs) { return false; } -#endif /* CONFIG_X86_INTEL_UMIP */ +#endif /* CONFIG_X86_UMIP */ #endif /* _ASM_X86_UMIP_H */ diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index 6e7caf65fa40..389174eaec79 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -138,7 +138,7 @@ extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect); extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *); extern int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus); -extern void uv_bios_init(void); +extern int uv_bios_init(void); extern unsigned long sn_rtc_cycles_per_second; extern int uv_type; diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h index 6bc6d89d8e2a..45ea95ce79b4 100644 --- a/arch/x86/include/asm/uv/uv.h +++ b/arch/x86/include/asm/uv/uv.h @@ -12,6 +12,16 @@ struct mm_struct; #ifdef CONFIG_X86_UV #include <linux/efi.h> +#define UV_PROC_NODE "sgi_uv" + +static inline int uv(int uvtype) +{ + /* uv(0) is "any" */ + if (uvtype >= 0 && uvtype <= 30) + return 1 << uvtype; + return 1; +} + extern unsigned long uv_systab_phys; extern enum uv_system_type get_uv_system_type(void); @@ -20,7 +30,8 @@ static inline bool is_early_uv_system(void) return uv_systab_phys && uv_systab_phys != EFI_INVALID_TABLE_ADDR; } extern int is_uv_system(void); -extern int is_uv_hubless(void); +extern int is_uv_hubbed(int uvtype); +extern int is_uv_hubless(int uvtype); extern void uv_cpu_init(void); extern void uv_nmi_init(void); extern void uv_system_init(void); @@ -32,7 +43,8 @@ extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, static inline enum uv_system_type get_uv_system_type(void) { return UV_NONE; } static inline bool is_early_uv_system(void) { return 0; } static inline int is_uv_system(void) { return 0; } -static inline int is_uv_hubless(void) { return 0; } +static inline int is_uv_hubbed(int uv) { return 0; } +static inline int is_uv_hubless(int uv) { return 0; } static inline void uv_cpu_init(void) { } static inline void uv_system_init(void) { } static inline const struct cpumask * diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 44cf6d6deb7a..950cd1395d5d 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -19,6 +19,7 @@ #include <linux/topology.h> #include <asm/types.h> #include <asm/percpu.h> +#include <asm/uv/uv.h> #include <asm/uv/uv_mmrs.h> #include <asm/uv/bios.h> #include <asm/irq_vectors.h> @@ -243,83 +244,61 @@ static inline int uv_hub_info_check(int version) #define UV4_HUB_REVISION_BASE 7 #define UV4A_HUB_REVISION_BASE 8 /* UV4 (fixed) rev 2 */ -#ifdef UV1_HUB_IS_SUPPORTED +/* WARNING: UVx_HUB_IS_SUPPORTED defines are deprecated and will be removed */ static inline int is_uv1_hub(void) { - return uv_hub_info->hub_revision < UV2_HUB_REVISION_BASE; -} +#ifdef UV1_HUB_IS_SUPPORTED + return is_uv_hubbed(uv(1)); #else -static inline int is_uv1_hub(void) -{ return 0; -} #endif +} -#ifdef UV2_HUB_IS_SUPPORTED static inline int is_uv2_hub(void) { - return ((uv_hub_info->hub_revision >= UV2_HUB_REVISION_BASE) && - (uv_hub_info->hub_revision < UV3_HUB_REVISION_BASE)); -} +#ifdef UV2_HUB_IS_SUPPORTED + return is_uv_hubbed(uv(2)); #else -static inline int is_uv2_hub(void) -{ return 0; -} #endif +} -#ifdef UV3_HUB_IS_SUPPORTED static inline int is_uv3_hub(void) { - return ((uv_hub_info->hub_revision >= UV3_HUB_REVISION_BASE) && - (uv_hub_info->hub_revision < UV4_HUB_REVISION_BASE)); -} +#ifdef UV3_HUB_IS_SUPPORTED + return is_uv_hubbed(uv(3)); #else -static inline int is_uv3_hub(void) -{ return 0; -} #endif +} /* First test "is UV4A", then "is UV4" */ -#ifdef UV4A_HUB_IS_SUPPORTED -static inline int is_uv4a_hub(void) -{ - return (uv_hub_info->hub_revision >= UV4A_HUB_REVISION_BASE); -} -#else static inline int is_uv4a_hub(void) { +#ifdef UV4A_HUB_IS_SUPPORTED + if (is_uv_hubbed(uv(4))) + return (uv_hub_info->hub_revision == UV4A_HUB_REVISION_BASE); +#endif return 0; } -#endif -#ifdef UV4_HUB_IS_SUPPORTED static inline int is_uv4_hub(void) { - return uv_hub_info->hub_revision >= UV4_HUB_REVISION_BASE; -} +#ifdef UV4_HUB_IS_SUPPORTED + return is_uv_hubbed(uv(4)); #else -static inline int is_uv4_hub(void) -{ return 0; -} #endif +} static inline int is_uvx_hub(void) { - if (uv_hub_info->hub_revision >= UV2_HUB_REVISION_BASE) - return uv_hub_info->hub_revision; - - return 0; + return (is_uv_hubbed(-2) >= uv(2)); } static inline int is_uv_hub(void) { -#ifdef UV1_HUB_IS_SUPPORTED - return uv_hub_info->hub_revision; -#endif - return is_uvx_hub(); + return is_uv1_hub() || is_uvx_hub(); } union uvh_apicid { diff --git a/arch/x86/include/asm/vmware.h b/arch/x86/include/asm/vmware.h index e00c9e875933..ac9fc51e2b18 100644 --- a/arch/x86/include/asm/vmware.h +++ b/arch/x86/include/asm/vmware.h @@ -4,6 +4,7 @@ #include <asm/cpufeatures.h> #include <asm/alternative.h> +#include <linux/stringify.h> /* * The hypercall definitions differ in the low word of the %edx argument @@ -20,8 +21,8 @@ */ /* Old port-based version */ -#define VMWARE_HYPERVISOR_PORT "0x5658" -#define VMWARE_HYPERVISOR_PORT_HB "0x5659" +#define VMWARE_HYPERVISOR_PORT 0x5658 +#define VMWARE_HYPERVISOR_PORT_HB 0x5659 /* Current vmcall / vmmcall version */ #define VMWARE_HYPERVISOR_HB BIT(0) @@ -29,7 +30,8 @@ /* The low bandwidth call. The low word of edx is presumed clear. */ #define VMWARE_HYPERCALL \ - ALTERNATIVE_2("movw $" VMWARE_HYPERVISOR_PORT ", %%dx; inl (%%dx)", \ + ALTERNATIVE_2("movw $" __stringify(VMWARE_HYPERVISOR_PORT) ", %%dx; " \ + "inl (%%dx), %%eax", \ "vmcall", X86_FEATURE_VMCALL, \ "vmmcall", X86_FEATURE_VMW_VMMCALL) @@ -38,7 +40,8 @@ * HB and OUT bits set. */ #define VMWARE_HYPERCALL_HB_OUT \ - ALTERNATIVE_2("movw $" VMWARE_HYPERVISOR_PORT_HB ", %%dx; rep outsb", \ + ALTERNATIVE_2("movw $" __stringify(VMWARE_HYPERVISOR_PORT_HB) ", %%dx; " \ + "rep outsb", \ "vmcall", X86_FEATURE_VMCALL, \ "vmmcall", X86_FEATURE_VMW_VMMCALL) @@ -47,7 +50,8 @@ * HB bit set. */ #define VMWARE_HYPERCALL_HB_IN \ - ALTERNATIVE_2("movw $" VMWARE_HYPERVISOR_PORT_HB ", %%dx; rep insb", \ + ALTERNATIVE_2("movw $" __stringify(VMWARE_HYPERVISOR_PORT_HB) ", %%dx; " \ + "rep insb", \ "vmcall", X86_FEATURE_VMCALL, \ "vmmcall", X86_FEATURE_VMW_VMMCALL) #endif diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index c895df5482c5..8669c6bdbb84 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -2,7 +2,7 @@ #ifndef _ASM_X86_BOOTPARAM_H #define _ASM_X86_BOOTPARAM_H -/* setup_data types */ +/* setup_data/setup_indirect types */ #define SETUP_NONE 0 #define SETUP_E820_EXT 1 #define SETUP_DTB 2 @@ -11,6 +11,11 @@ #define SETUP_APPLE_PROPERTIES 5 #define SETUP_JAILHOUSE 6 +#define SETUP_INDIRECT (1<<31) + +/* SETUP_INDIRECT | max(SETUP_*) */ +#define SETUP_TYPE_MAX (SETUP_INDIRECT | SETUP_JAILHOUSE) + /* ram_size flags */ #define RAMDISK_IMAGE_START_MASK 0x07FF #define RAMDISK_PROMPT_FLAG 0x8000 @@ -49,6 +54,14 @@ struct setup_data { __u8 data[0]; }; +/* extensible setup indirect data node */ +struct setup_indirect { + __u32 type; + __u32 reserved; /* Reserved, must be set to zero. */ + __u64 len; + __u64 addr; +}; + struct setup_header { __u8 setup_sects; __u16 root_flags; @@ -88,6 +101,7 @@ struct setup_header { __u64 pref_address; __u32 init_size; __u32 handover_offset; + __u32 kernel_info_offset; } __attribute__((packed)); struct sys_desc_table { @@ -139,15 +153,22 @@ struct boot_e820_entry { * setup data structure. */ struct jailhouse_setup_data { - __u16 version; - __u16 compatible_version; - __u16 pm_timer_address; - __u16 num_cpus; - __u64 pci_mmconfig_base; - __u32 tsc_khz; - __u32 apic_khz; - __u8 standard_ioapic; - __u8 cpu_ids[255]; + struct { + __u16 version; + __u16 compatible_version; + } __attribute__((packed)) hdr; + struct { + __u16 pm_timer_address; + __u16 num_cpus; + __u64 pci_mmconfig_base; + __u32 tsc_khz; + __u32 apic_khz; + __u8 standard_ioapic; + __u8 cpu_ids[255]; + } __attribute__((packed)) v1; + struct { + __u32 flags; + } __attribute__((packed)) v2; } __attribute__((packed)); /* The so-called "zeropage" */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 3578ad248bc9..32acb970f416 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -134,7 +134,7 @@ obj-$(CONFIG_EFI) += sysfb_efi.o obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o -obj-$(CONFIG_X86_INTEL_UMIP) += umip.o +obj-$(CONFIG_X86_UMIP) += umip.o obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o @@ -146,7 +146,6 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_AUDIT) += audit_64.o obj-$(CONFIG_GART_IOMMU) += amd_gart_64.o aperture_64.o - obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o obj-$(CONFIG_MMCONF_FAM10H) += mmconf-fam10h_64.o obj-y += vsmp_64.o diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 9d3a971ea364..9ec463fe96f2 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -956,16 +956,15 @@ NOKPROBE_SYMBOL(patch_cmp); int poke_int3_handler(struct pt_regs *regs) { struct text_poke_loc *tp; - unsigned char int3 = 0xcc; void *ip; /* * Having observed our INT3 instruction, we now must observe * bp_patching.nr_entries. * - * nr_entries != 0 INT3 - * WMB RMB - * write INT3 if (nr_entries) + * nr_entries != 0 INT3 + * WMB RMB + * write INT3 if (nr_entries) * * Idem for other elements in bp_patching. */ @@ -978,9 +977,9 @@ int poke_int3_handler(struct pt_regs *regs) return 0; /* - * Discount the sizeof(int3). See text_poke_bp_batch(). + * Discount the INT3. See text_poke_bp_batch(). */ - ip = (void *) regs->ip - sizeof(int3); + ip = (void *) regs->ip - INT3_INSN_SIZE; /* * Skip the binary search if there is a single member in the vector. @@ -997,8 +996,28 @@ int poke_int3_handler(struct pt_regs *regs) return 0; } - /* set up the specified breakpoint detour */ - regs->ip = (unsigned long) tp->detour; + ip += tp->len; + + switch (tp->opcode) { + case INT3_INSN_OPCODE: + /* + * Someone poked an explicit INT3, they'll want to handle it, + * do not consume. + */ + return 0; + + case CALL_INSN_OPCODE: + int3_emulate_call(regs, (long)ip + tp->rel32); + break; + + case JMP32_INSN_OPCODE: + case JMP8_INSN_OPCODE: + int3_emulate_jmp(regs, (long)ip + tp->rel32); + break; + + default: + BUG(); + } return 1; } @@ -1014,7 +1033,7 @@ NOKPROBE_SYMBOL(poke_int3_handler); * synchronization using int3 breakpoint. * * The way it is done: - * - For each entry in the vector: + * - For each entry in the vector: * - add a int3 trap to the address that will be patched * - sync cores * - For each entry in the vector: @@ -1027,9 +1046,9 @@ NOKPROBE_SYMBOL(poke_int3_handler); */ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) { - int patched_all_but_first = 0; - unsigned char int3 = 0xcc; + unsigned char int3 = INT3_INSN_OPCODE; unsigned int i; + int do_sync; lockdep_assert_held(&text_mutex); @@ -1053,16 +1072,16 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) /* * Second step: update all but the first byte of the patched range. */ - for (i = 0; i < nr_entries; i++) { + for (do_sync = 0, i = 0; i < nr_entries; i++) { if (tp[i].len - sizeof(int3) > 0) { text_poke((char *)tp[i].addr + sizeof(int3), - (const char *)tp[i].opcode + sizeof(int3), + (const char *)tp[i].text + sizeof(int3), tp[i].len - sizeof(int3)); - patched_all_but_first++; + do_sync++; } } - if (patched_all_but_first) { + if (do_sync) { /* * According to Intel, this core syncing is very likely * not necessary and we'd be safe even without it. But @@ -1075,10 +1094,17 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) * Third step: replace the first byte (int3) by the first byte of * replacing opcode. */ - for (i = 0; i < nr_entries; i++) - text_poke(tp[i].addr, tp[i].opcode, sizeof(int3)); + for (do_sync = 0, i = 0; i < nr_entries; i++) { + if (tp[i].text[0] == INT3_INSN_OPCODE) + continue; + + text_poke(tp[i].addr, tp[i].text, sizeof(int3)); + do_sync++; + } + + if (do_sync) + on_each_cpu(do_sync_core, NULL, 1); - on_each_cpu(do_sync_core, NULL, 1); /* * sync_core() implies an smp_mb() and orders this store against * the writing of the new instruction. @@ -1087,6 +1113,60 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) bp_patching.nr_entries = 0; } +void text_poke_loc_init(struct text_poke_loc *tp, void *addr, + const void *opcode, size_t len, const void *emulate) +{ + struct insn insn; + + if (!opcode) + opcode = (void *)tp->text; + else + memcpy((void *)tp->text, opcode, len); + + if (!emulate) + emulate = opcode; + + kernel_insn_init(&insn, emulate, MAX_INSN_SIZE); + insn_get_length(&insn); + + BUG_ON(!insn_complete(&insn)); + BUG_ON(len != insn.length); + + tp->addr = addr; + tp->len = len; + tp->opcode = insn.opcode.bytes[0]; + + switch (tp->opcode) { + case INT3_INSN_OPCODE: + break; + + case CALL_INSN_OPCODE: + case JMP32_INSN_OPCODE: + case JMP8_INSN_OPCODE: + tp->rel32 = insn.immediate.value; + break; + + default: /* assume NOP */ + switch (len) { + case 2: /* NOP2 -- emulate as JMP8+0 */ + BUG_ON(memcmp(emulate, ideal_nops[len], len)); + tp->opcode = JMP8_INSN_OPCODE; + tp->rel32 = 0; + break; + + case 5: /* NOP5 -- emulate as JMP32+0 */ + BUG_ON(memcmp(emulate, ideal_nops[NOP_ATOMIC5], len)); + tp->opcode = JMP32_INSN_OPCODE; + tp->rel32 = 0; + break; + + default: /* unknown instruction */ + BUG(); + } + break; + } +} + /** * text_poke_bp() -- update instructions on live kernel on SMP * @addr: address to patch @@ -1098,20 +1178,10 @@ void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) * dynamically allocated memory. This function should be used when it is * not possible to allocate memory. */ -void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) +void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate) { - struct text_poke_loc tp = { - .detour = handler, - .addr = addr, - .len = len, - }; - - if (len > POKE_MAX_OPCODE_SIZE) { - WARN_ONCE(1, "len is larger than %d\n", POKE_MAX_OPCODE_SIZE); - return; - } - - memcpy((void *)tp.opcode, opcode, len); + struct text_poke_loc tp; + text_poke_loc_init(&tp, addr, opcode, len, emulate); text_poke_bp_batch(&tp, 1); } diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index a6ac3712db8b..4bbccb9d16dc 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -510,10 +510,9 @@ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) iommu_size -= round_up(a, PMD_PAGE_SIZE) - a; if (iommu_size < 64*1024*1024) { - pr_warning( - "PCI-DMA: Warning: Small IOMMU %luMB." + pr_warn("PCI-DMA: Warning: Small IOMMU %luMB." " Consider increasing the AGP aperture in BIOS\n", - iommu_size >> 20); + iommu_size >> 20); } return iommu_size; @@ -665,8 +664,7 @@ static __init int init_amd_gatt(struct agp_kern_info *info) nommu: /* Should not happen anymore */ - pr_warning("PCI-DMA: More than 4GB of RAM and no IOMMU\n" - "falling back to iommu=soft.\n"); + pr_warn("PCI-DMA: More than 4GB of RAM and no IOMMU - falling back to iommu=soft.\n"); return -1; } @@ -733,8 +731,8 @@ int __init gart_iommu_init(void) !gart_iommu_aperture || (no_agp && init_amd_gatt(&info) < 0)) { if (max_pfn > MAX_DMA32_PFN) { - pr_warning("More than 4GB of memory but GART IOMMU not available.\n"); - pr_warning("falling back to iommu=soft.\n"); + pr_warn("More than 4GB of memory but GART IOMMU not available.\n"); + pr_warn("falling back to iommu=soft.\n"); } return 0; } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 9e2dd2b296cd..28446fa6bf18 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -780,8 +780,8 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc) res = (((u64)deltapm) * mult) >> 22; do_div(res, 1000000); - pr_warning("APIC calibration not consistent " - "with PM-Timer: %ldms instead of 100ms\n",(long)res); + pr_warn("APIC calibration not consistent " + "with PM-Timer: %ldms instead of 100ms\n", (long)res); /* Correct the lapic counter value */ res = (((u64)(*delta)) * pm_100ms); @@ -977,7 +977,7 @@ static int __init calibrate_APIC_clock(void) */ if (lapic_timer_period < (1000000 / HZ)) { local_irq_enable(); - pr_warning("APIC frequency too slow, disabling apic timer\n"); + pr_warn("APIC frequency too slow, disabling apic timer\n"); return -1; } @@ -1021,7 +1021,7 @@ static int __init calibrate_APIC_clock(void) local_irq_enable(); if (levt->features & CLOCK_EVT_FEAT_DUMMY) { - pr_warning("APIC timer disabled due to verification failure\n"); + pr_warn("APIC timer disabled due to verification failure\n"); return -1; } @@ -1095,8 +1095,8 @@ static void local_apic_timer_interrupt(void) * spurious. */ if (!evt->event_handler) { - pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", - smp_processor_id()); + pr_warn("Spurious LAPIC timer interrupt on cpu %d\n", + smp_processor_id()); /* Switch it off */ lapic_timer_shutdown(evt); return; @@ -1586,9 +1586,6 @@ static void setup_local_APIC(void) { int cpu = smp_processor_id(); unsigned int value; -#ifdef CONFIG_X86_32 - int logical_apicid, ldr_apicid; -#endif if (disable_apic) { disable_ioapic_support(); @@ -1626,16 +1623,21 @@ static void setup_local_APIC(void) apic->init_apic_ldr(); #ifdef CONFIG_X86_32 - /* - * APIC LDR is initialized. If logical_apicid mapping was - * initialized during get_smp_config(), make sure it matches the - * actual value. - */ - logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); - ldr_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); - WARN_ON(logical_apicid != BAD_APICID && logical_apicid != ldr_apicid); - /* always use the value from LDR */ - early_per_cpu(x86_cpu_to_logical_apicid, cpu) = ldr_apicid; + if (apic->dest_logical) { + int logical_apicid, ldr_apicid; + + /* + * APIC LDR is initialized. If logical_apicid mapping was + * initialized during get_smp_config(), make sure it matches + * the actual value. + */ + logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); + ldr_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR)); + if (logical_apicid != BAD_APICID) + WARN_ON(logical_apicid != ldr_apicid); + /* Always use the value from LDR. */ + early_per_cpu(x86_cpu_to_logical_apicid, cpu) = ldr_apicid; + } #endif /* @@ -1809,11 +1811,11 @@ static int __init setup_nox2apic(char *str) int apicid = native_apic_msr_read(APIC_ID); if (apicid >= 255) { - pr_warning("Apicid: %08x, cannot enforce nox2apic\n", - apicid); + pr_warn("Apicid: %08x, cannot enforce nox2apic\n", + apicid); return 0; } - pr_warning("x2apic already enabled.\n"); + pr_warn("x2apic already enabled.\n"); __x2apic_disable(); } setup_clear_cpu_cap(X86_FEATURE_X2APIC); @@ -1981,7 +1983,7 @@ static int __init apic_verify(void) */ features = cpuid_edx(1); if (!(features & (1 << X86_FEATURE_APIC))) { - pr_warning("Could not enable APIC!\n"); + pr_warn("Could not enable APIC!\n"); return -1; } set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); @@ -2335,7 +2337,7 @@ static int cpuid_to_apicid[] = { #ifdef CONFIG_SMP /** * apic_id_is_primary_thread - Check whether APIC ID belongs to a primary thread - * @id: APIC ID to check + * @apicid: APIC ID to check */ bool apic_id_is_primary_thread(unsigned int apicid) { @@ -2408,9 +2410,8 @@ int generic_processor_info(int apicid, int version) disabled_cpu_apicid == apicid) { int thiscpu = num_processors + disabled_cpus; - pr_warning("APIC: Disabling requested cpu." - " Processor %d/0x%x ignored.\n", - thiscpu, apicid); + pr_warn("APIC: Disabling requested cpu." + " Processor %d/0x%x ignored.\n", thiscpu, apicid); disabled_cpus++; return -ENODEV; @@ -2424,8 +2425,7 @@ int generic_processor_info(int apicid, int version) apicid != boot_cpu_physical_apicid) { int thiscpu = max + disabled_cpus - 1; - pr_warning( - "APIC: NR_CPUS/possible_cpus limit of %i almost" + pr_warn("APIC: NR_CPUS/possible_cpus limit of %i almost" " reached. Keeping one slot for boot cpu." " Processor %d/0x%x ignored.\n", max, thiscpu, apicid); @@ -2436,9 +2436,8 @@ int generic_processor_info(int apicid, int version) if (num_processors >= nr_cpu_ids) { int thiscpu = max + disabled_cpus; - pr_warning("APIC: NR_CPUS/possible_cpus limit of %i " - "reached. Processor %d/0x%x ignored.\n", - max, thiscpu, apicid); + pr_warn("APIC: NR_CPUS/possible_cpus limit of %i reached. " + "Processor %d/0x%x ignored.\n", max, thiscpu, apicid); disabled_cpus++; return -EINVAL; @@ -2468,13 +2467,13 @@ int generic_processor_info(int apicid, int version) * Validate version */ if (version == 0x0) { - pr_warning("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n", - cpu, apicid); + pr_warn("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n", + cpu, apicid); version = 0x10; } if (version != boot_cpu_apic_version) { - pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n", + pr_warn("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n", boot_cpu_apic_version, cpu, version); } @@ -2843,7 +2842,7 @@ static int __init apic_set_verbosity(char *arg) apic_verbosity = APIC_VERBOSE; #ifdef CONFIG_X86_64 else { - pr_warning("APIC Verbosity level %s not recognised" + pr_warn("APIC Verbosity level %s not recognised" " use apic=verbose or apic=debug\n", arg); return -EINVAL; } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d6af97fd170a..913c88617848 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1725,19 +1725,20 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data) return false; } -static inline bool ioapic_irqd_mask(struct irq_data *data) +static inline bool ioapic_prepare_move(struct irq_data *data) { - /* If we are moving the irq we need to mask it */ + /* If we are moving the IRQ we need to mask it */ if (unlikely(irqd_is_setaffinity_pending(data))) { - mask_ioapic_irq(data); + if (!irqd_irq_masked(data)) + mask_ioapic_irq(data); return true; } return false; } -static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked) +static inline void ioapic_finish_move(struct irq_data *data, bool moveit) { - if (unlikely(masked)) { + if (unlikely(moveit)) { /* Only migrate the irq if the ack has been received. * * On rare occasions the broadcast level triggered ack gets @@ -1766,15 +1767,17 @@ static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked) */ if (!io_apic_level_ack_pending(data->chip_data)) irq_move_masked_irq(data); - unmask_ioapic_irq(data); + /* If the IRQ is masked in the core, leave it: */ + if (!irqd_irq_masked(data)) + unmask_ioapic_irq(data); } } #else -static inline bool ioapic_irqd_mask(struct irq_data *data) +static inline bool ioapic_prepare_move(struct irq_data *data) { return false; } -static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked) +static inline void ioapic_finish_move(struct irq_data *data, bool moveit) { } #endif @@ -1783,11 +1786,11 @@ static void ioapic_ack_level(struct irq_data *irq_data) { struct irq_cfg *cfg = irqd_cfg(irq_data); unsigned long v; - bool masked; + bool moveit; int i; irq_complete_move(cfg); - masked = ioapic_irqd_mask(irq_data); + moveit = ioapic_prepare_move(irq_data); /* * It appears there is an erratum which affects at least version 0x11 @@ -1842,7 +1845,7 @@ static void ioapic_ack_level(struct irq_data *irq_data) eoi_ioapic_pin(cfg->vector, irq_data->chip_data); } - ioapic_irqd_unmask(irq_data, masked); + ioapic_finish_move(irq_data, moveit); } static void ioapic_ir_ack_level(struct irq_data *irq_data) diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 45e92cba92f5..b0889c48a2ac 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -156,7 +156,8 @@ static int x2apic_dead_cpu(unsigned int dead_cpu) { struct cluster_mask *cmsk = per_cpu(cluster_masks, dead_cpu); - cpumask_clear_cpu(dead_cpu, &cmsk->mask); + if (cmsk) + cpumask_clear_cpu(dead_cpu, &cmsk->mask); free_cpumask_var(per_cpu(ipi_mask, dead_cpu)); return 0; } diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index e6230af19864..d5b51a740524 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -14,6 +14,8 @@ #include <linux/memory.h> #include <linux/export.h> #include <linux/pci.h> +#include <linux/acpi.h> +#include <linux/efi.h> #include <asm/e820/api.h> #include <asm/uv/uv_mmrs.h> @@ -25,12 +27,17 @@ static DEFINE_PER_CPU(int, x2apic_extra_bits); static enum uv_system_type uv_system_type; -static bool uv_hubless_system; +static int uv_hubbed_system; +static int uv_hubless_system; static u64 gru_start_paddr, gru_end_paddr; static u64 gru_dist_base, gru_first_node_paddr = -1LL, gru_last_node_paddr; static u64 gru_dist_lmask, gru_dist_umask; static union uvh_apicid uvh_apicid; +/* Unpack OEM/TABLE ID's to be NULL terminated strings */ +static u8 oem_id[ACPI_OEM_ID_SIZE + 1]; +static u8 oem_table_id[ACPI_OEM_TABLE_ID_SIZE + 1]; + /* Information derived from CPUID: */ static struct { unsigned int apicid_shift; @@ -248,17 +255,35 @@ static void __init uv_set_apicid_hibit(void) } } -static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +static void __init uv_stringify(int len, char *to, char *from) +{ + /* Relies on 'to' being NULL chars so result will be NULL terminated */ + strncpy(to, from, len-1); +} + +static int __init uv_acpi_madt_oem_check(char *_oem_id, char *_oem_table_id) { int pnodeid; int uv_apic; + uv_stringify(sizeof(oem_id), oem_id, _oem_id); + uv_stringify(sizeof(oem_table_id), oem_table_id, _oem_table_id); + if (strncmp(oem_id, "SGI", 3) != 0) { - if (strncmp(oem_id, "NSGI", 4) == 0) { - uv_hubless_system = true; - pr_info("UV: OEM IDs %s/%s, HUBLESS\n", - oem_id, oem_table_id); - } + if (strncmp(oem_id, "NSGI", 4) != 0) + return 0; + + /* UV4 Hubless, CH, (0x11:UV4+Any) */ + if (strncmp(oem_id, "NSGI4", 5) == 0) + uv_hubless_system = 0x11; + + /* UV3 Hubless, UV300/MC990X w/o hub (0x9:UV3+Any) */ + else + uv_hubless_system = 0x9; + + pr_info("UV: OEM IDs %s/%s, HUBLESS(0x%x)\n", + oem_id, oem_table_id, uv_hubless_system); + return 0; } @@ -286,6 +311,24 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) if (uv_hub_info->hub_revision == 0) goto badbios; + switch (uv_hub_info->hub_revision) { + case UV4_HUB_REVISION_BASE: + uv_hubbed_system = 0x11; + break; + + case UV3_HUB_REVISION_BASE: + uv_hubbed_system = 0x9; + break; + + case UV2_HUB_REVISION_BASE: + uv_hubbed_system = 0x5; + break; + + case UV1_HUB_REVISION_BASE: + uv_hubbed_system = 0x3; + break; + } + pnodeid = early_get_pnodeid(); early_get_apic_socketid_shift(); @@ -336,9 +379,15 @@ int is_uv_system(void) } EXPORT_SYMBOL_GPL(is_uv_system); -int is_uv_hubless(void) +int is_uv_hubbed(int uvtype) +{ + return (uv_hubbed_system & uvtype); +} +EXPORT_SYMBOL_GPL(is_uv_hubbed); + +int is_uv_hubless(int uvtype) { - return uv_hubless_system; + return (uv_hubless_system & uvtype); } EXPORT_SYMBOL_GPL(is_uv_hubless); @@ -1255,7 +1304,8 @@ static int __init decode_uv_systab(void) struct uv_systab *st; int i; - if (uv_hub_info->hub_revision < UV4_HUB_REVISION_BASE) + /* If system is uv3 or lower, there is no extended UVsystab */ + if (is_uv_hubbed(0xfffffe) < uv(4) && is_uv_hubless(0xfffffe) < uv(4)) return 0; /* No extended UVsystab required */ st = uv_systab; @@ -1434,6 +1484,103 @@ static void __init build_socket_tables(void) } } +/* Check which reboot to use */ +static void check_efi_reboot(void) +{ + /* If EFI reboot not available, use ACPI reboot */ + if (!efi_enabled(EFI_BOOT)) + reboot_type = BOOT_ACPI; +} + +/* Setup user proc fs files */ +static int proc_hubbed_show(struct seq_file *file, void *data) +{ + seq_printf(file, "0x%x\n", uv_hubbed_system); + return 0; +} + +static int proc_hubless_show(struct seq_file *file, void *data) +{ + seq_printf(file, "0x%x\n", uv_hubless_system); + return 0; +} + +static int proc_oemid_show(struct seq_file *file, void *data) +{ + seq_printf(file, "%s/%s\n", oem_id, oem_table_id); + return 0; +} + +static int proc_hubbed_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_hubbed_show, (void *)NULL); +} + +static int proc_hubless_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_hubless_show, (void *)NULL); +} + +static int proc_oemid_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_oemid_show, (void *)NULL); +} + +/* (struct is "non-const" as open function is set at runtime) */ +static struct file_operations proc_version_fops = { + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations proc_oemid_fops = { + .open = proc_oemid_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static __init void uv_setup_proc_files(int hubless) +{ + struct proc_dir_entry *pde; + char *name = hubless ? "hubless" : "hubbed"; + + pde = proc_mkdir(UV_PROC_NODE, NULL); + proc_create("oemid", 0, pde, &proc_oemid_fops); + proc_create(name, 0, pde, &proc_version_fops); + if (hubless) + proc_version_fops.open = proc_hubless_open; + else + proc_version_fops.open = proc_hubbed_open; +} + +/* Initialize UV hubless systems */ +static __init int uv_system_init_hubless(void) +{ + int rc; + + /* Setup PCH NMI handler */ + uv_nmi_setup_hubless(); + + /* Init kernel/BIOS interface */ + rc = uv_bios_init(); + if (rc < 0) + return rc; + + /* Process UVsystab */ + rc = decode_uv_systab(); + if (rc < 0) + return rc; + + /* Create user access node */ + if (rc >= 0) + uv_setup_proc_files(1); + + check_efi_reboot(); + + return rc; +} + static void __init uv_system_init_hub(void) { struct uv_hub_info_s hub_info = {0}; @@ -1559,32 +1706,27 @@ static void __init uv_system_init_hub(void) uv_nmi_setup(); uv_cpu_init(); uv_scir_register_cpu_notifier(); - proc_mkdir("sgi_uv", NULL); + uv_setup_proc_files(0); /* Register Legacy VGA I/O redirection handler: */ pci_register_set_vga_state(uv_set_vga_state); - /* - * For a kdump kernel the reset must be BOOT_ACPI, not BOOT_EFI, as - * EFI is not enabled in the kdump kernel: - */ - if (is_kdump_kernel()) - reboot_type = BOOT_ACPI; + check_efi_reboot(); } /* - * There is a small amount of UV specific code needed to initialize a - * UV system that does not have a "UV HUB" (referred to as "hubless"). + * There is a different code path needed to initialize a UV system that does + * not have a "UV HUB" (referred to as "hubless"). */ void __init uv_system_init(void) { - if (likely(!is_uv_system() && !is_uv_hubless())) + if (likely(!is_uv_system() && !is_uv_hubless(1))) return; if (is_uv_system()) uv_system_init_hub(); else - uv_nmi_setup_hubless(); + uv_system_init_hubless(); } apic_driver(apic_x2apic_uv_x); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index d7a1e5a9331c..890f60083eca 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -30,7 +30,7 @@ obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o ifdef CONFIG_CPU_SUP_INTEL -obj-y += intel.o intel_pconfig.o +obj-y += intel.o intel_pconfig.o tsx.o obj-$(CONFIG_PM) += intel_epb.o endif obj-$(CONFIG_CPU_SUP_AMD) += amd.o diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 91c2561b905f..8bf64899f56a 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -39,6 +39,8 @@ static void __init spectre_v2_select_mitigation(void); static void __init ssb_select_mitigation(void); static void __init l1tf_select_mitigation(void); static void __init mds_select_mitigation(void); +static void __init mds_print_mitigation(void); +static void __init taa_select_mitigation(void); /* The base value of the SPEC_CTRL MSR that always has to be preserved. */ u64 x86_spec_ctrl_base; @@ -105,6 +107,13 @@ void __init check_bugs(void) ssb_select_mitigation(); l1tf_select_mitigation(); mds_select_mitigation(); + taa_select_mitigation(); + + /* + * As MDS and TAA mitigations are inter-related, print MDS + * mitigation until after TAA mitigation selection is done. + */ + mds_print_mitigation(); arch_smt_update(); @@ -243,6 +252,12 @@ static void __init mds_select_mitigation(void) (mds_nosmt || cpu_mitigations_auto_nosmt())) cpu_smt_disable(false); } +} + +static void __init mds_print_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) + return; pr_info("%s\n", mds_strings[mds_mitigation]); } @@ -269,6 +284,113 @@ static int __init mds_cmdline(char *str) early_param("mds", mds_cmdline); #undef pr_fmt +#define pr_fmt(fmt) "TAA: " fmt + +/* Default mitigation for TAA-affected CPUs */ +static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW; +static bool taa_nosmt __ro_after_init; + +static const char * const taa_strings[] = { + [TAA_MITIGATION_OFF] = "Vulnerable", + [TAA_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode", + [TAA_MITIGATION_VERW] = "Mitigation: Clear CPU buffers", + [TAA_MITIGATION_TSX_DISABLED] = "Mitigation: TSX disabled", +}; + +static void __init taa_select_mitigation(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has_bug(X86_BUG_TAA)) { + taa_mitigation = TAA_MITIGATION_OFF; + return; + } + + /* TSX previously disabled by tsx=off */ + if (!boot_cpu_has(X86_FEATURE_RTM)) { + taa_mitigation = TAA_MITIGATION_TSX_DISABLED; + goto out; + } + + if (cpu_mitigations_off()) { + taa_mitigation = TAA_MITIGATION_OFF; + return; + } + + /* + * TAA mitigation via VERW is turned off if both + * tsx_async_abort=off and mds=off are specified. + */ + if (taa_mitigation == TAA_MITIGATION_OFF && + mds_mitigation == MDS_MITIGATION_OFF) + goto out; + + if (boot_cpu_has(X86_FEATURE_MD_CLEAR)) + taa_mitigation = TAA_MITIGATION_VERW; + else + taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; + + /* + * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1. + * A microcode update fixes this behavior to clear CPU buffers. It also + * adds support for MSR_IA32_TSX_CTRL which is enumerated by the + * ARCH_CAP_TSX_CTRL_MSR bit. + * + * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode + * update is required. + */ + ia32_cap = x86_read_arch_cap_msr(); + if ( (ia32_cap & ARCH_CAP_MDS_NO) && + !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR)) + taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; + + /* + * TSX is enabled, select alternate mitigation for TAA which is + * the same as MDS. Enable MDS static branch to clear CPU buffers. + * + * For guests that can't determine whether the correct microcode is + * present on host, enable the mitigation for UCODE_NEEDED as well. + */ + static_branch_enable(&mds_user_clear); + + if (taa_nosmt || cpu_mitigations_auto_nosmt()) + cpu_smt_disable(false); + + /* + * Update MDS mitigation, if necessary, as the mds_user_clear is + * now enabled for TAA mitigation. + */ + if (mds_mitigation == MDS_MITIGATION_OFF && + boot_cpu_has_bug(X86_BUG_MDS)) { + mds_mitigation = MDS_MITIGATION_FULL; + mds_select_mitigation(); + } +out: + pr_info("%s\n", taa_strings[taa_mitigation]); +} + +static int __init tsx_async_abort_parse_cmdline(char *str) +{ + if (!boot_cpu_has_bug(X86_BUG_TAA)) + return 0; + + if (!str) + return -EINVAL; + + if (!strcmp(str, "off")) { + taa_mitigation = TAA_MITIGATION_OFF; + } else if (!strcmp(str, "full")) { + taa_mitigation = TAA_MITIGATION_VERW; + } else if (!strcmp(str, "full,nosmt")) { + taa_mitigation = TAA_MITIGATION_VERW; + taa_nosmt = true; + } + + return 0; +} +early_param("tsx_async_abort", tsx_async_abort_parse_cmdline); + +#undef pr_fmt #define pr_fmt(fmt) "Spectre V1 : " fmt enum spectre_v1_mitigation { @@ -786,13 +908,10 @@ static void update_mds_branch_idle(void) } #define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" +#define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n" void cpu_bugs_smt_update(void) { - /* Enhanced IBRS implies STIBP. No update required. */ - if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) - return; - mutex_lock(&spec_ctrl_mutex); switch (spectre_v2_user) { @@ -819,6 +938,17 @@ void cpu_bugs_smt_update(void) break; } + switch (taa_mitigation) { + case TAA_MITIGATION_VERW: + case TAA_MITIGATION_UCODE_NEEDED: + if (sched_smt_active()) + pr_warn_once(TAA_MSG_SMT); + break; + case TAA_MITIGATION_TSX_DISABLED: + case TAA_MITIGATION_OFF: + break; + } + mutex_unlock(&spec_ctrl_mutex); } @@ -1149,6 +1279,9 @@ void x86_spec_ctrl_setup_ap(void) x86_amd_ssb_disable(); } +bool itlb_multihit_kvm_mitigation; +EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation); + #undef pr_fmt #define pr_fmt(fmt) "L1TF: " fmt @@ -1304,11 +1437,24 @@ static ssize_t l1tf_show_state(char *buf) l1tf_vmx_states[l1tf_vmx_mitigation], sched_smt_active() ? "vulnerable" : "disabled"); } + +static ssize_t itlb_multihit_show_state(char *buf) +{ + if (itlb_multihit_kvm_mitigation) + return sprintf(buf, "KVM: Mitigation: Split huge pages\n"); + else + return sprintf(buf, "KVM: Vulnerable\n"); +} #else static ssize_t l1tf_show_state(char *buf) { return sprintf(buf, "%s\n", L1TF_DEFAULT_MSG); } + +static ssize_t itlb_multihit_show_state(char *buf) +{ + return sprintf(buf, "Processor vulnerable\n"); +} #endif static ssize_t mds_show_state(char *buf) @@ -1328,6 +1474,21 @@ static ssize_t mds_show_state(char *buf) sched_smt_active() ? "vulnerable" : "disabled"); } +static ssize_t tsx_async_abort_show_state(char *buf) +{ + if ((taa_mitigation == TAA_MITIGATION_TSX_DISABLED) || + (taa_mitigation == TAA_MITIGATION_OFF)) + return sprintf(buf, "%s\n", taa_strings[taa_mitigation]); + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + return sprintf(buf, "%s; SMT Host state unknown\n", + taa_strings[taa_mitigation]); + } + + return sprintf(buf, "%s; SMT %s\n", taa_strings[taa_mitigation], + sched_smt_active() ? "vulnerable" : "disabled"); +} + static char *stibp_state(void) { if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED) @@ -1398,6 +1559,12 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_MDS: return mds_show_state(buf); + case X86_BUG_TAA: + return tsx_async_abort_show_state(buf); + + case X86_BUG_ITLB_MULTIHIT: + return itlb_multihit_show_state(buf); + default: break; } @@ -1434,4 +1601,14 @@ ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *bu { return cpu_show_common(dev, attr, buf, X86_BUG_MDS); } + +ssize_t cpu_show_tsx_async_abort(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_TAA); +} + +ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT); +} #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 9ae7d1bcd4f4..3103f2bfed85 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -565,8 +565,9 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c) return NULL; /* Not found */ } -__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; -__u32 cpu_caps_set[NCAPINTS + NBUGINTS]; +/* Aligned to unsigned long to avoid split lock in atomic bitmap ops */ +__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long)); +__u32 cpu_caps_set[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long)); void load_percpu_segment(int cpu) { @@ -1016,13 +1017,14 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #endif } -#define NO_SPECULATION BIT(0) -#define NO_MELTDOWN BIT(1) -#define NO_SSB BIT(2) -#define NO_L1TF BIT(3) -#define NO_MDS BIT(4) -#define MSBDS_ONLY BIT(5) -#define NO_SWAPGS BIT(6) +#define NO_SPECULATION BIT(0) +#define NO_MELTDOWN BIT(1) +#define NO_SSB BIT(2) +#define NO_L1TF BIT(3) +#define NO_MDS BIT(4) +#define MSBDS_ONLY BIT(5) +#define NO_SWAPGS BIT(6) +#define NO_ITLB_MULTIHIT BIT(7) #define VULNWL(_vendor, _family, _model, _whitelist) \ { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist } @@ -1043,27 +1045,27 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION), /* Intel Family 6 */ - VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION), - VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION), - VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION), - VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION), - VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION), - - VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), + VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), + + VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), VULNWL_INTEL(CORE_YONAH, NO_SSB), - VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS), + VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS), - VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS), - VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS), + VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), /* * Technically, swapgs isn't serializing on AMD (despite it previously @@ -1073,15 +1075,17 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { * good enough for our purposes. */ + VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT), + /* AMD Family 0xf - 0x12 */ - VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), - VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), - VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), - VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), + VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ - VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS), - VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS), + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), {} }; @@ -1092,19 +1096,30 @@ static bool __init cpu_matches(unsigned long which) return m && !!(m->driver_data & which); } -static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) +u64 x86_read_arch_cap_msr(void) { u64 ia32_cap = 0; + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) + rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); + + return ia32_cap; +} + +static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) +{ + u64 ia32_cap = x86_read_arch_cap_msr(); + + /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */ + if (!cpu_matches(NO_ITLB_MULTIHIT) && !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) + setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT); + if (cpu_matches(NO_SPECULATION)) return; setup_force_cpu_bug(X86_BUG_SPECTRE_V1); setup_force_cpu_bug(X86_BUG_SPECTRE_V2); - if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES)) - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); - if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) && !cpu_has(c, X86_FEATURE_AMD_SSB_NO)) setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); @@ -1121,6 +1136,21 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) if (!cpu_matches(NO_SWAPGS)) setup_force_cpu_bug(X86_BUG_SWAPGS); + /* + * When the CPU is not mitigated for TAA (TAA_NO=0) set TAA bug when: + * - TSX is supported or + * - TSX_CTRL is present + * + * TSX_CTRL check is needed for cases when TSX could be disabled before + * the kernel boot e.g. kexec. + * TSX_CTRL check alone is not sufficient for cases when the microcode + * update is not present or running as guest that don't get TSX_CTRL. + */ + if (!(ia32_cap & ARCH_CAP_TAA_NO) && + (cpu_has(c, X86_FEATURE_RTM) || + (ia32_cap & ARCH_CAP_TSX_CTRL_MSR))) + setup_force_cpu_bug(X86_BUG_TAA); + if (cpu_matches(NO_MELTDOWN)) return; @@ -1554,6 +1584,8 @@ void __init identify_boot_cpu(void) #endif cpu_detect_tlb(&boot_cpu_data); setup_cr_pinning(); + + tsx_init(); } void identify_secondary_cpu(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index c0e2407abdd6..38ab6e115eac 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -44,6 +44,22 @@ struct _tlb_table { extern const struct cpu_dev *const __x86_cpu_dev_start[], *const __x86_cpu_dev_end[]; +#ifdef CONFIG_CPU_SUP_INTEL +enum tsx_ctrl_states { + TSX_CTRL_ENABLE, + TSX_CTRL_DISABLE, + TSX_CTRL_NOT_SUPPORTED, +}; + +extern __ro_after_init enum tsx_ctrl_states tsx_ctrl_state; + +extern void __init tsx_init(void); +extern void tsx_enable(void); +extern void tsx_disable(void); +#else +static inline void tsx_init(void) { } +#endif /* CONFIG_CPU_SUP_INTEL */ + extern void get_cpu_cap(struct cpuinfo_x86 *c); extern void get_cpu_address_sizes(struct cpuinfo_x86 *c); extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); @@ -62,4 +78,6 @@ unsigned int aperfmperf_get_khz(int cpu); extern void x86_spec_ctrl_setup_ap(void); +extern u64 x86_read_arch_cap_msr(void); + #endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index c2fdc00df163..4a900804a023 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -762,6 +762,11 @@ static void init_intel(struct cpuinfo_x86 *c) detect_tme(c); init_intel_misc_features(c); + + if (tsx_ctrl_state == TSX_CTRL_ENABLE) + tsx_enable(); + if (tsx_ctrl_state == TSX_CTRL_DISABLE) + tsx_disable(); } #ifdef CONFIG_X86_32 @@ -814,7 +819,7 @@ static const struct _tlb_table intel_tlb_table[] = { { 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" }, { 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" }, { 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" }, - { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages */" }, + { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages" }, { 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, { 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, { 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" }, @@ -842,7 +847,7 @@ static const struct _tlb_table intel_tlb_table[] = { { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" }, { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" }, { 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" }, - { 0xc2, TLB_DATA_2M_4M, 16, " DTLB 2 MByte/4MByte pages, 4-way associative" }, + { 0xc2, TLB_DATA_2M_4M, 16, " TLB_DATA 2 MByte/4MByte pages, 4-way associative" }, { 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" }, { 0x00, 0, 0 } }; @@ -854,8 +859,8 @@ static void intel_tlb_lookup(const unsigned char desc) return; /* look up this descriptor in the table */ - for (k = 0; intel_tlb_table[k].descriptor != desc && \ - intel_tlb_table[k].descriptor != 0; k++) + for (k = 0; intel_tlb_table[k].descriptor != desc && + intel_tlb_table[k].descriptor != 0; k++) ; if (intel_tlb_table[k].tlb_type == 0) diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 6ea7fdc82f3c..5167bd2bb6b1 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -583,7 +583,7 @@ bool amd_filter_mce(struct mce *m) * - Prevent possible spurious interrupts from the IF bank on Family 0x17 * Models 0x10-0x2F due to Erratum #1114. */ -void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) +static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) { int i, num_msrs; u64 hwcr; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 743370ee4983..5f42f25bac8f 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -488,8 +488,9 @@ int mce_usable_address(struct mce *m) if (!(m->status & MCI_STATUS_ADDRV)) return 0; - /* Checks after this one are Intel-specific: */ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + /* Checks after this one are Intel/Zhaoxin-specific: */ + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL && + boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN) return 1; if (!(m->status & MCI_STATUS_MISCV)) @@ -507,10 +508,13 @@ EXPORT_SYMBOL_GPL(mce_usable_address); bool mce_is_memory_error(struct mce *m) { - if (m->cpuvendor == X86_VENDOR_AMD || - m->cpuvendor == X86_VENDOR_HYGON) { + switch (m->cpuvendor) { + case X86_VENDOR_AMD: + case X86_VENDOR_HYGON: return amd_mce_is_memory_error(m); - } else if (m->cpuvendor == X86_VENDOR_INTEL) { + + case X86_VENDOR_INTEL: + case X86_VENDOR_ZHAOXIN: /* * Intel SDM Volume 3B - 15.9.2 Compound Error Codes * @@ -527,9 +531,10 @@ bool mce_is_memory_error(struct mce *m) return (m->status & 0xef80) == BIT(7) || (m->status & 0xef00) == BIT(8) || (m->status & 0xeffc) == 0xc; - } - return false; + default: + return false; + } } EXPORT_SYMBOL_GPL(mce_is_memory_error); @@ -1127,6 +1132,12 @@ static bool __mc_check_crashing_cpu(int cpu) u64 mcgstatus; mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + + if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) { + if (mcgstatus & MCG_STATUS_LMCES) + return false; + } + if (mcgstatus & MCG_STATUS_RIPV) { mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); return true; @@ -1277,9 +1288,10 @@ void do_machine_check(struct pt_regs *regs, long error_code) /* * Check if this MCE is signaled to only this logical processor, - * on Intel only. + * on Intel, Zhaoxin only. */ - if (m.cpuvendor == X86_VENDOR_INTEL) + if (m.cpuvendor == X86_VENDOR_INTEL || + m.cpuvendor == X86_VENDOR_ZHAOXIN) lmce = m.mcgstatus & MCG_STATUS_LMCES; /* @@ -1697,6 +1709,18 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) if (c->x86 == 6 && c->x86_model == 45) quirk_no_way_out = quirk_sandybridge_ifu; } + + if (c->x86_vendor == X86_VENDOR_ZHAOXIN) { + /* + * All newer Zhaoxin CPUs support MCE broadcasting. Enable + * synchronization with a one second timeout. + */ + if (c->x86 > 6 || (c->x86_model == 0x19 || c->x86_model == 0x1f)) { + if (cfg->monarch_timeout < 0) + cfg->monarch_timeout = USEC_PER_SEC; + } + } + if (cfg->monarch_timeout < 0) cfg->monarch_timeout = 0; if (cfg->bootlog != 0) @@ -1760,6 +1784,35 @@ static void mce_centaur_feature_init(struct cpuinfo_x86 *c) } } +static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c) +{ + struct mce_bank *mce_banks = this_cpu_ptr(mce_banks_array); + + /* + * These CPUs have MCA bank 8 which reports only one error type called + * SVAD (System View Address Decoder). The reporting of that error is + * controlled by IA32_MC8.CTL.0. + * + * If enabled, prefetching on these CPUs will cause SVAD MCE when + * virtual machines start and result in a system panic. Always disable + * bank 8 SVAD error by default. + */ + if ((c->x86 == 7 && c->x86_model == 0x1b) || + (c->x86_model == 0x19 || c->x86_model == 0x1f)) { + if (this_cpu_read(mce_num_banks) > 8) + mce_banks[8].ctl = 0; + } + + intel_init_cmci(); + intel_init_lmce(); + mce_adjust_timer = cmci_intel_adjust_timer; +} + +static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c) +{ + intel_clear_lmce(); +} + static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) { switch (c->x86_vendor) { @@ -1781,6 +1834,10 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) mce_centaur_feature_init(c); break; + case X86_VENDOR_ZHAOXIN: + mce_zhaoxin_feature_init(c); + break; + default: break; } @@ -1792,6 +1849,11 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c) case X86_VENDOR_INTEL: mce_intel_feature_clear(c); break; + + case X86_VENDOR_ZHAOXIN: + mce_zhaoxin_feature_clear(c); + break; + default: break; } @@ -2014,15 +2076,16 @@ static void mce_disable_error_reporting(void) static void vendor_disable_error_reporting(void) { /* - * Don't clear on Intel or AMD or Hygon CPUs. Some of these MSRs - * are socket-wide. - * Disabling them for just a single offlined CPU is bad, since it will - * inhibit reporting for all shared resources on the socket like the - * last level cache (LLC), the integrated memory controller (iMC), etc. + * Don't clear on Intel or AMD or Hygon or Zhaoxin CPUs. Some of these + * MSRs are socket-wide. Disabling them for just a single offlined CPU + * is bad, since it will inhibit reporting for all shared resources on + * the socket like the last level cache (LLC), the integrated memory + * controller (iMC), etc. */ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON || - boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) return; mce_disable_error_reporting(); diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index 88cd9598fa57..e270d0770134 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -85,8 +85,10 @@ static int cmci_supported(int *banks) * initialization is vendor keyed and this * makes sure none of the backdoors are entered otherwise. */ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL && + boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN) return 0; + if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6) return 0; rdmsrl(MSR_IA32_MCG_CAP, cap); @@ -423,7 +425,7 @@ void cmci_disable_bank(int bank) raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); } -static void intel_init_cmci(void) +void intel_init_cmci(void) { int banks; @@ -442,7 +444,7 @@ static void intel_init_cmci(void) cmci_recheck(); } -static void intel_init_lmce(void) +void intel_init_lmce(void) { u64 val; @@ -455,7 +457,7 @@ static void intel_init_lmce(void) wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN); } -static void intel_clear_lmce(void) +void intel_clear_lmce(void) { u64 val; @@ -482,6 +484,7 @@ static void intel_ppin_init(struct cpuinfo_x86 *c) case INTEL_FAM6_BROADWELL_D: case INTEL_FAM6_BROADWELL_X: case INTEL_FAM6_SKYLAKE_X: + case INTEL_FAM6_ICELAKE_X: case INTEL_FAM6_XEON_PHI_KNL: case INTEL_FAM6_XEON_PHI_KNM: diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 43031db429d2..842b273bce31 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -45,11 +45,17 @@ unsigned long cmci_intel_adjust_timer(unsigned long interval); bool mce_intel_cmci_poll(void); void mce_intel_hcpu_update(unsigned long cpu); void cmci_disable_bank(int bank); +void intel_init_cmci(void); +void intel_init_lmce(void); +void intel_clear_lmce(void); #else # define cmci_intel_adjust_timer mce_adjust_timer_default static inline bool mce_intel_cmci_poll(void) { return false; } static inline void mce_intel_hcpu_update(unsigned long cpu) { } static inline void cmci_disable_bank(int bank) { } +static inline void intel_init_cmci(void) { } +static inline void intel_init_lmce(void) { } +static inline void intel_clear_lmce(void) { } #endif void mce_timer_kick(unsigned long interval); diff --git a/arch/x86/kernel/cpu/mce/therm_throt.c b/arch/x86/kernel/cpu/mce/therm_throt.c index 6e2becf547c5..d01e0da0163a 100644 --- a/arch/x86/kernel/cpu/mce/therm_throt.c +++ b/arch/x86/kernel/cpu/mce/therm_throt.c @@ -40,15 +40,58 @@ #define THERMAL_THROTTLING_EVENT 0 #define POWER_LIMIT_EVENT 1 -/* - * Current thermal event state: +/** + * struct _thermal_state - Represent the current thermal event state + * @next_check: Stores the next timestamp, when it is allowed + * to log the next warning message. + * @last_interrupt_time: Stores the timestamp for the last threshold + * high event. + * @therm_work: Delayed workqueue structure + * @count: Stores the current running count for thermal + * or power threshold interrupts. + * @last_count: Stores the previous running count for thermal + * or power threshold interrupts. + * @max_time_ms: This shows the maximum amount of time CPU was + * in throttled state for a single thermal + * threshold high to low state. + * @total_time_ms: This is a cumulative time during which CPU was + * in the throttled state. + * @rate_control_active: Set when a throttling message is logged. + * This is used for the purpose of rate-control. + * @new_event: Stores the last high/low status of the + * THERM_STATUS_PROCHOT or + * THERM_STATUS_POWER_LIMIT. + * @level: Stores whether this _thermal_state instance is + * for a CORE level or for PACKAGE level. + * @sample_index: Index for storing the next sample in the buffer + * temp_samples[]. + * @sample_count: Total number of samples collected in the buffer + * temp_samples[]. + * @average: The last moving average of temperature samples + * @baseline_temp: Temperature at which thermal threshold high + * interrupt was generated. + * @temp_samples: Storage for temperature samples to calculate + * moving average. + * + * This structure is used to represent data related to thermal state for a CPU. + * There is a separate storage for core and package level for each CPU. */ struct _thermal_state { - bool new_event; - int event; u64 next_check; + u64 last_interrupt_time; + struct delayed_work therm_work; unsigned long count; unsigned long last_count; + unsigned long max_time_ms; + unsigned long total_time_ms; + bool rate_control_active; + bool new_event; + u8 level; + u8 sample_index; + u8 sample_count; + u8 average; + u8 baseline_temp; + u8 temp_samples[3]; }; struct thermal_state { @@ -121,8 +164,22 @@ define_therm_throt_device_one_ro(package_throttle_count); define_therm_throt_device_show_func(package_power_limit, count); define_therm_throt_device_one_ro(package_power_limit_count); +define_therm_throt_device_show_func(core_throttle, max_time_ms); +define_therm_throt_device_one_ro(core_throttle_max_time_ms); + +define_therm_throt_device_show_func(package_throttle, max_time_ms); +define_therm_throt_device_one_ro(package_throttle_max_time_ms); + +define_therm_throt_device_show_func(core_throttle, total_time_ms); +define_therm_throt_device_one_ro(core_throttle_total_time_ms); + +define_therm_throt_device_show_func(package_throttle, total_time_ms); +define_therm_throt_device_one_ro(package_throttle_total_time_ms); + static struct attribute *thermal_throttle_attrs[] = { &dev_attr_core_throttle_count.attr, + &dev_attr_core_throttle_max_time_ms.attr, + &dev_attr_core_throttle_total_time_ms.attr, NULL }; @@ -135,6 +192,105 @@ static const struct attribute_group thermal_attr_group = { #define CORE_LEVEL 0 #define PACKAGE_LEVEL 1 +#define THERM_THROT_POLL_INTERVAL HZ +#define THERM_STATUS_PROCHOT_LOG BIT(1) + +static void clear_therm_status_log(int level) +{ + int msr; + u64 msr_val; + + if (level == CORE_LEVEL) + msr = MSR_IA32_THERM_STATUS; + else + msr = MSR_IA32_PACKAGE_THERM_STATUS; + + rdmsrl(msr, msr_val); + wrmsrl(msr, msr_val & ~THERM_STATUS_PROCHOT_LOG); +} + +static void get_therm_status(int level, bool *proc_hot, u8 *temp) +{ + int msr; + u64 msr_val; + + if (level == CORE_LEVEL) + msr = MSR_IA32_THERM_STATUS; + else + msr = MSR_IA32_PACKAGE_THERM_STATUS; + + rdmsrl(msr, msr_val); + if (msr_val & THERM_STATUS_PROCHOT_LOG) + *proc_hot = true; + else + *proc_hot = false; + + *temp = (msr_val >> 16) & 0x7F; +} + +static void throttle_active_work(struct work_struct *work) +{ + struct _thermal_state *state = container_of(to_delayed_work(work), + struct _thermal_state, therm_work); + unsigned int i, avg, this_cpu = smp_processor_id(); + u64 now = get_jiffies_64(); + bool hot; + u8 temp; + + get_therm_status(state->level, &hot, &temp); + /* temperature value is offset from the max so lesser means hotter */ + if (!hot && temp > state->baseline_temp) { + if (state->rate_control_active) + pr_info("CPU%d: %s temperature/speed normal (total events = %lu)\n", + this_cpu, + state->level == CORE_LEVEL ? "Core" : "Package", + state->count); + + state->rate_control_active = false; + return; + } + + if (time_before64(now, state->next_check) && + state->rate_control_active) + goto re_arm; + + state->next_check = now + CHECK_INTERVAL; + + if (state->count != state->last_count) { + /* There was one new thermal interrupt */ + state->last_count = state->count; + state->average = 0; + state->sample_count = 0; + state->sample_index = 0; + } + + state->temp_samples[state->sample_index] = temp; + state->sample_count++; + state->sample_index = (state->sample_index + 1) % ARRAY_SIZE(state->temp_samples); + if (state->sample_count < ARRAY_SIZE(state->temp_samples)) + goto re_arm; + + avg = 0; + for (i = 0; i < ARRAY_SIZE(state->temp_samples); ++i) + avg += state->temp_samples[i]; + + avg /= ARRAY_SIZE(state->temp_samples); + + if (state->average > avg) { + pr_warn("CPU%d: %s temperature is above threshold, cpu clock is throttled (total events = %lu)\n", + this_cpu, + state->level == CORE_LEVEL ? "Core" : "Package", + state->count); + state->rate_control_active = true; + } + + state->average = avg; + +re_arm: + clear_therm_status_log(state->level); + schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL); +} + /*** * therm_throt_process - Process thermal throttling event from interrupt * @curr: Whether the condition is current or not (boolean), since the @@ -178,27 +334,33 @@ static void therm_throt_process(bool new_event, int event, int level) if (new_event) state->count++; - if (time_before64(now, state->next_check) && - state->count != state->last_count) + if (event != THERMAL_THROTTLING_EVENT) return; - state->next_check = now + CHECK_INTERVAL; - state->last_count = state->count; + if (new_event && !state->last_interrupt_time) { + bool hot; + u8 temp; + + get_therm_status(state->level, &hot, &temp); + /* + * Ignore short temperature spike as the system is not close + * to PROCHOT. 10C offset is large enough to ignore. It is + * already dropped from the high threshold temperature. + */ + if (temp > 10) + return; - /* if we just entered the thermal event */ - if (new_event) { - if (event == THERMAL_THROTTLING_EVENT) - pr_crit("CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n", - this_cpu, - level == CORE_LEVEL ? "Core" : "Package", - state->count); - return; - } - if (old_event) { - if (event == THERMAL_THROTTLING_EVENT) - pr_info("CPU%d: %s temperature/speed normal\n", this_cpu, - level == CORE_LEVEL ? "Core" : "Package"); - return; + state->baseline_temp = temp; + state->last_interrupt_time = now; + schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL); + } else if (old_event && state->last_interrupt_time) { + unsigned long throttle_time; + + throttle_time = jiffies_delta_to_msecs(now - state->last_interrupt_time); + if (throttle_time > state->max_time_ms) + state->max_time_ms = throttle_time; + state->total_time_ms += throttle_time; + state->last_interrupt_time = 0; } } @@ -244,20 +406,47 @@ static int thermal_throttle_add_dev(struct device *dev, unsigned int cpu) if (err) return err; - if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) + if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) { err = sysfs_add_file_to_group(&dev->kobj, &dev_attr_core_power_limit_count.attr, thermal_attr_group.name); + if (err) + goto del_group; + } + if (cpu_has(c, X86_FEATURE_PTS)) { err = sysfs_add_file_to_group(&dev->kobj, &dev_attr_package_throttle_count.attr, thermal_attr_group.name); - if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) + if (err) + goto del_group; + + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_package_throttle_max_time_ms.attr, + thermal_attr_group.name); + if (err) + goto del_group; + + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_package_throttle_total_time_ms.attr, + thermal_attr_group.name); + if (err) + goto del_group; + + if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) { err = sysfs_add_file_to_group(&dev->kobj, &dev_attr_package_power_limit_count.attr, thermal_attr_group.name); + if (err) + goto del_group; + } } + return 0; + +del_group: + sysfs_remove_group(&dev->kobj, &thermal_attr_group); + return err; } @@ -269,15 +458,29 @@ static void thermal_throttle_remove_dev(struct device *dev) /* Get notified when a cpu comes on/off. Be hotplug friendly. */ static int thermal_throttle_online(unsigned int cpu) { + struct thermal_state *state = &per_cpu(thermal_state, cpu); struct device *dev = get_cpu_device(cpu); + state->package_throttle.level = PACKAGE_LEVEL; + state->core_throttle.level = CORE_LEVEL; + + INIT_DELAYED_WORK(&state->package_throttle.therm_work, throttle_active_work); + INIT_DELAYED_WORK(&state->core_throttle.therm_work, throttle_active_work); + return thermal_throttle_add_dev(dev, cpu); } static int thermal_throttle_offline(unsigned int cpu) { + struct thermal_state *state = &per_cpu(thermal_state, cpu); struct device *dev = get_cpu_device(cpu); + cancel_delayed_work(&state->package_throttle.therm_work); + cancel_delayed_work(&state->core_throttle.therm_work); + + state->package_throttle.rate_control_active = false; + state->core_throttle.rate_control_active = false; + thermal_throttle_remove_dev(dev); return 0; } diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index a0e52bd00ecc..3f6b137ef4e6 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -567,7 +567,7 @@ int __init save_microcode_in_initrd_amd(unsigned int cpuid_1_eax) void reload_ucode_amd(void) { struct microcode_amd *mc; - u32 rev, dummy; + u32 rev, dummy __always_unused; mc = (struct microcode_amd *)amd_ucode_patch; @@ -673,7 +673,7 @@ static enum ucode_state apply_microcode_amd(int cpu) struct ucode_cpu_info *uci; struct ucode_patch *p; enum ucode_state ret; - u32 rev, dummy; + u32 rev, dummy __always_unused; BUG_ON(raw_smp_processor_id() != cpu); diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index cb0fdcaf1415..7019d4b2df0c 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -63,11 +63,6 @@ LIST_HEAD(microcode_cache); */ static DEFINE_MUTEX(microcode_mutex); -/* - * Serialize late loading so that CPUs get updated one-by-one. - */ -static DEFINE_RAW_SPINLOCK(update_lock); - struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; struct cpu_info_ctx { @@ -566,11 +561,18 @@ static int __reload_late(void *info) if (__wait_for_cpus(&late_cpus_in, NSEC_PER_SEC)) return -1; - raw_spin_lock(&update_lock); - apply_microcode_local(&err); - raw_spin_unlock(&update_lock); + /* + * On an SMT system, it suffices to load the microcode on one sibling of + * the core because the microcode engine is shared between the threads. + * Synchronization still needs to take place so that no concurrent + * loading attempts happen on multiple threads of an SMT core. See + * below. + */ + if (cpumask_first(topology_sibling_cpumask(cpu)) == cpu) + apply_microcode_local(&err); + else + goto wait_for_siblings; - /* siblings return UCODE_OK because their engine got updated already */ if (err > UCODE_NFOUND) { pr_warn("Error reloading microcode on CPU %d\n", cpu); ret = -1; @@ -578,14 +580,18 @@ static int __reload_late(void *info) ret = 1; } +wait_for_siblings: + if (__wait_for_cpus(&late_cpus_out, NSEC_PER_SEC)) + panic("Timeout during microcode update!\n"); + /* - * Increase the wait timeout to a safe value here since we're - * serializing the microcode update and that could take a while on a - * large number of CPUs. And that is fine as the *actual* timeout will - * be determined by the last CPU finished updating and thus cut short. + * At least one thread has completed update on each core. + * For others, simply call the update to make sure the + * per-cpu cpuinfo can be updated with right microcode + * revision. */ - if (__wait_for_cpus(&late_cpus_out, NSEC_PER_SEC * num_online_cpus())) - panic("Timeout during microcode update!\n"); + if (cpumask_first(topology_sibling_cpumask(cpu)) != cpu) + apply_microcode_local(&err); return ret; } diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index ce799cfe9434..6a99535d7f37 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -791,6 +791,7 @@ static enum ucode_state apply_microcode_intel(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; struct cpuinfo_x86 *c = &cpu_data(cpu); + bool bsp = c->cpu_index == boot_cpu_data.cpu_index; struct microcode_intel *mc; enum ucode_state ret; static int prev_rev; @@ -836,7 +837,7 @@ static enum ucode_state apply_microcode_intel(int cpu) return UCODE_ERROR; } - if (rev != prev_rev) { + if (bsp && rev != prev_rev) { pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n", rev, mc->hdr.date & 0xffff, @@ -852,7 +853,7 @@ out: c->microcode = rev; /* Update boot_cpu_data's revision too, if we're on the BSP: */ - if (c->cpu_index == boot_cpu_data.cpu_index) + if (bsp) boot_cpu_data.microcode = rev; return ret; diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 267daad8c036..caa032ce3fe3 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -216,6 +216,10 @@ static void __init ms_hyperv_init_platform(void) int hv_host_info_ecx; int hv_host_info_edx; +#ifdef CONFIG_PARAVIRT + pv_info.name = "Hyper-V"; +#endif + /* * Extract the features and hints */ @@ -286,7 +290,12 @@ static void __init ms_hyperv_init_platform(void) machine_ops.shutdown = hv_machine_shutdown; machine_ops.crash_shutdown = hv_machine_crash_shutdown; #endif - mark_tsc_unstable("running on Hyper-V"); + if (ms_hyperv.features & HV_X64_ACCESS_TSC_INVARIANT) { + wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, 0x1); + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); + } else { + mark_tsc_unstable("running on Hyper-V"); + } /* * Generation 2 instances don't support reading the NMI status from diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index 5c900f9527ff..c4be62058dd9 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -29,7 +29,8 @@ __setup("nordrand", x86_rdrand_setup); #ifdef CONFIG_ARCH_RANDOM void x86_init_rdrand(struct cpuinfo_x86 *c) { - unsigned long tmp; + unsigned int changed = 0; + unsigned long tmp, prev; int i; if (!cpu_has(c, X86_FEATURE_RDRAND)) @@ -42,5 +43,24 @@ void x86_init_rdrand(struct cpuinfo_x86 *c) return; } } + + /* + * Stupid sanity-check whether RDRAND does *actually* generate + * some at least random-looking data. + */ + prev = tmp; + for (i = 0; i < SANITY_CHECK_LOOPS; i++) { + if (rdrand_long(&tmp)) { + if (prev != tmp) + changed++; + + prev = tmp; + } + } + + if (WARN_ON_ONCE(!changed)) + pr_emerg( +"RDRAND gives funky smelling output, might consider not using it by booting with \"nordrand\""); + } #endif diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index efbd54cc4e69..055c8613b531 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -522,6 +522,10 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) int ret = 0; rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + ret = -ENOENT; + goto out; + } md.priv = of->kn->priv; resid = md.u.rid; diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index a46dee8e78db..2e3b06d6bbc6 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -461,10 +461,8 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, } rdtgrp = rdtgroup_kn_lock_live(of->kn); - rdt_last_cmd_clear(); if (!rdtgrp) { ret = -ENOENT; - rdt_last_cmd_puts("Directory was removed\n"); goto unlock; } @@ -2648,10 +2646,8 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, int ret; prdtgrp = rdtgroup_kn_lock_live(prgrp_kn); - rdt_last_cmd_clear(); if (!prdtgrp) { ret = -ENODEV; - rdt_last_cmd_puts("Directory was removed\n"); goto out_unlock; } diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c new file mode 100644 index 000000000000..3e20d322bc98 --- /dev/null +++ b/arch/x86/kernel/cpu/tsx.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Transactional Synchronization Extensions (TSX) control. + * + * Copyright (C) 2019 Intel Corporation + * + * Author: + * Pawan Gupta <pawan.kumar.gupta@linux.intel.com> + */ + +#include <linux/cpufeature.h> + +#include <asm/cmdline.h> + +#include "cpu.h" + +enum tsx_ctrl_states tsx_ctrl_state __ro_after_init = TSX_CTRL_NOT_SUPPORTED; + +void tsx_disable(void) +{ + u64 tsx; + + rdmsrl(MSR_IA32_TSX_CTRL, tsx); + + /* Force all transactions to immediately abort */ + tsx |= TSX_CTRL_RTM_DISABLE; + + /* + * Ensure TSX support is not enumerated in CPUID. + * This is visible to userspace and will ensure they + * do not waste resources trying TSX transactions that + * will always abort. + */ + tsx |= TSX_CTRL_CPUID_CLEAR; + + wrmsrl(MSR_IA32_TSX_CTRL, tsx); +} + +void tsx_enable(void) +{ + u64 tsx; + + rdmsrl(MSR_IA32_TSX_CTRL, tsx); + + /* Enable the RTM feature in the cpu */ + tsx &= ~TSX_CTRL_RTM_DISABLE; + + /* + * Ensure TSX support is enumerated in CPUID. + * This is visible to userspace and will ensure they + * can enumerate and use the TSX feature. + */ + tsx &= ~TSX_CTRL_CPUID_CLEAR; + + wrmsrl(MSR_IA32_TSX_CTRL, tsx); +} + +static bool __init tsx_ctrl_is_supported(void) +{ + u64 ia32_cap = x86_read_arch_cap_msr(); + + /* + * TSX is controlled via MSR_IA32_TSX_CTRL. However, support for this + * MSR is enumerated by ARCH_CAP_TSX_MSR bit in MSR_IA32_ARCH_CAPABILITIES. + * + * TSX control (aka MSR_IA32_TSX_CTRL) is only available after a + * microcode update on CPUs that have their MSR_IA32_ARCH_CAPABILITIES + * bit MDS_NO=1. CPUs with MDS_NO=0 are not planned to get + * MSR_IA32_TSX_CTRL support even after a microcode update. Thus, + * tsx= cmdline requests will do nothing on CPUs without + * MSR_IA32_TSX_CTRL support. + */ + return !!(ia32_cap & ARCH_CAP_TSX_CTRL_MSR); +} + +static enum tsx_ctrl_states x86_get_tsx_auto_mode(void) +{ + if (boot_cpu_has_bug(X86_BUG_TAA)) + return TSX_CTRL_DISABLE; + + return TSX_CTRL_ENABLE; +} + +void __init tsx_init(void) +{ + char arg[5] = {}; + int ret; + + if (!tsx_ctrl_is_supported()) + return; + + ret = cmdline_find_option(boot_command_line, "tsx", arg, sizeof(arg)); + if (ret >= 0) { + if (!strcmp(arg, "on")) { + tsx_ctrl_state = TSX_CTRL_ENABLE; + } else if (!strcmp(arg, "off")) { + tsx_ctrl_state = TSX_CTRL_DISABLE; + } else if (!strcmp(arg, "auto")) { + tsx_ctrl_state = x86_get_tsx_auto_mode(); + } else { + tsx_ctrl_state = TSX_CTRL_DISABLE; + pr_err("tsx: invalid option, defaulting to off\n"); + } + } else { + /* tsx= not provided */ + if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_AUTO)) + tsx_ctrl_state = x86_get_tsx_auto_mode(); + else if (IS_ENABLED(CONFIG_X86_INTEL_TSX_MODE_OFF)) + tsx_ctrl_state = TSX_CTRL_DISABLE; + else + tsx_ctrl_state = TSX_CTRL_ENABLE; + } + + if (tsx_ctrl_state == TSX_CTRL_DISABLE) { + tsx_disable(); + + /* + * tsx_disable() will change the state of the + * RTM CPUID bit. Clear it here since it is now + * expected to be not set. + */ + setup_clear_cpu_cap(X86_FEATURE_RTM); + } else if (tsx_ctrl_state == TSX_CTRL_ENABLE) { + + /* + * HW defaults TSX to be enabled at bootup. + * We may still need the TSX enable support + * during init for special cases like + * kexec after TSX is disabled. + */ + tsx_enable(); + + /* + * tsx_enable() will change the state of the + * RTM CPUID bit. Force it here since it is now + * expected to be set. + */ + setup_force_cpu_cap(X86_FEATURE_RTM); + } +} diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 9735139cfdf8..46d732696c1c 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -49,7 +49,7 @@ #define VMWARE_CMD_VCPU_RESERVED 31 #define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \ - __asm__("inl (%%dx)" : \ + __asm__("inl (%%dx), %%eax" : \ "=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \ "a"(VMWARE_HYPERVISOR_MAGIC), \ "c"(VMWARE_CMD_##cmd), \ diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index eb651fbde92a..00fc55ac7ffa 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -24,6 +24,7 @@ #include <linux/export.h> #include <linux/slab.h> #include <linux/vmalloc.h> +#include <linux/memblock.h> #include <asm/processor.h> #include <asm/hardirq.h> @@ -39,6 +40,7 @@ #include <asm/virtext.h> #include <asm/intel_pt.h> #include <asm/crash.h> +#include <asm/cmdline.h> /* Used while preparing memory map entries for second kernel */ struct crash_memmap_data { @@ -68,6 +70,19 @@ static inline void cpu_crash_vmclear_loaded_vmcss(void) rcu_read_unlock(); } +/* + * When the crashkernel option is specified, only use the low + * 1M for the real mode trampoline. + */ +void __init crash_reserve_low_1M(void) +{ + if (cmdline_find_option(boot_command_line, "crashkernel", NULL, 0) < 0) + return; + + memblock_reserve(0, 1<<20); + pr_info("Reserving the low 1M of memory for crashkernel\n"); +} + #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) static void kdump_nmi_callback(int cpu, struct pt_regs *regs) @@ -173,8 +188,6 @@ void native_machine_crash_shutdown(struct pt_regs *regs) #ifdef CONFIG_KEXEC_FILE -static unsigned long crash_zero_bytes; - static int get_nr_ram_ranges_callback(struct resource *res, void *arg) { unsigned int *nr_ranges = arg; @@ -189,8 +202,7 @@ static struct crash_mem *fill_up_crash_elf_data(void) unsigned int nr_ranges = 0; struct crash_mem *cmem; - walk_system_ram_res(0, -1, &nr_ranges, - get_nr_ram_ranges_callback); + walk_system_ram_res(0, -1, &nr_ranges, get_nr_ram_ranges_callback); if (!nr_ranges) return NULL; @@ -217,15 +229,19 @@ static int elf_header_exclude_ranges(struct crash_mem *cmem) { int ret = 0; + /* Exclude the low 1M because it is always reserved */ + ret = crash_exclude_mem_range(cmem, 0, 1<<20); + if (ret) + return ret; + /* Exclude crashkernel region */ ret = crash_exclude_mem_range(cmem, crashk_res.start, crashk_res.end); if (ret) return ret; - if (crashk_low_res.end) { + if (crashk_low_res.end) ret = crash_exclude_mem_range(cmem, crashk_low_res.start, - crashk_low_res.end); - } + crashk_low_res.end); return ret; } @@ -246,16 +262,13 @@ static int prepare_elf_headers(struct kimage *image, void **addr, unsigned long *sz) { struct crash_mem *cmem; - Elf64_Ehdr *ehdr; - Elf64_Phdr *phdr; - int ret, i; + int ret; cmem = fill_up_crash_elf_data(); if (!cmem) return -ENOMEM; - ret = walk_system_ram_res(0, -1, cmem, - prepare_elf64_ram_headers_callback); + ret = walk_system_ram_res(0, -1, cmem, prepare_elf64_ram_headers_callback); if (ret) goto out; @@ -265,24 +278,8 @@ static int prepare_elf_headers(struct kimage *image, void **addr, goto out; /* By default prepare 64bit headers */ - ret = crash_prepare_elf64_headers(cmem, - IS_ENABLED(CONFIG_X86_64), addr, sz); - if (ret) - goto out; + ret = crash_prepare_elf64_headers(cmem, IS_ENABLED(CONFIG_X86_64), addr, sz); - /* - * If a range matches backup region, adjust offset to backup - * segment. - */ - ehdr = (Elf64_Ehdr *)*addr; - phdr = (Elf64_Phdr *)(ehdr + 1); - for (i = 0; i < ehdr->e_phnum; phdr++, i++) - if (phdr->p_type == PT_LOAD && - phdr->p_paddr == image->arch.backup_src_start && - phdr->p_memsz == image->arch.backup_src_sz) { - phdr->p_offset = image->arch.backup_load_addr; - break; - } out: vfree(cmem); return ret; @@ -296,8 +293,7 @@ static int add_e820_entry(struct boot_params *params, struct e820_entry *entry) if (nr_e820_entries >= E820_MAX_ENTRIES_ZEROPAGE) return 1; - memcpy(¶ms->e820_table[nr_e820_entries], entry, - sizeof(struct e820_entry)); + memcpy(¶ms->e820_table[nr_e820_entries], entry, sizeof(struct e820_entry)); params->e820_entries++; return 0; } @@ -321,19 +317,11 @@ static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem, unsigned long long mend) { unsigned long start, end; - int ret = 0; cmem->ranges[0].start = mstart; cmem->ranges[0].end = mend; cmem->nr_ranges = 1; - /* Exclude Backup region */ - start = image->arch.backup_load_addr; - end = start + image->arch.backup_src_sz - 1; - ret = crash_exclude_mem_range(cmem, start, end); - if (ret) - return ret; - /* Exclude elf header region */ start = image->arch.elf_load_addr; end = start + image->arch.elf_headers_sz - 1; @@ -356,28 +344,28 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) memset(&cmd, 0, sizeof(struct crash_memmap_data)); cmd.params = params; - /* Add first 640K segment */ - ei.addr = image->arch.backup_src_start; - ei.size = image->arch.backup_src_sz; - ei.type = E820_TYPE_RAM; - add_e820_entry(params, &ei); + /* Add the low 1M */ + cmd.type = E820_TYPE_RAM; + flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + walk_iomem_res_desc(IORES_DESC_NONE, flags, 0, (1<<20)-1, &cmd, + memmap_entry_callback); /* Add ACPI tables */ cmd.type = E820_TYPE_ACPI; flags = IORESOURCE_MEM | IORESOURCE_BUSY; walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, &cmd, - memmap_entry_callback); + memmap_entry_callback); /* Add ACPI Non-volatile Storage */ cmd.type = E820_TYPE_NVS; walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &cmd, - memmap_entry_callback); + memmap_entry_callback); /* Add e820 reserved ranges */ cmd.type = E820_TYPE_RESERVED; flags = IORESOURCE_MEM; walk_iomem_res_desc(IORES_DESC_RESERVED, flags, 0, -1, &cmd, - memmap_entry_callback); + memmap_entry_callback); /* Add crashk_low_res region */ if (crashk_low_res.end) { @@ -388,8 +376,7 @@ int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) } /* Exclude some ranges from crashk_res and add rest to memmap */ - ret = memmap_exclude_ranges(image, cmem, crashk_res.start, - crashk_res.end); + ret = memmap_exclude_ranges(image, cmem, crashk_res.start, crashk_res.end); if (ret) goto out; @@ -409,55 +396,12 @@ out: return ret; } -static int determine_backup_region(struct resource *res, void *arg) -{ - struct kimage *image = arg; - - image->arch.backup_src_start = res->start; - image->arch.backup_src_sz = resource_size(res); - - /* Expecting only one range for backup region */ - return 1; -} - int crash_load_segments(struct kimage *image) { int ret; struct kexec_buf kbuf = { .image = image, .buf_min = 0, .buf_max = ULONG_MAX, .top_down = false }; - /* - * Determine and load a segment for backup area. First 640K RAM - * region is backup source - */ - - ret = walk_system_ram_res(KEXEC_BACKUP_SRC_START, KEXEC_BACKUP_SRC_END, - image, determine_backup_region); - - /* Zero or postive return values are ok */ - if (ret < 0) - return ret; - - /* Add backup segment. */ - if (image->arch.backup_src_sz) { - kbuf.buffer = &crash_zero_bytes; - kbuf.bufsz = sizeof(crash_zero_bytes); - kbuf.memsz = image->arch.backup_src_sz; - kbuf.buf_align = PAGE_SIZE; - /* - * Ideally there is no source for backup segment. This is - * copied in purgatory after crash. Just add a zero filled - * segment for now to make sure checksum logic works fine. - */ - ret = kexec_add_buffer(&kbuf); - if (ret) - return ret; - image->arch.backup_load_addr = kbuf.mem; - pr_debug("Loaded backup region at 0x%lx backup_start=0x%lx memsz=0x%lx\n", - image->arch.backup_load_addr, - image->arch.backup_src_start, kbuf.memsz); - } - /* Prepare elf headers and add a segment */ ret = prepare_elf_headers(image, &kbuf.buffer, &kbuf.bufsz); if (ret) diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c index 0b8cedb20d6d..d5c9b13bafdf 100644 --- a/arch/x86/kernel/doublefault.c +++ b/arch/x86/kernel/doublefault.c @@ -65,6 +65,9 @@ struct x86_hw_tss doublefault_tss __cacheline_aligned = { .ss = __KERNEL_DS, .ds = __USER_DS, .fs = __KERNEL_PERCPU, +#ifndef CONFIG_X86_32_LAZY_GS + .gs = __KERNEL_STACK_CANARY, +#endif .__cr3 = __pa_nodebug(swapper_pg_dir), }; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 753b8cfe8b8a..87b97897a881 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -94,6 +94,13 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info) BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); begin = (unsigned long)__this_cpu_read(cea_exception_stacks); + /* + * Handle the case where stack trace is collected _before_ + * cea_exception_stacks had been initialized. + */ + if (!begin) + return false; + end = begin + sizeof(struct cea_exception_stacks); /* Bail if @stack is outside the exception stack area. */ if (stk < begin || stk >= end) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 7da2bcd2b8eb..0bfe9a685b3b 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -999,6 +999,17 @@ void __init e820__reserve_setup_data(void) data = early_memremap(pa_data, sizeof(*data)); e820__range_update(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); e820__range_update_kexec(pa_data, sizeof(*data)+data->len, E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + + if (data->type == SETUP_INDIRECT && + ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) { + e820__range_update(((struct setup_indirect *)data->data)->addr, + ((struct setup_indirect *)data->data)->len, + E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + e820__range_update_kexec(((struct setup_indirect *)data->data)->addr, + ((struct setup_indirect *)data->data)->len, + E820_TYPE_RAM, E820_TYPE_RESERVED_KERN); + } + pa_data = data->next; early_memunmap(data, sizeof(*data)); } diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 6f6b1d04dadf..4cba91ec8049 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -710,6 +710,8 @@ static struct chipset early_qrk[] __initdata = { */ { PCI_VENDOR_ID_INTEL, 0x0f00, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, + { PCI_VENDOR_ID_INTEL, 0x3ec4, + PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet}, { PCI_VENDOR_ID_BROADCOM, 0x4331, PCI_CLASS_NETWORK_OTHER, PCI_ANY_ID, 0, apple_airport_reset}, {} diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index e5cb67d67c03..319be936c348 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -60,7 +60,7 @@ u64 xfeatures_mask __read_mostly; static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; -static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8]; +static unsigned int xstate_comp_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; /* * The XSAVE area of kernel can be in standard or compacted format; @@ -254,10 +254,13 @@ static void __init setup_xstate_features(void) * in the fixed offsets in the xsave area in either compacted form * or standard form. */ - xstate_offsets[0] = 0; - xstate_sizes[0] = offsetof(struct fxregs_state, xmm_space); - xstate_offsets[1] = xstate_sizes[0]; - xstate_sizes[1] = FIELD_SIZEOF(struct fxregs_state, xmm_space); + xstate_offsets[XFEATURE_FP] = 0; + xstate_sizes[XFEATURE_FP] = offsetof(struct fxregs_state, + xmm_space); + + xstate_offsets[XFEATURE_SSE] = xstate_sizes[XFEATURE_FP]; + xstate_sizes[XFEATURE_SSE] = FIELD_SIZEOF(struct fxregs_state, + xmm_space); for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { if (!xfeature_enabled(i)) @@ -342,7 +345,7 @@ static int xfeature_is_aligned(int xfeature_nr) */ static void __init setup_xstate_comp(void) { - unsigned int xstate_comp_sizes[sizeof(xfeatures_mask)*8]; + unsigned int xstate_comp_sizes[XFEATURE_MAX]; int i; /* @@ -350,8 +353,9 @@ static void __init setup_xstate_comp(void) * in the fixed offsets in the xsave area in either compacted form * or standard form. */ - xstate_comp_offsets[0] = 0; - xstate_comp_offsets[1] = offsetof(struct fxregs_state, xmm_space); + xstate_comp_offsets[XFEATURE_FP] = 0; + xstate_comp_offsets[XFEATURE_SSE] = offsetof(struct fxregs_state, + xmm_space); if (!boot_cpu_has(X86_FEATURE_XSAVES)) { for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { @@ -840,7 +844,7 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) /* * We should not ever be requesting features that we - * have not enabled. Remember that pcntxt_mask is + * have not enabled. Remember that xfeatures_mask is * what we write to the XCR0 register. */ WARN_ONCE(!(xfeatures_mask & BIT_ULL(xfeature_nr)), diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 29ffa495bd1c..206a4b6144c2 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -222,13 +222,31 @@ unsigned long __head __startup_64(unsigned long physaddr, * we might write invalid pmds, when the kernel is relocated * cleanup_highmap() fixes this up along with the mappings * beyond _end. + * + * Only the region occupied by the kernel image has so far + * been checked against the table of usable memory regions + * provided by the firmware, so invalidate pages outside that + * region. A page table entry that maps to a reserved area of + * memory would allow processor speculation into that area, + * and on some hardware (particularly the UV platform) even + * speculative access to some reserved areas is caught as an + * error, causing the BIOS to halt the system. */ pmd = fixup_pointer(level2_kernel_pgt, physaddr); - for (i = 0; i < PTRS_PER_PMD; i++) { + + /* invalidate pages before the kernel image */ + for (i = 0; i < pmd_index((unsigned long)_text); i++) + pmd[i] &= ~_PAGE_PRESENT; + + /* fixup pages that are part of the kernel image */ + for (; i <= pmd_index((unsigned long)_end); i++) if (pmd[i] & _PAGE_PRESENT) pmd[i] += load_delta; - } + + /* invalidate pages after the kernel image */ + for (; i < PTRS_PER_PMD; i++) + pmd[i] &= ~_PAGE_PRESENT; /* * Fixup phys_base - remove the memory encryption mask to obtain diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 3fe7d2008b7a..3923ab4630d7 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -569,6 +569,16 @@ SYM_DATA_START(initial_page_table) # error "Kernel PMDs should be 1, 2 or 3" # endif .align PAGE_SIZE /* needs to be page-sized too */ + +#ifdef CONFIG_PAGE_TABLE_ISOLATION + /* + * PTI needs another page so sync_initial_pagetable() works correctly + * and does not scribble over the data which is placed behind the + * actual initial_page_table. See clone_pgd_range(). + */ + .fill 1024, 4, 0 +#endif + SYM_DATA_END(initial_page_table) #endif diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index 3ad34f01de2a..6eb8b50ea07e 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -11,6 +11,7 @@ #include <linux/acpi_pmtmr.h> #include <linux/kernel.h> #include <linux/reboot.h> +#include <linux/serial_8250.h> #include <asm/apic.h> #include <asm/cpu.h> #include <asm/hypervisor.h> @@ -21,9 +22,24 @@ #include <asm/setup.h> #include <asm/jailhouse_para.h> -static __initdata struct jailhouse_setup_data setup_data; +static struct jailhouse_setup_data setup_data; +#define SETUP_DATA_V1_LEN (sizeof(setup_data.hdr) + sizeof(setup_data.v1)) +#define SETUP_DATA_V2_LEN (SETUP_DATA_V1_LEN + sizeof(setup_data.v2)) + static unsigned int precalibrated_tsc_khz; +static void jailhouse_setup_irq(unsigned int irq) +{ + struct mpc_intsrc mp_irq = { + .type = MP_INTSRC, + .irqtype = mp_INT, + .irqflag = MP_IRQPOL_ACTIVE_HIGH | MP_IRQTRIG_EDGE, + .srcbusirq = irq, + .dstirq = irq, + }; + mp_save_irq(&mp_irq); +} + static uint32_t jailhouse_cpuid_base(void) { if (boot_cpu_data.cpuid_level < 0 || @@ -45,7 +61,7 @@ static void jailhouse_get_wallclock(struct timespec64 *now) static void __init jailhouse_timer_init(void) { - lapic_timer_period = setup_data.apic_khz * (1000 / HZ); + lapic_timer_period = setup_data.v1.apic_khz * (1000 / HZ); } static unsigned long jailhouse_get_tsc(void) @@ -77,33 +93,28 @@ static void __init jailhouse_get_smp_config(unsigned int early) .type = IOAPIC_DOMAIN_STRICT, .ops = &mp_ioapic_irqdomain_ops, }; - struct mpc_intsrc mp_irq = { - .type = MP_INTSRC, - .irqtype = mp_INT, - .irqflag = MP_IRQPOL_ACTIVE_HIGH | MP_IRQTRIG_EDGE, - }; unsigned int cpu; jailhouse_x2apic_init(); register_lapic_address(0xfee00000); - for (cpu = 0; cpu < setup_data.num_cpus; cpu++) { - generic_processor_info(setup_data.cpu_ids[cpu], + for (cpu = 0; cpu < setup_data.v1.num_cpus; cpu++) { + generic_processor_info(setup_data.v1.cpu_ids[cpu], boot_cpu_apic_version); } smp_found_config = 1; - if (setup_data.standard_ioapic) { + if (setup_data.v1.standard_ioapic) { mp_register_ioapic(0, 0xfec00000, gsi_top, &ioapic_cfg); - /* Register 1:1 mapping for legacy UART IRQs 3 and 4 */ - mp_irq.srcbusirq = mp_irq.dstirq = 3; - mp_save_irq(&mp_irq); - - mp_irq.srcbusirq = mp_irq.dstirq = 4; - mp_save_irq(&mp_irq); + if (IS_ENABLED(CONFIG_SERIAL_8250) && + setup_data.hdr.version < 2) { + /* Register 1:1 mapping for legacy UART IRQs 3 and 4 */ + jailhouse_setup_irq(3); + jailhouse_setup_irq(4); + } } } @@ -126,9 +137,9 @@ static int __init jailhouse_pci_arch_init(void) pcibios_last_bus = 0xff; #ifdef CONFIG_PCI_MMCONFIG - if (setup_data.pci_mmconfig_base) { + if (setup_data.v1.pci_mmconfig_base) { pci_mmconfig_add(0, 0, pcibios_last_bus, - setup_data.pci_mmconfig_base); + setup_data.v1.pci_mmconfig_base); pci_mmcfg_arch_init(); } #endif @@ -136,9 +147,57 @@ static int __init jailhouse_pci_arch_init(void) return 0; } +#ifdef CONFIG_SERIAL_8250 +static inline bool jailhouse_uart_enabled(unsigned int uart_nr) +{ + return setup_data.v2.flags & BIT(uart_nr); +} + +static void jailhouse_serial_fixup(int port, struct uart_port *up, + u32 *capabilities) +{ + static const u16 pcuart_base[] = {0x3f8, 0x2f8, 0x3e8, 0x2e8}; + unsigned int n; + + for (n = 0; n < ARRAY_SIZE(pcuart_base); n++) { + if (pcuart_base[n] != up->iobase) + continue; + + if (jailhouse_uart_enabled(n)) { + pr_info("Enabling UART%u (port 0x%lx)\n", n, + up->iobase); + jailhouse_setup_irq(up->irq); + } else { + /* Deactivate UART if access isn't allowed */ + up->iobase = 0; + } + break; + } +} + +static void __init jailhouse_serial_workaround(void) +{ + /* + * There are flags inside setup_data that indicate availability of + * platform UARTs since setup data version 2. + * + * In case of version 1, we don't know which UARTs belong Linux. In + * this case, unconditionally register 1:1 mapping for legacy UART IRQs + * 3 and 4. + */ + if (setup_data.hdr.version > 1) + serial8250_set_isa_configurator(jailhouse_serial_fixup); +} +#else /* !CONFIG_SERIAL_8250 */ +static inline void jailhouse_serial_workaround(void) +{ +} +#endif /* CONFIG_SERIAL_8250 */ + static void __init jailhouse_init_platform(void) { u64 pa_data = boot_params.hdr.setup_data; + unsigned long setup_data_len; struct setup_data header; void *mapping; @@ -163,16 +222,8 @@ static void __init jailhouse_init_platform(void) memcpy(&header, mapping, sizeof(header)); early_memunmap(mapping, sizeof(header)); - if (header.type == SETUP_JAILHOUSE && - header.len >= sizeof(setup_data)) { - pa_data += offsetof(struct setup_data, data); - - mapping = early_memremap(pa_data, sizeof(setup_data)); - memcpy(&setup_data, mapping, sizeof(setup_data)); - early_memunmap(mapping, sizeof(setup_data)); - + if (header.type == SETUP_JAILHOUSE) break; - } pa_data = header.next; } @@ -180,13 +231,28 @@ static void __init jailhouse_init_platform(void) if (!pa_data) panic("Jailhouse: No valid setup data found"); - if (setup_data.compatible_version > JAILHOUSE_SETUP_REQUIRED_VERSION) - panic("Jailhouse: Unsupported setup data structure"); - - pmtmr_ioport = setup_data.pm_timer_address; + /* setup data must at least contain the header */ + if (header.len < sizeof(setup_data.hdr)) + goto unsupported; + + pa_data += offsetof(struct setup_data, data); + setup_data_len = min_t(unsigned long, sizeof(setup_data), + (unsigned long)header.len); + mapping = early_memremap(pa_data, setup_data_len); + memcpy(&setup_data, mapping, setup_data_len); + early_memunmap(mapping, setup_data_len); + + if (setup_data.hdr.version == 0 || + setup_data.hdr.compatible_version != + JAILHOUSE_SETUP_REQUIRED_VERSION || + (setup_data.hdr.version == 1 && header.len < SETUP_DATA_V1_LEN) || + (setup_data.hdr.version >= 2 && header.len < SETUP_DATA_V2_LEN)) + goto unsupported; + + pmtmr_ioport = setup_data.v1.pm_timer_address; pr_debug("Jailhouse: PM-Timer IO Port: %#x\n", pmtmr_ioport); - precalibrated_tsc_khz = setup_data.tsc_khz; + precalibrated_tsc_khz = setup_data.v1.tsc_khz; setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); pci_probe = 0; @@ -196,6 +262,12 @@ static void __init jailhouse_init_platform(void) * are none in a non-root cell. */ disable_acpi(); + + jailhouse_serial_workaround(); + return; + +unsupported: + panic("Jailhouse: Unsupported setup data structure"); } bool jailhouse_paravirt(void) diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 044053235302..c1a8b9e71408 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -89,8 +89,7 @@ static void __ref __jump_label_transform(struct jump_entry *entry, return; } - text_poke_bp((void *)jump_entry_code(entry), &code, JUMP_LABEL_NOP_SIZE, - (void *)jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE); + text_poke_bp((void *)jump_entry_code(entry), &code, JUMP_LABEL_NOP_SIZE, NULL); } void arch_jump_label_transform(struct jump_entry *entry, @@ -147,11 +146,9 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry, } __jump_label_set_jump_code(entry, type, - (union jump_code_union *) &tp->opcode, 0); + (union jump_code_union *)&tp->text, 0); - tp->addr = entry_code; - tp->detour = entry_code + JUMP_LABEL_NOP_SIZE; - tp->len = JUMP_LABEL_NOP_SIZE; + text_poke_loc_init(tp, entry_code, NULL, JUMP_LABEL_NOP_SIZE, NULL); tp_vec_nr++; diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index edaa30b20841..64b6da95af98 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -44,7 +44,12 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf, if (count > node->len - pos) count = node->len - pos; - pa = node->paddr + sizeof(struct setup_data) + pos; + pa = node->paddr + pos; + + /* Is it direct data or invalid indirect one? */ + if (!(node->type & SETUP_INDIRECT) || node->type == SETUP_INDIRECT) + pa += sizeof(struct setup_data); + p = memremap(pa, count, MEMREMAP_WB); if (!p) return -ENOMEM; @@ -108,9 +113,17 @@ static int __init create_setup_data_nodes(struct dentry *parent) goto err_dir; } - node->paddr = pa_data; - node->type = data->type; - node->len = data->len; + if (data->type == SETUP_INDIRECT && + ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) { + node->paddr = ((struct setup_indirect *)data->data)->addr; + node->type = ((struct setup_indirect *)data->data)->type; + node->len = ((struct setup_indirect *)data->data)->len; + } else { + node->paddr = pa_data; + node->type = data->type; + node->len = data->len; + } + create_setup_data_node(d, no, node); pa_data = data->next; diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index b348dd506d58..8900329c28a7 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -437,8 +437,7 @@ void arch_optimize_kprobes(struct list_head *oplist) insn_buff[0] = RELATIVEJUMP_OPCODE; *(s32 *)(&insn_buff[1]) = rel; - text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE, - op->optinsn.insn); + text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE, NULL); list_del_init(&op->list); } @@ -448,12 +447,18 @@ void arch_optimize_kprobes(struct list_head *oplist) void arch_unoptimize_kprobe(struct optimized_kprobe *op) { u8 insn_buff[RELATIVEJUMP_SIZE]; + u8 emulate_buff[RELATIVEJUMP_SIZE]; /* Set int3 to first byte for kprobes */ insn_buff[0] = BREAKPOINT_INSTRUCTION; memcpy(insn_buff + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); + + emulate_buff[0] = RELATIVEJUMP_OPCODE; + *(s32 *)(&emulate_buff[1]) = (s32)((long)op->optinsn.insn - + ((long)op->kp.addr + RELATIVEJUMP_SIZE)); + text_poke_bp(op->kp.addr, insn_buff, RELATIVEJUMP_SIZE, - op->optinsn.insn); + emulate_buff); } /* diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c index 7969da939213..d0a19121c6a4 100644 --- a/arch/x86/kernel/ksysfs.c +++ b/arch/x86/kernel/ksysfs.c @@ -100,7 +100,12 @@ static int __init get_setup_data_size(int nr, size_t *size) if (!data) return -ENOMEM; if (nr == i) { - *size = data->len; + if (data->type == SETUP_INDIRECT && + ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) + *size = ((struct setup_indirect *)data->data)->len; + else + *size = data->len; + memunmap(data); return 0; } @@ -130,7 +135,10 @@ static ssize_t type_show(struct kobject *kobj, if (!data) return -ENOMEM; - ret = sprintf(buf, "0x%x\n", data->type); + if (data->type == SETUP_INDIRECT) + ret = sprintf(buf, "0x%x\n", ((struct setup_indirect *)data->data)->type); + else + ret = sprintf(buf, "0x%x\n", data->type); memunmap(data); return ret; } @@ -142,7 +150,7 @@ static ssize_t setup_data_data_read(struct file *fp, loff_t off, size_t count) { int nr, ret = 0; - u64 paddr; + u64 paddr, len; struct setup_data *data; void *p; @@ -157,19 +165,28 @@ static ssize_t setup_data_data_read(struct file *fp, if (!data) return -ENOMEM; - if (off > data->len) { + if (data->type == SETUP_INDIRECT && + ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) { + paddr = ((struct setup_indirect *)data->data)->addr; + len = ((struct setup_indirect *)data->data)->len; + } else { + paddr += sizeof(*data); + len = data->len; + } + + if (off > len) { ret = -EINVAL; goto out; } - if (count > data->len - off) - count = data->len - off; + if (count > len - off) + count = len - off; if (!count) goto out; ret = count; - p = memremap(paddr + sizeof(*data), data->len, MEMREMAP_WB); + p = memremap(paddr, len, MEMREMAP_WB); if (!p) { ret = -ENOMEM; goto out; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index e820568ed4d5..32ef1ee733b7 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -33,6 +33,7 @@ #include <asm/apicdef.h> #include <asm/hypervisor.h> #include <asm/tlb.h> +#include <asm/cpuidle_haltpoll.h> static int kvmapf = 1; diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 5dcd438ad8f2..16e125a50b33 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -298,48 +298,6 @@ static void load_segments(void) ); } -#ifdef CONFIG_KEXEC_FILE -/* Update purgatory as needed after various image segments have been prepared */ -static int arch_update_purgatory(struct kimage *image) -{ - int ret = 0; - - if (!image->file_mode) - return 0; - - /* Setup copying of backup region */ - if (image->type == KEXEC_TYPE_CRASH) { - ret = kexec_purgatory_get_set_symbol(image, - "purgatory_backup_dest", - &image->arch.backup_load_addr, - sizeof(image->arch.backup_load_addr), 0); - if (ret) - return ret; - - ret = kexec_purgatory_get_set_symbol(image, - "purgatory_backup_src", - &image->arch.backup_src_start, - sizeof(image->arch.backup_src_start), 0); - if (ret) - return ret; - - ret = kexec_purgatory_get_set_symbol(image, - "purgatory_backup_sz", - &image->arch.backup_src_sz, - sizeof(image->arch.backup_src_sz), 0); - if (ret) - return ret; - } - - return ret; -} -#else /* !CONFIG_KEXEC_FILE */ -static inline int arch_update_purgatory(struct kimage *image) -{ - return 0; -} -#endif /* CONFIG_KEXEC_FILE */ - int machine_kexec_prepare(struct kimage *image) { unsigned long start_pgtable; @@ -353,11 +311,6 @@ int machine_kexec_prepare(struct kimage *image) if (result) return result; - /* update purgatory as needed */ - result = arch_update_purgatory(image); - if (result) - return result; - return 0; } diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c deleted file mode 100644 index 23fdec030c37..000000000000 --- a/arch/x86/kernel/pci-calgary_64.c +++ /dev/null @@ -1,1586 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Derived from arch/powerpc/kernel/iommu.c - * - * Copyright IBM Corporation, 2006-2007 - * Copyright (C) 2006 Jon Mason <jdmason@kudzu.us> - * - * Author: Jon Mason <jdmason@kudzu.us> - * Author: Muli Ben-Yehuda <muli@il.ibm.com> - - */ - -#define pr_fmt(fmt) "Calgary: " fmt - -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/types.h> -#include <linux/slab.h> -#include <linux/mm.h> -#include <linux/spinlock.h> -#include <linux/string.h> -#include <linux/crash_dump.h> -#include <linux/dma-mapping.h> -#include <linux/dma-direct.h> -#include <linux/bitmap.h> -#include <linux/pci_ids.h> -#include <linux/pci.h> -#include <linux/delay.h> -#include <linux/scatterlist.h> -#include <linux/iommu-helper.h> - -#include <asm/iommu.h> -#include <asm/calgary.h> -#include <asm/tce.h> -#include <asm/pci-direct.h> -#include <asm/dma.h> -#include <asm/rio.h> -#include <asm/bios_ebda.h> -#include <asm/x86_init.h> -#include <asm/iommu_table.h> - -#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT -int use_calgary __read_mostly = 1; -#else -int use_calgary __read_mostly = 0; -#endif /* CONFIG_CALGARY_DEFAULT_ENABLED */ - -#define PCI_DEVICE_ID_IBM_CALGARY 0x02a1 -#define PCI_DEVICE_ID_IBM_CALIOC2 0x0308 - -/* register offsets inside the host bridge space */ -#define CALGARY_CONFIG_REG 0x0108 -#define PHB_CSR_OFFSET 0x0110 /* Channel Status */ -#define PHB_PLSSR_OFFSET 0x0120 -#define PHB_CONFIG_RW_OFFSET 0x0160 -#define PHB_IOBASE_BAR_LOW 0x0170 -#define PHB_IOBASE_BAR_HIGH 0x0180 -#define PHB_MEM_1_LOW 0x0190 -#define PHB_MEM_1_HIGH 0x01A0 -#define PHB_IO_ADDR_SIZE 0x01B0 -#define PHB_MEM_1_SIZE 0x01C0 -#define PHB_MEM_ST_OFFSET 0x01D0 -#define PHB_AER_OFFSET 0x0200 -#define PHB_CONFIG_0_HIGH 0x0220 -#define PHB_CONFIG_0_LOW 0x0230 -#define PHB_CONFIG_0_END 0x0240 -#define PHB_MEM_2_LOW 0x02B0 -#define PHB_MEM_2_HIGH 0x02C0 -#define PHB_MEM_2_SIZE_HIGH 0x02D0 -#define PHB_MEM_2_SIZE_LOW 0x02E0 -#define PHB_DOSHOLE_OFFSET 0x08E0 - -/* CalIOC2 specific */ -#define PHB_SAVIOR_L2 0x0DB0 -#define PHB_PAGE_MIG_CTRL 0x0DA8 -#define PHB_PAGE_MIG_DEBUG 0x0DA0 -#define PHB_ROOT_COMPLEX_STATUS 0x0CB0 - -/* PHB_CONFIG_RW */ -#define PHB_TCE_ENABLE 0x20000000 -#define PHB_SLOT_DISABLE 0x1C000000 -#define PHB_DAC_DISABLE 0x01000000 -#define PHB_MEM2_ENABLE 0x00400000 -#define PHB_MCSR_ENABLE 0x00100000 -/* TAR (Table Address Register) */ -#define TAR_SW_BITS 0x0000ffffffff800fUL -#define TAR_VALID 0x0000000000000008UL -/* CSR (Channel/DMA Status Register) */ -#define CSR_AGENT_MASK 0xffe0ffff -/* CCR (Calgary Configuration Register) */ -#define CCR_2SEC_TIMEOUT 0x000000000000000EUL -/* PMCR/PMDR (Page Migration Control/Debug Registers */ -#define PMR_SOFTSTOP 0x80000000 -#define PMR_SOFTSTOPFAULT 0x40000000 -#define PMR_HARDSTOP 0x20000000 - -/* - * The maximum PHB bus number. - * x3950M2 (rare): 8 chassis, 48 PHBs per chassis = 384 - * x3950M2: 4 chassis, 48 PHBs per chassis = 192 - * x3950 (PCIE): 8 chassis, 32 PHBs per chassis = 256 - * x3950 (PCIX): 8 chassis, 16 PHBs per chassis = 128 - */ -#define MAX_PHB_BUS_NUM 256 - -#define PHBS_PER_CALGARY 4 - -/* register offsets in Calgary's internal register space */ -static const unsigned long tar_offsets[] = { - 0x0580 /* TAR0 */, - 0x0588 /* TAR1 */, - 0x0590 /* TAR2 */, - 0x0598 /* TAR3 */ -}; - -static const unsigned long split_queue_offsets[] = { - 0x4870 /* SPLIT QUEUE 0 */, - 0x5870 /* SPLIT QUEUE 1 */, - 0x6870 /* SPLIT QUEUE 2 */, - 0x7870 /* SPLIT QUEUE 3 */ -}; - -static const unsigned long phb_offsets[] = { - 0x8000 /* PHB0 */, - 0x9000 /* PHB1 */, - 0xA000 /* PHB2 */, - 0xB000 /* PHB3 */ -}; - -/* PHB debug registers */ - -static const unsigned long phb_debug_offsets[] = { - 0x4000 /* PHB 0 DEBUG */, - 0x5000 /* PHB 1 DEBUG */, - 0x6000 /* PHB 2 DEBUG */, - 0x7000 /* PHB 3 DEBUG */ -}; - -/* - * STUFF register for each debug PHB, - * byte 1 = start bus number, byte 2 = end bus number - */ - -#define PHB_DEBUG_STUFF_OFFSET 0x0020 - -unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED; -static int translate_empty_slots __read_mostly = 0; -static int calgary_detected __read_mostly = 0; - -static struct rio_table_hdr *rio_table_hdr __initdata; -static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata; -static struct rio_detail *rio_devs[MAX_NUMNODES * 4] __initdata; - -struct calgary_bus_info { - void *tce_space; - unsigned char translation_disabled; - signed char phbid; - void __iomem *bbar; -}; - -static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); -static void calgary_tce_cache_blast(struct iommu_table *tbl); -static void calgary_dump_error_regs(struct iommu_table *tbl); -static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); -static void calioc2_tce_cache_blast(struct iommu_table *tbl); -static void calioc2_dump_error_regs(struct iommu_table *tbl); -static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl); -static void get_tce_space_from_tar(void); - -static const struct cal_chipset_ops calgary_chip_ops = { - .handle_quirks = calgary_handle_quirks, - .tce_cache_blast = calgary_tce_cache_blast, - .dump_error_regs = calgary_dump_error_regs -}; - -static const struct cal_chipset_ops calioc2_chip_ops = { - .handle_quirks = calioc2_handle_quirks, - .tce_cache_blast = calioc2_tce_cache_blast, - .dump_error_regs = calioc2_dump_error_regs -}; - -static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; - -static inline int translation_enabled(struct iommu_table *tbl) -{ - /* only PHBs with translation enabled have an IOMMU table */ - return (tbl != NULL); -} - -static void iommu_range_reserve(struct iommu_table *tbl, - unsigned long start_addr, unsigned int npages) -{ - unsigned long index; - unsigned long end; - unsigned long flags; - - index = start_addr >> PAGE_SHIFT; - - /* bail out if we're asked to reserve a region we don't cover */ - if (index >= tbl->it_size) - return; - - end = index + npages; - if (end > tbl->it_size) /* don't go off the table */ - end = tbl->it_size; - - spin_lock_irqsave(&tbl->it_lock, flags); - - bitmap_set(tbl->it_map, index, npages); - - spin_unlock_irqrestore(&tbl->it_lock, flags); -} - -static unsigned long iommu_range_alloc(struct device *dev, - struct iommu_table *tbl, - unsigned int npages) -{ - unsigned long flags; - unsigned long offset; - unsigned long boundary_size; - - boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, - PAGE_SIZE) >> PAGE_SHIFT; - - BUG_ON(npages == 0); - - spin_lock_irqsave(&tbl->it_lock, flags); - - offset = iommu_area_alloc(tbl->it_map, tbl->it_size, tbl->it_hint, - npages, 0, boundary_size, 0); - if (offset == ~0UL) { - tbl->chip_ops->tce_cache_blast(tbl); - - offset = iommu_area_alloc(tbl->it_map, tbl->it_size, 0, - npages, 0, boundary_size, 0); - if (offset == ~0UL) { - pr_warn("IOMMU full\n"); - spin_unlock_irqrestore(&tbl->it_lock, flags); - if (panic_on_overflow) - panic("Calgary: fix the allocator.\n"); - else - return DMA_MAPPING_ERROR; - } - } - - tbl->it_hint = offset + npages; - BUG_ON(tbl->it_hint > tbl->it_size); - - spin_unlock_irqrestore(&tbl->it_lock, flags); - - return offset; -} - -static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, - void *vaddr, unsigned int npages, int direction) -{ - unsigned long entry; - dma_addr_t ret; - - entry = iommu_range_alloc(dev, tbl, npages); - if (unlikely(entry == DMA_MAPPING_ERROR)) { - pr_warn("failed to allocate %u pages in iommu %p\n", - npages, tbl); - return DMA_MAPPING_ERROR; - } - - /* set the return dma address */ - ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK); - - /* put the TCEs in the HW table */ - tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK, - direction); - return ret; -} - -static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, - unsigned int npages) -{ - unsigned long entry; - unsigned long flags; - - /* were we called with bad_dma_address? */ - if (unlikely(dma_addr == DMA_MAPPING_ERROR)) { - WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA " - "address 0x%Lx\n", dma_addr); - return; - } - - entry = dma_addr >> PAGE_SHIFT; - - BUG_ON(entry + npages > tbl->it_size); - - tce_free(tbl, entry, npages); - - spin_lock_irqsave(&tbl->it_lock, flags); - - bitmap_clear(tbl->it_map, entry, npages); - - spin_unlock_irqrestore(&tbl->it_lock, flags); -} - -static inline struct iommu_table *find_iommu_table(struct device *dev) -{ - struct pci_dev *pdev; - struct pci_bus *pbus; - struct iommu_table *tbl; - - pdev = to_pci_dev(dev); - - /* search up the device tree for an iommu */ - pbus = pdev->bus; - do { - tbl = pci_iommu(pbus); - if (tbl && tbl->it_busno == pbus->number) - break; - tbl = NULL; - pbus = pbus->parent; - } while (pbus); - - BUG_ON(tbl && (tbl->it_busno != pbus->number)); - - return tbl; -} - -static void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist, - int nelems,enum dma_data_direction dir, - unsigned long attrs) -{ - struct iommu_table *tbl = find_iommu_table(dev); - struct scatterlist *s; - int i; - - if (!translation_enabled(tbl)) - return; - - for_each_sg(sglist, s, nelems, i) { - unsigned int npages; - dma_addr_t dma = s->dma_address; - unsigned int dmalen = s->dma_length; - - if (dmalen == 0) - break; - - npages = iommu_num_pages(dma, dmalen, PAGE_SIZE); - iommu_free(tbl, dma, npages); - } -} - -static int calgary_map_sg(struct device *dev, struct scatterlist *sg, - int nelems, enum dma_data_direction dir, - unsigned long attrs) -{ - struct iommu_table *tbl = find_iommu_table(dev); - struct scatterlist *s; - unsigned long vaddr; - unsigned int npages; - unsigned long entry; - int i; - - for_each_sg(sg, s, nelems, i) { - BUG_ON(!sg_page(s)); - - vaddr = (unsigned long) sg_virt(s); - npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE); - - entry = iommu_range_alloc(dev, tbl, npages); - if (entry == DMA_MAPPING_ERROR) { - /* makes sure unmap knows to stop */ - s->dma_length = 0; - goto error; - } - - s->dma_address = (entry << PAGE_SHIFT) | s->offset; - - /* insert into HW table */ - tce_build(tbl, entry, npages, vaddr & PAGE_MASK, dir); - - s->dma_length = s->length; - } - - return nelems; -error: - calgary_unmap_sg(dev, sg, nelems, dir, 0); - for_each_sg(sg, s, nelems, i) { - sg->dma_address = DMA_MAPPING_ERROR; - sg->dma_length = 0; - } - return 0; -} - -static dma_addr_t calgary_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, - unsigned long attrs) -{ - void *vaddr = page_address(page) + offset; - unsigned long uaddr; - unsigned int npages; - struct iommu_table *tbl = find_iommu_table(dev); - - uaddr = (unsigned long)vaddr; - npages = iommu_num_pages(uaddr, size, PAGE_SIZE); - - return iommu_alloc(dev, tbl, vaddr, npages, dir); -} - -static void calgary_unmap_page(struct device *dev, dma_addr_t dma_addr, - size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - struct iommu_table *tbl = find_iommu_table(dev); - unsigned int npages; - - npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); - iommu_free(tbl, dma_addr, npages); -} - -static void* calgary_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs) -{ - void *ret = NULL; - dma_addr_t mapping; - unsigned int npages, order; - struct iommu_table *tbl = find_iommu_table(dev); - - size = PAGE_ALIGN(size); /* size rounded up to full pages */ - npages = size >> PAGE_SHIFT; - order = get_order(size); - - /* alloc enough pages (and possibly more) */ - ret = (void *)__get_free_pages(flag, order); - if (!ret) - goto error; - memset(ret, 0, size); - - /* set up tces to cover the allocated range */ - mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); - if (mapping == DMA_MAPPING_ERROR) - goto free; - *dma_handle = mapping; - return ret; -free: - free_pages((unsigned long)ret, get_order(size)); - ret = NULL; -error: - return ret; -} - -static void calgary_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t dma_handle, - unsigned long attrs) -{ - unsigned int npages; - struct iommu_table *tbl = find_iommu_table(dev); - - size = PAGE_ALIGN(size); - npages = size >> PAGE_SHIFT; - - iommu_free(tbl, dma_handle, npages); - free_pages((unsigned long)vaddr, get_order(size)); -} - -static const struct dma_map_ops calgary_dma_ops = { - .alloc = calgary_alloc_coherent, - .free = calgary_free_coherent, - .map_sg = calgary_map_sg, - .unmap_sg = calgary_unmap_sg, - .map_page = calgary_map_page, - .unmap_page = calgary_unmap_page, - .dma_supported = dma_direct_supported, - .mmap = dma_common_mmap, - .get_sgtable = dma_common_get_sgtable, -}; - -static inline void __iomem * busno_to_bbar(unsigned char num) -{ - return bus_info[num].bbar; -} - -static inline int busno_to_phbid(unsigned char num) -{ - return bus_info[num].phbid; -} - -static inline unsigned long split_queue_offset(unsigned char num) -{ - size_t idx = busno_to_phbid(num); - - return split_queue_offsets[idx]; -} - -static inline unsigned long tar_offset(unsigned char num) -{ - size_t idx = busno_to_phbid(num); - - return tar_offsets[idx]; -} - -static inline unsigned long phb_offset(unsigned char num) -{ - size_t idx = busno_to_phbid(num); - - return phb_offsets[idx]; -} - -static inline void __iomem* calgary_reg(void __iomem *bar, unsigned long offset) -{ - unsigned long target = ((unsigned long)bar) | offset; - return (void __iomem*)target; -} - -static inline int is_calioc2(unsigned short device) -{ - return (device == PCI_DEVICE_ID_IBM_CALIOC2); -} - -static inline int is_calgary(unsigned short device) -{ - return (device == PCI_DEVICE_ID_IBM_CALGARY); -} - -static inline int is_cal_pci_dev(unsigned short device) -{ - return (is_calgary(device) || is_calioc2(device)); -} - -static void calgary_tce_cache_blast(struct iommu_table *tbl) -{ - u64 val; - u32 aer; - int i = 0; - void __iomem *bbar = tbl->bbar; - void __iomem *target; - - /* disable arbitration on the bus */ - target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET); - aer = readl(target); - writel(0, target); - - /* read plssr to ensure it got there */ - target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET); - val = readl(target); - - /* poll split queues until all DMA activity is done */ - target = calgary_reg(bbar, split_queue_offset(tbl->it_busno)); - do { - val = readq(target); - i++; - } while ((val & 0xff) != 0xff && i < 100); - if (i == 100) - pr_warn("PCI bus not quiesced, continuing anyway\n"); - - /* invalidate TCE cache */ - target = calgary_reg(bbar, tar_offset(tbl->it_busno)); - writeq(tbl->tar_val, target); - - /* enable arbitration */ - target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_AER_OFFSET); - writel(aer, target); - (void)readl(target); /* flush */ -} - -static void calioc2_tce_cache_blast(struct iommu_table *tbl) -{ - void __iomem *bbar = tbl->bbar; - void __iomem *target; - u64 val64; - u32 val; - int i = 0; - int count = 1; - unsigned char bus = tbl->it_busno; - -begin: - printk(KERN_DEBUG "Calgary: CalIOC2 bus 0x%x entering tce cache blast " - "sequence - count %d\n", bus, count); - - /* 1. using the Page Migration Control reg set SoftStop */ - target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); - val = be32_to_cpu(readl(target)); - printk(KERN_DEBUG "1a. read 0x%x [LE] from %p\n", val, target); - val |= PMR_SOFTSTOP; - printk(KERN_DEBUG "1b. writing 0x%x [LE] to %p\n", val, target); - writel(cpu_to_be32(val), target); - - /* 2. poll split queues until all DMA activity is done */ - printk(KERN_DEBUG "2a. starting to poll split queues\n"); - target = calgary_reg(bbar, split_queue_offset(bus)); - do { - val64 = readq(target); - i++; - } while ((val64 & 0xff) != 0xff && i < 100); - if (i == 100) - pr_warn("CalIOC2: PCI bus not quiesced, continuing anyway\n"); - - /* 3. poll Page Migration DEBUG for SoftStopFault */ - target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG); - val = be32_to_cpu(readl(target)); - printk(KERN_DEBUG "3. read 0x%x [LE] from %p\n", val, target); - - /* 4. if SoftStopFault - goto (1) */ - if (val & PMR_SOFTSTOPFAULT) { - if (++count < 100) - goto begin; - else { - pr_warn("CalIOC2: too many SoftStopFaults, aborting TCE cache flush sequence!\n"); - return; /* pray for the best */ - } - } - - /* 5. Slam into HardStop by reading PHB_PAGE_MIG_CTRL */ - target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); - printk(KERN_DEBUG "5a. slamming into HardStop by reading %p\n", target); - val = be32_to_cpu(readl(target)); - printk(KERN_DEBUG "5b. read 0x%x [LE] from %p\n", val, target); - target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG); - val = be32_to_cpu(readl(target)); - printk(KERN_DEBUG "5c. read 0x%x [LE] from %p (debug)\n", val, target); - - /* 6. invalidate TCE cache */ - printk(KERN_DEBUG "6. invalidating TCE cache\n"); - target = calgary_reg(bbar, tar_offset(bus)); - writeq(tbl->tar_val, target); - - /* 7. Re-read PMCR */ - printk(KERN_DEBUG "7a. Re-reading PMCR\n"); - target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); - val = be32_to_cpu(readl(target)); - printk(KERN_DEBUG "7b. read 0x%x [LE] from %p\n", val, target); - - /* 8. Remove HardStop */ - printk(KERN_DEBUG "8a. removing HardStop from PMCR\n"); - target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); - val = 0; - printk(KERN_DEBUG "8b. writing 0x%x [LE] to %p\n", val, target); - writel(cpu_to_be32(val), target); - val = be32_to_cpu(readl(target)); - printk(KERN_DEBUG "8c. read 0x%x [LE] from %p\n", val, target); -} - -static void __init calgary_reserve_mem_region(struct pci_dev *dev, u64 start, - u64 limit) -{ - unsigned int numpages; - - limit = limit | 0xfffff; - limit++; - - numpages = ((limit - start) >> PAGE_SHIFT); - iommu_range_reserve(pci_iommu(dev->bus), start, numpages); -} - -static void __init calgary_reserve_peripheral_mem_1(struct pci_dev *dev) -{ - void __iomem *target; - u64 low, high, sizelow; - u64 start, limit; - struct iommu_table *tbl = pci_iommu(dev->bus); - unsigned char busnum = dev->bus->number; - void __iomem *bbar = tbl->bbar; - - /* peripheral MEM_1 region */ - target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_LOW); - low = be32_to_cpu(readl(target)); - target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_HIGH); - high = be32_to_cpu(readl(target)); - target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_1_SIZE); - sizelow = be32_to_cpu(readl(target)); - - start = (high << 32) | low; - limit = sizelow; - - calgary_reserve_mem_region(dev, start, limit); -} - -static void __init calgary_reserve_peripheral_mem_2(struct pci_dev *dev) -{ - void __iomem *target; - u32 val32; - u64 low, high, sizelow, sizehigh; - u64 start, limit; - struct iommu_table *tbl = pci_iommu(dev->bus); - unsigned char busnum = dev->bus->number; - void __iomem *bbar = tbl->bbar; - - /* is it enabled? */ - target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); - val32 = be32_to_cpu(readl(target)); - if (!(val32 & PHB_MEM2_ENABLE)) - return; - - target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_LOW); - low = be32_to_cpu(readl(target)); - target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_HIGH); - high = be32_to_cpu(readl(target)); - target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_LOW); - sizelow = be32_to_cpu(readl(target)); - target = calgary_reg(bbar, phb_offset(busnum) | PHB_MEM_2_SIZE_HIGH); - sizehigh = be32_to_cpu(readl(target)); - - start = (high << 32) | low; - limit = (sizehigh << 32) | sizelow; - - calgary_reserve_mem_region(dev, start, limit); -} - -/* - * some regions of the IO address space do not get translated, so we - * must not give devices IO addresses in those regions. The regions - * are the 640KB-1MB region and the two PCI peripheral memory holes. - * Reserve all of them in the IOMMU bitmap to avoid giving them out - * later. - */ -static void __init calgary_reserve_regions(struct pci_dev *dev) -{ - unsigned int npages; - u64 start; - struct iommu_table *tbl = pci_iommu(dev->bus); - - /* avoid the BIOS/VGA first 640KB-1MB region */ - /* for CalIOC2 - avoid the entire first MB */ - if (is_calgary(dev->device)) { - start = (640 * 1024); - npages = ((1024 - 640) * 1024) >> PAGE_SHIFT; - } else { /* calioc2 */ - start = 0; - npages = (1 * 1024 * 1024) >> PAGE_SHIFT; - } - iommu_range_reserve(tbl, start, npages); - - /* reserve the two PCI peripheral memory regions in IO space */ - calgary_reserve_peripheral_mem_1(dev); - calgary_reserve_peripheral_mem_2(dev); -} - -static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar) -{ - u64 val64; - u64 table_phys; - void __iomem *target; - int ret; - struct iommu_table *tbl; - - /* build TCE tables for each PHB */ - ret = build_tce_table(dev, bbar); - if (ret) - return ret; - - tbl = pci_iommu(dev->bus); - tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space; - - if (is_kdump_kernel()) - calgary_init_bitmap_from_tce_table(tbl); - else - tce_free(tbl, 0, tbl->it_size); - - if (is_calgary(dev->device)) - tbl->chip_ops = &calgary_chip_ops; - else if (is_calioc2(dev->device)) - tbl->chip_ops = &calioc2_chip_ops; - else - BUG(); - - calgary_reserve_regions(dev); - - /* set TARs for each PHB */ - target = calgary_reg(bbar, tar_offset(dev->bus->number)); - val64 = be64_to_cpu(readq(target)); - - /* zero out all TAR bits under sw control */ - val64 &= ~TAR_SW_BITS; - table_phys = (u64)__pa(tbl->it_base); - - val64 |= table_phys; - - BUG_ON(specified_table_size > TCE_TABLE_SIZE_8M); - val64 |= (u64) specified_table_size; - - tbl->tar_val = cpu_to_be64(val64); - - writeq(tbl->tar_val, target); - readq(target); /* flush */ - - return 0; -} - -static void __init calgary_free_bus(struct pci_dev *dev) -{ - u64 val64; - struct iommu_table *tbl = pci_iommu(dev->bus); - void __iomem *target; - unsigned int bitmapsz; - - target = calgary_reg(tbl->bbar, tar_offset(dev->bus->number)); - val64 = be64_to_cpu(readq(target)); - val64 &= ~TAR_SW_BITS; - writeq(cpu_to_be64(val64), target); - readq(target); /* flush */ - - bitmapsz = tbl->it_size / BITS_PER_BYTE; - free_pages((unsigned long)tbl->it_map, get_order(bitmapsz)); - tbl->it_map = NULL; - - kfree(tbl); - - set_pci_iommu(dev->bus, NULL); - - /* Can't free bootmem allocated memory after system is up :-( */ - bus_info[dev->bus->number].tce_space = NULL; -} - -static void calgary_dump_error_regs(struct iommu_table *tbl) -{ - void __iomem *bbar = tbl->bbar; - void __iomem *target; - u32 csr, plssr; - - target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET); - csr = be32_to_cpu(readl(target)); - - target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET); - plssr = be32_to_cpu(readl(target)); - - /* If no error, the agent ID in the CSR is not valid */ - pr_emerg("DMA error on Calgary PHB 0x%x, 0x%08x@CSR 0x%08x@PLSSR\n", - tbl->it_busno, csr, plssr); -} - -static void calioc2_dump_error_regs(struct iommu_table *tbl) -{ - void __iomem *bbar = tbl->bbar; - u32 csr, csmr, plssr, mck, rcstat; - void __iomem *target; - unsigned long phboff = phb_offset(tbl->it_busno); - unsigned long erroff; - u32 errregs[7]; - int i; - - /* dump CSR */ - target = calgary_reg(bbar, phboff | PHB_CSR_OFFSET); - csr = be32_to_cpu(readl(target)); - /* dump PLSSR */ - target = calgary_reg(bbar, phboff | PHB_PLSSR_OFFSET); - plssr = be32_to_cpu(readl(target)); - /* dump CSMR */ - target = calgary_reg(bbar, phboff | 0x290); - csmr = be32_to_cpu(readl(target)); - /* dump mck */ - target = calgary_reg(bbar, phboff | 0x800); - mck = be32_to_cpu(readl(target)); - - pr_emerg("DMA error on CalIOC2 PHB 0x%x\n", tbl->it_busno); - - pr_emerg("0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n", - csr, plssr, csmr, mck); - - /* dump rest of error regs */ - pr_emerg(""); - for (i = 0; i < ARRAY_SIZE(errregs); i++) { - /* err regs are at 0x810 - 0x870 */ - erroff = (0x810 + (i * 0x10)); - target = calgary_reg(bbar, phboff | erroff); - errregs[i] = be32_to_cpu(readl(target)); - pr_cont("0x%08x@0x%lx ", errregs[i], erroff); - } - pr_cont("\n"); - - /* root complex status */ - target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS); - rcstat = be32_to_cpu(readl(target)); - printk(KERN_EMERG "Calgary: 0x%08x@0x%x\n", rcstat, - PHB_ROOT_COMPLEX_STATUS); -} - -static void calgary_watchdog(struct timer_list *t) -{ - struct iommu_table *tbl = from_timer(tbl, t, watchdog_timer); - void __iomem *bbar = tbl->bbar; - u32 val32; - void __iomem *target; - - target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET); - val32 = be32_to_cpu(readl(target)); - - /* If no error, the agent ID in the CSR is not valid */ - if (val32 & CSR_AGENT_MASK) { - tbl->chip_ops->dump_error_regs(tbl); - - /* reset error */ - writel(0, target); - - /* Disable bus that caused the error */ - target = calgary_reg(bbar, phb_offset(tbl->it_busno) | - PHB_CONFIG_RW_OFFSET); - val32 = be32_to_cpu(readl(target)); - val32 |= PHB_SLOT_DISABLE; - writel(cpu_to_be32(val32), target); - readl(target); /* flush */ - } else { - /* Reset the timer */ - mod_timer(&tbl->watchdog_timer, jiffies + 2 * HZ); - } -} - -static void __init calgary_set_split_completion_timeout(void __iomem *bbar, - unsigned char busnum, unsigned long timeout) -{ - u64 val64; - void __iomem *target; - unsigned int phb_shift = ~0; /* silence gcc */ - u64 mask; - - switch (busno_to_phbid(busnum)) { - case 0: phb_shift = (63 - 19); - break; - case 1: phb_shift = (63 - 23); - break; - case 2: phb_shift = (63 - 27); - break; - case 3: phb_shift = (63 - 35); - break; - default: - BUG_ON(busno_to_phbid(busnum)); - } - - target = calgary_reg(bbar, CALGARY_CONFIG_REG); - val64 = be64_to_cpu(readq(target)); - - /* zero out this PHB's timer bits */ - mask = ~(0xFUL << phb_shift); - val64 &= mask; - val64 |= (timeout << phb_shift); - writeq(cpu_to_be64(val64), target); - readq(target); /* flush */ -} - -static void __init calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev) -{ - unsigned char busnum = dev->bus->number; - void __iomem *bbar = tbl->bbar; - void __iomem *target; - u32 val; - - /* - * CalIOC2 designers recommend setting bit 8 in 0xnDB0 to 1 - */ - target = calgary_reg(bbar, phb_offset(busnum) | PHB_SAVIOR_L2); - val = cpu_to_be32(readl(target)); - val |= 0x00800000; - writel(cpu_to_be32(val), target); -} - -static void __init calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev) -{ - unsigned char busnum = dev->bus->number; - - /* - * Give split completion a longer timeout on bus 1 for aic94xx - * http://bugzilla.kernel.org/show_bug.cgi?id=7180 - */ - if (is_calgary(dev->device) && (busnum == 1)) - calgary_set_split_completion_timeout(tbl->bbar, busnum, - CCR_2SEC_TIMEOUT); -} - -static void __init calgary_enable_translation(struct pci_dev *dev) -{ - u32 val32; - unsigned char busnum; - void __iomem *target; - void __iomem *bbar; - struct iommu_table *tbl; - - busnum = dev->bus->number; - tbl = pci_iommu(dev->bus); - bbar = tbl->bbar; - - /* enable TCE in PHB Config Register */ - target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); - val32 = be32_to_cpu(readl(target)); - val32 |= PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE; - - printk(KERN_INFO "Calgary: enabling translation on %s PHB %#x\n", - (dev->device == PCI_DEVICE_ID_IBM_CALGARY) ? - "Calgary" : "CalIOC2", busnum); - printk(KERN_INFO "Calgary: errant DMAs will now be prevented on this " - "bus.\n"); - - writel(cpu_to_be32(val32), target); - readl(target); /* flush */ - - timer_setup(&tbl->watchdog_timer, calgary_watchdog, 0); - mod_timer(&tbl->watchdog_timer, jiffies); -} - -static void __init calgary_disable_translation(struct pci_dev *dev) -{ - u32 val32; - unsigned char busnum; - void __iomem *target; - void __iomem *bbar; - struct iommu_table *tbl; - - busnum = dev->bus->number; - tbl = pci_iommu(dev->bus); - bbar = tbl->bbar; - - /* disable TCE in PHB Config Register */ - target = calgary_reg(bbar, phb_offset(busnum) | PHB_CONFIG_RW_OFFSET); - val32 = be32_to_cpu(readl(target)); - val32 &= ~(PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE); - - printk(KERN_INFO "Calgary: disabling translation on PHB %#x!\n", busnum); - writel(cpu_to_be32(val32), target); - readl(target); /* flush */ - - del_timer_sync(&tbl->watchdog_timer); -} - -static void __init calgary_init_one_nontraslated(struct pci_dev *dev) -{ - pci_dev_get(dev); - set_pci_iommu(dev->bus, NULL); - - /* is the device behind a bridge? */ - if (dev->bus->parent) - dev->bus->parent->self = dev; - else - dev->bus->self = dev; -} - -static int __init calgary_init_one(struct pci_dev *dev) -{ - void __iomem *bbar; - struct iommu_table *tbl; - int ret; - - bbar = busno_to_bbar(dev->bus->number); - ret = calgary_setup_tar(dev, bbar); - if (ret) - goto done; - - pci_dev_get(dev); - - if (dev->bus->parent) { - if (dev->bus->parent->self) - printk(KERN_WARNING "Calgary: IEEEE, dev %p has " - "bus->parent->self!\n", dev); - dev->bus->parent->self = dev; - } else - dev->bus->self = dev; - - tbl = pci_iommu(dev->bus); - tbl->chip_ops->handle_quirks(tbl, dev); - - calgary_enable_translation(dev); - - return 0; - -done: - return ret; -} - -static int __init calgary_locate_bbars(void) -{ - int ret; - int rioidx, phb, bus; - void __iomem *bbar; - void __iomem *target; - unsigned long offset; - u8 start_bus, end_bus; - u32 val; - - ret = -ENODATA; - for (rioidx = 0; rioidx < rio_table_hdr->num_rio_dev; rioidx++) { - struct rio_detail *rio = rio_devs[rioidx]; - - if ((rio->type != COMPAT_CALGARY) && (rio->type != ALT_CALGARY)) - continue; - - /* map entire 1MB of Calgary config space */ - bbar = ioremap_nocache(rio->BBAR, 1024 * 1024); - if (!bbar) - goto error; - - for (phb = 0; phb < PHBS_PER_CALGARY; phb++) { - offset = phb_debug_offsets[phb] | PHB_DEBUG_STUFF_OFFSET; - target = calgary_reg(bbar, offset); - - val = be32_to_cpu(readl(target)); - - start_bus = (u8)((val & 0x00FF0000) >> 16); - end_bus = (u8)((val & 0x0000FF00) >> 8); - - if (end_bus) { - for (bus = start_bus; bus <= end_bus; bus++) { - bus_info[bus].bbar = bbar; - bus_info[bus].phbid = phb; - } - } else { - bus_info[start_bus].bbar = bbar; - bus_info[start_bus].phbid = phb; - } - } - } - - return 0; - -error: - /* scan bus_info and iounmap any bbars we previously ioremap'd */ - for (bus = 0; bus < ARRAY_SIZE(bus_info); bus++) - if (bus_info[bus].bbar) - iounmap(bus_info[bus].bbar); - - return ret; -} - -static int __init calgary_init(void) -{ - int ret; - struct pci_dev *dev = NULL; - struct calgary_bus_info *info; - - ret = calgary_locate_bbars(); - if (ret) - return ret; - - /* Purely for kdump kernel case */ - if (is_kdump_kernel()) - get_tce_space_from_tar(); - - do { - dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); - if (!dev) - break; - if (!is_cal_pci_dev(dev->device)) - continue; - - info = &bus_info[dev->bus->number]; - if (info->translation_disabled) { - calgary_init_one_nontraslated(dev); - continue; - } - - if (!info->tce_space && !translate_empty_slots) - continue; - - ret = calgary_init_one(dev); - if (ret) - goto error; - } while (1); - - dev = NULL; - for_each_pci_dev(dev) { - struct iommu_table *tbl; - - tbl = find_iommu_table(&dev->dev); - - if (translation_enabled(tbl)) - dev->dev.dma_ops = &calgary_dma_ops; - } - - return ret; - -error: - do { - dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); - if (!dev) - break; - if (!is_cal_pci_dev(dev->device)) - continue; - - info = &bus_info[dev->bus->number]; - if (info->translation_disabled) { - pci_dev_put(dev); - continue; - } - if (!info->tce_space && !translate_empty_slots) - continue; - - calgary_disable_translation(dev); - calgary_free_bus(dev); - pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */ - dev->dev.dma_ops = NULL; - } while (1); - - return ret; -} - -static inline int __init determine_tce_table_size(void) -{ - int ret; - - if (specified_table_size != TCE_TABLE_SIZE_UNSPECIFIED) - return specified_table_size; - - if (is_kdump_kernel() && saved_max_pfn) { - /* - * Table sizes are from 0 to 7 (TCE_TABLE_SIZE_64K to - * TCE_TABLE_SIZE_8M). Table size 0 has 8K entries and each - * larger table size has twice as many entries, so shift the - * max ram address by 13 to divide by 8K and then look at the - * order of the result to choose between 0-7. - */ - ret = get_order((saved_max_pfn * PAGE_SIZE) >> 13); - if (ret > TCE_TABLE_SIZE_8M) - ret = TCE_TABLE_SIZE_8M; - } else { - /* - * Use 8M by default (suggested by Muli) if it's not - * kdump kernel and saved_max_pfn isn't set. - */ - ret = TCE_TABLE_SIZE_8M; - } - - return ret; -} - -static int __init build_detail_arrays(void) -{ - unsigned long ptr; - unsigned numnodes, i; - int scal_detail_size, rio_detail_size; - - numnodes = rio_table_hdr->num_scal_dev; - if (numnodes > MAX_NUMNODES){ - printk(KERN_WARNING - "Calgary: MAX_NUMNODES too low! Defined as %d, " - "but system has %d nodes.\n", - MAX_NUMNODES, numnodes); - return -ENODEV; - } - - switch (rio_table_hdr->version){ - case 2: - scal_detail_size = 11; - rio_detail_size = 13; - break; - case 3: - scal_detail_size = 12; - rio_detail_size = 15; - break; - default: - printk(KERN_WARNING - "Calgary: Invalid Rio Grande Table Version: %d\n", - rio_table_hdr->version); - return -EPROTO; - } - - ptr = ((unsigned long)rio_table_hdr) + 3; - for (i = 0; i < numnodes; i++, ptr += scal_detail_size) - scal_devs[i] = (struct scal_detail *)ptr; - - for (i = 0; i < rio_table_hdr->num_rio_dev; - i++, ptr += rio_detail_size) - rio_devs[i] = (struct rio_detail *)ptr; - - return 0; -} - -static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev) -{ - int dev; - u32 val; - - if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) { - /* - * FIXME: properly scan for devices across the - * PCI-to-PCI bridge on every CalIOC2 port. - */ - return 1; - } - - for (dev = 1; dev < 8; dev++) { - val = read_pci_config(bus, dev, 0, 0); - if (val != 0xffffffff) - break; - } - return (val != 0xffffffff); -} - -/* - * calgary_init_bitmap_from_tce_table(): - * Function for kdump case. In the second/kdump kernel initialize - * the bitmap based on the tce table entries obtained from first kernel - */ -static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl) -{ - u64 *tp; - unsigned int index; - tp = ((u64 *)tbl->it_base); - for (index = 0 ; index < tbl->it_size; index++) { - if (*tp != 0x0) - set_bit(index, tbl->it_map); - tp++; - } -} - -/* - * get_tce_space_from_tar(): - * Function for kdump case. Get the tce tables from first kernel - * by reading the contents of the base address register of calgary iommu - */ -static void __init get_tce_space_from_tar(void) -{ - int bus; - void __iomem *target; - unsigned long tce_space; - - for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { - struct calgary_bus_info *info = &bus_info[bus]; - unsigned short pci_device; - u32 val; - - val = read_pci_config(bus, 0, 0, 0); - pci_device = (val & 0xFFFF0000) >> 16; - - if (!is_cal_pci_dev(pci_device)) - continue; - if (info->translation_disabled) - continue; - - if (calgary_bus_has_devices(bus, pci_device) || - translate_empty_slots) { - target = calgary_reg(bus_info[bus].bbar, - tar_offset(bus)); - tce_space = be64_to_cpu(readq(target)); - tce_space = tce_space & TAR_SW_BITS; - - tce_space = tce_space & (~specified_table_size); - info->tce_space = (u64 *)__va(tce_space); - } - } - return; -} - -static int __init calgary_iommu_init(void) -{ - int ret; - - /* ok, we're trying to use Calgary - let's roll */ - printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n"); - - ret = calgary_init(); - if (ret) { - printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " - "falling back to no_iommu\n", ret); - return ret; - } - - return 0; -} - -int __init detect_calgary(void) -{ - int bus; - void *tbl; - int calgary_found = 0; - unsigned long ptr; - unsigned int offset, prev_offset; - int ret; - - /* - * if the user specified iommu=off or iommu=soft or we found - * another HW IOMMU already, bail out. - */ - if (no_iommu || iommu_detected) - return -ENODEV; - - if (!use_calgary) - return -ENODEV; - - if (!early_pci_allowed()) - return -ENODEV; - - printk(KERN_DEBUG "Calgary: detecting Calgary via BIOS EBDA area\n"); - - ptr = (unsigned long)phys_to_virt(get_bios_ebda()); - - rio_table_hdr = NULL; - prev_offset = 0; - offset = 0x180; - /* - * The next offset is stored in the 1st word. - * Only parse up until the offset increases: - */ - while (offset > prev_offset) { - /* The block id is stored in the 2nd word */ - if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){ - /* set the pointer past the offset & block id */ - rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4); - break; - } - prev_offset = offset; - offset = *((unsigned short *)(ptr + offset)); - } - if (!rio_table_hdr) { - printk(KERN_DEBUG "Calgary: Unable to locate Rio Grande table " - "in EBDA - bailing!\n"); - return -ENODEV; - } - - ret = build_detail_arrays(); - if (ret) { - printk(KERN_DEBUG "Calgary: build_detail_arrays ret %d\n", ret); - return -ENOMEM; - } - - specified_table_size = determine_tce_table_size(); - - for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { - struct calgary_bus_info *info = &bus_info[bus]; - unsigned short pci_device; - u32 val; - - val = read_pci_config(bus, 0, 0, 0); - pci_device = (val & 0xFFFF0000) >> 16; - - if (!is_cal_pci_dev(pci_device)) - continue; - - if (info->translation_disabled) - continue; - - if (calgary_bus_has_devices(bus, pci_device) || - translate_empty_slots) { - /* - * If it is kdump kernel, find and use tce tables - * from first kernel, else allocate tce tables here - */ - if (!is_kdump_kernel()) { - tbl = alloc_tce_table(); - if (!tbl) - goto cleanup; - info->tce_space = tbl; - } - calgary_found = 1; - } - } - - printk(KERN_DEBUG "Calgary: finished detection, Calgary %s\n", - calgary_found ? "found" : "not found"); - - if (calgary_found) { - iommu_detected = 1; - calgary_detected = 1; - printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n"); - printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n", - specified_table_size); - - x86_init.iommu.iommu_init = calgary_iommu_init; - } - return calgary_found; - -cleanup: - for (--bus; bus >= 0; --bus) { - struct calgary_bus_info *info = &bus_info[bus]; - - if (info->tce_space) - free_tce_table(info->tce_space); - } - return -ENOMEM; -} - -static int __init calgary_parse_options(char *p) -{ - unsigned int bridge; - unsigned long val; - size_t len; - ssize_t ret; - - while (*p) { - if (!strncmp(p, "64k", 3)) - specified_table_size = TCE_TABLE_SIZE_64K; - else if (!strncmp(p, "128k", 4)) - specified_table_size = TCE_TABLE_SIZE_128K; - else if (!strncmp(p, "256k", 4)) - specified_table_size = TCE_TABLE_SIZE_256K; - else if (!strncmp(p, "512k", 4)) - specified_table_size = TCE_TABLE_SIZE_512K; - else if (!strncmp(p, "1M", 2)) - specified_table_size = TCE_TABLE_SIZE_1M; - else if (!strncmp(p, "2M", 2)) - specified_table_size = TCE_TABLE_SIZE_2M; - else if (!strncmp(p, "4M", 2)) - specified_table_size = TCE_TABLE_SIZE_4M; - else if (!strncmp(p, "8M", 2)) - specified_table_size = TCE_TABLE_SIZE_8M; - - len = strlen("translate_empty_slots"); - if (!strncmp(p, "translate_empty_slots", len)) - translate_empty_slots = 1; - - len = strlen("disable"); - if (!strncmp(p, "disable", len)) { - p += len; - if (*p == '=') - ++p; - if (*p == '\0') - break; - ret = kstrtoul(p, 0, &val); - if (ret) - break; - - bridge = val; - if (bridge < MAX_PHB_BUS_NUM) { - printk(KERN_INFO "Calgary: disabling " - "translation for PHB %#x\n", bridge); - bus_info[bridge].translation_disabled = 1; - } - } - - p = strpbrk(p, ","); - if (!p) - break; - - p++; /* skip ',' */ - } - return 1; -} -__setup("calgary=", calgary_parse_options); - -static void __init calgary_fixup_one_tce_space(struct pci_dev *dev) -{ - struct iommu_table *tbl; - unsigned int npages; - int i; - - tbl = pci_iommu(dev->bus); - - for (i = 0; i < 4; i++) { - struct resource *r = &dev->resource[PCI_BRIDGE_RESOURCES + i]; - - /* Don't give out TCEs that map MEM resources */ - if (!(r->flags & IORESOURCE_MEM)) - continue; - - /* 0-based? we reserve the whole 1st MB anyway */ - if (!r->start) - continue; - - /* cover the whole region */ - npages = resource_size(r) >> PAGE_SHIFT; - npages++; - - iommu_range_reserve(tbl, r->start, npages); - } -} - -static int __init calgary_fixup_tce_spaces(void) -{ - struct pci_dev *dev = NULL; - struct calgary_bus_info *info; - - if (no_iommu || swiotlb || !calgary_detected) - return -ENODEV; - - printk(KERN_DEBUG "Calgary: fixing up tce spaces\n"); - - do { - dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); - if (!dev) - break; - if (!is_cal_pci_dev(dev->device)) - continue; - - info = &bus_info[dev->bus->number]; - if (info->translation_disabled) - continue; - - if (!info->tce_space) - continue; - - calgary_fixup_one_tce_space(dev); - - } while (1); - - return 0; -} - -/* - * We need to be call after pcibios_assign_resources (fs_initcall level) - * and before device_initcall. - */ -rootfs_initcall(calgary_fixup_tce_spaces); - -IOMMU_INIT_POST(detect_calgary); diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index fa4352dce491..57de2ebff7e2 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -12,7 +12,6 @@ #include <asm/dma.h> #include <asm/iommu.h> #include <asm/gart.h> -#include <asm/calgary.h> #include <asm/x86_init.h> #include <asm/iommu_table.h> @@ -112,11 +111,6 @@ static __init int iommu_setup(char *p) gart_parse_options(p); -#ifdef CONFIG_CALGARY_IOMMU - if (!strncmp(p, "calgary", 7)) - use_calgary = 1; -#endif /* CONFIG_CALGARY_IOMMU */ - p += strcspn(p, ","); if (*p == ',') ++p; diff --git a/arch/x86/kernel/process.h b/arch/x86/kernel/process.h index 320ab978fb1f..1d0797b2338a 100644 --- a/arch/x86/kernel/process.h +++ b/arch/x86/kernel/process.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // // Code shared between 32 and 64 bit diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 591e885a852e..d398afd206b8 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -445,6 +445,12 @@ static void __init memblock_x86_reserve_range_setup_data(void) while (pa_data) { data = early_memremap(pa_data, sizeof(*data)); memblock_reserve(pa_data, sizeof(*data) + data->len); + + if (data->type == SETUP_INDIRECT && + ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) + memblock_reserve(((struct setup_indirect *)data->data)->addr, + ((struct setup_indirect *)data->data)->len); + pa_data = data->next; early_memunmap(data, sizeof(*data)); } @@ -466,7 +472,7 @@ static void __init memblock_x86_reserve_range_setup_data(void) * due to mapping restrictions. * * On 64bit, kdump kernel need be restricted to be under 64TB, which is - * the upper limit of system RAM in 4-level paing mode. Since the kdump + * the upper limit of system RAM in 4-level paging mode. Since the kdump * jumping could be from 5-level to 4-level, the jumping will fail if * kernel is put above 64TB, and there's no way to detect the paging mode * of the kernel which will be loaded for dumping during the 1st kernel @@ -750,8 +756,8 @@ static void __init trim_bios_range(void) e820__range_update(0, PAGE_SIZE, E820_TYPE_RAM, E820_TYPE_RESERVED); /* - * special case: Some BIOSen report the PC BIOS - * area (640->1Mb) as ram even though it is not. + * special case: Some BIOSes report the PC BIOS + * area (640Kb -> 1Mb) as RAM even though it is not. * take them out. */ e820__range_remove(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_TYPE_RAM, 1); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 86663874ef04..e6d7894ad127 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -207,8 +207,8 @@ void __init setup_per_cpu_areas(void) pcpu_cpu_distance, pcpu_fc_alloc, pcpu_fc_free); if (rc < 0) - pr_warning("%s allocator failed (%d), falling back to page size\n", - pcpu_fc_names[pcpu_chosen_fc], rc); + pr_warn("%s allocator failed (%d), falling back to page size\n", + pcpu_fc_names[pcpu_chosen_fc], rc); } if (rc < 0) rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index a49fe1dcb47e..4c61f0713832 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -57,7 +57,7 @@ void __init tboot_probe(void) */ if (!e820__mapped_any(boot_params.tboot_addr, boot_params.tboot_addr, E820_TYPE_RESERVED)) { - pr_warning("non-0 tboot_addr but it is not of type E820_TYPE_RESERVED\n"); + pr_warn("non-0 tboot_addr but it is not of type E820_TYPE_RESERVED\n"); return; } @@ -65,13 +65,12 @@ void __init tboot_probe(void) set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr); tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE); if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) { - pr_warning("tboot at 0x%llx is invalid\n", - boot_params.tboot_addr); + pr_warn("tboot at 0x%llx is invalid\n", boot_params.tboot_addr); tboot = NULL; return; } if (tboot->version < 5) { - pr_warning("tboot version is invalid: %u\n", tboot->version); + pr_warn("tboot version is invalid: %u\n", tboot->version); tboot = NULL; return; } @@ -289,7 +288,7 @@ static int tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control) if (sleep_state >= ACPI_S_STATE_COUNT || acpi_shutdown_map[sleep_state] == -1) { - pr_warning("unsupported sleep state 0x%x\n", sleep_state); + pr_warn("unsupported sleep state 0x%x\n", sleep_state); return -1; } @@ -302,7 +301,7 @@ static int tboot_extended_sleep(u8 sleep_state, u32 val_a, u32 val_b) if (!tboot_enabled()) return 0; - pr_warning("tboot is not able to suspend on platforms with reduced hardware sleep (ACPIv5)"); + pr_warn("tboot is not able to suspend on platforms with reduced hardware sleep (ACPIv5)"); return -ENODEV; } @@ -320,7 +319,7 @@ static int tboot_wait_for_aps(int num_aps) } if (timeout) - pr_warning("tboot wait for APs timeout\n"); + pr_warn("tboot wait for APs timeout\n"); return !(atomic_read((atomic_t *)&tboot->num_in_wfs) == num_aps); } @@ -516,7 +515,7 @@ int tboot_force_iommu(void) return 1; if (no_iommu || swiotlb || dmar_disabled) - pr_warning("Forcing Intel-IOMMU to enabled\n"); + pr_warn("Forcing Intel-IOMMU to enabled\n"); dmar_disabled = 0; #ifdef CONFIG_SWIOTLB diff --git a/arch/x86/kernel/tce_64.c b/arch/x86/kernel/tce_64.c deleted file mode 100644 index 6384be751eff..000000000000 --- a/arch/x86/kernel/tce_64.c +++ /dev/null @@ -1,177 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * This file manages the translation entries for the IBM Calgary IOMMU. - * - * Derived from arch/powerpc/platforms/pseries/iommu.c - * - * Copyright (C) IBM Corporation, 2006 - * - * Author: Jon Mason <jdmason@us.ibm.com> - * Author: Muli Ben-Yehuda <muli@il.ibm.com> - */ - -#include <linux/types.h> -#include <linux/slab.h> -#include <linux/mm.h> -#include <linux/spinlock.h> -#include <linux/string.h> -#include <linux/pci.h> -#include <linux/dma-mapping.h> -#include <linux/memblock.h> -#include <asm/tce.h> -#include <asm/calgary.h> -#include <asm/proto.h> -#include <asm/cacheflush.h> - -/* flush a tce at 'tceaddr' to main memory */ -static inline void flush_tce(void* tceaddr) -{ - /* a single tce can't cross a cache line */ - if (boot_cpu_has(X86_FEATURE_CLFLUSH)) - clflush(tceaddr); - else - wbinvd(); -} - -void tce_build(struct iommu_table *tbl, unsigned long index, - unsigned int npages, unsigned long uaddr, int direction) -{ - u64* tp; - u64 t; - u64 rpn; - - t = (1 << TCE_READ_SHIFT); - if (direction != DMA_TO_DEVICE) - t |= (1 << TCE_WRITE_SHIFT); - - tp = ((u64*)tbl->it_base) + index; - - while (npages--) { - rpn = (virt_to_bus((void*)uaddr)) >> PAGE_SHIFT; - t &= ~TCE_RPN_MASK; - t |= (rpn << TCE_RPN_SHIFT); - - *tp = cpu_to_be64(t); - flush_tce(tp); - - uaddr += PAGE_SIZE; - tp++; - } -} - -void tce_free(struct iommu_table *tbl, long index, unsigned int npages) -{ - u64* tp; - - tp = ((u64*)tbl->it_base) + index; - - while (npages--) { - *tp = cpu_to_be64(0); - flush_tce(tp); - tp++; - } -} - -static inline unsigned int table_size_to_number_of_entries(unsigned char size) -{ - /* - * size is the order of the table, 0-7 - * smallest table is 8K entries, so shift result by 13 to - * multiply by 8K - */ - return (1 << size) << 13; -} - -static int tce_table_setparms(struct pci_dev *dev, struct iommu_table *tbl) -{ - unsigned int bitmapsz; - unsigned long bmppages; - int ret; - - tbl->it_busno = dev->bus->number; - - /* set the tce table size - measured in entries */ - tbl->it_size = table_size_to_number_of_entries(specified_table_size); - - /* - * number of bytes needed for the bitmap size in number of - * entries; we need one bit per entry - */ - bitmapsz = tbl->it_size / BITS_PER_BYTE; - bmppages = __get_free_pages(GFP_KERNEL, get_order(bitmapsz)); - if (!bmppages) { - printk(KERN_ERR "Calgary: cannot allocate bitmap\n"); - ret = -ENOMEM; - goto done; - } - - tbl->it_map = (unsigned long*)bmppages; - - memset(tbl->it_map, 0, bitmapsz); - - tbl->it_hint = 0; - - spin_lock_init(&tbl->it_lock); - - return 0; - -done: - return ret; -} - -int __init build_tce_table(struct pci_dev *dev, void __iomem *bbar) -{ - struct iommu_table *tbl; - int ret; - - if (pci_iommu(dev->bus)) { - printk(KERN_ERR "Calgary: dev %p has sysdata->iommu %p\n", - dev, pci_iommu(dev->bus)); - BUG(); - } - - tbl = kzalloc(sizeof(struct iommu_table), GFP_KERNEL); - if (!tbl) { - printk(KERN_ERR "Calgary: error allocating iommu_table\n"); - ret = -ENOMEM; - goto done; - } - - ret = tce_table_setparms(dev, tbl); - if (ret) - goto free_tbl; - - tbl->bbar = bbar; - - set_pci_iommu(dev->bus, tbl); - - return 0; - -free_tbl: - kfree(tbl); -done: - return ret; -} - -void * __init alloc_tce_table(void) -{ - unsigned int size; - - size = table_size_to_number_of_entries(specified_table_size); - size *= TCE_ENTRY_SIZE; - - return memblock_alloc_low(size, size); -} - -void __init free_tce_table(void *tbl) -{ - unsigned int size; - - if (!tbl) - return; - - size = table_size_to_number_of_entries(specified_table_size); - size *= TCE_ENTRY_SIZE; - - memblock_free(__pa(tbl), size); -} diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 4bb0f8447112..c90312146da0 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -37,11 +37,6 @@ #include <linux/mm.h> #include <linux/smp.h> #include <linux/io.h> - -#if defined(CONFIG_EDAC) -#include <linux/edac.h> -#endif - #include <asm/stacktrace.h> #include <asm/processor.h> #include <asm/debugreg.h> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index c59454c382fd..7e322e2daaf5 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1505,6 +1505,9 @@ void __init tsc_init(void) return; } + if (tsc_clocksource_reliable || no_tsc_watchdog) + clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY; + clocksource_register_khz(&clocksource_tsc_early, tsc_khz); detect_art(); } diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index ec534f978867..b8acf639abd1 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -364,12 +364,12 @@ retry: /* Force it to 0 if random warps brought us here */ atomic_set(&test_runs, 0); - pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n", + pr_warn("TSC synchronization [CPU#%d -> CPU#%d]:\n", smp_processor_id(), cpu); - pr_warning("Measured %Ld cycles TSC warp between CPUs, " - "turning off TSC clock.\n", max_warp); + pr_warn("Measured %Ld cycles TSC warp between CPUs, " + "turning off TSC clock.\n", max_warp); if (random_warps) - pr_warning("TSC warped randomly between CPUs\n"); + pr_warn("TSC warped randomly between CPUs\n"); mark_tsc_unstable("check_tsc_sync_source failed"); } diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c index 548fefed71ee..4d732a444711 100644 --- a/arch/x86/kernel/umip.c +++ b/arch/x86/kernel/umip.c @@ -1,6 +1,6 @@ /* - * umip.c Emulation for instruction protected by the Intel User-Mode - * Instruction Prevention feature + * umip.c Emulation for instruction protected by the User-Mode Instruction + * Prevention feature * * Copyright (c) 2017, Intel Corporation. * Ricardo Neri <ricardo.neri-calderon@linux.intel.com> @@ -18,10 +18,10 @@ /** DOC: Emulation for User-Mode Instruction Prevention (UMIP) * - * The feature User-Mode Instruction Prevention present in recent Intel - * processor prevents a group of instructions (SGDT, SIDT, SLDT, SMSW and STR) - * from being executed with CPL > 0. Otherwise, a general protection fault is - * issued. + * User-Mode Instruction Prevention is a security feature present in recent + * x86 processors that, when enabled, prevents a group of instructions (SGDT, + * SIDT, SLDT, SMSW and STR) from being run in user mode by issuing a general + * protection fault if the instruction is executed with CPL > 0. * * Rather than relaying to the user space the general protection fault caused by * the UMIP-protected instructions (in the form of a SIGSEGV signal), it can be @@ -91,7 +91,7 @@ const char * const umip_insns[5] = { #define umip_pr_err(regs, fmt, ...) \ umip_printk(regs, KERN_ERR, fmt, ##__VA_ARGS__) -#define umip_pr_warning(regs, fmt, ...) \ +#define umip_pr_warn(regs, fmt, ...) \ umip_printk(regs, KERN_WARNING, fmt, ##__VA_ARGS__) /** @@ -380,14 +380,14 @@ bool fixup_umip_exception(struct pt_regs *regs) if (umip_inst < 0) return false; - umip_pr_warning(regs, "%s instruction cannot be used by applications.\n", + umip_pr_warn(regs, "%s instruction cannot be used by applications.\n", umip_insns[umip_inst]); /* Do not emulate (spoof) SLDT or STR. */ if (umip_inst == UMIP_INST_STR || umip_inst == UMIP_INST_SLDT) return false; - umip_pr_warning(regs, "For now, expensive software emulation returns the result.\n"); + umip_pr_warn(regs, "For now, expensive software emulation returns the result.\n"); if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size, user_64bit_mode(regs))) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 8cd745ef8c7b..15e5aad8ac2c 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -842,8 +842,8 @@ static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) /** * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. + * @auprobe: the probepoint information. * @mm: the probed address space. - * @arch_uprobe: the probepoint information. * @addr: virtual address at which to install the probepoint * Return 0 on success or a -ve number on error. */ diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 18a799c8fa28..ce89430a7f80 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -31,6 +31,28 @@ static int __init iommu_init_noop(void) { return 0; } static void iommu_shutdown_noop(void) { } bool __init bool_x86_init_noop(void) { return false; } void x86_op_int_noop(int cpu) { } +static __init int set_rtc_noop(const struct timespec64 *now) { return -EINVAL; } +static __init void get_rtc_noop(struct timespec64 *now) { } + +static __initconst const struct of_device_id of_cmos_match[] = { + { .compatible = "motorola,mc146818" }, + {} +}; + +/* + * Allow devicetree configured systems to disable the RTC by setting the + * corresponding DT node's status property to disabled. Code is optimized + * out for CONFIG_OF=n builds. + */ +static __init void x86_wallclock_init(void) +{ + struct device_node *node = of_find_matching_node(NULL, of_cmos_match); + + if (node && !of_device_is_available(node)) { + x86_platform.get_wallclock = get_rtc_noop; + x86_platform.set_wallclock = set_rtc_noop; + } +} /* * The platform setup functions are preset with the default functions @@ -73,7 +95,7 @@ struct x86_init_ops x86_init __initdata = { .timers = { .setup_percpu_clockev = setup_boot_APIC_clock, .timer_init = hpet_time_init, - .wallclock_init = x86_init_noop, + .wallclock_init = x86_wallclock_init, }, .iommu = { diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 31ecf7a76d5a..b19ef421084d 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -8,9 +8,9 @@ kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ $(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o -kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ +kvm-y += x86.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ - hyperv.o page_track.o debugfs.o + hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o vmx/evmcs.o vmx/nested.o kvm-amd-y += svm.o pmu_amd.o diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 63316036f85a..c0aa07487eb8 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -363,7 +363,7 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index) /* cpuid 7.0.ecx*/ const u32 kvm_cpuid_7_0_ecx_x86_features = - F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | + F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) | F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/; @@ -485,6 +485,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 0x80000008.ebx */ const u32 kvm_cpuid_8000_0008_ebx_x86_features = + F(CLZERO) | F(XSAVEERPTR) | F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON); @@ -618,16 +619,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, */ case 0x1f: case 0xb: { - int i, level_type; + int i; - /* read more entries until level_type is zero */ - for (i = 1; ; ++i) { + /* + * We filled in entry[0] for CPUID(EAX=<function>, + * ECX=00H) above. If its level type (ECX[15:8]) is + * zero, then the leaf is unimplemented, and we're + * done. Otherwise, continue to populate entries + * until the level type (ECX[15:8]) of the previously + * added entry is zero. + */ + for (i = 1; entry[i - 1].ecx & 0xff00; ++i) { if (*nent >= maxnent) goto out; - level_type = entry[i - 1].ecx & 0xff00; - if (!level_type) - break; do_host_cpuid(&entry[i], function, i); ++*nent; } @@ -811,8 +816,6 @@ static int do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 func, return __do_cpuid_func(entry, func, nent, maxnent); } -#undef F - struct kvm_cpuid_param { u32 func; bool (*qualifier)(const struct kvm_cpuid_param *param); @@ -969,53 +972,72 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); /* - * If no match is found, check whether we exceed the vCPU's limit - * and return the content of the highest valid _standard_ leaf instead. - * This is to satisfy the CPUID specification. + * If the basic or extended CPUID leaf requested is higher than the + * maximum supported basic or extended leaf, respectively, then it is + * out of range. */ -static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu, - u32 function, u32 index) +static bool cpuid_function_in_range(struct kvm_vcpu *vcpu, u32 function) { - struct kvm_cpuid_entry2 *maxlevel; - - maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); - if (!maxlevel || maxlevel->eax >= function) - return NULL; - if (function & 0x80000000) { - maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0); - if (!maxlevel) - return NULL; - } - return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index); + struct kvm_cpuid_entry2 *max; + + max = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); + return max && function <= max->eax; } bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit) { u32 function = *eax, index = *ecx; - struct kvm_cpuid_entry2 *best; - bool entry_found = true; - - best = kvm_find_cpuid_entry(vcpu, function, index); - - if (!best) { - entry_found = false; - if (!check_limit) - goto out; + struct kvm_cpuid_entry2 *entry; + struct kvm_cpuid_entry2 *max; + bool found; - best = check_cpuid_limit(vcpu, function, index); + entry = kvm_find_cpuid_entry(vcpu, function, index); + found = entry; + /* + * Intel CPUID semantics treats any query for an out-of-range + * leaf as if the highest basic leaf (i.e. CPUID.0H:EAX) were + * requested. AMD CPUID semantics returns all zeroes for any + * undefined leaf, whether or not the leaf is in range. + */ + if (!entry && check_limit && !guest_cpuid_is_amd(vcpu) && + !cpuid_function_in_range(vcpu, function)) { + max = kvm_find_cpuid_entry(vcpu, 0, 0); + if (max) { + function = max->eax; + entry = kvm_find_cpuid_entry(vcpu, function, index); + } } - -out: - if (best) { - *eax = best->eax; - *ebx = best->ebx; - *ecx = best->ecx; - *edx = best->edx; - } else + if (entry) { + *eax = entry->eax; + *ebx = entry->ebx; + *ecx = entry->ecx; + *edx = entry->edx; + if (function == 7 && index == 0) { + u64 data; + if (!__kvm_get_msr(vcpu, MSR_IA32_TSX_CTRL, &data, true) && + (data & TSX_CTRL_CPUID_CLEAR)) + *ebx &= ~(F(RTM) | F(HLE)); + } + } else { *eax = *ebx = *ecx = *edx = 0; - trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, entry_found); - return entry_found; + /* + * When leaf 0BH or 1FH is defined, CL is pass-through + * and EDX is always the x2APIC ID, even for undefined + * subleaves. Index 1 will exist iff the leaf is + * implemented, so we pass through CL iff leaf 1 + * exists. EDX can be copied from any existing index. + */ + if (function == 0xb || function == 0x1f) { + entry = kvm_find_cpuid_entry(vcpu, function, 1); + if (entry) { + *ecx = index & 0xff; + *edx = entry->edx; + } + } + } + trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, found); + return found; } EXPORT_SYMBOL_GPL(kvm_cpuid); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 698efb8c3897..952d1a4f4d7e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2770,11 +2770,10 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) return emulate_ud(ctxt); ops->get_msr(ctxt, MSR_EFER, &efer); - setup_syscalls_segments(ctxt, &cs, &ss); - if (!(efer & EFER_SCE)) return emulate_ud(ctxt); + setup_syscalls_segments(ctxt, &cs, &ss); ops->get_msr(ctxt, MSR_STAR, &msr_data); msr_data >>= 32; cs_sel = (u16)(msr_data & 0xfffc); @@ -2838,12 +2837,11 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) if (ctxt->mode == X86EMUL_MODE_PROT64) return X86EMUL_UNHANDLEABLE; - setup_syscalls_segments(ctxt, &cs, &ss); - ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data); if ((msr_data & 0xfffc) == 0x0) return emulate_gp(ctxt, 0); + setup_syscalls_segments(ctxt, &cs, &ss); ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF); cs_sel = (u16)msr_data & ~SEGMENT_RPL_MASK; ss_sel = cs_sel + 8; diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index d859ae8890d0..9fd2dd89a1c5 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -271,8 +271,9 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) { unsigned index; bool mask_before, mask_after; - int old_remote_irr, old_delivery_status; union kvm_ioapic_redirect_entry *e; + unsigned long vcpu_bitmap; + int old_remote_irr, old_delivery_status, old_dest_id, old_dest_mode; switch (ioapic->ioregsel) { case IOAPIC_REG_VERSION: @@ -296,6 +297,8 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) /* Preserve read-only fields */ old_remote_irr = e->fields.remote_irr; old_delivery_status = e->fields.delivery_status; + old_dest_id = e->fields.dest_id; + old_dest_mode = e->fields.dest_mode; if (ioapic->ioregsel & 1) { e->bits &= 0xffffffff; e->bits |= (u64) val << 32; @@ -321,7 +324,34 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG && ioapic->irr & (1 << index)) ioapic_service(ioapic, index, false); - kvm_make_scan_ioapic_request(ioapic->kvm); + if (e->fields.delivery_mode == APIC_DM_FIXED) { + struct kvm_lapic_irq irq; + + irq.shorthand = 0; + irq.vector = e->fields.vector; + irq.delivery_mode = e->fields.delivery_mode << 8; + irq.dest_id = e->fields.dest_id; + irq.dest_mode = e->fields.dest_mode; + bitmap_zero(&vcpu_bitmap, 16); + kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq, + &vcpu_bitmap); + if (old_dest_mode != e->fields.dest_mode || + old_dest_id != e->fields.dest_id) { + /* + * Update vcpu_bitmap with vcpus specified in + * the previous request as well. This is done to + * keep ioapic_handled_vectors synchronized. + */ + irq.dest_id = old_dest_id; + irq.dest_mode = old_dest_mode; + kvm_bitmap_or_dest_vcpus(ioapic->kvm, &irq, + &vcpu_bitmap); + } + kvm_make_scan_ioapic_request_mask(ioapic->kvm, + &vcpu_bitmap); + } else { + kvm_make_scan_ioapic_request(ioapic->kvm); + } break; } } diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 1cc6c47dc77e..58767020de41 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -37,22 +37,50 @@ BUILD_KVM_GPR_ACCESSORS(r14, R14) BUILD_KVM_GPR_ACCESSORS(r15, R15) #endif -static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, - enum kvm_reg reg) +static inline bool kvm_register_is_available(struct kvm_vcpu *vcpu, + enum kvm_reg reg) { - if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail)) + return test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); +} + +static inline bool kvm_register_is_dirty(struct kvm_vcpu *vcpu, + enum kvm_reg reg) +{ + return test_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); +} + +static inline void kvm_register_mark_available(struct kvm_vcpu *vcpu, + enum kvm_reg reg) +{ + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); +} + +static inline void kvm_register_mark_dirty(struct kvm_vcpu *vcpu, + enum kvm_reg reg) +{ + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); +} + +static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, int reg) +{ + if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS)) + return 0; + + if (!kvm_register_is_available(vcpu, reg)) kvm_x86_ops->cache_reg(vcpu, reg); return vcpu->arch.regs[reg]; } -static inline void kvm_register_write(struct kvm_vcpu *vcpu, - enum kvm_reg reg, +static inline void kvm_register_write(struct kvm_vcpu *vcpu, int reg, unsigned long val) { + if (WARN_ON_ONCE((unsigned int)reg >= NR_VCPU_REGS)) + return; + vcpu->arch.regs[reg] = val; - __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); - __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); + kvm_register_mark_dirty(vcpu, reg); } static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu) @@ -79,9 +107,8 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) { might_sleep(); /* on svm */ - if (!test_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_avail)) - kvm_x86_ops->cache_reg(vcpu, (enum kvm_reg)VCPU_EXREG_PDPTR); + if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR)) + kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); return vcpu->arch.walk_mmu->pdptrs[index]; } @@ -109,8 +136,8 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu) { - if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) - kvm_x86_ops->decache_cr3(vcpu); + if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) + kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_CR3); return vcpu->arch.cr3; } diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3a3a6854dcca..cf9177b4a07f 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -66,9 +66,10 @@ #define X2APIC_BROADCAST 0xFFFFFFFFul static bool lapic_timer_advance_dynamic __read_mostly; -#define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100 -#define LAPIC_TIMER_ADVANCE_ADJUST_MAX 5000 -#define LAPIC_TIMER_ADVANCE_ADJUST_INIT 1000 +#define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100 /* clock cycles */ +#define LAPIC_TIMER_ADVANCE_ADJUST_MAX 10000 /* clock cycles */ +#define LAPIC_TIMER_ADVANCE_NS_INIT 1000 +#define LAPIC_TIMER_ADVANCE_NS_MAX 5000 /* step-by-step approximation to mitigate fluctuation */ #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 @@ -110,11 +111,6 @@ static inline int apic_enabled(struct kvm_lapic *apic) (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) -static inline u8 kvm_xapic_id(struct kvm_lapic *apic) -{ - return kvm_lapic_get_reg(apic, APIC_ID) >> 24; -} - static inline u32 kvm_x2apic_id(struct kvm_lapic *apic) { return apic->vcpu->vcpu_id; @@ -561,60 +557,53 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, irq->level, irq->trig_mode, dest_map); } +static int __pv_send_ipi(unsigned long *ipi_bitmap, struct kvm_apic_map *map, + struct kvm_lapic_irq *irq, u32 min) +{ + int i, count = 0; + struct kvm_vcpu *vcpu; + + if (min > map->max_apic_id) + return 0; + + for_each_set_bit(i, ipi_bitmap, + min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) { + if (map->phys_map[min + i]) { + vcpu = map->phys_map[min + i]->vcpu; + count += kvm_apic_set_irq(vcpu, irq, NULL); + } + } + + return count; +} + int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, unsigned long ipi_bitmap_high, u32 min, unsigned long icr, int op_64_bit) { - int i; struct kvm_apic_map *map; - struct kvm_vcpu *vcpu; struct kvm_lapic_irq irq = {0}; int cluster_size = op_64_bit ? 64 : 32; - int count = 0; + int count; + + if (icr & (APIC_DEST_MASK | APIC_SHORT_MASK)) + return -KVM_EINVAL; irq.vector = icr & APIC_VECTOR_MASK; irq.delivery_mode = icr & APIC_MODE_MASK; irq.level = (icr & APIC_INT_ASSERT) != 0; irq.trig_mode = icr & APIC_INT_LEVELTRIG; - if (icr & APIC_DEST_MASK) - return -KVM_EINVAL; - if (icr & APIC_SHORT_MASK) - return -KVM_EINVAL; - rcu_read_lock(); map = rcu_dereference(kvm->arch.apic_map); - if (unlikely(!map)) { - count = -EOPNOTSUPP; - goto out; - } - - if (min > map->max_apic_id) - goto out; - /* Bits above cluster_size are masked in the caller. */ - for_each_set_bit(i, &ipi_bitmap_low, - min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) { - if (map->phys_map[min + i]) { - vcpu = map->phys_map[min + i]->vcpu; - count += kvm_apic_set_irq(vcpu, &irq, NULL); - } - } - - min += cluster_size; - - if (min > map->max_apic_id) - goto out; - - for_each_set_bit(i, &ipi_bitmap_high, - min((u32)BITS_PER_LONG, (map->max_apic_id - min + 1))) { - if (map->phys_map[min + i]) { - vcpu = map->phys_map[min + i]->vcpu; - count += kvm_apic_set_irq(vcpu, &irq, NULL); - } + count = -EOPNOTSUPP; + if (likely(map)) { + count = __pv_send_ipi(&ipi_bitmap_low, map, &irq, min); + min += cluster_size; + count += __pv_send_ipi(&ipi_bitmap_high, map, &irq, min); } -out: rcu_read_unlock(); return count; } @@ -1128,6 +1117,50 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, return result; } +/* + * This routine identifies the destination vcpus mask meant to receive the + * IOAPIC interrupts. It either uses kvm_apic_map_get_dest_lapic() to find + * out the destination vcpus array and set the bitmap or it traverses to + * each available vcpu to identify the same. + */ +void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq, + unsigned long *vcpu_bitmap) +{ + struct kvm_lapic **dest_vcpu = NULL; + struct kvm_lapic *src = NULL; + struct kvm_apic_map *map; + struct kvm_vcpu *vcpu; + unsigned long bitmap; + int i, vcpu_idx; + bool ret; + + rcu_read_lock(); + map = rcu_dereference(kvm->arch.apic_map); + + ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dest_vcpu, + &bitmap); + if (ret) { + for_each_set_bit(i, &bitmap, 16) { + if (!dest_vcpu[i]) + continue; + vcpu_idx = dest_vcpu[i]->vcpu->vcpu_idx; + __set_bit(vcpu_idx, vcpu_bitmap); + } + } else { + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!kvm_apic_present(vcpu)) + continue; + if (!kvm_apic_match_dest(vcpu, NULL, + irq->delivery_mode, + irq->dest_id, + irq->dest_mode)) + continue; + __set_bit(i, vcpu_bitmap); + } + } + rcu_read_unlock(); +} + int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) { return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; @@ -1504,8 +1537,8 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu, timer_advance_ns += ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP; } - if (unlikely(timer_advance_ns > LAPIC_TIMER_ADVANCE_ADJUST_MAX)) - timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT; + if (unlikely(timer_advance_ns > LAPIC_TIMER_ADVANCE_NS_MAX)) + timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT; apic->lapic_timer.timer_advance_ns = timer_advance_ns; } @@ -2302,7 +2335,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) HRTIMER_MODE_ABS_HARD); apic->lapic_timer.timer.function = apic_timer_fn; if (timer_advance_ns == -1) { - apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT; + apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT; lapic_timer_advance_dynamic = true; } else { apic->lapic_timer.timer_advance_ns = timer_advance_ns; @@ -2713,7 +2746,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) * KVM_MP_STATE_INIT_RECEIVED state), just eat SIPIs * and leave the INIT pending. */ - if (is_smm(vcpu) || kvm_x86_ops->apic_init_signal_blocked(vcpu)) { + if (kvm_vcpu_latch_init(vcpu)) { WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED); if (test_bit(KVM_APIC_SIPI, &apic->pending_events)) clear_bit(KVM_APIC_SIPI, &apic->pending_events); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 2aad7e226fc0..39925afdfcdc 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -226,6 +226,9 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu); +void kvm_bitmap_or_dest_vcpus(struct kvm *kvm, struct kvm_lapic_irq *irq, + unsigned long *vcpu_bitmap); + bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu); int kvm_vector_to_index(u32 vector, u32 dest_vcpus, @@ -242,4 +245,9 @@ static inline enum lapic_mode kvm_apic_mode(u64 apic_base) return apic_base & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE); } +static inline u8 kvm_xapic_id(struct kvm_lapic *apic) +{ + return kvm_lapic_get_reg(apic, APIC_ID) >> 24; +} + #endif diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 11f8ec89433b..d55674f44a18 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -210,4 +210,8 @@ void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn); int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); + +int kvm_mmu_post_init_vm(struct kvm *kvm); +void kvm_mmu_pre_destroy_vm(struct kvm *kvm); + #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu/mmu.c index 5269aa057dfa..6f92b40d798c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -37,6 +37,7 @@ #include <linux/uaccess.h> #include <linux/hash.h> #include <linux/kern_levels.h> +#include <linux/kthread.h> #include <asm/page.h> #include <asm/pat.h> @@ -47,6 +48,35 @@ #include <asm/kvm_page_track.h> #include "trace.h" +extern bool itlb_multihit_kvm_mitigation; + +static int __read_mostly nx_huge_pages = -1; +#ifdef CONFIG_PREEMPT_RT +/* Recovery can cause latency spikes, disable it for PREEMPT_RT. */ +static uint __read_mostly nx_huge_pages_recovery_ratio = 0; +#else +static uint __read_mostly nx_huge_pages_recovery_ratio = 60; +#endif + +static int set_nx_huge_pages(const char *val, const struct kernel_param *kp); +static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp); + +static struct kernel_param_ops nx_huge_pages_ops = { + .set = set_nx_huge_pages, + .get = param_get_bool, +}; + +static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = { + .set = set_nx_huge_pages_recovery_ratio, + .get = param_get_uint, +}; + +module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644); +__MODULE_PARM_TYPE(nx_huge_pages, "bool"); +module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops, + &nx_huge_pages_recovery_ratio, 0644); +__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint"); + /* * When setting this variable to true it enables Two-Dimensional-Paging * where the hardware walks 2 page tables: @@ -83,7 +113,17 @@ module_param(dbg, bool, 0644); #define PTE_PREFETCH_NUM 8 #define PT_FIRST_AVAIL_BITS_SHIFT 10 -#define PT64_SECOND_AVAIL_BITS_SHIFT 52 +#define PT64_SECOND_AVAIL_BITS_SHIFT 54 + +/* + * The mask used to denote special SPTEs, which can be either MMIO SPTEs or + * Access Tracking SPTEs. + */ +#define SPTE_SPECIAL_MASK (3ULL << 52) +#define SPTE_AD_ENABLED_MASK (0ULL << 52) +#define SPTE_AD_DISABLED_MASK (1ULL << 52) +#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52) +#define SPTE_MMIO_MASK (3ULL << 52) #define PT64_LEVEL_BITS 9 @@ -219,12 +259,11 @@ static u64 __read_mostly shadow_present_mask; static u64 __read_mostly shadow_me_mask; /* - * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value. - * Non-present SPTEs with shadow_acc_track_value set are in place for access - * tracking. + * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK; + * shadow_acc_track_mask is the set of bits to be cleared in non-accessed + * pages. */ static u64 __read_mostly shadow_acc_track_mask; -static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK; /* * The mask/shift to use for saving the original R/X bits when marking the PTE @@ -304,7 +343,7 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask) { BUG_ON((u64)(unsigned)access_mask != access_mask); BUG_ON((mmio_mask & mmio_value) != mmio_value); - shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK; + shadow_mmio_value = mmio_value | SPTE_MMIO_MASK; shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK; shadow_mmio_access_mask = access_mask; } @@ -320,10 +359,32 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp) return sp->role.ad_disabled; } +static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu) +{ + /* + * When using the EPT page-modification log, the GPAs in the log + * would come from L2 rather than L1. Therefore, we need to rely + * on write protection to record dirty pages. This also bypasses + * PML, since writes now result in a vmexit. + */ + return vcpu->arch.mmu == &vcpu->arch.guest_mmu; +} + static inline bool spte_ad_enabled(u64 spte) { MMU_WARN_ON(is_mmio_spte(spte)); - return !(spte & shadow_acc_track_value); + return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK; +} + +static inline bool spte_ad_need_write_protect(u64 spte) +{ + MMU_WARN_ON(is_mmio_spte(spte)); + return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK; +} + +static bool is_nx_huge_page_enabled(void) +{ + return READ_ONCE(nx_huge_pages); } static inline u64 spte_shadow_accessed_mask(u64 spte) @@ -461,7 +522,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, { BUG_ON(!dirty_mask != !accessed_mask); BUG_ON(!accessed_mask && !acc_track_mask); - BUG_ON(acc_track_mask & shadow_acc_track_value); + BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK); shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; @@ -1164,6 +1225,17 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_disallow_lpage(slot, gfn); } +static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + if (sp->lpage_disallowed) + return; + + ++kvm->stat.nx_lpage_splits; + list_add_tail(&sp->lpage_disallowed_link, + &kvm->arch.lpage_disallowed_mmu_pages); + sp->lpage_disallowed = true; +} + static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { struct kvm_memslots *slots; @@ -1181,6 +1253,13 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_gfn_allow_lpage(slot, gfn); } +static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + --kvm->stat.nx_lpage_splits; + sp->lpage_disallowed = false; + list_del(&sp->lpage_disallowed_link); +} + static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level, struct kvm_memory_slot *slot) { @@ -1589,16 +1668,16 @@ static bool spte_clear_dirty(u64 *sptep) rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep); + MMU_WARN_ON(!spte_ad_enabled(spte)); spte &= ~shadow_dirty_mask; - return mmu_spte_update(sptep, spte); } -static bool wrprot_ad_disabled_spte(u64 *sptep) +static bool spte_wrprot_for_clear_dirty(u64 *sptep) { bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT, (unsigned long *)sptep); - if (was_writable) + if (was_writable && !spte_ad_enabled(*sptep)) kvm_set_pfn_dirty(spte_to_pfn(*sptep)); return was_writable; @@ -1617,10 +1696,10 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head) bool flush = false; for_each_rmap_spte(rmap_head, &iter, sptep) - if (spte_ad_enabled(*sptep)) - flush |= spte_clear_dirty(sptep); + if (spte_ad_need_write_protect(*sptep)) + flush |= spte_wrprot_for_clear_dirty(sptep); else - flush |= wrprot_ad_disabled_spte(sptep); + flush |= spte_clear_dirty(sptep); return flush; } @@ -1631,6 +1710,11 @@ static bool spte_set_dirty(u64 *sptep) rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep); + /* + * Similar to the !kvm_x86_ops->slot_disable_log_dirty case, + * do not bother adding back write access to pages marked + * SPTE_AD_WRPROT_ONLY_MASK. + */ spte |= shadow_dirty_mask; return mmu_spte_update(sptep, spte); @@ -2622,7 +2706,7 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep, shadow_user_mask | shadow_x_mask | shadow_me_mask; if (sp_ad_disabled(sp)) - spte |= shadow_acc_track_value; + spte |= SPTE_AD_DISABLED_MASK; else spte |= shadow_accessed_mask; @@ -2761,6 +2845,9 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, kvm_reload_remote_mmus(kvm); } + if (sp->lpage_disallowed) + unaccount_huge_nx_page(kvm, sp); + sp->role.invalid = 1; return list_unstable; } @@ -2968,7 +3055,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, sp = page_header(__pa(sptep)); if (sp_ad_disabled(sp)) - spte |= shadow_acc_track_value; + spte |= SPTE_AD_DISABLED_MASK; + else if (kvm_vcpu_ad_need_write_protect(vcpu)) + spte |= SPTE_AD_WRPROT_ONLY_MASK; /* * For the EPT case, shadow_present_mask is 0 if hardware @@ -2980,6 +3069,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (!speculative) spte |= spte_shadow_accessed_mask(spte); + if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) && + is_nx_huge_page_enabled()) { + pte_access &= ~ACC_EXEC_MASK; + } + if (pte_access & ACC_EXEC_MASK) spte |= shadow_x_mask; else @@ -3200,9 +3294,32 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) __direct_pte_prefetch(vcpu, sp, sptep); } +static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, + gfn_t gfn, kvm_pfn_t *pfnp, int *levelp) +{ + int level = *levelp; + u64 spte = *it.sptep; + + if (it.level == level && level > PT_PAGE_TABLE_LEVEL && + is_nx_huge_page_enabled() && + is_shadow_present_pte(spte) && + !is_large_pte(spte)) { + /* + * A small SPTE exists for this pfn, but FNAME(fetch) + * and __direct_map would like to create a large PTE + * instead: just force them to go down another level, + * patching back for them into pfn the next 9 bits of + * the address. + */ + u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1); + *pfnp |= gfn & page_mask; + (*levelp)--; + } +} + static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, int map_writable, int level, kvm_pfn_t pfn, - bool prefault) + bool prefault, bool lpage_disallowed) { struct kvm_shadow_walk_iterator it; struct kvm_mmu_page *sp; @@ -3215,6 +3332,12 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, trace_kvm_mmu_spte_requested(gpa, level, pfn); for_each_shadow_entry(vcpu, gpa, it) { + /* + * We cannot overwrite existing page tables with an NX + * large page, as the leaf could be executable. + */ + disallowed_hugepage_adjust(it, gfn, &pfn, &level); + base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); if (it.level == level) break; @@ -3225,6 +3348,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write, it.level - 1, true, ACC_ALL); link_shadow_page(vcpu, it.sptep, sp); + if (lpage_disallowed) + account_huge_nx_page(vcpu->kvm, sp); } } @@ -3273,7 +3398,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, * here. */ if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && - level == PT_PAGE_TABLE_LEVEL && + !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && PageTransCompoundMap(pfn_to_page(pfn)) && !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) { unsigned long mask; @@ -3517,11 +3642,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, { int r; int level; - bool force_pt_level = false; + bool force_pt_level; kvm_pfn_t pfn; unsigned long mmu_seq; bool map_writable, write = error_code & PFERR_WRITE_MASK; + bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && + is_nx_huge_page_enabled(); + force_pt_level = lpage_disallowed; level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { /* @@ -3555,7 +3683,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, goto out_unlock; if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); - r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault); + r = __direct_map(vcpu, v, write, map_writable, level, pfn, + prefault, false); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); @@ -4141,6 +4270,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, unsigned long mmu_seq; int write = error_code & PFERR_WRITE_MASK; bool map_writable; + bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && + is_nx_huge_page_enabled(); MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)); @@ -4151,8 +4282,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (r) return r; - force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn, - PT_DIRECTORY_LEVEL); + force_pt_level = + lpage_disallowed || + !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL); level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { if (level > PT_DIRECTORY_LEVEL && @@ -4181,7 +4313,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, goto out_unlock; if (likely(!force_pt_level)) transparent_hugepage_adjust(vcpu, gfn, &pfn, &level); - r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault); + r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, + prefault, lpage_disallowed); out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(pfn); @@ -4262,7 +4395,7 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, kvm_make_request(KVM_REQ_LOAD_CR3, vcpu); if (!skip_tlb_flush) { kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); - kvm_x86_ops->tlb_flush(vcpu, true); + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } /* @@ -5881,9 +6014,9 @@ restart: * the guest, and the guest page table is using 4K page size * mapping if the indirect sp has level = 1. */ - if (sp->role.direct && - !kvm_is_reserved_pfn(pfn) && - PageTransCompoundMap(pfn_to_page(pfn))) { + if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && + !kvm_is_zone_device_pfn(pfn) && + PageTransCompoundMap(pfn_to_page(pfn))) { pte_list_remove(rmap_head, sptep); if (kvm_available_flush_tlb_with_range()) @@ -6122,10 +6255,59 @@ static void kvm_set_mmio_spte_mask(void) kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK); } +static bool get_nx_auto_mode(void) +{ + /* Return true when CPU has the bug, and mitigations are ON */ + return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off(); +} + +static void __set_nx_huge_pages(bool val) +{ + nx_huge_pages = itlb_multihit_kvm_mitigation = val; +} + +static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) +{ + bool old_val = nx_huge_pages; + bool new_val; + + /* In "auto" mode deploy workaround only if CPU has the bug. */ + if (sysfs_streq(val, "off")) + new_val = 0; + else if (sysfs_streq(val, "force")) + new_val = 1; + else if (sysfs_streq(val, "auto")) + new_val = get_nx_auto_mode(); + else if (strtobool(val, &new_val) < 0) + return -EINVAL; + + __set_nx_huge_pages(new_val); + + if (new_val != old_val) { + struct kvm *kvm; + + mutex_lock(&kvm_lock); + + list_for_each_entry(kvm, &vm_list, vm_list) { + mutex_lock(&kvm->slots_lock); + kvm_mmu_zap_all_fast(kvm); + mutex_unlock(&kvm->slots_lock); + + wake_up_process(kvm->arch.nx_lpage_recovery_thread); + } + mutex_unlock(&kvm_lock); + } + + return 0; +} + int kvm_mmu_module_init(void) { int ret = -ENOMEM; + if (nx_huge_pages == -1) + __set_nx_huge_pages(get_nx_auto_mode()); + /* * MMU roles use union aliasing which is, generally speaking, an * undefined behavior. However, we supposedly know how compilers behave @@ -6205,3 +6387,116 @@ void kvm_mmu_module_exit(void) unregister_shrinker(&mmu_shrinker); mmu_audit_disable(); } + +static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp) +{ + unsigned int old_val; + int err; + + old_val = nx_huge_pages_recovery_ratio; + err = param_set_uint(val, kp); + if (err) + return err; + + if (READ_ONCE(nx_huge_pages) && + !old_val && nx_huge_pages_recovery_ratio) { + struct kvm *kvm; + + mutex_lock(&kvm_lock); + + list_for_each_entry(kvm, &vm_list, vm_list) + wake_up_process(kvm->arch.nx_lpage_recovery_thread); + + mutex_unlock(&kvm_lock); + } + + return err; +} + +static void kvm_recover_nx_lpages(struct kvm *kvm) +{ + int rcu_idx; + struct kvm_mmu_page *sp; + unsigned int ratio; + LIST_HEAD(invalid_list); + ulong to_zap; + + rcu_idx = srcu_read_lock(&kvm->srcu); + spin_lock(&kvm->mmu_lock); + + ratio = READ_ONCE(nx_huge_pages_recovery_ratio); + to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0; + while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) { + /* + * We use a separate list instead of just using active_mmu_pages + * because the number of lpage_disallowed pages is expected to + * be relatively small compared to the total. + */ + sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages, + struct kvm_mmu_page, + lpage_disallowed_link); + WARN_ON_ONCE(!sp->lpage_disallowed); + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); + WARN_ON_ONCE(sp->lpage_disallowed); + + if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) { + kvm_mmu_commit_zap_page(kvm, &invalid_list); + if (to_zap) + cond_resched_lock(&kvm->mmu_lock); + } + } + + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, rcu_idx); +} + +static long get_nx_lpage_recovery_timeout(u64 start_time) +{ + return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio) + ? start_time + 60 * HZ - get_jiffies_64() + : MAX_SCHEDULE_TIMEOUT; +} + +static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data) +{ + u64 start_time; + long remaining_time; + + while (true) { + start_time = get_jiffies_64(); + remaining_time = get_nx_lpage_recovery_timeout(start_time); + + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop() && remaining_time > 0) { + schedule_timeout(remaining_time); + remaining_time = get_nx_lpage_recovery_timeout(start_time); + set_current_state(TASK_INTERRUPTIBLE); + } + + set_current_state(TASK_RUNNING); + + if (kthread_should_stop()) + return 0; + + kvm_recover_nx_lpages(kvm); + } +} + +int kvm_mmu_post_init_vm(struct kvm *kvm) +{ + int err; + + err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0, + "kvm-nx-lpage-recovery", + &kvm->arch.nx_lpage_recovery_thread); + if (!err) + kthread_unpark(kvm->arch.nx_lpage_recovery_thread); + + return err; +} + +void kvm_mmu_pre_destroy_vm(struct kvm *kvm) +{ + if (kvm->arch.nx_lpage_recovery_thread) + kthread_stop(kvm->arch.nx_lpage_recovery_thread); +} diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/mmu/page_track.c index 3521e2d176f2..3521e2d176f2 100644 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 7d5cdb3af594..97b21e7fd013 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -614,13 +614,14 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct guest_walker *gw, int write_fault, int hlevel, - kvm_pfn_t pfn, bool map_writable, bool prefault) + kvm_pfn_t pfn, bool map_writable, bool prefault, + bool lpage_disallowed) { struct kvm_mmu_page *sp = NULL; struct kvm_shadow_walk_iterator it; unsigned direct_access, access = gw->pt_access; int top_level, ret; - gfn_t base_gfn; + gfn_t gfn, base_gfn; direct_access = gw->pte_access; @@ -665,13 +666,25 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, link_shadow_page(vcpu, it.sptep, sp); } - base_gfn = gw->gfn; + /* + * FNAME(page_fault) might have clobbered the bottom bits of + * gw->gfn, restore them from the virtual address. + */ + gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT); + base_gfn = gfn; trace_kvm_mmu_spte_requested(addr, gw->level, pfn); for (; shadow_walk_okay(&it); shadow_walk_next(&it)) { clear_sp_write_flooding_count(it.sptep); - base_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); + + /* + * We cannot overwrite existing page tables with an NX + * large page, as the leaf could be executable. + */ + disallowed_hugepage_adjust(it, gfn, &pfn, &hlevel); + + base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); if (it.level == hlevel) break; @@ -683,6 +696,8 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, sp = kvm_mmu_get_page(vcpu, base_gfn, addr, it.level - 1, true, direct_access); link_shadow_page(vcpu, it.sptep, sp); + if (lpage_disallowed) + account_huge_nx_page(vcpu->kvm, sp); } } @@ -759,9 +774,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int r; kvm_pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; - bool force_pt_level = false; unsigned long mmu_seq; bool map_writable, is_self_change_mapping; + bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) && + is_nx_huge_page_enabled(); + bool force_pt_level = lpage_disallowed; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); @@ -851,7 +868,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, if (!force_pt_level) transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level); r = FNAME(fetch)(vcpu, addr, &walker, write_fault, - level, pfn, map_writable, prefault); + level, pfn, map_writable, prefault, lpage_disallowed); kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); out_unlock: diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 46875bbd0419..d5e6d5b3f06f 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -62,8 +62,7 @@ static void kvm_perf_overflow(struct perf_event *perf_event, struct kvm_pmc *pmc = perf_event->overflow_handler_context; struct kvm_pmu *pmu = pmc_to_pmu(pmc); - if (!test_and_set_bit(pmc->idx, - (unsigned long *)&pmu->reprogram_pmi)) { + if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) { __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); kvm_make_request(KVM_REQ_PMU, pmc->vcpu); } @@ -76,8 +75,7 @@ static void kvm_perf_overflow_intr(struct perf_event *perf_event, struct kvm_pmc *pmc = perf_event->overflow_handler_context; struct kvm_pmu *pmu = pmc_to_pmu(pmc); - if (!test_and_set_bit(pmc->idx, - (unsigned long *)&pmu->reprogram_pmi)) { + if (!test_and_set_bit(pmc->idx, pmu->reprogram_pmi)) { __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); kvm_make_request(KVM_REQ_PMU, pmc->vcpu); @@ -137,7 +135,37 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, } pmc->perf_event = event; - clear_bit(pmc->idx, (unsigned long*)&pmc_to_pmu(pmc)->reprogram_pmi); + pmc_to_pmu(pmc)->event_count++; + clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); +} + +static void pmc_pause_counter(struct kvm_pmc *pmc) +{ + u64 counter = pmc->counter; + + if (!pmc->perf_event) + return; + + /* update counter, reset event value to avoid redundant accumulation */ + counter += perf_event_pause(pmc->perf_event, true); + pmc->counter = counter & pmc_bitmask(pmc); +} + +static bool pmc_resume_counter(struct kvm_pmc *pmc) +{ + if (!pmc->perf_event) + return false; + + /* recalibrate sample period and check if it's accepted by perf core */ + if (perf_event_period(pmc->perf_event, + (-pmc->counter) & pmc_bitmask(pmc))) + return false; + + /* reuse perf_event to serve as pmc_reprogram_counter() does*/ + perf_event_enable(pmc->perf_event); + + clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi); + return true; } void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) @@ -154,7 +182,7 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) pmc->eventsel = eventsel; - pmc_stop_counter(pmc); + pmc_pause_counter(pmc); if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc)) return; @@ -193,6 +221,12 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) if (type == PERF_TYPE_RAW) config = eventsel & X86_RAW_EVENT_MASK; + if (pmc->current_config == eventsel && pmc_resume_counter(pmc)) + return; + + pmc_release_perf_event(pmc); + + pmc->current_config = eventsel; pmc_reprogram_counter(pmc, type, config, !(eventsel & ARCH_PERFMON_EVENTSEL_USR), !(eventsel & ARCH_PERFMON_EVENTSEL_OS), @@ -209,7 +243,7 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) struct kvm_pmu_event_filter *filter; struct kvm *kvm = pmc->vcpu->kvm; - pmc_stop_counter(pmc); + pmc_pause_counter(pmc); if (!en_field || !pmc_is_enabled(pmc)) return; @@ -224,6 +258,12 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) return; } + if (pmc->current_config == (u64)ctrl && pmc_resume_counter(pmc)) + return; + + pmc_release_perf_event(pmc); + + pmc->current_config = (u64)ctrl; pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE, kvm_x86_ops->pmu_ops->find_fixed_event(idx), !(en_field & 0x2), /* exclude user */ @@ -253,27 +293,32 @@ EXPORT_SYMBOL_GPL(reprogram_counter); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - u64 bitmask; int bit; - bitmask = pmu->reprogram_pmi; - - for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) { + for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) { struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, bit); if (unlikely(!pmc || !pmc->perf_event)) { - clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi); + clear_bit(bit, pmu->reprogram_pmi); continue; } reprogram_counter(pmu, bit); } + + /* + * Unused perf_events are only released if the corresponding MSRs + * weren't accessed during the last vCPU time slice. kvm_arch_sched_in + * triggers KVM_REQ_PMU if cleanup is needed. + */ + if (unlikely(pmu->need_cleanup)) + kvm_pmu_cleanup(vcpu); } /* check if idx is a valid index to access PMU */ -int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx) +int kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { - return kvm_x86_ops->pmu_ops->is_valid_msr_idx(vcpu, idx); + return kvm_x86_ops->pmu_ops->is_valid_rdpmc_ecx(vcpu, idx); } bool is_vmware_backdoor_pmc(u32 pmc_idx) @@ -323,7 +368,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) if (is_vmware_backdoor_pmc(idx)) return kvm_pmu_rdpmc_vmware(vcpu, idx, data); - pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, idx, &mask); + pmc = kvm_x86_ops->pmu_ops->rdpmc_ecx_to_pmc(vcpu, idx, &mask); if (!pmc) return 1; @@ -339,7 +384,17 @@ void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { - return kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, msr); + return kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, msr) || + kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, msr); +} + +static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, msr); + + if (pmc) + __set_bit(pmc->idx, pmu->pmc_in_use); } int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) @@ -349,6 +404,7 @@ int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { + kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index); return kvm_x86_ops->pmu_ops->set_msr(vcpu, msr_info); } @@ -376,9 +432,45 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu) memset(pmu, 0, sizeof(*pmu)); kvm_x86_ops->pmu_ops->init(vcpu); init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn); + pmu->event_count = 0; + pmu->need_cleanup = false; kvm_pmu_refresh(vcpu); } +static inline bool pmc_speculative_in_use(struct kvm_pmc *pmc) +{ + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + + if (pmc_is_fixed(pmc)) + return fixed_ctrl_field(pmu->fixed_ctr_ctrl, + pmc->idx - INTEL_PMC_IDX_FIXED) & 0x3; + + return pmc->eventsel & ARCH_PERFMON_EVENTSEL_ENABLE; +} + +/* Release perf_events for vPMCs that have been unused for a full time slice. */ +void kvm_pmu_cleanup(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc = NULL; + DECLARE_BITMAP(bitmask, X86_PMC_IDX_MAX); + int i; + + pmu->need_cleanup = false; + + bitmap_andnot(bitmask, pmu->all_valid_pmc_idx, + pmu->pmc_in_use, X86_PMC_IDX_MAX); + + for_each_set_bit(i, bitmask, X86_PMC_IDX_MAX) { + pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, i); + + if (pmc && pmc->perf_event && !pmc_speculative_in_use(pmc)) + pmc_stop_counter(pmc); + } + + bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX); +} + void kvm_pmu_destroy(struct kvm_vcpu *vcpu) { kvm_pmu_reset(vcpu); diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 58265f761c3b..7ebb62326c14 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -25,9 +25,10 @@ struct kvm_pmu_ops { unsigned (*find_fixed_event)(int idx); bool (*pmc_is_enabled)(struct kvm_pmc *pmc); struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx); - struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, unsigned idx, - u64 *mask); - int (*is_valid_msr_idx)(struct kvm_vcpu *vcpu, unsigned idx); + struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu, + unsigned int idx, u64 *mask); + struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, u32 msr); + int (*is_valid_rdpmc_ecx)(struct kvm_vcpu *vcpu, unsigned int idx); bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr); int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr, u64 *data); int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info); @@ -55,12 +56,21 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc) return counter & pmc_bitmask(pmc); } -static inline void pmc_stop_counter(struct kvm_pmc *pmc) +static inline void pmc_release_perf_event(struct kvm_pmc *pmc) { if (pmc->perf_event) { - pmc->counter = pmc_read_counter(pmc); perf_event_release_kernel(pmc->perf_event); pmc->perf_event = NULL; + pmc->current_config = 0; + pmc_to_pmu(pmc)->event_count--; + } +} + +static inline void pmc_stop_counter(struct kvm_pmc *pmc) +{ + if (pmc->perf_event) { + pmc->counter = pmc_read_counter(pmc); + pmc_release_perf_event(pmc); } } @@ -79,6 +89,12 @@ static inline bool pmc_is_enabled(struct kvm_pmc *pmc) return kvm_x86_ops->pmu_ops->pmc_is_enabled(pmc); } +static inline bool kvm_valid_perf_global_ctrl(struct kvm_pmu *pmu, + u64 data) +{ + return !(pmu->global_ctrl_mask & data); +} + /* returns general purpose PMC with the specified MSR. Note that it can be * used for both PERFCTRn and EVNTSELn; that is why it accepts base as a * paramenter to tell them apart. @@ -110,13 +126,14 @@ void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx); void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); -int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx); +int kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx); bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr); int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data); int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); void kvm_pmu_refresh(struct kvm_vcpu *vcpu); void kvm_pmu_reset(struct kvm_vcpu *vcpu); void kvm_pmu_init(struct kvm_vcpu *vcpu); +void kvm_pmu_cleanup(struct kvm_vcpu *vcpu); void kvm_pmu_destroy(struct kvm_vcpu *vcpu); int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp); diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c index c8388389a3b0..ce0b10fe5e2b 100644 --- a/arch/x86/kvm/pmu_amd.c +++ b/arch/x86/kvm/pmu_amd.c @@ -174,7 +174,7 @@ static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) } /* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */ -static int amd_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx) +static int amd_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -184,7 +184,8 @@ static int amd_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx) } /* idx is the ECX register of RDPMC instruction */ -static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *mask) +static struct kvm_pmc *amd_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, + unsigned int idx, u64 *mask) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct kvm_pmc *counters; @@ -199,13 +200,19 @@ static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, unsigned idx, u static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { + /* All MSRs refer to exactly one PMC, so msr_idx_to_pmc is enough. */ + return false; +} + +static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr) +{ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - int ret = false; + struct kvm_pmc *pmc; - ret = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER) || - get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL); + pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_COUNTER); + pmc = pmc ? pmc : get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL); - return ret; + return pmc; } static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) @@ -272,6 +279,7 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu) pmu->counter_bitmask[KVM_PMC_FIXED] = 0; pmu->nr_arch_fixed_counters = 0; pmu->global_status = 0; + bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters); } static void amd_pmu_init(struct kvm_vcpu *vcpu) @@ -285,6 +293,7 @@ static void amd_pmu_init(struct kvm_vcpu *vcpu) pmu->gp_counters[i].type = KVM_PMC_GP; pmu->gp_counters[i].vcpu = vcpu; pmu->gp_counters[i].idx = i; + pmu->gp_counters[i].current_config = 0; } } @@ -306,8 +315,9 @@ struct kvm_pmu_ops amd_pmu_ops = { .find_fixed_event = amd_find_fixed_event, .pmc_is_enabled = amd_pmc_is_enabled, .pmc_idx_to_pmc = amd_pmc_idx_to_pmc, + .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc, .msr_idx_to_pmc = amd_msr_idx_to_pmc, - .is_valid_msr_idx = amd_is_valid_msr_idx, + .is_valid_rdpmc_ecx = amd_is_valid_rdpmc_ecx, .is_valid_msr = amd_is_valid_msr, .get_msr = amd_pmu_get_msr, .set_msr = amd_pmu_set_msr, diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f8ecb6df5106..362e874297e4 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -38,6 +38,7 @@ #include <linux/file.h> #include <linux/pagemap.h> #include <linux/swap.h> +#include <linux/rwsem.h> #include <asm/apic.h> #include <asm/perf_event.h> @@ -418,9 +419,13 @@ enum { #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL +static int sev_flush_asids(void); +static DECLARE_RWSEM(sev_deactivate_lock); +static DEFINE_MUTEX(sev_bitmap_lock); static unsigned int max_sev_asid; static unsigned int min_sev_asid; static unsigned long *sev_asid_bitmap; +static unsigned long *sev_reclaim_asid_bitmap; #define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT) struct enc_region { @@ -734,8 +739,14 @@ static int get_npt_level(struct kvm_vcpu *vcpu) static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) { vcpu->arch.efer = efer; - if (!npt_enabled && !(efer & EFER_LMA)) - efer &= ~EFER_LME; + + if (!npt_enabled) { + /* Shadow paging assumes NX to be available. */ + efer |= EFER_NX; + + if (!(efer & EFER_LMA)) + efer &= ~EFER_LME; + } to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); @@ -1229,11 +1240,15 @@ static __init int sev_hardware_setup(void) /* Minimum ASID value that should be used for SEV guest */ min_sev_asid = cpuid_edx(0x8000001F); - /* Initialize SEV ASID bitmap */ + /* Initialize SEV ASID bitmaps */ sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL); if (!sev_asid_bitmap) return 1; + sev_reclaim_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL); + if (!sev_reclaim_asid_bitmap) + return 1; + status = kmalloc(sizeof(*status), GFP_KERNEL); if (!status) return 1; @@ -1412,8 +1427,12 @@ static __exit void svm_hardware_unsetup(void) { int cpu; - if (svm_sev_enabled()) + if (svm_sev_enabled()) { bitmap_free(sev_asid_bitmap); + bitmap_free(sev_reclaim_asid_bitmap); + + sev_flush_asids(); + } for_each_possible_cpu(cpu) svm_cpu_uninit(cpu); @@ -1723,25 +1742,22 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) return 0; } -static void __sev_asid_free(int asid) +static void sev_asid_free(int asid) { struct svm_cpu_data *sd; int cpu, pos; + mutex_lock(&sev_bitmap_lock); + pos = asid - 1; - clear_bit(pos, sev_asid_bitmap); + __set_bit(pos, sev_reclaim_asid_bitmap); for_each_possible_cpu(cpu) { sd = per_cpu(svm_data, cpu); sd->sev_vmcbs[pos] = NULL; } -} -static void sev_asid_free(struct kvm *kvm) -{ - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - - __sev_asid_free(sev->asid); + mutex_unlock(&sev_bitmap_lock); } static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) @@ -1758,10 +1774,12 @@ static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) /* deactivate handle */ data->handle = handle; + + /* Guard DEACTIVATE against WBINVD/DF_FLUSH used in ASID recycling */ + down_read(&sev_deactivate_lock); sev_guest_deactivate(data, NULL); + up_read(&sev_deactivate_lock); - wbinvd_on_all_cpus(); - sev_guest_df_flush(NULL); kfree(data); decommission = kzalloc(sizeof(*decommission), GFP_KERNEL); @@ -1910,7 +1928,7 @@ static void sev_vm_destroy(struct kvm *kvm) mutex_unlock(&kvm->lock); sev_unbind_asid(kvm, sev->handle); - sev_asid_free(kvm); + sev_asid_free(sev->asid); } static void avic_vm_destroy(struct kvm *kvm) @@ -2364,7 +2382,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); break; default: - BUG(); + WARN_ON_ONCE(1); } } @@ -2517,10 +2535,6 @@ static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) { } -static void svm_decache_cr3(struct kvm_vcpu *vcpu) -{ -} - static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { } @@ -4591,6 +4605,7 @@ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) int ret = 0; struct vcpu_svm *svm = to_svm(vcpu); u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR); + u32 id = kvm_xapic_id(vcpu->arch.apic); if (ldr == svm->ldr_reg) return 0; @@ -4598,7 +4613,7 @@ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) avic_invalidate_logical_id_entry(vcpu); if (ldr) - ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr); + ret = avic_ldr_write(vcpu, id, ldr); if (!ret) svm->ldr_reg = ldr; @@ -4610,8 +4625,7 @@ static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu) { u64 *old, *new; struct vcpu_svm *svm = to_svm(vcpu); - u32 apic_id_reg = kvm_lapic_get_reg(vcpu->arch.apic, APIC_ID); - u32 id = (apic_id_reg >> 24) & 0xff; + u32 id = kvm_xapic_id(vcpu->arch.apic); if (vcpu->vcpu_id == id) return 0; @@ -4991,6 +5005,18 @@ static int handle_exit(struct kvm_vcpu *vcpu) return 0; } +#ifdef CONFIG_RETPOLINE + if (exit_code == SVM_EXIT_MSR) + return msr_interception(svm); + else if (exit_code == SVM_EXIT_VINTR) + return interrupt_window_interception(svm); + else if (exit_code == SVM_EXIT_INTR) + return intr_interception(svm); + else if (exit_code == SVM_EXIT_HLT) + return halt_interception(svm); + else if (exit_code == SVM_EXIT_NPF) + return npf_interception(svm); +#endif return svm_exit_handlers[exit_code](svm); } @@ -5086,8 +5112,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vcpu_svm *svm = to_svm(vcpu); - if (svm_nested_virtualize_tpr(vcpu) || - kvm_vcpu_apicv_active(vcpu)) + if (svm_nested_virtualize_tpr(vcpu)) return; clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); @@ -5104,9 +5129,9 @@ static void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu) return; } -static bool svm_get_enable_apicv(struct kvm_vcpu *vcpu) +static bool svm_get_enable_apicv(struct kvm *kvm) { - return avic && irqchip_split(vcpu->kvm); + return avic && irqchip_split(kvm); } static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) @@ -5628,7 +5653,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) svm->vmcb->save.cr2 = vcpu->arch.cr2; clgi(); - kvm_load_guest_xcr0(vcpu); + kvm_load_guest_xsave_state(vcpu); if (lapic_in_kernel(vcpu) && vcpu->arch.apic->lapic_timer.timer_advance_ns) @@ -5778,7 +5803,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_before_interrupt(&svm->vcpu); - kvm_put_guest_xcr0(vcpu); + kvm_load_host_xsave_state(vcpu); stgi(); /* Any pending NMI will happen here */ @@ -5887,6 +5912,9 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + vcpu->arch.xsaves_enabled = guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && + boot_cpu_has(X86_FEATURE_XSAVES); + /* Update nrips enabled cache */ svm->nrips_enabled = !!guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS); @@ -5962,7 +5990,7 @@ static bool svm_mpx_supported(void) static bool svm_xsaves_supported(void) { - return false; + return boot_cpu_has(X86_FEATURE_XSAVES); } static bool svm_umip_emulated(void) @@ -6264,18 +6292,73 @@ static int enable_smi_window(struct kvm_vcpu *vcpu) return 0; } +static int sev_flush_asids(void) +{ + int ret, error; + + /* + * DEACTIVATE will clear the WBINVD indicator causing DF_FLUSH to fail, + * so it must be guarded. + */ + down_write(&sev_deactivate_lock); + + wbinvd_on_all_cpus(); + ret = sev_guest_df_flush(&error); + + up_write(&sev_deactivate_lock); + + if (ret) + pr_err("SEV: DF_FLUSH failed, ret=%d, error=%#x\n", ret, error); + + return ret; +} + +/* Must be called with the sev_bitmap_lock held */ +static bool __sev_recycle_asids(void) +{ + int pos; + + /* Check if there are any ASIDs to reclaim before performing a flush */ + pos = find_next_bit(sev_reclaim_asid_bitmap, + max_sev_asid, min_sev_asid - 1); + if (pos >= max_sev_asid) + return false; + + if (sev_flush_asids()) + return false; + + bitmap_xor(sev_asid_bitmap, sev_asid_bitmap, sev_reclaim_asid_bitmap, + max_sev_asid); + bitmap_zero(sev_reclaim_asid_bitmap, max_sev_asid); + + return true; +} + static int sev_asid_new(void) { + bool retry = true; int pos; + mutex_lock(&sev_bitmap_lock); + /* * SEV-enabled guest must use asid from min_sev_asid to max_sev_asid. */ +again: pos = find_next_zero_bit(sev_asid_bitmap, max_sev_asid, min_sev_asid - 1); - if (pos >= max_sev_asid) + if (pos >= max_sev_asid) { + if (retry && __sev_recycle_asids()) { + retry = false; + goto again; + } + mutex_unlock(&sev_bitmap_lock); return -EBUSY; + } + + __set_bit(pos, sev_asid_bitmap); + + mutex_unlock(&sev_bitmap_lock); - set_bit(pos, sev_asid_bitmap); return pos + 1; } @@ -6303,7 +6386,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) return 0; e_free: - __sev_asid_free(asid); + sev_asid_free(asid); return ret; } @@ -6313,12 +6396,6 @@ static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error) int asid = sev_get_asid(kvm); int ret; - wbinvd_on_all_cpus(); - - ret = sev_guest_df_flush(error); - if (ret) - return ret; - data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT); if (!data) return -ENOMEM; @@ -7208,7 +7285,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .get_cpl = svm_get_cpl, .get_cs_db_l_bits = kvm_get_cs_db_l_bits, .decache_cr0_guest_bits = svm_decache_cr0_guest_bits, - .decache_cr3 = svm_decache_cr3, .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, .set_cr0 = svm_set_cr0, .set_cr3 = svm_set_cr3, diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 41abc62c9a8a..4aea7d304beb 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -10,6 +10,7 @@ #include "hyperv.h" #include "mmu.h" #include "nested.h" +#include "pmu.h" #include "trace.h" #include "x86.h" @@ -27,6 +28,16 @@ module_param(nested_early_check, bool, S_IRUGO); failed; \ }) +#define SET_MSR_OR_WARN(vcpu, idx, data) \ +({ \ + bool failed = kvm_set_msr(vcpu, idx, data); \ + if (failed) \ + pr_warn_ratelimited( \ + "%s cannot write MSR (0x%x, 0x%llx)\n", \ + __func__, idx, data); \ + failed; \ +}) + /* * Hyper-V requires all of these, so mark them as supported even though * they are just treated the same as all-context. @@ -257,7 +268,7 @@ static void free_nested(struct kvm_vcpu *vcpu) vmx->nested.cached_shadow_vmcs12 = NULL; /* Unpin physical memory we referred to in the vmcs02 */ if (vmx->nested.apic_access_page) { - kvm_release_page_dirty(vmx->nested.apic_access_page); + kvm_release_page_clean(vmx->nested.apic_access_page); vmx->nested.apic_access_page = NULL; } kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); @@ -929,6 +940,57 @@ fail: return i + 1; } +static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu, + u32 msr_index, + u64 *data) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * If the L0 hypervisor stored a more accurate value for the TSC that + * does not include the time taken for emulation of the L2->L1 + * VM-exit in L0, use the more accurate value. + */ + if (msr_index == MSR_IA32_TSC) { + int index = vmx_find_msr_index(&vmx->msr_autostore.guest, + MSR_IA32_TSC); + + if (index >= 0) { + u64 val = vmx->msr_autostore.guest.val[index].value; + + *data = kvm_read_l1_tsc(vcpu, val); + return true; + } + } + + if (kvm_get_msr(vcpu, msr_index, data)) { + pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__, + msr_index); + return false; + } + return true; +} + +static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i, + struct vmx_msr_entry *e) +{ + if (kvm_vcpu_read_guest(vcpu, + gpa + i * sizeof(*e), + e, 2 * sizeof(u32))) { + pr_debug_ratelimited( + "%s cannot read MSR entry (%u, 0x%08llx)\n", + __func__, i, gpa + i * sizeof(*e)); + return false; + } + if (nested_vmx_store_msr_check(vcpu, e)) { + pr_debug_ratelimited( + "%s check failed (%u, 0x%x, 0x%x)\n", + __func__, i, e->index, e->reserved); + return false; + } + return true; +} + static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) { u64 data; @@ -940,26 +1002,12 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) if (unlikely(i >= max_msr_list_size)) return -EINVAL; - if (kvm_vcpu_read_guest(vcpu, - gpa + i * sizeof(e), - &e, 2 * sizeof(u32))) { - pr_debug_ratelimited( - "%s cannot read MSR entry (%u, 0x%08llx)\n", - __func__, i, gpa + i * sizeof(e)); + if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) return -EINVAL; - } - if (nested_vmx_store_msr_check(vcpu, &e)) { - pr_debug_ratelimited( - "%s check failed (%u, 0x%x, 0x%x)\n", - __func__, i, e.index, e.reserved); - return -EINVAL; - } - if (kvm_get_msr(vcpu, e.index, &data)) { - pr_debug_ratelimited( - "%s cannot read MSR (%u, 0x%x)\n", - __func__, i, e.index); + + if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data)) return -EINVAL; - } + if (kvm_vcpu_write_guest(vcpu, gpa + i * sizeof(e) + offsetof(struct vmx_msr_entry, value), @@ -973,6 +1021,60 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) return 0; } +static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index) +{ + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + u32 count = vmcs12->vm_exit_msr_store_count; + u64 gpa = vmcs12->vm_exit_msr_store_addr; + struct vmx_msr_entry e; + u32 i; + + for (i = 0; i < count; i++) { + if (!read_and_check_msr_entry(vcpu, gpa, i, &e)) + return false; + + if (e.index == msr_index) + return true; + } + return false; +} + +static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, + u32 msr_index) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmx_msrs *autostore = &vmx->msr_autostore.guest; + bool in_vmcs12_store_list; + int msr_autostore_index; + bool in_autostore_list; + int last; + + msr_autostore_index = vmx_find_msr_index(autostore, msr_index); + in_autostore_list = msr_autostore_index >= 0; + in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index); + + if (in_vmcs12_store_list && !in_autostore_list) { + if (autostore->nr == NR_LOADSTORE_MSRS) { + /* + * Emulated VMEntry does not fail here. Instead a less + * accurate value will be returned by + * nested_vmx_get_vmexit_msr_value() using kvm_get_msr() + * instead of reading the value from the vmcs02 VMExit + * MSR-store area. + */ + pr_warn_ratelimited( + "Not enough msr entries in msr_autostore. Can't add msr %x\n", + msr_index); + return; + } + last = autostore->nr++; + autostore->val[last].index = msr_index; + } else if (!in_vmcs12_store_list && in_autostore_list) { + last = --autostore->nr; + autostore->val[msr_autostore_index] = autostore->val[last]; + } +} + static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val) { unsigned long invalid_mask; @@ -1012,7 +1114,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne kvm_mmu_new_cr3(vcpu, cr3, false); vcpu->arch.cr3 = cr3; - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); kvm_init_mmu(vcpu, false); @@ -1024,7 +1126,9 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne * populated by L2 differently than TLB entries populated * by L1. * - * If L1 uses EPT, then TLB entries are tagged with different EPTP. + * If L0 uses EPT, L1 and L2 run with different EPTP because + * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries + * are tagged with different EPTP. * * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged * with different VPID (L1 entries are tagged with vmx->vpid @@ -1034,7 +1138,7 @@ static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - return nested_cpu_has_ept(vmcs12) || + return enable_ept || (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02); } @@ -2018,7 +2122,7 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) * addresses are constant (for vmcs02), the counts can change based * on L2's behavior, e.g. switching to/from long mode. */ - vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); + vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val)); vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); @@ -2073,6 +2177,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) exec_control &= ~CPU_BASED_TPR_SHADOW; exec_control |= vmcs12->cpu_based_vm_exec_control; + vmx->nested.l1_tpr_threshold = -1; if (exec_control & CPU_BASED_TPR_SHADOW) vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); #ifdef CONFIG_X86_64 @@ -2285,6 +2390,13 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); } + /* + * Make sure the msr_autostore list is up to date before we set the + * count in the vmcs02. + */ + prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC); + + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr); vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); @@ -2381,9 +2493,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if (nested_cpu_has_ept(vmcs12)) nested_ept_init_mmu_context(vcpu); - else if (nested_cpu_has2(vmcs12, - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) - vmx_flush_tlb(vcpu, true); /* * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those @@ -2418,6 +2527,16 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, entry_failure_code)) return -EINVAL; + /* + * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12 + * on nested VM-Exit, which can occur without actually running L2 and + * thus without hitting vmx_set_cr3(), e.g. if L1 is entering L2 with + * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the + * transition to HLT instead of running L2. + */ + if (enable_ept) + vmcs_writel(GUEST_CR3, vmcs12->guest_cr3); + /* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */ if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { @@ -2430,6 +2549,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if (!enable_ept) vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && + SET_MSR_OR_WARN(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, + vmcs12->guest_ia32_perf_global_ctrl)) + return -EINVAL; + kvm_rsp_write(vcpu, vmcs12->guest_rsp); kvm_rip_write(vcpu, vmcs12->guest_rip); return 0; @@ -2610,7 +2734,7 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, /* VM-entry exception error code */ if (CC(has_error_code && - vmcs12->vm_entry_exception_error_code & GENMASK(31, 15))) + vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) return -EINVAL; /* VM-entry interruption-info field: reserved bits */ @@ -2664,6 +2788,11 @@ static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) return -EINVAL; + if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && + CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), + vmcs12->host_ia32_perf_global_ctrl))) + return -EINVAL; + #ifdef CONFIG_X86_64 ia32e = !!(vcpu->arch.efer & EFER_LMA); #else @@ -2779,6 +2908,11 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, return -EINVAL; } + if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && + CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), + vmcs12->guest_ia32_perf_global_ctrl))) + return -EINVAL; + /* * If the load IA32_EFER VM-entry control is 1, the following checks * are performed on the field for the IA32_EFER MSR: @@ -2917,7 +3051,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12); -static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) +static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -2933,23 +3067,22 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) * to it so we can release it later. */ if (vmx->nested.apic_access_page) { /* shouldn't happen */ - kvm_release_page_dirty(vmx->nested.apic_access_page); + kvm_release_page_clean(vmx->nested.apic_access_page); vmx->nested.apic_access_page = NULL; } page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr); - /* - * If translation failed, no matter: This feature asks - * to exit when accessing the given address, and if it - * can never be accessed, this feature won't do - * anything anyway. - */ if (!is_error_page(page)) { vmx->nested.apic_access_page = page; hpa = page_to_phys(vmx->nested.apic_access_page); vmcs_write64(APIC_ACCESS_ADDR, hpa); } else { - secondary_exec_controls_clearbit(vmx, - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); + pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n", + __func__); + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = + KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return false; } } @@ -2994,6 +3127,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); else exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); + return true; } /* @@ -3032,13 +3166,15 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, /* * If from_vmentry is false, this is being called from state restore (either RSM * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. -+ * -+ * Returns: -+ * 0 - success, i.e. proceed with actual VMEnter -+ * 1 - consistency check VMExit -+ * -1 - consistency check VMFail + * + * Returns: + * NVMX_ENTRY_SUCCESS: Entered VMX non-root mode + * NVMX_ENTRY_VMFAIL: Consistency check VMFail + * NVMX_ENTRY_VMEXIT: Consistency check VMExit + * NVMX_ENTRY_KVM_INTERNAL_ERROR: KVM internal error */ -int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) +enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, + bool from_vmentry) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); @@ -3081,11 +3217,12 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) prepare_vmcs02_early(vmx, vmcs12); if (from_vmentry) { - nested_get_vmcs12_pages(vcpu); + if (unlikely(!nested_get_vmcs12_pages(vcpu))) + return NVMX_VMENTRY_KVM_INTERNAL_ERROR; if (nested_vmx_check_vmentry_hw(vcpu)) { vmx_switch_vmcs(vcpu, &vmx->vmcs01); - return -1; + return NVMX_VMENTRY_VMFAIL; } if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual)) @@ -3149,7 +3286,7 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) * returned as far as L1 is concerned. It will only return (and set * the success flag) when L2 exits (see nested_vmx_vmexit()). */ - return 0; + return NVMX_VMENTRY_SUCCESS; /* * A failed consistency check that leads to a VMExit during L1's @@ -3165,14 +3302,14 @@ vmentry_fail_vmexit: vmx_switch_vmcs(vcpu, &vmx->vmcs01); if (!from_vmentry) - return 1; + return NVMX_VMENTRY_VMEXIT; load_vmcs12_host_state(vcpu, vmcs12); vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY; vmcs12->exit_qualification = exit_qual; if (enable_shadow_vmcs || vmx->nested.hv_evmcs) vmx->nested.need_vmcs12_to_shadow_sync = true; - return 1; + return NVMX_VMENTRY_VMEXIT; } /* @@ -3182,9 +3319,9 @@ vmentry_fail_vmexit: static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) { struct vmcs12 *vmcs12; + enum nvmx_vmentry_status status; struct vcpu_vmx *vmx = to_vmx(vcpu); u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); - int ret; if (!nested_vmx_check_permission(vcpu)) return 1; @@ -3244,13 +3381,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * the nested entry. */ vmx->nested.nested_run_pending = 1; - ret = nested_vmx_enter_non_root_mode(vcpu, true); - vmx->nested.nested_run_pending = !ret; - if (ret > 0) - return 1; - else if (ret) - return nested_vmx_failValid(vcpu, - VMXERR_ENTRY_INVALID_CONTROL_FIELD); + status = nested_vmx_enter_non_root_mode(vcpu, true); + if (unlikely(status != NVMX_VMENTRY_SUCCESS)) + goto vmentry_failed; /* Hide L1D cache contents from the nested guest. */ vmx->vcpu.arch.l1tf_flush_l1d = true; @@ -3281,6 +3414,15 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) return kvm_vcpu_halt(vcpu); } return 1; + +vmentry_failed: + vmx->nested.nested_run_pending = 0; + if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR) + return 0; + if (status == NVMX_VMENTRY_VMEXIT) + return 1; + WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL); + return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); } /* @@ -3453,6 +3595,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) test_bit(KVM_APIC_INIT, &apic->pending_events)) { if (block_nested_events) return -EBUSY; + clear_bit(KVM_APIC_INIT, &apic->pending_events); nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); return 0; } @@ -3856,8 +3999,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vcpu->arch.pat = vmcs12->host_ia32_pat; } if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) - vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, - vmcs12->host_ia32_perf_global_ctrl); + SET_MSR_OR_WARN(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, + vmcs12->host_ia32_perf_global_ctrl); /* Set L1 segment info according to Intel SDM 27.5.2 Loading Host Segment and Descriptor-Table Registers */ @@ -3976,7 +4119,7 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) nested_ept_uninit_mmu_context(vcpu); vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); /* * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs @@ -4104,6 +4247,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); + if (vmx->nested.l1_tpr_threshold != -1) + vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold); if (kvm_has_tsc_control) decache_tsc_multiplier(vmx); @@ -4111,15 +4256,11 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, if (vmx->nested.change_vmcs01_virtual_apic_mode) { vmx->nested.change_vmcs01_virtual_apic_mode = false; vmx_set_virtual_apic_mode(vcpu); - } else if (!nested_cpu_has_ept(vmcs12) && - nested_cpu_has2(vmcs12, - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { - vmx_flush_tlb(vcpu, true); } /* Unpin physical memory we referred to in vmcs02 */ if (vmx->nested.apic_access_page) { - kvm_release_page_dirty(vmx->nested.apic_access_page); + kvm_release_page_clean(vmx->nested.apic_access_page); vmx->nested.apic_access_page = NULL; } kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); @@ -4319,6 +4460,27 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, return 0; } +void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx; + + if (!nested_vmx_allowed(vcpu)) + return; + + vmx = to_vmx(vcpu); + if (kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) { + vmx->nested.msrs.entry_ctls_high |= + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + vmx->nested.msrs.exit_ctls_high |= + VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; + } else { + vmx->nested.msrs.entry_ctls_high &= + ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + vmx->nested.msrs.exit_ctls_high &= + ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; + } +} + static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer) { gva_t gva; @@ -5758,7 +5920,7 @@ error_guest_mode: return ret; } -void nested_vmx_vcpu_setup(void) +void nested_vmx_set_vmcs_shadowing_bitmap(void) { if (enable_shadow_vmcs) { vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); @@ -6039,23 +6201,23 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) init_vmcs_shadow_fields(); } - exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear, - exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch, - exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld, - exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst, - exit_handlers[EXIT_REASON_VMREAD] = handle_vmread, - exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume, - exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite, - exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff, - exit_handlers[EXIT_REASON_VMON] = handle_vmon, - exit_handlers[EXIT_REASON_INVEPT] = handle_invept, - exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid, - exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc, + exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear; + exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch; + exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld; + exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst; + exit_handlers[EXIT_REASON_VMREAD] = handle_vmread; + exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume; + exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite; + exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff; + exit_handlers[EXIT_REASON_VMON] = handle_vmon; + exit_handlers[EXIT_REASON_INVEPT] = handle_invept; + exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; + exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; kvm_x86_ops->check_nested_events = vmx_check_nested_events; kvm_x86_ops->get_nested_state = vmx_get_nested_state; kvm_x86_ops->set_nested_state = vmx_set_nested_state; - kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages, + kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages; kvm_x86_ops->nested_enable_evmcs = nested_enable_evmcs; kvm_x86_ops->nested_get_evmcs_version = nested_get_evmcs_version; diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 187d39bf0bf1..fc874d4ead0f 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -6,14 +6,25 @@ #include "vmcs12.h" #include "vmx.h" +/* + * Status returned by nested_vmx_enter_non_root_mode(): + */ +enum nvmx_vmentry_status { + NVMX_VMENTRY_SUCCESS, /* Entered VMX non-root mode */ + NVMX_VMENTRY_VMFAIL, /* Consistency check VMFail */ + NVMX_VMENTRY_VMEXIT, /* Consistency check VMExit */ + NVMX_VMENTRY_KVM_INTERNAL_ERROR,/* KVM internal error */ +}; + void vmx_leave_nested(struct kvm_vcpu *vcpu); void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps, bool apicv); void nested_vmx_hardware_unsetup(void); __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)); -void nested_vmx_vcpu_setup(void); +void nested_vmx_set_vmcs_shadowing_bitmap(void); void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu); -int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry); +enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, + bool from_vmentry); bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason); void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, u32 exit_intr_info, unsigned long exit_qualification); @@ -22,6 +33,7 @@ int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata); int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, u32 vmx_instruction_info, bool wr, int len, gva_t *ret); +void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu); static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) { @@ -245,7 +257,7 @@ static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1) return ((val & fixed1) | fixed0) == val; } -static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) +static inline bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) { u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0; u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1; @@ -259,7 +271,7 @@ static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) return fixed_bits_valid(val, fixed0, fixed1); } -static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) +static inline bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) { u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0; u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1; @@ -267,7 +279,7 @@ static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) return fixed_bits_valid(val, fixed0, fixed1); } -static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val) +static inline bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val) { u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0; u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 4dea0e0e7e39..7023138b1cb0 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -15,6 +15,7 @@ #include "x86.h" #include "cpuid.h" #include "lapic.h" +#include "nested.h" #include "pmu.h" static struct kvm_event_hw_type_mapping intel_arch_events[] = { @@ -46,6 +47,7 @@ static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) if (old_ctrl == new_ctrl) continue; + __set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use); reprogram_fixed_counter(pmc, new_ctrl, i); } @@ -111,7 +113,7 @@ static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) } /* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */ -static int intel_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx) +static int intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); bool fixed = idx & (1u << 30); @@ -122,8 +124,8 @@ static int intel_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx) (fixed && idx >= pmu->nr_arch_fixed_counters); } -static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, - unsigned idx, u64 *mask) +static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, + unsigned int idx, u64 *mask) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); bool fixed = idx & (1u << 30); @@ -162,6 +164,18 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) return ret; } +static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + + pmc = get_fixed_pmc(pmu, msr); + pmc = pmc ? pmc : get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0); + pmc = pmc ? pmc : get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0); + + return pmc; +} + static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -223,7 +237,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_CORE_PERF_GLOBAL_CTRL: if (pmu->global_ctrl == data) return 0; - if (!(data & pmu->global_ctrl_mask)) { + if (kvm_valid_perf_global_ctrl(pmu, data)) { global_ctrl_changed(pmu, data); return 0; } @@ -262,6 +276,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static void intel_pmu_refresh(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct x86_pmu_capability x86_pmu; struct kvm_cpuid_entry2 *entry; union cpuid10_eax eax; union cpuid10_edx edx; @@ -283,8 +298,10 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) if (!pmu->version) return; + perf_get_x86_pmu_capability(&x86_pmu); + pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters, - INTEL_PMC_MAX_GENERIC); + x86_pmu.num_counters_gp); pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1; pmu->available_event_types = ~entry->ebx & ((1ull << eax.split.mask_length) - 1); @@ -294,7 +311,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) } else { pmu->nr_arch_fixed_counters = min_t(int, edx.split.num_counters_fixed, - INTEL_PMC_MAX_FIXED); + x86_pmu.num_counters_fixed); pmu->counter_bitmask[KVM_PMC_FIXED] = ((u64)1 << edx.split.bit_width_fixed) - 1; } @@ -314,6 +331,13 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED; + + bitmap_set(pmu->all_valid_pmc_idx, + 0, pmu->nr_arch_gp_counters); + bitmap_set(pmu->all_valid_pmc_idx, + INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters); + + nested_vmx_pmu_entry_exit_ctls_update(vcpu); } static void intel_pmu_init(struct kvm_vcpu *vcpu) @@ -325,12 +349,14 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu) pmu->gp_counters[i].type = KVM_PMC_GP; pmu->gp_counters[i].vcpu = vcpu; pmu->gp_counters[i].idx = i; + pmu->gp_counters[i].current_config = 0; } for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) { pmu->fixed_counters[i].type = KVM_PMC_FIXED; pmu->fixed_counters[i].vcpu = vcpu; pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED; + pmu->fixed_counters[i].current_config = 0; } } @@ -363,8 +389,9 @@ struct kvm_pmu_ops intel_pmu_ops = { .find_fixed_event = intel_find_fixed_event, .pmc_is_enabled = intel_pmc_is_enabled, .pmc_idx_to_pmc = intel_pmc_idx_to_pmc, + .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc, .msr_idx_to_pmc = intel_msr_idx_to_pmc, - .is_valid_msr_idx = intel_is_valid_msr_idx, + .is_valid_rdpmc_ecx = intel_is_valid_rdpmc_ecx, .is_valid_msr = intel_is_valid_msr, .get_msr = intel_pmu_get_msr, .set_msr = intel_pmu_set_msr, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d4575ffb3cec..d175429c91b0 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -106,8 +106,6 @@ module_param(enable_apicv, bool, S_IRUGO); static bool __read_mostly nested = 1; module_param(nested, bool, S_IRUGO); -static u64 __read_mostly host_xss; - bool __read_mostly enable_pml = 1; module_param_named(pml, enable_pml, bool, S_IRUGO); @@ -209,6 +207,11 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf) struct page *page; unsigned int i; + if (!boot_cpu_has_bug(X86_BUG_L1TF)) { + l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED; + return 0; + } + if (!enable_ept) { l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED; return 0; @@ -445,6 +448,7 @@ const u32 vmx_msr_index[] = { MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, #endif MSR_EFER, MSR_TSC_AUX, MSR_STAR, + MSR_IA32_TSX_CTRL, }; #if IS_ENABLED(CONFIG_HYPERV) @@ -633,6 +637,23 @@ struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) return NULL; } +static int vmx_set_guest_msr(struct vcpu_vmx *vmx, struct shared_msr_entry *msr, u64 data) +{ + int ret = 0; + + u64 old_msr_data = msr->data; + msr->data = data; + if (msr - vmx->guest_msrs < vmx->save_nmsrs) { + preempt_disable(); + ret = kvm_set_shared_msr(msr->index, msr->data, + msr->mask); + preempt_enable(); + if (ret) + msr->data = old_msr_data; + } + return ret; +} + void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) { vmcs_clear(loaded_vmcs->vmcs); @@ -721,8 +742,8 @@ static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg, bool ret; u32 mask = 1 << (seg * SEG_FIELD_NR + field); - if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) { - vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS); + if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) { + kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS); vmx->segment_cache.bitmask = 0; } ret = vmx->segment_cache.bitmask & mask; @@ -830,7 +851,7 @@ static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, vm_exit_controls_clearbit(vmx, exit); } -static int find_msr(struct vmx_msrs *m, unsigned int msr) +int vmx_find_msr_index(struct vmx_msrs *m, u32 msr) { unsigned int i; @@ -864,7 +885,7 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) } break; } - i = find_msr(&m->guest, msr); + i = vmx_find_msr_index(&m->guest, msr); if (i < 0) goto skip_guest; --m->guest.nr; @@ -872,7 +893,7 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); skip_guest: - i = find_msr(&m->host, msr); + i = vmx_find_msr_index(&m->host, msr); if (i < 0) return; @@ -931,12 +952,12 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, wrmsrl(MSR_IA32_PEBS_ENABLE, 0); } - i = find_msr(&m->guest, msr); + i = vmx_find_msr_index(&m->guest, msr); if (!entry_only) - j = find_msr(&m->host, msr); + j = vmx_find_msr_index(&m->host, msr); - if ((i < 0 && m->guest.nr == NR_AUTOLOAD_MSRS) || - (j < 0 && m->host.nr == NR_AUTOLOAD_MSRS)) { + if ((i < 0 && m->guest.nr == NR_LOADSTORE_MSRS) || + (j < 0 && m->host.nr == NR_LOADSTORE_MSRS)) { printk_once(KERN_WARNING "Not enough msr switch entries. " "Can't add msr %x\n", msr); return; @@ -964,17 +985,9 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) u64 guest_efer = vmx->vcpu.arch.efer; u64 ignore_bits = 0; - if (!enable_ept) { - /* - * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing - * host CPUID is more efficient than testing guest CPUID - * or CR4. Host SMEP is anyway a requirement for guest SMEP. - */ - if (boot_cpu_has(X86_FEATURE_SMEP)) - guest_efer |= EFER_NX; - else if (!(guest_efer & EFER_NX)) - ignore_bits |= EFER_NX; - } + /* Shadow paging assumes NX to be available. */ + if (!enable_ept) + guest_efer |= EFER_NX; /* * LMA and LME handled by hardware; SCE meaningless outside long mode. @@ -1271,6 +1284,18 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu) return; + /* + * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change + * PI.NDST: pi_post_block is the one expected to change PID.NDST and the + * wakeup handler expects the vCPU to be on the blocked_vcpu_list that + * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up + * correctly. + */ + if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) { + pi_clear_sn(pi_desc); + goto after_clear_sn; + } + /* The full case. */ do { old.control = new.control = pi_desc->control; @@ -1286,6 +1311,8 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) } while (cmpxchg64(&pi_desc->control, old.control, new.control) != old.control); +after_clear_sn: + /* * Clear SN before reading the bitmap. The VT-d firmware * writes the bitmap and reads SN atomically (5.2.3 in the @@ -1294,7 +1321,7 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) */ smp_mb__after_atomic(); - if (!bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS)) + if (!pi_is_pir_empty(pi_desc)) pi_set_on(pi_desc); } @@ -1407,35 +1434,44 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long rflags, save_rflags; - if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) { - __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); + if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) { + kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); rflags = vmcs_readl(GUEST_RFLAGS); - if (to_vmx(vcpu)->rmode.vm86_active) { + if (vmx->rmode.vm86_active) { rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; - save_rflags = to_vmx(vcpu)->rmode.save_rflags; + save_rflags = vmx->rmode.save_rflags; rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; } - to_vmx(vcpu)->rflags = rflags; + vmx->rflags = rflags; } - return to_vmx(vcpu)->rflags; + return vmx->rflags; } void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { - unsigned long old_rflags = vmx_get_rflags(vcpu); + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long old_rflags; - __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); - to_vmx(vcpu)->rflags = rflags; - if (to_vmx(vcpu)->rmode.vm86_active) { - to_vmx(vcpu)->rmode.save_rflags = rflags; + if (enable_unrestricted_guest) { + kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS); + vmx->rflags = rflags; + vmcs_writel(GUEST_RFLAGS, rflags); + return; + } + + old_rflags = vmx_get_rflags(vcpu); + vmx->rflags = rflags; + if (vmx->rmode.vm86_active) { + vmx->rmode.save_rflags = rflags; rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; } vmcs_writel(GUEST_RFLAGS, rflags); - if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM) - to_vmx(vcpu)->emulation_required = emulation_required(vcpu); + if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM) + vmx->emulation_required = emulation_required(vcpu); } u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) @@ -1672,6 +1708,9 @@ static void setup_msrs(struct vcpu_vmx *vmx) index = __find_msr_index(vmx, MSR_TSC_AUX); if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP)) move_msr_up(vmx, index, save_nmsrs++); + index = __find_msr_index(vmx, MSR_IA32_TSX_CTRL); + if (index >= 0) + move_msr_up(vmx, index, save_nmsrs++); vmx->save_nmsrs = save_nmsrs; vmx->guest_msrs_ready = false; @@ -1771,6 +1810,11 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) #endif case MSR_EFER: return kvm_get_msr_common(vcpu, msr_info); + case MSR_IA32_TSX_CTRL: + if (!msr_info->host_initiated && + !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) + return 1; + goto find_shared_msr; case MSR_IA32_UMWAIT_CONTROL: if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx)) return 1; @@ -1815,14 +1859,6 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index, &msr_info->data); - case MSR_IA32_XSS: - if (!vmx_xsaves_supported() || - (!msr_info->host_initiated && - !(guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && - guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)))) - return 1; - msr_info->data = vcpu->arch.ia32_xss; - break; case MSR_IA32_RTIT_CTL: if (pt_mode != PT_MODE_HOST_GUEST) return 1; @@ -1873,8 +1909,9 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) return 1; - /* Else, falls through */ + goto find_shared_msr; default: + find_shared_msr: msr = find_msr_entry(vmx, msr_info->index); if (msr) { msr_info->data = msr->data; @@ -1990,6 +2027,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); break; + case MSR_IA32_TSX_CTRL: + if (!msr_info->host_initiated && + !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR)) + return 1; + if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR)) + return 1; + goto find_shared_msr; case MSR_IA32_PRED_CMD: if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) @@ -2058,25 +2102,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!nested_vmx_allowed(vcpu)) return 1; return vmx_set_vmx_msr(vcpu, msr_index, data); - case MSR_IA32_XSS: - if (!vmx_xsaves_supported() || - (!msr_info->host_initiated && - !(guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && - guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)))) - return 1; - /* - * The only supported bit as of Skylake is bit 8, but - * it is not supported on KVM. - */ - if (data != 0) - return 1; - vcpu->arch.ia32_xss = data; - if (vcpu->arch.ia32_xss != host_xss) - add_atomic_switch_msr(vmx, MSR_IA32_XSS, - vcpu->arch.ia32_xss, host_xss, false); - else - clear_atomic_switch_msr(vmx, MSR_IA32_XSS); - break; case MSR_IA32_RTIT_CTL: if ((pt_mode != PT_MODE_HOST_GUEST) || vmx_rtit_ctl_check(vcpu, data) || @@ -2141,23 +2166,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) /* Check reserved bit, higher 32 bits should be zero */ if ((data >> 32) != 0) return 1; - /* Else, falls through */ + goto find_shared_msr; + default: + find_shared_msr: msr = find_msr_entry(vmx, msr_index); - if (msr) { - u64 old_msr_data = msr->data; - msr->data = data; - if (msr - vmx->guest_msrs < vmx->save_nmsrs) { - preempt_disable(); - ret = kvm_set_shared_msr(msr->index, msr->data, - msr->mask); - preempt_enable(); - if (ret) - msr->data = old_msr_data; - } - break; - } - ret = kvm_set_msr_common(vcpu, msr_info); + if (msr) + ret = vmx_set_guest_msr(vmx, msr, data); + else + ret = kvm_set_msr_common(vcpu, msr_info); } return ret; @@ -2165,7 +2182,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { - __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); + kvm_register_mark_available(vcpu, reg); + switch (reg) { case VCPU_REGS_RSP: vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); @@ -2177,7 +2195,12 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) if (enable_ept) ept_save_pdptrs(vcpu); break; + case VCPU_EXREG_CR3: + if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu))) + vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); + break; default: + WARN_ON_ONCE(1); break; } } @@ -2848,13 +2871,6 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; } -static void vmx_decache_cr3(struct kvm_vcpu *vcpu) -{ - if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu))) - vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); -} - static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; @@ -2867,8 +2883,7 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; - if (!test_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_dirty)) + if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR)) return; if (is_pae_paging(vcpu)) { @@ -2890,10 +2905,7 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu) mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); } - __set_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_avail); - __set_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_dirty); + kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); } static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, @@ -2902,8 +2914,8 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) - vmx_decache_cr3(vcpu); + if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) + vmx_cache_reg(vcpu, VCPU_EXREG_CR3); if (!(cr0 & X86_CR0_PG)) { /* From paging/starting to nonpaging */ exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING | @@ -2984,6 +2996,7 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { struct kvm *kvm = vcpu->kvm; + bool update_guest_cr3 = true; unsigned long guest_cr3; u64 eptp; @@ -3000,15 +3013,20 @@ void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock); } - if (enable_unrestricted_guest || is_paging(vcpu) || - is_guest_mode(vcpu)) - guest_cr3 = kvm_read_cr3(vcpu); - else + /* Loading vmcs02.GUEST_CR3 is handled by nested VM-Enter. */ + if (is_guest_mode(vcpu)) + update_guest_cr3 = false; + else if (!enable_unrestricted_guest && !is_paging(vcpu)) guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr; + else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) + guest_cr3 = vcpu->arch.cr3; + else /* vmcs01.GUEST_CR3 is already up-to-date. */ + update_guest_cr3 = false; ept_load_pdptrs(vcpu); } - vmcs_writel(GUEST_CR3, guest_cr3); + if (update_guest_cr3) + vmcs_writel(GUEST_CR3, guest_cr3); } int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -3742,7 +3760,7 @@ void pt_update_intercept_for_msr(struct vcpu_vmx *vmx) } } -static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu) +static bool vmx_get_enable_apicv(struct kvm *kvm) { return enable_apicv; } @@ -4035,6 +4053,8 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && guest_cpuid_has(vcpu, X86_FEATURE_XSAVES); + vcpu->arch.xsaves_enabled = xsaves_enabled; + if (!xsaves_enabled) exec_control &= ~SECONDARY_EXEC_XSAVES; @@ -4147,14 +4167,13 @@ static void ept_set_mmio_spte_mask(void) #define VMX_XSS_EXIT_BITMAP 0 /* - * Sets up the vmcs for emulated real mode. + * Noting that the initialization of Guest-state Area of VMCS is in + * vmx_vcpu_reset(). */ -static void vmx_vcpu_setup(struct vcpu_vmx *vmx) +static void init_vmcs(struct vcpu_vmx *vmx) { - int i; - if (nested) - nested_vmx_vcpu_setup(); + nested_vmx_set_vmcs_shadowing_bitmap(); if (cpu_has_vmx_msr_bitmap()) vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap)); @@ -4163,7 +4182,6 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) /* Control */ pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx)); - vmx->hv_deadline_tsc = -1; exec_controls_set(vmx, vmx_exec_control(vmx)); @@ -4212,21 +4230,6 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); - for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) { - u32 index = vmx_msr_index[i]; - u32 data_low, data_high; - int j = vmx->nmsrs; - - if (rdmsr_safe(index, &data_low, &data_high) < 0) - continue; - if (wrmsr_safe(index, data_low, data_high) < 0) - continue; - vmx->guest_msrs[j].index = i; - vmx->guest_msrs[j].data = 0; - vmx->guest_msrs[j].mask = -1ull; - ++vmx->nmsrs; - } - vm_exit_controls_set(vmx, vmx_vmexit_ctrl()); /* 22.2.1, 20.8.1 */ @@ -4237,6 +4240,9 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) set_cr4_guest_host_mask(vmx); + if (vmx->vpid != 0) + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); + if (vmx_xsaves_supported()) vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); @@ -4339,9 +4345,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - if (vmx->vpid != 0) - vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); - cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; vmx->vcpu.arch.cr0 = cr0; vmx_set_cr0(vcpu, cr0); /* enter rmode */ @@ -4696,7 +4699,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) return 0; } -static int handle_external_interrupt(struct kvm_vcpu *vcpu) +static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu) { ++vcpu->stat.irq_exits; return 1; @@ -4968,21 +4971,6 @@ static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) vmcs_writel(GUEST_DR7, val); } -static int handle_cpuid(struct kvm_vcpu *vcpu) -{ - return kvm_emulate_cpuid(vcpu); -} - -static int handle_rdmsr(struct kvm_vcpu *vcpu) -{ - return kvm_emulate_rdmsr(vcpu); -} - -static int handle_wrmsr(struct kvm_vcpu *vcpu) -{ - return kvm_emulate_wrmsr(vcpu); -} - static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) { kvm_apic_update_ppr(vcpu); @@ -4999,11 +4987,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu) return 1; } -static int handle_halt(struct kvm_vcpu *vcpu) -{ - return kvm_emulate_halt(vcpu); -} - static int handle_vmcall(struct kvm_vcpu *vcpu) { return kvm_emulate_hypercall(vcpu); @@ -5538,14 +5521,6 @@ static int handle_encls(struct kvm_vcpu *vcpu) return 1; } -static int handle_unexpected_vmexit(struct kvm_vcpu *vcpu) -{ - kvm_skip_emulated_instruction(vcpu); - WARN_ONCE(1, "Unexpected VM-Exit Reason = 0x%x", - vmcs_read32(VM_EXIT_REASON)); - return 1; -} - /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -5559,11 +5534,11 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_IO_INSTRUCTION] = handle_io, [EXIT_REASON_CR_ACCESS] = handle_cr, [EXIT_REASON_DR_ACCESS] = handle_dr, - [EXIT_REASON_CPUID] = handle_cpuid, - [EXIT_REASON_MSR_READ] = handle_rdmsr, - [EXIT_REASON_MSR_WRITE] = handle_wrmsr, + [EXIT_REASON_CPUID] = kvm_emulate_cpuid, + [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr, + [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr, [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, - [EXIT_REASON_HLT] = handle_halt, + [EXIT_REASON_HLT] = kvm_emulate_halt, [EXIT_REASON_INVD] = handle_invd, [EXIT_REASON_INVLPG] = handle_invlpg, [EXIT_REASON_RDPMC] = handle_rdpmc, @@ -5597,15 +5572,11 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_INVVPID] = handle_vmx_instruction, [EXIT_REASON_RDRAND] = handle_invalid_op, [EXIT_REASON_RDSEED] = handle_invalid_op, - [EXIT_REASON_XSAVES] = handle_unexpected_vmexit, - [EXIT_REASON_XRSTORS] = handle_unexpected_vmexit, [EXIT_REASON_PML_FULL] = handle_pml_full, [EXIT_REASON_INVPCID] = handle_invpcid, [EXIT_REASON_VMFUNC] = handle_vmx_instruction, [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer, [EXIT_REASON_ENCLS] = handle_encls, - [EXIT_REASON_UMWAIT] = handle_unexpected_vmexit, - [EXIT_REASON_TPAUSE] = handle_unexpected_vmexit, }; static const int kvm_vmx_max_exit_handlers = @@ -5940,9 +5911,23 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) } if (exit_reason < kvm_vmx_max_exit_handlers - && kvm_vmx_exit_handlers[exit_reason]) + && kvm_vmx_exit_handlers[exit_reason]) { +#ifdef CONFIG_RETPOLINE + if (exit_reason == EXIT_REASON_MSR_WRITE) + return kvm_emulate_wrmsr(vcpu); + else if (exit_reason == EXIT_REASON_PREEMPTION_TIMER) + return handle_preemption_timer(vcpu); + else if (exit_reason == EXIT_REASON_PENDING_INTERRUPT) + return handle_interrupt_window(vcpu); + else if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) + return handle_external_interrupt(vcpu); + else if (exit_reason == EXIT_REASON_HLT) + return kvm_emulate_halt(vcpu); + else if (exit_reason == EXIT_REASON_EPT_MISCONFIG) + return handle_ept_misconfig(vcpu); +#endif return kvm_vmx_exit_handlers[exit_reason](vcpu); - else { + } else { vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n", exit_reason); dump_vmcs(); @@ -6028,17 +6013,17 @@ static void vmx_l1d_flush(struct kvm_vcpu *vcpu) static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + int tpr_threshold; if (is_guest_mode(vcpu) && nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) return; - if (irr == -1 || tpr < irr) { - vmcs_write32(TPR_THRESHOLD, 0); - return; - } - - vmcs_write32(TPR_THRESHOLD, irr); + tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr; + if (is_guest_mode(vcpu)) + to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold; + else + vmcs_write32(TPR_THRESHOLD, tpr_threshold); } void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) @@ -6152,7 +6137,7 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) if (pi_test_on(&vmx->pi_desc)) { pi_clear_on(&vmx->pi_desc); /* - * IOMMU can write to PIR.ON, so the barrier matters even on UP. + * IOMMU can write to PID.ON, so the barrier matters even on UP. * But on x86 this is just a compiler barrier anyway. */ smp_mb__after_atomic(); @@ -6182,7 +6167,10 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu) { - return pi_test_on(vcpu_to_pi_desc(vcpu)); + struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); + + return pi_test_on(pi_desc) || + (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc)); } static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) @@ -6512,9 +6500,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vmx->nested.need_vmcs12_to_shadow_sync) nested_sync_vmcs12_to_shadow(vcpu); - if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) + if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP)) vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); - if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) + if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP)) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); cr3 = __get_current_cr3_fast(); @@ -6537,7 +6525,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) vmx_set_interrupt_shadow(vcpu, 0); - kvm_load_guest_xcr0(vcpu); + kvm_load_guest_xsave_state(vcpu); if (static_cpu_has(X86_FEATURE_PKU) && kvm_read_cr4_bits(vcpu, X86_CR4_PKE) && @@ -6644,7 +6632,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) __write_pkru(vmx->host_pkru); } - kvm_put_guest_xcr0(vcpu); + kvm_load_host_xsave_state(vcpu); vmx->nested.nested_run_pending = 0; vmx->idt_vectoring_info = 0; @@ -6698,7 +6686,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) int err; struct vcpu_vmx *vmx; unsigned long *msr_bitmap; - int cpu; + int i, cpu; BUILD_BUG_ON_MSG(offsetof(struct vcpu_vmx, vcpu) != 0, "struct kvm_vcpu must be at offset 0 for arch usercopy region"); @@ -6750,6 +6738,34 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (!vmx->guest_msrs) goto free_pml; + for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) { + u32 index = vmx_msr_index[i]; + u32 data_low, data_high; + int j = vmx->nmsrs; + + if (rdmsr_safe(index, &data_low, &data_high) < 0) + continue; + if (wrmsr_safe(index, data_low, data_high) < 0) + continue; + + vmx->guest_msrs[j].index = i; + vmx->guest_msrs[j].data = 0; + switch (index) { + case MSR_IA32_TSX_CTRL: + /* + * No need to pass TSX_CTRL_CPUID_CLEAR through, so + * let's avoid changing CPUID bits under the host + * kernel's feet. + */ + vmx->guest_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR; + break; + default: + vmx->guest_msrs[j].mask = -1ull; + break; + } + ++vmx->nmsrs; + } + err = alloc_loaded_vmcs(&vmx->vmcs01); if (err < 0) goto free_msrs; @@ -6774,7 +6790,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) cpu = get_cpu(); vmx_vcpu_load(&vmx->vcpu, cpu); vmx->vcpu.cpu = cpu; - vmx_vcpu_setup(vmx); + init_vmcs(vmx); vmx_vcpu_put(&vmx->vcpu); put_cpu(); if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) { @@ -6994,6 +7010,7 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP)); cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU)); cr4_fixed1_update(X86_CR4_UMIP, ecx, bit(X86_FEATURE_UMIP)); + cr4_fixed1_update(X86_CR4_LA57, ecx, bit(X86_FEATURE_LA57)); #undef cr4_fixed1_update } @@ -7088,6 +7105,9 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */ + vcpu->arch.xsaves_enabled = false; + if (cpu_has_secondary_exec_ctrls()) { vmx_compute_secondary_exec_control(vmx); vmcs_set_secondary_exec_control(vmx); @@ -7095,10 +7115,12 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) if (nested_vmx_allowed(vcpu)) to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= + FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; else to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= - ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; + ~(FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX | + FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX); if (nested_vmx_allowed(vcpu)) { nested_vmx_cr_fixed1_bits_update(vcpu); @@ -7108,6 +7130,15 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) if (boot_cpu_has(X86_FEATURE_INTEL_PT) && guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT)) update_intel_pt_cfg(vcpu); + + if (boot_cpu_has(X86_FEATURE_RTM)) { + struct shared_msr_entry *msr; + msr = find_msr_entry(vmx, MSR_IA32_TSX_CTRL); + if (msr) { + bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM); + vmx_set_guest_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE); + } + } } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) @@ -7596,9 +7627,6 @@ static __init int hardware_setup(void) WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost"); } - if (boot_cpu_has(X86_FEATURE_XSAVES)) - rdmsrl(MSR_IA32_XSS, host_xss); - if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) enable_vpid = 0; @@ -7779,7 +7807,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .get_cpl = vmx_get_cpl, .get_cs_db_l_bits = vmx_get_cs_db_l_bits, .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, - .decache_cr3 = vmx_decache_cr3, .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, .set_cr0 = vmx_set_cr0, .set_cr3 = vmx_set_cr3, @@ -7995,12 +8022,10 @@ static int __init vmx_init(void) * contain 'auto' which will be turned into the default 'cond' * mitigation mode. */ - if (boot_cpu_has(X86_BUG_L1TF)) { - r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); - if (r) { - vmx_exit(); - return r; - } + r = vmx_setup_l1d_flush(vmentry_l1d_flush_param); + if (r) { + vmx_exit(); + return r; } #ifdef CONFIG_KEXEC_CORE diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index bee16687dc0b..7c1b978b2df4 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -22,11 +22,11 @@ extern u32 get_umwait_control_msr(void); #define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) -#define NR_AUTOLOAD_MSRS 8 +#define NR_LOADSTORE_MSRS 8 struct vmx_msrs { unsigned int nr; - struct vmx_msr_entry val[NR_AUTOLOAD_MSRS]; + struct vmx_msr_entry val[NR_LOADSTORE_MSRS]; }; struct shared_msr_entry { @@ -167,6 +167,9 @@ struct nested_vmx { u64 vmcs01_debugctl; u64 vmcs01_guest_bndcfgs; + /* to migrate it to L1 if L2 writes to L1's CR8 directly */ + int l1_tpr_threshold; + u16 vpid02; u16 last_vpid; @@ -230,6 +233,10 @@ struct vcpu_vmx { struct vmx_msrs host; } msr_autoload; + struct msr_autostore { + struct vmx_msrs guest; + } msr_autostore; + struct { int vm86_active; ulong save_rflags; @@ -334,6 +341,7 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu); struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr); void pt_update_intercept_for_msr(struct vcpu_vmx *vmx); void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp); +int vmx_find_msr_index(struct vmx_msrs *m, u32 msr); #define POSTED_INTR_ON 0 #define POSTED_INTR_SN 1 @@ -355,6 +363,11 @@ static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc) return test_and_set_bit(vector, (unsigned long *)pi_desc->pir); } +static inline bool pi_is_pir_empty(struct pi_desc *pi_desc) +{ + return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS); +} + static inline void pi_set_sn(struct pi_desc *pi_desc) { set_bit(POSTED_INTR_SN, @@ -373,6 +386,12 @@ static inline void pi_clear_on(struct pi_desc *pi_desc) (unsigned long *)&pi_desc->control); } +static inline void pi_clear_sn(struct pi_desc *pi_desc) +{ + clear_bit(POSTED_INTR_SN, + (unsigned long *)&pi_desc->control); +} + static inline int pi_test_on(struct pi_desc *pi_desc) { return test_bit(POSTED_INTR_ON, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0ed07d8d2caa..3ed167e039e5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -92,8 +92,8 @@ u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #endif -#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM -#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU +#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__ +#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__ #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \ KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) @@ -176,6 +176,8 @@ struct kvm_shared_msrs { static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; static struct kvm_shared_msrs __percpu *shared_msrs; +static u64 __read_mostly host_xss; + struct kvm_stats_debugfs_item debugfs_entries[] = { { "pf_fixed", VCPU_STAT(pf_fixed) }, { "pf_guest", VCPU_STAT(pf_guest) }, @@ -212,7 +214,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, { "mmu_unsync", VM_STAT(mmu_unsync) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, - { "largepages", VM_STAT(lpages) }, + { "largepages", VM_STAT(lpages, .mode = 0444) }, + { "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) }, { "max_mmu_page_hash_collisions", VM_STAT(max_mmu_page_hash_collisions) }, { NULL } @@ -259,23 +262,6 @@ static void kvm_on_user_return(struct user_return_notifier *urn) } } -static void shared_msr_update(unsigned slot, u32 msr) -{ - u64 value; - unsigned int cpu = smp_processor_id(); - struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); - - /* only read, and nobody should modify it at this time, - * so don't need lock */ - if (slot >= shared_msrs_global.nr) { - printk(KERN_ERR "kvm: invalid MSR slot!"); - return; - } - rdmsrl_safe(msr, &value); - smsr->values[slot].host = value; - smsr->values[slot].curr = value; -} - void kvm_define_shared_msr(unsigned slot, u32 msr) { BUG_ON(slot >= KVM_NR_SHARED_MSRS); @@ -287,10 +273,16 @@ EXPORT_SYMBOL_GPL(kvm_define_shared_msr); static void kvm_shared_msr_cpu_online(void) { - unsigned i; + unsigned int cpu = smp_processor_id(); + struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); + u64 value; + int i; - for (i = 0; i < shared_msrs_global.nr; ++i) - shared_msr_update(i, shared_msrs_global.msrs[i]); + for (i = 0; i < shared_msrs_global.nr; ++i) { + rdmsrl_safe(shared_msrs_global.msrs[i], &value); + smsr->values[i].host = value; + smsr->values[i].curr = value; + } } int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) @@ -299,13 +291,14 @@ int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); int err; - if (((value ^ smsr->values[slot].curr) & mask) == 0) + value = (value & mask) | (smsr->values[slot].host & ~mask); + if (value == smsr->values[slot].curr) return 0; - smsr->values[slot].curr = value; err = wrmsrl_safe(shared_msrs_global.msrs[slot], value); if (err) return 1; + smsr->values[slot].curr = value; if (!smsr->registered) { smsr->urn.on_user_return = kvm_on_user_return; user_return_notifier_register(&smsr->urn); @@ -360,8 +353,7 @@ EXPORT_SYMBOL_GPL(kvm_set_apic_base); asmlinkage __visible void kvm_spurious_fault(void) { /* Fault while not rebooting. We want the trace. */ - if (!kvm_rebooting) - BUG(); + BUG_ON(!kvm_rebooting); } EXPORT_SYMBOL_GPL(kvm_spurious_fault); @@ -709,10 +701,8 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) ret = 1; memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); - __set_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_avail); - __set_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_dirty); + kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); + out: return ret; @@ -722,7 +712,6 @@ EXPORT_SYMBOL_GPL(load_pdptrs); bool pdptrs_changed(struct kvm_vcpu *vcpu) { u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)]; - bool changed = true; int offset; gfn_t gfn; int r; @@ -730,8 +719,7 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu) if (!is_pae_paging(vcpu)) return false; - if (!test_bit(VCPU_EXREG_PDPTR, - (unsigned long *)&vcpu->arch.regs_avail)) + if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR)) return true; gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT; @@ -739,11 +727,9 @@ bool pdptrs_changed(struct kvm_vcpu *vcpu) r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), PFERR_USER_MASK | PFERR_WRITE_MASK); if (r < 0) - goto out; - changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0; -out: + return true; - return changed; + return memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0; } EXPORT_SYMBOL_GPL(pdptrs_changed); @@ -812,27 +798,34 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) } EXPORT_SYMBOL_GPL(kvm_lmsw); -void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) +void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) { - if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && - !vcpu->guest_xcr0_loaded) { - /* kvm_set_xcr() also depends on this */ + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { + if (vcpu->arch.xcr0 != host_xcr0) xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); - vcpu->guest_xcr0_loaded = 1; + + if (vcpu->arch.xsaves_enabled && + vcpu->arch.ia32_xss != host_xss) + wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); } } -EXPORT_SYMBOL_GPL(kvm_load_guest_xcr0); +EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state); -void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) +void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) { - if (vcpu->guest_xcr0_loaded) { + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { + if (vcpu->arch.xcr0 != host_xcr0) xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); - vcpu->guest_xcr0_loaded = 0; + + if (vcpu->arch.xsaves_enabled && + vcpu->arch.ia32_xss != host_xss) + wrmsrl(MSR_IA32_XSS, host_xss); } + } -EXPORT_SYMBOL_GPL(kvm_put_guest_xcr0); +EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state); static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { @@ -885,34 +878,42 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) } EXPORT_SYMBOL_GPL(kvm_set_xcr); -int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - unsigned long old_cr4 = kvm_read_cr4(vcpu); - unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | - X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE; - if (cr4 & CR4_RESERVED_BITS) - return 1; + return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE)) - return 1; + return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP)) - return 1; + return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP)) - return 1; + return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE)) - return 1; + return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE)) - return 1; + return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57)) - return 1; + return -EINVAL; if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP)) + return -EINVAL; + + return 0; +} + +int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + unsigned long old_cr4 = kvm_read_cr4(vcpu); + unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | + X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE; + + if (kvm_valid_cr4(vcpu, cr4)) return 1; if (is_long_mode(vcpu)) { @@ -976,7 +977,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush); vcpu->arch.cr3 = cr3; - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); return 0; } @@ -1125,13 +1126,15 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc); * List of msr numbers which we expose to userspace through KVM_GET_MSRS * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. * - * This list is modified at module load time to reflect the + * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) + * extract the supported MSRs from the related const lists. + * msrs_to_save is selected from the msrs_to_save_all to reflect the * capabilities of the host cpu. This capabilities test skips MSRs that are - * kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs + * kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs * may depend on host virtualization features rather than host cpu features. */ -static u32 msrs_to_save[] = { +static const u32 msrs_to_save_all[] = { MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_STAR, #ifdef CONFIG_X86_64 @@ -1161,13 +1164,6 @@ static u32 msrs_to_save[] = { MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13, MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15, MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17, - MSR_ARCH_PERFMON_PERFCTR0 + 18, MSR_ARCH_PERFMON_PERFCTR0 + 19, - MSR_ARCH_PERFMON_PERFCTR0 + 20, MSR_ARCH_PERFMON_PERFCTR0 + 21, - MSR_ARCH_PERFMON_PERFCTR0 + 22, MSR_ARCH_PERFMON_PERFCTR0 + 23, - MSR_ARCH_PERFMON_PERFCTR0 + 24, MSR_ARCH_PERFMON_PERFCTR0 + 25, - MSR_ARCH_PERFMON_PERFCTR0 + 26, MSR_ARCH_PERFMON_PERFCTR0 + 27, - MSR_ARCH_PERFMON_PERFCTR0 + 28, MSR_ARCH_PERFMON_PERFCTR0 + 29, - MSR_ARCH_PERFMON_PERFCTR0 + 30, MSR_ARCH_PERFMON_PERFCTR0 + 31, MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1, MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3, MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5, @@ -1177,18 +1173,12 @@ static u32 msrs_to_save[] = { MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13, MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15, MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17, - MSR_ARCH_PERFMON_EVENTSEL0 + 18, MSR_ARCH_PERFMON_EVENTSEL0 + 19, - MSR_ARCH_PERFMON_EVENTSEL0 + 20, MSR_ARCH_PERFMON_EVENTSEL0 + 21, - MSR_ARCH_PERFMON_EVENTSEL0 + 22, MSR_ARCH_PERFMON_EVENTSEL0 + 23, - MSR_ARCH_PERFMON_EVENTSEL0 + 24, MSR_ARCH_PERFMON_EVENTSEL0 + 25, - MSR_ARCH_PERFMON_EVENTSEL0 + 26, MSR_ARCH_PERFMON_EVENTSEL0 + 27, - MSR_ARCH_PERFMON_EVENTSEL0 + 28, MSR_ARCH_PERFMON_EVENTSEL0 + 29, - MSR_ARCH_PERFMON_EVENTSEL0 + 30, MSR_ARCH_PERFMON_EVENTSEL0 + 31, }; +static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)]; static unsigned num_msrs_to_save; -static u32 emulated_msrs[] = { +static const u32 emulated_msrs_all[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, @@ -1227,7 +1217,7 @@ static u32 emulated_msrs[] = { * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs. * We always support the "true" VMX control MSRs, even if the host * processor does not, so I am putting these registers here rather - * than in msrs_to_save. + * than in msrs_to_save_all. */ MSR_IA32_VMX_BASIC, MSR_IA32_VMX_TRUE_PINBASED_CTLS, @@ -1246,13 +1236,14 @@ static u32 emulated_msrs[] = { MSR_KVM_POLL_CONTROL, }; +static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)]; static unsigned num_emulated_msrs; /* * List of msr numbers which are used to expose MSR-based features that * can be used by a hypervisor to validate requested CPU features. */ -static u32 msr_based_features[] = { +static const u32 msr_based_features_all[] = { MSR_IA32_VMX_BASIC, MSR_IA32_VMX_TRUE_PINBASED_CTLS, MSR_IA32_VMX_PINBASED_CTLS, @@ -1277,6 +1268,7 @@ static u32 msr_based_features[] = { MSR_IA32_ARCH_CAPABILITIES, }; +static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)]; static unsigned int num_msr_based_features; static u64 kvm_get_arch_capabilities(void) @@ -1287,6 +1279,14 @@ static u64 kvm_get_arch_capabilities(void) rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data); /* + * If nx_huge_pages is enabled, KVM's shadow paging will ensure that + * the nested hypervisor runs with NX huge pages. If it is not, + * L1 is anyway vulnerable to ITLB_MULTIHIT explots from other + * L1 guests, so it need not worry about its own (L2) guests. + */ + data |= ARCH_CAP_PSCHANGE_MC_NO; + + /* * If we're doing cache flushes (either "always" or "cond") * we will do one whenever the guest does a vmlaunch/vmresume. * If an outer hypervisor is doing the cache flush for us @@ -1305,6 +1305,17 @@ static u64 kvm_get_arch_capabilities(void) if (!boot_cpu_has_bug(X86_BUG_MDS)) data |= ARCH_CAP_MDS_NO; + /* + * On TAA affected systems: + * - nothing to do if TSX is disabled on the host. + * - we emulate TSX_CTRL if present on the host. + * This lets the guest use VERW to clear CPU buffers. + */ + if (!boot_cpu_has(X86_FEATURE_RTM)) + data &= ~(ARCH_CAP_TAA_NO | ARCH_CAP_TSX_CTRL_MSR); + else if (!boot_cpu_has_bug(X86_BUG_TAA)) + data |= ARCH_CAP_TAA_NO; + return data; } @@ -1451,8 +1462,8 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ -static int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, - bool host_initiated) +int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, + bool host_initiated) { struct msr_data msr; int ret; @@ -1527,20 +1538,25 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) } #ifdef CONFIG_X86_64 +struct pvclock_clock { + int vclock_mode; + u64 cycle_last; + u64 mask; + u32 mult; + u32 shift; +}; + struct pvclock_gtod_data { seqcount_t seq; - struct { /* extract of a clocksource struct */ - int vclock_mode; - u64 cycle_last; - u64 mask; - u32 mult; - u32 shift; - } clock; + struct pvclock_clock clock; /* extract of a clocksource struct */ + struct pvclock_clock raw_clock; /* extract of a clocksource struct */ + u64 boot_ns_raw; u64 boot_ns; u64 nsec_base; u64 wall_time_sec; + u64 monotonic_raw_nsec; }; static struct pvclock_gtod_data pvclock_gtod_data; @@ -1548,9 +1564,10 @@ static struct pvclock_gtod_data pvclock_gtod_data; static void update_pvclock_gtod(struct timekeeper *tk) { struct pvclock_gtod_data *vdata = &pvclock_gtod_data; - u64 boot_ns; + u64 boot_ns, boot_ns_raw; boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot)); + boot_ns_raw = ktime_to_ns(ktime_add(tk->tkr_raw.base, tk->offs_boot)); write_seqcount_begin(&vdata->seq); @@ -1561,11 +1578,20 @@ static void update_pvclock_gtod(struct timekeeper *tk) vdata->clock.mult = tk->tkr_mono.mult; vdata->clock.shift = tk->tkr_mono.shift; + vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->archdata.vclock_mode; + vdata->raw_clock.cycle_last = tk->tkr_raw.cycle_last; + vdata->raw_clock.mask = tk->tkr_raw.mask; + vdata->raw_clock.mult = tk->tkr_raw.mult; + vdata->raw_clock.shift = tk->tkr_raw.shift; + vdata->boot_ns = boot_ns; vdata->nsec_base = tk->tkr_mono.xtime_nsec; vdata->wall_time_sec = tk->xtime_sec; + vdata->boot_ns_raw = boot_ns_raw; + vdata->monotonic_raw_nsec = tk->tkr_raw.xtime_nsec; + write_seqcount_end(&vdata->seq); } #endif @@ -1989,21 +2015,21 @@ static u64 read_tsc(void) return last; } -static inline u64 vgettsc(u64 *tsc_timestamp, int *mode) +static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp, + int *mode) { long v; - struct pvclock_gtod_data *gtod = &pvclock_gtod_data; u64 tsc_pg_val; - switch (gtod->clock.vclock_mode) { + switch (clock->vclock_mode) { case VCLOCK_HVCLOCK: tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(), tsc_timestamp); if (tsc_pg_val != U64_MAX) { /* TSC page valid */ *mode = VCLOCK_HVCLOCK; - v = (tsc_pg_val - gtod->clock.cycle_last) & - gtod->clock.mask; + v = (tsc_pg_val - clock->cycle_last) & + clock->mask; } else { /* TSC page invalid */ *mode = VCLOCK_NONE; @@ -2012,8 +2038,8 @@ static inline u64 vgettsc(u64 *tsc_timestamp, int *mode) case VCLOCK_TSC: *mode = VCLOCK_TSC; *tsc_timestamp = read_tsc(); - v = (*tsc_timestamp - gtod->clock.cycle_last) & - gtod->clock.mask; + v = (*tsc_timestamp - clock->cycle_last) & + clock->mask; break; default: *mode = VCLOCK_NONE; @@ -2022,10 +2048,10 @@ static inline u64 vgettsc(u64 *tsc_timestamp, int *mode) if (*mode == VCLOCK_NONE) *tsc_timestamp = v = 0; - return v * gtod->clock.mult; + return v * clock->mult; } -static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp) +static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp) { struct pvclock_gtod_data *gtod = &pvclock_gtod_data; unsigned long seq; @@ -2034,10 +2060,10 @@ static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp) do { seq = read_seqcount_begin(>od->seq); - ns = gtod->nsec_base; - ns += vgettsc(tsc_timestamp, &mode); + ns = gtod->monotonic_raw_nsec; + ns += vgettsc(>od->raw_clock, tsc_timestamp, &mode); ns >>= gtod->clock.shift; - ns += gtod->boot_ns; + ns += gtod->boot_ns_raw; } while (unlikely(read_seqcount_retry(>od->seq, seq))); *t = ns; @@ -2055,7 +2081,7 @@ static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp) seq = read_seqcount_begin(>od->seq); ts->tv_sec = gtod->wall_time_sec; ns = gtod->nsec_base; - ns += vgettsc(tsc_timestamp, &mode); + ns += vgettsc(>od->clock, tsc_timestamp, &mode); ns >>= gtod->clock.shift; } while (unlikely(read_seqcount_retry(>od->seq, seq))); @@ -2072,7 +2098,7 @@ static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp) if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode)) return false; - return gtod_is_based_on_tsc(do_monotonic_boot(kernel_ns, + return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns, tsc_timestamp)); } @@ -2543,6 +2569,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) static void kvmclock_reset(struct kvm_vcpu *vcpu) { vcpu->arch.pv_time_enabled = false; + vcpu->arch.time = 0; } static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) @@ -2694,6 +2721,20 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_TSC: kvm_write_tsc(vcpu, msr_info); break; + case MSR_IA32_XSS: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) + return 1; + /* + * We do support PT if kvm_x86_ops->pt_supported(), but we do + * not support IA32_XSS[bit 8]. Guests will have to use + * RDMSR/WRMSR rather than XSAVES/XRSTORS to save/restore PT + * MSRs. + */ + if (data != 0) + return 1; + vcpu->arch.ia32_xss = data; + break; case MSR_SMI_COUNT: if (!msr_info->host_initiated) return 1; @@ -2708,8 +2749,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_KVM_SYSTEM_TIME: { struct kvm_arch *ka = &vcpu->kvm->arch; - kvmclock_reset(vcpu); - if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) { bool tmp = (msr == MSR_KVM_SYSTEM_TIME); @@ -2723,14 +2762,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); /* we verify if the enable bit is set... */ + vcpu->arch.pv_time_enabled = false; if (!(data & 1)) break; - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, + if (!kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_time, data & ~1ULL, sizeof(struct pvclock_vcpu_time_info))) - vcpu->arch.pv_time_enabled = false; - else vcpu->arch.pv_time_enabled = true; break; @@ -3024,6 +3062,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: return get_msr_mce(vcpu, msr_info->index, &msr_info->data, msr_info->host_initiated); + case MSR_IA32_XSS: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) + return 1; + msr_info->data = vcpu->arch.ia32_xss; + break; case MSR_K7_CLK_CTL: /* * Provide expected ramp-up count for K7. All other @@ -3801,12 +3845,13 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; else vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK; - if (lapic_in_kernel(vcpu)) { - if (events->smi.latched_init) - set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); - else - clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); - } + } + + if (lapic_in_kernel(vcpu)) { + if (events->smi.latched_init) + set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); + else + clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); } } @@ -4397,6 +4442,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, case KVM_SET_NESTED_STATE: { struct kvm_nested_state __user *user_kvm_nested_state = argp; struct kvm_nested_state kvm_state; + int idx; r = -EINVAL; if (!kvm_x86_ops->set_nested_state) @@ -4420,7 +4466,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, && !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE)) break; + idx = srcu_read_lock(&vcpu->kvm->srcu); r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state); + srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } case KVM_GET_SUPPORTED_HV_CPUID: { @@ -4922,9 +4970,6 @@ set_identity_unlock: if (!irqchip_kernel(kvm)) goto set_irqchip_out; r = kvm_vm_ioctl_set_irqchip(kvm, chip); - if (r) - goto set_irqchip_out; - r = 0; set_irqchip_out: kfree(chip); break; @@ -5097,23 +5142,28 @@ out: static void kvm_init_msr_list(void) { + struct x86_pmu_capability x86_pmu; u32 dummy[2]; - unsigned i, j; + unsigned i; BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4, - "Please update the fixed PMCs in msrs_to_save[]"); - BUILD_BUG_ON_MSG(INTEL_PMC_MAX_GENERIC != 32, - "Please update the generic perfctr/eventsel MSRs in msrs_to_save[]"); + "Please update the fixed PMCs in msrs_to_saved_all[]"); + + perf_get_x86_pmu_capability(&x86_pmu); - for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { - if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) + num_msrs_to_save = 0; + num_emulated_msrs = 0; + num_msr_based_features = 0; + + for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) { + if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0) continue; /* * Even MSRs that are valid in the host may not be exposed * to the guests in some cases. */ - switch (msrs_to_save[i]) { + switch (msrs_to_save_all[i]) { case MSR_IA32_BNDCFGS: if (!kvm_mpx_supported()) continue; @@ -5141,43 +5191,43 @@ static void kvm_init_msr_list(void) break; case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: { if (!kvm_x86_ops->pt_supported() || - msrs_to_save[i] - MSR_IA32_RTIT_ADDR0_A >= + msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >= intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2) continue; break; + case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17: + if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >= + min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) + continue; + break; + case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17: + if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >= + min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp)) + continue; } default: break; } - if (j < i) - msrs_to_save[j] = msrs_to_save[i]; - j++; + msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i]; } - num_msrs_to_save = j; - for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) { - if (!kvm_x86_ops->has_emulated_msr(emulated_msrs[i])) + for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) { + if (!kvm_x86_ops->has_emulated_msr(emulated_msrs_all[i])) continue; - if (j < i) - emulated_msrs[j] = emulated_msrs[i]; - j++; + emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i]; } - num_emulated_msrs = j; - for (i = j = 0; i < ARRAY_SIZE(msr_based_features); i++) { + for (i = 0; i < ARRAY_SIZE(msr_based_features_all); i++) { struct kvm_msr_entry msr; - msr.index = msr_based_features[i]; + msr.index = msr_based_features_all[i]; if (kvm_get_msr_feature(&msr)) continue; - if (j < i) - msr_based_features[j] = msr_based_features[i]; - j++; + msr_based_features[num_msr_based_features++] = msr_based_features_all[i]; } - num_msr_based_features = j; } static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, @@ -6107,7 +6157,7 @@ static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase) static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt, u32 pmc) { - return kvm_pmu_is_valid_msr_idx(emul_to_vcpu(ctxt), pmc); + return kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc); } static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, @@ -7837,6 +7887,19 @@ static void process_smi(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); } +void kvm_make_scan_ioapic_request_mask(struct kvm *kvm, + unsigned long *vcpu_bitmap) +{ + cpumask_var_t cpus; + + zalloc_cpumask_var(&cpus, GFP_ATOMIC); + + kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC, + vcpu_bitmap, cpus); + + free_cpumask_var(cpus); +} + void kvm_make_scan_ioapic_request(struct kvm *kvm) { kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC); @@ -7914,7 +7977,6 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) */ put_page(page); } -EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page); void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu) { @@ -7937,8 +7999,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_immediate_exit = false; if (kvm_request_pending(vcpu)) { - if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) - kvm_x86_ops->get_vmcs12_pages(vcpu); + if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) { + if (unlikely(!kvm_x86_ops->get_vmcs12_pages(vcpu))) { + r = 0; + goto out; + } + } if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) kvm_mmu_unload(vcpu); if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) @@ -8669,8 +8735,12 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, mp_state->mp_state != KVM_MP_STATE_RUNNABLE) goto out; - /* INITs are latched while in SMM */ - if ((is_smm(vcpu) || vcpu->arch.smi_pending) && + /* + * KVM_MP_STATE_INIT_RECEIVED means the processor is in + * INIT state; latched init should be reported using + * KVM_SET_VCPU_EVENTS, so reject it here. + */ + if ((kvm_vcpu_latch_init(vcpu) || vcpu->arch.smi_pending) && (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED || mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED)) goto out; @@ -8714,10 +8784,6 @@ EXPORT_SYMBOL_GPL(kvm_task_switch); static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { - if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && - (sregs->cr4 & X86_CR4_OSXSAVE)) - return -EINVAL; - if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) { /* * When EFER.LME and CR0.PG are set, the processor is in @@ -8736,7 +8802,7 @@ static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) return -EINVAL; } - return 0; + return kvm_valid_cr4(vcpu, sregs->cr4); } static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) @@ -8766,7 +8832,7 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) vcpu->arch.cr2 = sregs->cr2; mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; vcpu->arch.cr3 = sregs->cr3; - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); kvm_set_cr8(vcpu, sregs->cr8); @@ -9293,6 +9359,9 @@ int kvm_arch_hardware_setup(void) kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits; } + if (boot_cpu_has(X86_FEATURE_XSAVES)) + rdmsrl(MSR_IA32_XSS, host_xss); + kvm_init_msr_list(); return 0; } @@ -9346,7 +9415,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) goto fail_free_pio_data; if (irqchip_in_kernel(vcpu->kvm)) { - vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu); + vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu->kvm); r = kvm_create_lapic(vcpu, lapic_timer_advance_ns); if (r < 0) goto fail_mmu_destroy; @@ -9415,7 +9484,13 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) { + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + vcpu->arch.l1tf_flush_l1d = true; + if (pmu->version && unlikely(pmu->event_count)) { + pmu->need_cleanup = true; + kvm_make_request(KVM_REQ_PMU, vcpu); + } kvm_x86_ops->sched_in(vcpu, cpu); } @@ -9427,6 +9502,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages); + INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); atomic_set(&kvm->arch.noncoherent_dma_count, 0); @@ -9455,6 +9531,11 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) return kvm_x86_ops->vm_init(kvm); } +int kvm_arch_post_init_vm(struct kvm *kvm) +{ + return kvm_mmu_post_init_vm(kvm); +} + static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) { vcpu_load(vcpu); @@ -9556,6 +9637,11 @@ int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) } EXPORT_SYMBOL_GPL(x86_set_memory_region); +void kvm_arch_pre_destroy_vm(struct kvm *kvm) +{ + kvm_mmu_pre_destroy_vm(kvm); +} + void kvm_arch_destroy_vm(struct kvm *kvm) { if (current->mm == kvm->mm) { diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index dbf7442a822b..29391af8871d 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -238,8 +238,7 @@ static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) return false; } -static inline unsigned long kvm_register_readl(struct kvm_vcpu *vcpu, - enum kvm_reg reg) +static inline unsigned long kvm_register_readl(struct kvm_vcpu *vcpu, int reg) { unsigned long val = kvm_register_read(vcpu, reg); @@ -247,8 +246,7 @@ static inline unsigned long kvm_register_readl(struct kvm_vcpu *vcpu, } static inline void kvm_register_writel(struct kvm_vcpu *vcpu, - enum kvm_reg reg, - unsigned long val) + int reg, unsigned long val) { if (!is_64_bit_mode(vcpu)) val = (u32)val; @@ -260,6 +258,11 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk) return !(kvm->arch.disabled_quirks & quirk); } +static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu) +{ + return is_smm(vcpu) || kvm_x86_ops->apic_init_signal_blocked(vcpu); +} + void kvm_set_pending_timer(struct kvm_vcpu *vcpu); void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); @@ -366,7 +369,7 @@ static inline bool kvm_pat_valid(u64 data) return (data | ((data & 0x0202020202020202ull) << 1)) == data; } -void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu); -void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu); +void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu); +void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); #endif diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index b7375dc6898f..c126571e5e2e 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -113,8 +113,8 @@ static void delay_mwaitx(unsigned long __loops) __monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0); /* - * AMD, like Intel, supports the EAX hint and EAX=0xf - * means, do not enter any deep C-state and we use it + * AMD, like Intel's MWAIT version, supports the EAX hint and + * EAX=0xf0 means, do not enter any deep C-state and we use it * here in delay() to minimize wakeup latency. */ __mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE); diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h index f98a0c956764..9b41391867dc 100644 --- a/arch/x86/math-emu/fpu_system.h +++ b/arch/x86/math-emu/fpu_system.h @@ -107,6 +107,8 @@ static inline bool seg_writable(struct desc_struct *d) #define FPU_access_ok(y,z) if ( !access_ok(y,z) ) \ math_abort(FPU_info,SIGSEGV) #define FPU_abort math_abort(FPU_info, SIGSEGV) +#define FPU_copy_from_user(to, from, n) \ + do { if (copy_from_user(to, from, n)) FPU_abort; } while (0) #undef FPU_IGNORE_CODE_SEGV #ifdef FPU_IGNORE_CODE_SEGV @@ -122,7 +124,7 @@ static inline bool seg_writable(struct desc_struct *d) #define FPU_code_access_ok(z) FPU_access_ok((void __user *)FPU_EIP,z) #endif -#define FPU_get_user(x,y) get_user((x),(y)) -#define FPU_put_user(x,y) put_user((x),(y)) +#define FPU_get_user(x,y) do { if (get_user((x),(y))) FPU_abort; } while (0) +#define FPU_put_user(x,y) do { if (put_user((x),(y))) FPU_abort; } while (0) #endif diff --git a/arch/x86/math-emu/reg_ld_str.c b/arch/x86/math-emu/reg_ld_str.c index f3779743d15e..fe6246ff9887 100644 --- a/arch/x86/math-emu/reg_ld_str.c +++ b/arch/x86/math-emu/reg_ld_str.c @@ -85,7 +85,7 @@ int FPU_load_extended(long double __user *s, int stnr) RE_ENTRANT_CHECK_OFF; FPU_access_ok(s, 10); - __copy_from_user(sti_ptr, s, 10); + FPU_copy_from_user(sti_ptr, s, 10); RE_ENTRANT_CHECK_ON; return FPU_tagof(sti_ptr); @@ -1126,9 +1126,9 @@ void frstor(fpu_addr_modes addr_modes, u_char __user *data_address) /* Copy all registers in stack order. */ RE_ENTRANT_CHECK_OFF; FPU_access_ok(s, 80); - __copy_from_user(register_base + offset, s, other); + FPU_copy_from_user(register_base + offset, s, other); if (offset) - __copy_from_user(register_base, s + other, offset); + FPU_copy_from_user(register_base, s + other, offset); RE_ENTRANT_CHECK_ON; for (i = 0; i < 8; i++) { diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 84373dc9b341..3b89c201ac26 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -13,7 +13,7 @@ CFLAGS_REMOVE_mem_encrypt_identity.o = -pg endif obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ - pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o + pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o maccess.o # Make sure __phys_addr has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) @@ -23,7 +23,7 @@ CFLAGS_mem_encrypt_identity.o := $(nostackp) CFLAGS_fault.o := -I $(srctree)/$(src)/../include/asm/trace -obj-$(CONFIG_X86_PAT) += pat_rbtree.o +obj-$(CONFIG_X86_PAT) += pat_interval.o obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index 752ad11d6868..d9643647a9ce 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -178,7 +178,9 @@ static __init void setup_cpu_entry_area_ptes(void) #ifdef CONFIG_X86_32 unsigned long start, end; - BUILD_BUG_ON(CPU_ENTRY_AREA_PAGES * PAGE_SIZE < CPU_ENTRY_AREA_MAP_SIZE); + /* The +1 is for the readonly IDT: */ + BUILD_BUG_ON((CPU_ENTRY_AREA_PAGES+1)*PAGE_SIZE != CPU_ENTRY_AREA_MAP_SIZE); + BUILD_BUG_ON(CPU_ENTRY_AREA_TOTAL_SIZE != CPU_ENTRY_AREA_MAP_SIZE); BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK); start = CPU_ENTRY_AREA_BASE; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index a39dcdb5ae34..1ff9c2030b4f 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -626,6 +626,17 @@ static bool memremap_is_setup_data(resource_size_t phys_addr, paddr_next = data->next; len = data->len; + if ((phys_addr > paddr) && (phys_addr < (paddr + len))) { + memunmap(data); + return true; + } + + if (data->type == SETUP_INDIRECT && + ((struct setup_indirect *)data->data)->type != SETUP_INDIRECT) { + paddr = ((struct setup_indirect *)data->data)->addr; + len = ((struct setup_indirect *)data->data)->len; + } + memunmap(data); if ((phys_addr > paddr) && (phys_addr < (paddr + len))) diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 79eb55ce69a9..49d7814b59a9 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -193,8 +193,8 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f) int ret; WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n")); if (f->armed) { - pr_warning("double-arm: addr 0x%08lx, ref %d, old %d\n", - f->addr, f->count, !!f->old_presence); + pr_warn("double-arm: addr 0x%08lx, ref %d, old %d\n", + f->addr, f->count, !!f->old_presence); } ret = clear_page_presence(f, true); WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming at 0x%08lx failed.\n"), @@ -341,8 +341,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) * something external causing them (f.e. using a debugger while * mmio tracing enabled), or erroneous behaviour */ - pr_warning("unexpected debug trap on CPU %d.\n", - smp_processor_id()); + pr_warn("unexpected debug trap on CPU %d.\n", smp_processor_id()); goto out; } diff --git a/arch/x86/mm/maccess.c b/arch/x86/mm/maccess.c new file mode 100644 index 000000000000..f5b85bdc0535 --- /dev/null +++ b/arch/x86/mm/maccess.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/uaccess.h> +#include <linux/kernel.h> + +#ifdef CONFIG_X86_64 +static __always_inline u64 canonical_address(u64 vaddr, u8 vaddr_bits) +{ + return ((s64)vaddr << (64 - vaddr_bits)) >> (64 - vaddr_bits); +} + +static __always_inline bool invalid_probe_range(u64 vaddr) +{ + /* + * Range covering the highest possible canonical userspace address + * as well as non-canonical address range. For the canonical range + * we also need to include the userspace guard page. + */ + return vaddr < TASK_SIZE_MAX + PAGE_SIZE || + canonical_address(vaddr, boot_cpu_data.x86_virt_bits) != vaddr; +} +#else +static __always_inline bool invalid_probe_range(u64 vaddr) +{ + return vaddr < TASK_SIZE_MAX; +} +#endif + +long probe_kernel_read_strict(void *dst, const void *src, size_t size) +{ + if (unlikely(invalid_probe_range((unsigned long)src))) + return -EFAULT; + + return __probe_kernel_read(dst, src, size); +} + +long strncpy_from_unsafe_strict(char *dst, const void *unsafe_addr, long count) +{ + if (unlikely(invalid_probe_range((unsigned long)unsafe_addr))) + return -EFAULT; + + return __strncpy_from_unsafe(dst, unsafe_addr, count); +} diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index b8ef8557d4b3..673de6063345 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -394,7 +394,7 @@ static void enter_uniprocessor(void) } out: if (num_online_cpus() > 1) - pr_warning("multiple CPUs still online, may miss events.\n"); + pr_warn("multiple CPUs still online, may miss events.\n"); } static void leave_uniprocessor(void) @@ -418,8 +418,8 @@ static void leave_uniprocessor(void) static void enter_uniprocessor(void) { if (num_online_cpus() > 1) - pr_warning("multiple CPUs are online, may miss events. " - "Suggest booting with maxcpus=1 kernel argument.\n"); + pr_warn("multiple CPUs are online, may miss events. " + "Suggest booting with maxcpus=1 kernel argument.\n"); } static void leave_uniprocessor(void) diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 4123100e0eaf..99f7a68738f0 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -699,7 +699,7 @@ static int __init dummy_numa_init(void) * x86_numa_init - Initialize NUMA * * Try each configured NUMA initialization method until one succeeds. The - * last fallback is dummy single node config encomapssing whole memory and + * last fallback is dummy single node config encompassing whole memory and * never fails. */ void __init x86_numa_init(void) diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index abffa0be80da..7f1d2034df1e 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -438,7 +438,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) goto no_emu; if (numa_cleanup_meminfo(&ei) < 0) { - pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n"); + pr_warn("NUMA: Warning: constructed meminfo invalid, disabling emulation\n"); goto no_emu; } @@ -449,7 +449,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), phys_size, PAGE_SIZE); if (!phys) { - pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); + pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); goto no_emu; } memblock_reserve(phys, phys_size); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index d9fbd4f69920..2d758e19ef22 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -603,7 +603,7 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type, spin_lock(&memtype_lock); - err = rbt_memtype_check_insert(new, new_type); + err = memtype_check_insert(new, new_type); if (err) { pr_info("x86/PAT: reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n", start, end - 1, @@ -650,7 +650,7 @@ int free_memtype(u64 start, u64 end) } spin_lock(&memtype_lock); - entry = rbt_memtype_erase(start, end); + entry = memtype_erase(start, end); spin_unlock(&memtype_lock); if (IS_ERR(entry)) { @@ -693,7 +693,7 @@ static enum page_cache_mode lookup_memtype(u64 paddr) spin_lock(&memtype_lock); - entry = rbt_memtype_lookup(paddr); + entry = memtype_lookup(paddr); if (entry != NULL) rettype = entry->type; else @@ -1109,7 +1109,7 @@ static struct memtype *memtype_get_idx(loff_t pos) return NULL; spin_lock(&memtype_lock); - ret = rbt_memtype_copy_nth_element(print_entry, pos); + ret = memtype_copy_nth_element(print_entry, pos); spin_unlock(&memtype_lock); if (!ret) { diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h index eeb5caeb089b..79a06684349e 100644 --- a/arch/x86/mm/pat_internal.h +++ b/arch/x86/mm/pat_internal.h @@ -29,20 +29,20 @@ static inline char *cattr_name(enum page_cache_mode pcm) } #ifdef CONFIG_X86_PAT -extern int rbt_memtype_check_insert(struct memtype *new, - enum page_cache_mode *new_type); -extern struct memtype *rbt_memtype_erase(u64 start, u64 end); -extern struct memtype *rbt_memtype_lookup(u64 addr); -extern int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos); +extern int memtype_check_insert(struct memtype *new, + enum page_cache_mode *new_type); +extern struct memtype *memtype_erase(u64 start, u64 end); +extern struct memtype *memtype_lookup(u64 addr); +extern int memtype_copy_nth_element(struct memtype *out, loff_t pos); #else -static inline int rbt_memtype_check_insert(struct memtype *new, - enum page_cache_mode *new_type) +static inline int memtype_check_insert(struct memtype *new, + enum page_cache_mode *new_type) { return 0; } -static inline struct memtype *rbt_memtype_erase(u64 start, u64 end) +static inline struct memtype *memtype_erase(u64 start, u64 end) { return NULL; } -static inline struct memtype *rbt_memtype_lookup(u64 addr) +static inline struct memtype *memtype_lookup(u64 addr) { return NULL; } -static inline int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos) +static inline int memtype_copy_nth_element(struct memtype *out, loff_t pos) { return 0; } #endif diff --git a/arch/x86/mm/pat_interval.c b/arch/x86/mm/pat_interval.c new file mode 100644 index 000000000000..47a1bf30748f --- /dev/null +++ b/arch/x86/mm/pat_interval.c @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Handle caching attributes in page tables (PAT) + * + * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> + * Suresh B Siddha <suresh.b.siddha@intel.com> + * + * Interval tree used to store the PAT memory type reservations. + */ + +#include <linux/seq_file.h> +#include <linux/debugfs.h> +#include <linux/kernel.h> +#include <linux/interval_tree_generic.h> +#include <linux/sched.h> +#include <linux/gfp.h> + +#include <asm/pgtable.h> +#include <asm/pat.h> + +#include "pat_internal.h" + +/* + * The memtype tree keeps track of memory type for specific + * physical memory areas. Without proper tracking, conflicting memory + * types in different mappings can cause CPU cache corruption. + * + * The tree is an interval tree (augmented rbtree) with tree ordered + * on starting address. Tree can contain multiple entries for + * different regions which overlap. All the aliases have the same + * cache attributes of course. + * + * memtype_lock protects the rbtree. + */ +static inline u64 memtype_interval_start(struct memtype *memtype) +{ + return memtype->start; +} + +static inline u64 memtype_interval_end(struct memtype *memtype) +{ + return memtype->end - 1; +} +INTERVAL_TREE_DEFINE(struct memtype, rb, u64, subtree_max_end, + memtype_interval_start, memtype_interval_end, + static, memtype_interval) + +static struct rb_root_cached memtype_rbroot = RB_ROOT_CACHED; + +enum { + MEMTYPE_EXACT_MATCH = 0, + MEMTYPE_END_MATCH = 1 +}; + +static struct memtype *memtype_match(u64 start, u64 end, int match_type) +{ + struct memtype *match; + + match = memtype_interval_iter_first(&memtype_rbroot, start, end); + while (match != NULL && match->start < end) { + if ((match_type == MEMTYPE_EXACT_MATCH) && + (match->start == start) && (match->end == end)) + return match; + + if ((match_type == MEMTYPE_END_MATCH) && + (match->start < start) && (match->end == end)) + return match; + + match = memtype_interval_iter_next(match, start, end); + } + + return NULL; /* Returns NULL if there is no match */ +} + +static int memtype_check_conflict(u64 start, u64 end, + enum page_cache_mode reqtype, + enum page_cache_mode *newtype) +{ + struct memtype *match; + enum page_cache_mode found_type = reqtype; + + match = memtype_interval_iter_first(&memtype_rbroot, start, end); + if (match == NULL) + goto success; + + if (match->type != found_type && newtype == NULL) + goto failure; + + dprintk("Overlap at 0x%Lx-0x%Lx\n", match->start, match->end); + found_type = match->type; + + match = memtype_interval_iter_next(match, start, end); + while (match) { + if (match->type != found_type) + goto failure; + + match = memtype_interval_iter_next(match, start, end); + } +success: + if (newtype) + *newtype = found_type; + + return 0; + +failure: + pr_info("x86/PAT: %s:%d conflicting memory types %Lx-%Lx %s<->%s\n", + current->comm, current->pid, start, end, + cattr_name(found_type), cattr_name(match->type)); + return -EBUSY; +} + +int memtype_check_insert(struct memtype *new, + enum page_cache_mode *ret_type) +{ + int err = 0; + + err = memtype_check_conflict(new->start, new->end, new->type, ret_type); + if (err) + return err; + + if (ret_type) + new->type = *ret_type; + + memtype_interval_insert(new, &memtype_rbroot); + return 0; +} + +struct memtype *memtype_erase(u64 start, u64 end) +{ + struct memtype *data; + + /* + * Since the memtype_rbroot tree allows overlapping ranges, + * memtype_erase() checks with EXACT_MATCH first, i.e. free + * a whole node for the munmap case. If no such entry is found, + * it then checks with END_MATCH, i.e. shrink the size of a node + * from the end for the mremap case. + */ + data = memtype_match(start, end, MEMTYPE_EXACT_MATCH); + if (!data) { + data = memtype_match(start, end, MEMTYPE_END_MATCH); + if (!data) + return ERR_PTR(-EINVAL); + } + + if (data->start == start) { + /* munmap: erase this node */ + memtype_interval_remove(data, &memtype_rbroot); + } else { + /* mremap: update the end value of this node */ + memtype_interval_remove(data, &memtype_rbroot); + data->end = start; + memtype_interval_insert(data, &memtype_rbroot); + return NULL; + } + + return data; +} + +struct memtype *memtype_lookup(u64 addr) +{ + return memtype_interval_iter_first(&memtype_rbroot, addr, + addr + PAGE_SIZE); +} + +#if defined(CONFIG_DEBUG_FS) +int memtype_copy_nth_element(struct memtype *out, loff_t pos) +{ + struct memtype *match; + int i = 1; + + match = memtype_interval_iter_first(&memtype_rbroot, 0, ULONG_MAX); + while (match && pos != i) { + match = memtype_interval_iter_next(match, 0, ULONG_MAX); + i++; + } + + if (match) { /* pos == i */ + *out = *match; + return 0; + } else { + return 1; + } +} +#endif diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c deleted file mode 100644 index 65ebe4b88f7c..000000000000 --- a/arch/x86/mm/pat_rbtree.c +++ /dev/null @@ -1,268 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Handle caching attributes in page tables (PAT) - * - * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> - * Suresh B Siddha <suresh.b.siddha@intel.com> - * - * Interval tree (augmented rbtree) used to store the PAT memory type - * reservations. - */ - -#include <linux/seq_file.h> -#include <linux/debugfs.h> -#include <linux/kernel.h> -#include <linux/rbtree_augmented.h> -#include <linux/sched.h> -#include <linux/gfp.h> - -#include <asm/pgtable.h> -#include <asm/pat.h> - -#include "pat_internal.h" - -/* - * The memtype tree keeps track of memory type for specific - * physical memory areas. Without proper tracking, conflicting memory - * types in different mappings can cause CPU cache corruption. - * - * The tree is an interval tree (augmented rbtree) with tree ordered - * on starting address. Tree can contain multiple entries for - * different regions which overlap. All the aliases have the same - * cache attributes of course. - * - * memtype_lock protects the rbtree. - */ - -static struct rb_root memtype_rbroot = RB_ROOT; - -static int is_node_overlap(struct memtype *node, u64 start, u64 end) -{ - if (node->start >= end || node->end <= start) - return 0; - - return 1; -} - -static u64 get_subtree_max_end(struct rb_node *node) -{ - u64 ret = 0; - if (node) { - struct memtype *data = rb_entry(node, struct memtype, rb); - ret = data->subtree_max_end; - } - return ret; -} - -#define NODE_END(node) ((node)->end) - -RB_DECLARE_CALLBACKS_MAX(static, memtype_rb_augment_cb, - struct memtype, rb, u64, subtree_max_end, NODE_END) - -/* Find the first (lowest start addr) overlapping range from rb tree */ -static struct memtype *memtype_rb_lowest_match(struct rb_root *root, - u64 start, u64 end) -{ - struct rb_node *node = root->rb_node; - struct memtype *last_lower = NULL; - - while (node) { - struct memtype *data = rb_entry(node, struct memtype, rb); - - if (get_subtree_max_end(node->rb_left) > start) { - /* Lowest overlap if any must be on left side */ - node = node->rb_left; - } else if (is_node_overlap(data, start, end)) { - last_lower = data; - break; - } else if (start >= data->start) { - /* Lowest overlap if any must be on right side */ - node = node->rb_right; - } else { - break; - } - } - return last_lower; /* Returns NULL if there is no overlap */ -} - -enum { - MEMTYPE_EXACT_MATCH = 0, - MEMTYPE_END_MATCH = 1 -}; - -static struct memtype *memtype_rb_match(struct rb_root *root, - u64 start, u64 end, int match_type) -{ - struct memtype *match; - - match = memtype_rb_lowest_match(root, start, end); - while (match != NULL && match->start < end) { - struct rb_node *node; - - if ((match_type == MEMTYPE_EXACT_MATCH) && - (match->start == start) && (match->end == end)) - return match; - - if ((match_type == MEMTYPE_END_MATCH) && - (match->start < start) && (match->end == end)) - return match; - - node = rb_next(&match->rb); - if (node) - match = rb_entry(node, struct memtype, rb); - else - match = NULL; - } - - return NULL; /* Returns NULL if there is no match */ -} - -static int memtype_rb_check_conflict(struct rb_root *root, - u64 start, u64 end, - enum page_cache_mode reqtype, - enum page_cache_mode *newtype) -{ - struct rb_node *node; - struct memtype *match; - enum page_cache_mode found_type = reqtype; - - match = memtype_rb_lowest_match(&memtype_rbroot, start, end); - if (match == NULL) - goto success; - - if (match->type != found_type && newtype == NULL) - goto failure; - - dprintk("Overlap at 0x%Lx-0x%Lx\n", match->start, match->end); - found_type = match->type; - - node = rb_next(&match->rb); - while (node) { - match = rb_entry(node, struct memtype, rb); - - if (match->start >= end) /* Checked all possible matches */ - goto success; - - if (is_node_overlap(match, start, end) && - match->type != found_type) { - goto failure; - } - - node = rb_next(&match->rb); - } -success: - if (newtype) - *newtype = found_type; - - return 0; - -failure: - pr_info("x86/PAT: %s:%d conflicting memory types %Lx-%Lx %s<->%s\n", - current->comm, current->pid, start, end, - cattr_name(found_type), cattr_name(match->type)); - return -EBUSY; -} - -static void memtype_rb_insert(struct rb_root *root, struct memtype *newdata) -{ - struct rb_node **node = &(root->rb_node); - struct rb_node *parent = NULL; - - while (*node) { - struct memtype *data = rb_entry(*node, struct memtype, rb); - - parent = *node; - if (data->subtree_max_end < newdata->end) - data->subtree_max_end = newdata->end; - if (newdata->start <= data->start) - node = &((*node)->rb_left); - else if (newdata->start > data->start) - node = &((*node)->rb_right); - } - - newdata->subtree_max_end = newdata->end; - rb_link_node(&newdata->rb, parent, node); - rb_insert_augmented(&newdata->rb, root, &memtype_rb_augment_cb); -} - -int rbt_memtype_check_insert(struct memtype *new, - enum page_cache_mode *ret_type) -{ - int err = 0; - - err = memtype_rb_check_conflict(&memtype_rbroot, new->start, new->end, - new->type, ret_type); - - if (!err) { - if (ret_type) - new->type = *ret_type; - - new->subtree_max_end = new->end; - memtype_rb_insert(&memtype_rbroot, new); - } - return err; -} - -struct memtype *rbt_memtype_erase(u64 start, u64 end) -{ - struct memtype *data; - - /* - * Since the memtype_rbroot tree allows overlapping ranges, - * rbt_memtype_erase() checks with EXACT_MATCH first, i.e. free - * a whole node for the munmap case. If no such entry is found, - * it then checks with END_MATCH, i.e. shrink the size of a node - * from the end for the mremap case. - */ - data = memtype_rb_match(&memtype_rbroot, start, end, - MEMTYPE_EXACT_MATCH); - if (!data) { - data = memtype_rb_match(&memtype_rbroot, start, end, - MEMTYPE_END_MATCH); - if (!data) - return ERR_PTR(-EINVAL); - } - - if (data->start == start) { - /* munmap: erase this node */ - rb_erase_augmented(&data->rb, &memtype_rbroot, - &memtype_rb_augment_cb); - } else { - /* mremap: update the end value of this node */ - rb_erase_augmented(&data->rb, &memtype_rbroot, - &memtype_rb_augment_cb); - data->end = start; - data->subtree_max_end = data->end; - memtype_rb_insert(&memtype_rbroot, data); - return NULL; - } - - return data; -} - -struct memtype *rbt_memtype_lookup(u64 addr) -{ - return memtype_rb_lowest_match(&memtype_rbroot, addr, addr + PAGE_SIZE); -} - -#if defined(CONFIG_DEBUG_FS) -int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos) -{ - struct rb_node *node; - int i = 1; - - node = rb_first(&memtype_rbroot); - while (node && pos != i) { - node = rb_next(node); - i++; - } - - if (node) { /* pos == i */ - struct memtype *this = rb_entry(node, struct memtype, rb); - *out = *this; - return 0; - } else { - return 1; - } -} -#endif diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 3e4b9035bb9a..7bd2c3a52297 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -643,8 +643,8 @@ void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) fixmaps_set++; } -void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, - pgprot_t flags) +void native_set_fixmap(unsigned /* enum fixed_addresses */ idx, + phys_addr_t phys, pgprot_t flags) { /* Sanitize 'prot' against any unsupported bits: */ pgprot_val(flags) &= __default_kernel_pte_mask; diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c index a8bd952e136d..92153d054d6c 100644 --- a/arch/x86/mm/testmmiotrace.c +++ b/arch/x86/mm/testmmiotrace.c @@ -127,9 +127,9 @@ static int __init init(void) return -ENXIO; } - pr_warning("WARNING: mapping %lu kB @ 0x%08lx in PCI address space, " - "and writing 16 kB of rubbish in there.\n", - size >> 10, mmio_address); + pr_warn("WARNING: mapping %lu kB @ 0x%08lx in PCI address space, " + "and writing 16 kB of rubbish in there.\n", + size >> 10, mmio_address); do_test(size); do_test_bulk_ioremapping(); pr_info("All done.\n"); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 991549a1c5f3..b8be18427277 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -9,9 +9,11 @@ #include <linux/filter.h> #include <linux/if_vlan.h> #include <linux/bpf.h> - +#include <linux/memory.h> +#include <asm/extable.h> #include <asm/set_memory.h> #include <asm/nospec-branch.h> +#include <asm/text-patching.h> static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) { @@ -96,6 +98,7 @@ static int bpf_size_to_x86_bytes(int bpf_size) /* Pick a register outside of BPF range for JIT internal work */ #define AUX_REG (MAX_BPF_JIT_REG + 1) +#define X86_REG_R9 (MAX_BPF_JIT_REG + 2) /* * The following table maps BPF registers to x86-64 registers. @@ -104,8 +107,8 @@ static int bpf_size_to_x86_bytes(int bpf_size) * register in load/store instructions, it always needs an * extra byte of encoding and is callee saved. * - * Also x86-64 register R9 is unused. x86-64 register R10 is - * used for blinding (if enabled). + * x86-64 register R9 is not used by BPF programs, but can be used by BPF + * trampoline. x86-64 register R10 is used for blinding (if enabled). */ static const int reg2hex[] = { [BPF_REG_0] = 0, /* RAX */ @@ -121,6 +124,20 @@ static const int reg2hex[] = { [BPF_REG_FP] = 5, /* RBP readonly */ [BPF_REG_AX] = 2, /* R10 temp register */ [AUX_REG] = 3, /* R11 temp register */ + [X86_REG_R9] = 1, /* R9 register, 6th function argument */ +}; + +static const int reg2pt_regs[] = { + [BPF_REG_0] = offsetof(struct pt_regs, ax), + [BPF_REG_1] = offsetof(struct pt_regs, di), + [BPF_REG_2] = offsetof(struct pt_regs, si), + [BPF_REG_3] = offsetof(struct pt_regs, dx), + [BPF_REG_4] = offsetof(struct pt_regs, cx), + [BPF_REG_5] = offsetof(struct pt_regs, r8), + [BPF_REG_6] = offsetof(struct pt_regs, bx), + [BPF_REG_7] = offsetof(struct pt_regs, r13), + [BPF_REG_8] = offsetof(struct pt_regs, r14), + [BPF_REG_9] = offsetof(struct pt_regs, r15), }; /* @@ -135,6 +152,7 @@ static bool is_ereg(u32 reg) BIT(BPF_REG_7) | BIT(BPF_REG_8) | BIT(BPF_REG_9) | + BIT(X86_REG_R9) | BIT(BPF_REG_AX)); } @@ -186,7 +204,10 @@ struct jit_context { #define BPF_MAX_INSN_SIZE 128 #define BPF_INSN_SAFETY 64 -#define PROLOGUE_SIZE 20 +/* Number of bytes emit_patch() needs to generate instructions */ +#define X86_PATCH_SIZE 5 + +#define PROLOGUE_SIZE 25 /* * Emit x86-64 prologue code for BPF program and check its size. @@ -195,8 +216,13 @@ struct jit_context { static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) { u8 *prog = *pprog; - int cnt = 0; + int cnt = X86_PATCH_SIZE; + /* BPF trampoline can be made to work without these nops, + * but let's waste 5 bytes for now and optimize later + */ + memcpy(prog, ideal_nops[NOP_ATOMIC5], cnt); + prog += cnt; EMIT1(0x55); /* push rbp */ EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */ /* sub rsp, rounded_stack_depth */ @@ -213,6 +239,89 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) *pprog = prog; } +static int emit_patch(u8 **pprog, void *func, void *ip, u8 opcode) +{ + u8 *prog = *pprog; + int cnt = 0; + s64 offset; + + offset = func - (ip + X86_PATCH_SIZE); + if (!is_simm32(offset)) { + pr_err("Target call %p is out of range\n", func); + return -ERANGE; + } + EMIT1_off32(opcode, offset); + *pprog = prog; + return 0; +} + +static int emit_call(u8 **pprog, void *func, void *ip) +{ + return emit_patch(pprog, func, ip, 0xE8); +} + +static int emit_jump(u8 **pprog, void *func, void *ip) +{ + return emit_patch(pprog, func, ip, 0xE9); +} + +static int __bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, + void *old_addr, void *new_addr, + const bool text_live) +{ + const u8 *nop_insn = ideal_nops[NOP_ATOMIC5]; + u8 old_insn[X86_PATCH_SIZE]; + u8 new_insn[X86_PATCH_SIZE]; + u8 *prog; + int ret; + + memcpy(old_insn, nop_insn, X86_PATCH_SIZE); + if (old_addr) { + prog = old_insn; + ret = t == BPF_MOD_CALL ? + emit_call(&prog, old_addr, ip) : + emit_jump(&prog, old_addr, ip); + if (ret) + return ret; + } + + memcpy(new_insn, nop_insn, X86_PATCH_SIZE); + if (new_addr) { + prog = new_insn; + ret = t == BPF_MOD_CALL ? + emit_call(&prog, new_addr, ip) : + emit_jump(&prog, new_addr, ip); + if (ret) + return ret; + } + + ret = -EBUSY; + mutex_lock(&text_mutex); + if (memcmp(ip, old_insn, X86_PATCH_SIZE)) + goto out; + if (memcmp(ip, new_insn, X86_PATCH_SIZE)) { + if (text_live) + text_poke_bp(ip, new_insn, X86_PATCH_SIZE, NULL); + else + memcpy(ip, new_insn, X86_PATCH_SIZE); + } + ret = 0; +out: + mutex_unlock(&text_mutex); + return ret; +} + +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, + void *old_addr, void *new_addr) +{ + if (!is_kernel_text((long)ip) && + !is_bpf_text_address((long)ip)) + /* BPF poking in modules is not supported */ + return -EINVAL; + + return __bpf_arch_text_poke(ip, t, old_addr, new_addr, true); +} + /* * Generate the following code: * @@ -227,7 +336,7 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf) * goto *(prog->bpf_func + prologue_size); * out: */ -static void emit_bpf_tail_call(u8 **pprog) +static void emit_bpf_tail_call_indirect(u8 **pprog) { u8 *prog = *pprog; int label1, label2, label3; @@ -294,6 +403,68 @@ static void emit_bpf_tail_call(u8 **pprog) *pprog = prog; } +static void emit_bpf_tail_call_direct(struct bpf_jit_poke_descriptor *poke, + u8 **pprog, int addr, u8 *image) +{ + u8 *prog = *pprog; + int cnt = 0; + + /* + * if (tail_call_cnt > MAX_TAIL_CALL_CNT) + * goto out; + */ + EMIT2_off32(0x8B, 0x85, -36 - MAX_BPF_STACK); /* mov eax, dword ptr [rbp - 548] */ + EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ + EMIT2(X86_JA, 14); /* ja out */ + EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ + EMIT2_off32(0x89, 0x85, -36 - MAX_BPF_STACK); /* mov dword ptr [rbp -548], eax */ + + poke->ip = image + (addr - X86_PATCH_SIZE); + poke->adj_off = PROLOGUE_SIZE; + + memcpy(prog, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE); + prog += X86_PATCH_SIZE; + /* out: */ + + *pprog = prog; +} + +static void bpf_tail_call_direct_fixup(struct bpf_prog *prog) +{ + struct bpf_jit_poke_descriptor *poke; + struct bpf_array *array; + struct bpf_prog *target; + int i, ret; + + for (i = 0; i < prog->aux->size_poke_tab; i++) { + poke = &prog->aux->poke_tab[i]; + WARN_ON_ONCE(READ_ONCE(poke->ip_stable)); + + if (poke->reason != BPF_POKE_REASON_TAIL_CALL) + continue; + + array = container_of(poke->tail_call.map, struct bpf_array, map); + mutex_lock(&array->aux->poke_mutex); + target = array->ptrs[poke->tail_call.key]; + if (target) { + /* Plain memcpy is used when image is not live yet + * and still not locked as read-only. Once poke + * location is active (poke->ip_stable), any parallel + * bpf_arch_text_poke() might occur still on the + * read-write image until we finally locked it as + * read-only. Both modifications on the given image + * are under text_mutex to avoid interference. + */ + ret = __bpf_arch_text_poke(poke->ip, BPF_MOD_JUMP, NULL, + (u8 *)target->bpf_func + + poke->adj_off, false); + BUG_ON(ret < 0); + } + WRITE_ONCE(poke->ip_stable, true); + mutex_unlock(&array->aux->poke_mutex); + } +} + static void emit_mov_imm32(u8 **pprog, bool sign_propagate, u32 dst_reg, const u32 imm32) { @@ -377,6 +548,96 @@ static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg) *pprog = prog; } +/* LDX: dst_reg = *(u8*)(src_reg + off) */ +static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) +{ + u8 *prog = *pprog; + int cnt = 0; + + switch (size) { + case BPF_B: + /* Emit 'movzx rax, byte ptr [rax + off]' */ + EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB6); + break; + case BPF_H: + /* Emit 'movzx rax, word ptr [rax + off]' */ + EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB7); + break; + case BPF_W: + /* Emit 'mov eax, dword ptr [rax+0x14]' */ + if (is_ereg(dst_reg) || is_ereg(src_reg)) + EMIT2(add_2mod(0x40, src_reg, dst_reg), 0x8B); + else + EMIT1(0x8B); + break; + case BPF_DW: + /* Emit 'mov rax, qword ptr [rax+0x14]' */ + EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B); + break; + } + /* + * If insn->off == 0 we can save one extra byte, but + * special case of x86 R13 which always needs an offset + * is not worth the hassle + */ + if (is_imm8(off)) + EMIT2(add_2reg(0x40, src_reg, dst_reg), off); + else + EMIT1_off32(add_2reg(0x80, src_reg, dst_reg), off); + *pprog = prog; +} + +/* STX: *(u8*)(dst_reg + off) = src_reg */ +static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) +{ + u8 *prog = *pprog; + int cnt = 0; + + switch (size) { + case BPF_B: + /* Emit 'mov byte ptr [rax + off], al' */ + if (is_ereg(dst_reg) || is_ereg(src_reg) || + /* We have to add extra byte for x86 SIL, DIL regs */ + src_reg == BPF_REG_1 || src_reg == BPF_REG_2) + EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x88); + else + EMIT1(0x88); + break; + case BPF_H: + if (is_ereg(dst_reg) || is_ereg(src_reg)) + EMIT3(0x66, add_2mod(0x40, dst_reg, src_reg), 0x89); + else + EMIT2(0x66, 0x89); + break; + case BPF_W: + if (is_ereg(dst_reg) || is_ereg(src_reg)) + EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x89); + else + EMIT1(0x89); + break; + case BPF_DW: + EMIT2(add_2mod(0x48, dst_reg, src_reg), 0x89); + break; + } + if (is_imm8(off)) + EMIT2(add_2reg(0x40, dst_reg, src_reg), off); + else + EMIT1_off32(add_2reg(0x80, dst_reg, src_reg), off); + *pprog = prog; +} + +static bool ex_handler_bpf(const struct exception_table_entry *x, + struct pt_regs *regs, int trapnr, + unsigned long error_code, unsigned long fault_addr) +{ + u32 reg = x->fixup >> 8; + + /* jump over faulting load and clear dest register */ + *(unsigned long *)((void *)regs + reg) = 0; + regs->ip += x->fixup & 0xff; + return true; +} + static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int oldproglen, struct jit_context *ctx) { @@ -384,7 +645,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int insn_cnt = bpf_prog->len; bool seen_exit = false; u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; - int i, cnt = 0; + int i, cnt = 0, excnt = 0; int proglen = 0; u8 *prog = temp; @@ -747,64 +1008,64 @@ st: if (is_imm8(insn->off)) /* STX: *(u8*)(dst_reg + off) = src_reg */ case BPF_STX | BPF_MEM | BPF_B: - /* Emit 'mov byte ptr [rax + off], al' */ - if (is_ereg(dst_reg) || is_ereg(src_reg) || - /* We have to add extra byte for x86 SIL, DIL regs */ - src_reg == BPF_REG_1 || src_reg == BPF_REG_2) - EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x88); - else - EMIT1(0x88); - goto stx; case BPF_STX | BPF_MEM | BPF_H: - if (is_ereg(dst_reg) || is_ereg(src_reg)) - EMIT3(0x66, add_2mod(0x40, dst_reg, src_reg), 0x89); - else - EMIT2(0x66, 0x89); - goto stx; case BPF_STX | BPF_MEM | BPF_W: - if (is_ereg(dst_reg) || is_ereg(src_reg)) - EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x89); - else - EMIT1(0x89); - goto stx; case BPF_STX | BPF_MEM | BPF_DW: - EMIT2(add_2mod(0x48, dst_reg, src_reg), 0x89); -stx: if (is_imm8(insn->off)) - EMIT2(add_2reg(0x40, dst_reg, src_reg), insn->off); - else - EMIT1_off32(add_2reg(0x80, dst_reg, src_reg), - insn->off); + emit_stx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); break; /* LDX: dst_reg = *(u8*)(src_reg + off) */ case BPF_LDX | BPF_MEM | BPF_B: - /* Emit 'movzx rax, byte ptr [rax + off]' */ - EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB6); - goto ldx; + case BPF_LDX | BPF_PROBE_MEM | BPF_B: case BPF_LDX | BPF_MEM | BPF_H: - /* Emit 'movzx rax, word ptr [rax + off]' */ - EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB7); - goto ldx; + case BPF_LDX | BPF_PROBE_MEM | BPF_H: case BPF_LDX | BPF_MEM | BPF_W: - /* Emit 'mov eax, dword ptr [rax+0x14]' */ - if (is_ereg(dst_reg) || is_ereg(src_reg)) - EMIT2(add_2mod(0x40, src_reg, dst_reg), 0x8B); - else - EMIT1(0x8B); - goto ldx; + case BPF_LDX | BPF_PROBE_MEM | BPF_W: case BPF_LDX | BPF_MEM | BPF_DW: - /* Emit 'mov rax, qword ptr [rax+0x14]' */ - EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B); -ldx: /* - * If insn->off == 0 we can save one extra byte, but - * special case of x86 R13 which always needs an offset - * is not worth the hassle - */ - if (is_imm8(insn->off)) - EMIT2(add_2reg(0x40, src_reg, dst_reg), insn->off); - else - EMIT1_off32(add_2reg(0x80, src_reg, dst_reg), - insn->off); + case BPF_LDX | BPF_PROBE_MEM | BPF_DW: + emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off); + if (BPF_MODE(insn->code) == BPF_PROBE_MEM) { + struct exception_table_entry *ex; + u8 *_insn = image + proglen; + s64 delta; + + if (!bpf_prog->aux->extable) + break; + + if (excnt >= bpf_prog->aux->num_exentries) { + pr_err("ex gen bug\n"); + return -EFAULT; + } + ex = &bpf_prog->aux->extable[excnt++]; + + delta = _insn - (u8 *)&ex->insn; + if (!is_simm32(delta)) { + pr_err("extable->insn doesn't fit into 32-bit\n"); + return -EFAULT; + } + ex->insn = delta; + + delta = (u8 *)ex_handler_bpf - (u8 *)&ex->handler; + if (!is_simm32(delta)) { + pr_err("extable->handler doesn't fit into 32-bit\n"); + return -EFAULT; + } + ex->handler = delta; + + if (dst_reg > BPF_REG_9) { + pr_err("verifier error\n"); + return -EFAULT; + } + /* + * Compute size of x86 insn and its target dest x86 register. + * ex_handler_bpf() will use lower 8 bits to adjust + * pt_regs->ip to jump over this x86 instruction + * and upper bits to figure out which pt_regs to zero out. + * End result: x86 insn "mov rbx, qword ptr [rax+0x14]" + * of 4 bytes will be ignored and rbx will be zero inited. + */ + ex->fixup = (prog - temp) | (reg2pt_regs[dst_reg] << 8); + } break; /* STX XADD: lock *(u32*)(dst_reg + off) += src_reg */ @@ -827,17 +1088,16 @@ xadd: if (is_imm8(insn->off)) /* call */ case BPF_JMP | BPF_CALL: func = (u8 *) __bpf_call_base + imm32; - jmp_offset = func - (image + addrs[i]); - if (!imm32 || !is_simm32(jmp_offset)) { - pr_err("unsupported BPF func %d addr %p image %p\n", - imm32, func, image); + if (!imm32 || emit_call(&prog, func, image + addrs[i - 1])) return -EINVAL; - } - EMIT1_off32(0xE8, jmp_offset); break; case BPF_JMP | BPF_TAIL_CALL: - emit_bpf_tail_call(&prog); + if (imm32) + emit_bpf_tail_call_direct(&bpf_prog->aux->poke_tab[imm32 - 1], + &prog, addrs[i], image); + else + emit_bpf_tail_call_indirect(&prog); break; /* cond jump */ @@ -909,6 +1169,16 @@ xadd: if (is_imm8(insn->off)) case BPF_JMP32 | BPF_JSLT | BPF_K: case BPF_JMP32 | BPF_JSGE | BPF_K: case BPF_JMP32 | BPF_JSLE | BPF_K: + /* test dst_reg, dst_reg to save one extra byte */ + if (imm32 == 0) { + if (BPF_CLASS(insn->code) == BPF_JMP) + EMIT1(add_2mod(0x48, dst_reg, dst_reg)); + else if (is_ereg(dst_reg)) + EMIT1(add_2mod(0x40, dst_reg, dst_reg)); + EMIT2(0x85, add_2reg(0xC0, dst_reg, dst_reg)); + goto emit_cond_jmp; + } + /* cmp dst_reg, imm8/32 */ if (BPF_CLASS(insn->code) == BPF_JMP) EMIT1(add_1mod(0x48, dst_reg)); @@ -1048,9 +1318,218 @@ emit_jmp: addrs[i] = proglen; prog = temp; } + + if (image && excnt != bpf_prog->aux->num_exentries) { + pr_err("extable is not populated\n"); + return -EFAULT; + } return proglen; } +static void save_regs(struct btf_func_model *m, u8 **prog, int nr_args, + int stack_size) +{ + int i; + /* Store function arguments to stack. + * For a function that accepts two pointers the sequence will be: + * mov QWORD PTR [rbp-0x10],rdi + * mov QWORD PTR [rbp-0x8],rsi + */ + for (i = 0; i < min(nr_args, 6); i++) + emit_stx(prog, bytes_to_bpf_size(m->arg_size[i]), + BPF_REG_FP, + i == 5 ? X86_REG_R9 : BPF_REG_1 + i, + -(stack_size - i * 8)); +} + +static void restore_regs(struct btf_func_model *m, u8 **prog, int nr_args, + int stack_size) +{ + int i; + + /* Restore function arguments from stack. + * For a function that accepts two pointers the sequence will be: + * EMIT4(0x48, 0x8B, 0x7D, 0xF0); mov rdi,QWORD PTR [rbp-0x10] + * EMIT4(0x48, 0x8B, 0x75, 0xF8); mov rsi,QWORD PTR [rbp-0x8] + */ + for (i = 0; i < min(nr_args, 6); i++) + emit_ldx(prog, bytes_to_bpf_size(m->arg_size[i]), + i == 5 ? X86_REG_R9 : BPF_REG_1 + i, + BPF_REG_FP, + -(stack_size - i * 8)); +} + +static int invoke_bpf(struct btf_func_model *m, u8 **pprog, + struct bpf_prog **progs, int prog_cnt, int stack_size) +{ + u8 *prog = *pprog; + int cnt = 0, i; + + for (i = 0; i < prog_cnt; i++) { + if (emit_call(&prog, __bpf_prog_enter, prog)) + return -EINVAL; + /* remember prog start time returned by __bpf_prog_enter */ + emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); + + /* arg1: lea rdi, [rbp - stack_size] */ + EMIT4(0x48, 0x8D, 0x7D, -stack_size); + /* arg2: progs[i]->insnsi for interpreter */ + if (!progs[i]->jited) + emit_mov_imm64(&prog, BPF_REG_2, + (long) progs[i]->insnsi >> 32, + (u32) (long) progs[i]->insnsi); + /* call JITed bpf program or interpreter */ + if (emit_call(&prog, progs[i]->bpf_func, prog)) + return -EINVAL; + + /* arg1: mov rdi, progs[i] */ + emit_mov_imm64(&prog, BPF_REG_1, (long) progs[i] >> 32, + (u32) (long) progs[i]); + /* arg2: mov rsi, rbx <- start time in nsec */ + emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); + if (emit_call(&prog, __bpf_prog_exit, prog)) + return -EINVAL; + } + *pprog = prog; + return 0; +} + +/* Example: + * __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev); + * its 'struct btf_func_model' will be nr_args=2 + * The assembly code when eth_type_trans is executing after trampoline: + * + * push rbp + * mov rbp, rsp + * sub rsp, 16 // space for skb and dev + * push rbx // temp regs to pass start time + * mov qword ptr [rbp - 16], rdi // save skb pointer to stack + * mov qword ptr [rbp - 8], rsi // save dev pointer to stack + * call __bpf_prog_enter // rcu_read_lock and preempt_disable + * mov rbx, rax // remember start time in bpf stats are enabled + * lea rdi, [rbp - 16] // R1==ctx of bpf prog + * call addr_of_jited_FENTRY_prog + * movabsq rdi, 64bit_addr_of_struct_bpf_prog // unused if bpf stats are off + * mov rsi, rbx // prog start time + * call __bpf_prog_exit // rcu_read_unlock, preempt_enable and stats math + * mov rdi, qword ptr [rbp - 16] // restore skb pointer from stack + * mov rsi, qword ptr [rbp - 8] // restore dev pointer from stack + * pop rbx + * leave + * ret + * + * eth_type_trans has 5 byte nop at the beginning. These 5 bytes will be + * replaced with 'call generated_bpf_trampoline'. When it returns + * eth_type_trans will continue executing with original skb and dev pointers. + * + * The assembly code when eth_type_trans is called from trampoline: + * + * push rbp + * mov rbp, rsp + * sub rsp, 24 // space for skb, dev, return value + * push rbx // temp regs to pass start time + * mov qword ptr [rbp - 24], rdi // save skb pointer to stack + * mov qword ptr [rbp - 16], rsi // save dev pointer to stack + * call __bpf_prog_enter // rcu_read_lock and preempt_disable + * mov rbx, rax // remember start time if bpf stats are enabled + * lea rdi, [rbp - 24] // R1==ctx of bpf prog + * call addr_of_jited_FENTRY_prog // bpf prog can access skb and dev + * movabsq rdi, 64bit_addr_of_struct_bpf_prog // unused if bpf stats are off + * mov rsi, rbx // prog start time + * call __bpf_prog_exit // rcu_read_unlock, preempt_enable and stats math + * mov rdi, qword ptr [rbp - 24] // restore skb pointer from stack + * mov rsi, qword ptr [rbp - 16] // restore dev pointer from stack + * call eth_type_trans+5 // execute body of eth_type_trans + * mov qword ptr [rbp - 8], rax // save return value + * call __bpf_prog_enter // rcu_read_lock and preempt_disable + * mov rbx, rax // remember start time in bpf stats are enabled + * lea rdi, [rbp - 24] // R1==ctx of bpf prog + * call addr_of_jited_FEXIT_prog // bpf prog can access skb, dev, return value + * movabsq rdi, 64bit_addr_of_struct_bpf_prog // unused if bpf stats are off + * mov rsi, rbx // prog start time + * call __bpf_prog_exit // rcu_read_unlock, preempt_enable and stats math + * mov rax, qword ptr [rbp - 8] // restore eth_type_trans's return value + * pop rbx + * leave + * add rsp, 8 // skip eth_type_trans's frame + * ret // return to its caller + */ +int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags, + struct bpf_prog **fentry_progs, int fentry_cnt, + struct bpf_prog **fexit_progs, int fexit_cnt, + void *orig_call) +{ + int cnt = 0, nr_args = m->nr_args; + int stack_size = nr_args * 8; + u8 *prog; + + /* x86-64 supports up to 6 arguments. 7+ can be added in the future */ + if (nr_args > 6) + return -ENOTSUPP; + + if ((flags & BPF_TRAMP_F_RESTORE_REGS) && + (flags & BPF_TRAMP_F_SKIP_FRAME)) + return -EINVAL; + + if (flags & BPF_TRAMP_F_CALL_ORIG) + stack_size += 8; /* room for return value of orig_call */ + + if (flags & BPF_TRAMP_F_SKIP_FRAME) + /* skip patched call instruction and point orig_call to actual + * body of the kernel function. + */ + orig_call += X86_PATCH_SIZE; + + prog = image; + + EMIT1(0x55); /* push rbp */ + EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */ + EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */ + EMIT1(0x53); /* push rbx */ + + save_regs(m, &prog, nr_args, stack_size); + + if (fentry_cnt) + if (invoke_bpf(m, &prog, fentry_progs, fentry_cnt, stack_size)) + return -EINVAL; + + if (flags & BPF_TRAMP_F_CALL_ORIG) { + if (fentry_cnt) + restore_regs(m, &prog, nr_args, stack_size); + + /* call original function */ + if (emit_call(&prog, orig_call, prog)) + return -EINVAL; + /* remember return value in a stack for bpf prog to access */ + emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); + } + + if (fexit_cnt) + if (invoke_bpf(m, &prog, fexit_progs, fexit_cnt, stack_size)) + return -EINVAL; + + if (flags & BPF_TRAMP_F_RESTORE_REGS) + restore_regs(m, &prog, nr_args, stack_size); + + if (flags & BPF_TRAMP_F_CALL_ORIG) + /* restore original return value back into RAX */ + emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); + + EMIT1(0x5B); /* pop rbx */ + EMIT1(0xC9); /* leave */ + if (flags & BPF_TRAMP_F_SKIP_FRAME) + /* skip our return address and return to parent */ + EMIT4(0x48, 0x83, 0xC4, 8); /* add rsp, 8 */ + EMIT1(0xC3); /* ret */ + /* One half of the page has active running trampoline. + * Another half is an area for next trampoline. + * Make sure the trampoline generation logic doesn't overflow. + */ + if (WARN_ON_ONCE(prog - (u8 *)image > PAGE_SIZE / 2 - BPF_INSN_SAFETY)) + return -EFAULT; + return 0; +} + struct x64_jit_data { struct bpf_binary_header *header; int *addrs; @@ -1148,12 +1627,24 @@ out_image: break; } if (proglen == oldproglen) { - header = bpf_jit_binary_alloc(proglen, &image, - 1, jit_fill_hole); + /* + * The number of entries in extable is the number of BPF_LDX + * insns that access kernel memory via "pointer to BTF type". + * The verifier changed their opcode from LDX|MEM|size + * to LDX|PROBE_MEM|size to make JITing easier. + */ + u32 align = __alignof__(struct exception_table_entry); + u32 extable_size = prog->aux->num_exentries * + sizeof(struct exception_table_entry); + + /* allocate module memory for x86 insns and extable */ + header = bpf_jit_binary_alloc(roundup(proglen, align) + extable_size, + &image, align, jit_fill_hole); if (!header) { prog = orig_prog; goto out_addrs; } + prog->aux->extable = (void *) image + roundup(proglen, align); } oldproglen = proglen; cond_resched(); @@ -1164,6 +1655,7 @@ out_image: if (image) { if (!prog->is_func || extra_pass) { + bpf_tail_call_direct_fixup(prog); bpf_jit_binary_lock_ro(header); } else { jit_data->addrs = addrs; diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 71e8a67337e2..276cf79b5d24 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -67,13 +67,13 @@ static inline void op_x86_warn_in_use(int counter) * cannot be monitored by any other counter, contact your * hardware or BIOS vendor. */ - pr_warning("oprofile: counter #%d on cpu #%d may already be used\n", - counter, smp_processor_id()); + pr_warn("oprofile: counter #%d on cpu #%d may already be used\n", + counter, smp_processor_id()); } static inline void op_x86_warn_reserved(int counter) { - pr_warning("oprofile: counter #%d is already reserved\n", counter); + pr_warn("oprofile: counter #%d is already reserved\n", counter); } extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index c202e1b07e29..425e025341db 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -917,9 +917,6 @@ static void __init kexec_enter_virtual_mode(void) if (efi_enabled(EFI_OLD_MEMMAP) && (__supported_pte_mask & _PAGE_NX)) runtime_code_page_mkexec(); - - /* clean DUMMY object */ - efi_delete_dummy_variable(); #endif } diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c index 6d193bb36021..089413cd944e 100644 --- a/arch/x86/platform/olpc/olpc-xo15-sci.c +++ b/arch/x86/platform/olpc/olpc-xo15-sci.c @@ -39,7 +39,7 @@ static int set_lid_wake_behavior(bool wake_on_close) status = acpi_execute_simple_method(NULL, "\\_SB.PCI0.LID.LIDW", wake_on_close); if (ACPI_FAILURE(status)) { - pr_warning(PFX "failed to set lid behavior\n"); + pr_warn(PFX "failed to set lid behavior\n"); return 1; } diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c index bf6016f8db4e..6259563760f9 100644 --- a/arch/x86/platform/sfi/sfi.c +++ b/arch/x86/platform/sfi/sfi.c @@ -26,8 +26,7 @@ static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; static void __init mp_sfi_register_lapic(u8 id) { if (MAX_LOCAL_APIC - id <= 0) { - pr_warning("Processor #%d invalid (max %d)\n", - id, MAX_LOCAL_APIC); + pr_warn("Processor #%d invalid (max %d)\n", id, MAX_LOCAL_APIC); return; } diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c index c2ee31953372..ece9cb9c1189 100644 --- a/arch/x86/platform/uv/bios_uv.c +++ b/arch/x86/platform/uv/bios_uv.c @@ -184,20 +184,20 @@ int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus) } EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target); -void uv_bios_init(void) +int uv_bios_init(void) { uv_systab = NULL; if ((uv_systab_phys == EFI_INVALID_TABLE_ADDR) || !uv_systab_phys || efi_runtime_disabled()) { pr_crit("UV: UVsystab: missing\n"); - return; + return -EEXIST; } uv_systab = ioremap(uv_systab_phys, sizeof(struct uv_systab)); if (!uv_systab || strncmp(uv_systab->signature, UV_SYSTAB_SIG, 4)) { pr_err("UV: UVsystab: bad signature!\n"); iounmap(uv_systab); - return; + return -EINVAL; } /* Starting with UV4 the UV systab size is variable */ @@ -208,8 +208,9 @@ void uv_bios_init(void) uv_systab = ioremap(uv_systab_phys, size); if (!uv_systab) { pr_err("UV: UVsystab: ioremap(%d) failed!\n", size); - return; + return -EFAULT; } } pr_info("UV: UVsystab: Revision:%x\n", uv_systab->revision); + return 0; } diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c index 3b95410ff0f8..2961234d0795 100644 --- a/arch/x86/purgatory/purgatory.c +++ b/arch/x86/purgatory/purgatory.c @@ -14,28 +14,10 @@ #include "../boot/string.h" -unsigned long purgatory_backup_dest __section(.kexec-purgatory); -unsigned long purgatory_backup_src __section(.kexec-purgatory); -unsigned long purgatory_backup_sz __section(.kexec-purgatory); - u8 purgatory_sha256_digest[SHA256_DIGEST_SIZE] __section(.kexec-purgatory); struct kexec_sha_region purgatory_sha_regions[KEXEC_SEGMENT_MAX] __section(.kexec-purgatory); -/* - * On x86, second kernel requries first 640K of memory to boot. Copy - * first 640K to a backup region in reserved memory range so that second - * kernel can use first 640K. - */ -static int copy_backup_region(void) -{ - if (purgatory_backup_dest) { - memcpy((void *)purgatory_backup_dest, - (void *)purgatory_backup_src, purgatory_backup_sz); - } - return 0; -} - static int verify_sha256_digest(void) { struct kexec_sha_region *ptr, *end; @@ -66,7 +48,6 @@ void purgatory(void) for (;;) ; } - copy_backup_region(); } /* diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 7dce39c8c034..262f83cad355 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -8,6 +8,7 @@ #include <asm/pgtable.h> #include <asm/realmode.h> #include <asm/tlbflush.h> +#include <asm/crash.h> struct real_mode_header *real_mode_header; u32 *trampoline_cr4_features; @@ -34,6 +35,7 @@ void __init reserve_real_mode(void) memblock_reserve(mem, size); set_real_mode_mem(mem); + crash_reserve_low_1M(); } static void __init setup_real_mode(void) diff --git a/arch/x86/realmode/rm/realmode.lds.S b/arch/x86/realmode/rm/realmode.lds.S index 3bb980800c58..64d135d1ee63 100644 --- a/arch/x86/realmode/rm/realmode.lds.S +++ b/arch/x86/realmode/rm/realmode.lds.S @@ -11,6 +11,7 @@ OUTPUT_FORMAT("elf32-i386") OUTPUT_ARCH(i386) +ENTRY(pa_text_start) SECTIONS { diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk index b02a36b2c14f..a42015b305f4 100644 --- a/arch/x86/tools/gen-insn-attr-x86.awk +++ b/arch/x86/tools/gen-insn-attr-x86.awk @@ -69,7 +69,7 @@ BEGIN { lprefix1_expr = "\\((66|!F3)\\)" lprefix2_expr = "\\(F3\\)" - lprefix3_expr = "\\((F2|!F3|66\\&F2)\\)" + lprefix3_expr = "\\((F2|!F3|66&F2)\\)" lprefix_expr = "\\((66|F2|F3)\\)" max_lprefix = 4 @@ -257,7 +257,7 @@ function convert_operands(count,opnd, i,j,imm,mod) return add_flags(imm, mod) } -/^[0-9a-f]+\:/ { +/^[0-9a-f]+:/ { if (NR == 1) next # get index diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c index 0d3365cb64de..a04551ee5568 100644 --- a/arch/x86/xen/efi.c +++ b/arch/x86/xen/efi.c @@ -57,19 +57,7 @@ static efi_system_table_t __init *xen_efi_probe(void) return NULL; /* Here we know that Xen runs on EFI platform. */ - - efi.get_time = xen_efi_get_time; - efi.set_time = xen_efi_set_time; - efi.get_wakeup_time = xen_efi_get_wakeup_time; - efi.set_wakeup_time = xen_efi_set_wakeup_time; - efi.get_variable = xen_efi_get_variable; - efi.get_next_variable = xen_efi_get_next_variable; - efi.set_variable = xen_efi_set_variable; - efi.query_variable_info = xen_efi_query_variable_info; - efi.update_capsule = xen_efi_update_capsule; - efi.query_capsule_caps = xen_efi_query_capsule_caps; - efi.get_next_high_mono_count = xen_efi_get_next_high_mono_count; - efi.reset_system = xen_efi_reset_system; + xen_efi_runtime_setup(); efi_systab_xen.tables = info->cfg.addr; efi_systab_xen.nr_tables = info->cfg.nent; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 750f46ad018a..205b1176084f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -269,19 +269,41 @@ void xen_reboot(int reason) BUG(); } +static int reboot_reason = SHUTDOWN_reboot; +static bool xen_legacy_crash; void xen_emergency_restart(void) { - xen_reboot(SHUTDOWN_reboot); + xen_reboot(reboot_reason); } static int xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr) { - if (!kexec_crash_loaded()) - xen_reboot(SHUTDOWN_crash); + if (!kexec_crash_loaded()) { + if (xen_legacy_crash) + xen_reboot(SHUTDOWN_crash); + + reboot_reason = SHUTDOWN_crash; + + /* + * If panic_timeout==0 then we are supposed to wait forever. + * However, to preserve original dom0 behavior we have to drop + * into hypervisor. (domU behavior is controlled by its + * config file) + */ + if (panic_timeout == 0) + panic_timeout = -1; + } return NOTIFY_DONE; } +static int __init parse_xen_legacy_crash(char *arg) +{ + xen_legacy_crash = true; + return 0; +} +early_param("xen_legacy_crash", parse_xen_legacy_crash); + static struct notifier_block xen_panic_block = { .notifier_call = xen_panic_event, .priority = INT_MIN diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 58f79ab32358..5bfea374a160 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -117,6 +117,14 @@ static void __init xen_banner(void) printk(KERN_INFO "Xen version: %d.%d%s%s\n", version >> 16, version & 0xffff, extra.extraversion, xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); + +#ifdef CONFIG_X86_32 + pr_warn("WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!\n" + "Support for running as 32-bit PV-guest under Xen will soon be removed\n" + "from the Linux kernel!\n" + "Please use either a 64-bit kernel or switch to HVM or PVH mode!\n" + "WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!\n"); +#endif } static void __init xen_pv_init_platform(void) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 548d1e0a5ba1..33b0e20df7fc 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -412,7 +412,7 @@ static unsigned long __init xen_set_identity_and_remap_chunk( remap_range_size = xen_find_pfn_range(&remap_pfn); if (!remap_range_size) { - pr_warning("Unable to find available pfn range, not remapping identity pages\n"); + pr_warn("Unable to find available pfn range, not remapping identity pages\n"); xen_set_identity_and_release_chunk(cur_pfn, cur_pfn + left, nr_pages); break; diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index 8b8f8355b938..2712e9155306 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -127,10 +127,9 @@ SYM_CODE_END(xen_iret) .globl xen_iret_start_crit, xen_iret_end_crit /* - * This is called by xen_hypervisor_callback in entry.S when it sees + * This is called by xen_hypervisor_callback in entry_32.S when it sees * that the EIP at the time of interrupt was between - * xen_iret_start_crit and xen_iret_end_crit. We're passed the EIP in - * %eax so we can do a more refined determination of what to do. + * xen_iret_start_crit and xen_iret_end_crit. * * The stack format at this point is: * ---------------- @@ -139,70 +138,46 @@ SYM_CODE_END(xen_iret) * eflags } outer exception info * cs } * eip } - * ---------------- <- edi (copy dest) - * eax : outer eax if it hasn't been restored * ---------------- - * eflags } nested exception info - * cs } (no ss/esp because we're nested - * eip } from the same ring) - * orig_eax }<- esi (copy src) - * - - - - - - - - - * fs } - * es } - * ds } SAVE_ALL state - * eax } - * : : - * ebx }<- esp + * eax : outer eax if it hasn't been restored * ---------------- + * eflags } + * cs } nested exception info + * eip } + * return address : (into xen_hypervisor_callback) * - * In order to deliver the nested exception properly, we need to shift - * everything from the return addr up to the error code so it sits - * just under the outer exception info. This means that when we - * handle the exception, we do it in the context of the outer - * exception rather than starting a new one. + * In order to deliver the nested exception properly, we need to discard the + * nested exception frame such that when we handle the exception, we do it + * in the context of the outer exception rather than starting a new one. * - * The only caveat is that if the outer eax hasn't been restored yet - * (ie, it's still on stack), we need to insert its value into the - * SAVE_ALL state before going on, since it's usermode state which we - * eventually need to restore. + * The only caveat is that if the outer eax hasn't been restored yet (i.e. + * it's still on stack), we need to restore its value here. */ SYM_CODE_START(xen_iret_crit_fixup) /* * Paranoia: Make sure we're really coming from kernel space. * One could imagine a case where userspace jumps into the * critical range address, but just before the CPU delivers a - * GP, it decides to deliver an interrupt instead. Unlikely? - * Definitely. Easy to avoid? Yes. The Intel documents - * explicitly say that the reported EIP for a bad jump is the - * jump instruction itself, not the destination, but some - * virtual environments get this wrong. + * PF, it decides to deliver an interrupt instead. Unlikely? + * Definitely. Easy to avoid? Yes. */ - movl PT_CS(%esp), %ecx - andl $SEGMENT_RPL_MASK, %ecx - cmpl $USER_RPL, %ecx - je 2f - - lea PT_ORIG_EAX(%esp), %esi - lea PT_EFLAGS(%esp), %edi + testb $2, 2*4(%esp) /* nested CS */ + jnz 2f /* * If eip is before iret_restore_end then stack * hasn't been restored yet. */ - cmp $iret_restore_end, %eax + cmpl $iret_restore_end, 1*4(%esp) jae 1f - movl 0+4(%edi), %eax /* copy EAX (just above top of frame) */ - movl %eax, PT_EAX(%esp) - - lea ESP_OFFSET(%edi), %edi /* move dest up over saved regs */ + movl 4*4(%esp), %eax /* load outer EAX */ + ret $4*4 /* discard nested EIP, CS, and EFLAGS as + * well as the just restored EAX */ - /* set up the copy */ -1: std - mov $PT_EIP / 4, %ecx /* saved regs up to orig_eax */ - rep movsl - cld +1: + ret $3*4 /* discard nested EIP, CS, and EFLAGS */ - lea 4(%edi), %esp /* point esp to new frame */ -2: jmp xen_do_upcall +2: + ret SYM_CODE_END(xen_iret_crit_fixup) |