From e87f4152e542610d0b4c6c8548964a68a59d2040 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 23 Mar 2022 20:02:41 +0100 Subject: task_stack, x86/cea: Force-inline stack helpers Force-inline two stack helpers to fix the following objtool warnings: vmlinux.o: warning: objtool: in_task_stack()+0xc: call to task_stack_page() leaves .noinstr.text section vmlinux.o: warning: objtool: in_entry_stack()+0x10: call to cpu_entry_stack() leaves .noinstr.text section Signed-off-by: Borislav Petkov Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220324183607.31717-2-bp@alien8.de --- arch/x86/include/asm/cpu_entry_area.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index dd5ea1bdf04c..75efc4c6f076 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -143,7 +143,7 @@ extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags); extern struct cpu_entry_area *get_cpu_entry_area(int cpu); -static inline struct entry_stack *cpu_entry_stack(int cpu) +static __always_inline struct entry_stack *cpu_entry_stack(int cpu) { return &get_cpu_entry_area(cpu)->entry_stack_page.stack; } -- cgit v1.2.3-59-g8ed1b From 6b91ec4ad290a808a894e7d3448eb82d0b8ef8d5 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 23 Mar 2022 21:52:44 +0100 Subject: x86/kvm/svm: Force-inline GHCB accessors In order to fix: vmlinux.o: warning: objtool: __sev_es_nmi_complete()+0x4c: call to ghcb_set_sw_exit_code() leaves .noinstr.text section Signed-off-by: Borislav Petkov Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220324183607.31717-3-bp@alien8.de --- arch/x86/include/asm/svm.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index f70a5108d464..e255039b36b6 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -444,23 +444,23 @@ struct vmcb { (offsetof(struct vmcb_save_area, field) / sizeof(u64)) #define DEFINE_GHCB_ACCESSORS(field) \ - static inline bool ghcb_##field##_is_valid(const struct ghcb *ghcb) \ + static __always_inline bool ghcb_##field##_is_valid(const struct ghcb *ghcb) \ { \ return test_bit(GHCB_BITMAP_IDX(field), \ (unsigned long *)&ghcb->save.valid_bitmap); \ } \ \ - static inline u64 ghcb_get_##field(struct ghcb *ghcb) \ + static __always_inline u64 ghcb_get_##field(struct ghcb *ghcb) \ { \ return ghcb->save.field; \ } \ \ - static inline u64 ghcb_get_##field##_if_valid(struct ghcb *ghcb) \ + static __always_inline u64 ghcb_get_##field##_if_valid(struct ghcb *ghcb) \ { \ return ghcb_##field##_is_valid(ghcb) ? 
ghcb->save.field : 0; \ } \ \ - static inline void ghcb_set_##field(struct ghcb *ghcb, u64 value) \ + static __always_inline void ghcb_set_##field(struct ghcb *ghcb, u64 value) \ { \ __set_bit(GHCB_BITMAP_IDX(field), \ (unsigned long *)&ghcb->save.valid_bitmap); \ -- cgit v1.2.3-59-g8ed1b From ace1a98519270c586c0d4179419292df67441cd1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 23 Mar 2022 23:24:12 +0100 Subject: x86/mm: Force-inline __phys_addr_nodebug() Fix: vmlinux.o: warning: objtool: __sev_es_nmi_complete()+0x8b: call to __phys_addr_nodebug() leaves .noinstr.text section Signed-off-by: Borislav Petkov Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220324183607.31717-4-bp@alien8.de --- arch/x86/include/asm/page_64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index e9c86299b835..baa70451b8df 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -16,7 +16,7 @@ extern unsigned long page_offset_base; extern unsigned long vmalloc_base; extern unsigned long vmemmap_base; -static inline unsigned long __phys_addr_nodebug(unsigned long x) +static __always_inline unsigned long __phys_addr_nodebug(unsigned long x) { unsigned long y = x - __START_KERNEL_map; -- cgit v1.2.3-59-g8ed1b From 1625c833db93516faaac5feedadf8d19c14238b6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 27 Jan 2022 12:56:21 +0100 Subject: x86/cpu: Allow feature bit names from /proc/cpuinfo in clearcpuid= Having to give the X86_FEATURE array indices in order to disable a feature bit for testing is not really user-friendly. So accept the feature bit names too. Some feature bits don't have names so there the array indices are still accepted, of course. Clearing CPUID flags is not something which should be done in production so taint the kernel too. An exemplary cmdline would then be something like: clearcpuid=de,440,smca,succory,bmi1,3dnow ("succory" is wrong on purpose). And it says: [ ... ] Clearing CPUID bits: de 13:24 smca (unknown: succory) bmi1 3dnow [ Fix CONFIG_X86_FEATURE_NAMES=n build error as reported by the 0day robot: https://lore.kernel.org/r/202203292206.ICsY2RKX-lkp@intel.com ] Signed-off-by: Borislav Petkov Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20220127115626.14179-2-bp@alien8.de --- Documentation/admin-guide/kernel-parameters.txt | 11 +++-- arch/x86/include/asm/cpufeature.h | 7 ++- arch/x86/kernel/cpu/common.c | 64 ++++++++++++++++++++----- 3 files changed, 65 insertions(+), 17 deletions(-) (limited to 'arch/x86/include') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3f1cc5e317ed..0ea17869a089 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -631,12 +631,17 @@ Defaults to zero when built as a module and to 10 seconds when built into the kernel. - clearcpuid=BITNUM[,BITNUM...] [X86] + clearcpuid=X[,X...] [X86] Disable CPUID feature X for the kernel. See arch/x86/include/asm/cpufeatures.h for the valid bit - numbers. Note the Linux specific bits are not necessarily - stable over kernel options, but the vendor specific + numbers X. Note the Linux-specific bits are not necessarily + stable over kernel options, but the vendor-specific ones should be. 
+ X can also be a string as appearing in the flags: line + in /proc/cpuinfo which does not have the above + instability issue. However, not all features have names + in /proc/cpuinfo. + Note that using this option will taint your kernel. Also note that user programs calling CPUID directly or using the feature without checking anything will still see it. This just prevents it from diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 1261842d006c..66d3e3b1d24d 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -34,14 +34,17 @@ enum cpuid_leafs CPUID_8000_001F_EAX, }; +#define X86_CAP_FMT_NUM "%d:%d" +#define x86_cap_flag_num(flag) ((flag) >> 5), ((flag) & 31) + #ifdef CONFIG_X86_FEATURE_NAMES extern const char * const x86_cap_flags[NCAPINTS*32]; extern const char * const x86_power_flags[32]; #define X86_CAP_FMT "%s" #define x86_cap_flag(flag) x86_cap_flags[flag] #else -#define X86_CAP_FMT "%d:%d" -#define x86_cap_flag(flag) ((flag) >> 5), ((flag) & 31) +#define X86_CAP_FMT X86_CAP_FMT_NUM +#define x86_cap_flag x86_cap_flag_num #endif /* diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index ed4417500700..69c7ea84b005 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1368,8 +1368,8 @@ static void detect_nopl(void) static void __init cpu_parse_early_param(void) { char arg[128]; - char *argptr = arg; - int arglen, res, bit; + char *argptr = arg, *opt; + int arglen, taint = 0; #ifdef CONFIG_X86_32 if (cmdline_find_option_bool(boot_command_line, "no387")) @@ -1397,21 +1397,61 @@ static void __init cpu_parse_early_param(void) return; pr_info("Clearing CPUID bits:"); - do { - res = get_option(&argptr, &bit); - if (res == 0 || res == 3) - break; - /* If the argument was too long, the last bit may be cut off */ - if (res == 1 && arglen >= sizeof(arg)) - break; + while (argptr) { + bool found __maybe_unused = false; + unsigned int bit; + + opt = strsep(&argptr, ","); + + /* + * Handle naked numbers first for feature flags which don't + * have names. + */ + if (!kstrtouint(opt, 10, &bit)) { + if (bit < NCAPINTS * 32) { + +#ifdef CONFIG_X86_FEATURE_NAMES + /* empty-string, i.e., ""-defined feature flags */ + if (!x86_cap_flags[bit]) + pr_cont(" " X86_CAP_FMT_NUM, x86_cap_flag_num(bit)); + else +#endif + pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit)); + + setup_clear_cpu_cap(bit); + taint++; + } + /* + * The assumption is that there are no feature names with only + * numbers in the name thus go to the next argument. + */ + continue; + } + +#ifdef CONFIG_X86_FEATURE_NAMES + for (bit = 0; bit < 32 * NCAPINTS; bit++) { + if (!x86_cap_flag(bit)) + continue; - if (bit >= 0 && bit < NCAPINTS * 32) { - pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit)); + if (strcmp(x86_cap_flag(bit), opt)) + continue; + + pr_cont(" %s", opt); setup_clear_cpu_cap(bit); + taint++; + found = true; + break; } - } while (res == 2); + + if (!found) + pr_cont(" (unknown: %s)", opt); +#endif + } pr_cont("\n"); + + if (taint) + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); } /* -- cgit v1.2.3-59-g8ed1b From dbae0a934f09208075ec3e73491bd0844e1397b3 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 27 Jan 2022 12:56:23 +0100 Subject: x86/cpu: Remove CONFIG_X86_SMAP and "nosmap" Those were added as part of the SMAP enablement but SMAP is currently an integral part of kernel proper and there's no need to disable it anymore. Rip out that functionality. 
Leave --uaccess default on for objtool as this is what objtool should do by default anyway. If still needed - clearcpuid=smap. Signed-off-by: Borislav Petkov Reviewed-by: Lai Jiangshan Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20220127115626.14179-4-bp@alien8.de --- Documentation/admin-guide/kernel-parameters.txt | 2 +- Documentation/x86/cpuinfo.rst | 5 ++--- arch/x86/Kconfig | 11 ----------- arch/x86/include/asm/disabled-features.h | 8 +------- arch/x86/include/asm/smap.h | 24 ------------------------ arch/x86/kernel/cpu/common.c | 15 +-------------- scripts/Makefile.build | 2 +- scripts/link-vmlinux.sh | 6 +++--- tools/arch/x86/include/asm/disabled-features.h | 8 +------- 9 files changed, 10 insertions(+), 71 deletions(-) (limited to 'arch/x86/include') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a9f3d3158e77..e0bb710f0fa9 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3461,7 +3461,7 @@ noexec=on: enable non-executable mappings (default) noexec=off: disable non-executable mappings - nosmap [X86,PPC] + nosmap [PPC] Disable SMAP (Supervisor Mode Access Prevention) even if it is supported by processor. diff --git a/Documentation/x86/cpuinfo.rst b/Documentation/x86/cpuinfo.rst index 5d54c39a063f..12fbe2b1e98a 100644 --- a/Documentation/x86/cpuinfo.rst +++ b/Documentation/x86/cpuinfo.rst @@ -140,9 +140,8 @@ from #define X86_FEATURE_UMIP (16*32 + 2). In addition, there exists a variety of custom command-line parameters that disable specific features. The list of parameters includes, but is not limited -to, nofsgsbase, nosmap, and nosmep. 5-level paging can also be disabled using -"no5lvl". SMAP and SMEP are disabled with the aforementioned parameters, -respectively. +to, nofsgsbase, and nosmep. 5-level paging can also be disabled using +"no5lvl". SMEP is disabled with the aforementioned parameter. e: The feature was known to be non-functional. ---------------------------------------------- diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b0142e01002e..5bc8bee64bb0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1816,17 +1816,6 @@ config ARCH_RANDOM If supported, this is a high bandwidth, cryptographically secure hardware random number generator. -config X86_SMAP - def_bool y - prompt "Supervisor Mode Access Prevention" if EXPERT - help - Supervisor Mode Access Prevention (SMAP) is a security - feature in newer Intel processors. There is a small - performance cost if this enabled and turned on; there is - also a small increase in the kernel size if this is enabled. - - If unsure, say Y. - config X86_UMIP def_bool y prompt "User Mode Instruction Prevention" if EXPERT diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 1231d63f836d..1ae0fab7d902 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -10,12 +10,6 @@ * cpu_feature_enabled(). 
*/ -#ifdef CONFIG_X86_SMAP -# define DISABLE_SMAP 0 -#else -# define DISABLE_SMAP (1<<(X86_FEATURE_SMAP & 31)) -#endif - #ifdef CONFIG_X86_UMIP # define DISABLE_UMIP 0 #else @@ -80,7 +74,7 @@ #define DISABLED_MASK6 0 #define DISABLED_MASK7 (DISABLE_PTI) #define DISABLED_MASK8 0 -#define DISABLED_MASK9 (DISABLE_SMAP|DISABLE_SGX) +#define DISABLED_MASK9 (DISABLE_SGX) #define DISABLED_MASK10 0 #define DISABLED_MASK11 0 #define DISABLED_MASK12 0 diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index d17b39893b79..bab490379c65 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -19,25 +19,14 @@ #ifdef __ASSEMBLY__ -#ifdef CONFIG_X86_SMAP - #define ASM_CLAC \ ALTERNATIVE "", __ASM_CLAC, X86_FEATURE_SMAP #define ASM_STAC \ ALTERNATIVE "", __ASM_STAC, X86_FEATURE_SMAP -#else /* CONFIG_X86_SMAP */ - -#define ASM_CLAC -#define ASM_STAC - -#endif /* CONFIG_X86_SMAP */ - #else /* __ASSEMBLY__ */ -#ifdef CONFIG_X86_SMAP - static __always_inline void clac(void) { /* Note: a barrier is implicit in alternative() */ @@ -76,19 +65,6 @@ static __always_inline void smap_restore(unsigned long flags) #define ASM_STAC \ ALTERNATIVE("", __ASM_STAC, X86_FEATURE_SMAP) -#else /* CONFIG_X86_SMAP */ - -static inline void clac(void) { } -static inline void stac(void) { } - -static inline unsigned long smap_save(void) { return 0; } -static inline void smap_restore(unsigned long flags) { } - -#define ASM_CLAC -#define ASM_STAC - -#endif /* CONFIG_X86_SMAP */ - #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_SMAP_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c71d1075db93..747df070fb5e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -382,13 +382,6 @@ static __always_inline void setup_smep(struct cpuinfo_x86 *c) cr4_set_bits(X86_CR4_SMEP); } -static __init int setup_disable_smap(char *arg) -{ - setup_clear_cpu_cap(X86_FEATURE_SMAP); - return 1; -} -__setup("nosmap", setup_disable_smap); - static __always_inline void setup_smap(struct cpuinfo_x86 *c) { unsigned long eflags = native_save_fl(); @@ -396,14 +389,8 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) /* This should have been cleared long ago */ BUG_ON(eflags & X86_EFLAGS_AC); - if (cpu_has(c, X86_FEATURE_SMAP)) { -#ifdef CONFIG_X86_SMAP + if (cpu_has(c, X86_FEATURE_SMAP)) cr4_set_bits(X86_CR4_SMAP); -#else - clear_cpu_cap(c, X86_FEATURE_SMAP); - cr4_clear_bits(X86_CR4_SMAP); -#endif - } } static __always_inline void setup_umip(struct cpuinfo_x86 *c) diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 9717e6f6fb31..7e7aa1d030a6 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -233,7 +233,7 @@ objtool_args = \ $(if $(CONFIG_FRAME_POINTER),, --no-fp) \ $(if $(CONFIG_GCOV_KERNEL)$(CONFIG_LTO_CLANG), --no-unreachable)\ $(if $(CONFIG_RETPOLINE), --retpoline) \ - $(if $(CONFIG_X86_SMAP), --uaccess) \ + --uaccess \ $(if $(CONFIG_FTRACE_MCOUNT_USE_OBJTOOL), --mcount) \ $(if $(CONFIG_SLS), --sls) diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 20f44504a644..3a2fffdf49d4 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -146,9 +146,9 @@ objtool_link() if is_enabled CONFIG_RETPOLINE; then objtoolopt="${objtoolopt} --retpoline" fi - if is_enabled CONFIG_X86_SMAP; then - objtoolopt="${objtoolopt} --uaccess" - fi + + objtoolopt="${objtoolopt} --uaccess" + if is_enabled CONFIG_SLS; then objtoolopt="${objtoolopt} --sls" fi diff --git a/tools/arch/x86/include/asm/disabled-features.h 
b/tools/arch/x86/include/asm/disabled-features.h index 1231d63f836d..1ae0fab7d902 100644 --- a/tools/arch/x86/include/asm/disabled-features.h +++ b/tools/arch/x86/include/asm/disabled-features.h @@ -10,12 +10,6 @@ * cpu_feature_enabled(). */ -#ifdef CONFIG_X86_SMAP -# define DISABLE_SMAP 0 -#else -# define DISABLE_SMAP (1<<(X86_FEATURE_SMAP & 31)) -#endif - #ifdef CONFIG_X86_UMIP # define DISABLE_UMIP 0 #else @@ -80,7 +74,7 @@ #define DISABLED_MASK6 0 #define DISABLED_MASK7 (DISABLE_PTI) #define DISABLED_MASK8 0 -#define DISABLED_MASK9 (DISABLE_SMAP|DISABLE_SGX) +#define DISABLED_MASK9 (DISABLE_SGX) #define DISABLED_MASK10 0 #define DISABLED_MASK11 0 #define DISABLED_MASK12 0 -- cgit v1.2.3-59-g8ed1b From 76ea0025a214cdf0d2c204f4c21cbffa9fb57c32 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 27 Jan 2022 12:56:25 +0100 Subject: x86/cpu: Remove "noexec" It doesn't make any sense to disable non-executable mappings - security-wise or else. So rip out that switch and move the remaining code into setup.c and delete setup_nx.c Signed-off-by: Borislav Petkov Reviewed-by: Lai Jiangshan Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20220127115626.14179-6-bp@alien8.de --- Documentation/admin-guide/kernel-parameters.txt | 5 -- Documentation/x86/x86_64/boot-options.rst | 9 ---- arch/x86/include/asm/proto.h | 1 - arch/x86/kernel/setup.c | 28 +++++++++-- arch/x86/mm/Makefile | 3 +- arch/x86/mm/init_64.c | 1 - arch/x86/mm/setup_nx.c | 62 ------------------------- 7 files changed, 26 insertions(+), 83 deletions(-) delete mode 100644 arch/x86/mm/setup_nx.c (limited to 'arch/x86/include') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 39ac2c14dd71..a2299b2ff2c8 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3456,11 +3456,6 @@ noexec [IA-64] - noexec [X86] - On X86-32 available only on PAE configured kernels. - noexec=on: enable non-executable mappings (default) - noexec=off: disable non-executable mappings - nosmap [PPC] Disable SMAP (Supervisor Mode Access Prevention) even if it is supported by processor. diff --git a/Documentation/x86/x86_64/boot-options.rst b/Documentation/x86/x86_64/boot-options.rst index 07aa0007f346..fb6030a25f08 100644 --- a/Documentation/x86/x86_64/boot-options.rst +++ b/Documentation/x86/x86_64/boot-options.rst @@ -157,15 +157,6 @@ Rebooting newer BIOS, or newer board) using this option will ignore the built-in quirk table, and use the generic default reboot actions. 
-Non Executable Mappings -======================= - - noexec=on|off - on - Enable(default) - off - Disable - NUMA ==== diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index feed36d44d04..0f899c8d7a4e 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -35,7 +35,6 @@ void xen_entry_INT80_compat(void); #endif void x86_configure_nx(void); -void x86_report_nx(void); extern int reboot_force; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c95b9ac5a457..249981bf3d8a 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -756,6 +756,30 @@ dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) return 0; } +void x86_configure_nx(void) +{ + if (boot_cpu_has(X86_FEATURE_NX)) + __supported_pte_mask |= _PAGE_NX; + else + __supported_pte_mask &= ~_PAGE_NX; +} + +static void __init x86_report_nx(void) +{ + if (!boot_cpu_has(X86_FEATURE_NX)) { + printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " + "missing in CPU!\n"); + } else { +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) + printk(KERN_INFO "NX (Execute Disable) protection: active\n"); +#else + /* 32bit non-PAE kernel, NX cannot be used */ + printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " + "cannot be enabled: non-PAE kernel!\n"); +#endif + } +} + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -896,9 +920,7 @@ void __init setup_arch(char **cmdline_p) /* * x86_configure_nx() is called before parse_early_param() to detect * whether hardware doesn't support NX (so that the early EHCI debug - * console setup can safely call set_fixmap()). It may then be called - * again from within noexec_setup() during parsing early parameters - * to honor the respective command line option. + * console setup can safely call set_fixmap()). */ x86_configure_nx(); diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index fe3d3061fc11..d957dc15b371 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -20,13 +20,12 @@ CFLAGS_REMOVE_mem_encrypt_identity.o = -pg endif obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o mmap.o \ - pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o maccess.o + pgtable.o physaddr.o tlb.o cpu_entry_area.o maccess.o obj-y += pat/ # Make sure __phys_addr has no stackprotector CFLAGS_physaddr.o := -fno-stack-protector -CFLAGS_setup_nx.o := -fno-stack-protector CFLAGS_mem_encrypt_identity.o := -fno-stack-protector CFLAGS_fault.o := -I $(srctree)/$(src)/../include/asm/trace diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 96d34ebb20a9..d2e484efdfa1 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -110,7 +110,6 @@ int force_personality32; /* * noexec32=on|off * Control non executable heap for 32bit processes. - * To control the stack too use noexec=off * * on PROT_READ does not imply PROT_EXEC for 32-bit processes (default) * off PROT_READ implies PROT_EXEC diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c deleted file mode 100644 index ed5667f5169f..000000000000 --- a/arch/x86/mm/setup_nx.c +++ /dev/null @@ -1,62 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include - -#include -#include - -static int disable_nx; - -/* - * noexec = on|off - * - * Control non-executable mappings for processes. 
- * - * on Enable - * off Disable - */ -static int __init noexec_setup(char *str) -{ - if (!str) - return -EINVAL; - if (!strncmp(str, "on", 2)) { - disable_nx = 0; - } else if (!strncmp(str, "off", 3)) { - disable_nx = 1; - } - x86_configure_nx(); - return 0; -} -early_param("noexec", noexec_setup); - -void x86_configure_nx(void) -{ - if (boot_cpu_has(X86_FEATURE_NX) && !disable_nx) - __supported_pte_mask |= _PAGE_NX; - else - __supported_pte_mask &= ~_PAGE_NX; -} - -void __init x86_report_nx(void) -{ - if (!boot_cpu_has(X86_FEATURE_NX)) { - printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " - "missing in CPU!\n"); - } else { -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) - if (disable_nx) { - printk(KERN_INFO "NX (Execute Disable) protection: " - "disabled by kernel command line option\n"); - } else { - printk(KERN_INFO "NX (Execute Disable) protection: " - "active\n"); - } -#else - /* 32bit non-PAE kernel, NX cannot be used */ - printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " - "cannot be enabled: non-PAE kernel!\n"); -#endif - } -} -- cgit v1.2.3-59-g8ed1b From 70431c63d7ed31baf18b0083ba5473274363a174 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 31 Mar 2022 11:05:54 -0700 Subject: x86/pkeys: Clean up arch_set_user_pkey_access() declaration arch_set_user_pkey_access() was declared two times in the header. Remove the 2nd declaration. Suggested-by: "Edgecombe, Rick P" Signed-off-by: Ira Weiny Signed-off-by: Dave Hansen Link: https://lkml.kernel.org/r/20220331180554.2945884-1-ira.weiny@intel.com --- arch/x86/include/asm/pkeys.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index 1d5f14aff5f6..9c530530b9a7 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -118,8 +118,6 @@ int mm_pkey_free(struct mm_struct *mm, int pkey) return 0; } -extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, - unsigned long init_val); extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val); -- cgit v1.2.3-59-g8ed1b From 5a0893088a20252cc268cbeabb25e883c2b6f94f Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 31 Mar 2022 11:06:55 -0700 Subject: x86/pkeys: Remove __arch_set_user_pkey_access() declaration In the x86 code __arch_set_user_pkey_access() is not used and is not defined. Remove the dead declaration. 
Signed-off-by: Ira Weiny Signed-off-by: Dave Hansen Link: https://lkml.kernel.org/r/20220331180655.2946086-1-ira.weiny@intel.com --- arch/x86/include/asm/pkeys.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index 9c530530b9a7..2e6c04d8a45b 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -41,9 +41,6 @@ static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma, return __arch_override_mprotect_pkey(vma, prot, pkey); } -extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, - unsigned long init_val); - #define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3) #define mm_pkey_allocation_map(mm) (mm->context.pkey_allocation_map) @@ -118,9 +115,6 @@ int mm_pkey_free(struct mm_struct *mm, int pkey) return 0; } -extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, - unsigned long init_val); - static inline int vma_pkey(struct vm_area_struct *vma) { unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 | -- cgit v1.2.3-59-g8ed1b From a77d41ac3a0f41c80120ec5b8b08ab284fec950a Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 22 Mar 2022 15:15:06 -0700 Subject: x86/cpufeatures: Add AMD Fam19h Branch Sampling feature Add a cpu feature for AMD Fam19h Branch Sampling feature as bit 31 of EBX on CPUID leaf function 0x80000008. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220322221517.2510440-3-eranian@google.com --- arch/x86/include/asm/cpufeatures.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 73e643ae94b6..0d62afd525e3 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -315,6 +315,7 @@ #define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ #define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */ #define X86_FEATURE_CPPC (13*32+27) /* Collaborative Processor Performance Control */ +#define X86_FEATURE_BRS (13*32+31) /* Branch Sampling available */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ -- cgit v1.2.3-59-g8ed1b From ada543459cab7f653dcacdaba4011a8bb19c627c Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 22 Mar 2022 15:15:07 -0700 Subject: perf/x86/amd: Add AMD Fam19h Branch Sampling support Add support for the AMD Fam19h 16-deep branch sampling feature as described in the AMD PPR Fam19h Model 01h Revision B1. This is a model specific extension. It is not an architected AMD feature. The Branch Sampling (BRS) operates with a 16-deep saturating buffer in MSR registers. There is no branch type filtering. All control flow changes are captured. BRS relies on specific programming of the core PMU of Fam19h. In particular, the following requirements must be met: - the sampling period be greater than 16 (BRS depth) - the sampling period must use a fixed and not frequency mode BRS interacts with the NMI interrupt as well. Because enabling BRS is expensive, it is only activated after P event occurrences, where P is the desired sampling period. 
At P occurrences of the event, the counter overflows, the CPU catches the interrupt, activates BRS for 16 branches until it saturates, and then delivers the NMI to the kernel. Between the overflow and the time BRS activates more branches may be executed skewing the period. All along, the sampling event keeps counting. The skid may be attenuated by reducing the sampling period by 16 (subsequent patch). BRS is integrated into perf_events seamlessly via the same PERF_RECORD_BRANCH_STACK sample format. BRS generates perf_branch_entry records in the sampling buffer. No prediction information is supported. The branches are stored in reverse order of execution. The most recent branch is the first entry in each record. No modification to the perf tool is necessary. BRS can be used with any sampling event. However, it is recommended to use the RETIRED_BRANCH_INSTRUCTIONS event because it matches what the BRS captures. $ perf record -b -c 1000037 -e cpu/event=0xc2,name=ret_br_instructions/ test $ perf report -D 56531696056126 0x193c000 [0x1a8]: PERF_RECORD_SAMPLE(IP, 0x2): 18122/18230: 0x401d24 period: 1000037 addr: 0 ... branch stack: nr:16 ..... 0: 0000000000401d24 -> 0000000000401d5a 0 cycles 0 ..... 1: 0000000000401d5c -> 0000000000401d24 0 cycles 0 ..... 2: 0000000000401d22 -> 0000000000401d5c 0 cycles 0 ..... 3: 0000000000401d5e -> 0000000000401d22 0 cycles 0 ..... 4: 0000000000401d20 -> 0000000000401d5e 0 cycles 0 ..... 5: 0000000000401d3e -> 0000000000401d20 0 cycles 0 ..... 6: 0000000000401d42 -> 0000000000401d3e 0 cycles 0 ..... 7: 0000000000401d3c -> 0000000000401d42 0 cycles 0 ..... 8: 0000000000401d44 -> 0000000000401d3c 0 cycles 0 ..... 9: 0000000000401d3a -> 0000000000401d44 0 cycles 0 ..... 10: 0000000000401d46 -> 0000000000401d3a 0 cycles 0 ..... 11: 0000000000401d38 -> 0000000000401d46 0 cycles 0 ..... 12: 0000000000401d48 -> 0000000000401d38 0 cycles 0 ..... 13: 0000000000401d36 -> 0000000000401d48 0 cycles 0 ..... 14: 0000000000401d4a -> 0000000000401d36 0 cycles 0 ..... 15: 0000000000401d34 -> 0000000000401d4a 0 cycles 0 ... thread: test:18230 ...... 
dso: test Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220322221517.2510440-4-eranian@google.com --- arch/x86/events/amd/Makefile | 2 +- arch/x86/events/amd/brs.c | 317 +++++++++++++++++++++++++++++++++++++++ arch/x86/events/amd/core.c | 233 +++++++++++++++++++++++++++- arch/x86/events/core.c | 10 +- arch/x86/events/perf_event.h | 101 +++++++++++-- arch/x86/include/asm/msr-index.h | 4 + 6 files changed, 645 insertions(+), 22 deletions(-) create mode 100644 arch/x86/events/amd/brs.c (limited to 'arch/x86/include') diff --git a/arch/x86/events/amd/Makefile b/arch/x86/events/amd/Makefile index 6cbe38d5fd9d..cf323ffab5cd 100644 --- a/arch/x86/events/amd/Makefile +++ b/arch/x86/events/amd/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_CPU_SUP_AMD) += core.o +obj-$(CONFIG_CPU_SUP_AMD) += core.o brs.o obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += power.o obj-$(CONFIG_X86_LOCAL_APIC) += ibs.o obj-$(CONFIG_PERF_EVENTS_AMD_UNCORE) += amd-uncore.o diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c new file mode 100644 index 000000000000..3c13c484c637 --- /dev/null +++ b/arch/x86/events/amd/brs.c @@ -0,0 +1,317 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Implement support for AMD Fam19h Branch Sampling feature + * Based on specifications published in AMD PPR Fam19 Model 01 + * + * Copyright 2021 Google LLC + * Contributed by Stephane Eranian + */ +#include +#include +#include + +#include "../perf_event.h" + +#define BRS_POISON 0xFFFFFFFFFFFFFFFEULL /* mark limit of valid entries */ + +/* Debug Extension Configuration register layout */ +union amd_debug_extn_cfg { + __u64 val; + struct { + __u64 rsvd0:2, /* reserved */ + brsmen:1, /* branch sample enable */ + rsvd4_3:2,/* reserved - must be 0x3 */ + vb:1, /* valid branches recorded */ + rsvd2:10, /* reserved */ + msroff:4, /* index of next entry to write */ + rsvd3:4, /* reserved */ + pmc:3, /* #PMC holding the sampling event */ + rsvd4:37; /* reserved */ + }; +}; + +static inline unsigned int brs_from(int idx) +{ + return MSR_AMD_SAMP_BR_FROM + 2 * idx; +} + +static inline unsigned int brs_to(int idx) +{ + return MSR_AMD_SAMP_BR_FROM + 2 * idx + 1; +} + +static inline void set_debug_extn_cfg(u64 val) +{ + /* bits[4:3] must always be set to 11b */ + wrmsrl(MSR_AMD_DBG_EXTN_CFG, val | 3ULL << 3); +} + +static inline u64 get_debug_extn_cfg(void) +{ + u64 val; + + rdmsrl(MSR_AMD_DBG_EXTN_CFG, val); + return val; +} + +static bool __init amd_brs_detect(void) +{ + if (!boot_cpu_has(X86_FEATURE_BRS)) + return false; + + switch (boot_cpu_data.x86) { + case 0x19: /* AMD Fam19h (Zen3) */ + x86_pmu.lbr_nr = 16; + + /* No hardware filtering supported */ + x86_pmu.lbr_sel_map = NULL; + x86_pmu.lbr_sel_mask = 0; + break; + default: + return false; + } + + return true; +} + +/* + * Current BRS implementation does not support branch type or privilege level + * filtering. Therefore, this function simply enforces these limitations. No need for + * a br_sel_map. Software filtering is not supported because it would not correlate well + * with a sampling period. 
+ */ +int amd_brs_setup_filter(struct perf_event *event) +{ + u64 type = event->attr.branch_sample_type; + + /* No BRS support */ + if (!x86_pmu.lbr_nr) + return -EOPNOTSUPP; + + /* Can only capture all branches, i.e., no filtering */ + if ((type & ~PERF_SAMPLE_BRANCH_PLM_ALL) != PERF_SAMPLE_BRANCH_ANY) + return -EINVAL; + + /* can only capture at all priv levels due to the way BRS works */ + if ((type & PERF_SAMPLE_BRANCH_PLM_ALL) != PERF_SAMPLE_BRANCH_PLM_ALL) + return -EINVAL; + + return 0; +} + +/* tos = top of stack, i.e., last valid entry written */ +static inline int amd_brs_get_tos(union amd_debug_extn_cfg *cfg) +{ + /* + * msroff: index of next entry to write so top-of-stack is one off + * if BRS is full then msroff is set back to 0. + */ + return (cfg->msroff ? cfg->msroff : x86_pmu.lbr_nr) - 1; +} + +/* + * make sure we have a sane BRS offset to begin with + * especially with kexec + */ +void amd_brs_reset(void) +{ + /* + * Reset config + */ + set_debug_extn_cfg(0); + + /* + * Mark first entry as poisoned + */ + wrmsrl(brs_to(0), BRS_POISON); +} + +int __init amd_brs_init(void) +{ + if (!amd_brs_detect()) + return -EOPNOTSUPP; + + pr_cont("%d-deep BRS, ", x86_pmu.lbr_nr); + + return 0; +} + +void amd_brs_enable(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + union amd_debug_extn_cfg cfg; + + /* Activate only on first user */ + if (++cpuc->brs_active > 1) + return; + + cfg.val = 0; /* reset all fields */ + cfg.brsmen = 1; /* enable branch sampling */ + + /* Set enable bit */ + set_debug_extn_cfg(cfg.val); +} + +void amd_brs_enable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + if (cpuc->lbr_users) + amd_brs_enable(); +} + +void amd_brs_disable(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + union amd_debug_extn_cfg cfg; + + /* Check if active (could be disabled via x86_pmu_disable_all()) */ + if (!cpuc->brs_active) + return; + + /* Only disable for last user */ + if (--cpuc->brs_active) + return; + + /* + * Clear the brsmen bit but preserve the others as they contain + * useful state such as vb and msroff + */ + cfg.val = get_debug_extn_cfg(); + + /* + * When coming in on interrupt and BRS is full, then hw will have + * already stopped BRS, no need to issue wrmsr again + */ + if (cfg.brsmen) { + cfg.brsmen = 0; + set_debug_extn_cfg(cfg.val); + } +} + +void amd_brs_disable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + if (cpuc->lbr_users) + amd_brs_disable(); +} + +/* + * Caller must ensure amd_brs_inuse() is true before calling + * return: + */ +void amd_brs_drain(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_event *event = cpuc->events[0]; + struct perf_branch_entry *br = cpuc->lbr_entries; + union amd_debug_extn_cfg cfg; + u32 i, nr = 0, num, tos, start; + u32 shift = 64 - boot_cpu_data.x86_virt_bits; + + /* + * BRS event forced on PMC0, + * so check if there is an event. 
+ * It is possible to have lbr_users > 0 but the event + * not yet scheduled due to long latency PMU irq + */ + if (!event) + goto empty; + + cfg.val = get_debug_extn_cfg(); + + /* Sanity check [0-x86_pmu.lbr_nr] */ + if (WARN_ON_ONCE(cfg.msroff >= x86_pmu.lbr_nr)) + goto empty; + + /* No valid branch */ + if (cfg.vb == 0) + goto empty; + + /* + * msr.off points to next entry to be written + * tos = most recent entry index = msr.off - 1 + * BRS register buffer saturates, so we know we have + * start < tos and that we have to read from start to tos + */ + start = 0; + tos = amd_brs_get_tos(&cfg); + + num = tos - start + 1; + + /* + * BRS is only one pass (saturation) from MSROFF to depth-1 + * MSROFF wraps to zero when buffer is full + */ + for (i = 0; i < num; i++) { + u32 brs_idx = tos - i; + u64 from, to; + + rdmsrl(brs_to(brs_idx), to); + + /* Entry does not belong to us (as marked by kernel) */ + if (to == BRS_POISON) + break; + + rdmsrl(brs_from(brs_idx), from); + + /* + * Sign-extend SAMP_BR_TO to 64 bits, bits 61-63 are reserved. + * Necessary to generate proper virtual addresses suitable for + * symbolization + */ + to = (u64)(((s64)to << shift) >> shift); + + perf_clear_branch_entry_bitfields(br+nr); + + br[nr].from = from; + br[nr].to = to; + + nr++; + } +empty: + /* Record number of sampled branches */ + cpuc->lbr_stack.nr = nr; +} + +/* + * Poison most recent entry to prevent reuse by next task + * required because BRS entry are not tagged by PID + */ +static void amd_brs_poison_buffer(void) +{ + union amd_debug_extn_cfg cfg; + unsigned int idx; + + /* Get current state */ + cfg.val = get_debug_extn_cfg(); + + /* idx is most recently written entry */ + idx = amd_brs_get_tos(&cfg); + + /* Poison target of entry */ + wrmsrl(brs_to(idx), BRS_POISON); +} + +/* + * On context switch in, we need to make sure no samples from previous user + * are left in the BRS. + * + * On ctxswin, sched_in = true, called after the PMU has started + * On ctxswout, sched_in = false, called before the PMU is stopped + */ +void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + /* no active users */ + if (!cpuc->lbr_users) + return; + + /* + * On context switch in, we need to ensure we do not use entries + * from previous BRS user on that CPU, so we poison the buffer as + * a faster way compared to resetting all entries. + */ + if (sched_in) + amd_brs_poison_buffer(); +} diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 9687a8aef01c..c7ac70d8ed9a 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -325,8 +325,16 @@ static inline bool amd_is_pair_event_code(struct hw_perf_event *hwc) } } +#define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */ +static inline int amd_is_brs_event(struct perf_event *e) +{ + return (e->hw.config & AMD64_RAW_EVENT_MASK) == AMD_FAM19H_BRS_EVENT; +} + static int amd_core_hw_config(struct perf_event *event) { + int ret = 0; + if (event->attr.exclude_host && event->attr.exclude_guest) /* * When HO == GO == 1 the hardware treats that as GO == HO == 0 @@ -343,7 +351,66 @@ static int amd_core_hw_config(struct perf_event *event) if ((x86_pmu.flags & PMU_FL_PAIR) && amd_is_pair_event_code(&event->hw)) event->hw.flags |= PERF_X86_EVENT_PAIR; - return 0; + /* + * if branch stack is requested + */ + if (has_branch_stack(event)) { + /* + * Due to interrupt holding, BRS is not recommended in + * counting mode. 
+ */ + if (!is_sampling_event(event)) + return -EINVAL; + + /* + * Due to the way BRS operates by holding the interrupt until + * lbr_nr entries have been captured, it does not make sense + * to allow sampling on BRS with an event that does not match + * what BRS is capturing, i.e., retired taken branches. + * Otherwise the correlation with the event's period is even + * more loose: + * + * With retired taken branch: + * Effective P = P + 16 + X + * With any other event: + * Effective P = P + Y + X + * + * Where X is the number of taken branches due to interrupt + * skid. Skid is large. + * + * Where Y is the occurences of the event while BRS is + * capturing the lbr_nr entries. + * + * By using retired taken branches, we limit the impact on the + * Y variable. We know it cannot be more than the depth of + * BRS. + */ + if (!amd_is_brs_event(event)) + return -EINVAL; + + /* + * BRS implementation does not work with frequency mode + * reprogramming of the period. + */ + if (event->attr.freq) + return -EINVAL; + /* + * The kernel subtracts BRS depth from period, so it must + * be big enough. + */ + if (event->attr.sample_period <= x86_pmu.lbr_nr) + return -EINVAL; + + /* + * Check if we can allow PERF_SAMPLE_BRANCH_STACK + */ + ret = amd_brs_setup_filter(event); + + /* only set in case of success */ + if (!ret) + event->hw.flags |= PERF_X86_EVENT_AMD_BRS; + } + return ret; } static inline int amd_is_nb_event(struct hw_perf_event *hwc) @@ -366,7 +433,7 @@ static int amd_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip && get_ibs_caps()) return -ENOENT; - if (has_branch_stack(event)) + if (has_branch_stack(event) && !x86_pmu.lbr_nr) return -EOPNOTSUPP; ret = x86_pmu_hw_config(event); @@ -555,6 +622,8 @@ static void amd_pmu_cpu_starting(int cpu) cpuc->amd_nb->nb_id = nb_id; cpuc->amd_nb->refcnt++; + + amd_brs_reset(); } static void amd_pmu_cpu_dead(int cpu) @@ -610,6 +679,8 @@ static void amd_pmu_disable_all(void) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; + amd_brs_disable_all(); + x86_pmu_disable_all(); /* @@ -634,6 +705,30 @@ static void amd_pmu_disable_all(void) } } +static void amd_pmu_enable_event(struct perf_event *event) +{ + x86_pmu_enable_event(event); +} + +static void amd_pmu_enable_all(int added) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc; + int idx; + + amd_brs_enable_all(); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + hwc = &cpuc->events[idx]->hw; + + /* only activate events which are marked as active */ + if (!test_bit(idx, cpuc->active_mask)) + continue; + + amd_pmu_enable_event(cpuc->events[idx]); + } +} + static void amd_pmu_disable_event(struct perf_event *event) { x86_pmu_disable_event(event); @@ -651,6 +746,18 @@ static void amd_pmu_disable_event(struct perf_event *event) amd_pmu_wait_on_overflow(event->hw.idx); } +static void amd_pmu_add_event(struct perf_event *event) +{ + if (needs_branch_stack(event)) + amd_pmu_brs_add(event); +} + +static void amd_pmu_del_event(struct perf_event *event) +{ + if (needs_branch_stack(event)) + amd_pmu_brs_del(event); +} + /* * Because of NMI latency, if multiple PMC counters are active or other sources * of NMIs are received, the perf NMI handler can handle one or more overflowed @@ -671,11 +778,31 @@ static void amd_pmu_disable_event(struct perf_event *event) */ static int amd_pmu_handle_irq(struct pt_regs *regs) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int handled; + int pmu_enabled; + + /* + * Save the PMU 
state. + * It needs to be restored when leaving the handler. + */ + pmu_enabled = cpuc->enabled; + cpuc->enabled = 0; + + /* stop everything (includes BRS) */ + amd_pmu_disable_all(); + + /* Drain BRS is in use (could be inactive) */ + if (cpuc->lbr_users) + amd_brs_drain(); /* Process any counter overflows */ handled = x86_pmu_handle_irq(regs); + cpuc->enabled = pmu_enabled; + if (pmu_enabled) + amd_pmu_enable_all(0); + /* * If a counter was handled, record a timestamp such that un-handled * NMIs will be claimed if arriving within that window. @@ -897,6 +1024,51 @@ static void amd_put_event_constraints_f17h(struct cpu_hw_events *cpuc, --cpuc->n_pair; } +/* + * Because of the way BRS operates with an inactive and active phases, and + * the link to one counter, it is not possible to have two events using BRS + * scheduled at the same time. There would be an issue with enforcing the + * period of each one and given that the BRS saturates, it would not be possible + * to guarantee correlated content for all events. Therefore, in situations + * where multiple events want to use BRS, the kernel enforces mutual exclusion. + * Exclusion is enforced by chosing only one counter for events using BRS. + * The event scheduling logic will then automatically multiplex the + * events and ensure that at most one event is actively using BRS. + * + * The BRS counter could be any counter, but there is no constraint on Fam19h, + * therefore all counters are equal and thus we pick the first one: PMC0 + */ +static struct event_constraint amd_fam19h_brs_cntr0_constraint = + EVENT_CONSTRAINT(0, 0x1, AMD64_RAW_EVENT_MASK); + +static struct event_constraint amd_fam19h_brs_pair_cntr0_constraint = + __EVENT_CONSTRAINT(0, 0x1, AMD64_RAW_EVENT_MASK, 1, 0, PERF_X86_EVENT_PAIR); + +static struct event_constraint * +amd_get_event_constraints_f19h(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + bool has_brs = has_amd_brs(hwc); + + /* + * In case BRS is used with an event requiring a counter pair, + * the kernel allows it but only on counter 0 & 1 to enforce + * multiplexing requiring to protect BRS in case of multiple + * BRS users + */ + if (amd_is_pair_event_code(hwc)) { + return has_brs ? 
&amd_fam19h_brs_pair_cntr0_constraint + : &pair_constraint; + } + + if (has_brs) + return &amd_fam19h_brs_cntr0_constraint; + + return &unconstrained; +} + + static ssize_t amd_event_sysfs_show(char *page, u64 config) { u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) | @@ -905,12 +1077,19 @@ static ssize_t amd_event_sysfs_show(char *page, u64 config) return x86_event_sysfs_show(page, config, event); } +static void amd_pmu_sched_task(struct perf_event_context *ctx, + bool sched_in) +{ + if (sched_in && x86_pmu.lbr_nr) + amd_pmu_brs_sched_task(ctx, sched_in); +} + static __initconst const struct x86_pmu amd_pmu = { .name = "AMD", .handle_irq = amd_pmu_handle_irq, .disable_all = amd_pmu_disable_all, - .enable_all = x86_pmu_enable_all, - .enable = x86_pmu_enable_event, + .enable_all = amd_pmu_enable_all, + .enable = amd_pmu_enable_event, .disable = amd_pmu_disable_event, .hw_config = amd_pmu_hw_config, .schedule_events = x86_schedule_events, @@ -920,6 +1099,8 @@ static __initconst const struct x86_pmu amd_pmu = { .event_map = amd_pmu_event_map, .max_events = ARRAY_SIZE(amd_perfmon_event_map), .num_counters = AMD64_NUM_COUNTERS, + .add = amd_pmu_add_event, + .del = amd_pmu_del_event, .cntval_bits = 48, .cntval_mask = (1ULL << 48) - 1, .apic = 1, @@ -938,6 +1119,37 @@ static __initconst const struct x86_pmu amd_pmu = { .amd_nb_constraints = 1, }; +static ssize_t branches_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu.lbr_nr); +} + +static DEVICE_ATTR_RO(branches); + +static struct attribute *amd_pmu_brs_attrs[] = { + &dev_attr_branches.attr, + NULL, +}; + +static umode_t +amd_brs_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return x86_pmu.lbr_nr ? attr->mode : 0; +} + +static struct attribute_group group_caps_amd_brs = { + .name = "caps", + .attrs = amd_pmu_brs_attrs, + .is_visible = amd_brs_is_visible, +}; + +static const struct attribute_group *amd_attr_update[] = { + &group_caps_amd_brs, + NULL, +}; + static int __init amd_core_pmu_init(void) { u64 even_ctr_mask = 0ULL; @@ -989,6 +1201,19 @@ static int __init amd_core_pmu_init(void) x86_pmu.flags |= PMU_FL_PAIR; } + /* + * BRS requires special event constraints and flushing on ctxsw. 
+ */ + if (boot_cpu_data.x86 >= 0x19 && !amd_brs_init()) { + x86_pmu.get_event_constraints = amd_get_event_constraints_f19h; + x86_pmu.sched_task = amd_pmu_sched_task; + /* + * put_event_constraints callback same as Fam17h, set above + */ + } + + x86_pmu.attr_update = amd_attr_update; + pr_cont("core perfctr, "); return 0; } diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index eef816fc216d..7ada9172b074 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1338,6 +1338,10 @@ static void x86_pmu_enable(struct pmu *pmu) if (hwc->state & PERF_HES_ARCH) continue; + /* + * if cpuc->enabled = 0, then no wrmsr as + * per x86_pmu_enable_event() + */ x86_pmu_start(event, PERF_EF_RELOAD); } cpuc->n_added = 0; @@ -1704,11 +1708,15 @@ int x86_pmu_handle_irq(struct pt_regs *regs) * event overflow */ handled++; - perf_sample_data_init(&data, 0, event->hw.last_period); if (!x86_perf_event_set_period(event)) continue; + perf_sample_data_init(&data, 0, event->hw.last_period); + + if (has_branch_stack(event)) + data.br_stack = &cpuc->lbr_stack; + if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 150261d929b9..6f1265163c9f 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -67,22 +67,23 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode) /* * struct hw_perf_event.flags flags */ -#define PERF_X86_EVENT_PEBS_LDLAT 0x0001 /* ld+ldlat data address sampling */ -#define PERF_X86_EVENT_PEBS_ST 0x0002 /* st data address sampling */ -#define PERF_X86_EVENT_PEBS_ST_HSW 0x0004 /* haswell style datala, store */ -#define PERF_X86_EVENT_PEBS_LD_HSW 0x0008 /* haswell style datala, load */ -#define PERF_X86_EVENT_PEBS_NA_HSW 0x0010 /* haswell style datala, unknown */ -#define PERF_X86_EVENT_EXCL 0x0020 /* HT exclusivity on counter */ -#define PERF_X86_EVENT_DYNAMIC 0x0040 /* dynamic alloc'd constraint */ - -#define PERF_X86_EVENT_EXCL_ACCT 0x0100 /* accounted EXCL event */ -#define PERF_X86_EVENT_AUTO_RELOAD 0x0200 /* use PEBS auto-reload */ -#define PERF_X86_EVENT_LARGE_PEBS 0x0400 /* use large PEBS */ -#define PERF_X86_EVENT_PEBS_VIA_PT 0x0800 /* use PT buffer for PEBS */ -#define PERF_X86_EVENT_PAIR 0x1000 /* Large Increment per Cycle */ -#define PERF_X86_EVENT_LBR_SELECT 0x2000 /* Save/Restore MSR_LBR_SELECT */ -#define PERF_X86_EVENT_TOPDOWN 0x4000 /* Count Topdown slots/metrics events */ -#define PERF_X86_EVENT_PEBS_STLAT 0x8000 /* st+stlat data address sampling */ +#define PERF_X86_EVENT_PEBS_LDLAT 0x00001 /* ld+ldlat data address sampling */ +#define PERF_X86_EVENT_PEBS_ST 0x00002 /* st data address sampling */ +#define PERF_X86_EVENT_PEBS_ST_HSW 0x00004 /* haswell style datala, store */ +#define PERF_X86_EVENT_PEBS_LD_HSW 0x00008 /* haswell style datala, load */ +#define PERF_X86_EVENT_PEBS_NA_HSW 0x00010 /* haswell style datala, unknown */ +#define PERF_X86_EVENT_EXCL 0x00020 /* HT exclusivity on counter */ +#define PERF_X86_EVENT_DYNAMIC 0x00040 /* dynamic alloc'd constraint */ + +#define PERF_X86_EVENT_EXCL_ACCT 0x00100 /* accounted EXCL event */ +#define PERF_X86_EVENT_AUTO_RELOAD 0x00200 /* use PEBS auto-reload */ +#define PERF_X86_EVENT_LARGE_PEBS 0x00400 /* use large PEBS */ +#define PERF_X86_EVENT_PEBS_VIA_PT 0x00800 /* use PT buffer for PEBS */ +#define PERF_X86_EVENT_PAIR 0x01000 /* Large Increment per Cycle */ +#define PERF_X86_EVENT_LBR_SELECT 0x02000 /* Save/Restore MSR_LBR_SELECT */ +#define PERF_X86_EVENT_TOPDOWN 
0x04000 /* Count Topdown slots/metrics events */ +#define PERF_X86_EVENT_PEBS_STLAT 0x08000 /* st+stlat data address sampling */ +#define PERF_X86_EVENT_AMD_BRS 0x10000 /* AMD Branch Sampling */ static inline bool is_topdown_count(struct perf_event *event) { @@ -325,6 +326,8 @@ struct cpu_hw_events { * AMD specific bits */ struct amd_nb *amd_nb; + int brs_active; /* BRS is enabled */ + /* Inverted mask of bits to clear in the perf_ctr ctrl registers */ u64 perf_ctr_virt_mask; int n_pair; /* Large increment events */ @@ -1105,6 +1108,11 @@ int x86_pmu_hw_config(struct perf_event *event); void x86_pmu_disable_all(void); +static inline bool has_amd_brs(struct hw_perf_event *hwc) +{ + return hwc->flags & PERF_X86_EVENT_AMD_BRS; +} + static inline bool is_counter_pair(struct hw_perf_event *hwc) { return hwc->flags & PERF_X86_EVENT_PAIR; @@ -1210,6 +1218,50 @@ static inline bool fixed_counter_disabled(int i, struct pmu *pmu) #ifdef CONFIG_CPU_SUP_AMD int amd_pmu_init(void); +int amd_brs_init(void); +void amd_brs_disable(void); +void amd_brs_enable(void); +void amd_brs_enable_all(void); +void amd_brs_disable_all(void); +void amd_brs_drain(void); +void amd_brs_disable_all(void); +int amd_brs_setup_filter(struct perf_event *event); +void amd_brs_reset(void); + +static inline void amd_pmu_brs_add(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + perf_sched_cb_inc(event->ctx->pmu); + cpuc->lbr_users++; + /* + * No need to reset BRS because it is reset + * on brs_enable() and it is saturating + */ +} + +static inline void amd_pmu_brs_del(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + cpuc->lbr_users--; + WARN_ON_ONCE(cpuc->lbr_users < 0); + + perf_sched_cb_dec(event->ctx->pmu); +} + +void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in); + +/* + * check if BRS is activated on the CPU + * active defined as it has non-zero users and DBG_EXT_CFG.BRSEN=1 + */ +static inline bool amd_brs_active(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + return cpuc->brs_active; +} #else /* CONFIG_CPU_SUP_AMD */ @@ -1218,6 +1270,23 @@ static inline int amd_pmu_init(void) return 0; } +static inline int amd_brs_init(void) +{ + return -EOPNOTSUPP; +} + +static inline void amd_brs_drain(void) +{ +} + +static inline void amd_brs_enable_all(void) +{ +} + +static inline void amd_brs_disable_all(void) +{ +} + #endif /* CONFIG_CPU_SUP_AMD */ static inline int is_pebs_pt(struct perf_event *event) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 0eb90d21049e..8179ea351bd8 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -688,6 +688,10 @@ #define MSR_IA32_PERF_CTL 0x00000199 #define INTEL_PERF_CTL_MASK 0xffff +/* AMD Branch Sampling configuration */ +#define MSR_AMD_DBG_EXTN_CFG 0xc000010f +#define MSR_AMD_SAMP_BR_FROM 0xc0010300 + #define MSR_IA32_MPERF 0x000000e7 #define MSR_IA32_APERF 0x000000e8 -- cgit v1.2.3-59-g8ed1b From d5616bac7adadbf42a3b63b8717e75eb82a2cc2c Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 22 Mar 2022 15:15:13 -0700 Subject: perf/x86/amd: Add idle hooks for branch sampling On AMD Fam19h Zen3, the branch sampling (BRS) feature must be disabled before entering low power and re-enabled (if was active) when returning from low power. Otherwise, the NMI interrupt may be held up for too long and cause problems. Stopping BRS will cause the NMI to be delivered if it was held up. 
Define a perf_amd_brs_lopwr_cb() callback to stop/restart BRS. The callback is protected by a jump label which is enabled only when AMD BRS is detected. In all other cases, the callback is never called. Signed-off-by: Stephane Eranian [peterz: static_call() and build fixes] Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220322221517.2510440-10-eranian@google.com --- arch/x86/events/amd/brs.c | 33 +++++++++++++++++++++++++++++++++ arch/x86/events/amd/core.c | 4 ++++ arch/x86/events/perf_event.h | 1 + arch/x86/include/asm/perf_event.h | 23 +++++++++++++++++++++++ 4 files changed, 61 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c index 40461c3ce714..895c82165d85 100644 --- a/arch/x86/events/amd/brs.c +++ b/arch/x86/events/amd/brs.c @@ -7,6 +7,7 @@ * Contributed by Stephane Eranian */ #include +#include #include #include @@ -329,3 +330,35 @@ void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in) if (sched_in) amd_brs_poison_buffer(); } + +/* + * called from ACPI processor_idle.c or acpi_pad.c + * with interrupts disabled + */ +void perf_amd_brs_lopwr_cb(bool lopwr_in) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + union amd_debug_extn_cfg cfg; + + /* + * on mwait in, we may end up in non C0 state. + * we must disable branch sampling to avoid holding the NMI + * for too long. We disable it in hardware but we + * keep the state in cpuc, so we can re-enable. + * + * The hardware will deliver the NMI if needed when brsmen cleared + */ + if (cpuc->brs_active) { + cfg.val = get_debug_extn_cfg(); + cfg.brsmen = !lopwr_in; + set_debug_extn_cfg(cfg.val); + } +} + +DEFINE_STATIC_CALL_NULL(perf_lopwr_cb, perf_amd_brs_lopwr_cb); +EXPORT_STATIC_CALL_TRAMP_GPL(perf_lopwr_cb); + +void __init amd_brs_lopwr_init(void) +{ + static_call_update(perf_lopwr_cb, perf_amd_brs_lopwr_cb); +} diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index f7bce8364fe4..8e1e818f8195 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only #include +#include #include #include #include @@ -1225,6 +1226,9 @@ static int __init amd_core_pmu_init(void) /* * put_event_constraints callback same as Fam17h, set above */ + + /* branch sampling must be stopped when entering low power */ + amd_brs_lopwr_init(); } x86_pmu.attr_update = amd_attr_update; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index ef27aee04b13..3b0324584da3 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1226,6 +1226,7 @@ void amd_brs_enable(void); void amd_brs_enable_all(void); void amd_brs_disable_all(void); void amd_brs_drain(void); +void amd_brs_lopwr_init(void); void amd_brs_disable_all(void); int amd_brs_setup_filter(struct perf_event *event); void amd_brs_reset(void); diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 58d9e4b1fa0a..8199fc5a37ea 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_PERF_EVENT_H #define _ASM_X86_PERF_EVENT_H +#include + /* * Performance event hw details: */ @@ -513,6 +515,27 @@ static inline void intel_pt_handle_vmx(int on) #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) extern void amd_pmu_enable_virt(void); extern void amd_pmu_disable_virt(void); + +#if defined(CONFIG_PERF_EVENTS_AMD_BRS) + +#define PERF_NEEDS_LOPWR_CB 1 + +/* + * 
architectural low power callback impacts + * drivers/acpi/processor_idle.c + * drivers/acpi/acpi_pad.c + */ +extern void perf_amd_brs_lopwr_cb(bool lopwr_in); + +DECLARE_STATIC_CALL(perf_lopwr_cb, perf_amd_brs_lopwr_cb); + +static inline void perf_lopwr_cb(bool lopwr_in) +{ + static_call_mod(perf_lopwr_cb)(lopwr_in); +} + +#endif /* PERF_NEEDS_LOPWR_CB */ + #else static inline void amd_pmu_enable_virt(void) { } static inline void amd_pmu_disable_virt(void) { } -- cgit v1.2.3-59-g8ed1b From 046f773be106ec8eb92b13414c90f8e279deffe0 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 7 Mar 2022 15:33:11 -0600 Subject: KVM: SVM: Define sev_features and VMPL field in the VMSA The hypervisor uses the sev_features field (offset 3B0h) in the Save State Area to control the SEV-SNP guest features such as SNPActive, vTOM, ReflectVC etc. An SEV-SNP guest can read the sev_features field through the SEV_STATUS MSR. While at it, update dump_vmcb() to log the VMPL level. See APM2 Table 15-34 and B-4 for more details. Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Reviewed-by: Venu Busireddy Link: https://lore.kernel.org/r/20220307213356.2797205-2-brijesh.singh@amd.com --- arch/x86/include/asm/svm.h | 6 ++++-- arch/x86/kvm/svm/svm.c | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index f70a5108d464..f2d01f351e95 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -282,7 +282,8 @@ struct vmcb_save_area { struct vmcb_seg ldtr; struct vmcb_seg idtr; struct vmcb_seg tr; - u8 reserved_1[43]; + u8 reserved_1[42]; + u8 vmpl; u8 cpl; u8 reserved_2[4]; u64 efer; @@ -347,7 +348,8 @@ struct vmcb_save_area { u64 sw_exit_info_1; u64 sw_exit_info_2; u64 sw_scratch; - u8 reserved_11[56]; + u64 sev_features; + u8 reserved_11[48]; u64 xcr0; u8 valid_bitmap[16]; u64 x87_state_gpa; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index bd4c64b362d2..81cb518170a8 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3117,8 +3117,8 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) "tr:", save01->tr.selector, save01->tr.attrib, save01->tr.limit, save01->tr.base); - pr_err("cpl: %d efer: %016llx\n", - save->cpl, save->efer); + pr_err("vmpl: %d cpl: %d efer: %016llx\n", + save->vmpl, save->cpl, save->efer); pr_err("%-15s %016llx %-13s %016llx\n", "cr0:", save->cr0, "cr2:", save->cr2); pr_err("%-15s %016llx %-13s %016llx\n", -- cgit v1.2.3-59-g8ed1b From e1907d37514b8564ba18b4a768a35beee71cb011 Mon Sep 17 00:00:00 2001 From: Muralidhara M K Date: Thu, 24 Mar 2022 17:57:29 +0530 Subject: x86/amd_nb: Unexport amd_cache_northbridges() amd_cache_northbridges() is exported by amd_nb.c and is called by amd64-agp.c and amd64_edac.c modules at module_init() time so that NB descriptors are properly cached before those drivers can use them. However, the init_amd_nbs() initcall already does call amd_cache_northbridges() unconditionally and thus makes sure the NB descriptors are enumerated. That initcall is a fs_initcall type which is on the 5th group (starting from 0) of initcalls that gets run in increasing numerical order by the init code. The module_init() call is turned into an __initcall() in the MODULE=n case and those are device-level initcalls, i.e., group 6. Therefore, the northbridges caching is already finished by the time module initialization starts and thus the correct initialization order is retained. 
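To make the ordering concrete, here is a minimal illustration of the two
registrations involved (assuming the standard initcall levels from
include/linux/init.h, where fs_initcall maps to level 5 and device-level
initcalls to level 6, run in ascending order during boot):

  /* arch/x86/kernel/amd_nb.c: level 5, caches the NB descriptors first */
  fs_initcall(init_amd_nbs);

  /*
   * drivers/edac/amd64_edac.c: module_init() becomes a device-level
   * (level 6) initcall when built in, so the descriptors are already
   * cached by the time the driver probes.
   */
  module_init(amd64_edac_init);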
Unexport amd_cache_northbridges(), update dependent modules to call amd_nb_num() instead. While at it, simplify the checks in amd_cache_northbridges(). [ bp: Heavily massage and *actually* explain why the change is ok. ] Signed-off-by: Muralidhara M K Signed-off-by: Naveen Krishna Chatradhi Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220324122729.221765-1-nchatrad@amd.com --- arch/x86/include/asm/amd_nb.h | 1 - arch/x86/kernel/amd_nb.c | 7 +++---- drivers/char/agp/amd64-agp.c | 2 +- drivers/edac/amd64_edac.c | 2 +- 4 files changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 00d1a400b7a1..ed0eaf65c437 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -16,7 +16,6 @@ extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[]; extern bool early_is_amd_nb(u32 value); extern struct resource *amd_get_mmconfig_range(struct resource *res); -extern int amd_cache_northbridges(void); extern void amd_flush_garts(void); extern int amd_numa_init(void); extern int amd_get_subcaches(int); diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 020c906f7934..190e0f763375 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -188,7 +188,7 @@ int amd_smn_write(u16 node, u32 address, u32 value) EXPORT_SYMBOL_GPL(amd_smn_write); -int amd_cache_northbridges(void) +static int amd_cache_northbridges(void) { const struct pci_device_id *misc_ids = amd_nb_misc_ids; const struct pci_device_id *link_ids = amd_nb_link_ids; @@ -210,14 +210,14 @@ int amd_cache_northbridges(void) } misc = NULL; - while ((misc = next_northbridge(misc, misc_ids)) != NULL) + while ((misc = next_northbridge(misc, misc_ids))) misc_count++; if (!misc_count) return -ENODEV; root = NULL; - while ((root = next_northbridge(root, root_ids)) != NULL) + while ((root = next_northbridge(root, root_ids))) root_count++; if (root_count) { @@ -290,7 +290,6 @@ int amd_cache_northbridges(void) return 0; } -EXPORT_SYMBOL_GPL(amd_cache_northbridges); /* * Ignores subdevice/subvendor but as far as I can figure out diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c index dc78a4fb879e..84a4aa9312cf 100644 --- a/drivers/char/agp/amd64-agp.c +++ b/drivers/char/agp/amd64-agp.c @@ -327,7 +327,7 @@ static int cache_nbs(struct pci_dev *pdev, u32 cap_ptr) { int i; - if (amd_cache_northbridges() < 0) + if (!amd_nb_num()) return -ENODEV; if (!amd_nb_has_feature(AMD_NB_GART)) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 812baa48b290..2f854feeeb23 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -4336,7 +4336,7 @@ static int __init amd64_edac_init(void) if (!x86_match_cpu(amd64_cpuids)) return -ENODEV; - if (amd_cache_northbridges() < 0) + if (!amd_nb_num()) return -ENODEV; opstate_init(); -- cgit v1.2.3-59-g8ed1b From 3dd2775b74c9b1b01d19805877ab45bc47c4a5a5 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Tue, 5 Apr 2022 13:27:43 -0500 Subject: KVM: SVM: Create a separate mapping for the SEV-ES save area The save area for SEV-ES/SEV-SNP guests, as used by the hardware, is different from the save area of a non SEV-ES/SEV-SNP guest. This is the first step in defining the multiple save areas to keep them separate and ensuring proper operation amongst the different types of guests. Create an SEV-ES/SEV-SNP save area and adjust usage to the new save area definition where needed. 
Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Reviewed-by: Venu Busireddy Link: https://lore.kernel.org/r/20220405182743.308853-1-brijesh.singh@amd.com --- arch/x86/include/asm/svm.h | 87 +++++++++++++++++++++++++++++++++++----------- arch/x86/kvm/svm/sev.c | 22 ++++++------ arch/x86/kvm/svm/svm.c | 4 +-- arch/x86/kvm/svm/svm.h | 4 +-- 4 files changed, 82 insertions(+), 35 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index f2d01f351e95..788a43f99080 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -271,6 +271,7 @@ struct vmcb_seg { u64 base; } __packed; +/* Save area definition for legacy and SEV-MEM guests */ struct vmcb_save_area { struct vmcb_seg es; struct vmcb_seg cs; @@ -287,8 +288,58 @@ struct vmcb_save_area { u8 cpl; u8 reserved_2[4]; u64 efer; + u8 reserved_3[112]; + u64 cr4; + u64 cr3; + u64 cr0; + u64 dr7; + u64 dr6; + u64 rflags; + u64 rip; + u8 reserved_4[88]; + u64 rsp; + u64 s_cet; + u64 ssp; + u64 isst_addr; + u64 rax; + u64 star; + u64 lstar; + u64 cstar; + u64 sfmask; + u64 kernel_gs_base; + u64 sysenter_cs; + u64 sysenter_esp; + u64 sysenter_eip; + u64 cr2; + u8 reserved_5[32]; + u64 g_pat; + u64 dbgctl; + u64 br_from; + u64 br_to; + u64 last_excp_from; + u64 last_excp_to; + u8 reserved_6[72]; + u32 spec_ctrl; /* Guest version of SPEC_CTRL at 0x2E0 */ +} __packed; + +/* Save area definition for SEV-ES and SEV-SNP guests */ +struct sev_es_save_area { + struct vmcb_seg es; + struct vmcb_seg cs; + struct vmcb_seg ss; + struct vmcb_seg ds; + struct vmcb_seg fs; + struct vmcb_seg gs; + struct vmcb_seg gdtr; + struct vmcb_seg ldtr; + struct vmcb_seg idtr; + struct vmcb_seg tr; + u8 reserved_1[43]; + u8 cpl; + u8 reserved_2[4]; + u64 efer; u8 reserved_3[104]; - u64 xss; /* Valid for SEV-ES only */ + u64 xss; u64 cr4; u64 cr3; u64 cr0; @@ -316,22 +367,14 @@ struct vmcb_save_area { u64 br_to; u64 last_excp_from; u64 last_excp_to; - - /* - * The following part of the save area is valid only for - * SEV-ES guests when referenced through the GHCB or for - * saving to the host save area. 
- */ - u8 reserved_7[72]; - u32 spec_ctrl; /* Guest version of SPEC_CTRL at 0x2E0 */ - u8 reserved_7b[4]; + u8 reserved_7[80]; u32 pkru; - u8 reserved_7a[20]; - u64 reserved_8; /* rax already available at 0x01f8 */ + u8 reserved_9[20]; + u64 reserved_10; /* rax already available at 0x01f8 */ u64 rcx; u64 rdx; u64 rbx; - u64 reserved_9; /* rsp already available at 0x01d8 */ + u64 reserved_11; /* rsp already available at 0x01d8 */ u64 rbp; u64 rsi; u64 rdi; @@ -343,23 +386,25 @@ struct vmcb_save_area { u64 r13; u64 r14; u64 r15; - u8 reserved_10[16]; + u8 reserved_12[16]; u64 sw_exit_code; u64 sw_exit_info_1; u64 sw_exit_info_2; u64 sw_scratch; u64 sev_features; - u8 reserved_11[48]; + u8 reserved_13[48]; u64 xcr0; u8 valid_bitmap[16]; u64 x87_state_gpa; } __packed; +#define GHCB_SHARED_BUF_SIZE 2032 + struct ghcb { - struct vmcb_save_area save; - u8 reserved_save[2048 - sizeof(struct vmcb_save_area)]; + struct sev_es_save_area save; + u8 reserved_save[2048 - sizeof(struct sev_es_save_area)]; - u8 shared_buffer[2032]; + u8 shared_buffer[GHCB_SHARED_BUF_SIZE]; u8 reserved_1[10]; u16 protocol_version; /* negotiated SEV-ES/GHCB protocol version */ @@ -367,13 +412,15 @@ struct ghcb { } __packed; -#define EXPECTED_VMCB_SAVE_AREA_SIZE 1032 +#define EXPECTED_VMCB_SAVE_AREA_SIZE 740 +#define EXPECTED_SEV_ES_SAVE_AREA_SIZE 1032 #define EXPECTED_VMCB_CONTROL_AREA_SIZE 1024 #define EXPECTED_GHCB_SIZE PAGE_SIZE static inline void __unused_size_checks(void) { BUILD_BUG_ON(sizeof(struct vmcb_save_area) != EXPECTED_VMCB_SAVE_AREA_SIZE); + BUILD_BUG_ON(sizeof(struct sev_es_save_area) != EXPECTED_SEV_ES_SAVE_AREA_SIZE); BUILD_BUG_ON(sizeof(struct vmcb_control_area) != EXPECTED_VMCB_CONTROL_AREA_SIZE); BUILD_BUG_ON(sizeof(struct ghcb) != EXPECTED_GHCB_SIZE); } @@ -443,7 +490,7 @@ struct vmcb { /* GHCB Accessor functions */ #define GHCB_BITMAP_IDX(field) \ - (offsetof(struct vmcb_save_area, field) / sizeof(u64)) + (offsetof(struct sev_es_save_area, field) / sizeof(u64)) #define DEFINE_GHCB_ACCESSORS(field) \ static inline bool ghcb_##field##_is_valid(const struct ghcb *ghcb) \ diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 75fa6dd268f0..6e18ec1839f0 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -559,12 +559,20 @@ e_unpin: static int sev_es_sync_vmsa(struct vcpu_svm *svm) { - struct vmcb_save_area *save = &svm->vmcb->save; + struct sev_es_save_area *save = svm->sev_es.vmsa; /* Check some debug related fields before encrypting the VMSA */ - if (svm->vcpu.guest_debug || (save->dr7 & ~DR7_FIXED_1)) + if (svm->vcpu.guest_debug || (svm->vmcb->save.dr7 & ~DR7_FIXED_1)) return -EINVAL; + /* + * SEV-ES will use a VMSA that is pointed to by the VMCB, not + * the traditional VMSA that is part of the VMCB. Copy the + * traditional VMSA as it has been built so far (in prep + * for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state. + */ + memcpy(save, &svm->vmcb->save, sizeof(svm->vmcb->save)); + /* Sync registgers */ save->rax = svm->vcpu.arch.regs[VCPU_REGS_RAX]; save->rbx = svm->vcpu.arch.regs[VCPU_REGS_RBX]; @@ -592,14 +600,6 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) save->xss = svm->vcpu.arch.ia32_xss; save->dr6 = svm->vcpu.arch.dr6; - /* - * SEV-ES will use a VMSA that is pointed to by the VMCB, not - * the traditional VMSA that is part of the VMCB. Copy the - * traditional VMSA as it has been built so far (in prep - * for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state. 
- */ - memcpy(svm->sev_es.vmsa, save, sizeof(*save)); - return 0; } @@ -2932,7 +2932,7 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm) sev_enc_bit)); } -void sev_es_prepare_switch_to_guest(struct vmcb_save_area *hostsa) +void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa) { /* * As an SEV-ES guest, hardware will restore the host state on VMEXIT, diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 81cb518170a8..6ff595f74e3a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1270,8 +1270,8 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) */ vmsave(__sme_page_pa(sd->save_area)); if (sev_es_guest(vcpu->kvm)) { - struct vmcb_save_area *hostsa; - hostsa = (struct vmcb_save_area *)(page_address(sd->save_area) + 0x400); + struct sev_es_save_area *hostsa; + hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400); sev_es_prepare_switch_to_guest(hostsa); } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index f77a7d2d39dd..cc857deaee5e 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -181,7 +181,7 @@ struct svm_nested_state { struct vcpu_sev_es_state { /* SEV-ES support */ - struct vmcb_save_area *vmsa; + struct sev_es_save_area *vmsa; struct ghcb *ghcb; struct kvm_host_map ghcb_map; bool received_first_sipi; @@ -620,7 +620,7 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in); void sev_es_init_vmcb(struct vcpu_svm *svm); void sev_es_vcpu_reset(struct vcpu_svm *svm); void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); -void sev_es_prepare_switch_to_guest(struct vmcb_save_area *hostsa); +void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa); void sev_es_unmap_ghcb(struct vcpu_svm *svm); /* vmenter.S */ -- cgit v1.2.3-59-g8ed1b From a4690359eaec985a1351786da887df1ba92440a0 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 7 Mar 2022 15:33:13 -0600 Subject: KVM: SVM: Create a separate mapping for the GHCB save area The initial implementation of the GHCB spec was based on trying to keep the register state offsets the same relative to the VM save area. However, the save area for SEV-ES has changed within the hardware causing the relation between the SEV-ES save area to change relative to the GHCB save area. This is the second step in defining the multiple save areas to keep them separate and ensuring proper operation amongst the different types of guests. Create a GHCB save area that matches the GHCB specification. 
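As a worked example of what "matches the GHCB specification" pins down
(illustration only, derived from the ghcb_save_area layout added below, where
GHCB_BITMAP_IDX() is the field's byte offset divided by sizeof(u64)):

  /*
   * rsp lands at byte offset 0x1d8 and rax at 0x1f8 in the GHCB save area,
   * so their valid_bitmap bit indices are 0x1d8 / 8 = 59 and 0x1f8 / 8 = 63.
   */
  _Static_assert(0x1d8 / 8 == 59, "rsp qword index per GHCB spec");
  _Static_assert(0x1f8 / 8 == 63, "rax qword index per GHCB spec");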
Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Reviewed-by: Venu Busireddy Link: https://lore.kernel.org/r/20220307213356.2797205-4-brijesh.singh@amd.com --- arch/x86/include/asm/svm.h | 48 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 3 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 788a43f99080..0789ad83b2bd 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -398,11 +398,51 @@ struct sev_es_save_area { u64 x87_state_gpa; } __packed; +struct ghcb_save_area { + u8 reserved_1[203]; + u8 cpl; + u8 reserved_2[116]; + u64 xss; + u8 reserved_3[24]; + u64 dr7; + u8 reserved_4[16]; + u64 rip; + u8 reserved_5[88]; + u64 rsp; + u8 reserved_6[24]; + u64 rax; + u8 reserved_7[264]; + u64 rcx; + u64 rdx; + u64 rbx; + u8 reserved_8[8]; + u64 rbp; + u64 rsi; + u64 rdi; + u64 r8; + u64 r9; + u64 r10; + u64 r11; + u64 r12; + u64 r13; + u64 r14; + u64 r15; + u8 reserved_9[16]; + u64 sw_exit_code; + u64 sw_exit_info_1; + u64 sw_exit_info_2; + u64 sw_scratch; + u8 reserved_10[56]; + u64 xcr0; + u8 valid_bitmap[16]; + u64 x87_state_gpa; +} __packed; + #define GHCB_SHARED_BUF_SIZE 2032 struct ghcb { - struct sev_es_save_area save; - u8 reserved_save[2048 - sizeof(struct sev_es_save_area)]; + struct ghcb_save_area save; + u8 reserved_save[2048 - sizeof(struct ghcb_save_area)]; u8 shared_buffer[GHCB_SHARED_BUF_SIZE]; @@ -413,6 +453,7 @@ struct ghcb { #define EXPECTED_VMCB_SAVE_AREA_SIZE 740 +#define EXPECTED_GHCB_SAVE_AREA_SIZE 1032 #define EXPECTED_SEV_ES_SAVE_AREA_SIZE 1032 #define EXPECTED_VMCB_CONTROL_AREA_SIZE 1024 #define EXPECTED_GHCB_SIZE PAGE_SIZE @@ -420,6 +461,7 @@ struct ghcb { static inline void __unused_size_checks(void) { BUILD_BUG_ON(sizeof(struct vmcb_save_area) != EXPECTED_VMCB_SAVE_AREA_SIZE); + BUILD_BUG_ON(sizeof(struct ghcb_save_area) != EXPECTED_GHCB_SAVE_AREA_SIZE); BUILD_BUG_ON(sizeof(struct sev_es_save_area) != EXPECTED_SEV_ES_SAVE_AREA_SIZE); BUILD_BUG_ON(sizeof(struct vmcb_control_area) != EXPECTED_VMCB_CONTROL_AREA_SIZE); BUILD_BUG_ON(sizeof(struct ghcb) != EXPECTED_GHCB_SIZE); @@ -490,7 +532,7 @@ struct vmcb { /* GHCB Accessor functions */ #define GHCB_BITMAP_IDX(field) \ - (offsetof(struct sev_es_save_area, field) / sizeof(u64)) + (offsetof(struct ghcb_save_area, field) / sizeof(u64)) #define DEFINE_GHCB_ACCESSORS(field) \ static inline bool ghcb_##field##_is_valid(const struct ghcb *ghcb) \ -- cgit v1.2.3-59-g8ed1b From 6d3b3d34e39eb4ee9a7dbe7e28e2588e160f9c0f Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 7 Mar 2022 15:33:14 -0600 Subject: KVM: SVM: Update the SEV-ES save area mapping This is the final step in defining the multiple save areas to keep them separate and ensuring proper operation amongst the different types of guests. Update the SEV-ES/SEV-SNP save area to match the APM. This save area will be used for the upcoming SEV-SNP AP Creation NAE event support. 
Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Reviewed-by: Venu Busireddy Link: https://lore.kernel.org/r/20220307213356.2797205-5-brijesh.singh@amd.com --- arch/x86/include/asm/svm.h | 66 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 16 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 0789ad83b2bd..ec623e7da33d 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -334,7 +334,13 @@ struct sev_es_save_area { struct vmcb_seg ldtr; struct vmcb_seg idtr; struct vmcb_seg tr; - u8 reserved_1[43]; + u64 vmpl0_ssp; + u64 vmpl1_ssp; + u64 vmpl2_ssp; + u64 vmpl3_ssp; + u64 u_cet; + u8 reserved_1[2]; + u8 vmpl; u8 cpl; u8 reserved_2[4]; u64 efer; @@ -347,9 +353,19 @@ struct sev_es_save_area { u64 dr6; u64 rflags; u64 rip; - u8 reserved_4[88]; + u64 dr0; + u64 dr1; + u64 dr2; + u64 dr3; + u64 dr0_addr_mask; + u64 dr1_addr_mask; + u64 dr2_addr_mask; + u64 dr3_addr_mask; + u8 reserved_4[24]; u64 rsp; - u8 reserved_5[24]; + u64 s_cet; + u64 ssp; + u64 isst_addr; u64 rax; u64 star; u64 lstar; @@ -360,7 +376,7 @@ struct sev_es_save_area { u64 sysenter_esp; u64 sysenter_eip; u64 cr2; - u8 reserved_6[32]; + u8 reserved_5[32]; u64 g_pat; u64 dbgctl; u64 br_from; @@ -369,12 +385,12 @@ struct sev_es_save_area { u64 last_excp_to; u8 reserved_7[80]; u32 pkru; - u8 reserved_9[20]; - u64 reserved_10; /* rax already available at 0x01f8 */ + u8 reserved_8[20]; + u64 reserved_9; /* rax already available at 0x01f8 */ u64 rcx; u64 rdx; u64 rbx; - u64 reserved_11; /* rsp already available at 0x01d8 */ + u64 reserved_10; /* rsp already available at 0x01d8 */ u64 rbp; u64 rsi; u64 rdi; @@ -386,16 +402,34 @@ struct sev_es_save_area { u64 r13; u64 r14; u64 r15; - u8 reserved_12[16]; - u64 sw_exit_code; - u64 sw_exit_info_1; - u64 sw_exit_info_2; - u64 sw_scratch; + u8 reserved_11[16]; + u64 guest_exit_info_1; + u64 guest_exit_info_2; + u64 guest_exit_int_info; + u64 guest_nrip; u64 sev_features; - u8 reserved_13[48]; + u64 vintr_ctrl; + u64 guest_exit_code; + u64 virtual_tom; + u64 tlb_id; + u64 pcpu_id; + u64 event_inj; u64 xcr0; - u8 valid_bitmap[16]; - u64 x87_state_gpa; + u8 reserved_12[16]; + + /* Floating point area */ + u64 x87_dp; + u32 mxcsr; + u16 x87_ftw; + u16 x87_fsw; + u16 x87_fcw; + u16 x87_fop; + u16 x87_ds; + u16 x87_cs; + u64 x87_rip; + u8 fpreg_x87[80]; + u8 fpreg_xmm[256]; + u8 fpreg_ymm[256]; } __packed; struct ghcb_save_area { @@ -454,7 +488,7 @@ struct ghcb { #define EXPECTED_VMCB_SAVE_AREA_SIZE 740 #define EXPECTED_GHCB_SAVE_AREA_SIZE 1032 -#define EXPECTED_SEV_ES_SAVE_AREA_SIZE 1032 +#define EXPECTED_SEV_ES_SAVE_AREA_SIZE 1648 #define EXPECTED_VMCB_CONTROL_AREA_SIZE 1024 #define EXPECTED_GHCB_SIZE PAGE_SIZE -- cgit v1.2.3-59-g8ed1b From 176db622573f028f85221873ea4577e096785315 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Wed, 9 Feb 2022 12:09:59 -0600 Subject: x86/boot: Introduce helpers for MSR reads/writes The current set of helpers used throughout the run-time kernel have dependencies on code/facilities outside of the boot kernel, so there are a number of call-sites throughout the boot kernel where inline assembly is used instead. More will be added with subsequent patches that add support for SEV-SNP, so take the opportunity to provide a basic set of helpers that can be used by the boot kernel to reduce reliance on inline assembly. 
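A minimal usage sketch of the new helpers (the caller and the choice of
MSR_AMD64_SEV below are illustrative, not taken from this patch):

  #include "msr.h"        /* arch/x86/boot/msr.h, added below */

  static u64 read_sev_status(void)
  {
          struct msr m;

          /* No tracepoint or exception-fixup infrastructure needed here. */
          boot_rdmsr(MSR_AMD64_SEV, &m);

          return m.q;
  }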
Use boot_* prefix so that it's clear these are helpers specific to the boot kernel to avoid any confusion with the various other MSR read/write helpers. [ bp: Disambiguate parameter names and trim comment. ] Suggested-by: Borislav Petkov Signed-off-by: Michael Roth Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-6-brijesh.singh@amd.com --- arch/x86/boot/msr.h | 26 ++++++++++++++++++++++++++ arch/x86/include/asm/msr.h | 11 +---------- arch/x86/include/asm/shared/msr.h | 15 +++++++++++++++ 3 files changed, 42 insertions(+), 10 deletions(-) create mode 100644 arch/x86/boot/msr.h create mode 100644 arch/x86/include/asm/shared/msr.h (limited to 'arch/x86/include') diff --git a/arch/x86/boot/msr.h b/arch/x86/boot/msr.h new file mode 100644 index 000000000000..aed66f7ae199 --- /dev/null +++ b/arch/x86/boot/msr.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Helpers/definitions related to MSR access. + */ + +#ifndef BOOT_MSR_H +#define BOOT_MSR_H + +#include + +/* + * The kernel proper already defines rdmsr()/wrmsr(), but they are not for the + * boot kernel since they rely on tracepoint/exception handling infrastructure + * that's not available here. + */ +static inline void boot_rdmsr(unsigned int reg, struct msr *m) +{ + asm volatile("rdmsr" : "=a" (m->l), "=d" (m->h) : "c" (reg)); +} + +static inline void boot_wrmsr(unsigned int reg, const struct msr *m) +{ + asm volatile("wrmsr" : : "c" (reg), "a"(m->l), "d" (m->h) : "memory"); +} + +#endif /* BOOT_MSR_H */ diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index d42e6c6b47b1..65ec1965cd28 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -10,16 +10,7 @@ #include #include #include - -struct msr { - union { - struct { - u32 l; - u32 h; - }; - u64 q; - }; -}; +#include struct msr_info { u32 msr_no; diff --git a/arch/x86/include/asm/shared/msr.h b/arch/x86/include/asm/shared/msr.h new file mode 100644 index 000000000000..1e6ec10b3a15 --- /dev/null +++ b/arch/x86/include/asm/shared/msr.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_SHARED_MSR_H +#define _ASM_X86_SHARED_MSR_H + +struct msr { + union { + struct { + u32 l; + u32 h; + }; + u64 q; + }; +}; + +#endif /* _ASM_X86_SHARED_MSR_H */ -- cgit v1.2.3-59-g8ed1b From f742b90e61bb53b27771f64bdae05db03a6ab1f2 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Thu, 24 Feb 2022 10:55:49 -0600 Subject: x86/mm: Extend cc_attr to include AMD SEV-SNP The CC_ATTR_GUEST_SEV_SNP can be used by the guest to query whether the SNP (Secure Nested Paging) feature is active. 
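A short usage sketch (the caller below is illustrative; the attribute and the
sev_status check backing it are in the hunks that follow):

  #include <linux/cc_platform.h>

  static void report_snp_status(void)
  {
          /* True only when MSR_AMD64_SEV reports SNP as enabled. */
          if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP))
                  pr_info("Running as an SEV-SNP guest\n");
  }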
Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-10-brijesh.singh@amd.com --- arch/x86/coco/core.c | 3 +++ arch/x86/include/asm/msr-index.h | 2 ++ arch/x86/mm/mem_encrypt.c | 4 ++++ include/linux/cc_platform.h | 8 ++++++++ 4 files changed, 17 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c index fc1365dd927e..dafd4881ce29 100644 --- a/arch/x86/coco/core.c +++ b/arch/x86/coco/core.c @@ -57,6 +57,9 @@ static bool amd_cc_platform_has(enum cc_attr attr) return (sev_status & MSR_AMD64_SEV_ENABLED) && !(sev_status & MSR_AMD64_SEV_ES_ENABLED); + case CC_ATTR_GUEST_SEV_SNP: + return sev_status & MSR_AMD64_SEV_SNP_ENABLED; + default: return false; } diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 0eb90d21049e..ef96f166b1b6 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -502,8 +502,10 @@ #define MSR_AMD64_SEV 0xc0010131 #define MSR_AMD64_SEV_ENABLED_BIT 0 #define MSR_AMD64_SEV_ES_ENABLED_BIT 1 +#define MSR_AMD64_SEV_SNP_ENABLED_BIT 2 #define MSR_AMD64_SEV_ENABLED BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT) #define MSR_AMD64_SEV_ES_ENABLED BIT_ULL(MSR_AMD64_SEV_ES_ENABLED_BIT) +#define MSR_AMD64_SEV_SNP_ENABLED BIT_ULL(MSR_AMD64_SEV_SNP_ENABLED_BIT) #define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 50d209939c66..f85868c031c6 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -62,6 +62,10 @@ static void print_mem_encrypt_feature_info(void) if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) pr_cont(" SEV-ES"); + /* Secure Nested Paging */ + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + pr_cont(" SEV-SNP"); + pr_cont("\n"); } diff --git a/include/linux/cc_platform.h b/include/linux/cc_platform.h index efd8205282da..d08dd65b5c43 100644 --- a/include/linux/cc_platform.h +++ b/include/linux/cc_platform.h @@ -72,6 +72,14 @@ enum cc_attr { * Examples include TDX guest & SEV. */ CC_ATTR_GUEST_UNROLL_STRING_IO, + + /** + * @CC_ATTR_SEV_SNP: Guest SNP is active. + * + * The platform/OS is running as a guest/virtual machine and actively + * using AMD SEV-SNP features. + */ + CC_ATTR_GUEST_SEV_SNP, }; #ifdef CONFIG_ARCH_HAS_CC_PLATFORM -- cgit v1.2.3-59-g8ed1b From 6c0f74d678c94060932683738b3e227995b363d3 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Wed, 9 Feb 2022 12:10:04 -0600 Subject: x86/sev: Define the Linux-specific guest termination reasons The GHCB specification defines the reason code for reason set 0. The reason codes defined in the set 0 do not cover all possible causes for a guest to request termination. The reason sets 1 to 255 are reserved for the vendor-specific codes. Reserve the reason set 1 for the Linux guest. Define the error codes for reason set 1 so that one can have meaningful termination reasons and thus better guest failure diagnosis. While at it, change sev_es_terminate() to accept a reason set parameter. [ bp: Massage commit message. 
] Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Reviewed-by: Venu Busireddy Link: https://lore.kernel.org/r/20220307213356.2797205-11-brijesh.singh@amd.com --- arch/x86/boot/compressed/sev.c | 6 +++--- arch/x86/include/asm/sev-common.h | 8 ++++++++ arch/x86/kernel/sev-shared.c | 11 ++++------- arch/x86/kernel/sev.c | 4 ++-- 4 files changed, 17 insertions(+), 12 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 27ccd5a5ff60..56e941d5e092 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -119,7 +119,7 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, static bool early_setup_sev_es(void) { if (!sev_es_negotiate_protocol()) - sev_es_terminate(GHCB_SEV_ES_PROT_UNSUPPORTED); + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_PROT_UNSUPPORTED); if (set_page_decrypted((unsigned long)&boot_ghcb_page)) return false; @@ -172,7 +172,7 @@ void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code) enum es_result result; if (!boot_ghcb && !early_setup_sev_es()) - sev_es_terminate(GHCB_SEV_ES_GEN_REQ); + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); vc_ghcb_invalidate(boot_ghcb); result = vc_init_em_ctxt(&ctxt, regs, exit_code); @@ -199,7 +199,7 @@ finish: if (result == ES_OK) vc_finish_insn(&ctxt); else if (result != ES_RETRY) - sev_es_terminate(GHCB_SEV_ES_GEN_REQ); + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); } void sev_enable(struct boot_params *bp) diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 1b2fd32b42fe..94f0ea574049 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -73,9 +73,17 @@ /* GHCBData[23:16] */ \ ((((u64)reason_val) & 0xff) << 16)) +/* Error codes from reason set 0 */ +#define SEV_TERM_SET_GEN 0 #define GHCB_SEV_ES_GEN_REQ 0 #define GHCB_SEV_ES_PROT_UNSUPPORTED 1 +/* Linux-specific reason codes (used with reason set 1) */ +#define SEV_TERM_SET_LINUX 1 +#define GHCB_TERM_REGISTER 0 /* GHCB GPA registration failure */ +#define GHCB_TERM_PSC 1 /* Page State Change failure */ +#define GHCB_TERM_PVALIDATE 2 /* Pvalidate failure */ + #define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK) /* diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index ce987688bbc0..2abf8a7d75e5 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -24,15 +24,12 @@ static bool __init sev_es_check_cpu_features(void) return true; } -static void __noreturn sev_es_terminate(unsigned int reason) +static void __noreturn sev_es_terminate(unsigned int set, unsigned int reason) { u64 val = GHCB_MSR_TERM_REQ; - /* - * Tell the hypervisor what went wrong - only reason-set 0 is - * currently supported. - */ - val |= GHCB_SEV_TERM_REASON(0, reason); + /* Tell the hypervisor what went wrong. 
*/ + val |= GHCB_SEV_TERM_REASON(set, reason); /* Request Guest Termination from Hypvervisor */ sev_es_wr_ghcb_msr(val); @@ -221,7 +218,7 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) fail: /* Terminate the guest */ - sev_es_terminate(GHCB_SEV_ES_GEN_REQ); + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); } static enum es_result vc_insn_string_read(struct es_em_ctxt *ctxt, diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index e6d316a01fdd..19ad09712902 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -1337,7 +1337,7 @@ DEFINE_IDTENTRY_VC_KERNEL(exc_vmm_communication) show_regs(regs); /* Ask hypervisor to sev_es_terminate */ - sev_es_terminate(GHCB_SEV_ES_GEN_REQ); + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); /* If that fails and we get here - just panic */ panic("Returned from Terminate-Request to Hypervisor\n"); @@ -1385,7 +1385,7 @@ bool __init handle_vc_boot_ghcb(struct pt_regs *regs) /* Do initial setup or terminate the guest */ if (unlikely(boot_ghcb == NULL && !sev_es_setup_ghcb())) - sev_es_terminate(GHCB_SEV_ES_GEN_REQ); + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); vc_ghcb_invalidate(boot_ghcb); -- cgit v1.2.3-59-g8ed1b From 2ea29c5abbc27147c2d9e2ab5e05436aca706b65 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Wed, 9 Feb 2022 12:10:05 -0600 Subject: x86/sev: Save the negotiated GHCB version The SEV-ES guest calls sev_es_negotiate_protocol() to negotiate the GHCB protocol version before establishing the GHCB. Cache the negotiated GHCB version so that it can be used later. Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Reviewed-by: Venu Busireddy Link: https://lore.kernel.org/r/20220307213356.2797205-12-brijesh.singh@amd.com --- arch/x86/include/asm/sev.h | 2 +- arch/x86/kernel/sev-shared.c | 17 ++++++++++++++--- 2 files changed, 15 insertions(+), 4 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index ec060c433589..9b9c190e8c3b 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -12,7 +12,7 @@ #include #include -#define GHCB_PROTO_OUR 0x0001UL +#define GHCB_PROTOCOL_MIN 1ULL #define GHCB_PROTOCOL_MAX 1ULL #define GHCB_DEFAULT_USAGE 0ULL diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index 2abf8a7d75e5..91105f5a02a8 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -14,6 +14,15 @@ #define has_cpuflag(f) boot_cpu_has(f) #endif +/* + * Since feature negotiation related variables are set early in the boot + * process they must reside in the .data section so as not to be zeroed + * out when the .bss section is later cleared. + * + * GHCB protocol version negotiated with the hypervisor. 
+ */ +static u16 ghcb_version __ro_after_init; + static bool __init sev_es_check_cpu_features(void) { if (!has_cpuflag(X86_FEATURE_RDRAND)) { @@ -51,10 +60,12 @@ static bool sev_es_negotiate_protocol(void) if (GHCB_MSR_INFO(val) != GHCB_MSR_SEV_INFO_RESP) return false; - if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTO_OUR || - GHCB_MSR_PROTO_MIN(val) > GHCB_PROTO_OUR) + if (GHCB_MSR_PROTO_MAX(val) < GHCB_PROTOCOL_MIN || + GHCB_MSR_PROTO_MIN(val) > GHCB_PROTOCOL_MAX) return false; + ghcb_version = min_t(size_t, GHCB_MSR_PROTO_MAX(val), GHCB_PROTOCOL_MAX); + return true; } @@ -127,7 +138,7 @@ enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, bool set_ghcb_msr, u64 exit_info_1, u64 exit_info_2) { /* Fill in protocol and format specifiers */ - ghcb->protocol_version = GHCB_PROTOCOL_MAX; + ghcb->protocol_version = ghcb_version; ghcb->ghcb_usage = GHCB_DEFAULT_USAGE; ghcb_set_sw_exit_code(ghcb, exit_code); -- cgit v1.2.3-59-g8ed1b From cbd3d4f7c4e5a93edae68e5142a269368fde77d6 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Wed, 9 Feb 2022 12:10:06 -0600 Subject: x86/sev: Check SEV-SNP features support Version 2 of the GHCB specification added the advertisement of features that are supported by the hypervisor. If the hypervisor supports SEV-SNP then it must set the SEV-SNP features bit to indicate that the base functionality is supported. Check that feature bit while establishing the GHCB; if failed, terminate the guest. Version 2 of the GHCB specification adds several new Non-Automatic Exits (NAEs), most of them are optional except the hypervisor feature. Now that the hypervisor feature NAE is implemented, bump the GHCB maximum supported protocol version. While at it, move the GHCB protocol negotiation check from the #VC exception handler to sev_enable() so that all feature detection happens before the first #VC exception. While at it, document why the GHCB page cannot be setup from load_stage2_idt(). [ bp: Massage commit message. ] Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-13-brijesh.singh@amd.com --- arch/x86/boot/compressed/idt_64.c | 18 +++++++++++++++++- arch/x86/boot/compressed/sev.c | 20 +++++++++++++++----- arch/x86/include/asm/sev-common.h | 6 ++++++ arch/x86/include/asm/sev.h | 2 +- arch/x86/include/uapi/asm/svm.h | 2 ++ arch/x86/kernel/sev-shared.c | 20 ++++++++++++++++++++ arch/x86/kernel/sev.c | 14 ++++++++++++++ 7 files changed, 75 insertions(+), 7 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/boot/compressed/idt_64.c b/arch/x86/boot/compressed/idt_64.c index 9b93567d663a..6debb816e83d 100644 --- a/arch/x86/boot/compressed/idt_64.c +++ b/arch/x86/boot/compressed/idt_64.c @@ -39,7 +39,23 @@ void load_stage1_idt(void) load_boot_idt(&boot_idt_desc); } -/* Setup IDT after kernel jumping to .Lrelocated */ +/* + * Setup IDT after kernel jumping to .Lrelocated. + * + * initialize_identity_maps() needs a #PF handler to be setup + * in order to be able to fault-in identity mapping ranges; see + * do_boot_page_fault(). + * + * This #PF handler setup needs to happen in load_stage2_idt() where the + * IDT is loaded and there the #VC IDT entry gets setup too. + * + * In order to be able to handle #VCs, one needs a GHCB which + * gets setup with an already set up pagetable, which is done in + * initialize_identity_maps(). 
And there's the catch 22: the boot #VC + * handler do_boot_stage2_vc() needs to call early_setup_ghcb() itself + * (and, especially set_page_decrypted()) because the SEV-ES setup code + * cannot initialize a GHCB as there's no #PF handler yet... + */ void load_stage2_idt(void) { boot_idt_desc.address = (unsigned long)boot_idt; diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 56e941d5e092..5b389310be87 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -116,11 +116,8 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, /* Include code for early handlers */ #include "../../kernel/sev-shared.c" -static bool early_setup_sev_es(void) +static bool early_setup_ghcb(void) { - if (!sev_es_negotiate_protocol()) - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_PROT_UNSUPPORTED); - if (set_page_decrypted((unsigned long)&boot_ghcb_page)) return false; @@ -171,7 +168,7 @@ void do_boot_stage2_vc(struct pt_regs *regs, unsigned long exit_code) struct es_em_ctxt ctxt; enum es_result result; - if (!boot_ghcb && !early_setup_sev_es()) + if (!boot_ghcb && !early_setup_ghcb()) sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); vc_ghcb_invalidate(boot_ghcb); @@ -235,5 +232,18 @@ void sev_enable(struct boot_params *bp) if (!(sev_status & MSR_AMD64_SEV_ENABLED)) return; + /* Negotiate the GHCB protocol version. */ + if (sev_status & MSR_AMD64_SEV_ES_ENABLED) { + if (!sev_es_negotiate_protocol()) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_PROT_UNSUPPORTED); + } + + /* + * SNP is supported in v2 of the GHCB spec which mandates support for HV + * features. + */ + if (sev_status & MSR_AMD64_SEV_SNP_ENABLED && !(get_hv_features() & GHCB_HV_FT_SNP)) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); + sme_me_mask = BIT_ULL(ebx & 0x3f); } diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 94f0ea574049..6f037c29a46e 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -60,6 +60,11 @@ /* GHCB Hypervisor Feature Request/Response */ #define GHCB_MSR_HV_FT_REQ 0x080 #define GHCB_MSR_HV_FT_RESP 0x081 +#define GHCB_MSR_HV_FT_RESP_VAL(v) \ + /* GHCBData[63:12] */ \ + (((u64)(v) & GENMASK_ULL(63, 12)) >> 12) + +#define GHCB_HV_FT_SNP BIT_ULL(0) #define GHCB_MSR_TERM_REQ 0x100 #define GHCB_MSR_TERM_REASON_SET_POS 12 @@ -77,6 +82,7 @@ #define SEV_TERM_SET_GEN 0 #define GHCB_SEV_ES_GEN_REQ 0 #define GHCB_SEV_ES_PROT_UNSUPPORTED 1 +#define GHCB_SNP_UNSUPPORTED 2 /* Linux-specific reason codes (used with reason set 1) */ #define SEV_TERM_SET_LINUX 1 diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 9b9c190e8c3b..17b75f6ee11a 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -13,7 +13,7 @@ #include #define GHCB_PROTOCOL_MIN 1ULL -#define GHCB_PROTOCOL_MAX 1ULL +#define GHCB_PROTOCOL_MAX 2ULL #define GHCB_DEFAULT_USAGE 0ULL #define VMGEXIT() { asm volatile("rep; vmmcall\n\r"); } diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index efa969325ede..b0ad00f4c1e1 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -108,6 +108,7 @@ #define SVM_VMGEXIT_AP_JUMP_TABLE 0x80000005 #define SVM_VMGEXIT_SET_AP_JUMP_TABLE 0 #define SVM_VMGEXIT_GET_AP_JUMP_TABLE 1 +#define SVM_VMGEXIT_HV_FEATURES 0x8000fffd #define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffff /* Exit code reserved for hypervisor/software use */ @@ -218,6 +219,7 @@ { SVM_VMGEXIT_NMI_COMPLETE, 
"vmgexit_nmi_complete" }, \ { SVM_VMGEXIT_AP_HLT_LOOP, "vmgexit_ap_hlt_loop" }, \ { SVM_VMGEXIT_AP_JUMP_TABLE, "vmgexit_ap_jump_table" }, \ + { SVM_VMGEXIT_HV_FEATURES, "vmgexit_hypervisor_feature" }, \ { SVM_EXIT_ERR, "invalid_guest_state" } diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index 91105f5a02a8..4a876e684f67 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -48,6 +48,26 @@ static void __noreturn sev_es_terminate(unsigned int set, unsigned int reason) asm volatile("hlt\n" : : : "memory"); } +/* + * The hypervisor features are available from GHCB version 2 onward. + */ +static u64 get_hv_features(void) +{ + u64 val; + + if (ghcb_version < 2) + return 0; + + sev_es_wr_ghcb_msr(GHCB_MSR_HV_FT_REQ); + VMGEXIT(); + + val = sev_es_rd_ghcb_msr(); + if (GHCB_RESP_CODE(val) != GHCB_MSR_HV_FT_RESP) + return 0; + + return GHCB_MSR_HV_FT_RESP_VAL(val); +} + static bool sev_es_negotiate_protocol(void) { u64 val; diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 19ad09712902..cb20fb0c608e 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -43,6 +43,9 @@ static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); */ static struct ghcb __initdata *boot_ghcb; +/* Bitmap of SEV features supported by the hypervisor */ +static u64 sev_hv_features __ro_after_init; + /* #VC handler runtime per-CPU data */ struct sev_es_runtime_data { struct ghcb ghcb_page; @@ -766,6 +769,17 @@ void __init sev_es_init_vc_handling(void) if (!sev_es_check_cpu_features()) panic("SEV-ES CPU Features missing"); + /* + * SNP is supported in v2 of the GHCB spec which mandates support for HV + * features. + */ + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) { + sev_hv_features = get_hv_features(); + + if (!(sev_hv_features & GHCB_HV_FT_SNP)) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); + } + /* Enable SEV-ES special handling */ static_branch_enable(&sev_es_enable_key); -- cgit v1.2.3-59-g8ed1b From 0bd6f1e526070271dbe0f626b123b4f6b01dc79c Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Wed, 9 Feb 2022 12:10:07 -0600 Subject: x86/sev: Add a helper for the PVALIDATE instruction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An SNP-active guest uses the PVALIDATE instruction to validate or rescind the validation of a guest page’s RMP entry. Upon completion, a return code is stored in EAX and rFLAGS bits are set based on the return code. If the instruction completed successfully, the carry flag (CF) indicates if the content of the RMP were changed or not. See AMD APM Volume 3 for additional details. 
Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Reviewed-by: Venu Busireddy Link: https://lore.kernel.org/r/20220307213356.2797205-14-brijesh.singh@amd.com --- arch/x86/include/asm/sev.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 17b75f6ee11a..4ee98976aed8 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -60,6 +60,9 @@ extern void vc_no_ghcb(void); extern void vc_boot_ghcb(void); extern bool handle_vc_boot_ghcb(struct pt_regs *regs); +/* Software defined (when rFlags.CF = 1) */ +#define PVALIDATE_FAIL_NOUPDATE 255 + #ifdef CONFIG_AMD_MEM_ENCRYPT extern struct static_key_false sev_es_enable_key; extern void __sev_es_ist_enter(struct pt_regs *regs); @@ -87,12 +90,30 @@ extern enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, struct es_em_ctxt *ctxt, u64 exit_code, u64 exit_info_1, u64 exit_info_2); +static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) +{ + bool no_rmpupdate; + int rc; + + /* "pvalidate" mnemonic support in binutils 2.36 and newer */ + asm volatile(".byte 0xF2, 0x0F, 0x01, 0xFF\n\t" + CC_SET(c) + : CC_OUT(c) (no_rmpupdate), "=a"(rc) + : "a"(vaddr), "c"(rmp_psize), "d"(validate) + : "memory", "cc"); + + if (no_rmpupdate) + return PVALIDATE_FAIL_NOUPDATE; + + return rc; +} #else static inline void sev_es_ist_enter(struct pt_regs *regs) { } static inline void sev_es_ist_exit(void) { } static inline int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { return 0; } static inline void sev_es_nmi_complete(void) { } static inline int sev_es_efi_map_ghcbs(pgd_t *pgd) { return 0; } +static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) { return 0; } #endif #endif -- cgit v1.2.3-59-g8ed1b From 81cc3df9a90e7817494421ecc48ede6bd5e8132b Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Wed, 9 Feb 2022 12:10:08 -0600 Subject: x86/sev: Check the VMPL level The Virtual Machine Privilege Level (VMPL) feature in the SEV-SNP architecture allows a guest VM to divide its address space into four levels. The level can be used to provide hardware isolated abstraction layers within a VM. VMPL0 is the highest privilege level, and VMPL3 is the least privilege level. Certain operations must be done by the VMPL0 software, such as: * Validate or invalidate memory range (PVALIDATE instruction) * Allocate VMSA page (RMPADJUST instruction when VMSA=1) The initial SNP support requires that the guest kernel is running at VMPL0. Add such a check to verify the guest is running at level 0 before continuing the boot. There is no easy method to query the current VMPL level, so use the RMPADJUST instruction to determine whether the guest is running at the VMPL0. [ bp: Massage commit message. 
] Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-15-brijesh.singh@amd.com --- arch/x86/boot/compressed/sev.c | 28 ++++++++++++++++++++++++++-- arch/x86/include/asm/sev-common.h | 1 + arch/x86/include/asm/sev.h | 16 ++++++++++++++++ 3 files changed, 43 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 5b389310be87..eb421787d50a 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -199,6 +199,26 @@ finish: sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); } +static void enforce_vmpl0(void) +{ + u64 attrs; + int err; + + /* + * RMPADJUST modifies RMP permissions of a lesser-privileged (numerically + * higher) privilege level. Here, clear the VMPL1 permission mask of the + * GHCB page. If the guest is not running at VMPL0, this will fail. + * + * If the guest is running at VMPL0, it will succeed. Even if that operation + * modifies permission bits, it is still ok to do so currently because Linux + * SNP guests are supported only on VMPL0 so VMPL1 or higher permission masks + * changing is a don't-care. + */ + attrs = 1; + if (rmpadjust((unsigned long)&boot_ghcb_page, RMP_PG_SIZE_4K, attrs)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_NOT_VMPL0); +} + void sev_enable(struct boot_params *bp) { unsigned int eax, ebx, ecx, edx; @@ -242,8 +262,12 @@ void sev_enable(struct boot_params *bp) * SNP is supported in v2 of the GHCB spec which mandates support for HV * features. */ - if (sev_status & MSR_AMD64_SEV_SNP_ENABLED && !(get_hv_features() & GHCB_HV_FT_SNP)) - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); + if (sev_status & MSR_AMD64_SEV_SNP_ENABLED) { + if (!(get_hv_features() & GHCB_HV_FT_SNP)) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); + + enforce_vmpl0(); + } sme_me_mask = BIT_ULL(ebx & 0x3f); } diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 6f037c29a46e..7ac5842e32b6 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -89,6 +89,7 @@ #define GHCB_TERM_REGISTER 0 /* GHCB GPA registration failure */ #define GHCB_TERM_PSC 1 /* Page State Change failure */ #define GHCB_TERM_PVALIDATE 2 /* Pvalidate failure */ +#define GHCB_TERM_NOT_VMPL0 3 /* SNP guest is not running at VMPL-0 */ #define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK) diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 4ee98976aed8..e37451849165 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -63,6 +63,9 @@ extern bool handle_vc_boot_ghcb(struct pt_regs *regs); /* Software defined (when rFlags.CF = 1) */ #define PVALIDATE_FAIL_NOUPDATE 255 +/* RMP page size */ +#define RMP_PG_SIZE_4K 0 + #ifdef CONFIG_AMD_MEM_ENCRYPT extern struct static_key_false sev_es_enable_key; extern void __sev_es_ist_enter(struct pt_regs *regs); @@ -90,6 +93,18 @@ extern enum es_result sev_es_ghcb_hv_call(struct ghcb *ghcb, struct es_em_ctxt *ctxt, u64 exit_code, u64 exit_info_1, u64 exit_info_2); +static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) +{ + int rc; + + /* "rmpadjust" mnemonic support in binutils 2.36 and newer */ + asm volatile(".byte 0xF3,0x0F,0x01,0xFE\n\t" + : "=a"(rc) + : "a"(vaddr), "c"(rmp_psize), "d"(attrs) + : "memory", "cc"); + + return rc; +} static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) { bool no_rmpupdate; @@ 
-114,6 +129,7 @@ static inline int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { ret static inline void sev_es_nmi_complete(void) { } static inline int sev_es_efi_map_ghcbs(pgd_t *pgd) { return 0; } static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) { return 0; } +static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) { return 0; } #endif #endif -- cgit v1.2.3-59-g8ed1b From 4f9c403e44e5e88feb27d5e617d1adc9cc7ef684 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Wed, 9 Feb 2022 12:10:09 -0600 Subject: x86/compressed: Add helper for validating pages in the decompression stage Many of the integrity guarantees of SEV-SNP are enforced through the Reverse Map Table (RMP). Each RMP entry contains the GPA at which a particular page of DRAM should be mapped. The VMs can request the hypervisor to add pages in the RMP table via the Page State Change VMGEXIT defined in the GHCB specification. Inside each RMP entry is a Validated flag; this flag is automatically cleared to 0 by the CPU hardware when a new RMP entry is created for a guest. Each VM page can be either validated or invalidated, as indicated by the Validated flag in the RMP entry. Memory access to a private page that is not validated generates a #VC. A VM must use the PVALIDATE instruction to validate a private page before using it. To maintain the security guarantee of SEV-SNP guests, when transitioning pages from private to shared, the guest must invalidate the pages before asking the hypervisor to change the page state to shared in the RMP table. After the pages are mapped private in the page table, the guest must issue a page state change VMGEXIT to mark the pages private in the RMP table and validate them. Upon boot, BIOS should have validated the entire system memory. During the kernel decompression stage, early_setup_ghcb() uses set_page_decrypted() to make the GHCB page shared (i.e. clear encryption attribute). And while exiting from the decompression, it calls set_page_encrypted() to make the page private. Add snp_set_page_{private,shared}() helpers that are used by set_page_{decrypted,encrypted}() to change the page state in the RMP table. [ bp: Massage commit message and comments. ] Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-16-brijesh.singh@amd.com --- arch/x86/boot/compressed/ident_map_64.c | 18 ++++++++++++- arch/x86/boot/compressed/misc.h | 4 +++ arch/x86/boot/compressed/sev.c | 46 +++++++++++++++++++++++++++++++++ arch/x86/include/asm/sev-common.h | 26 +++++++++++++++++++ 4 files changed, 93 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/boot/compressed/ident_map_64.c b/arch/x86/boot/compressed/ident_map_64.c index f7213d0943b8..613367e7e09e 100644 --- a/arch/x86/boot/compressed/ident_map_64.c +++ b/arch/x86/boot/compressed/ident_map_64.c @@ -275,15 +275,31 @@ static int set_clr_page_flags(struct x86_mapping_info *info, * Changing encryption attributes of a page requires to flush it from * the caches. */ - if ((set | clr) & _PAGE_ENC) + if ((set | clr) & _PAGE_ENC) { clflush_page(address); + /* + * If the encryption attribute is being cleared, change the page state + * to shared in the RMP table. 
+ */ + if (clr) + snp_set_page_shared(__pa(address & PAGE_MASK)); + } + /* Update PTE */ pte = *ptep; pte = pte_set_flags(pte, set); pte = pte_clear_flags(pte, clr); set_pte(ptep, pte); + /* + * If the encryption attribute is being set, then change the page state to + * private in the RMP entry. The page state change must be done after the PTE + * is updated. + */ + if (set & _PAGE_ENC) + snp_set_page_private(__pa(address & PAGE_MASK)); + /* Flush TLB after changing encryption attribute */ write_cr3(top_level_pgt); diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 23e0e395084a..01cc13c12059 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -124,6 +124,8 @@ static inline void console_init(void) void sev_enable(struct boot_params *bp); void sev_es_shutdown_ghcb(void); extern bool sev_es_check_ghcb_fault(unsigned long address); +void snp_set_page_private(unsigned long paddr); +void snp_set_page_shared(unsigned long paddr); #else static inline void sev_enable(struct boot_params *bp) { } static inline void sev_es_shutdown_ghcb(void) { } @@ -131,6 +133,8 @@ static inline bool sev_es_check_ghcb_fault(unsigned long address) { return false; } +static inline void snp_set_page_private(unsigned long paddr) { } +static inline void snp_set_page_shared(unsigned long paddr) { } #endif /* acpi.c */ diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index eb421787d50a..5f2c26860df7 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -116,6 +116,52 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt, /* Include code for early handlers */ #include "../../kernel/sev-shared.c" +static inline bool sev_snp_enabled(void) +{ + return sev_status & MSR_AMD64_SEV_SNP_ENABLED; +} + +static void __page_state_change(unsigned long paddr, enum psc_op op) +{ + u64 val; + + if (!sev_snp_enabled()) + return; + + /* + * If private -> shared then invalidate the page before requesting the + * state change in the RMP table. + */ + if (op == SNP_PAGE_STATE_SHARED && pvalidate(paddr, RMP_PG_SIZE_4K, 0)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); + + /* Issue VMGEXIT to change the page state in RMP table. */ + sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); + VMGEXIT(); + + /* Read the response of the VMGEXIT. */ + val = sev_es_rd_ghcb_msr(); + if ((GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP) || GHCB_MSR_PSC_RESP_VAL(val)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); + + /* + * Now that page state is changed in the RMP table, validate it so that it is + * consistent with the RMP entry. 
+ */ + if (op == SNP_PAGE_STATE_PRIVATE && pvalidate(paddr, RMP_PG_SIZE_4K, 1)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); +} + +void snp_set_page_private(unsigned long paddr) +{ + __page_state_change(paddr, SNP_PAGE_STATE_PRIVATE); +} + +void snp_set_page_shared(unsigned long paddr) +{ + __page_state_change(paddr, SNP_PAGE_STATE_SHARED); +} + static bool early_setup_ghcb(void) { if (set_page_decrypted((unsigned long)&boot_ghcb_page)) diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 7ac5842e32b6..fe7fe16e5fd5 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -57,6 +57,32 @@ #define GHCB_MSR_AP_RESET_HOLD_REQ 0x006 #define GHCB_MSR_AP_RESET_HOLD_RESP 0x007 +/* + * SNP Page State Change Operation + * + * GHCBData[55:52] - Page operation: + * 0x0001 Page assignment, Private + * 0x0002 Page assignment, Shared + */ +enum psc_op { + SNP_PAGE_STATE_PRIVATE = 1, + SNP_PAGE_STATE_SHARED, +}; + +#define GHCB_MSR_PSC_REQ 0x014 +#define GHCB_MSR_PSC_REQ_GFN(gfn, op) \ + /* GHCBData[55:52] */ \ + (((u64)((op) & 0xf) << 52) | \ + /* GHCBData[51:12] */ \ + ((u64)((gfn) & GENMASK_ULL(39, 0)) << 12) | \ + /* GHCBData[11:0] */ \ + GHCB_MSR_PSC_REQ) + +#define GHCB_MSR_PSC_RESP 0x015 +#define GHCB_MSR_PSC_RESP_VAL(val) \ + /* GHCBData[63:32] */ \ + (((u64)(val) & GENMASK_ULL(63, 32)) >> 32) + /* GHCB Hypervisor Feature Request/Response */ #define GHCB_MSR_HV_FT_REQ 0x080 #define GHCB_MSR_HV_FT_RESP 0x081 -- cgit v1.2.3-59-g8ed1b From 87294bdb7b4b73555b0fba45da1cdecdc6a0d5a8 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Wed, 9 Feb 2022 12:10:10 -0600 Subject: x86/compressed: Register GHCB memory when SEV-SNP is active The SEV-SNP guest is required by the GHCB spec to register the GHCB's Guest Physical Address (GPA). This is because the hypervisor may prefer that a guest use a consistent and/or specific GPA for the GHCB associated with a vCPU. For more information, see the GHCB specification section "GHCB GPA Registration". If hypervisor can not work with the guest provided GPA then terminate the guest boot. 
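A worked example of the MSR-protocol exchange this requires (illustration only;
the encodings are the GHCB_MSR_REG_GPA_* definitions added below and the
physical address is arbitrary):

  /* For a GHCB page at physical address 0x100000, i.e. pfn 0x100: */
  u64 req = GHCB_MSR_REG_GPA_REQ_VAL(0x100);  /* (0x100 << 12) | 0x012 == 0x100012 */

  /*
   * The hypervisor must answer with response code 0x013 carrying the same
   * pfn, i.e. GHCB_RESP_CODE(val) == GHCB_MSR_REG_GPA_RESP and
   * GHCB_MSR_REG_GPA_RESP_VAL(val) == 0x100; anything else terminates the
   * guest with SEV_TERM_SET_LINUX / GHCB_TERM_REGISTER.
   */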
Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Reviewed-by: Venu Busireddy Link: https://lore.kernel.org/r/20220307213356.2797205-17-brijesh.singh@amd.com --- arch/x86/boot/compressed/sev.c | 4 ++++ arch/x86/include/asm/sev-common.h | 13 +++++++++++++ arch/x86/kernel/sev-shared.c | 16 ++++++++++++++++ 3 files changed, 33 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 5f2c26860df7..f31b434e2ce4 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -175,6 +175,10 @@ static bool early_setup_ghcb(void) /* Initialize lookup tables for the instruction decoder */ inat_init_tables(); + /* SNP guest requires the GHCB GPA must be registered */ + if (sev_snp_enabled()) + snp_register_ghcb_early(__pa(&boot_ghcb_page)); + return true; } diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index fe7fe16e5fd5..f077a6c95e67 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -57,6 +57,19 @@ #define GHCB_MSR_AP_RESET_HOLD_REQ 0x006 #define GHCB_MSR_AP_RESET_HOLD_RESP 0x007 +/* GHCB GPA Register */ +#define GHCB_MSR_REG_GPA_REQ 0x012 +#define GHCB_MSR_REG_GPA_REQ_VAL(v) \ + /* GHCBData[63:12] */ \ + (((u64)((v) & GENMASK_ULL(51, 0)) << 12) | \ + /* GHCBData[11:0] */ \ + GHCB_MSR_REG_GPA_REQ) + +#define GHCB_MSR_REG_GPA_RESP 0x013 +#define GHCB_MSR_REG_GPA_RESP_VAL(v) \ + /* GHCBData[63:12] */ \ + (((u64)(v) & GENMASK_ULL(63, 12)) >> 12) + /* * SNP Page State Change Operation * diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index 4a876e684f67..e9ff13cd90b0 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -68,6 +68,22 @@ static u64 get_hv_features(void) return GHCB_MSR_HV_FT_RESP_VAL(val); } +static void __maybe_unused snp_register_ghcb_early(unsigned long paddr) +{ + unsigned long pfn = paddr >> PAGE_SHIFT; + u64 val; + + sev_es_wr_ghcb_msr(GHCB_MSR_REG_GPA_REQ_VAL(pfn)); + VMGEXIT(); + + val = sev_es_rd_ghcb_msr(); + + /* If the response GPA is not ours then abort the guest */ + if ((GHCB_RESP_CODE(val) != GHCB_MSR_REG_GPA_RESP) || + (GHCB_MSR_REG_GPA_RESP_VAL(val) != pfn)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_REGISTER); +} + static bool sev_es_negotiate_protocol(void) { u64 val; -- cgit v1.2.3-59-g8ed1b From 95d33bfaa3e169cfec1926e0d0f0c6b0ea75d763 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Wed, 9 Feb 2022 12:10:11 -0600 Subject: x86/sev: Register GHCB memory when SEV-SNP is active The SEV-SNP guest is required by the GHCB spec to register the GHCB's Guest Physical Address (GPA). This is because the hypervisor may prefer that a guest uses a consistent and/or specific GPA for the GHCB associated with a vCPU. For more information, see the GHCB specification section "GHCB GPA Registration". [ bp: Cleanup comments. 
] Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-18-brijesh.singh@amd.com --- arch/x86/include/asm/sev.h | 2 ++ arch/x86/kernel/cpu/common.c | 4 ++++ arch/x86/kernel/head64.c | 4 +++- arch/x86/kernel/sev-shared.c | 2 +- arch/x86/kernel/sev.c | 46 ++++++++++++++++++++++++++++++++------------ 5 files changed, 44 insertions(+), 14 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index e37451849165..48df02713ee0 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -122,6 +122,7 @@ static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) return rc; } +void setup_ghcb(void); #else static inline void sev_es_ist_enter(struct pt_regs *regs) { } static inline void sev_es_ist_exit(void) { } @@ -130,6 +131,7 @@ static inline void sev_es_nmi_complete(void) { } static inline int sev_es_efi_map_ghcbs(pgd_t *pgd) { return 0; } static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) { return 0; } static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) { return 0; } +static inline void setup_ghcb(void) { } #endif #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index ed4417500700..9e4552133872 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -60,6 +60,7 @@ #include #include #include +#include #include "cpu.h" @@ -2124,6 +2125,9 @@ void cpu_init_exception_handling(void) load_TR_desc(); + /* GHCB needs to be setup to handle #VC. */ + setup_ghcb(); + /* Finally load the IDT */ load_current_idt(); } diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index cbc285ddc4ac..83514b9827e6 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -597,8 +597,10 @@ static void startup_64_load_idt(unsigned long physbase) void early_setup_idt(void) { /* VMM Communication Exception */ - if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) { + setup_ghcb(); set_bringup_idt_handler(bringup_idt_table, X86_TRAP_VC, vc_boot_ghcb); + } bringup_idt_descr.address = (unsigned long)bringup_idt_table; native_load_idt(&bringup_idt_descr); diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index e9ff13cd90b0..3aaef1a18ffe 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -68,7 +68,7 @@ static u64 get_hv_features(void) return GHCB_MSR_HV_FT_RESP_VAL(val); } -static void __maybe_unused snp_register_ghcb_early(unsigned long paddr) +static void snp_register_ghcb_early(unsigned long paddr) { unsigned long pfn = paddr >> PAGE_SHIFT; u64 val; diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index cb20fb0c608e..39c87469ee0b 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -41,7 +41,7 @@ static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); * Needs to be in the .data section because we need it NULL before bss is * cleared */ -static struct ghcb __initdata *boot_ghcb; +static struct ghcb *boot_ghcb __section(".data"); /* Bitmap of SEV features supported by the hypervisor */ static u64 sev_hv_features __ro_after_init; @@ -647,15 +647,39 @@ static enum es_result vc_handle_msr(struct ghcb *ghcb, struct es_em_ctxt *ctxt) return ret; } -/* - * This function runs on the first #VC exception after the kernel - * switched to virtual addresses. 
- */ -static bool __init sev_es_setup_ghcb(void) +static void snp_register_per_cpu_ghcb(void) { + struct sev_es_runtime_data *data; + struct ghcb *ghcb; + + data = this_cpu_read(runtime_data); + ghcb = &data->ghcb_page; + + snp_register_ghcb_early(__pa(ghcb)); +} + +void setup_ghcb(void) +{ + if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) + return; + /* First make sure the hypervisor talks a supported protocol. */ if (!sev_es_negotiate_protocol()) - return false; + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); + + /* + * Check whether the runtime #VC exception handler is active. It uses + * the per-CPU GHCB page which is set up by sev_es_init_vc_handling(). + * + * If SNP is active, register the per-CPU GHCB page so that the runtime + * exception handler can use it. + */ + if (initial_vc_handler == (unsigned long)kernel_exc_vmm_communication) { + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + snp_register_per_cpu_ghcb(); + + return; + } /* * Clear the boot_ghcb. The first exception comes in before the bss @@ -666,7 +690,9 @@ static bool __init sev_es_setup_ghcb(void) /* Alright - Make the boot-ghcb public */ boot_ghcb = &boot_ghcb_page; - return true; + /* SNP guest requires that GHCB GPA must be registered. */ + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + snp_register_ghcb_early(__pa(&boot_ghcb_page)); } #ifdef CONFIG_HOTPLUG_CPU @@ -1397,10 +1423,6 @@ bool __init handle_vc_boot_ghcb(struct pt_regs *regs) struct es_em_ctxt ctxt; enum es_result result; - /* Do initial setup or terminate the guest */ - if (unlikely(boot_ghcb == NULL && !sev_es_setup_ghcb())) - sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SEV_ES_GEN_REQ); - vc_ghcb_invalidate(boot_ghcb); result = vc_init_em_ctxt(&ctxt, regs, exit_code); -- cgit v1.2.3-59-g8ed1b From 5e5ccff60a2977142d39b987a8b90e422d9fc634 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Wed, 9 Feb 2022 12:10:12 -0600 Subject: x86/sev: Add helper for validating pages in early enc attribute changes early_set_memory_{encrypted,decrypted}() are used for changing the page state from decrypted (shared) to encrypted (private) and vice versa. When SEV-SNP is active, the page state transition needs to go through additional steps. If the page is transitioned from shared to private, then perform the following after the encryption attribute is set in the page table: 1. Issue the page state change VMGEXIT to add the page as a private in the RMP table. 2. Validate the page after its successfully added in the RMP table. To maintain the security guarantees, if the page is transitioned from private to shared, then perform the following before clearing the encryption attribute from the page table. 1. Invalidate the page. 2. Issue the page state change VMGEXIT to make the page shared in the RMP table. early_set_memory_{encrypted,decrypted}() can be called before the GHCB is setup so use the SNP page state MSR protocol VMGEXIT defined in the GHCB specification to request the page state change in the RMP table. While at it, add a helper snp_prep_memory() which will be used in probe_roms(), in a later patch. [ bp: Massage commit message. 
] Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Reviewed-by: Venu Busireddy Link: https://lore.kernel.org/r/20220307213356.2797205-19-brijesh.singh@amd.com --- arch/x86/include/asm/sev.h | 10 +++++ arch/x86/kernel/sev.c | 99 +++++++++++++++++++++++++++++++++++++++++++ arch/x86/mm/mem_encrypt_amd.c | 58 +++++++++++++++++++++++-- 3 files changed, 163 insertions(+), 4 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 48df02713ee0..f65d257e3d4a 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -123,6 +123,11 @@ static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) return rc; } void setup_ghcb(void); +void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, + unsigned int npages); +void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, + unsigned int npages); +void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op); #else static inline void sev_es_ist_enter(struct pt_regs *regs) { } static inline void sev_es_ist_exit(void) { } @@ -132,6 +137,11 @@ static inline int sev_es_efi_map_ghcbs(pgd_t *pgd) { return 0; } static inline int pvalidate(unsigned long vaddr, bool rmp_psize, bool validate) { return 0; } static inline int rmpadjust(unsigned long vaddr, bool rmp_psize, unsigned long attrs) { return 0; } static inline void setup_ghcb(void) { } +static inline void __init +early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, unsigned int npages) { } +static inline void __init +early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned int npages) { } +static inline void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) { } #endif #endif diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 39c87469ee0b..e3fca8615fe8 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -556,6 +556,105 @@ static u64 get_jump_table_addr(void) return ret; } +static void pvalidate_pages(unsigned long vaddr, unsigned int npages, bool validate) +{ + unsigned long vaddr_end; + int rc; + + vaddr = vaddr & PAGE_MASK; + vaddr_end = vaddr + (npages << PAGE_SHIFT); + + while (vaddr < vaddr_end) { + rc = pvalidate(vaddr, RMP_PG_SIZE_4K, validate); + if (WARN(rc, "Failed to validate address 0x%lx ret %d", vaddr, rc)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE); + + vaddr = vaddr + PAGE_SIZE; + } +} + +static void __init early_set_pages_state(unsigned long paddr, unsigned int npages, enum psc_op op) +{ + unsigned long paddr_end; + u64 val; + + paddr = paddr & PAGE_MASK; + paddr_end = paddr + (npages << PAGE_SHIFT); + + while (paddr < paddr_end) { + /* + * Use the MSR protocol because this function can be called before + * the GHCB is established. + */ + sev_es_wr_ghcb_msr(GHCB_MSR_PSC_REQ_GFN(paddr >> PAGE_SHIFT, op)); + VMGEXIT(); + + val = sev_es_rd_ghcb_msr(); + + if (WARN(GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP, + "Wrong PSC response code: 0x%x\n", + (unsigned int)GHCB_RESP_CODE(val))) + goto e_term; + + if (WARN(GHCB_MSR_PSC_RESP_VAL(val), + "Failed to change page state to '%s' paddr 0x%lx error 0x%llx\n", + op == SNP_PAGE_STATE_PRIVATE ? 
"private" : "shared", + paddr, GHCB_MSR_PSC_RESP_VAL(val))) + goto e_term; + + paddr = paddr + PAGE_SIZE; + } + + return; + +e_term: + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); +} + +void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long paddr, + unsigned int npages) +{ + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return; + + /* + * Ask the hypervisor to mark the memory pages as private in the RMP + * table. + */ + early_set_pages_state(paddr, npages, SNP_PAGE_STATE_PRIVATE); + + /* Validate the memory pages after they've been added in the RMP table. */ + pvalidate_pages(vaddr, npages, true); +} + +void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, + unsigned int npages) +{ + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return; + + /* Invalidate the memory pages before they are marked shared in the RMP table. */ + pvalidate_pages(vaddr, npages, false); + + /* Ask hypervisor to mark the memory pages shared in the RMP table. */ + early_set_pages_state(paddr, npages, SNP_PAGE_STATE_SHARED); +} + +void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) +{ + unsigned long vaddr, npages; + + vaddr = (unsigned long)__va(paddr); + npages = PAGE_ALIGN(sz) >> PAGE_SHIFT; + + if (op == SNP_PAGE_STATE_PRIVATE) + early_snp_set_memory_private(vaddr, paddr, npages); + else if (op == SNP_PAGE_STATE_SHARED) + early_snp_set_memory_shared(vaddr, paddr, npages); + else + WARN(1, "invalid memory op %d\n", op); +} + int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { u16 startup_cs, startup_ip; diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c index 6169053c2854..8539dd6f24ff 100644 --- a/arch/x86/mm/mem_encrypt_amd.c +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "mm_internal.h" @@ -47,6 +48,36 @@ EXPORT_SYMBOL(sme_me_mask); /* Buffer used for early in-place encryption by BSP, no locking needed */ static char sme_early_buffer[PAGE_SIZE] __initdata __aligned(PAGE_SIZE); +/* + * SNP-specific routine which needs to additionally change the page state from + * private to shared before copying the data from the source to destination and + * restore after the copy. + */ +static inline void __init snp_memcpy(void *dst, void *src, size_t sz, + unsigned long paddr, bool decrypt) +{ + unsigned long npages = PAGE_ALIGN(sz) >> PAGE_SHIFT; + + if (decrypt) { + /* + * @paddr needs to be accessed decrypted, mark the page shared in + * the RMP table before copying it. + */ + early_snp_set_memory_shared((unsigned long)__va(paddr), paddr, npages); + + memcpy(dst, src, sz); + + /* Restore the page state after the memcpy. */ + early_snp_set_memory_private((unsigned long)__va(paddr), paddr, npages); + } else { + /* + * @paddr need to be accessed encrypted, no need for the page state + * change. + */ + memcpy(dst, src, sz); + } +} + /* * This routine does not change the underlying encryption setting of the * page(s) that map this memory. It assumes that eventually the memory is @@ -95,8 +126,13 @@ static void __init __sme_early_enc_dec(resource_size_t paddr, * Use a temporary buffer, of cache-line multiple size, to * avoid data corruption as documented in the APM. 
*/ - memcpy(sme_early_buffer, src, len); - memcpy(dst, sme_early_buffer, len); + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) { + snp_memcpy(sme_early_buffer, src, len, paddr, enc); + snp_memcpy(dst, sme_early_buffer, len, paddr, !enc); + } else { + memcpy(sme_early_buffer, src, len); + memcpy(dst, sme_early_buffer, len); + } early_memunmap(dst, len); early_memunmap(src, len); @@ -322,14 +358,28 @@ static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc) clflush_cache_range(__va(pa), size); /* Encrypt/decrypt the contents in-place */ - if (enc) + if (enc) { sme_early_encrypt(pa, size); - else + } else { sme_early_decrypt(pa, size); + /* + * ON SNP, the page state in the RMP table must happen + * before the page table updates. + */ + early_snp_set_memory_shared((unsigned long)__va(pa), pa, 1); + } + /* Change the page encryption mask. */ new_pte = pfn_pte(pfn, new_prot); set_pte_atomic(kpte, new_pte); + + /* + * If page is set encrypted in the page table, then update the RMP table to + * add this page as private. + */ + if (enc) + early_snp_set_memory_private((unsigned long)__va(pa), pa, 1); } static int __init early_set_memory_enc_dec(unsigned long vaddr, -- cgit v1.2.3-59-g8ed1b From dc3f3d2474b80eaee8be89f4c5eb344f10648f42 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Thu, 24 Feb 2022 10:56:01 -0600 Subject: x86/mm: Validate memory when changing the C-bit Add the needed functionality to change pages state from shared to private and vice-versa using the Page State Change VMGEXIT as documented in the GHCB spec. Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-22-brijesh.singh@amd.com --- arch/x86/include/asm/sev-common.h | 22 +++++ arch/x86/include/asm/sev.h | 4 + arch/x86/include/uapi/asm/svm.h | 2 + arch/x86/kernel/sev.c | 168 ++++++++++++++++++++++++++++++++++++++ arch/x86/mm/mem_encrypt_amd.c | 13 +++ 5 files changed, 209 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index f077a6c95e67..1aa72b5c2490 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -105,6 +105,28 @@ enum psc_op { #define GHCB_HV_FT_SNP BIT_ULL(0) +/* SNP Page State Change NAE event */ +#define VMGEXIT_PSC_MAX_ENTRY 253 + +struct psc_hdr { + u16 cur_entry; + u16 end_entry; + u32 reserved; +} __packed; + +struct psc_entry { + u64 cur_page : 12, + gfn : 40, + operation : 4, + pagesize : 1, + reserved : 7; +} __packed; + +struct snp_psc_desc { + struct psc_hdr hdr; + struct psc_entry entries[VMGEXIT_PSC_MAX_ENTRY]; +} __packed; + #define GHCB_MSR_TERM_REQ 0x100 #define GHCB_MSR_TERM_REASON_SET_POS 12 #define GHCB_MSR_TERM_REASON_SET_MASK 0xf diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index f65d257e3d4a..feeb93e6ec97 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -128,6 +128,8 @@ void __init early_snp_set_memory_private(unsigned long vaddr, unsigned long padd void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned int npages); void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op); +void snp_set_memory_shared(unsigned long vaddr, unsigned int npages); +void snp_set_memory_private(unsigned long vaddr, unsigned int npages); #else static inline void sev_es_ist_enter(struct pt_regs *regs) { } static inline void sev_es_ist_exit(void) { } @@ -142,6 +144,8 @@ early_snp_set_memory_private(unsigned long vaddr, 
unsigned long paddr, unsigned static inline void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned int npages) { } static inline void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) { } +static inline void snp_set_memory_shared(unsigned long vaddr, unsigned int npages) { } +static inline void snp_set_memory_private(unsigned long vaddr, unsigned int npages) { } #endif #endif diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index b0ad00f4c1e1..64404b47b773 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -108,6 +108,7 @@ #define SVM_VMGEXIT_AP_JUMP_TABLE 0x80000005 #define SVM_VMGEXIT_SET_AP_JUMP_TABLE 0 #define SVM_VMGEXIT_GET_AP_JUMP_TABLE 1 +#define SVM_VMGEXIT_PSC 0x80000010 #define SVM_VMGEXIT_HV_FEATURES 0x8000fffd #define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffff @@ -219,6 +220,7 @@ { SVM_VMGEXIT_NMI_COMPLETE, "vmgexit_nmi_complete" }, \ { SVM_VMGEXIT_AP_HLT_LOOP, "vmgexit_ap_hlt_loop" }, \ { SVM_VMGEXIT_AP_JUMP_TABLE, "vmgexit_ap_jump_table" }, \ + { SVM_VMGEXIT_PSC, "vmgexit_page_state_change" }, \ { SVM_VMGEXIT_HV_FEATURES, "vmgexit_hypervisor_feature" }, \ { SVM_EXIT_ERR, "invalid_guest_state" } diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index e3fca8615fe8..bf4b57835694 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -655,6 +655,174 @@ void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op WARN(1, "invalid memory op %d\n", op); } +static int vmgexit_psc(struct snp_psc_desc *desc) +{ + int cur_entry, end_entry, ret = 0; + struct snp_psc_desc *data; + struct ghcb_state state; + struct es_em_ctxt ctxt; + unsigned long flags; + struct ghcb *ghcb; + + /* + * __sev_get_ghcb() needs to run with IRQs disabled because it is using + * a per-CPU GHCB. + */ + local_irq_save(flags); + + ghcb = __sev_get_ghcb(&state); + if (!ghcb) { + ret = 1; + goto out_unlock; + } + + /* Copy the input desc into GHCB shared buffer */ + data = (struct snp_psc_desc *)ghcb->shared_buffer; + memcpy(ghcb->shared_buffer, desc, min_t(int, GHCB_SHARED_BUF_SIZE, sizeof(*desc))); + + /* + * As per the GHCB specification, the hypervisor can resume the guest + * before processing all the entries. Check whether all the entries + * are processed. If not, then keep retrying. Note, the hypervisor + * will update the data memory directly to indicate the status, so + * reference the data->hdr everywhere. + * + * The strategy here is to wait for the hypervisor to change the page + * state in the RMP table before guest accesses the memory pages. If the + * page state change was not successful, then later memory access will + * result in a crash. + */ + cur_entry = data->hdr.cur_entry; + end_entry = data->hdr.end_entry; + + while (data->hdr.cur_entry <= data->hdr.end_entry) { + ghcb_set_sw_scratch(ghcb, (u64)__pa(data)); + + /* This will advance the shared buffer data points to. */ + ret = sev_es_ghcb_hv_call(ghcb, true, &ctxt, SVM_VMGEXIT_PSC, 0, 0); + + /* + * Page State Change VMGEXIT can pass error code through + * exit_info_2. + */ + if (WARN(ret || ghcb->save.sw_exit_info_2, + "SNP: PSC failed ret=%d exit_info_2=%llx\n", + ret, ghcb->save.sw_exit_info_2)) { + ret = 1; + goto out; + } + + /* Verify that reserved bit is not set */ + if (WARN(data->hdr.reserved, "Reserved bit is set in the PSC header\n")) { + ret = 1; + goto out; + } + + /* + * Sanity check that entry processing is not going backwards. 
+ * This will happen only if hypervisor is tricking us. + */ + if (WARN(data->hdr.end_entry > end_entry || cur_entry > data->hdr.cur_entry, +"SNP: PSC processing going backward, end_entry %d (got %d) cur_entry %d (got %d)\n", + end_entry, data->hdr.end_entry, cur_entry, data->hdr.cur_entry)) { + ret = 1; + goto out; + } + } + +out: + __sev_put_ghcb(&state); + +out_unlock: + local_irq_restore(flags); + + return ret; +} + +static void __set_pages_state(struct snp_psc_desc *data, unsigned long vaddr, + unsigned long vaddr_end, int op) +{ + struct psc_hdr *hdr; + struct psc_entry *e; + unsigned long pfn; + int i; + + hdr = &data->hdr; + e = data->entries; + + memset(data, 0, sizeof(*data)); + i = 0; + + while (vaddr < vaddr_end) { + if (is_vmalloc_addr((void *)vaddr)) + pfn = vmalloc_to_pfn((void *)vaddr); + else + pfn = __pa(vaddr) >> PAGE_SHIFT; + + e->gfn = pfn; + e->operation = op; + hdr->end_entry = i; + + /* + * Current SNP implementation doesn't keep track of the RMP page + * size so use 4K for simplicity. + */ + e->pagesize = RMP_PG_SIZE_4K; + + vaddr = vaddr + PAGE_SIZE; + e++; + i++; + } + + if (vmgexit_psc(data)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC); +} + +static void set_pages_state(unsigned long vaddr, unsigned int npages, int op) +{ + unsigned long vaddr_end, next_vaddr; + struct snp_psc_desc *desc; + + desc = kmalloc(sizeof(*desc), GFP_KERNEL_ACCOUNT); + if (!desc) + panic("SNP: failed to allocate memory for PSC descriptor\n"); + + vaddr = vaddr & PAGE_MASK; + vaddr_end = vaddr + (npages << PAGE_SHIFT); + + while (vaddr < vaddr_end) { + /* Calculate the last vaddr that fits in one struct snp_psc_desc. */ + next_vaddr = min_t(unsigned long, vaddr_end, + (VMGEXIT_PSC_MAX_ENTRY * PAGE_SIZE) + vaddr); + + __set_pages_state(desc, vaddr, next_vaddr, op); + + vaddr = next_vaddr; + } + + kfree(desc); +} + +void snp_set_memory_shared(unsigned long vaddr, unsigned int npages) +{ + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return; + + pvalidate_pages(vaddr, npages, false); + + set_pages_state(vaddr, npages, SNP_PAGE_STATE_SHARED); +} + +void snp_set_memory_private(unsigned long vaddr, unsigned int npages) +{ + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return; + + set_pages_state(vaddr, npages, SNP_PAGE_STATE_PRIVATE); + + pvalidate_pages(vaddr, npages, true); +} + int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { u16 startup_cs, startup_ip; diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c index 8539dd6f24ff..d3c88d9ef8d6 100644 --- a/arch/x86/mm/mem_encrypt_amd.c +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -316,11 +316,24 @@ static void enc_dec_hypercall(unsigned long vaddr, int npages, bool enc) static void amd_enc_status_change_prepare(unsigned long vaddr, int npages, bool enc) { + /* + * To maintain the security guarantees of SEV-SNP guests, make sure + * to invalidate the memory before encryption attribute is cleared. + */ + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && !enc) + snp_set_memory_shared(vaddr, npages); } /* Return true unconditionally: return value doesn't matter for the SEV side */ static bool amd_enc_status_change_finish(unsigned long vaddr, int npages, bool enc) { + /* + * After memory is mapped encrypted in the page table, validate it + * so that it is consistent with the page table updates. 
+ */ + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP) && enc) + snp_set_memory_private(vaddr, npages); + if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) enc_dec_hypercall(vaddr, npages, enc); -- cgit v1.2.3-59-g8ed1b From 0afb6b660a6b58cb336d1175ed687bf9525849a4 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 7 Mar 2022 15:33:32 -0600 Subject: x86/sev: Use SEV-SNP AP creation to start secondary CPUs To provide a more secure way to start APs under SEV-SNP, use the SEV-SNP AP Creation NAE event. This allows for guest control over the AP register state rather than trusting the hypervisor with the SEV-ES Jump Table address. During native_smp_prepare_cpus(), invoke an SEV-SNP function that, if SEV-SNP is active, will set/override apic->wakeup_secondary_cpu. This will allow the SEV-SNP AP Creation NAE event method to be used to boot the APs. As a result of installing the override when SEV-SNP is active, this method of starting the APs becomes the required method. The override function will fail to start the AP if the hypervisor does not have support for AP creation. [ bp: Work in forgotten review comments. ] Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-23-brijesh.singh@amd.com --- arch/x86/include/asm/sev-common.h | 1 + arch/x86/include/asm/sev.h | 4 + arch/x86/include/uapi/asm/svm.h | 5 + arch/x86/kernel/sev.c | 242 ++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/smpboot.c | 3 + 5 files changed, 255 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 1aa72b5c2490..e9b6815b3b3d 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -104,6 +104,7 @@ enum psc_op { (((u64)(v) & GENMASK_ULL(63, 12)) >> 12) #define GHCB_HV_FT_SNP BIT_ULL(0) +#define GHCB_HV_FT_SNP_AP_CREATION BIT_ULL(1) /* SNP Page State Change NAE event */ #define VMGEXIT_PSC_MAX_ENTRY 253 diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index feeb93e6ec97..a3203b2caaca 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -66,6 +66,8 @@ extern bool handle_vc_boot_ghcb(struct pt_regs *regs); /* RMP page size */ #define RMP_PG_SIZE_4K 0 +#define RMPADJUST_VMSA_PAGE_BIT BIT(16) + #ifdef CONFIG_AMD_MEM_ENCRYPT extern struct static_key_false sev_es_enable_key; extern void __sev_es_ist_enter(struct pt_regs *regs); @@ -130,6 +132,7 @@ void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op); void snp_set_memory_shared(unsigned long vaddr, unsigned int npages); void snp_set_memory_private(unsigned long vaddr, unsigned int npages); +void snp_set_wakeup_secondary_cpu(void); #else static inline void sev_es_ist_enter(struct pt_regs *regs) { } static inline void sev_es_ist_exit(void) { } @@ -146,6 +149,7 @@ early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr, unsigned i static inline void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op) { } static inline void snp_set_memory_shared(unsigned long vaddr, unsigned int npages) { } static inline void snp_set_memory_private(unsigned long vaddr, unsigned int npages) { } +static inline void snp_set_wakeup_secondary_cpu(void) { } #endif #endif diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index 64404b47b773..cfea5e875088 100644 --- 
a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -109,6 +109,10 @@ #define SVM_VMGEXIT_SET_AP_JUMP_TABLE 0 #define SVM_VMGEXIT_GET_AP_JUMP_TABLE 1 #define SVM_VMGEXIT_PSC 0x80000010 +#define SVM_VMGEXIT_AP_CREATION 0x80000013 +#define SVM_VMGEXIT_AP_CREATE_ON_INIT 0 +#define SVM_VMGEXIT_AP_CREATE 1 +#define SVM_VMGEXIT_AP_DESTROY 2 #define SVM_VMGEXIT_HV_FEATURES 0x8000fffd #define SVM_VMGEXIT_UNSUPPORTED_EVENT 0x8000ffff @@ -221,6 +225,7 @@ { SVM_VMGEXIT_AP_HLT_LOOP, "vmgexit_ap_hlt_loop" }, \ { SVM_VMGEXIT_AP_JUMP_TABLE, "vmgexit_ap_jump_table" }, \ { SVM_VMGEXIT_PSC, "vmgexit_page_state_change" }, \ + { SVM_VMGEXIT_AP_CREATION, "vmgexit_ap_creation" }, \ { SVM_VMGEXIT_HV_FEATURES, "vmgexit_hypervisor_feature" }, \ { SVM_EXIT_ERR, "invalid_guest_state" } diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index bf4b57835694..d7915ae7729d 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -31,9 +32,26 @@ #include #include #include +#include #define DR7_RESET_VALUE 0x400 +/* AP INIT values as documented in the APM2 section "Processor Initialization State" */ +#define AP_INIT_CS_LIMIT 0xffff +#define AP_INIT_DS_LIMIT 0xffff +#define AP_INIT_LDTR_LIMIT 0xffff +#define AP_INIT_GDTR_LIMIT 0xffff +#define AP_INIT_IDTR_LIMIT 0xffff +#define AP_INIT_TR_LIMIT 0xffff +#define AP_INIT_RFLAGS_DEFAULT 0x2 +#define AP_INIT_DR6_DEFAULT 0xffff0ff0 +#define AP_INIT_GPAT_DEFAULT 0x0007040600070406ULL +#define AP_INIT_XCR0_DEFAULT 0x1 +#define AP_INIT_X87_FTW_DEFAULT 0x5555 +#define AP_INIT_X87_FCW_DEFAULT 0x0040 +#define AP_INIT_CR0_DEFAULT 0x60000010 +#define AP_INIT_MXCSR_DEFAULT 0x1f80 + /* For early boot hypervisor communication in SEV-ES enabled guests */ static struct ghcb boot_ghcb_page __bss_decrypted __aligned(PAGE_SIZE); @@ -90,6 +108,8 @@ struct ghcb_state { static DEFINE_PER_CPU(struct sev_es_runtime_data*, runtime_data); DEFINE_STATIC_KEY_FALSE(sev_es_enable_key); +static DEFINE_PER_CPU(struct sev_es_save_area *, sev_vmsa); + static __always_inline bool on_vc_stack(struct pt_regs *regs) { unsigned long sp = regs->sp; @@ -823,6 +843,228 @@ void snp_set_memory_private(unsigned long vaddr, unsigned int npages) pvalidate_pages(vaddr, npages, true); } +static int snp_set_vmsa(void *va, bool vmsa) +{ + u64 attrs; + + /* + * Running at VMPL0 allows the kernel to change the VMSA bit for a page + * using the RMPADJUST instruction. However, for the instruction to + * succeed it must target the permissions of a lesser privileged + * (higher numbered) VMPL level, so use VMPL1 (refer to the RMPADJUST + * instruction in the AMD64 APM Volume 3). + */ + attrs = 1; + if (vmsa) + attrs |= RMPADJUST_VMSA_PAGE_BIT; + + return rmpadjust((unsigned long)va, RMP_PG_SIZE_4K, attrs); +} + +#define __ATTR_BASE (SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK) +#define INIT_CS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_READ_MASK | SVM_SELECTOR_CODE_MASK) +#define INIT_DS_ATTRIBS (__ATTR_BASE | SVM_SELECTOR_WRITE_MASK) + +#define INIT_LDTR_ATTRIBS (SVM_SELECTOR_P_MASK | 2) +#define INIT_TR_ATTRIBS (SVM_SELECTOR_P_MASK | 3) + +static void *snp_alloc_vmsa_page(void) +{ + struct page *p; + + /* + * Allocate VMSA page to work around the SNP erratum where the CPU will + * incorrectly signal an RMP violation #PF if a large page (2MB or 1GB) + * collides with the RMP entry of VMSA page. The recommended workaround + * is to not use a large page. + * + * Allocate an 8k page which is also 8k-aligned. 
+ */ + p = alloc_pages(GFP_KERNEL_ACCOUNT | __GFP_ZERO, 1); + if (!p) + return NULL; + + split_page(p, 1); + + /* Free the first 4k. This page may be 2M/1G aligned and cannot be used. */ + __free_page(p); + + return page_address(p + 1); +} + +static void snp_cleanup_vmsa(struct sev_es_save_area *vmsa) +{ + int err; + + err = snp_set_vmsa(vmsa, false); + if (err) + pr_err("clear VMSA page failed (%u), leaking page\n", err); + else + free_page((unsigned long)vmsa); +} + +static int wakeup_cpu_via_vmgexit(int apic_id, unsigned long start_ip) +{ + struct sev_es_save_area *cur_vmsa, *vmsa; + struct ghcb_state state; + unsigned long flags; + struct ghcb *ghcb; + u8 sipi_vector; + int cpu, ret; + u64 cr4; + + /* + * The hypervisor SNP feature support check has happened earlier, just check + * the AP_CREATION one here. + */ + if (!(sev_hv_features & GHCB_HV_FT_SNP_AP_CREATION)) + return -EOPNOTSUPP; + + /* + * Verify the desired start IP against the known trampoline start IP + * to catch any future new trampolines that may be introduced that + * would require a new protected guest entry point. + */ + if (WARN_ONCE(start_ip != real_mode_header->trampoline_start, + "Unsupported SNP start_ip: %lx\n", start_ip)) + return -EINVAL; + + /* Override start_ip with known protected guest start IP */ + start_ip = real_mode_header->sev_es_trampoline_start; + + /* Find the logical CPU for the APIC ID */ + for_each_present_cpu(cpu) { + if (arch_match_cpu_phys_id(cpu, apic_id)) + break; + } + if (cpu >= nr_cpu_ids) + return -EINVAL; + + cur_vmsa = per_cpu(sev_vmsa, cpu); + + /* + * A new VMSA is created each time because there is no guarantee that + * the current VMSA is the kernels or that the vCPU is not running. If + * an attempt was done to use the current VMSA with a running vCPU, a + * #VMEXIT of that vCPU would wipe out all of the settings being done + * here. + */ + vmsa = (struct sev_es_save_area *)snp_alloc_vmsa_page(); + if (!vmsa) + return -ENOMEM; + + /* CR4 should maintain the MCE value */ + cr4 = native_read_cr4() & X86_CR4_MCE; + + /* Set the CS value based on the start_ip converted to a SIPI vector */ + sipi_vector = (start_ip >> 12); + vmsa->cs.base = sipi_vector << 12; + vmsa->cs.limit = AP_INIT_CS_LIMIT; + vmsa->cs.attrib = INIT_CS_ATTRIBS; + vmsa->cs.selector = sipi_vector << 8; + + /* Set the RIP value based on start_ip */ + vmsa->rip = start_ip & 0xfff; + + /* Set AP INIT defaults as documented in the APM */ + vmsa->ds.limit = AP_INIT_DS_LIMIT; + vmsa->ds.attrib = INIT_DS_ATTRIBS; + vmsa->es = vmsa->ds; + vmsa->fs = vmsa->ds; + vmsa->gs = vmsa->ds; + vmsa->ss = vmsa->ds; + + vmsa->gdtr.limit = AP_INIT_GDTR_LIMIT; + vmsa->ldtr.limit = AP_INIT_LDTR_LIMIT; + vmsa->ldtr.attrib = INIT_LDTR_ATTRIBS; + vmsa->idtr.limit = AP_INIT_IDTR_LIMIT; + vmsa->tr.limit = AP_INIT_TR_LIMIT; + vmsa->tr.attrib = INIT_TR_ATTRIBS; + + vmsa->cr4 = cr4; + vmsa->cr0 = AP_INIT_CR0_DEFAULT; + vmsa->dr7 = DR7_RESET_VALUE; + vmsa->dr6 = AP_INIT_DR6_DEFAULT; + vmsa->rflags = AP_INIT_RFLAGS_DEFAULT; + vmsa->g_pat = AP_INIT_GPAT_DEFAULT; + vmsa->xcr0 = AP_INIT_XCR0_DEFAULT; + vmsa->mxcsr = AP_INIT_MXCSR_DEFAULT; + vmsa->x87_ftw = AP_INIT_X87_FTW_DEFAULT; + vmsa->x87_fcw = AP_INIT_X87_FCW_DEFAULT; + + /* SVME must be set. 
*/ + vmsa->efer = EFER_SVME; + + /* + * Set the SNP-specific fields for this VMSA: + * VMPL level + * SEV_FEATURES (matches the SEV STATUS MSR right shifted 2 bits) + */ + vmsa->vmpl = 0; + vmsa->sev_features = sev_status >> 2; + + /* Switch the page over to a VMSA page now that it is initialized */ + ret = snp_set_vmsa(vmsa, true); + if (ret) { + pr_err("set VMSA page failed (%u)\n", ret); + free_page((unsigned long)vmsa); + + return -EINVAL; + } + + /* Issue VMGEXIT AP Creation NAE event */ + local_irq_save(flags); + + ghcb = __sev_get_ghcb(&state); + + vc_ghcb_invalidate(ghcb); + ghcb_set_rax(ghcb, vmsa->sev_features); + ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_CREATION); + ghcb_set_sw_exit_info_1(ghcb, ((u64)apic_id << 32) | SVM_VMGEXIT_AP_CREATE); + ghcb_set_sw_exit_info_2(ghcb, __pa(vmsa)); + + sev_es_wr_ghcb_msr(__pa(ghcb)); + VMGEXIT(); + + if (!ghcb_sw_exit_info_1_is_valid(ghcb) || + lower_32_bits(ghcb->save.sw_exit_info_1)) { + pr_err("SNP AP Creation error\n"); + ret = -EINVAL; + } + + __sev_put_ghcb(&state); + + local_irq_restore(flags); + + /* Perform cleanup if there was an error */ + if (ret) { + snp_cleanup_vmsa(vmsa); + vmsa = NULL; + } + + /* Free up any previous VMSA page */ + if (cur_vmsa) + snp_cleanup_vmsa(cur_vmsa); + + /* Record the current VMSA page */ + per_cpu(sev_vmsa, cpu) = vmsa; + + return ret; +} + +void snp_set_wakeup_secondary_cpu(void) +{ + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return; + + /* + * Always set this override if SNP is enabled. This makes it the + * required method to start APs under SNP. If the hypervisor does + * not support AP creation, then no APs will be started. + */ + apic->wakeup_secondary_cpu = wakeup_cpu_via_vmgexit; +} + int sev_es_setup_ap_jump_table(struct real_mode_header *rmh) { u16 startup_cs, startup_ip; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2ef14772dc04..85d7b1c5eb70 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -82,6 +82,7 @@ #include #include #include +#include /* representing HT siblings of each logical CPU */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); @@ -1430,6 +1431,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) smp_quirk_init_udelay(); speculative_store_bypass_ht_init(); + + snp_set_wakeup_secondary_cpu(); } void arch_thaw_secondary_cpus_begin(void) -- cgit v1.2.3-59-g8ed1b From 469693d8f62299709e8ba56d8fb3da9ea990213c Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Wed, 9 Feb 2022 12:10:17 -0600 Subject: x86/head/64: Re-enable stack protection Due to 103a4908ad4d ("x86/head/64: Disable stack protection for head$(BITS).o") kernel/head{32,64}.c are compiled with -fno-stack-protector to allow a call to set_bringup_idt_handler(), which would otherwise have stack protection enabled with CONFIG_STACKPROTECTOR_STRONG. While sufficient for that case, there may still be issues with calls to any external functions that were compiled with stack protection enabled that in-turn make stack-protected calls, or if the exception handlers set up by set_bringup_idt_handler() make calls to stack-protected functions. Subsequent patches for SEV-SNP CPUID validation support will introduce both such cases. Attempting to disable stack protection for everything in scope to address that is prohibitive since much of the code, like the SEV-ES #VC handler, is shared code that remains in use after boot and could benefit from having stack protection enabled. 
Attempting to inline calls is brittle and can quickly balloon out to library/helper code where that's not really an option. Instead, re-enable stack protection for head32.c/head64.c, and make the appropriate changes to ensure the segment used for the stack canary is initialized in advance of any stack-protected C calls. For head64.c: - The BSP will enter from startup_64() and call into C code (startup_64_setup_env()) shortly after setting up the stack, which may result in calls to stack-protected code. Set up %gs early to allow for this safely. - APs will enter from secondary_startup_64*(), and %gs will be set up soon after. There is one call to C code prior to %gs being setup (__startup_secondary_64()), but it is only to fetch 'sme_me_mask' global, so just load 'sme_me_mask' directly instead, and remove the now-unused __startup_secondary_64() function. For head32.c: - BSPs/APs will set %fs to __BOOT_DS prior to any C calls. In recent kernels, the compiler is configured to access the stack canary at %fs:__stack_chk_guard [1], which overlaps with the initial per-cpu '__stack_chk_guard' variable in the initial/"master" .data..percpu area. This is sufficient to allow access to the canary for use during initial startup, so no changes are needed there. [1] 3fb0fdb3bbe7 ("x86/stackprotector/32: Make the canary into a regular percpu variable") [ bp: Massage commit message. ] Suggested-by: Joerg Roedel #for 64-bit %gs set up Signed-off-by: Michael Roth Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-24-brijesh.singh@amd.com --- arch/x86/include/asm/setup.h | 1 - arch/x86/kernel/Makefile | 2 -- arch/x86/kernel/head64.c | 9 --------- arch/x86/kernel/head_64.S | 24 +++++++++++++++++++++--- 4 files changed, 21 insertions(+), 15 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 896e48d45828..a1b107f2a12a 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -50,7 +50,6 @@ extern unsigned long saved_video_mode; extern void reserve_standard_io_resources(void); extern void i386_reserve_resources(void); extern unsigned long __startup_64(unsigned long physaddr, struct boot_params *bp); -extern unsigned long __startup_secondary_64(void); extern void startup_64_setup_env(unsigned long physbase); extern void early_setup_idt(void); extern void __init do_early_exception(struct pt_regs *regs, int trapnr); diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index c41ef42adbe8..1a2dc328cb5e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -46,8 +46,6 @@ endif # non-deterministic coverage. KCOV_INSTRUMENT := n -CFLAGS_head$(BITS).o += -fno-stack-protector - CFLAGS_irq.o := -I $(srctree)/$(src)/../include/asm/trace obj-y := process_$(BITS).o signal.o diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 656d2f3e2cf0..c185f4831498 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -318,15 +318,6 @@ unsigned long __head __startup_64(unsigned long physaddr, return sme_postprocess_startup(bp, pmd); } -unsigned long __startup_secondary_64(void) -{ - /* - * Return the SME encryption mask (if SME is active) to be used as a - * modifier for the initial pgdir entry programmed into CR3. 
- */ - return sme_get_me_mask(); -} - /* Wipe all early page tables except for the kernel symbol map */ static void __init reset_early_page_tables(void) { diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 6bf340c91165..7bac9a4bdf7c 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -65,6 +65,22 @@ SYM_CODE_START_NOALIGN(startup_64) leaq (__end_init_task - FRAME_SIZE)(%rip), %rsp leaq _text(%rip), %rdi + + /* + * initial_gs points to initial fixed_percpu_data struct with storage for + * the stack protector canary. Global pointer fixups are needed at this + * stage, so apply them as is done in fixup_pointer(), and initialize %gs + * such that the canary can be accessed at %gs:40 for subsequent C calls. + */ + movl $MSR_GS_BASE, %ecx + movq initial_gs(%rip), %rax + movq $_text, %rdx + subq %rdx, %rax + addq %rdi, %rax + movq %rax, %rdx + shrq $32, %rdx + wrmsr + pushq %rsi call startup_64_setup_env popq %rsi @@ -147,9 +163,11 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL) * Retrieve the modifier (SME encryption mask if SME is active) to be * added to the initial pgdir entry that will be programmed into CR3. */ - pushq %rsi - call __startup_secondary_64 - popq %rsi +#ifdef CONFIG_AMD_MEM_ENCRYPT + movq sme_me_mask, %rax +#else + xorq %rax, %rax +#endif /* Form the CR3 value being sure to include the CR3 modifier */ addq $(init_top_pgt - __START_KERNEL_map), %rax -- cgit v1.2.3-59-g8ed1b From 5ea98e01ab524cbc53dad8aebd27b434ebe5d074 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 7 Mar 2022 15:33:39 -0600 Subject: x86/boot: Add Confidential Computing type to setup_data While launching encrypted guests, the hypervisor may need to provide some additional information during the guest boot. When booting under an EFI-based BIOS, the EFI configuration table contains an entry for the confidential computing blob that contains the required information. To support booting encrypted guests on non-EFI VMs, the hypervisor needs to pass this additional information to the guest kernel using a different method. For this purpose, introduce SETUP_CC_BLOB type in setup_data to hold the physical address of the confidential computing blob location. The boot loader or hypervisor may choose to use this method instead of an EFI configuration table. The CC blob location scanning should give preference to a setup_data blob over an EFI configuration table. In AMD SEV-SNP, the CC blob contains the address of the secrets and CPUID pages. The secrets page includes information such as a VM to PSP communication key and the CPUID page contains PSP-filtered CPUID values. Define the AMD SEV confidential computing blob structure. While at it, define the EFI GUID for the confidential computing blob. [ bp: Massage commit message, mark struct __packed. ] Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Acked-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20220307213356.2797205-30-brijesh.singh@amd.com --- arch/x86/include/asm/sev.h | 18 ++++++++++++++++++ arch/x86/include/uapi/asm/bootparam.h | 1 + include/linux/efi.h | 1 + 3 files changed, 20 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index a3203b2caaca..8c934bda2a90 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -42,6 +42,24 @@ struct es_em_ctxt { struct es_fault_info fi; }; +/* + * AMD SEV Confidential computing blob structure. 
The structure is + * defined in OVMF UEFI firmware header: + * https://github.com/tianocore/edk2/blob/master/OvmfPkg/Include/Guid/ConfidentialComputingSevSnpBlob.h + */ +#define CC_BLOB_SEV_HDR_MAGIC 0x45444d41 +struct cc_blob_sev_info { + u32 magic; + u16 version; + u16 reserved; + u64 secrets_phys; + u32 secrets_len; + u32 rsvd1; + u64 cpuid_phys; + u32 cpuid_len; + u32 rsvd2; +} __packed; + void do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code); static inline u64 lower_bits(u64 val, unsigned int bits) diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index b25d3f82c2f3..1ac5acca72ce 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -10,6 +10,7 @@ #define SETUP_EFI 4 #define SETUP_APPLE_PROPERTIES 5 #define SETUP_JAILHOUSE 6 +#define SETUP_CC_BLOB 7 #define SETUP_INDIRECT (1<<31) diff --git a/include/linux/efi.h b/include/linux/efi.h index ccd4d3f91c98..984aa688997a 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -390,6 +390,7 @@ void efi_native_runtime_setup(void); #define EFI_CERT_SHA256_GUID EFI_GUID(0xc1c41626, 0x504c, 0x4092, 0xac, 0xa9, 0x41, 0xf9, 0x36, 0x93, 0x43, 0x28) #define EFI_CERT_X509_GUID EFI_GUID(0xa5c059a1, 0x94e4, 0x4aa7, 0x87, 0xb5, 0xab, 0x15, 0x5c, 0x2b, 0xf0, 0x72) #define EFI_CERT_X509_SHA256_GUID EFI_GUID(0x3bd2a492, 0x96c0, 0x4079, 0xb4, 0x20, 0xfc, 0xf9, 0x8e, 0xf1, 0x03, 0xed) +#define EFI_CC_BLOB_GUID EFI_GUID(0x067b1f5f, 0xcf26, 0x44c5, 0x85, 0x54, 0x93, 0xd7, 0x77, 0x91, 0x2d, 0x42) /* * This GUID is used to pass to the kernel proper the struct screen_info -- cgit v1.2.3-59-g8ed1b From b66370db9a90b3fa4c4a1a732af3e7e38d6d4c7c Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Thu, 24 Feb 2022 10:56:10 -0600 Subject: KVM: x86: Move lookup of indexed CPUID leafs to helper Determining which CPUID leafs have significant ECX/index values is also needed by guest kernel code when doing SEV-SNP-validated CPUID lookups. Move this to common code to keep future updates in sync. 
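[ Editorial illustration, not part of the patch: the guest-side consumer of
  this helper is the SNP CPUID table lookup added later in this series
  (snp_cpuid_get_validated_func()); entry_matches() below is a hypothetical
  condensation of that lookup, showing why the indexed-leaf check matters. ]

    /* Match a CPUID table entry against a requested leaf/subleaf. */
    static bool entry_matches(const struct snp_cpuid_fn *e, u32 fn, u32 subfn)
    {
            if (e->eax_in != fn)
                    return false;

            /* ECX only disambiguates entries of indexed leafs (0x7, 0xb, 0xd, ...). */
            if (cpuid_function_is_indexed(fn) && e->ecx_in != subfn)
                    return false;

            return true;
    }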
Signed-off-by: Michael Roth Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Reviewed-by: Venu Busireddy Link: https://lore.kernel.org/r/20220307213356.2797205-31-brijesh.singh@amd.com --- arch/x86/include/asm/cpuid.h | 34 ++++++++++++++++++++++++++++++++++ arch/x86/kvm/cpuid.c | 19 ++----------------- 2 files changed, 36 insertions(+), 17 deletions(-) create mode 100644 arch/x86/include/asm/cpuid.h (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cpuid.h b/arch/x86/include/asm/cpuid.h new file mode 100644 index 000000000000..70b2db18165e --- /dev/null +++ b/arch/x86/include/asm/cpuid.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * CPUID-related helpers/definitions + * + * Derived from arch/x86/kvm/cpuid.c + */ + +#ifndef _ASM_X86_CPUID_H +#define _ASM_X86_CPUID_H + +static __always_inline bool cpuid_function_is_indexed(u32 function) +{ + switch (function) { + case 4: + case 7: + case 0xb: + case 0xd: + case 0xf: + case 0x10: + case 0x12: + case 0x14: + case 0x17: + case 0x18: + case 0x1d: + case 0x1e: + case 0x1f: + case 0x8000001d: + return true; + } + + return false; +} + +#endif /* _ASM_X86_CPUID_H */ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index b24ca7f4ed7c..4b62d80bb22f 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "cpuid.h" #include "lapic.h" #include "mmu.h" @@ -744,24 +745,8 @@ static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array, cpuid_count(entry->function, entry->index, &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); - switch (function) { - case 4: - case 7: - case 0xb: - case 0xd: - case 0xf: - case 0x10: - case 0x12: - case 0x14: - case 0x17: - case 0x18: - case 0x1d: - case 0x1e: - case 0x1f: - case 0x8000001d: + if (cpuid_function_is_indexed(function)) entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; - break; - } return entry; } -- cgit v1.2.3-59-g8ed1b From ee0bfa08a345370df28c07288e886abcbaac481f Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Thu, 24 Feb 2022 10:56:12 -0600 Subject: x86/compressed/64: Add support for SEV-SNP CPUID table in #VC handlers CPUID instructions generate a #VC exception for SEV-ES/SEV-SNP guests, for which early handlers are currently set up to handle. In the case of SEV-SNP, guests can use a configurable location in guest memory that has been pre-populated with a firmware-validated CPUID table to look up the relevant CPUID values rather than requesting them from hypervisor via a VMGEXIT. Add the various hooks in the #VC handlers to allow CPUID instructions to be handled via the table. The code to actually configure/enable the table will be added in a subsequent commit. 
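[ Editorial illustration, not part of the patch: the return-value convention
  of snp_cpuid() as the #VC handlers below consume it. resolve_cpuid_leaf()
  is a hypothetical wrapper; the real callers are do_vc_no_ghcb() and
  vc_handle_cpuid() in the hunks that follow. ]

    static int resolve_cpuid_leaf(struct cpuid_leaf *leaf)
    {
            int ret = snp_cpuid(leaf);

            if (!ret)
                    return 0;	/* served from the firmware-validated CPUID table */

            if (ret != -EOPNOTSUPP)
                    return ret;	/* table present but lookup failed: treat as fatal */

            /* No SNP CPUID table configured (e.g. plain SEV-ES): ask the hypervisor. */
            return sev_cpuid_hv(leaf);
    }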
Signed-off-by: Michael Roth Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-33-brijesh.singh@amd.com --- arch/x86/include/asm/sev-common.h | 2 + arch/x86/kernel/sev-shared.c | 324 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 326 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index e9b6815b3b3d..0759af9b1acf 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -152,6 +152,8 @@ struct snp_psc_desc { #define GHCB_TERM_PSC 1 /* Page State Change failure */ #define GHCB_TERM_PVALIDATE 2 /* Pvalidate failure */ #define GHCB_TERM_NOT_VMPL0 3 /* SNP guest is not running at VMPL-0 */ +#define GHCB_TERM_CPUID 4 /* CPUID-validation failure */ +#define GHCB_TERM_CPUID_HV 5 /* CPUID failure during hypervisor fallback */ #define GHCB_RESP_CODE(v) ((v) & GHCB_MSR_INFO_MASK) diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index b4d5558c9d0a..0f1375164ff0 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -24,6 +24,36 @@ struct cpuid_leaf { u32 edx; }; +/* + * Individual entries of the SNP CPUID table, as defined by the SNP + * Firmware ABI, Revision 0.9, Section 7.1, Table 14. + */ +struct snp_cpuid_fn { + u32 eax_in; + u32 ecx_in; + u64 xcr0_in; + u64 xss_in; + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; + u64 __reserved; +} __packed; + +/* + * SNP CPUID table, as defined by the SNP Firmware ABI, Revision 0.9, + * Section 8.14.2.6. Also noted there is the SNP firmware-enforced limit + * of 64 entries per CPUID table. + */ +#define SNP_CPUID_COUNT_MAX 64 + +struct snp_cpuid_table { + u32 count; + u32 __reserved1; + u64 __reserved2; + struct snp_cpuid_fn fn[SNP_CPUID_COUNT_MAX]; +} __packed; + /* * Since feature negotiation related variables are set early in the boot * process they must reside in the .data section so as not to be zeroed @@ -33,6 +63,19 @@ struct cpuid_leaf { */ static u16 ghcb_version __ro_after_init; +/* Copy of the SNP firmware's CPUID page. */ +static struct snp_cpuid_table cpuid_table_copy __ro_after_init; + +/* + * These will be initialized based on CPUID table so that non-present + * all-zero leaves (for sparse tables) can be differentiated from + * invalid/out-of-range leaves. This is needed since all-zero leaves + * still need to be post-processed. + */ +static u32 cpuid_std_range_max __ro_after_init; +static u32 cpuid_hyp_range_max __ro_after_init; +static u32 cpuid_ext_range_max __ro_after_init; + static bool __init sev_es_check_cpu_features(void) { if (!has_cpuflag(X86_FEATURE_RDRAND)) { @@ -242,6 +285,252 @@ static int sev_cpuid_hv(struct cpuid_leaf *leaf) return ret; } +/* + * This may be called early while still running on the initial identity + * mapping. Use RIP-relative addressing to obtain the correct address + * while running with the initial identity mapping as well as the + * switch-over to kernel virtual addresses later. 
+ */ +static const struct snp_cpuid_table *snp_cpuid_get_table(void) +{ + void *ptr; + + asm ("lea cpuid_table_copy(%%rip), %0" + : "=r" (ptr) + : "p" (&cpuid_table_copy)); + + return ptr; +} + +/* + * The SNP Firmware ABI, Revision 0.9, Section 7.1, details the use of + * XCR0_IN and XSS_IN to encode multiple versions of 0xD subfunctions 0 + * and 1 based on the corresponding features enabled by a particular + * combination of XCR0 and XSS registers so that a guest can look up the + * version corresponding to the features currently enabled in its XCR0/XSS + * registers. The only values that differ between these versions/table + * entries is the enabled XSAVE area size advertised via EBX. + * + * While hypervisors may choose to make use of this support, it is more + * robust/secure for a guest to simply find the entry corresponding to the + * base/legacy XSAVE area size (XCR0=1 or XCR0=3), and then calculate the + * XSAVE area size using subfunctions 2 through 64, as documented in APM + * Volume 3, Rev 3.31, Appendix E.3.8, which is what is done here. + * + * Since base/legacy XSAVE area size is documented as 0x240, use that value + * directly rather than relying on the base size in the CPUID table. + * + * Return: XSAVE area size on success, 0 otherwise. + */ +static u32 snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + u64 xfeatures_found = 0; + u32 xsave_size = 0x240; + int i; + + for (i = 0; i < cpuid_table->count; i++) { + const struct snp_cpuid_fn *e = &cpuid_table->fn[i]; + + if (!(e->eax_in == 0xD && e->ecx_in > 1 && e->ecx_in < 64)) + continue; + if (!(xfeatures_en & (BIT_ULL(e->ecx_in)))) + continue; + if (xfeatures_found & (BIT_ULL(e->ecx_in))) + continue; + + xfeatures_found |= (BIT_ULL(e->ecx_in)); + + if (compacted) + xsave_size += e->eax; + else + xsave_size = max(xsave_size, e->eax + e->ebx); + } + + /* + * Either the guest set unsupported XCR0/XSS bits, or the corresponding + * entries in the CPUID table were not present. This is not a valid + * state to be in. + */ + if (xfeatures_found != (xfeatures_en & GENMASK_ULL(63, 2))) + return 0; + + return xsave_size; +} + +static bool +snp_cpuid_get_validated_func(struct cpuid_leaf *leaf) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + int i; + + for (i = 0; i < cpuid_table->count; i++) { + const struct snp_cpuid_fn *e = &cpuid_table->fn[i]; + + if (e->eax_in != leaf->fn) + continue; + + if (cpuid_function_is_indexed(leaf->fn) && e->ecx_in != leaf->subfn) + continue; + + /* + * For 0xD subfunctions 0 and 1, only use the entry corresponding + * to the base/legacy XSAVE area size (XCR0=1 or XCR0=3, XSS=0). + * See the comments above snp_cpuid_calc_xsave_size() for more + * details. 
+ */ + if (e->eax_in == 0xD && (e->ecx_in == 0 || e->ecx_in == 1)) + if (!(e->xcr0_in == 1 || e->xcr0_in == 3) || e->xss_in) + continue; + + leaf->eax = e->eax; + leaf->ebx = e->ebx; + leaf->ecx = e->ecx; + leaf->edx = e->edx; + + return true; + } + + return false; +} + +static void snp_cpuid_hv(struct cpuid_leaf *leaf) +{ + if (sev_cpuid_hv(leaf)) + sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV); +} + +static int snp_cpuid_postprocess(struct cpuid_leaf *leaf) +{ + struct cpuid_leaf leaf_hv = *leaf; + + switch (leaf->fn) { + case 0x1: + snp_cpuid_hv(&leaf_hv); + + /* initial APIC ID */ + leaf->ebx = (leaf_hv.ebx & GENMASK(31, 24)) | (leaf->ebx & GENMASK(23, 0)); + /* APIC enabled bit */ + leaf->edx = (leaf_hv.edx & BIT(9)) | (leaf->edx & ~BIT(9)); + + /* OSXSAVE enabled bit */ + if (native_read_cr4() & X86_CR4_OSXSAVE) + leaf->ecx |= BIT(27); + break; + case 0x7: + /* OSPKE enabled bit */ + leaf->ecx &= ~BIT(4); + if (native_read_cr4() & X86_CR4_PKE) + leaf->ecx |= BIT(4); + break; + case 0xB: + leaf_hv.subfn = 0; + snp_cpuid_hv(&leaf_hv); + + /* extended APIC ID */ + leaf->edx = leaf_hv.edx; + break; + case 0xD: { + bool compacted = false; + u64 xcr0 = 1, xss = 0; + u32 xsave_size; + + if (leaf->subfn != 0 && leaf->subfn != 1) + return 0; + + if (native_read_cr4() & X86_CR4_OSXSAVE) + xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + if (leaf->subfn == 1) { + /* Get XSS value if XSAVES is enabled. */ + if (leaf->eax & BIT(3)) { + unsigned long lo, hi; + + asm volatile("rdmsr" : "=a" (lo), "=d" (hi) + : "c" (MSR_IA32_XSS)); + xss = (hi << 32) | lo; + } + + /* + * The PPR and APM aren't clear on what size should be + * encoded in 0xD:0x1:EBX when compaction is not enabled + * by either XSAVEC (feature bit 1) or XSAVES (feature + * bit 3) since SNP-capable hardware has these feature + * bits fixed as 1. KVM sets it to 0 in this case, but + * to avoid this becoming an issue it's safer to simply + * treat this as unsupported for SNP guests. + */ + if (!(leaf->eax & (BIT(1) | BIT(3)))) + return -EINVAL; + + compacted = true; + } + + xsave_size = snp_cpuid_calc_xsave_size(xcr0 | xss, compacted); + if (!xsave_size) + return -EINVAL; + + leaf->ebx = xsave_size; + } + break; + case 0x8000001E: + snp_cpuid_hv(&leaf_hv); + + /* extended APIC ID */ + leaf->eax = leaf_hv.eax; + /* compute ID */ + leaf->ebx = (leaf->ebx & GENMASK(31, 8)) | (leaf_hv.ebx & GENMASK(7, 0)); + /* node ID */ + leaf->ecx = (leaf->ecx & GENMASK(31, 8)) | (leaf_hv.ecx & GENMASK(7, 0)); + break; + default: + /* No fix-ups needed, use values as-is. */ + break; + } + + return 0; +} + +/* + * Returns -EOPNOTSUPP if feature not enabled. Any other non-zero return value + * should be treated as fatal by caller. + */ +static int snp_cpuid(struct cpuid_leaf *leaf) +{ + const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table(); + + if (!cpuid_table->count) + return -EOPNOTSUPP; + + if (!snp_cpuid_get_validated_func(leaf)) { + /* + * Some hypervisors will avoid keeping track of CPUID entries + * where all values are zero, since they can be handled the + * same as out-of-range values (all-zero). This is useful here + * as well as it allows virtually all guest configurations to + * work using a single SNP CPUID table. + * + * To allow for this, there is a need to distinguish between + * out-of-range entries and in-range zero entries, since the + * CPUID table entries are only a template that may need to be + * augmented with additional values for things like + * CPU-specific information during post-processing. 
So if it's + * not in the table, set the values to zero. Then, if they are + * within a valid CPUID range, proceed with post-processing + * using zeros as the initial values. Otherwise, skip + * post-processing and just return zeros immediately. + */ + leaf->eax = leaf->ebx = leaf->ecx = leaf->edx = 0; + + /* Skip post-processing for out-of-range zero leafs. */ + if (!(leaf->fn <= cpuid_std_range_max || + (leaf->fn >= 0x40000000 && leaf->fn <= cpuid_hyp_range_max) || + (leaf->fn >= 0x80000000 && leaf->fn <= cpuid_ext_range_max))) + return 0; + } + + return snp_cpuid_postprocess(leaf); +} + /* * Boot VC Handler - This is the first VC handler during boot, there is no GHCB * page yet, so it only supports the MSR based communication with the @@ -252,6 +541,7 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) unsigned int subfn = lower_bits(regs->cx, 32); unsigned int fn = lower_bits(regs->ax, 32); struct cpuid_leaf leaf; + int ret; /* Only CPUID is supported via MSR protocol */ if (exit_code != SVM_EXIT_CPUID) @@ -259,9 +549,18 @@ void __init do_vc_no_ghcb(struct pt_regs *regs, unsigned long exit_code) leaf.fn = fn; leaf.subfn = subfn; + + ret = snp_cpuid(&leaf); + if (!ret) + goto cpuid_done; + + if (ret != -EOPNOTSUPP) + goto fail; + if (sev_cpuid_hv(&leaf)) goto fail; +cpuid_done: regs->ax = leaf.eax; regs->bx = leaf.ebx; regs->cx = leaf.ecx; @@ -556,12 +855,37 @@ static enum es_result vc_handle_ioio(struct ghcb *ghcb, struct es_em_ctxt *ctxt) return ret; } +static int vc_handle_cpuid_snp(struct pt_regs *regs) +{ + struct cpuid_leaf leaf; + int ret; + + leaf.fn = regs->ax; + leaf.subfn = regs->cx; + ret = snp_cpuid(&leaf); + if (!ret) { + regs->ax = leaf.eax; + regs->bx = leaf.ebx; + regs->cx = leaf.ecx; + regs->dx = leaf.edx; + } + + return ret; +} + static enum es_result vc_handle_cpuid(struct ghcb *ghcb, struct es_em_ctxt *ctxt) { struct pt_regs *regs = ctxt->regs; u32 cr4 = native_read_cr4(); enum es_result ret; + int snp_cpuid_ret; + + snp_cpuid_ret = vc_handle_cpuid_snp(regs); + if (!snp_cpuid_ret) + return ES_OK; + if (snp_cpuid_ret != -EOPNOTSUPP) + return ES_VMM_ERROR; ghcb_set_rax(ghcb, regs->ax); ghcb_set_rcx(ghcb, regs->cx); -- cgit v1.2.3-59-g8ed1b From 8c9c509baf660f1062bc758c26008b7f9bbc39f3 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Thu, 24 Feb 2022 10:56:13 -0600 Subject: x86/boot: Add a pointer to Confidential Computing blob in bootparams The previously defined Confidential Computing blob is provided to the kernel via a setup_data structure or EFI config table entry. Currently, these are both checked for by boot/compressed kernel to access the CPUID table address within it for use with SEV-SNP CPUID enforcement. To also enable that enforcement for the run-time kernel, similar access to the CPUID table is needed early on while it's still using the identity-mapped page table set up by boot/compressed, where global pointers need to be accessed via fixup_pointer(). This isn't much of an issue for accessing setup_data, and the EFI config table helper code currently used in boot/compressed *could* be used in this case as well since they both rely on identity-mapping. However, it has some reliance on EFI helpers/string constants that would need to be accessed via fixup_pointer(), and fixing it up while making it shareable between boot/compressed and run-time kernel is fragile and introduces a good bit of ugliness. 
Instead, add a boot_params->cc_blob_address pointer that the boot/compressed kernel can initialize so that the run-time kernel can access the CC blob from there instead of re-scanning the EFI config table. Also document these in Documentation/x86/zero-page.rst. While there, add missing documentation for the acpi_rsdp_addr field, which serves a similar purpose in providing the run-time kernel a pointer to the ACPI RSDP table so that it does not need to [re-]scan the EFI configuration table. [ bp: Fix typos, massage commit message. ] Signed-off-by: Michael Roth Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-34-brijesh.singh@amd.com --- Documentation/x86/zero-page.rst | 2 ++ arch/x86/include/asm/bootparam_utils.h | 1 + arch/x86/include/uapi/asm/bootparam.h | 3 ++- 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/Documentation/x86/zero-page.rst b/Documentation/x86/zero-page.rst index f088f5881666..45aa9cceb4f1 100644 --- a/Documentation/x86/zero-page.rst +++ b/Documentation/x86/zero-page.rst @@ -19,6 +19,7 @@ Offset/Size Proto Name Meaning 058/008 ALL tboot_addr Physical address of tboot shared page 060/010 ALL ist_info Intel SpeedStep (IST) BIOS support information (struct ist_info) +070/008 ALL acpi_rsdp_addr Physical address of ACPI RSDP table 080/010 ALL hd0_info hd0 disk parameter, OBSOLETE!! 090/010 ALL hd1_info hd1 disk parameter, OBSOLETE!! 0A0/010 ALL sys_desc_table System description table (struct sys_desc_table), @@ -27,6 +28,7 @@ Offset/Size Proto Name Meaning 0C0/004 ALL ext_ramdisk_image ramdisk_image high 32bits 0C4/004 ALL ext_ramdisk_size ramdisk_size high 32bits 0C8/004 ALL ext_cmd_line_ptr cmd_line_ptr high 32bits +13C/004 ALL cc_blob_address Physical address of Confidential Computing blob 140/080 ALL edid_info Video mode setup (struct edid_info) 1C0/020 ALL efi_info EFI 32 information (struct efi_info) 1E0/004 ALL alt_mem_k Alternative mem check, in KB diff --git a/arch/x86/include/asm/bootparam_utils.h b/arch/x86/include/asm/bootparam_utils.h index 981fe923a59f..53e9b0620d96 100644 --- a/arch/x86/include/asm/bootparam_utils.h +++ b/arch/x86/include/asm/bootparam_utils.h @@ -74,6 +74,7 @@ static void sanitize_boot_params(struct boot_params *boot_params) BOOT_PARAM_PRESERVE(hdr), BOOT_PARAM_PRESERVE(e820_table), BOOT_PARAM_PRESERVE(eddbuf), + BOOT_PARAM_PRESERVE(cc_blob_address), }; memset(&scratch, 0, sizeof(scratch)); diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 1ac5acca72ce..bea5cdcdf532 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -188,7 +188,8 @@ struct boot_params { __u32 ext_ramdisk_image; /* 0x0c0 */ __u32 ext_ramdisk_size; /* 0x0c4 */ __u32 ext_cmd_line_ptr; /* 0x0c8 */ - __u8 _pad4[116]; /* 0x0cc */ + __u8 _pad4[112]; /* 0x0cc */ + __u32 cc_blob_address; /* 0x13c */ struct edid_info edid_info; /* 0x140 */ struct efi_info efi_info; /* 0x1c0 */ __u32 alt_mem_k; /* 0x1e0 */ -- cgit v1.2.3-59-g8ed1b From c01fce9cef8491974f7f007f90281f1608400768 Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Thu, 24 Feb 2022 10:56:14 -0600 Subject: x86/compressed: Add SEV-SNP feature detection/setup Initial/preliminary detection of SEV-SNP is done via the Confidential Computing blob. Check for it prior to the normal SEV/SME feature initialization, and add some sanity checks to confirm it agrees with SEV-SNP CPUID/MSR bits. 
Signed-off-by: Michael Roth Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-35-brijesh.singh@amd.com --- arch/x86/boot/compressed/sev.c | 112 ++++++++++++++++++++++++++++++++++++++++- arch/x86/include/asm/sev.h | 3 ++ 2 files changed, 114 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 7a9cfbc43f4b..2a94bb7879ae 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -274,6 +274,13 @@ void sev_enable(struct boot_params *bp) { unsigned int eax, ebx, ecx, edx; struct msr m; + bool snp; + + /* + * Setup/preliminary detection of SNP. This will be sanity-checked + * against CPUID/MSR values later. + */ + snp = snp_init(bp); /* Check for the SME/SEV support leaf */ eax = 0x80000000; @@ -294,8 +301,11 @@ void sev_enable(struct boot_params *bp) ecx = 0; native_cpuid(&eax, &ebx, &ecx, &edx); /* Check whether SEV is supported */ - if (!(eax & BIT(1))) + if (!(eax & BIT(1))) { + if (snp) + error("SEV-SNP support indicated by CC blob, but not CPUID."); return; + } /* Set the SME mask if this is an SEV guest. */ boot_rdmsr(MSR_AMD64_SEV, &m); @@ -320,5 +330,105 @@ void sev_enable(struct boot_params *bp) enforce_vmpl0(); } + if (snp && !(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) + error("SEV-SNP supported indicated by CC blob, but not SEV status MSR."); + sme_me_mask = BIT_ULL(ebx & 0x3f); } + +/* Search for Confidential Computing blob in the EFI config table. */ +static struct cc_blob_sev_info *find_cc_blob_efi(struct boot_params *bp) +{ + unsigned long cfg_table_pa; + unsigned int cfg_table_len; + int ret; + + ret = efi_get_conf_table(bp, &cfg_table_pa, &cfg_table_len); + if (ret) + return NULL; + + return (struct cc_blob_sev_info *)efi_find_vendor_table(bp, cfg_table_pa, + cfg_table_len, + EFI_CC_BLOB_GUID); +} + +struct cc_setup_data { + struct setup_data header; + u32 cc_blob_address; +}; + +/* + * Search for a Confidential Computing blob passed in as a setup_data entry + * via the Linux Boot Protocol. + */ +static struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp) +{ + struct cc_setup_data *sd = NULL; + struct setup_data *hdr; + + hdr = (struct setup_data *)bp->hdr.setup_data; + + while (hdr) { + if (hdr->type == SETUP_CC_BLOB) { + sd = (struct cc_setup_data *)hdr; + return (struct cc_blob_sev_info *)(unsigned long)sd->cc_blob_address; + } + hdr = (struct setup_data *)hdr->next; + } + + return NULL; +} + +/* + * Initial set up of SNP relies on information provided by the + * Confidential Computing blob, which can be passed to the boot kernel + * by firmware/bootloader in the following ways: + * + * - via an entry in the EFI config table + * - via a setup_data structure, as defined by the Linux Boot Protocol + * + * Scan for the blob in that order. + */ +static struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) +{ + struct cc_blob_sev_info *cc_info; + + cc_info = find_cc_blob_efi(bp); + if (cc_info) + goto found_cc_info; + + cc_info = find_cc_blob_setup_data(bp); + if (!cc_info) + return NULL; + +found_cc_info: + if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC) + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); + + return cc_info; +} + +/* + * Indicate SNP based on presence of SNP-specific CC blob. Subsequent checks + * will verify the SNP CPUID/MSR bits. 
+ */ +bool snp_init(struct boot_params *bp) +{ + struct cc_blob_sev_info *cc_info; + + if (!bp) + return false; + + cc_info = find_cc_blob(bp); + if (!cc_info) + return false; + + /* + * Pass run-time kernel a pointer to CC info via boot_params so EFI + * config table doesn't need to be searched again during early startup + * phase. + */ + bp->cc_blob_address = (u32)(unsigned long)cc_info; + + return true; +} diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 8c934bda2a90..31b3b10323b6 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -11,6 +11,7 @@ #include #include #include +#include #define GHCB_PROTOCOL_MIN 1ULL #define GHCB_PROTOCOL_MAX 2ULL @@ -151,6 +152,7 @@ void __init snp_prep_memory(unsigned long paddr, unsigned int sz, enum psc_op op void snp_set_memory_shared(unsigned long vaddr, unsigned int npages); void snp_set_memory_private(unsigned long vaddr, unsigned int npages); void snp_set_wakeup_secondary_cpu(void); +bool snp_init(struct boot_params *bp); #else static inline void sev_es_ist_enter(struct pt_regs *regs) { } static inline void sev_es_ist_exit(void) { } @@ -168,6 +170,7 @@ static inline void __init snp_prep_memory(unsigned long paddr, unsigned int sz, static inline void snp_set_memory_shared(unsigned long vaddr, unsigned int npages) { } static inline void snp_set_memory_private(unsigned long vaddr, unsigned int npages) { } static inline void snp_set_wakeup_secondary_cpu(void) { } +static inline bool snp_init(struct boot_params *bp) { return false; } #endif #endif -- cgit v1.2.3-59-g8ed1b From b190a043c49af4587f5e157053f909192820522a Mon Sep 17 00:00:00 2001 From: Michael Roth Date: Thu, 24 Feb 2022 10:56:18 -0600 Subject: x86/sev: Add SEV-SNP feature detection/setup Initial/preliminary detection of SEV-SNP is done via the Confidential Computing blob. Check for it prior to the normal SEV/SME feature initialization, and add some sanity checks to confirm it agrees with SEV-SNP CPUID/MSR bits. Signed-off-by: Michael Roth Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-39-brijesh.singh@amd.com --- arch/x86/boot/compressed/sev.c | 27 ---------------- arch/x86/include/asm/sev.h | 2 ++ arch/x86/kernel/sev-shared.c | 27 ++++++++++++++++ arch/x86/kernel/sev.c | 64 ++++++++++++++++++++++++++++++++++++++ arch/x86/mm/mem_encrypt_identity.c | 8 +++++ 5 files changed, 101 insertions(+), 27 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 1e4930ce2c95..82079ce7be06 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -352,33 +352,6 @@ static struct cc_blob_sev_info *find_cc_blob_efi(struct boot_params *bp) EFI_CC_BLOB_GUID); } -struct cc_setup_data { - struct setup_data header; - u32 cc_blob_address; -}; - -/* - * Search for a Confidential Computing blob passed in as a setup_data entry - * via the Linux Boot Protocol. 
- */ -static struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp) -{ - struct cc_setup_data *sd = NULL; - struct setup_data *hdr; - - hdr = (struct setup_data *)bp->hdr.setup_data; - - while (hdr) { - if (hdr->type == SETUP_CC_BLOB) { - sd = (struct cc_setup_data *)hdr; - return (struct cc_blob_sev_info *)(unsigned long)sd->cc_blob_address; - } - hdr = (struct setup_data *)hdr->next; - } - - return NULL; -} - /* * Initial set up of SNP relies on information provided by the * Confidential Computing blob, which can be passed to the boot kernel diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 31b3b10323b6..84d3d5121e9c 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -153,6 +153,7 @@ void snp_set_memory_shared(unsigned long vaddr, unsigned int npages); void snp_set_memory_private(unsigned long vaddr, unsigned int npages); void snp_set_wakeup_secondary_cpu(void); bool snp_init(struct boot_params *bp); +void snp_abort(void); #else static inline void sev_es_ist_enter(struct pt_regs *regs) { } static inline void sev_es_ist_exit(void) { } @@ -171,6 +172,7 @@ static inline void snp_set_memory_shared(unsigned long vaddr, unsigned int npage static inline void snp_set_memory_private(unsigned long vaddr, unsigned int npages) { } static inline void snp_set_wakeup_secondary_cpu(void) { } static inline bool snp_init(struct boot_params *bp) { return false; } +static inline void snp_abort(void) { } #endif #endif diff --git a/arch/x86/kernel/sev-shared.c b/arch/x86/kernel/sev-shared.c index 0f1375164ff0..a7a1c0fb298e 100644 --- a/arch/x86/kernel/sev-shared.c +++ b/arch/x86/kernel/sev-shared.c @@ -937,3 +937,30 @@ static enum es_result vc_handle_rdtsc(struct ghcb *ghcb, return ES_OK; } + +struct cc_setup_data { + struct setup_data header; + u32 cc_blob_address; +}; + +/* + * Search for a Confidential Computing blob passed in as a setup_data entry + * via the Linux Boot Protocol. + */ +static struct cc_blob_sev_info *find_cc_blob_setup_data(struct boot_params *bp) +{ + struct cc_setup_data *sd = NULL; + struct setup_data *hdr; + + hdr = (struct setup_data *)bp->hdr.setup_data; + + while (hdr) { + if (hdr->type == SETUP_CC_BLOB) { + sd = (struct cc_setup_data *)hdr; + return (struct cc_blob_sev_info *)(unsigned long)sd->cc_blob_address; + } + hdr = (struct setup_data *)hdr->next; + } + + return NULL; +} diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 0c2bf39c4a8f..692da7b29127 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -1974,3 +1974,67 @@ fail: while (true) halt(); } + +/* + * Initial set up of SNP relies on information provided by the + * Confidential Computing blob, which can be passed to the kernel + * in the following ways, depending on how it is booted: + * + * - when booted via the boot/decompress kernel: + * - via boot_params + * + * - when booted directly by firmware/bootloader (e.g. CONFIG_PVH): + * - via a setup_data entry, as defined by the Linux Boot Protocol + * + * Scan for the blob in that order. + */ +static __init struct cc_blob_sev_info *find_cc_blob(struct boot_params *bp) +{ + struct cc_blob_sev_info *cc_info; + + /* Boot kernel would have passed the CC blob via boot_params. */ + if (bp->cc_blob_address) { + cc_info = (struct cc_blob_sev_info *)(unsigned long)bp->cc_blob_address; + goto found_cc_info; + } + + /* + * If kernel was booted directly, without the use of the + * boot/decompression kernel, the CC blob may have been passed via + * setup_data instead. 
+ */ + cc_info = find_cc_blob_setup_data(bp); + if (!cc_info) + return NULL; + +found_cc_info: + if (cc_info->magic != CC_BLOB_SEV_HDR_MAGIC) + snp_abort(); + + return cc_info; +} + +bool __init snp_init(struct boot_params *bp) +{ + struct cc_blob_sev_info *cc_info; + + if (!bp) + return false; + + cc_info = find_cc_blob(bp); + if (!cc_info) + return false; + + /* + * The CC blob will be used later to access the secrets page. Cache + * it here like the boot kernel does. + */ + bp->cc_blob_address = (u32)(unsigned long)cc_info; + + return true; +} + +void __init snp_abort(void) +{ + sev_es_terminate(SEV_TERM_SET_GEN, GHCB_SNP_UNSUPPORTED); +} diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index b43bc24d2bb6..f415498d3175 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -45,6 +45,7 @@ #include #include #include +#include #include "mm_internal.h" @@ -509,8 +510,11 @@ void __init sme_enable(struct boot_params *bp) bool active_by_default; unsigned long me_mask; char buffer[16]; + bool snp; u64 msr; + snp = snp_init(bp); + /* Check for the SME/SEV support leaf */ eax = 0x80000000; ecx = 0; @@ -542,6 +546,10 @@ void __init sme_enable(struct boot_params *bp) sev_status = __rdmsr(MSR_AMD64_SEV); feature_mask = (sev_status & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT; + /* The SEV-SNP CC blob should never be present unless SEV-SNP is enabled. */ + if (snp && !(sev_status & MSR_AMD64_SEV_SNP_ENABLED)) + snp_abort(); + /* Check if memory encryption is enabled */ if (feature_mask == AMD_SME_BIT) { /* -- cgit v1.2.3-59-g8ed1b From d5af44dde5461d125d1602ac913ab5c6bdf09b8b Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 7 Mar 2022 15:33:51 -0600 Subject: x86/sev: Provide support for SNP guest request NAEs Version 2 of GHCB specification provides SNP_GUEST_REQUEST and SNP_EXT_GUEST_REQUEST NAE that can be used by the SNP guest to communicate with the PSP. While at it, add a snp_issue_guest_request() helper that will be used by driver or other subsystem to issue the request to PSP. See SEV-SNP firmware and GHCB spec for more details. 
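For illustration, a caller (such as the SNP guest driver added later in the series) would be expected to use the new helper roughly as sketched below. This is a minimal sketch: the wrapper name and the way the message buffers are obtained are assumptions for the example; only snp_issue_guest_request(), struct snp_req_data and SVM_VMGEXIT_GUEST_REQUEST come from this patch.

        /* Illustrative only: issue one guest message to the PSP. */
        static int example_issue_guest_msg(unsigned long req_pa, unsigned long resp_pa)
        {
                struct snp_req_data input = {
                        .req_gpa  = req_pa,     /* GPA of the encrypted request message */
                        .resp_gpa = resp_pa,    /* GPA the PSP writes the response to */
                };
                unsigned long fw_err = 0;
                int ret;

                ret = snp_issue_guest_request(SVM_VMGEXIT_GUEST_REQUEST, &input, &fw_err);
                if (ret)
                        pr_err("SNP guest request failed, fw_err: 0x%lx\n", fw_err);

                return ret;
        }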
Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-42-brijesh.singh@amd.com --- arch/x86/include/asm/sev-common.h | 3 +++ arch/x86/include/asm/sev.h | 14 ++++++++++ arch/x86/include/uapi/asm/svm.h | 4 +++ arch/x86/kernel/sev.c | 57 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 78 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/sev-common.h b/arch/x86/include/asm/sev-common.h index 0759af9b1acf..b8357d6ecd47 100644 --- a/arch/x86/include/asm/sev-common.h +++ b/arch/x86/include/asm/sev-common.h @@ -128,6 +128,9 @@ struct snp_psc_desc { struct psc_entry entries[VMGEXIT_PSC_MAX_ENTRY]; } __packed; +/* Guest message request error code */ +#define SNP_GUEST_REQ_INVALID_LEN BIT_ULL(32) + #define GHCB_MSR_TERM_REQ 0x100 #define GHCB_MSR_TERM_REASON_SET_POS 12 #define GHCB_MSR_TERM_REASON_SET_MASK 0xf diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 84d3d5121e9c..c63a1d485c20 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -87,6 +87,14 @@ extern bool handle_vc_boot_ghcb(struct pt_regs *regs); #define RMPADJUST_VMSA_PAGE_BIT BIT(16) +/* SNP Guest message request */ +struct snp_req_data { + unsigned long req_gpa; + unsigned long resp_gpa; + unsigned long data_gpa; + unsigned int data_npages; +}; + #ifdef CONFIG_AMD_MEM_ENCRYPT extern struct static_key_false sev_es_enable_key; extern void __sev_es_ist_enter(struct pt_regs *regs); @@ -154,6 +162,7 @@ void snp_set_memory_private(unsigned long vaddr, unsigned int npages); void snp_set_wakeup_secondary_cpu(void); bool snp_init(struct boot_params *bp); void snp_abort(void); +int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned long *fw_err); #else static inline void sev_es_ist_enter(struct pt_regs *regs) { } static inline void sev_es_ist_exit(void) { } @@ -173,6 +182,11 @@ static inline void snp_set_memory_private(unsigned long vaddr, unsigned int npag static inline void snp_set_wakeup_secondary_cpu(void) { } static inline bool snp_init(struct boot_params *bp) { return false; } static inline void snp_abort(void) { } +static inline int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, + unsigned long *fw_err) +{ + return -ENOTTY; +} #endif #endif diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index cfea5e875088..f69c168391aa 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -109,6 +109,8 @@ #define SVM_VMGEXIT_SET_AP_JUMP_TABLE 0 #define SVM_VMGEXIT_GET_AP_JUMP_TABLE 1 #define SVM_VMGEXIT_PSC 0x80000010 +#define SVM_VMGEXIT_GUEST_REQUEST 0x80000011 +#define SVM_VMGEXIT_EXT_GUEST_REQUEST 0x80000012 #define SVM_VMGEXIT_AP_CREATION 0x80000013 #define SVM_VMGEXIT_AP_CREATE_ON_INIT 0 #define SVM_VMGEXIT_AP_CREATE 1 @@ -225,6 +227,8 @@ { SVM_VMGEXIT_AP_HLT_LOOP, "vmgexit_ap_hlt_loop" }, \ { SVM_VMGEXIT_AP_JUMP_TABLE, "vmgexit_ap_jump_table" }, \ { SVM_VMGEXIT_PSC, "vmgexit_page_state_change" }, \ + { SVM_VMGEXIT_GUEST_REQUEST, "vmgexit_guest_request" }, \ + { SVM_VMGEXIT_EXT_GUEST_REQUEST, "vmgexit_ext_guest_request" }, \ { SVM_VMGEXIT_AP_CREATION, "vmgexit_ap_creation" }, \ { SVM_VMGEXIT_HV_FEATURES, "vmgexit_hypervisor_feature" }, \ { SVM_EXIT_ERR, "invalid_guest_state" } diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 70ecc6e2f251..7237b4178935 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -2106,3 +2106,60 @@ static int __init init_sev_config(char *str) 
return 1; } __setup("sev=", init_sev_config); + +int snp_issue_guest_request(u64 exit_code, struct snp_req_data *input, unsigned long *fw_err) +{ + struct ghcb_state state; + struct es_em_ctxt ctxt; + unsigned long flags; + struct ghcb *ghcb; + int ret; + + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return -ENODEV; + + if (!fw_err) + return -EINVAL; + + /* + * __sev_get_ghcb() needs to run with IRQs disabled because it is using + * a per-CPU GHCB. + */ + local_irq_save(flags); + + ghcb = __sev_get_ghcb(&state); + if (!ghcb) { + ret = -EIO; + goto e_restore_irq; + } + + vc_ghcb_invalidate(ghcb); + + if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST) { + ghcb_set_rax(ghcb, input->data_gpa); + ghcb_set_rbx(ghcb, input->data_npages); + } + + ret = sev_es_ghcb_hv_call(ghcb, true, &ctxt, exit_code, input->req_gpa, input->resp_gpa); + if (ret) + goto e_put; + + if (ghcb->save.sw_exit_info_2) { + /* Number of expected pages are returned in RBX */ + if (exit_code == SVM_VMGEXIT_EXT_GUEST_REQUEST && + ghcb->save.sw_exit_info_2 == SNP_GUEST_REQ_INVALID_LEN) + input->data_npages = ghcb_get_rbx(ghcb); + + *fw_err = ghcb->save.sw_exit_info_2; + + ret = -EIO; + } + +e_put: + __sev_put_ghcb(&state); +e_restore_irq: + local_irq_restore(flags); + + return ret; +} +EXPORT_SYMBOL_GPL(snp_issue_guest_request); -- cgit v1.2.3-59-g8ed1b From 3a45b3753849c4a12cca2dd176c0192cd2a63e62 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Thu, 24 Feb 2022 10:56:21 -0600 Subject: x86/sev: Register SEV-SNP guest request platform device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Version 2 of the GHCB specification provides a Non Automatic Exit (NAE) event type that can be used by the SEV-SNP guest to communicate with the PSP without risk from a malicious hypervisor who wishes to read, alter, drop or replay the messages sent. SNP_LAUNCH_UPDATE can insert two special pages into the guest’s memory: the secrets page and the CPUID page. The PSP firmware populates the contents of the secrets page. The secrets page contains encryption keys used by the guest to interact with the firmware. Because the secrets page is encrypted with the guest’s memory encryption key, the hypervisor cannot read the keys. See SEV-SNP firmware spec for further details on the secrets page format. Create a platform device that the SEV-SNP guest driver can bind to get the platform resources such as encryption key and message id to use to communicate with the PSP. The SEV-SNP guest driver provides a userspace interface to get the attestation report, key derivation, extended attestation report etc. 
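To illustrate the consumer side, a driver binding to this device would fetch the secrets page address from the copied platform data in its probe routine, roughly as sketched below. The driver and function names are placeholders for the example; only the "snp-guest" device name and struct snp_guest_platform_data are defined by this patch.

        /* Illustrative sketch of the guest driver side. */
        static int snp_guest_probe(struct platform_device *pdev)
        {
                struct snp_guest_platform_data *data = dev_get_platdata(&pdev->dev);

                if (!data)
                        return -ENODEV;

                /* Map the secrets page at data->secrets_gpa and set up messaging. */
                return 0;
        }

        static struct platform_driver snp_guest_driver = {
                .probe  = snp_guest_probe,
                .driver = { .name = "snp-guest" },
        };
        module_platform_driver(snp_guest_driver);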
Signed-off-by: Brijesh Singh Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220307213356.2797205-43-brijesh.singh@amd.com --- arch/x86/include/asm/sev.h | 4 ++++ arch/x86/kernel/sev.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index c63a1d485c20..9c2d33f1cfee 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -95,6 +95,10 @@ struct snp_req_data { unsigned int data_npages; }; +struct snp_guest_platform_data { + u64 secrets_gpa; +}; + #ifdef CONFIG_AMD_MEM_ENCRYPT extern struct static_key_false sev_es_enable_key; extern void __sev_es_ist_enter(struct pt_regs *regs); diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index 7237b4178935..ace43e116b40 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -19,6 +19,9 @@ #include #include #include +#include +#include +#include #include #include @@ -2163,3 +2166,56 @@ e_restore_irq: return ret; } EXPORT_SYMBOL_GPL(snp_issue_guest_request); + +static struct platform_device guest_req_device = { + .name = "snp-guest", + .id = -1, +}; + +static u64 get_secrets_page(void) +{ + u64 pa_data = boot_params.cc_blob_address; + struct cc_blob_sev_info info; + void *map; + + /* + * The CC blob contains the address of the secrets page, check if the + * blob is present. + */ + if (!pa_data) + return 0; + + map = early_memremap(pa_data, sizeof(info)); + memcpy(&info, map, sizeof(info)); + early_memunmap(map, sizeof(info)); + + /* smoke-test the secrets page passed */ + if (!info.secrets_phys || info.secrets_len != PAGE_SIZE) + return 0; + + return info.secrets_phys; +} + +static int __init snp_init_platform_device(void) +{ + struct snp_guest_platform_data data; + u64 gpa; + + if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return -ENODEV; + + gpa = get_secrets_page(); + if (!gpa) + return -ENODEV; + + data.secrets_gpa = gpa; + if (platform_device_add_data(&guest_req_device, &data, sizeof(data))) + return -ENODEV; + + if (platform_device_register(&guest_req_device)) + return -ENODEV; + + pr_info("SNP guest platform device initialized.\n"); + return 0; +} +device_initcall(snp_init_platform_device); -- cgit v1.2.3-59-g8ed1b From 59bd54a84d15e9335de5b8abe7b3b9713a36b99b Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Wed, 6 Apr 2022 02:29:10 +0300 Subject: x86/tdx: Detect running as a TDX guest in early boot In preparation of extending cc_platform_has() API to support TDX guest, use CPUID instruction to detect support for TDX guests in the early boot code (via tdx_early_init()). Since copy_bootdata() is the first user of cc_platform_has() API, detect the TDX guest status before it. Define a synthetic feature flag (X86_FEATURE_TDX_GUEST) and set this bit in a valid TDX guest platform. Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Dave Hansen Reviewed-by: Andi Kleen Reviewed-by: Tony Luck Reviewed-by: Dave Hansen Reviewed-by: Borislav Petkov Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20220405232939.73860-2-kirill.shutemov@linux.intel.com --- arch/x86/Kconfig | 12 ++++++++++++ arch/x86/coco/Makefile | 2 ++ arch/x86/coco/tdx/Makefile | 3 +++ arch/x86/coco/tdx/tdx.c | 22 ++++++++++++++++++++++ arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/disabled-features.h | 8 +++++++- arch/x86/include/asm/tdx.h | 21 +++++++++++++++++++++ arch/x86/kernel/head64.c | 4 ++++ 8 files changed, 72 insertions(+), 1 deletion(-) create mode 100644 arch/x86/coco/tdx/Makefile create mode 100644 arch/x86/coco/tdx/tdx.c create mode 100644 arch/x86/include/asm/tdx.h (limited to 'arch/x86/include') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b0142e01002e..4ae27322869d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -878,6 +878,18 @@ config ACRN_GUEST IOT with small footprint and real-time features. More details can be found in https://projectacrn.org/. +config INTEL_TDX_GUEST + bool "Intel TDX (Trust Domain Extensions) - Guest Support" + depends on X86_64 && CPU_SUP_INTEL + depends on X86_X2APIC + help + Support running as a guest under Intel TDX. Without this support, + the guest kernel can not boot or run under TDX. + TDX includes memory encryption and integrity capabilities + which protect the confidentiality and integrity of guest + memory contents and CPU state. TDX guests are protected from + some attacks from the VMM. + endif #HYPERVISOR_GUEST source "arch/x86/Kconfig.cpu" diff --git a/arch/x86/coco/Makefile b/arch/x86/coco/Makefile index c1ead00017a7..c816acf78b6a 100644 --- a/arch/x86/coco/Makefile +++ b/arch/x86/coco/Makefile @@ -4,3 +4,5 @@ KASAN_SANITIZE_core.o := n CFLAGS_core.o += -fno-stack-protector obj-y += core.o + +obj-$(CONFIG_INTEL_TDX_GUEST) += tdx/ diff --git a/arch/x86/coco/tdx/Makefile b/arch/x86/coco/tdx/Makefile new file mode 100644 index 000000000000..c929d53ee059 --- /dev/null +++ b/arch/x86/coco/tdx/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-y += tdx.o diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c new file mode 100644 index 000000000000..97674471fd1e --- /dev/null +++ b/arch/x86/coco/tdx/tdx.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2021-2022 Intel Corporation */ + +#undef pr_fmt +#define pr_fmt(fmt) "tdx: " fmt + +#include +#include + +void __init tdx_early_init(void) +{ + u32 eax, sig[3]; + + cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, &sig[0], &sig[2], &sig[1]); + + if (memcmp(TDX_IDENT, sig, sizeof(sig))) + return; + + setup_force_cpu_cap(X86_FEATURE_TDX_GUEST); + + pr_info("Guest detected\n"); +} diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 73e643ae94b6..20df73b51025 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -238,6 +238,7 @@ #define X86_FEATURE_VMW_VMMCALL ( 8*32+19) /* "" VMware prefers VMMCALL hypercall instruction */ #define X86_FEATURE_PVUNLOCK ( 8*32+20) /* "" PV unlock function */ #define X86_FEATURE_VCPUPREEMPT ( 8*32+21) /* "" PV vcpu_is_preempted function */ +#define X86_FEATURE_TDX_GUEST ( 8*32+22) /* Intel Trust Domain Extensions Guest */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */ #define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/ diff --git a/arch/x86/include/asm/disabled-features.h 
b/arch/x86/include/asm/disabled-features.h index 1231d63f836d..b37de8268c9a 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -68,6 +68,12 @@ # define DISABLE_SGX (1 << (X86_FEATURE_SGX & 31)) #endif +#ifdef CONFIG_INTEL_TDX_GUEST +# define DISABLE_TDX_GUEST 0 +#else +# define DISABLE_TDX_GUEST (1 << (X86_FEATURE_TDX_GUEST & 31)) +#endif + /* * Make sure to add features to the correct mask */ @@ -79,7 +85,7 @@ #define DISABLED_MASK5 0 #define DISABLED_MASK6 0 #define DISABLED_MASK7 (DISABLE_PTI) -#define DISABLED_MASK8 0 +#define DISABLED_MASK8 (DISABLE_TDX_GUEST) #define DISABLED_MASK9 (DISABLE_SMAP|DISABLE_SGX) #define DISABLED_MASK10 0 #define DISABLED_MASK11 0 diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h new file mode 100644 index 000000000000..ba8042ce61c2 --- /dev/null +++ b/arch/x86/include/asm/tdx.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2021-2022 Intel Corporation */ +#ifndef _ASM_X86_TDX_H +#define _ASM_X86_TDX_H + +#include + +#define TDX_CPUID_LEAF_ID 0x21 +#define TDX_IDENT "IntelTDX " + +#ifdef CONFIG_INTEL_TDX_GUEST + +void __init tdx_early_init(void); + +#else + +static inline void tdx_early_init(void) { }; + +#endif /* CONFIG_INTEL_TDX_GUEST */ + +#endif /* _ASM_X86_TDX_H */ diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 4f5ecbbaae77..6dff50c3edd6 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -40,6 +40,7 @@ #include #include #include +#include /* * Manage page tables very early on. @@ -514,6 +515,9 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) idt_setup_early_handler(); + /* Needed before cc_platform_has() can be used for TDX */ + tdx_early_init(); + copy_bootdata(__va(real_mode_data)); /* -- cgit v1.2.3-59-g8ed1b From 527a534c732604931959e73e9c3a8952d8c1a994 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:11 +0300 Subject: x86/tdx: Provide common base for SEAMCALL and TDCALL C wrappers Secure Arbitration Mode (SEAM) is an extension of VMX architecture. It defines a new VMX root operation (SEAM VMX root) and a new VMX non-root operation (SEAM VMX non-root) which are both isolated from the legacy VMX operation where the host kernel runs. A CPU-attested software module (called 'TDX module') runs in SEAM VMX root to manage and protect VMs running in SEAM VMX non-root. SEAM VMX root is also used to host another CPU-attested software module (called 'P-SEAMLDR') to load and update the TDX module. Host kernel transits to either P-SEAMLDR or TDX module via the new SEAMCALL instruction, which is essentially a VMExit from VMX root mode to SEAM VMX root mode. SEAMCALLs are leaf functions defined by P-SEAMLDR and TDX module around the new SEAMCALL instruction. A guest kernel can also communicate with TDX module via TDCALL instruction. TDCALLs and SEAMCALLs use an ABI different from the x86-64 system-v ABI. RAX is used to carry both the SEAMCALL leaf function number (input) and the completion status (output). Additional GPRs (RCX, RDX, R8-R11) may be further used as both input and output operands in individual leaf. TDCALL and SEAMCALL share the same ABI and require the largely same code to pass down arguments and retrieve results. Define an assembly macro that can be used to implement C wrapper for both TDCALL and SEAMCALL. Suggested-by: Thomas Gleixner Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Dave Hansen Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20220405232939.73860-3-kirill.shutemov@linux.intel.com --- arch/x86/include/asm/tdx.h | 29 +++++++++++++ arch/x86/kernel/asm-offsets.c | 9 ++++ arch/x86/virt/vmx/tdx/tdxcall.S | 96 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 134 insertions(+) create mode 100644 arch/x86/virt/vmx/tdx/tdxcall.S (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index ba8042ce61c2..cb4c4e607c43 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -4,10 +4,38 @@ #define _ASM_X86_TDX_H #include +#include #define TDX_CPUID_LEAF_ID 0x21 #define TDX_IDENT "IntelTDX " +/* + * SW-defined error codes. + * + * Bits 47:40 == 0xFF indicate Reserved status code class that never used by + * TDX module. + */ +#define TDX_ERROR _BITUL(63) +#define TDX_SW_ERROR (TDX_ERROR | GENMASK_ULL(47, 40)) +#define TDX_SEAMCALL_VMFAILINVALID (TDX_SW_ERROR | _UL(0xFFFF0000)) + +#ifndef __ASSEMBLY__ + +/* + * Used to gather the output registers values of the TDCALL and SEAMCALL + * instructions when requesting services from the TDX module. + * + * This is a software only structure and not part of the TDX module/VMM ABI. + */ +struct tdx_module_output { + u64 rcx; + u64 rdx; + u64 r8; + u64 r9; + u64 r10; + u64 r11; +}; + #ifdef CONFIG_INTEL_TDX_GUEST void __init tdx_early_init(void); @@ -18,4 +46,5 @@ static inline void tdx_early_init(void) { }; #endif /* CONFIG_INTEL_TDX_GUEST */ +#endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_TDX_H */ diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 9fb0a2f8b62a..7dca52f5cfc6 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -18,6 +18,7 @@ #include #include #include +#include #ifdef CONFIG_XEN #include @@ -65,6 +66,14 @@ static void __used common(void) OFFSET(XEN_vcpu_info_arch_cr2, vcpu_info, arch.cr2); #endif + BLANK(); + OFFSET(TDX_MODULE_rcx, tdx_module_output, rcx); + OFFSET(TDX_MODULE_rdx, tdx_module_output, rdx); + OFFSET(TDX_MODULE_r8, tdx_module_output, r8); + OFFSET(TDX_MODULE_r9, tdx_module_output, r9); + OFFSET(TDX_MODULE_r10, tdx_module_output, r10); + OFFSET(TDX_MODULE_r11, tdx_module_output, r11); + BLANK(); OFFSET(BP_scratch, boot_params, scratch); OFFSET(BP_secure_boot, boot_params, secure_boot); diff --git a/arch/x86/virt/vmx/tdx/tdxcall.S b/arch/x86/virt/vmx/tdx/tdxcall.S new file mode 100644 index 000000000000..49a54356ae99 --- /dev/null +++ b/arch/x86/virt/vmx/tdx/tdxcall.S @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include + +/* + * TDCALL and SEAMCALL are supported in Binutils >= 2.36. + */ +#define tdcall .byte 0x66,0x0f,0x01,0xcc +#define seamcall .byte 0x66,0x0f,0x01,0xcf + +/* + * TDX_MODULE_CALL - common helper macro for both + * TDCALL and SEAMCALL instructions. + * + * TDCALL - used by TDX guests to make requests to the + * TDX module and hypercalls to the VMM. + * SEAMCALL - used by TDX hosts to make requests to the + * TDX module. + */ +.macro TDX_MODULE_CALL host:req + /* + * R12 will be used as temporary storage for struct tdx_module_output + * pointer. Since R12-R15 registers are not used by TDCALL/SEAMCALL + * services supported by this function, it can be reused. + */ + + /* Callee saved, so preserve it */ + push %r12 + + /* + * Push output pointer to stack. + * After the operation, it will be fetched into R12 register. 
+ */ + push %r9 + + /* Mangle function call ABI into TDCALL/SEAMCALL ABI: */ + /* Move Leaf ID to RAX */ + mov %rdi, %rax + /* Move input 4 to R9 */ + mov %r8, %r9 + /* Move input 3 to R8 */ + mov %rcx, %r8 + /* Move input 1 to RCX */ + mov %rsi, %rcx + /* Leave input param 2 in RDX */ + + .if \host + seamcall + /* + * SEAMCALL instruction is essentially a VMExit from VMX root + * mode to SEAM VMX root mode. VMfailInvalid (CF=1) indicates + * that the targeted SEAM firmware is not loaded or disabled, + * or P-SEAMLDR is busy with another SEAMCALL. %rax is not + * changed in this case. + * + * Set %rax to TDX_SEAMCALL_VMFAILINVALID for VMfailInvalid. + * This value will never be used as actual SEAMCALL error code as + * it is from the Reserved status code class. + */ + jnc .Lno_vmfailinvalid + mov $TDX_SEAMCALL_VMFAILINVALID, %rax +.Lno_vmfailinvalid: + + .else + tdcall + .endif + + /* + * Fetch output pointer from stack to R12 (It is used + * as temporary storage) + */ + pop %r12 + + /* + * Since this macro can be invoked with NULL as an output pointer, + * check if caller provided an output struct before storing output + * registers. + * + * Update output registers, even if the call failed (RAX != 0). + * Other registers may contain details of the failure. + */ + test %r12, %r12 + jz .Lno_output_struct + + /* Copy result registers to output struct: */ + movq %rcx, TDX_MODULE_rcx(%r12) + movq %rdx, TDX_MODULE_rdx(%r12) + movq %r8, TDX_MODULE_r8(%r12) + movq %r9, TDX_MODULE_r9(%r12) + movq %r10, TDX_MODULE_r10(%r12) + movq %r11, TDX_MODULE_r11(%r12) + +.Lno_output_struct: + /* Restore the state of R12 register */ + pop %r12 +.endm -- cgit v1.2.3-59-g8ed1b From eb94f1b6a70a683040d60d21bbb6ad65f082600a Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Wed, 6 Apr 2022 02:29:12 +0300 Subject: x86/tdx: Add __tdx_module_call() and __tdx_hypercall() helper functions Guests communicate with VMMs with hypercalls. Historically, these are implemented using instructions that are known to cause VMEXITs like VMCALL, VMLAUNCH, etc. However, with TDX, VMEXITs no longer expose the guest state to the host. This prevents the old hypercall mechanisms from working. So, to communicate with VMM, TDX specification defines a new instruction called TDCALL. In a TDX based VM, since the VMM is an untrusted entity, an intermediary layer -- TDX module -- facilitates secure communication between the host and the guest. TDX module is loaded like a firmware into a special CPU mode called SEAM. TDX guests communicate with the TDX module using the TDCALL instruction. A guest uses TDCALL to communicate with both the TDX module and VMM. The value of the RAX register when executing the TDCALL instruction is used to determine the TDCALL type. A leaf of TDCALL used to communicate with the VMM is called TDVMCALL. Add generic interfaces to communicate with the TDX module and VMM (using the TDCALL instruction). __tdx_module_call() - Used to communicate with the TDX module (via TDCALL instruction). __tdx_hypercall() - Used by the guest to request services from the VMM (via TDVMCALL leaf of TDCALL). Also define an additional wrapper _tdx_hypercall(), which adds error handling support for the TDCALL failure. The __tdx_module_call() and __tdx_hypercall() helper functions are implemented in assembly in a .S file. The TDCALL ABI requires shuffling arguments in and out of registers, which proved to be awkward with inline assembly. 
Just like syscalls, not all TDVMCALL use cases need to use the same number of argument registers. The implementation here picks the current worst-case scenario for TDCALL (4 registers). For TDCALLs with fewer than 4 arguments, there will end up being a few superfluous (cheap) instructions. But, this approach maximizes code reuse. For registers used by the TDCALL instruction, please check TDX GHCI specification, the section titled "TDCALL instruction" and "TDG.VP.VMCALL Interface". Based on previous patch by Sean Christopherson. Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Tony Luck Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20220405232939.73860-4-kirill.shutemov@linux.intel.com --- arch/x86/coco/tdx/Makefile | 2 +- arch/x86/coco/tdx/tdcall.S | 191 ++++++++++++++++++++++++++++++++++++++++++ arch/x86/coco/tdx/tdx.c | 24 ++++++ arch/x86/include/asm/tdx.h | 30 +++++++ arch/x86/kernel/asm-offsets.c | 8 ++ 5 files changed, 254 insertions(+), 1 deletion(-) create mode 100644 arch/x86/coco/tdx/tdcall.S (limited to 'arch/x86/include') diff --git a/arch/x86/coco/tdx/Makefile b/arch/x86/coco/tdx/Makefile index c929d53ee059..46c55998557d 100644 --- a/arch/x86/coco/tdx/Makefile +++ b/arch/x86/coco/tdx/Makefile @@ -1,3 +1,3 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y += tdx.o +obj-y += tdx.o tdcall.o diff --git a/arch/x86/coco/tdx/tdcall.S b/arch/x86/coco/tdx/tdcall.S new file mode 100644 index 000000000000..662479ccf630 --- /dev/null +++ b/arch/x86/coco/tdx/tdcall.S @@ -0,0 +1,191 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include +#include + +#include +#include +#include + +#include "../../virt/vmx/tdx/tdxcall.S" + +/* + * Bitmasks of exposed registers (with VMM). + */ +#define TDX_R10 BIT(10) +#define TDX_R11 BIT(11) +#define TDX_R12 BIT(12) +#define TDX_R13 BIT(13) +#define TDX_R14 BIT(14) +#define TDX_R15 BIT(15) + +/* + * These registers are clobbered to hold arguments for each + * TDVMCALL. They are safe to expose to the VMM. + * Each bit in this mask represents a register ID. Bit field + * details can be found in TDX GHCI specification, section + * titled "TDCALL [TDG.VP.VMCALL] leaf". + */ +#define TDVMCALL_EXPOSE_REGS_MASK ( TDX_R10 | TDX_R11 | \ + TDX_R12 | TDX_R13 | \ + TDX_R14 | TDX_R15 ) + +/* + * __tdx_module_call() - Used by TDX guests to request services from + * the TDX module (does not include VMM services) using TDCALL instruction. + * + * Transforms function call register arguments into the TDCALL register ABI. + * After TDCALL operation, TDX module output is saved in @out (if it is + * provided by the user). + * + *------------------------------------------------------------------------- + * TDCALL ABI: + *------------------------------------------------------------------------- + * Input Registers: + * + * RAX - TDCALL Leaf number. + * RCX,RDX,R8-R9 - TDCALL Leaf specific input registers. + * + * Output Registers: + * + * RAX - TDCALL instruction error code. + * RCX,RDX,R8-R11 - TDCALL Leaf specific output registers. 
+ * + *------------------------------------------------------------------------- + * + * __tdx_module_call() function ABI: + * + * @fn (RDI) - TDCALL Leaf ID, moved to RAX + * @rcx (RSI) - Input parameter 1, moved to RCX + * @rdx (RDX) - Input parameter 2, moved to RDX + * @r8 (RCX) - Input parameter 3, moved to R8 + * @r9 (R8) - Input parameter 4, moved to R9 + * + * @out (R9) - struct tdx_module_output pointer + * stored temporarily in R12 (not + * shared with the TDX module). It + * can be NULL. + * + * Return status of TDCALL via RAX. + */ +SYM_FUNC_START(__tdx_module_call) + FRAME_BEGIN + TDX_MODULE_CALL host=0 + FRAME_END + ret +SYM_FUNC_END(__tdx_module_call) + +/* + * __tdx_hypercall() - Make hypercalls to a TDX VMM using TDVMCALL leaf + * of TDCALL instruction + * + * Transforms values in function call argument struct tdx_hypercall_args @args + * into the TDCALL register ABI. After TDCALL operation, VMM output is saved + * back in @args. + * + *------------------------------------------------------------------------- + * TD VMCALL ABI: + *------------------------------------------------------------------------- + * + * Input Registers: + * + * RAX - TDCALL instruction leaf number (0 - TDG.VP.VMCALL) + * RCX - BITMAP which controls which part of TD Guest GPR + * is passed as-is to the VMM and back. + * R10 - Set 0 to indicate TDCALL follows standard TDX ABI + * specification. Non zero value indicates vendor + * specific ABI. + * R11 - VMCALL sub function number + * RBX, RBP, RDI, RSI - Used to pass VMCALL sub function specific arguments. + * R8-R9, R12-R15 - Same as above. + * + * Output Registers: + * + * RAX - TDCALL instruction status (Not related to hypercall + * output). + * R10 - Hypercall output error code. + * R11-R15 - Hypercall sub function specific output values. + * + *------------------------------------------------------------------------- + * + * __tdx_hypercall() function ABI: + * + * @args (RDI) - struct tdx_hypercall_args for input and output + * @flags (RSI) - TDX_HCALL_* flags + * + * On successful completion, return the hypercall error code. + */ +SYM_FUNC_START(__tdx_hypercall) + FRAME_BEGIN + + /* Save callee-saved GPRs as mandated by the x86_64 ABI */ + push %r15 + push %r14 + push %r13 + push %r12 + + /* Mangle function call ABI into TDCALL ABI: */ + /* Set TDCALL leaf ID (TDVMCALL (0)) in RAX */ + xor %eax, %eax + + /* Copy hypercall registers from arg struct: */ + movq TDX_HYPERCALL_r10(%rdi), %r10 + movq TDX_HYPERCALL_r11(%rdi), %r11 + movq TDX_HYPERCALL_r12(%rdi), %r12 + movq TDX_HYPERCALL_r13(%rdi), %r13 + movq TDX_HYPERCALL_r14(%rdi), %r14 + movq TDX_HYPERCALL_r15(%rdi), %r15 + + movl $TDVMCALL_EXPOSE_REGS_MASK, %ecx + + tdcall + + /* + * RAX==0 indicates a failure of the TDVMCALL mechanism itself and that + * something has gone horribly wrong with the TDX module. + * + * The return status of the hypercall operation is in a separate + * register (in R10). Hypercall errors are a part of normal operation + * and are handled by callers. 
+ */ + testq %rax, %rax + jne .Lpanic + + /* TDVMCALL leaf return code is in R10 */ + movq %r10, %rax + + /* Copy hypercall result registers to arg struct if needed */ + testq $TDX_HCALL_HAS_OUTPUT, %rsi + jz .Lout + + movq %r10, TDX_HYPERCALL_r10(%rdi) + movq %r11, TDX_HYPERCALL_r11(%rdi) + movq %r12, TDX_HYPERCALL_r12(%rdi) + movq %r13, TDX_HYPERCALL_r13(%rdi) + movq %r14, TDX_HYPERCALL_r14(%rdi) + movq %r15, TDX_HYPERCALL_r15(%rdi) +.Lout: + /* + * Zero out registers exposed to the VMM to avoid speculative execution + * with VMM-controlled values. This needs to include all registers + * present in TDVMCALL_EXPOSE_REGS_MASK (except R12-R15). R12-R15 + * context will be restored. + */ + xor %r10d, %r10d + xor %r11d, %r11d + + /* Restore callee-saved GPRs as mandated by the x86_64 ABI */ + pop %r12 + pop %r13 + pop %r14 + pop %r15 + + FRAME_END + + retq +.Lpanic: + call __tdx_hypercall_failed + /* __tdx_hypercall_failed never returns */ + jmp .Lpanic +SYM_FUNC_END(__tdx_hypercall) diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 97674471fd1e..4b57880e45b0 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -7,6 +7,30 @@ #include #include +/* + * Wrapper for standard use of __tdx_hypercall with no output aside from + * return code. + */ +static inline u64 _tdx_hypercall(u64 fn, u64 r12, u64 r13, u64 r14, u64 r15) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = fn, + .r12 = r12, + .r13 = r13, + .r14 = r14, + .r15 = r15, + }; + + return __tdx_hypercall(&args, 0); +} + +/* Called from __tdx_hypercall() for unrecoverable failure */ +void __tdx_hypercall_failed(void) +{ + panic("TDVMCALL failed. TDX module bug?"); +} + void __init tdx_early_init(void) { u32 eax, sig[3]; diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index cb4c4e607c43..a33d47abe67d 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -3,12 +3,17 @@ #ifndef _ASM_X86_TDX_H #define _ASM_X86_TDX_H +#include #include #include #define TDX_CPUID_LEAF_ID 0x21 #define TDX_IDENT "IntelTDX " +#define TDX_HYPERCALL_STANDARD 0 + +#define TDX_HCALL_HAS_OUTPUT BIT(0) + /* * SW-defined error codes. * @@ -36,10 +41,35 @@ struct tdx_module_output { u64 r11; }; +/* + * Used in __tdx_hypercall() to pass down and get back registers' values of + * the TDCALL instruction when requesting services from the VMM. + * + * This is a software only structure and not part of the TDX module/VMM ABI. 
+ */ +struct tdx_hypercall_args { + u64 r10; + u64 r11; + u64 r12; + u64 r13; + u64 r14; + u64 r15; +}; + #ifdef CONFIG_INTEL_TDX_GUEST void __init tdx_early_init(void); +/* Used to communicate with the TDX module */ +u64 __tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9, + struct tdx_module_output *out); + +/* Used to request services from the VMM */ +u64 __tdx_hypercall(struct tdx_hypercall_args *args, unsigned long flags); + +/* Called from __tdx_hypercall() for unrecoverable failure */ +void __tdx_hypercall_failed(void); + #else static inline void tdx_early_init(void) { }; diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 7dca52f5cfc6..437308004ef2 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -74,6 +74,14 @@ static void __used common(void) OFFSET(TDX_MODULE_r10, tdx_module_output, r10); OFFSET(TDX_MODULE_r11, tdx_module_output, r11); + BLANK(); + OFFSET(TDX_HYPERCALL_r10, tdx_hypercall_args, r10); + OFFSET(TDX_HYPERCALL_r11, tdx_hypercall_args, r11); + OFFSET(TDX_HYPERCALL_r12, tdx_hypercall_args, r12); + OFFSET(TDX_HYPERCALL_r13, tdx_hypercall_args, r13); + OFFSET(TDX_HYPERCALL_r14, tdx_hypercall_args, r14); + OFFSET(TDX_HYPERCALL_r15, tdx_hypercall_args, r15); + BLANK(); OFFSET(BP_scratch, boot_params, scratch); OFFSET(BP_secure_boot, boot_params, secure_boot); -- cgit v1.2.3-59-g8ed1b From 9a22bf6debbf5169f750af53c7f86eb4e3cd6712 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:16 +0300 Subject: x86/traps: Add #VE support for TDX guest Virtualization Exceptions (#VE) are delivered to TDX guests due to specific guest actions which may happen in either user space or the kernel: * Specific instructions (WBINVD, for example) * Specific MSR accesses * Specific CPUID leaf accesses * Access to specific guest physical addresses Syscall entry code has a critical window where the kernel stack is not yet set up. Any exception in this window leads to hard to debug issues and can be exploited for privilege escalation. Exceptions in the NMI entry code also cause issues. Returning from the exception handler with IRET will re-enable NMIs and nested NMI will corrupt the NMI stack. For these reasons, the kernel avoids #VEs during the syscall gap and the NMI entry code. Entry code paths do not access TD-shared memory, MMIO regions, use #VE triggering MSRs, instructions, or CPUID leaves that might generate #VE. VMM can remove memory from TD at any point, but access to unaccepted (or missing) private memory leads to VM termination, not to #VE. Similarly to page faults and breakpoints, #VEs are allowed in NMI handlers once the kernel is ready to deal with nested NMIs. During #VE delivery, all interrupts, including NMIs, are blocked until TDGETVEINFO is called. It prevents #VE nesting until the kernel reads the VE info. TDGETVEINFO retrieves the #VE info from the TDX module, which also clears the "#VE valid" flag. This must be done before anything else as any #VE that occurs while the valid flag is set escalates to #DF by TDX module. It will result in an oops. Virtual NMIs are inhibited if the #VE valid flag is set. NMI will not be delivered until TDGETVEINFO is called. For now, convert unhandled #VE's (everything, until later in this series) so that they appear just like a #GP by calling the ve_raise_fault() directly. The ve_raise_fault() function is similar to #GP handler and is responsible for sending SIGSEGV to userspace and CPU die and notifying debuggers and other die chain users. 
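Putting the above together, the #VE entry point added to traps.c ends up with roughly the following shape. This is a simplified sketch of the flow described above, not the verbatim hunk: read the #VE info first, try the TDX-specific handler, and fall back to ve_raise_fault() for anything unhandled.

        DEFINE_IDTENTRY(exc_virtualization_exception)
        {
                struct ve_info ve;

                /*
                 * NMIs and interrupts stay blocked until the #VE info is
                 * retrieved, so read it first; this also re-arms #VE delivery.
                 */
                tdx_get_ve_info(&ve);

                cond_local_irq_enable(regs);

                /* Anything the TDX handler cannot deal with is treated like a #GP. */
                if (!tdx_handle_virt_exception(regs, &ve))
                        ve_raise_fault(regs, 0);

                cond_local_irq_disable(regs);
        }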
Co-developed-by: Sean Christopherson Co-developed-by: Kuppuswamy Sathyanarayanan Signed-off-by: Sean Christopherson Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Andi Kleen Reviewed-by: Tony Luck Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/20220405232939.73860-8-kirill.shutemov@linux.intel.com --- arch/x86/coco/tdx/tdx.c | 38 ++++++++++++++++++ arch/x86/include/asm/idtentry.h | 4 ++ arch/x86/include/asm/tdx.h | 21 ++++++++++ arch/x86/kernel/idt.c | 3 ++ arch/x86/kernel/traps.c | 86 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 152 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index e84f6dd3ed2a..60a3f2ff5b95 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -10,6 +10,7 @@ /* TDX module Call Leaf IDs */ #define TDX_GET_INFO 1 +#define TDX_GET_VEINFO 3 /* * Wrapper for standard use of __tdx_hypercall with no output aside from @@ -73,6 +74,43 @@ static u64 get_cc_mask(void) return BIT_ULL(gpa_width - 1); } +void tdx_get_ve_info(struct ve_info *ve) +{ + struct tdx_module_output out; + + /* + * Called during #VE handling to retrieve the #VE info from the + * TDX module. + * + * This has to be called early in #VE handling. A "nested" #VE which + * occurs before this will raise a #DF and is not recoverable. + * + * The call retrieves the #VE info from the TDX module, which also + * clears the "#VE valid" flag. This must be done before anything else + * because any #VE that occurs while the valid flag is set will lead to + * #DF. + * + * Note, the TDX module treats virtual NMIs as inhibited if the #VE + * valid flag is set. It means that NMI=>#VE will not result in a #DF. + */ + tdx_module_call(TDX_GET_VEINFO, 0, 0, 0, 0, &out); + + /* Transfer the output parameters */ + ve->exit_reason = out.rcx; + ve->exit_qual = out.rdx; + ve->gla = out.r8; + ve->gpa = out.r9; + ve->instr_len = lower_32_bits(out.r10); + ve->instr_info = upper_32_bits(out.r10); +} + +bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve) +{ + pr_warn("Unexpected #VE: %lld\n", ve->exit_reason); + + return false; +} + void __init tdx_early_init(void) { u64 cc_mask; diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 7924f27f5c8b..72184b0b2219 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -632,6 +632,10 @@ DECLARE_IDTENTRY_XENCB(X86_TRAP_OTHER, exc_xen_hypervisor_callback); DECLARE_IDTENTRY_RAW(X86_TRAP_OTHER, exc_xen_unknown_trap); #endif +#ifdef CONFIG_INTEL_TDX_GUEST +DECLARE_IDTENTRY(X86_TRAP_VE, exc_virtualization_exception); +#endif + /* Device interrupts common/spurious */ DECLARE_IDTENTRY_IRQ(X86_TRAP_OTHER, common_interrupt); #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index a33d47abe67d..c4142e7b004c 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -6,6 +6,7 @@ #include #include #include +#include #define TDX_CPUID_LEAF_ID 0x21 #define TDX_IDENT "IntelTDX " @@ -56,6 +57,22 @@ struct tdx_hypercall_args { u64 r15; }; +/* + * Used by the #VE exception handler to gather the #VE exception + * info from the TDX module. This is a software only structure + * and not part of the TDX module/VMM ABI. 
+ */ +struct ve_info { + u64 exit_reason; + u64 exit_qual; + /* Guest Linear (virtual) Address */ + u64 gla; + /* Guest Physical Address */ + u64 gpa; + u32 instr_len; + u32 instr_info; +}; + #ifdef CONFIG_INTEL_TDX_GUEST void __init tdx_early_init(void); @@ -70,6 +87,10 @@ u64 __tdx_hypercall(struct tdx_hypercall_args *args, unsigned long flags); /* Called from __tdx_hypercall() for unrecoverable failure */ void __tdx_hypercall_failed(void); +void tdx_get_ve_info(struct ve_info *ve); + +bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve); + #else static inline void tdx_early_init(void) { }; diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 608eb63bf044..a58c6bc1cd68 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -69,6 +69,9 @@ static const __initconst struct idt_data early_idts[] = { */ INTG(X86_TRAP_PF, asm_exc_page_fault), #endif +#ifdef CONFIG_INTEL_TDX_GUEST + INTG(X86_TRAP_VE, asm_exc_virtualization_exception), +#endif }; /* diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index db8d22a0d003..f9fb6530338f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -62,6 +62,7 @@ #include #include #include +#include #ifdef CONFIG_X86_64 #include @@ -1348,6 +1349,91 @@ DEFINE_IDTENTRY(exc_device_not_available) } } +#ifdef CONFIG_INTEL_TDX_GUEST + +#define VE_FAULT_STR "VE fault" + +static void ve_raise_fault(struct pt_regs *regs, long error_code) +{ + if (user_mode(regs)) { + gp_user_force_sig_segv(regs, X86_TRAP_VE, error_code, VE_FAULT_STR); + return; + } + + if (gp_try_fixup_and_notify(regs, X86_TRAP_VE, error_code, VE_FAULT_STR)) + return; + + die_addr(VE_FAULT_STR, regs, error_code, 0); +} + +/* + * Virtualization Exceptions (#VE) are delivered to TDX guests due to + * specific guest actions which may happen in either user space or the + * kernel: + * + * * Specific instructions (WBINVD, for example) + * * Specific MSR accesses + * * Specific CPUID leaf accesses + * * Access to specific guest physical addresses + * + * In the settings that Linux will run in, virtualization exceptions are + * never generated on accesses to normal, TD-private memory that has been + * accepted. + * + * Syscall entry code has a critical window where the kernel stack is not + * yet set up. Any exception in this window leads to hard to debug issues + * and can be exploited for privilege escalation. Exceptions in the NMI + * entry code also cause issues. Returning from the exception handler with + * IRET will re-enable NMIs and nested NMI will corrupt the NMI stack. + * + * For these reasons, the kernel avoids #VEs during the syscall gap and + * the NMI entry code. Entry code paths do not access TD-shared memory, + * MMIO regions, use #VE triggering MSRs, instructions, or CPUID leaves + * that might generate #VE. VMM can remove memory from TD at any point, + * but access to unaccepted (or missing) private memory leads to VM + * termination, not to #VE. + * + * Similarly to page faults and breakpoints, #VEs are allowed in NMI + * handlers once the kernel is ready to deal with nested NMIs. + * + * During #VE delivery, all interrupts, including NMIs, are blocked until + * TDGETVEINFO is called. It prevents #VE nesting until the kernel reads + * the VE info. + * + * If a guest kernel action which would normally cause a #VE occurs in + * the interrupt-disabled region before TDGETVEINFO, a #DF (fault + * exception) is delivered to the guest which will result in an oops. 
+ * + * The entry code has been audited carefully for following these expectations. + * Changes in the entry code have to be audited for correctness vs. this + * aspect. Similarly to #PF, #VE in these places will expose kernel to + * privilege escalation or may lead to random crashes. + */ +DEFINE_IDTENTRY(exc_virtualization_exception) +{ + struct ve_info ve; + + /* + * NMIs/Machine-checks/Interrupts will be in a disabled state + * till TDGETVEINFO TDCALL is executed. This ensures that VE + * info cannot be overwritten by a nested #VE. + */ + tdx_get_ve_info(&ve); + + cond_local_irq_enable(regs); + + /* + * If tdx_handle_virt_exception() could not process + * it successfully, treat it as #GP(0) and handle it. + */ + if (!tdx_handle_virt_exception(regs, &ve)) + ve_raise_fault(regs, 0); + + cond_local_irq_disable(regs); +} + +#endif + #ifdef CONFIG_X86_32 DEFINE_IDTENTRY_SW(iret_error) { -- cgit v1.2.3-59-g8ed1b From bfe6ed0c672782ac2a8edffac93b1ba84b0ff984 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:17 +0300 Subject: x86/tdx: Add HLT support for TDX guests The HLT instruction is a privileged instruction, executing it stops instruction execution and places the processor in a HALT state. It is used in kernel for cases like reboot, idle loop and exception fixup handlers. For the idle case, interrupts will be enabled (using STI) before the HLT instruction (this is also called safe_halt()). To support the HLT instruction in TDX guests, it needs to be emulated using TDVMCALL (hypercall to VMM). More details about it can be found in Intel Trust Domain Extensions (Intel TDX) Guest-Host-Communication Interface (GHCI) specification, section TDVMCALL[Instruction.HLT]. In TDX guests, executing HLT instruction will generate a #VE, which is used to emulate the HLT instruction. But #VE based emulation will not work for the safe_halt() flavor, because it requires STI instruction to be executed just before the TDCALL. Since idle loop is the only user of safe_halt() variant, handle it as a special case. To avoid *safe_halt() call in the idle function, define the tdx_guest_idle() and use it to override the "x86_idle" function pointer for a valid TDX guest. Alternative choices like PV ops have been considered for adding safe_halt() support. But it was rejected because HLT paravirt calls only exist under PARAVIRT_XXL, and enabling it in TDX guest just for safe_halt() use case is not worth the cost. Co-developed-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Andi Kleen Reviewed-by: Tony Luck Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/20220405232939.73860-9-kirill.shutemov@linux.intel.com --- arch/x86/coco/tdx/tdcall.S | 13 +++++++ arch/x86/coco/tdx/tdx.c | 93 +++++++++++++++++++++++++++++++++++++++++++++- arch/x86/include/asm/tdx.h | 4 ++ arch/x86/kernel/process.c | 4 ++ 4 files changed, 112 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/coco/tdx/tdcall.S b/arch/x86/coco/tdx/tdcall.S index 662479ccf630..245888290bb6 100644 --- a/arch/x86/coco/tdx/tdcall.S +++ b/arch/x86/coco/tdx/tdcall.S @@ -139,6 +139,19 @@ SYM_FUNC_START(__tdx_hypercall) movl $TDVMCALL_EXPOSE_REGS_MASK, %ecx + /* + * For the idle loop STI needs to be called directly before the TDCALL + * that enters idle (EXIT_REASON_HLT case). STI instruction enables + * interrupts only one instruction later. 
If there is a window between + * STI and the instruction that emulates the HALT state, there is a + * chance for interrupts to happen in this window, which can delay the + * HLT operation indefinitely. Since this is the not the desired + * result, conditionally call STI before TDCALL. + */ + testq $TDX_HCALL_ISSUE_STI, %rsi + jz .Lskip_sti + sti +.Lskip_sti: tdcall /* diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index 60a3f2ff5b95..ed7302581cc7 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -7,6 +7,7 @@ #include #include #include +#include /* TDX module Call Leaf IDs */ #define TDX_GET_INFO 1 @@ -36,6 +37,17 @@ void __tdx_hypercall_failed(void) panic("TDVMCALL failed. TDX module bug?"); } +/* + * The TDG.VP.VMCALL-Instruction-execution sub-functions are defined + * independently from but are currently matched 1:1 with VMX EXIT_REASONs. + * Reusing the KVM EXIT_REASON macros makes it easier to connect the host and + * guest sides of these calls. + */ +static u64 hcall_func(u64 exit_reason) +{ + return exit_reason; +} + /* * Used for TDX guests to make calls directly to the TD module. This * should only be used for calls that have no legitimate reason to fail @@ -74,6 +86,62 @@ static u64 get_cc_mask(void) return BIT_ULL(gpa_width - 1); } +static u64 __cpuidle __halt(const bool irq_disabled, const bool do_sti) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = hcall_func(EXIT_REASON_HLT), + .r12 = irq_disabled, + }; + + /* + * Emulate HLT operation via hypercall. More info about ABI + * can be found in TDX Guest-Host-Communication Interface + * (GHCI), section 3.8 TDG.VP.VMCALL. + * + * The VMM uses the "IRQ disabled" param to understand IRQ + * enabled status (RFLAGS.IF) of the TD guest and to determine + * whether or not it should schedule the halted vCPU if an + * IRQ becomes pending. E.g. if IRQs are disabled, the VMM + * can keep the vCPU in virtual HLT, even if an IRQ is + * pending, without hanging/breaking the guest. + */ + return __tdx_hypercall(&args, do_sti ? TDX_HCALL_ISSUE_STI : 0); +} + +static bool handle_halt(void) +{ + /* + * Since non safe halt is mainly used in CPU offlining + * and the guest will always stay in the halt state, don't + * call the STI instruction (set do_sti as false). + */ + const bool irq_disabled = irqs_disabled(); + const bool do_sti = false; + + if (__halt(irq_disabled, do_sti)) + return false; + + return true; +} + +void __cpuidle tdx_safe_halt(void) +{ + /* + * For do_sti=true case, __tdx_hypercall() function enables + * interrupts using the STI instruction before the TDCALL. So + * set irq_disabled as false. + */ + const bool irq_disabled = false; + const bool do_sti = true; + + /* + * Use WARN_ONCE() to report the failure. 
+ */ + if (__halt(irq_disabled, do_sti)) + WARN_ONCE(1, "HLT instruction emulation failed\n"); +} + void tdx_get_ve_info(struct ve_info *ve) { struct tdx_module_output out; @@ -104,11 +172,32 @@ void tdx_get_ve_info(struct ve_info *ve) ve->instr_info = upper_32_bits(out.r10); } +/* Handle the kernel #VE */ +static bool virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve) +{ + switch (ve->exit_reason) { + case EXIT_REASON_HLT: + return handle_halt(); + default: + pr_warn("Unexpected #VE: %lld\n", ve->exit_reason); + return false; + } +} + bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve) { - pr_warn("Unexpected #VE: %lld\n", ve->exit_reason); + bool ret; + + if (user_mode(regs)) + ret = false; + else + ret = virt_exception_kernel(regs, ve); + + /* After successful #VE handling, move the IP */ + if (ret) + regs->ip += ve->instr_len; - return false; + return ret; } void __init tdx_early_init(void) diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index c4142e7b004c..cbd61e142f4e 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -14,6 +14,7 @@ #define TDX_HYPERCALL_STANDARD 0 #define TDX_HCALL_HAS_OUTPUT BIT(0) +#define TDX_HCALL_ISSUE_STI BIT(1) /* * SW-defined error codes. @@ -91,9 +92,12 @@ void tdx_get_ve_info(struct ve_info *ve); bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve); +void tdx_safe_halt(void); + #else static inline void tdx_early_init(void) { }; +static inline void tdx_safe_halt(void) { }; #endif /* CONFIG_INTEL_TDX_GUEST */ diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index b370767f5b19..dbaf12c43fe1 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -46,6 +46,7 @@ #include #include #include +#include #include "process.h" @@ -873,6 +874,9 @@ void select_idle_routine(const struct cpuinfo_x86 *c) } else if (prefer_mwait_c1_over_halt(c)) { pr_info("using mwait in idle threads\n"); x86_idle = mwait_idle; + } else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { + pr_info("using TDX aware idle routine\n"); + x86_idle = tdx_safe_halt; } else x86_idle = default_idle; } -- cgit v1.2.3-59-g8ed1b From 4b05f81504bfcaab51083d3a271fada75e6b8904 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Wed, 6 Apr 2022 02:29:21 +0300 Subject: x86/tdx: Detect TDX at early kernel decompression time The early decompression code does port I/O for its console output. But, handling the decompression-time port I/O demands a different approach from normal runtime because the IDT required to support #VE based port I/O emulation is not yet set up. Paravirtualizing I/O calls during the decompression step is acceptable because the decompression code doesn't have a lot of call sites to IO instruction. To support port I/O in decompression code, TDX must be detected before the decompression code might do port I/O. Detect whether the kernel runs in a TDX guest. Add an early_is_tdx_guest() interface to query the cached TDX guest status in the decompression code. TDX is detected with CPUID. Make cpuid_count() accessible outside boot/cpuflags.c. TDX detection in the main kernel is very similar. Move common bits into . The actual port I/O paravirtualization will come later in the series. Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Dave Hansen Reviewed-by: Tony Luck Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/20220405232939.73860-13-kirill.shutemov@linux.intel.com --- arch/x86/boot/compressed/Makefile | 1 + arch/x86/boot/compressed/misc.c | 8 ++++++++ arch/x86/boot/compressed/misc.h | 2 ++ arch/x86/boot/compressed/tdx.c | 16 ++++++++++++++++ arch/x86/boot/compressed/tdx.h | 13 +++++++++++++ arch/x86/boot/cpuflags.c | 3 +-- arch/x86/boot/cpuflags.h | 1 + arch/x86/include/asm/shared/tdx.h | 8 ++++++++ arch/x86/include/asm/tdx.h | 4 +--- 9 files changed, 51 insertions(+), 5 deletions(-) create mode 100644 arch/x86/boot/compressed/tdx.c create mode 100644 arch/x86/boot/compressed/tdx.h create mode 100644 arch/x86/include/asm/shared/tdx.h (limited to 'arch/x86/include') diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 6115274fe10f..732f6b21ecbd 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -101,6 +101,7 @@ ifdef CONFIG_X86_64 endif vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o +vmlinux-objs-$(CONFIG_INTEL_TDX_GUEST) += $(obj)/tdx.o vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o efi-obj-$(CONFIG_EFI_STUB) = $(objtree)/drivers/firmware/efi/libstub/lib.a diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 1cdcaf34ee36..e8142e977ddb 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -371,6 +371,14 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, lines = boot_params->screen_info.orig_video_lines; cols = boot_params->screen_info.orig_video_cols; + /* + * Detect TDX guest environment. + * + * It has to be done before console_init() in order to use + * paravirtualized port I/O operations if needed. 
+ */ + early_tdx_detect(); + console_init(); /* diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 16ed360b6692..0d8e275a9d96 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -28,6 +28,8 @@ #include #include +#include "tdx.h" + #define BOOT_CTYPE_H #include diff --git a/arch/x86/boot/compressed/tdx.c b/arch/x86/boot/compressed/tdx.c new file mode 100644 index 000000000000..5f6d01a2f1f4 --- /dev/null +++ b/arch/x86/boot/compressed/tdx.c @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "../cpuflags.h" +#include "../string.h" + +#include + +void early_tdx_detect(void) +{ + u32 eax, sig[3]; + + cpuid_count(TDX_CPUID_LEAF_ID, 0, &eax, &sig[0], &sig[2], &sig[1]); + + if (memcmp(TDX_IDENT, sig, sizeof(sig))) + return; +} diff --git a/arch/x86/boot/compressed/tdx.h b/arch/x86/boot/compressed/tdx.h new file mode 100644 index 000000000000..9055482cd35c --- /dev/null +++ b/arch/x86/boot/compressed/tdx.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef BOOT_COMPRESSED_TDX_H +#define BOOT_COMPRESSED_TDX_H + +#include + +#ifdef CONFIG_INTEL_TDX_GUEST +void early_tdx_detect(void); +#else +static inline void early_tdx_detect(void) { }; +#endif + +#endif /* BOOT_COMPRESSED_TDX_H */ diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c index a0b75f73dc63..a83d67ec627d 100644 --- a/arch/x86/boot/cpuflags.c +++ b/arch/x86/boot/cpuflags.c @@ -71,8 +71,7 @@ int has_eflag(unsigned long mask) # define EBX_REG "=b" #endif -static inline void cpuid_count(u32 id, u32 count, - u32 *a, u32 *b, u32 *c, u32 *d) +void cpuid_count(u32 id, u32 count, u32 *a, u32 *b, u32 *c, u32 *d) { asm volatile(".ifnc %%ebx,%3 ; movl %%ebx,%3 ; .endif \n\t" "cpuid \n\t" diff --git a/arch/x86/boot/cpuflags.h b/arch/x86/boot/cpuflags.h index 2e20814d3ce3..475b8fde90f7 100644 --- a/arch/x86/boot/cpuflags.h +++ b/arch/x86/boot/cpuflags.h @@ -17,5 +17,6 @@ extern u32 cpu_vendor[3]; int has_eflag(unsigned long mask); void get_cpuflags(void); +void cpuid_count(u32 id, u32 count, u32 *a, u32 *b, u32 *c, u32 *d); #endif diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h new file mode 100644 index 000000000000..8209ba9ffe1a --- /dev/null +++ b/arch/x86/include/asm/shared/tdx.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_SHARED_TDX_H +#define _ASM_X86_SHARED_TDX_H + +#define TDX_CPUID_LEAF_ID 0x21 +#define TDX_IDENT "IntelTDX " + +#endif /* _ASM_X86_SHARED_TDX_H */ diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index cbd61e142f4e..81a1ec14e476 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -7,9 +7,7 @@ #include #include #include - -#define TDX_CPUID_LEAF_ID 0x21 -#define TDX_IDENT "IntelTDX " +#include #define TDX_HYPERCALL_STANDARD 0 -- cgit v1.2.3-59-g8ed1b From 15104de122a4f0258981b06ed94cf616a6eb03ef Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:22 +0300 Subject: x86: Adjust types used in port I/O helpers Change port I/O helpers to use u8/u16/u32 instead of unsigned char/short/int for values. Use u16 instead of int for port number. It aligns the helpers with implementation in boot stub in preparation for consolidation. Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Dave Hansen Reviewed-by: Dave Hansen Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20220405232939.73860-14-kirill.shutemov@linux.intel.com --- arch/x86/include/asm/io.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index f6d91ecb8026..638c1a2a82e0 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -258,37 +258,37 @@ static inline void slow_down_io(void) #endif #define BUILDIO(bwl, bw, type) \ -static inline void out##bwl(unsigned type value, int port) \ +static inline void out##bwl(type value, u16 port) \ { \ asm volatile("out" #bwl " %" #bw "0, %w1" \ : : "a"(value), "Nd"(port)); \ } \ \ -static inline unsigned type in##bwl(int port) \ +static inline type in##bwl(u16 port) \ { \ - unsigned type value; \ + type value; \ asm volatile("in" #bwl " %w1, %" #bw "0" \ : "=a"(value) : "Nd"(port)); \ return value; \ } \ \ -static inline void out##bwl##_p(unsigned type value, int port) \ +static inline void out##bwl##_p(type value, u16 port) \ { \ out##bwl(value, port); \ slow_down_io(); \ } \ \ -static inline unsigned type in##bwl##_p(int port) \ +static inline type in##bwl##_p(u16 port) \ { \ - unsigned type value = in##bwl(port); \ + type value = in##bwl(port); \ slow_down_io(); \ return value; \ } \ \ -static inline void outs##bwl(int port, const void *addr, unsigned long count) \ +static inline void outs##bwl(u16 port, const void *addr, unsigned long count) \ { \ if (cc_platform_has(CC_ATTR_GUEST_UNROLL_STRING_IO)) { \ - unsigned type *value = (unsigned type *)addr; \ + type *value = (type *)addr; \ while (count) { \ out##bwl(*value, port); \ value++; \ @@ -301,10 +301,10 @@ static inline void outs##bwl(int port, const void *addr, unsigned long count) \ } \ } \ \ -static inline void ins##bwl(int port, void *addr, unsigned long count) \ +static inline void ins##bwl(u16 port, void *addr, unsigned long count) \ { \ if (cc_platform_has(CC_ATTR_GUEST_UNROLL_STRING_IO)) { \ - unsigned type *value = (unsigned type *)addr; \ + type *value = (type *)addr; \ while (count) { \ *value = in##bwl(port); \ value++; \ @@ -317,9 +317,9 @@ static inline void ins##bwl(int port, void *addr, unsigned long count) \ } \ } -BUILDIO(b, b, char) -BUILDIO(w, w, short) -BUILDIO(l, , int) +BUILDIO(b, b, u8) +BUILDIO(w, w, u16) +BUILDIO(l, , u32) #define inb inb #define inw inw -- cgit v1.2.3-59-g8ed1b From 1e8f93e18379d05da9fd130eb7d50988a20f8b9a Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:23 +0300 Subject: x86: Consolidate port I/O helpers There are two implementations of port I/O helpers: one in the kernel and one in the boot stub. Move the helpers required for both to and use the one implementation everywhere. Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20220405232939.73860-15-kirill.shutemov@linux.intel.com --- arch/x86/boot/boot.h | 35 +---------------------------------- arch/x86/boot/compressed/misc.h | 2 +- arch/x86/include/asm/io.h | 22 ++-------------------- arch/x86/include/asm/shared/io.h | 34 ++++++++++++++++++++++++++++++++++ 4 files changed, 38 insertions(+), 55 deletions(-) create mode 100644 arch/x86/include/asm/shared/io.h (limited to 'arch/x86/include') diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 34c9dbb6a47d..22a474c5b3e8 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -23,6 +23,7 @@ #include #include #include +#include #include "bitops.h" #include "ctype.h" #include "cpuflags.h" @@ -35,40 +36,6 @@ extern struct boot_params boot_params; #define cpu_relax() asm volatile("rep; nop") -/* Basic port I/O */ -static inline void outb(u8 v, u16 port) -{ - asm volatile("outb %0,%1" : : "a" (v), "dN" (port)); -} -static inline u8 inb(u16 port) -{ - u8 v; - asm volatile("inb %1,%0" : "=a" (v) : "dN" (port)); - return v; -} - -static inline void outw(u16 v, u16 port) -{ - asm volatile("outw %0,%1" : : "a" (v), "dN" (port)); -} -static inline u16 inw(u16 port) -{ - u16 v; - asm volatile("inw %1,%0" : "=a" (v) : "dN" (port)); - return v; -} - -static inline void outl(u32 v, u16 port) -{ - asm volatile("outl %0,%1" : : "a" (v), "dN" (port)); -} -static inline u32 inl(u16 port) -{ - u32 v; - asm volatile("inl %1,%0" : "=a" (v) : "dN" (port)); - return v; -} - static inline void io_delay(void) { const u16 DELAY_PORT = 0x80; diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 0d8e275a9d96..8a253e85f990 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -22,11 +22,11 @@ #include #include #include -#include #include #include #include #include +#include #include "tdx.h" diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 638c1a2a82e0..a1eb218a49f8 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -44,6 +44,7 @@ #include #include #include +#include #define build_mmio_read(name, size, type, reg, barrier) \ static inline type name(const volatile void __iomem *addr) \ @@ -258,20 +259,6 @@ static inline void slow_down_io(void) #endif #define BUILDIO(bwl, bw, type) \ -static inline void out##bwl(type value, u16 port) \ -{ \ - asm volatile("out" #bwl " %" #bw "0, %w1" \ - : : "a"(value), "Nd"(port)); \ -} \ - \ -static inline type in##bwl(u16 port) \ -{ \ - type value; \ - asm volatile("in" #bwl " %w1, %" #bw "0" \ - : "=a"(value) : "Nd"(port)); \ - return value; \ -} \ - \ static inline void out##bwl##_p(type value, u16 port) \ { \ out##bwl(value, port); \ @@ -320,10 +307,8 @@ static inline void ins##bwl(u16 port, void *addr, unsigned long count) \ BUILDIO(b, b, u8) BUILDIO(w, w, u16) BUILDIO(l, , u32) +#undef BUILDIO -#define inb inb -#define inw inw -#define inl inl #define inb_p inb_p #define inw_p inw_p #define inl_p inl_p @@ -331,9 +316,6 @@ BUILDIO(l, , u32) #define insw insw #define insl insl -#define outb outb -#define outw outw -#define outl outl #define outb_p outb_p #define outw_p outw_p #define outl_p outl_p diff --git a/arch/x86/include/asm/shared/io.h b/arch/x86/include/asm/shared/io.h new file mode 100644 index 000000000000..c0ef921c0586 --- /dev/null +++ b/arch/x86/include/asm/shared/io.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_SHARED_IO_H +#define 
_ASM_X86_SHARED_IO_H + +#include + +#define BUILDIO(bwl, bw, type) \ +static inline void __out##bwl(type value, u16 port) \ +{ \ + asm volatile("out" #bwl " %" #bw "0, %w1" \ + : : "a"(value), "Nd"(port)); \ +} \ + \ +static inline type __in##bwl(u16 port) \ +{ \ + type value; \ + asm volatile("in" #bwl " %w1, %" #bw "0" \ + : "=a"(value) : "Nd"(port)); \ + return value; \ +} + +BUILDIO(b, b, u8) +BUILDIO(w, w, u16) +BUILDIO(l, , u32) +#undef BUILDIO + +#define inb __inb +#define inw __inw +#define inl __inl +#define outb __outb +#define outw __outw +#define outl __outl + +#endif -- cgit v1.2.3-59-g8ed1b From 4c5b9aac6cade51aef64cc6ed67f2ad5acda9aed Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:25 +0300 Subject: x86/boot: Port I/O: Add decompression-time support for TDX Port I/O instructions trigger #VE in the TDX environment. In response to the exception, kernel emulates these instructions using hypercalls. But during early boot, on the decompression stage, it is cumbersome to deal with #VE. It is cleaner to go to hypercalls directly, bypassing #VE handling. Hook up TDX-specific port I/O helpers if booting in TDX environment. Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/20220405232939.73860-17-kirill.shutemov@linux.intel.com --- arch/x86/boot/compressed/Makefile | 2 +- arch/x86/boot/compressed/tdcall.S | 3 ++ arch/x86/boot/compressed/tdx.c | 61 +++++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/shared/tdx.h | 32 ++++++++++++++++++++ arch/x86/include/asm/tdx.h | 27 ----------------- 5 files changed, 97 insertions(+), 28 deletions(-) create mode 100644 arch/x86/boot/compressed/tdcall.S (limited to 'arch/x86/include') diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 732f6b21ecbd..8fd0e6ae2e1f 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -101,7 +101,7 @@ ifdef CONFIG_X86_64 endif vmlinux-objs-$(CONFIG_ACPI) += $(obj)/acpi.o -vmlinux-objs-$(CONFIG_INTEL_TDX_GUEST) += $(obj)/tdx.o +vmlinux-objs-$(CONFIG_INTEL_TDX_GUEST) += $(obj)/tdx.o $(obj)/tdcall.o vmlinux-objs-$(CONFIG_EFI_MIXED) += $(obj)/efi_thunk_$(BITS).o efi-obj-$(CONFIG_EFI_STUB) = $(objtree)/drivers/firmware/efi/libstub/lib.a diff --git a/arch/x86/boot/compressed/tdcall.S b/arch/x86/boot/compressed/tdcall.S new file mode 100644 index 000000000000..46d0495e0d3a --- /dev/null +++ b/arch/x86/boot/compressed/tdcall.S @@ -0,0 +1,3 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include "../../coco/tdx/tdcall.S" diff --git a/arch/x86/boot/compressed/tdx.c b/arch/x86/boot/compressed/tdx.c index 5f6d01a2f1f4..918a7606f53c 100644 --- a/arch/x86/boot/compressed/tdx.c +++ b/arch/x86/boot/compressed/tdx.c @@ -2,9 +2,65 @@ #include "../cpuflags.h" #include "../string.h" +#include "../io.h" +#include "error.h" + +#include +#include #include +/* Called from __tdx_hypercall() for unrecoverable failure */ +void __tdx_hypercall_failed(void) +{ + error("TDVMCALL failed. 
TDX module bug?"); +} + +static inline unsigned int tdx_io_in(int size, u16 port) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = EXIT_REASON_IO_INSTRUCTION, + .r12 = size, + .r13 = 0, + .r14 = port, + }; + + if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT)) + return UINT_MAX; + + return args.r11; +} + +static inline void tdx_io_out(int size, u16 port, u32 value) +{ + struct tdx_hypercall_args args = { + .r10 = TDX_HYPERCALL_STANDARD, + .r11 = EXIT_REASON_IO_INSTRUCTION, + .r12 = size, + .r13 = 1, + .r14 = port, + .r15 = value, + }; + + __tdx_hypercall(&args, 0); +} + +static inline u8 tdx_inb(u16 port) +{ + return tdx_io_in(1, port); +} + +static inline void tdx_outb(u8 value, u16 port) +{ + tdx_io_out(1, port, value); +} + +static inline void tdx_outw(u16 value, u16 port) +{ + tdx_io_out(2, port, value); +} + void early_tdx_detect(void) { u32 eax, sig[3]; @@ -13,4 +69,9 @@ void early_tdx_detect(void) if (memcmp(TDX_IDENT, sig, sizeof(sig))) return; + + /* Use hypercalls instead of I/O instructions */ + pio_ops.f_inb = tdx_inb; + pio_ops.f_outb = tdx_outb; + pio_ops.f_outw = tdx_outw; } diff --git a/arch/x86/include/asm/shared/tdx.h b/arch/x86/include/asm/shared/tdx.h index 8209ba9ffe1a..e53f26228fbb 100644 --- a/arch/x86/include/asm/shared/tdx.h +++ b/arch/x86/include/asm/shared/tdx.h @@ -2,7 +2,39 @@ #ifndef _ASM_X86_SHARED_TDX_H #define _ASM_X86_SHARED_TDX_H +#include +#include + +#define TDX_HYPERCALL_STANDARD 0 + +#define TDX_HCALL_HAS_OUTPUT BIT(0) +#define TDX_HCALL_ISSUE_STI BIT(1) + #define TDX_CPUID_LEAF_ID 0x21 #define TDX_IDENT "IntelTDX " +#ifndef __ASSEMBLY__ + +/* + * Used in __tdx_hypercall() to pass down and get back registers' values of + * the TDCALL instruction when requesting services from the VMM. + * + * This is a software only structure and not part of the TDX module/VMM ABI. + */ +struct tdx_hypercall_args { + u64 r10; + u64 r11; + u64 r12; + u64 r13; + u64 r14; + u64 r15; +}; + +/* Used to request services from the VMM */ +u64 __tdx_hypercall(struct tdx_hypercall_args *args, unsigned long flags); + +/* Called from __tdx_hypercall() for unrecoverable failure */ +void __tdx_hypercall_failed(void); + +#endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_SHARED_TDX_H */ diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 81a1ec14e476..7944fd1ae07d 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -3,17 +3,11 @@ #ifndef _ASM_X86_TDX_H #define _ASM_X86_TDX_H -#include #include #include #include #include -#define TDX_HYPERCALL_STANDARD 0 - -#define TDX_HCALL_HAS_OUTPUT BIT(0) -#define TDX_HCALL_ISSUE_STI BIT(1) - /* * SW-defined error codes. * @@ -41,21 +35,6 @@ struct tdx_module_output { u64 r11; }; -/* - * Used in __tdx_hypercall() to pass down and get back registers' values of - * the TDCALL instruction when requesting services from the VMM. - * - * This is a software only structure and not part of the TDX module/VMM ABI. - */ -struct tdx_hypercall_args { - u64 r10; - u64 r11; - u64 r12; - u64 r13; - u64 r14; - u64 r15; -}; - /* * Used by the #VE exception handler to gather the #VE exception * info from the TDX module. 
This is a software only structure @@ -80,12 +59,6 @@ void __init tdx_early_init(void); u64 __tdx_module_call(u64 fn, u64 rcx, u64 rdx, u64 r8, u64 r9, struct tdx_module_output *out); -/* Used to request services from the VMM */ -u64 __tdx_hypercall(struct tdx_hypercall_args *args, unsigned long flags); - -/* Called from __tdx_hypercall() for unrecoverable failure */ -void __tdx_hypercall_failed(void); - void tdx_get_ve_info(struct ve_info *ve); bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve); -- cgit v1.2.3-59-g8ed1b From 32e72854fa5fef6bc72e27c54f31897db9092acb Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 6 Apr 2022 02:29:27 +0300 Subject: x86/tdx: Port I/O: Add early boot support TDX guests cannot do port I/O directly. The TDX module triggers a #VE exception to let the guest kernel emulate port I/O by converting them into TDCALLs to call the host. But before IDT handlers are set up, port I/O cannot be emulated using normal kernel #VE handlers. To support the #VE-based emulation during this boot window, add a minimal early #VE handler support in early exception handlers. This is similar to what AMD SEV does. This is mainly to support earlyprintk's serial driver, as well as potentially the VGA driver. The early handler only supports I/O-related #VE exceptions. Unhandled or failed exceptions will be handled via early_fixup_exceptions() (like normal exception failures). At runtime I/O-related #VE exceptions (along with other types) handled by virt_exception_kernel(). Signed-off-by: Andi Kleen Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Dan Williams Reviewed-by: Thomas Gleixner Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/20220405232939.73860-19-kirill.shutemov@linux.intel.com --- arch/x86/coco/tdx/tdx.c | 16 ++++++++++++++++ arch/x86/include/asm/tdx.h | 4 ++++ arch/x86/kernel/head64.c | 3 +++ 3 files changed, 23 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index e47e2ed6b03e..cc14b7c0c157 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -418,6 +418,22 @@ static bool handle_io(struct pt_regs *regs, u32 exit_qual) return handle_out(regs, size, port); } +/* + * Early #VE exception handler. Only handles a subset of port I/O. + * Intended only for earlyprintk. If failed, return false. 
+ */ +__init bool tdx_early_handle_ve(struct pt_regs *regs) +{ + struct ve_info ve; + + tdx_get_ve_info(&ve); + + if (ve.exit_reason != EXIT_REASON_IO_INSTRUCTION) + return false; + + return handle_io(regs, ve.exit_qual); +} + void tdx_get_ve_info(struct ve_info *ve) { struct tdx_module_output out; diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 7944fd1ae07d..9ffd0d2e6e0f 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -65,11 +65,15 @@ bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve); void tdx_safe_halt(void); +bool tdx_early_handle_ve(struct pt_regs *regs); + #else static inline void tdx_early_init(void) { }; static inline void tdx_safe_halt(void) { }; +static inline bool tdx_early_handle_ve(struct pt_regs *regs) { return false; } + #endif /* CONFIG_INTEL_TDX_GUEST */ #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 6dff50c3edd6..ecbf50e5b8e0 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -417,6 +417,9 @@ void __init do_early_exception(struct pt_regs *regs, int trapnr) trapnr == X86_TRAP_VC && handle_vc_boot_ghcb(regs)) return; + if (trapnr == X86_TRAP_VE && tdx_early_handle_ve(regs)) + return; + early_fixup_exception(regs, trapnr); } -- cgit v1.2.3-59-g8ed1b From cfb8ec7a31f234b4519c104f1cc9accbc8b393a9 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Wed, 6 Apr 2022 02:29:28 +0300 Subject: x86/tdx: Wire up KVM hypercalls KVM hypercalls use the VMCALL or VMMCALL instructions. Although the ABI is similar, those instructions no longer function for TDX guests. Make vendor-specific TDVMCALLs instead of VMCALL. This enables TDX guests to run with KVM acting as the hypervisor. Among other things, KVM hypercall is used to send IPIs. Since the KVM driver can be built as a kernel module, export tdx_kvm_hypercall() to make the symbols visible to kvm.ko. Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20220405232939.73860-20-kirill.shutemov@linux.intel.com --- arch/x86/coco/tdx/tdx.c | 17 +++++++++++++++++ arch/x86/include/asm/kvm_para.h | 22 ++++++++++++++++++++++ arch/x86/include/asm/tdx.h | 11 +++++++++++ 3 files changed, 50 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index cc14b7c0c157..f50f530aff5f 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -64,6 +64,23 @@ static u64 hcall_func(u64 exit_reason) return exit_reason; } +#ifdef CONFIG_KVM_GUEST +long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, unsigned long p2, + unsigned long p3, unsigned long p4) +{ + struct tdx_hypercall_args args = { + .r10 = nr, + .r11 = p1, + .r12 = p2, + .r13 = p3, + .r14 = p4, + }; + + return __tdx_hypercall(&args, 0); +} +EXPORT_SYMBOL_GPL(tdx_kvm_hypercall); +#endif + /* * Used for TDX guests to make calls directly to the TD module. 
This * should only be used for calls that have no legitimate reason to fail diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 56935ebb1dfe..57bc74e112f2 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -7,6 +7,8 @@ #include #include +#include + #ifdef CONFIG_KVM_GUEST bool kvm_check_and_clear_guest_paused(void); #else @@ -32,6 +34,10 @@ static inline bool kvm_check_and_clear_guest_paused(void) static inline long kvm_hypercall0(unsigned int nr) { long ret; + + if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) + return tdx_kvm_hypercall(nr, 0, 0, 0, 0); + asm volatile(KVM_HYPERCALL : "=a"(ret) : "a"(nr) @@ -42,6 +48,10 @@ static inline long kvm_hypercall0(unsigned int nr) static inline long kvm_hypercall1(unsigned int nr, unsigned long p1) { long ret; + + if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) + return tdx_kvm_hypercall(nr, p1, 0, 0, 0); + asm volatile(KVM_HYPERCALL : "=a"(ret) : "a"(nr), "b"(p1) @@ -53,6 +63,10 @@ static inline long kvm_hypercall2(unsigned int nr, unsigned long p1, unsigned long p2) { long ret; + + if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) + return tdx_kvm_hypercall(nr, p1, p2, 0, 0); + asm volatile(KVM_HYPERCALL : "=a"(ret) : "a"(nr), "b"(p1), "c"(p2) @@ -64,6 +78,10 @@ static inline long kvm_hypercall3(unsigned int nr, unsigned long p1, unsigned long p2, unsigned long p3) { long ret; + + if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) + return tdx_kvm_hypercall(nr, p1, p2, p3, 0); + asm volatile(KVM_HYPERCALL : "=a"(ret) : "a"(nr), "b"(p1), "c"(p2), "d"(p3) @@ -76,6 +94,10 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, unsigned long p4) { long ret; + + if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) + return tdx_kvm_hypercall(nr, p1, p2, p3, p4); + asm volatile(KVM_HYPERCALL : "=a"(ret) : "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4) diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 9ffd0d2e6e0f..020c81a7c729 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -76,5 +76,16 @@ static inline bool tdx_early_handle_ve(struct pt_regs *regs) { return false; } #endif /* CONFIG_INTEL_TDX_GUEST */ +#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_INTEL_TDX_GUEST) +long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, unsigned long p2, + unsigned long p3, unsigned long p4); +#else +static inline long tdx_kvm_hypercall(unsigned int nr, unsigned long p1, + unsigned long p2, unsigned long p3, + unsigned long p4) +{ + return -ENODEV; +} +#endif /* CONFIG_INTEL_TDX_GUEST && CONFIG_KVM_GUEST */ #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_TDX_H */ -- cgit v1.2.3-59-g8ed1b From ff2e64684f153a61f6ae553e860af4b6ef2f4ca5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 6 Apr 2022 02:29:29 +0300 Subject: x86/boot: Add a trampoline for booting APs via firmware handoff Historically, x86 platforms have booted secondary processors (APs) using INIT followed by the start up IPI (SIPI) messages. In regular VMs, this boot sequence is supported by the VMM emulation. But such a wakeup model is fatal for secure VMs like TDX in which VMM is an untrusted entity. To address this issue, a new wakeup model was added in ACPI v6.4, in which firmware (like TDX virtual BIOS) will help boot the APs. More details about this wakeup model can be found in ACPI specification v6.4, the section titled "Multiprocessor Wakeup Structure". 
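In outline, the OS side of that handoff (wired up by the MADT parsing patch
later in this series) only has to fill in the firmware-provided mailbox and
wait for the firmware to acknowledge. A rough sketch, in which the wake_ap()
wrapper is a made-up name and the mailbox fields follow
struct acpi_madt_multiproc_wakeup_mailbox as used by that later patch:

    /* Sketch of the mailbox-based AP wakeup described above. */
    static int wake_ap(struct acpi_madt_multiproc_wakeup_mailbox *mbox,
                       u32 apicid, u64 start_ip)
    {
            mbox->apic_id = apicid;
            mbox->wakeup_vector = start_ip;

            /* Publish the command only after apic_id/wakeup_vector are visible. */
            smp_store_release(&mbox->command, ACPI_MP_WAKE_COMMAND_WAKEUP);

            /* Firmware acknowledges the wakeup by clearing ->command. */
            while (READ_ONCE(mbox->command))
                    cpu_relax();

            return 0;
    }
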
Since the existing trampoline code requires processors to boot in real mode with 16-bit addressing, it will not work for this wakeup model (because it boots the AP in 64-bit mode). To handle it, extend the trampoline code to support 64-bit mode firmware handoff. Also, extend IDT and GDT pointers to support 64-bit mode hand off. There is no TDX-specific detection for this new boot method. The kernel will rely on it as the sole boot method whenever the new ACPI structure is present. The ACPI table parser for the MADT multiprocessor wake up structure and the wakeup method that uses this structure will be added by the following patch in this series. Signed-off-by: Sean Christopherson Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Andi Kleen Reviewed-by: Dan Williams Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20220405232939.73860-21-kirill.shutemov@linux.intel.com --- arch/x86/include/asm/apic.h | 2 ++ arch/x86/include/asm/realmode.h | 1 + arch/x86/kernel/smpboot.c | 12 ++++++++-- arch/x86/realmode/rm/header.S | 1 + arch/x86/realmode/rm/trampoline_64.S | 38 ++++++++++++++++++++++++++++++++ arch/x86/realmode/rm/trampoline_common.S | 12 +++++++++- 6 files changed, 63 insertions(+), 3 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 48067af94678..35006e151774 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -328,6 +328,8 @@ struct apic { /* wakeup_secondary_cpu */ int (*wakeup_secondary_cpu)(int apicid, unsigned long start_eip); + /* wakeup secondary CPU using 64-bit wakeup point */ + int (*wakeup_secondary_cpu_64)(int apicid, unsigned long start_eip); void (*inquire_remote_apic)(int apicid); diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index 331474b150f1..fd6f6e5b755a 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -25,6 +25,7 @@ struct real_mode_header { u32 sev_es_trampoline_start; #endif #ifdef CONFIG_X86_64 + u32 trampoline_start64; u32 trampoline_pgd; #endif /* ACPI S3 wakeup */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 2ef14772dc04..870cc5d203b1 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1082,6 +1082,11 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, unsigned long boot_error = 0; unsigned long timeout; +#ifdef CONFIG_X86_64 + /* If 64-bit wakeup method exists, use the 64-bit mode trampoline IP */ + if (apic->wakeup_secondary_cpu_64) + start_ip = real_mode_header->trampoline_start64; +#endif idle->thread.sp = (unsigned long)task_pt_regs(idle); early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu); initial_code = (unsigned long)start_secondary; @@ -1123,11 +1128,14 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle, /* * Wake up a CPU in difference cases: - * - Use the method in the APIC driver if it's defined + * - Use a method from the APIC driver if one defined, with wakeup + * straight to 64-bit mode preferred over wakeup to RM. * Otherwise, * - Use an INIT boot APIC message for APs or NMI for BSP. 
*/ - if (apic->wakeup_secondary_cpu) + if (apic->wakeup_secondary_cpu_64) + boot_error = apic->wakeup_secondary_cpu_64(apicid, start_ip); + else if (apic->wakeup_secondary_cpu) boot_error = apic->wakeup_secondary_cpu(apicid, start_ip); else boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid, diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S index 8c1db5bf5d78..2eb62be6d256 100644 --- a/arch/x86/realmode/rm/header.S +++ b/arch/x86/realmode/rm/header.S @@ -24,6 +24,7 @@ SYM_DATA_START(real_mode_header) .long pa_sev_es_trampoline_start #endif #ifdef CONFIG_X86_64 + .long pa_trampoline_start64 .long pa_trampoline_pgd; #endif /* ACPI S3 wakeup */ diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index cc8391f86cdb..ae112a91592f 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -161,6 +161,19 @@ SYM_CODE_START(startup_32) ljmpl $__KERNEL_CS, $pa_startup_64 SYM_CODE_END(startup_32) +SYM_CODE_START(pa_trampoline_compat) + /* + * In compatibility mode. Prep ESP and DX for startup_32, then disable + * paging and complete the switch to legacy 32-bit mode. + */ + movl $rm_stack_end, %esp + movw $__KERNEL_DS, %dx + + movl $X86_CR0_PE, %eax + movl %eax, %cr0 + ljmpl $__KERNEL32_CS, $pa_startup_32 +SYM_CODE_END(pa_trampoline_compat) + .section ".text64","ax" .code64 .balign 4 @@ -169,6 +182,20 @@ SYM_CODE_START(startup_64) jmpq *tr_start(%rip) SYM_CODE_END(startup_64) +SYM_CODE_START(trampoline_start64) + /* + * APs start here on a direct transfer from 64-bit BIOS with identity + * mapped page tables. Load the kernel's GDT in order to gear down to + * 32-bit mode (to handle 4-level vs. 5-level paging), and to (re)load + * segment registers. Load the zero IDT so any fault triggers a + * shutdown instead of jumping back into BIOS. + */ + lidt tr_idt(%rip) + lgdt tr_gdt64(%rip) + + ljmpl *tr_compat(%rip) +SYM_CODE_END(trampoline_start64) + .section ".rodata","a" # Duplicate the global descriptor table # so the kernel can live anywhere @@ -182,6 +209,17 @@ SYM_DATA_START(tr_gdt) .quad 0x00cf93000000ffff # __KERNEL_DS SYM_DATA_END_LABEL(tr_gdt, SYM_L_LOCAL, tr_gdt_end) +SYM_DATA_START(tr_gdt64) + .short tr_gdt_end - tr_gdt - 1 # gdt limit + .long pa_tr_gdt + .long 0 +SYM_DATA_END(tr_gdt64) + +SYM_DATA_START(tr_compat) + .long pa_trampoline_compat + .short __KERNEL32_CS +SYM_DATA_END(tr_compat) + .bss .balign PAGE_SIZE SYM_DATA(trampoline_pgd, .space PAGE_SIZE) diff --git a/arch/x86/realmode/rm/trampoline_common.S b/arch/x86/realmode/rm/trampoline_common.S index 5033e640f957..4331c32c47f8 100644 --- a/arch/x86/realmode/rm/trampoline_common.S +++ b/arch/x86/realmode/rm/trampoline_common.S @@ -1,4 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0 */ .section ".rodata","a" .balign 16 -SYM_DATA_LOCAL(tr_idt, .fill 1, 6, 0) + +/* + * When a bootloader hands off to the kernel in 32-bit mode an + * IDT with a 2-byte limit and 4-byte base is needed. When a boot + * loader hands off to a kernel 64-bit mode the base address + * extends to 8-bytes. Reserve enough space for either scenario. + */ +SYM_DATA_START_LOCAL(tr_idt) + .short 0 + .quad 0 +SYM_DATA_END(tr_idt) -- cgit v1.2.3-59-g8ed1b From f39642d0dbacded8b4a816a9197a73efb74e5702 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Wed, 6 Apr 2022 02:29:30 +0300 Subject: x86/acpi/x86/boot: Add multiprocessor wake-up support Secondary CPU startup is currently performed with something called the "INIT/SIPI protocol". 
This protocol requires assistance from VMMs to boot guests. As should be a
familiar story by now, that support can not be provided to TDX guests because
TDX VMMs are not trusted by guests.

To remedy this situation a new[1] "Multiprocessor Wakeup Structure" has been
added to an existing ACPI table (MADT). This structure provides the physical
address of a "mailbox". A write to the mailbox then steers the secondary CPU
to the boot code.

Add ACPI MADT wake structure parsing support and wake support. Use this
support to wake CPUs whenever it is present instead of INIT/SIPI.

While this structure can theoretically be used on 32-bit kernels, there are
no 32-bit TDX guest kernels. It has not been tested and can not practically
*be* tested on 32-bit. Make it 64-bit only.

1. Details about the new structure can be found in ACPI v6.4, in the
"Multiprocessor Wakeup Structure" section.

Co-developed-by: Sean Christopherson
Signed-off-by: Sean Christopherson
Signed-off-by: Kuppuswamy Sathyanarayanan
Signed-off-by: Kirill A. Shutemov
Signed-off-by: Dave Hansen
Reviewed-by: Andi Kleen
Reviewed-by: Rafael J. Wysocki
Reviewed-by: Dave Hansen
Link: https://lkml.kernel.org/r/20220405232939.73860-22-kirill.shutemov@linux.intel.com
---
 arch/x86/include/asm/apic.h | 5 +++
 arch/x86/kernel/acpi/boot.c | 93 ++++++++++++++++++++++++++++++++++++++++++++-
 arch/x86/kernel/apic/apic.c | 10 +++++
 3 files changed, 107 insertions(+), 1 deletion(-)

(limited to 'arch/x86/include')

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 35006e151774..bd8ae0a7010a 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -490,6 +490,11 @@ static inline unsigned int read_apic_id(void)
 return apic->get_apic_id(reg);
 }
 
+#ifdef CONFIG_X86_64
+typedef int (*wakeup_cpu_handler)(int apicid, unsigned long start_eip);
+extern void acpi_wake_cpu_handler_update(wakeup_cpu_handler handler);
+#endif
+
 extern int default_apic_id_valid(u32 apicid);
 extern int default_acpi_madt_oem_check(char *, char *);
 extern void default_setup_apic_routing(void);
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 0d01e7f5078c..6d2c50819501 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -65,6 +65,13 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
 static bool acpi_support_online_capable;
 #endif
 
+#ifdef CONFIG_X86_64
+/* Physical address of the Multiprocessor Wakeup Structure mailbox */
+static u64 acpi_mp_wake_mailbox_paddr;
+/* Virtual address of the Multiprocessor Wakeup Structure mailbox */
+static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox;
+#endif
+
 #ifdef CONFIG_X86_IO_APIC
 /*
 * Locks related to IOAPIC hotplug
@@ -336,7 +343,60 @@ acpi_parse_lapic_nmi(union acpi_subtable_headers * header, const unsigned long e
 return 0;
 }
 
-#endif /*CONFIG_X86_LOCAL_APIC */
+#ifdef CONFIG_X86_64
+static int acpi_wakeup_cpu(int apicid, unsigned long start_ip)
+{
+ /*
+ * Remap mailbox memory only for the first call to acpi_wakeup_cpu().
+ *
+ * Wakeup of secondary CPUs is fully serialized in the core code.
+ * No need to protect acpi_mp_wake_mailbox from concurrent accesses.
+ */
+ if (!acpi_mp_wake_mailbox) {
+ acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr,
+ sizeof(*acpi_mp_wake_mailbox),
+ MEMREMAP_WB);
+ }
+
+ /*
+ * Mailbox memory is shared between the firmware and OS.
Firmware will + * listen on mailbox command address, and once it receives the wakeup + * command, the CPU associated with the given apicid will be booted. + * + * The value of 'apic_id' and 'wakeup_vector' must be visible to the + * firmware before the wakeup command is visible. smp_store_release() + * ensures ordering and visibility. + */ + acpi_mp_wake_mailbox->apic_id = apicid; + acpi_mp_wake_mailbox->wakeup_vector = start_ip; + smp_store_release(&acpi_mp_wake_mailbox->command, + ACPI_MP_WAKE_COMMAND_WAKEUP); + + /* + * Wait for the CPU to wake up. + * + * The CPU being woken up is essentially in a spin loop waiting to be + * woken up. It should not take long for it wake up and acknowledge by + * zeroing out ->command. + * + * ACPI specification doesn't provide any guidance on how long kernel + * has to wait for a wake up acknowledgement. It also doesn't provide + * a way to cancel a wake up request if it takes too long. + * + * In TDX environment, the VMM has control over how long it takes to + * wake up secondary. It can postpone scheduling secondary vCPU + * indefinitely. Giving up on wake up request and reporting error opens + * possible attack vector for VMM: it can wake up a secondary CPU when + * kernel doesn't expect it. Wait until positive result of the wake up + * request. + */ + while (READ_ONCE(acpi_mp_wake_mailbox->command)) + cpu_relax(); + + return 0; +} +#endif /* CONFIG_X86_64 */ +#endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_IO_APIC #define MP_ISA_BUS 0 @@ -1083,6 +1143,29 @@ static int __init acpi_parse_madt_lapic_entries(void) } return 0; } + +#ifdef CONFIG_X86_64 +static int __init acpi_parse_mp_wake(union acpi_subtable_headers *header, + const unsigned long end) +{ + struct acpi_madt_multiproc_wakeup *mp_wake; + + if (!IS_ENABLED(CONFIG_SMP)) + return -ENODEV; + + mp_wake = (struct acpi_madt_multiproc_wakeup *)header; + if (BAD_MADT_ENTRY(mp_wake, end)) + return -EINVAL; + + acpi_table_print_madt_entry(&header->common); + + acpi_mp_wake_mailbox_paddr = mp_wake->base_address; + + acpi_wake_cpu_handler_update(acpi_wakeup_cpu); + + return 0; +} +#endif /* CONFIG_X86_64 */ #endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_IO_APIC @@ -1278,6 +1361,14 @@ static void __init acpi_process_madt(void) smp_found_config = 1; } + +#ifdef CONFIG_X86_64 + /* + * Parse MADT MP Wake entry. + */ + acpi_table_parse_madt(ACPI_MADT_TYPE_MULTIPROC_WAKEUP, + acpi_parse_mp_wake, 1); +#endif } if (error == -EINVAL) { /* diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b70344bf6600..3c8f2c797a98 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2551,6 +2551,16 @@ u32 x86_msi_msg_get_destid(struct msi_msg *msg, bool extid) } EXPORT_SYMBOL_GPL(x86_msi_msg_get_destid); +#ifdef CONFIG_X86_64 +void __init acpi_wake_cpu_handler_update(wakeup_cpu_handler handler) +{ + struct apic **drv; + + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) + (*drv)->wakeup_secondary_cpu_64 = handler; +} +#endif + /* * Override the generic EOI implementation with an optimized version. * Only called during early boot when only one CPU is active and with -- cgit v1.2.3-59-g8ed1b From 968b493173ac5205fe75f6330ee767f96bf88e57 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:36 +0300 Subject: x86/mm: Make DMA memory shared for TD guest Intel TDX doesn't allow VMM to directly access guest private memory. Any memory that is required for communication with the VMM must be shared explicitly. 
The same rule applies for any DMA to and from the TDX guest. All DMA pages have to be marked as shared pages. A generic way to achieve this without any changes to device drivers is to use the SWIOTLB framework. The previous patch ("Add support for TDX shared memory") gave TDX guests the _ability_ to make some pages shared, but did not make any pages shared. This actually marks SWIOTLB buffers *as* shared. Start returning true for cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) in TDX guests. This has several implications: - Allows the existing mem_encrypt_init() to be used for TDX which sets SWIOTLB buffers shared (aka. "decrypted"). - Ensures that all DMA is routed via the SWIOTLB mechanism (see pci_swiotlb_detect()) Stop selecting DYNAMIC_PHYSICAL_MASK directly. It will get set indirectly by selecting X86_MEM_ENCRYPT. mem_encrypt_init() is currently under an AMD-specific #ifdef. Move it to a generic area of the header. Co-developed-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kuppuswamy Sathyanarayanan Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Andi Kleen Reviewed-by: Tony Luck Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/20220405232939.73860-28-kirill.shutemov@linux.intel.com --- arch/x86/Kconfig | 2 +- arch/x86/coco/core.c | 1 + arch/x86/include/asm/mem_encrypt.h | 6 +++--- arch/x86/mm/mem_encrypt.c | 9 ++++++++- 4 files changed, 13 insertions(+), 5 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a18185525708..7021ec725dd3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -883,7 +883,7 @@ config INTEL_TDX_GUEST depends on X86_64 && CPU_SUP_INTEL depends on X86_X2APIC select ARCH_HAS_CC_PLATFORM - select DYNAMIC_PHYSICAL_MASK + select X86_MEM_ENCRYPT select X86_MCE help Support running as a guest under Intel TDX. Without this support, diff --git a/arch/x86/coco/core.c b/arch/x86/coco/core.c index 9f74125c582d..4320fadae716 100644 --- a/arch/x86/coco/core.c +++ b/arch/x86/coco/core.c @@ -22,6 +22,7 @@ static bool intel_cc_platform_has(enum cc_attr attr) case CC_ATTR_GUEST_UNROLL_STRING_IO: case CC_ATTR_HOTPLUG_DISABLED: case CC_ATTR_GUEST_MEM_ENCRYPT: + case CC_ATTR_MEM_ENCRYPT: return true; default: return false; diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index e2c6f433ed10..88ceaf3648b3 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -49,9 +49,6 @@ void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, void __init mem_encrypt_free_decrypted_mem(void); -/* Architecture __weak replacement functions */ -void __init mem_encrypt_init(void); - void __init sev_es_init_vc_handling(void); #define __bss_decrypted __section(".bss..decrypted") @@ -89,6 +86,9 @@ static inline void mem_encrypt_free_decrypted_mem(void) { } #endif /* CONFIG_AMD_MEM_ENCRYPT */ +/* Architecture __weak replacement functions */ +void __init mem_encrypt_init(void); + /* * The __sme_pa() and __sme_pa_nodebug() macros are meant for use when * writing to or comparing values from the cr3 register. 
Having the diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 50d209939c66..10ee40b5204b 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -42,7 +42,14 @@ bool force_dma_unencrypted(struct device *dev) static void print_mem_encrypt_feature_info(void) { - pr_info("AMD Memory Encryption Features active:"); + pr_info("Memory Encryption Features active:"); + + if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) { + pr_cont(" Intel TDX\n"); + return; + } + + pr_cont("AMD "); /* Secure Memory Encryption */ if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) { -- cgit v1.2.3-59-g8ed1b From e2efb6359e620521d1e13f69b2257de8ceaa9475 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 6 Apr 2022 02:29:38 +0300 Subject: ACPICA: Avoid cache flush inside virtual machines While running inside virtual machine, the kernel can bypass cache flushing. Changing sleep state in a virtual machine doesn't affect the host system sleep state and cannot lead to data loss. Before entering sleep states, the ACPI code flushes caches to prevent data loss using the WBINVD instruction. This mechanism is required on bare metal. But, any use WBINVD inside of a guest is worthless. Changing sleep state in a virtual machine doesn't affect the host system sleep state and cannot lead to data loss, so most hypervisors simply ignore it. Despite this, the ACPI code calls WBINVD unconditionally anyway. It's useless, but also normally harmless. In TDX guests, though, WBINVD stops being harmless; it triggers a virtualization exception (#VE). If the ACPI cache-flushing WBINVD were left in place, TDX guests would need handling to recover from the exception. Avoid using WBINVD whenever running under a hypervisor. This both removes the useless WBINVDs and saves TDX from implementing WBINVD handling. Signed-off-by: Kirill A. Shutemov Signed-off-by: Dave Hansen Reviewed-by: Dave Hansen Reviewed-by: Dan Williams Reviewed-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20220405232939.73860-30-kirill.shutemov@linux.intel.com --- arch/x86/include/asm/acenv.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/acenv.h b/arch/x86/include/asm/acenv.h index 9aff97f0de7f..d937c55e717e 100644 --- a/arch/x86/include/asm/acenv.h +++ b/arch/x86/include/asm/acenv.h @@ -13,7 +13,19 @@ /* Asm macros */ -#define ACPI_FLUSH_CPU_CACHE() wbinvd() +/* + * ACPI_FLUSH_CPU_CACHE() flushes caches on entering sleep states. + * It is required to prevent data loss. + * + * While running inside virtual machine, the kernel can bypass cache flushing. + * Changing sleep state in a virtual machine doesn't affect the host system + * sleep state and cannot lead to data loss. + */ +#define ACPI_FLUSH_CPU_CACHE() \ +do { \ + if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) \ + wbinvd(); \ +} while (0) int __acpi_acquire_global_lock(unsigned int *lock); int __acpi_release_global_lock(unsigned int *lock); -- cgit v1.2.3-59-g8ed1b From 4c5e242d3e937bb9f9c226d06888d9189826879d Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 3 Mar 2022 16:00:39 -0600 Subject: x86/PCI: Clip only host bridge windows for E820 regions ACPI firmware advertises PCI host bridge resources via PNP0A03 _CRS methods. Some BIOSes include non-window address space in _CRS, and if we allocate that non-window space for PCI devices, they don't work. 
4dc2287c1805 ("x86: avoid E820 regions when allocating address space") works around this issue by clipping out any regions mentioned in the E820 table in the allocate_resource() path, but the implementation has a couple issues: - The clipping is done for *all* allocations, not just those for PCI address space, and - The clipping is done at each allocation instead of being done once when setting up the host bridge windows. Rework the implementation so we only clip PCI host bridge windows, and we do it once when setting them up. Example output changes: BIOS-e820: [mem 0x00000000b0000000-0x00000000c00fffff] reserved + acpi PNP0A08:00: clipped [mem 0xc0000000-0xfebfffff window] to [mem 0xc0100000-0xfebfffff window] for e820 entry [mem 0xb0000000-0xc00fffff] - pci_bus 0000:00: root bus resource [mem 0xc0000000-0xfebfffff window] + pci_bus 0000:00: root bus resource [mem 0xc0100000-0xfebfffff window] Link: https://lore.kernel.org/r/20220304035110.988712-3-helgaas@kernel.org Signed-off-by: Bjorn Helgaas Reviewed-by: Hans de Goede Reviewed-by: Mika Westerberg Acked-by: Rafael J. Wysocki --- arch/x86/include/asm/e820/api.h | 5 +++++ arch/x86/kernel/resource.c | 14 +++++++------- arch/x86/pci/acpi.c | 5 +++++ 3 files changed, 17 insertions(+), 7 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/e820/api.h b/arch/x86/include/asm/e820/api.h index e8f58ddd06d9..5a39ed59b6db 100644 --- a/arch/x86/include/asm/e820/api.h +++ b/arch/x86/include/asm/e820/api.h @@ -4,6 +4,9 @@ #include +struct device; +struct resource; + extern struct e820_table *e820_table; extern struct e820_table *e820_table_kexec; extern struct e820_table *e820_table_firmware; @@ -43,6 +46,8 @@ extern void e820__register_nosave_regions(unsigned long limit_pfn); extern int e820__get_entry_type(u64 start, u64 end); +extern void remove_e820_regions(struct device *dev, struct resource *avail); + /* * Returns true iff the specified range [start,end) is completely contained inside * the ISA region. diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c index 30d524adb012..db2b350a37b7 100644 --- a/arch/x86/kernel/resource.c +++ b/arch/x86/kernel/resource.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include -#include #include static void resource_clip(struct resource *res, resource_size_t start, @@ -24,13 +24,16 @@ static void resource_clip(struct resource *res, resource_size_t start, res->start = end + 1; } -static void remove_e820_regions(struct resource *avail) +void remove_e820_regions(struct device *dev, struct resource *avail) { int i; struct e820_entry *entry; u64 e820_start, e820_end; struct resource orig = *avail; + if (!(avail->flags & IORESOURCE_MEM)) + return; + for (i = 0; i < e820_table->nr_entries; i++) { entry = &e820_table->entries[i]; e820_start = entry->addr; @@ -38,7 +41,7 @@ static void remove_e820_regions(struct resource *avail) resource_clip(avail, e820_start, e820_end); if (orig.start != avail->start || orig.end != avail->end) { - pr_info("clipped %pR to %pR for e820 entry [mem %#010Lx-%#010Lx]\n", + dev_info(dev, "clipped %pR to %pR for e820 entry [mem %#010Lx-%#010Lx]\n", &orig, avail, e820_start, e820_end); orig = *avail; } @@ -52,9 +55,6 @@ void arch_remove_reservations(struct resource *avail) * the low 1MB unconditionally, as this area is needed for some ISA * cards requiring a memory range, e.g. the i82365 PCMCIA controller. 
*/ - if (avail->flags & IORESOURCE_MEM) { + if (avail->flags & IORESOURCE_MEM) resource_clip(avail, BIOS_ROM_BASE, BIOS_ROM_END); - - remove_e820_regions(avail); - } } diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 052f1d78a562..562c81a51ea0 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -8,6 +8,7 @@ #include #include #include +#include struct pci_root_info { struct acpi_pci_root_info common; @@ -299,6 +300,10 @@ static int pci_acpi_root_prepare_resources(struct acpi_pci_root_info *ci) int status; status = acpi_pci_probe_root_resources(ci); + + resource_list_for_each_entry(entry, &ci->resources) + remove_e820_regions(&device->dev, entry->res); + if (pci_use_crs) { resource_list_for_each_entry_safe(entry, tmp, &ci->resources) if (resource_is_pcicfg_ioport(entry->res)) -- cgit v1.2.3-59-g8ed1b From b584db0c84db5ed9230356d5fa6610de55d297e6 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Thu, 31 Mar 2022 08:11:05 +0100 Subject: x86/PCI: Add $IRT PIRQ routing table support Handle the $IRT PCI IRQ Routing Table format used by AMI for its BCP (BIOS Configuration Program) external tool meant for tweaking BIOS structures without the need to rebuild it from sources[1]. The $IRT format has been invented by AMI before Microsoft has come up with its $PIR format and a $IRT table is therefore there in some systems that lack a $PIR table, such as the DataExpert EXP8449 mainboard based on the ALi FinALi 486 chipset (M1489/M1487), which predates DMI 2.0 and cannot therefore be easily identified at run time. Unlike with the $PIR format there is no alignment guarantee as to the placement of the $IRT table, so scan the whole BIOS area bytewise. Credit to Michal Necasek for helping me chase documentation for the format. References: [1] "What is BCP? - AMI", Signed-off-by: Maciej W. 
Rozycki Signed-off-by: Thomas Gleixner Tested-by: Dmitry Osipenko # crosvm Link: https://lore.kernel.org/r/alpine.DEB.2.21.2203302228410.9038@angie.orcam.me.uk --- arch/x86/include/asm/pci_x86.h | 9 +++++ arch/x86/pci/irq.c | 76 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index a0627dfae541..1307cd689d2a 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -93,6 +93,15 @@ struct irq_routing_table { struct irq_info slots[]; } __attribute__((packed)); +struct irt_routing_table { + u32 signature; /* IRT_SIGNATURE should be here */ + u8 size; /* Number of entries provided */ + u8 used; /* Number of entries actually used */ + u16 exclusive_irqs; /* IRQs devoted exclusively to + PCI usage */ + struct irq_info slots[]; +} __attribute__((packed)); + extern unsigned int pcibios_irq_mask; extern raw_spinlock_t pci_config_lock; diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index d4ecf881c0c8..4a5e80f8fb01 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -25,6 +25,8 @@ #define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24)) #define PIRQ_VERSION 0x0100 +#define IRT_SIGNATURE (('$' << 0) + ('I' << 8) + ('R' << 16) + ('T' << 24)) + static int broken_hp_bios_irq9; static int acer_tm360_irqrouting; @@ -93,7 +95,74 @@ static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr, return NULL; } +/* + * Handle the $IRT PCI IRQ Routing Table format used by AMI for its BCP + * (BIOS Configuration Program) external tool meant for tweaking BIOS + * structures without the need to rebuild it from sources. The $IRT + * format has been invented by AMI before Microsoft has come up with its + * $PIR format and a $IRT table is therefore there in some systems that + * lack a $PIR table. + * + * It uses the same PCI BIOS 2.1 format for interrupt routing entries + * themselves but has a different simpler header prepended instead, + * occupying 8 bytes, where a `$IRT' signature is followed by one byte + * specifying the total number of interrupt routing entries allocated in + * the table, then one byte specifying the actual number of entries used + * (which the BCP tool can take advantage of when modifying the table), + * and finally a 16-bit word giving the IRQs devoted exclusively to PCI. + * Unlike with the $PIR table there is no alignment guarantee. + * + * Given the similarity of the two formats the $IRT one is trivial to + * convert to the $PIR one, which we do here, except that obviously we + * have no information as to the router device to use, but we can handle + * it by matching PCI device IDs actually seen on the bus against ones + * that our individual routers recognise. + * + * Reportedly there is another $IRT table format where a 16-bit word + * follows the header instead that points to interrupt routing entries + * in a $PIR table provided elsewhere. In that case this code will not + * be reached though as the $PIR table will have been chosen instead. 
+ */ +static inline struct irq_routing_table *pirq_convert_irt_table(u8 *addr, + u8 *limit) +{ + struct irt_routing_table *ir; + struct irq_routing_table *rt; + u16 size; + u8 sum; + int i; + + ir = (struct irt_routing_table *)addr; + if (ir->signature != IRT_SIGNATURE || !ir->used || ir->size < ir->used) + return NULL; + + size = sizeof(*ir) + ir->used * sizeof(ir->slots[0]); + if (size > limit - addr) + return NULL; + + DBG(KERN_DEBUG "PCI: $IRT Interrupt Routing Table found at 0x%lx\n", + __pa(ir)); + + size = sizeof(*rt) + ir->used * sizeof(rt->slots[0]); + rt = kzalloc(size, GFP_KERNEL); + if (!rt) + return NULL; + rt->signature = PIRQ_SIGNATURE; + rt->version = PIRQ_VERSION; + rt->size = size; + rt->exclusive_irqs = ir->exclusive_irqs; + for (i = 0; i < ir->used; i++) + rt->slots[i] = ir->slots[i]; + + addr = (u8 *)rt; + sum = 0; + for (i = 0; i < size; i++) + sum += addr[i]; + rt->checksum = -sum; + + return rt; +} /* * Search 0xf0000 -- 0xfffff for the PCI IRQ Routing Table. @@ -120,6 +189,13 @@ static struct irq_routing_table * __init pirq_find_routing_table(void) if (rt) return rt; } + for (addr = bios_start; + addr < bios_end - sizeof(struct irt_routing_table); + addr++) { + rt = pirq_convert_irt_table(addr, bios_end); + if (rt) + return rt; + } return NULL; } -- cgit v1.2.3-59-g8ed1b From f5d9283ecb33329073033029fe427155aa0abfb1 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 25 Mar 2022 11:39:50 -0400 Subject: x86/32: Simplify ELF_CORE_COPY_REGS GS is now always a user segment, so there is no difference between user and kernel registers. Signed-off-by: Brian Gerst Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20220325153953.162643-2-brgerst@gmail.com --- arch/x86/include/asm/elf.h | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 29fea180a665..cb0ff1055ab1 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -116,7 +116,7 @@ extern unsigned int vdso32_enabled; * now struct_user_regs, they are different) */ -#define ELF_CORE_COPY_REGS_COMMON(pr_reg, regs) \ +#define ELF_CORE_COPY_REGS(pr_reg, regs) \ do { \ pr_reg[0] = regs->bx; \ pr_reg[1] = regs->cx; \ @@ -128,6 +128,7 @@ do { \ pr_reg[7] = regs->ds; \ pr_reg[8] = regs->es; \ pr_reg[9] = regs->fs; \ + savesegment(gs, pr_reg[10]); \ pr_reg[11] = regs->orig_ax; \ pr_reg[12] = regs->ip; \ pr_reg[13] = regs->cs; \ @@ -136,18 +137,6 @@ do { \ pr_reg[16] = regs->ss; \ } while (0); -#define ELF_CORE_COPY_REGS(pr_reg, regs) \ -do { \ - ELF_CORE_COPY_REGS_COMMON(pr_reg, regs);\ - pr_reg[10] = get_user_gs(regs); \ -} while (0); - -#define ELF_CORE_COPY_KERNEL_REGS(pr_reg, regs) \ -do { \ - ELF_CORE_COPY_REGS_COMMON(pr_reg, regs);\ - savesegment(gs, pr_reg[10]); \ -} while (0); - #define ELF_PLATFORM (utsname()->machine) #define set_personality_64bit() do { } while (0) -- cgit v1.2.3-59-g8ed1b From daf3af4705ba8f49d33ea9b7bafdc9fd9efd49e0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 12 Apr 2022 22:34:21 +0200 Subject: x86/apic: Clarify i82489DX bit overlap in APIC_LVT0 Daniel stumbled over the bit overlap of the i82498DX external APIC and the TSC deadline timer configuration bit in modern APICs, which is neither documented in the code nor in the current SDM. Maciej provided links to the original i82489DX/486 documentation. See Link. 
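For reference, the two overlapping encodings in sketch form (values as defined in apicdef.h and in this patch; illustration only):

	#define APIC_LVT_TIMER_TSCDEADLINE	(2 << 17)	/* timer mode field, bits 17-18; value 2 sets bit 18 */
	#define I82489DX_BASE_DIVIDER		(0x2 << 18)	/* i82489DX timer base field, bits 18-19 */

The i82489DX base field covers bit 18, which is also the bit the TSC-deadline mode value sets on integrated APICs - the overlap the new comment spells out.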
Remove the i82489DX macro maze, use a i82489DX specific define in the apic code and document the overlap in a comment. Reported-by: Daniel Vacek Signed-off-by: Thomas Gleixner Cc: Maciej W. Rozycki Link: https://lore.kernel.org/r/87ee22f3ci.ffs@tglx --- arch/x86/include/asm/apicdef.h | 6 ------ arch/x86/kernel/apic/apic.c | 11 ++++++++++- 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 5716f22f81ac..92035eb3afee 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -95,12 +95,6 @@ #define APIC_LVTTHMR 0x330 #define APIC_LVTPC 0x340 #define APIC_LVT0 0x350 -#define APIC_LVT_TIMER_BASE_MASK (0x3 << 18) -#define GET_APIC_TIMER_BASE(x) (((x) >> 18) & 0x3) -#define SET_APIC_TIMER_BASE(x) (((x) << 18)) -#define APIC_TIMER_BASE_CLKIN 0x0 -#define APIC_TIMER_BASE_TMBASE 0x1 -#define APIC_TIMER_BASE_DIV 0x2 #define APIC_LVT_TIMER_ONESHOT (0 << 17) #define APIC_LVT_TIMER_PERIODIC (1 << 17) #define APIC_LVT_TIMER_TSCDEADLINE (2 << 17) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b70344bf6600..13819bfa8dde 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -320,6 +320,9 @@ int lapic_get_maxlvt(void) #define APIC_DIVISOR 16 #define TSC_DIVISOR 8 +/* i82489DX specific */ +#define I82489DX_BASE_DIVIDER (((0x2) << 18)) + /* * This function sets up the local APIC timer, with a timeout of * 'clocks' APIC bus clock. During calibration we actually call @@ -340,8 +343,14 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) else if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) lvtt_value |= APIC_LVT_TIMER_TSCDEADLINE; + /* + * The i82489DX APIC uses bit 18 and 19 for the base divider. This + * overlaps with bit 18 on integrated APICs, but is not documented + * in the SDM. No problem though. i82489DX equipped systems do not + * have TSC deadline timer. + */ if (!lapic_is_integrated()) - lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); + lvtt_value |= I82489DX_BASE_DIVIDER; if (!irqen) lvtt_value |= APIC_LVT_MASKED; -- cgit v1.2.3-59-g8ed1b From 4e140f59d285c1ca1e5c81b4c13e27366865bd09 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 10 Jan 2022 23:15:27 +0000 Subject: mm/usercopy: Check kmap addresses properly If you are copying to an address in the kmap region, you may not copy across a page boundary, no matter what the size of the underlying allocation. You can't kmap() a slab page because slab pages always come from low memory. 
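In sketch form, the rule reduces to a single-page bounds check (standalone illustration assuming PAGE_SIZE is a power of two; the real check is added to check_heap_object() below):

	/* Does an n-byte copy starting at ptr stay within one page? */
	static bool kmap_copy_fits(const void *ptr, unsigned long n)
	{
		unsigned long addr = (unsigned long)ptr;
		unsigned long page_end = addr | (PAGE_SIZE - 1);	/* last byte of this page */

		return addr + n - 1 <= page_end;
	}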
Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Kees Cook Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220110231530.665970-2-willy@infradead.org --- arch/x86/include/asm/highmem.h | 1 + include/linux/highmem-internal.h | 10 ++++++++++ mm/usercopy.c | 16 ++++++++++------ 3 files changed, 21 insertions(+), 6 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 032e020853aa..731ee7cc40a5 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -26,6 +26,7 @@ #include #include #include +#include /* declarations for highmem.c */ extern unsigned long highstart_pfn, highend_pfn; diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index a77be5630209..337bd9f32921 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -149,6 +149,11 @@ static inline void totalhigh_pages_add(long count) atomic_long_add(count, &_totalhigh_pages); } +static inline bool is_kmap_addr(const void *x) +{ + unsigned long addr = (unsigned long)x; + return addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP); +} #else /* CONFIG_HIGHMEM */ static inline struct page *kmap_to_page(void *addr) @@ -234,6 +239,11 @@ static inline void __kunmap_atomic(void *addr) static inline unsigned int nr_free_highpages(void) { return 0; } static inline unsigned long totalhigh_pages(void) { return 0UL; } +static inline bool is_kmap_addr(const void *x) +{ + return false; +} + #endif /* CONFIG_HIGHMEM */ /* diff --git a/mm/usercopy.c b/mm/usercopy.c index 2c235d5c2364..ff13e7708faa 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -229,12 +229,16 @@ static inline void check_heap_object(const void *ptr, unsigned long n, if (!virt_addr_valid(ptr)) return; - /* - * When CONFIG_HIGHMEM=y, kmap_to_page() will give either the - * highmem page or fallback to virt_to_page(). The following - * is effectively a highmem-aware virt_to_slab(). - */ - folio = page_folio(kmap_to_page((void *)ptr)); + if (is_kmap_addr(ptr)) { + unsigned long page_end = (unsigned long)ptr | (PAGE_SIZE - 1); + + if ((unsigned long)ptr + n - 1 > page_end) + usercopy_abort("kmap", NULL, to_user, + offset_in_page(ptr), n); + return; + } + + folio = virt_to_folio(ptr); if (folio_test_slab(folio)) { /* Check slab allocator for flags and size. */ -- cgit v1.2.3-59-g8ed1b From 3a24a60854d2ef19e0edcd11cdbbb4fabc655dde Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 25 Mar 2022 11:39:52 -0400 Subject: x86/32: Remove lazy GS macros GS is always a user segment now. 
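The conversion pattern, sketched with the plain segment helpers that remain (illustration only; each call site below follows one of these two forms):

	unsigned short sel;

	savesegment(gs, sel);	/* replaces get_user_gs(regs) / lazy_save_gs(v) */
	loadsegment(gs, sel);	/* replaces set_user_gs(regs, v) / lazy_load_gs(v) */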
Signed-off-by: Brian Gerst Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20220325153953.162643-4-brgerst@gmail.com --- arch/x86/include/asm/mmu_context.h | 2 +- arch/x86/include/asm/segment.h | 5 ----- arch/x86/kernel/process.c | 5 +---- arch/x86/kernel/process_32.c | 11 ++++------- arch/x86/kernel/ptrace.c | 6 +++--- arch/x86/kernel/signal.c | 8 +++++--- arch/x86/kernel/vm86_32.c | 4 ++-- arch/x86/lib/insn-eval.c | 5 +++-- arch/x86/math-emu/get_address.c | 2 +- 9 files changed, 20 insertions(+), 28 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 27516046117a..b8d40ddeab00 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -141,7 +141,7 @@ do { \ #ifdef CONFIG_X86_32 #define deactivate_mm(tsk, mm) \ do { \ - lazy_load_gs(0); \ + loadsegment(gs, 0); \ } while (0) #else #define deactivate_mm(tsk, mm) \ diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 656ed6531d03..617b3663e4dd 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -354,11 +354,6 @@ static inline void __loadsegment_fs(unsigned short value) * x86-32 user GS accessors. This is ugly and could do with some cleaning up. */ #ifdef CONFIG_X86_32 -# define get_user_gs(regs) (u16)({ unsigned long v; savesegment(gs, v); v; }) -# define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) -# define task_user_gs(tsk) ((tsk)->thread.gs) -# define lazy_save_gs(v) savesegment(gs, (v)) -# define lazy_load_gs(v) loadsegment(gs, (v)) # define load_gs_index(v) loadsegment(gs, (v)) #endif /* X86_32 */ diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index b370767f5b19..96a9885b9c3a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -160,6 +160,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, savesegment(ds, p->thread.ds); #else p->thread.sp0 = (unsigned long) (childregs + 1); + savesegment(gs, p->thread.gs); /* * Clear all status flags including IF and set fixed bit. 64bit * does not have this initialization as the frame does not contain @@ -191,10 +192,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, if (sp) childregs->sp = sp; -#ifdef CONFIG_X86_32 - task_user_gs(p) = get_user_gs(current_pt_regs()); -#endif - if (unlikely(p->flags & PF_IO_WORKER)) { /* * An IO thread is a user space thread, but it doesn't diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 26edb1cd07a4..877358f3dba7 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -63,10 +63,7 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, unsigned long d0, d1, d2, d3, d6, d7; unsigned short gs; - if (user_mode(regs)) - gs = get_user_gs(regs); - else - savesegment(gs, gs); + savesegment(gs, gs); show_ip(regs, log_lvl); @@ -114,7 +111,7 @@ void release_thread(struct task_struct *dead_task) void start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) { - set_user_gs(regs, 0); + loadsegment(gs, 0); regs->fs = 0; regs->ds = __USER_DS; regs->es = __USER_DS; @@ -177,7 +174,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * used %fs or %gs (it does not today), or if the kernel is * running inside of a hypervisor layer. 
*/ - lazy_save_gs(prev->gs); + savesegment(gs, prev->gs); /* * Load the per-thread Thread-Local Storage descriptor. @@ -208,7 +205,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * Restore %gs if needed (which is common) */ if (prev->gs | next->gs) - lazy_load_gs(next->gs); + loadsegment(gs, next->gs); this_cpu_write(current_task, next_p); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 98d10ef60571..37c12fb92906 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -170,9 +170,9 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset) retval = *pt_regs_access(task_pt_regs(task), offset); else { if (task == current) - retval = get_user_gs(task_pt_regs(task)); + savesegment(gs, retval); else - retval = task_user_gs(task); + retval = task->thread.gs; } return retval; } @@ -210,7 +210,7 @@ static int set_segment_reg(struct task_struct *task, break; case offsetof(struct user_regs_struct, gs): - task_user_gs(task) = value; + task->thread.gs = value; } return 0; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index e439eb14325f..9c7265b524c7 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -93,7 +93,7 @@ static bool restore_sigcontext(struct pt_regs *regs, return false; #ifdef CONFIG_X86_32 - set_user_gs(regs, sc.gs); + loadsegment(gs, sc.gs); regs->fs = sc.fs; regs->es = sc.es; regs->ds = sc.ds; @@ -146,8 +146,10 @@ __unsafe_setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned long mask) { #ifdef CONFIG_X86_32 - unsafe_put_user(get_user_gs(regs), - (unsigned int __user *)&sc->gs, Efault); + unsigned int gs; + savesegment(gs, gs); + + unsafe_put_user(gs, (unsigned int __user *)&sc->gs, Efault); unsafe_put_user(regs->fs, (unsigned int __user *)&sc->fs, Efault); unsafe_put_user(regs->es, (unsigned int __user *)&sc->es, Efault); unsafe_put_user(regs->ds, (unsigned int __user *)&sc->ds, Efault); diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index c21bcd668284..e9e803a4d44c 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -151,7 +151,7 @@ exit_vm86: memcpy(®s->pt, &vm86->regs32, sizeof(struct pt_regs)); - lazy_load_gs(vm86->regs32.gs); + loadsegment(gs, vm86->regs32.gs); regs->pt.ax = retval; return; @@ -325,7 +325,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) * Save old state */ vm86->saved_sp0 = tsk->thread.sp0; - lazy_save_gs(vm86->regs32.gs); + savesegment(gs, vm86->regs32.gs); /* make room for real-mode segments */ preempt_disable(); diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index b781d324211b..21104c41cba0 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -342,9 +342,9 @@ static int resolve_seg_reg(struct insn *insn, struct pt_regs *regs, int regoff) */ static short get_segment_selector(struct pt_regs *regs, int seg_reg_idx) { -#ifdef CONFIG_X86_64 unsigned short sel; +#ifdef CONFIG_X86_64 switch (seg_reg_idx) { case INAT_SEG_REG_IGNORE: return 0; @@ -402,7 +402,8 @@ static short get_segment_selector(struct pt_regs *regs, int seg_reg_idx) case INAT_SEG_REG_FS: return (unsigned short)(regs->fs & 0xffff); case INAT_SEG_REG_GS: - return get_user_gs(regs); + savesegment(gs, sel); + return sel; case INAT_SEG_REG_IGNORE: default: return -EINVAL; diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c index b82ca14ba718..4a9fd9029a53 100644 --- a/arch/x86/math-emu/get_address.c +++ 
b/arch/x86/math-emu/get_address.c @@ -153,7 +153,7 @@ static long pm_address(u_char FPU_modrm, u_char segment, switch (segment) { case PREFIX_GS_ - 1: /* user gs handling can be lazy, use special accessors */ - addr->selector = get_user_gs(FPU_info->regs); + savesegment(gs, addr->selector); break; default: addr->selector = PM_REG_(segment); -- cgit v1.2.3-59-g8ed1b From 203d8919a9eda5d1bc68ac3cd7637588334c9dc1 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 25 Mar 2022 11:39:53 -0400 Subject: x86/asm: Merge load_gs_index() Merge the 32- and 64-bit implementations of load_gs_index(). Signed-off-by: Brian Gerst Signed-off-by: Borislav Petkov Reviewed-by: Thomas Gleixner Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20220325153953.162643-5-brgerst@gmail.com --- arch/x86/include/asm/segment.h | 7 ------- arch/x86/include/asm/special_insns.h | 7 ++++--- 2 files changed, 4 insertions(+), 10 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 617b3663e4dd..2e7890dd58a4 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -350,13 +350,6 @@ static inline void __loadsegment_fs(unsigned short value) #define savesegment(seg, value) \ asm("mov %%" #seg ",%0":"=r" (value) : : "memory") -/* - * x86-32 user GS accessors. This is ugly and could do with some cleaning up. - */ -#ifdef CONFIG_X86_32 -# define load_gs_index(v) loadsegment(gs, (v)) -#endif /* X86_32 */ - #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 68c257a3de0d..45b18eb94fa1 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -184,14 +184,15 @@ static inline void wbinvd(void) native_wbinvd(); } -#ifdef CONFIG_X86_64 static inline void load_gs_index(unsigned int selector) { +#ifdef CONFIG_X86_64 native_load_gs_index(selector); -} - +#else + loadsegment(gs, selector); #endif +} #endif /* CONFIG_PARAVIRT_XXL */ -- cgit v1.2.3-59-g8ed1b From f52ba93190457aa285ae805a3e8360a50552cfd8 Mon Sep 17 00:00:00 2001 From: Sumeet Pawnikar Date: Fri, 20 Aug 2021 17:42:43 +0530 Subject: tools/power turbostat: Add Power Limit4 support Add Power Limit4 support. Signed-off-by: Sumeet Pawnikar Acked-by: Zhang Rui Signed-off-by: Len Brown --- arch/x86/include/asm/msr-index.h | 1 + tools/power/x86/turbostat/turbostat.c | 9 +++++++++ 2 files changed, 10 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index a4a39c3e0f19..b890840e0059 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -310,6 +310,7 @@ /* Run Time Average Power Limiting (RAPL) Interface */ +#define MSR_VR_CURRENT_CONFIG 0x00000601 #define MSR_RAPL_POWER_UNIT 0x00000606 #define MSR_PKG_POWER_LIMIT 0x00000610 diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 1ba444d9b68a..993af623ae90 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4773,6 +4773,15 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) ((msr >> 32) & 0x7FFF) * rapl_power_units, (1.0 + (((msr >> 54) & 0x3) / 4.0)) * (1 << ((msr >> 49) & 0x1F)) * rapl_time_units, ((msr >> 48) & 1) ? 
"EN" : "DIS"); + + if (get_msr(cpu, MSR_VR_CURRENT_CONFIG, &msr)) + return -9; + + fprintf(outf, "cpu%d: MSR_VR_CURRENT_CONFIG: 0x%08llx\n", cpu, msr); + fprintf(outf, "cpu%d: PKG Limit #4: %f Watts (%slocked)\n", + cpu, + ((msr >> 0) & 0x1FFF) * rapl_power_units, + (msr >> 31) & 1 ? "" : "UN"); } if (do_rapl & RAPL_DRAM_POWER_INFO) { -- cgit v1.2.3-59-g8ed1b From 78013eaadf696d2105982abb4018fbae394ca08f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Feb 2022 14:11:44 +0100 Subject: x86: remove the IOMMU table infrastructure The IOMMU table tries to separate the different IOMMUs into different backends, but actually requires various cross calls. Rewrite the code to do the generic swiotlb/swiotlb-xen setup directly in pci-dma.c and then just call into the IOMMU drivers. Signed-off-by: Christoph Hellwig Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Boris Ostrovsky --- arch/ia64/include/asm/iommu_table.h | 7 --- arch/x86/include/asm/dma-mapping.h | 1 - arch/x86/include/asm/gart.h | 5 +- arch/x86/include/asm/iommu.h | 6 ++ arch/x86/include/asm/iommu_table.h | 102 ------------------------------- arch/x86/include/asm/swiotlb.h | 30 --------- arch/x86/include/asm/xen/swiotlb-xen.h | 2 - arch/x86/kernel/Makefile | 2 - arch/x86/kernel/amd_gart_64.c | 5 +- arch/x86/kernel/aperture_64.c | 14 ++--- arch/x86/kernel/pci-dma.c | 107 ++++++++++++++++++++++++++------- arch/x86/kernel/pci-iommu_table.c | 77 ------------------------ arch/x86/kernel/pci-swiotlb.c | 77 ------------------------ arch/x86/kernel/tboot.c | 1 - arch/x86/kernel/vmlinux.lds.S | 12 ---- arch/x86/xen/Makefile | 2 - arch/x86/xen/pci-swiotlb-xen.c | 96 ----------------------------- drivers/iommu/amd/init.c | 6 -- drivers/iommu/amd/iommu.c | 5 +- drivers/iommu/intel/dmar.c | 6 +- include/linux/dmar.h | 6 +- 21 files changed, 110 insertions(+), 459 deletions(-) delete mode 100644 arch/ia64/include/asm/iommu_table.h delete mode 100644 arch/x86/include/asm/iommu_table.h delete mode 100644 arch/x86/include/asm/swiotlb.h delete mode 100644 arch/x86/kernel/pci-iommu_table.c delete mode 100644 arch/x86/kernel/pci-swiotlb.c delete mode 100644 arch/x86/xen/pci-swiotlb-xen.c (limited to 'arch/x86/include') diff --git a/arch/ia64/include/asm/iommu_table.h b/arch/ia64/include/asm/iommu_table.h deleted file mode 100644 index cc96116ac276..000000000000 --- a/arch/ia64/include/asm/iommu_table.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_IA64_IOMMU_TABLE_H -#define _ASM_IA64_IOMMU_TABLE_H - -#define IOMMU_INIT_POST(_detect) - -#endif /* _ASM_IA64_IOMMU_TABLE_H */ diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index bb1654fe0ce7..256fd8115223 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -9,7 +9,6 @@ #include #include -#include extern int iommu_merge; extern int panic_on_overflow; diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h index 318556574345..5af8088a10df 100644 --- a/arch/x86/include/asm/gart.h +++ b/arch/x86/include/asm/gart.h @@ -38,7 +38,7 @@ extern int gart_iommu_aperture_disabled; extern void early_gart_iommu_check(void); extern int gart_iommu_init(void); extern void __init gart_parse_options(char *); -extern int gart_iommu_hole_init(void); +void gart_iommu_hole_init(void); #else #define gart_iommu_aperture 0 @@ -51,9 +51,8 @@ static inline void early_gart_iommu_check(void) static inline void gart_parse_options(char *options) { } -static inline int gart_iommu_hole_init(void) +static 
inline void gart_iommu_hole_init(void) { - return -ENODEV; } #endif diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index bf1ed2ddc74b..dba89ed40d38 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -9,6 +9,12 @@ extern int force_iommu, no_iommu; extern int iommu_detected; +#ifdef CONFIG_SWIOTLB +extern bool x86_swiotlb_enable; +#else +#define x86_swiotlb_enable false +#endif + /* 10 seconds */ #define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000) diff --git a/arch/x86/include/asm/iommu_table.h b/arch/x86/include/asm/iommu_table.h deleted file mode 100644 index 1fb3fd1a83c2..000000000000 --- a/arch/x86/include/asm/iommu_table.h +++ /dev/null @@ -1,102 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_IOMMU_TABLE_H -#define _ASM_X86_IOMMU_TABLE_H - -#include - -/* - * History lesson: - * The execution chain of IOMMUs in 2.6.36 looks as so: - * - * [xen-swiotlb] - * | - * +----[swiotlb *]--+ - * / | \ - * / | \ - * [GART] [Calgary] [Intel VT-d] - * / - * / - * [AMD-Vi] - * - * *: if SWIOTLB detected 'iommu=soft'/'swiotlb=force' it would skip - * over the rest of IOMMUs and unconditionally initialize the SWIOTLB. - * Also it would surreptitiously initialize set the swiotlb=1 if there were - * more than 4GB and if the user did not pass in 'iommu=off'. The swiotlb - * flag would be turned off by all IOMMUs except the Calgary one. - * - * The IOMMU_INIT* macros allow a similar tree (or more complex if desired) - * to be built by defining who we depend on. - * - * And all that needs to be done is to use one of the macros in the IOMMU - * and the pci-dma.c will take care of the rest. - */ - -struct iommu_table_entry { - initcall_t detect; - initcall_t depend; - void (*early_init)(void); /* No memory allocate available. */ - void (*late_init)(void); /* Yes, can allocate memory. */ -#define IOMMU_FINISH_IF_DETECTED (1<<0) -#define IOMMU_DETECTED (1<<1) - int flags; -}; -/* - * Macro fills out an entry in the .iommu_table that is equivalent - * to the fields that 'struct iommu_table_entry' has. The entries - * that are put in the .iommu_table section are not put in any order - * hence during boot-time we will have to resort them based on - * dependency. */ - - -#define __IOMMU_INIT(_detect, _depend, _early_init, _late_init, _finish)\ - static const struct iommu_table_entry \ - __iommu_entry_##_detect __used \ - __attribute__ ((unused, __section__(".iommu_table"), \ - aligned((sizeof(void *))))) \ - = {_detect, _depend, _early_init, _late_init, \ - _finish ? IOMMU_FINISH_IF_DETECTED : 0} -/* - * The simplest IOMMU definition. Provide the detection routine - * and it will be run after the SWIOTLB and the other IOMMUs - * that utilize this macro. If the IOMMU is detected (ie, the - * detect routine returns a positive value), the other IOMMUs - * are also checked. You can use IOMMU_INIT_POST_FINISH if you prefer - * to stop detecting the other IOMMUs after yours has been detected. - */ -#define IOMMU_INIT_POST(_detect) \ - __IOMMU_INIT(_detect, pci_swiotlb_detect_4gb, NULL, NULL, 0) - -#define IOMMU_INIT_POST_FINISH(detect) \ - __IOMMU_INIT(_detect, pci_swiotlb_detect_4gb, NULL, NULL, 1) - -/* - * A more sophisticated version of IOMMU_INIT. This variant requires: - * a). A detection routine function. - * b). The name of the detection routine we depend on to get called - * before us. - * c). The init routine which gets called if the detection routine - * returns a positive value from the pci_iommu_alloc. 
This means - * no presence of a memory allocator. - * d). Similar to the 'init', except that this gets called from pci_iommu_init - * where we do have a memory allocator. - * - * The standard IOMMU_INIT differs from the IOMMU_INIT_FINISH variant - * in that the former will continue detecting other IOMMUs in the call - * list after the detection routine returns a positive number, while the - * latter will stop the execution chain upon first successful detection. - * Both variants will still call the 'init' and 'late_init' functions if - * they are set. - */ -#define IOMMU_INIT_FINISH(_detect, _depend, _init, _late_init) \ - __IOMMU_INIT(_detect, _depend, _init, _late_init, 1) - -#define IOMMU_INIT(_detect, _depend, _init, _late_init) \ - __IOMMU_INIT(_detect, _depend, _init, _late_init, 0) - -void sort_iommu_table(struct iommu_table_entry *start, - struct iommu_table_entry *finish); - -void check_iommu_entries(struct iommu_table_entry *start, - struct iommu_table_entry *finish); - -#endif /* _ASM_X86_IOMMU_TABLE_H */ diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h deleted file mode 100644 index ff6c92eff035..000000000000 --- a/arch/x86/include/asm/swiotlb.h +++ /dev/null @@ -1,30 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_SWIOTLB_H -#define _ASM_X86_SWIOTLB_H - -#include - -#ifdef CONFIG_SWIOTLB -extern int swiotlb; -extern int __init pci_swiotlb_detect_override(void); -extern int __init pci_swiotlb_detect_4gb(void); -extern void __init pci_swiotlb_init(void); -extern void __init pci_swiotlb_late_init(void); -#else -#define swiotlb 0 -static inline int pci_swiotlb_detect_override(void) -{ - return 0; -} -static inline int pci_swiotlb_detect_4gb(void) -{ - return 0; -} -static inline void pci_swiotlb_init(void) -{ -} -static inline void pci_swiotlb_late_init(void) -{ -} -#endif -#endif /* _ASM_X86_SWIOTLB_H */ diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h b/arch/x86/include/asm/xen/swiotlb-xen.h index 66b4ddde7743..e5a90b42e4dd 100644 --- a/arch/x86/include/asm/xen/swiotlb-xen.h +++ b/arch/x86/include/asm/xen/swiotlb-xen.h @@ -3,10 +3,8 @@ #define _ASM_X86_SWIOTLB_XEN_H #ifdef CONFIG_SWIOTLB_XEN -extern int __init pci_xen_swiotlb_detect(void); extern int pci_xen_swiotlb_init_late(void); #else -#define pci_xen_swiotlb_detect NULL static inline int pci_xen_swiotlb_init_late(void) { return -ENXIO; } #endif diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index c41ef42adbe8..e17b7e92a3fa 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -68,7 +68,6 @@ obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o topology.o kdebugfs.o obj-y += alternative.o i8253.o hw_breakpoint.o obj-y += tsc.o tsc_msr.o io_delay.o rtc.o -obj-y += pci-iommu_table.o obj-y += resource.o obj-y += irqflags.o obj-y += static_call.o @@ -134,7 +133,6 @@ obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o -obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o obj-$(CONFIG_OF) += devicetree.o obj-$(CONFIG_UPROBES) += uprobes.o diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index ed837383de5c..194d54eed537 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -38,11 +38,9 @@ #include #include #include -#include #include #include #include -#include static unsigned long iommu_bus_base; /* GART remapping area (physical) */ static unsigned long iommu_size; /* size of remapping area bytes */ @@ -808,7 +806,7 @@ int __init gart_iommu_init(void) 
flush_gart(); dma_ops = &gart_dma_ops; x86_platform.iommu_shutdown = gart_iommu_shutdown; - swiotlb = 0; + x86_swiotlb_enable = false; return 0; } @@ -842,4 +840,3 @@ void __init gart_parse_options(char *p) } } } -IOMMU_INIT_POST(gart_iommu_hole_init); diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index af3ba08b684b..7a5630d904b2 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -392,7 +392,7 @@ void __init early_gart_iommu_check(void) static int __initdata printed_gart_size_msg; -int __init gart_iommu_hole_init(void) +void __init gart_iommu_hole_init(void) { u32 agp_aper_base = 0, agp_aper_order = 0; u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; @@ -401,11 +401,11 @@ int __init gart_iommu_hole_init(void) int i, node; if (!amd_gart_present()) - return -ENODEV; + return; if (gart_iommu_aperture_disabled || !fix_aperture || !early_pci_allowed()) - return -ENODEV; + return; pr_info("Checking aperture...\n"); @@ -491,10 +491,8 @@ out: * and fixed up the northbridge */ exclude_from_core(last_aper_base, last_aper_order); - - return 1; } - return 0; + return; } if (!fallback_aper_force) { @@ -527,7 +525,7 @@ out: panic("Not enough memory for aperture"); } } else { - return 0; + return; } /* @@ -561,6 +559,4 @@ out: } set_up_gart_resume(aper_order, aper_alloc); - - return 1; } diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index de234e7a8962..df96926421be 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -7,13 +7,16 @@ #include #include #include +#include #include #include #include #include #include -#include + +#include +#include static bool disable_dac_quirk __read_mostly; @@ -34,24 +37,83 @@ int no_iommu __read_mostly; /* Set this to 1 if there is a HW IOMMU in the system */ int iommu_detected __read_mostly = 0; -extern struct iommu_table_entry __iommu_table[], __iommu_table_end[]; +#ifdef CONFIG_SWIOTLB +bool x86_swiotlb_enable; + +static void __init pci_swiotlb_detect(void) +{ + /* don't initialize swiotlb if iommu=off (no_iommu=1) */ + if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN) + x86_swiotlb_enable = true; + + /* + * Set swiotlb to 1 so that bounce buffers are allocated and used for + * devices that can't support DMA to encrypted memory. + */ + if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) + x86_swiotlb_enable = true; + + if (swiotlb_force == SWIOTLB_FORCE) + x86_swiotlb_enable = true; +} +#else +static inline void __init pci_swiotlb_detect(void) +{ +} +#endif /* CONFIG_SWIOTLB */ + +#ifdef CONFIG_SWIOTLB_XEN +static bool xen_swiotlb; + +static void __init pci_xen_swiotlb_init(void) +{ + if (!xen_initial_domain() && !x86_swiotlb_enable && + swiotlb_force != SWIOTLB_FORCE) + return; + x86_swiotlb_enable = true; + xen_swiotlb = true; + xen_swiotlb_init_early(); + dma_ops = &xen_swiotlb_dma_ops; + if (IS_ENABLED(CONFIG_PCI)) + pci_request_acs(); +} + +int pci_xen_swiotlb_init_late(void) +{ + int rc; + + if (xen_swiotlb) + return 0; + + rc = xen_swiotlb_init(); + if (rc) + return rc; + + /* XXX: this switches the dma ops under live devices! 
*/ + dma_ops = &xen_swiotlb_dma_ops; + if (IS_ENABLED(CONFIG_PCI)) + pci_request_acs(); + return 0; +} +EXPORT_SYMBOL_GPL(pci_xen_swiotlb_init_late); +#else +static inline void __init pci_xen_swiotlb_init(void) +{ +} +#endif /* CONFIG_SWIOTLB_XEN */ void __init pci_iommu_alloc(void) { - struct iommu_table_entry *p; - - sort_iommu_table(__iommu_table, __iommu_table_end); - check_iommu_entries(__iommu_table, __iommu_table_end); - - for (p = __iommu_table; p < __iommu_table_end; p++) { - if (p && p->detect && p->detect() > 0) { - p->flags |= IOMMU_DETECTED; - if (p->early_init) - p->early_init(); - if (p->flags & IOMMU_FINISH_IF_DETECTED) - break; - } + if (xen_pv_domain()) { + pci_xen_swiotlb_init(); + return; } + pci_swiotlb_detect(); + gart_iommu_hole_init(); + amd_iommu_detect(); + detect_intel_iommu(); + if (x86_swiotlb_enable) + swiotlb_init(0); } /* @@ -102,7 +164,7 @@ static __init int iommu_setup(char *p) } #ifdef CONFIG_SWIOTLB if (!strncmp(p, "soft", 4)) - swiotlb = 1; + x86_swiotlb_enable = true; #endif if (!strncmp(p, "pt", 2)) iommu_set_default_passthrough(true); @@ -121,14 +183,17 @@ early_param("iommu", iommu_setup); static int __init pci_iommu_init(void) { - struct iommu_table_entry *p; - x86_init.iommu.iommu_init(); - for (p = __iommu_table; p < __iommu_table_end; p++) { - if (p && (p->flags & IOMMU_DETECTED) && p->late_init) - p->late_init(); +#ifdef CONFIG_SWIOTLB + /* An IOMMU turned us off. */ + if (x86_swiotlb_enable) { + pr_info("PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); + swiotlb_print_info(); + } else { + swiotlb_exit(); } +#endif return 0; } diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c deleted file mode 100644 index 42e92ec62973..000000000000 --- a/arch/x86/kernel/pci-iommu_table.c +++ /dev/null @@ -1,77 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include - -static struct iommu_table_entry * __init -find_dependents_of(struct iommu_table_entry *start, - struct iommu_table_entry *finish, - struct iommu_table_entry *q) -{ - struct iommu_table_entry *p; - - if (!q) - return NULL; - - for (p = start; p < finish; p++) - if (p->detect == q->depend) - return p; - - return NULL; -} - - -void __init sort_iommu_table(struct iommu_table_entry *start, - struct iommu_table_entry *finish) { - - struct iommu_table_entry *p, *q, tmp; - - for (p = start; p < finish; p++) { -again: - q = find_dependents_of(start, finish, p); - /* We are bit sneaky here. We use the memory address to figure - * out if the node we depend on is past our point, if so, swap. - */ - if (q > p) { - tmp = *p; - memmove(p, q, sizeof(*p)); - *q = tmp; - goto again; - } - } - -} - -#ifdef DEBUG -void __init check_iommu_entries(struct iommu_table_entry *start, - struct iommu_table_entry *finish) -{ - struct iommu_table_entry *p, *q, *x; - - /* Simple cyclic dependency checker. */ - for (p = start; p < finish; p++) { - q = find_dependents_of(start, finish, p); - x = find_dependents_of(start, finish, q); - if (p == x) { - printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %pS depends on %pS and vice-versa. BREAKING IT.\n", - p->detect, q->detect); - /* Heavy handed way..*/ - x->depend = NULL; - } - } - - for (p = start; p < finish; p++) { - q = find_dependents_of(p, finish, p); - if (q && q > p) { - printk(KERN_ERR "EXECUTION ORDER INVALID! 
%pS should be called before %pS!\n", - p->detect, q->detect); - } - } -} -#else -void __init check_iommu_entries(struct iommu_table_entry *start, - struct iommu_table_entry *finish) -{ -} -#endif diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c deleted file mode 100644 index 814ab46a0dad..000000000000 --- a/arch/x86/kernel/pci-swiotlb.c +++ /dev/null @@ -1,77 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -int swiotlb __read_mostly; - -/* - * pci_swiotlb_detect_override - set swiotlb to 1 if necessary - * - * This returns non-zero if we are forced to use swiotlb (by the boot - * option). - */ -int __init pci_swiotlb_detect_override(void) -{ - if (swiotlb_force == SWIOTLB_FORCE) - swiotlb = 1; - - return swiotlb; -} -IOMMU_INIT_FINISH(pci_swiotlb_detect_override, - pci_xen_swiotlb_detect, - pci_swiotlb_init, - pci_swiotlb_late_init); - -/* - * If 4GB or more detected (and iommu=off not set) or if SME is active - * then set swiotlb to 1 and return 1. - */ -int __init pci_swiotlb_detect_4gb(void) -{ - /* don't initialize swiotlb if iommu=off (no_iommu=1) */ - if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN) - swiotlb = 1; - - /* - * Set swiotlb to 1 so that bounce buffers are allocated and used for - * devices that can't support DMA to encrypted memory. - */ - if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) - swiotlb = 1; - - return swiotlb; -} -IOMMU_INIT(pci_swiotlb_detect_4gb, - pci_swiotlb_detect_override, - pci_swiotlb_init, - pci_swiotlb_late_init); - -void __init pci_swiotlb_init(void) -{ - if (swiotlb) - swiotlb_init(0); -} - -void __init pci_swiotlb_late_init(void) -{ - /* An IOMMU turned us off. */ - if (!swiotlb) - swiotlb_exit(); - else { - printk(KERN_INFO "PCI-DMA: " - "Using software bounce buffering for IO (SWIOTLB)\n"); - swiotlb_print_info(); - } -} diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index f9af561c3cd4..0c1154a1c403 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 7fda7f27e762..f5f6dc2e8007 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -315,18 +315,6 @@ SECTIONS *(.altinstr_replacement) } - /* - * struct iommu_table_entry entries are injected in this section. - * It is an array of IOMMUs which during run time gets sorted depending - * on its dependency order. After rootfs_initcall is complete - * this section can be safely removed. - */ - .iommu_table : AT(ADDR(.iommu_table) - LOAD_OFFSET) { - __iommu_table = .; - *(.iommu_table) - __iommu_table_end = .; - } - . 
= ALIGN(8); .apicdrivers : AT(ADDR(.apicdrivers) - LOAD_OFFSET) { __apicdrivers = .; diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 4953260e281c..3c5b52fbe4a7 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -47,6 +47,4 @@ obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o obj-$(CONFIG_XEN_PV_DOM0) += vga.o -obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o - obj-$(CONFIG_XEN_EFI) += efi.o diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c deleted file mode 100644 index 46df59aeaa06..000000000000 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ /dev/null @@ -1,96 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -/* Glue code to lib/swiotlb-xen.c */ - -#include -#include -#include - -#include -#include -#include - - -#include -#ifdef CONFIG_X86_64 -#include -#include -#endif -#include - -static int xen_swiotlb __read_mostly; - -/* - * pci_xen_swiotlb_detect - set xen_swiotlb to 1 if necessary - * - * This returns non-zero if we are forced to use xen_swiotlb (by the boot - * option). - */ -int __init pci_xen_swiotlb_detect(void) -{ - - if (!xen_pv_domain()) - return 0; - - /* If running as PV guest, either iommu=soft, or swiotlb=force will - * activate this IOMMU. If running as PV privileged, activate it - * irregardless. - */ - if (xen_initial_domain() || swiotlb || swiotlb_force == SWIOTLB_FORCE) - xen_swiotlb = 1; - - /* If we are running under Xen, we MUST disable the native SWIOTLB. - * Don't worry about swiotlb_force flag activating the native, as - * the 'swiotlb' flag is the only one turning it on. */ - swiotlb = 0; - -#ifdef CONFIG_X86_64 - /* pci_swiotlb_detect_4gb turns on native SWIOTLB if no_iommu == 0 - * (so no iommu=X command line over-writes). - * Considering that PV guests do not want the *native SWIOTLB* but - * only Xen SWIOTLB it is not useful to us so set no_iommu=1 here. - */ - if (max_pfn > MAX_DMA32_PFN) - no_iommu = 1; -#endif - return xen_swiotlb; -} - -static void __init pci_xen_swiotlb_init(void) -{ - if (xen_swiotlb) { - xen_swiotlb_init_early(); - dma_ops = &xen_swiotlb_dma_ops; - -#ifdef CONFIG_PCI - /* Make sure ACS will be enabled */ - pci_request_acs(); -#endif - } -} - -int pci_xen_swiotlb_init_late(void) -{ - int rc; - - if (xen_swiotlb) - return 0; - - rc = xen_swiotlb_init(); - if (rc) - return rc; - - dma_ops = &xen_swiotlb_dma_ops; -#ifdef CONFIG_PCI - /* Make sure ACS will be enabled */ - pci_request_acs(); -#endif - - return 0; -} -EXPORT_SYMBOL_GPL(pci_xen_swiotlb_init_late); - -IOMMU_INIT_FINISH(pci_xen_swiotlb_detect, - NULL, - pci_xen_swiotlb_init, - NULL); diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index b4a798c7b347..1a3ad58ba846 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include @@ -3257,11 +3256,6 @@ __setup("ivrs_ioapic", parse_ivrs_ioapic); __setup("ivrs_hpet", parse_ivrs_hpet); __setup("ivrs_acpihid", parse_ivrs_acpihid); -IOMMU_INIT_FINISH(amd_iommu_detect, - gart_iommu_hole_init, - NULL, - NULL); - bool amd_iommu_v2_supported(void) { return amd_iommu_v2_present; diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index a1ada7bff44e..b47220ac09ea 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1840,7 +1840,10 @@ void amd_iommu_domain_update(struct protection_domain *domain) static void __init amd_iommu_init_dma_ops(void) { - swiotlb = (iommu_default_passthrough() || sme_me_mask) ? 
1 : 0; + if (iommu_default_passthrough() || sme_me_mask) + x86_swiotlb_enable = true; + else + x86_swiotlb_enable = false; } int __init amd_iommu_init_api(void) diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index 4de960834a1b..592c1e1a5d4b 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include "../irq_remapping.h" @@ -912,7 +911,7 @@ dmar_validate_one_drhd(struct acpi_dmar_header *entry, void *arg) return 0; } -int __init detect_intel_iommu(void) +void __init detect_intel_iommu(void) { int ret; struct dmar_res_callback validate_drhd_cb = { @@ -945,8 +944,6 @@ int __init detect_intel_iommu(void) dmar_tbl = NULL; } up_write(&dmar_global_lock); - - return ret ? ret : 1; } static void unmap_iommu(struct intel_iommu *iommu) @@ -2164,7 +2161,6 @@ static int __init dmar_free_unused_resources(void) } late_initcall(dmar_free_unused_resources); -IOMMU_INIT_POST(detect_intel_iommu); /* * DMAR Hotplug Support diff --git a/include/linux/dmar.h b/include/linux/dmar.h index 45e903d84733..cbd714a198a0 100644 --- a/include/linux/dmar.h +++ b/include/linux/dmar.h @@ -121,7 +121,7 @@ extern int dmar_remove_dev_scope(struct dmar_pci_notify_info *info, u16 segment, struct dmar_dev_scope *devices, int count); /* Intel IOMMU detection */ -extern int detect_intel_iommu(void); +void detect_intel_iommu(void); extern int enable_drhd_fault_handling(void); extern int dmar_device_add(acpi_handle handle); extern int dmar_device_remove(acpi_handle handle); @@ -197,6 +197,10 @@ static inline bool dmar_platform_optin(void) return false; } +static inline void detect_intel_iommu(void) +{ +} + #endif /* CONFIG_DMAR_TABLE */ struct irte { -- cgit v1.2.3-59-g8ed1b From 3f70356edf5611c28a68d8d5a9c2b442c9eb81e6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Mar 2022 07:58:45 +0100 Subject: swiotlb: merge swiotlb-xen initialization into swiotlb Reuse the generic swiotlb initialization for xen-swiotlb. For ARM/ARM64 this works trivially, while for x86 xen_swiotlb_fixup needs to be passed as the remap argument to swiotlb_init_remap/swiotlb_init_late. Note that the lower bound of the swiotlb size is changed to the smaller IO_TLB_MIN_SLABS based value with this patch, but that is fine as the 2MB value used in Xen before was just an optimization and is not the hard lower bound. 
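The resulting call pattern, in sketch form (both calls appear in the x86 hunks below; "flags" stands in for the x86 SWIOTLB flags and "rc" for the caller's return code):

	/* Early boot: generic allocation, with the Xen remap hook passed in. */
	swiotlb_init_remap(true, flags, xen_swiotlb_fixup);

	/* Late path: only needed if no default pool was set up earlier. */
	if (!io_tlb_default_mem.nslabs)
		rc = swiotlb_init_late(swiotlb_size_or_default(),
				       GFP_KERNEL, xen_swiotlb_fixup);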
Signed-off-by: Christoph Hellwig Reviewed-by: Stefano Stabellini Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Boris Ostrovsky --- arch/arm/xen/mm.c | 21 +++---- arch/x86/include/asm/xen/page.h | 5 -- arch/x86/kernel/pci-dma.c | 20 +++---- drivers/xen/swiotlb-xen.c | 128 +--------------------------------------- include/xen/arm/page.h | 1 - include/xen/swiotlb-xen.h | 8 ++- 6 files changed, 28 insertions(+), 155 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c index 28c207060253..ff05a7899cb8 100644 --- a/arch/arm/xen/mm.c +++ b/arch/arm/xen/mm.c @@ -23,22 +23,20 @@ #include #include -unsigned long xen_get_swiotlb_free_pages(unsigned int order) +static gfp_t xen_swiotlb_gfp(void) { phys_addr_t base; - gfp_t flags = __GFP_NOWARN|__GFP_KSWAPD_RECLAIM; u64 i; for_each_mem_range(i, &base, NULL) { if (base < (phys_addr_t)0xffffffff) { if (IS_ENABLED(CONFIG_ZONE_DMA32)) - flags |= __GFP_DMA32; - else - flags |= __GFP_DMA; - break; + return __GFP_DMA32; + return __GFP_DMA; } } - return __get_free_pages(flags, order); + + return GFP_KERNEL; } static bool hypercall_cflush = false; @@ -140,10 +138,13 @@ static int __init xen_mm_init(void) if (!xen_swiotlb_detect()) return 0; - rc = xen_swiotlb_init(); /* we can work with the default swiotlb */ - if (rc < 0 && rc != -EEXIST) - return rc; + if (!io_tlb_default_mem.nslabs) { + rc = swiotlb_init_late(swiotlb_size_or_default(), + xen_swiotlb_gfp(), NULL); + if (rc < 0) + return rc; + } cflush.op = 0; cflush.a.dev_bus_addr = 0; diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index e989bc2269f5..1fc67df50014 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -357,9 +357,4 @@ static inline bool xen_arch_need_swiotlb(struct device *dev, return false; } -static inline unsigned long xen_get_swiotlb_free_pages(unsigned int order) -{ - return __get_free_pages(__GFP_NOWARN, order); -} - #endif /* _ASM_X86_XEN_PAGE_H */ diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index a705a199bf8a..30bbe4abb5d6 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -72,15 +72,13 @@ static inline void __init pci_swiotlb_detect(void) #endif /* CONFIG_SWIOTLB */ #ifdef CONFIG_SWIOTLB_XEN -static bool xen_swiotlb; - static void __init pci_xen_swiotlb_init(void) { if (!xen_initial_domain() && !x86_swiotlb_enable) return; x86_swiotlb_enable = true; - xen_swiotlb = true; - xen_swiotlb_init_early(); + x86_swiotlb_flags |= SWIOTLB_ANY; + swiotlb_init_remap(true, x86_swiotlb_flags, xen_swiotlb_fixup); dma_ops = &xen_swiotlb_dma_ops; if (IS_ENABLED(CONFIG_PCI)) pci_request_acs(); @@ -88,14 +86,16 @@ static void __init pci_xen_swiotlb_init(void) int pci_xen_swiotlb_init_late(void) { - int rc; - - if (xen_swiotlb) + if (dma_ops == &xen_swiotlb_dma_ops) return 0; - rc = xen_swiotlb_init(); - if (rc) - return rc; + /* we can work with the default swiotlb */ + if (!io_tlb_default_mem.nslabs) { + int rc = swiotlb_init_late(swiotlb_size_or_default(), + GFP_KERNEL, xen_swiotlb_fixup); + if (rc < 0) + return rc; + } /* XXX: this switches the dma ops under live devices! 
*/ dma_ops = &xen_swiotlb_dma_ops; diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index c2da3eb4826e..df8085b50df1 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -104,7 +104,7 @@ static int is_xen_swiotlb_buffer(struct device *dev, dma_addr_t dma_addr) return 0; } -static int xen_swiotlb_fixup(void *buf, unsigned long nslabs) +int xen_swiotlb_fixup(void *buf, unsigned long nslabs) { int rc; unsigned int order = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT); @@ -130,132 +130,6 @@ static int xen_swiotlb_fixup(void *buf, unsigned long nslabs) return 0; } -enum xen_swiotlb_err { - XEN_SWIOTLB_UNKNOWN = 0, - XEN_SWIOTLB_ENOMEM, - XEN_SWIOTLB_EFIXUP -}; - -static const char *xen_swiotlb_error(enum xen_swiotlb_err err) -{ - switch (err) { - case XEN_SWIOTLB_ENOMEM: - return "Cannot allocate Xen-SWIOTLB buffer\n"; - case XEN_SWIOTLB_EFIXUP: - return "Failed to get contiguous memory for DMA from Xen!\n"\ - "You either: don't have the permissions, do not have"\ - " enough free memory under 4GB, or the hypervisor memory"\ - " is too fragmented!"; - default: - break; - } - return ""; -} - -int xen_swiotlb_init(void) -{ - enum xen_swiotlb_err m_ret = XEN_SWIOTLB_UNKNOWN; - unsigned long bytes = swiotlb_size_or_default(); - unsigned long nslabs = bytes >> IO_TLB_SHIFT; - unsigned int order, repeat = 3; - int rc = -ENOMEM; - char *start; - - if (io_tlb_default_mem.nslabs) { - pr_warn("swiotlb buffer already initialized\n"); - return -EEXIST; - } - -retry: - m_ret = XEN_SWIOTLB_ENOMEM; - order = get_order(bytes); - - /* - * Get IO TLB memory from any location. - */ -#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT)) -#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) - while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { - start = (void *)xen_get_swiotlb_free_pages(order); - if (start) - break; - order--; - } - if (!start) - goto exit; - if (order != get_order(bytes)) { - pr_warn("Warning: only able to allocate %ld MB for software IO TLB\n", - (PAGE_SIZE << order) >> 20); - nslabs = SLABS_PER_PAGE << order; - bytes = nslabs << IO_TLB_SHIFT; - } - - /* - * And replace that memory with pages under 4GB. - */ - rc = xen_swiotlb_fixup(start, nslabs); - if (rc) { - free_pages((unsigned long)start, order); - m_ret = XEN_SWIOTLB_EFIXUP; - goto error; - } - rc = swiotlb_late_init_with_tbl(start, nslabs); - if (rc) - return rc; - return 0; -error: - if (nslabs > 1024 && repeat--) { - /* Min is 2MB */ - nslabs = max(1024UL, ALIGN(nslabs >> 1, IO_TLB_SEGSIZE)); - bytes = nslabs << IO_TLB_SHIFT; - pr_info("Lowering to %luMB\n", bytes >> 20); - goto retry; - } -exit: - pr_err("%s (rc:%d)\n", xen_swiotlb_error(m_ret), rc); - return rc; -} - -#ifdef CONFIG_X86 -void __init xen_swiotlb_init_early(void) -{ - unsigned long bytes = swiotlb_size_or_default(); - unsigned long nslabs = bytes >> IO_TLB_SHIFT; - unsigned int repeat = 3; - char *start; - int rc; - -retry: - /* - * Get IO TLB memory from any location. - */ - start = memblock_alloc(PAGE_ALIGN(bytes), - IO_TLB_SEGSIZE << IO_TLB_SHIFT); - if (!start) - panic("%s: Failed to allocate %lu bytes\n", - __func__, PAGE_ALIGN(bytes)); - - /* - * And replace that memory with pages under 4GB. 
- */ - rc = xen_swiotlb_fixup(start, nslabs); - if (rc) { - memblock_free(start, PAGE_ALIGN(bytes)); - if (nslabs > 1024 && repeat--) { - /* Min is 2MB */ - nslabs = max(1024UL, ALIGN(nslabs >> 1, IO_TLB_SEGSIZE)); - bytes = nslabs << IO_TLB_SHIFT; - pr_info("Lowering to %luMB\n", bytes >> 20); - goto retry; - } - panic("%s (rc:%d)", xen_swiotlb_error(XEN_SWIOTLB_EFIXUP), rc); - } - - if (swiotlb_init_with_tbl(start, nslabs, SWIOTLB_VERBOSE)) - panic("Cannot allocate SWIOTLB buffer"); -} -#endif /* CONFIG_X86 */ - static void * xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t flags, diff --git a/include/xen/arm/page.h b/include/xen/arm/page.h index ac1b65470563..7e199c6656b9 100644 --- a/include/xen/arm/page.h +++ b/include/xen/arm/page.h @@ -115,6 +115,5 @@ static inline bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) bool xen_arch_need_swiotlb(struct device *dev, phys_addr_t phys, dma_addr_t dev_addr); -unsigned long xen_get_swiotlb_free_pages(unsigned int order); #endif /* _ASM_ARM_XEN_PAGE_H */ diff --git a/include/xen/swiotlb-xen.h b/include/xen/swiotlb-xen.h index b3e647f86e3e..590ceb923f0c 100644 --- a/include/xen/swiotlb-xen.h +++ b/include/xen/swiotlb-xen.h @@ -10,8 +10,12 @@ void xen_dma_sync_for_cpu(struct device *dev, dma_addr_t handle, void xen_dma_sync_for_device(struct device *dev, dma_addr_t handle, size_t size, enum dma_data_direction dir); -int xen_swiotlb_init(void); -void __init xen_swiotlb_init_early(void); +#ifdef CONFIG_SWIOTLB_XEN +int xen_swiotlb_fixup(void *buf, unsigned long nslabs); +#else +#define xen_swiotlb_fixup NULL +#endif + extern const struct dma_map_ops xen_swiotlb_dma_ops; #endif /* __LINUX_SWIOTLB_XEN_H */ -- cgit v1.2.3-59-g8ed1b From 3cb4503a330159dc5cf2f8382181ccbabbbaa5b2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 17 Feb 2022 15:10:54 +0100 Subject: x86: remove cruft from gets pulled in by all drivers using the DMA API. Remove x86 internal variables and unnecessary includes from it. Signed-off-by: Christoph Hellwig Reviewed-by: Konrad Rzeszutek Wilk Tested-by: Boris Ostrovsky --- arch/x86/include/asm/dma-mapping.h | 11 ----------- arch/x86/include/asm/iommu.h | 2 ++ 2 files changed, 2 insertions(+), 11 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 256fd8115223..1c66708e3062 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -2,17 +2,6 @@ #ifndef _ASM_X86_DMA_MAPPING_H #define _ASM_X86_DMA_MAPPING_H -/* - * IOMMU interface. See Documentation/core-api/dma-api-howto.rst and - * Documentation/core-api/dma-api.rst for documentation. 
- */ - -#include -#include - -extern int iommu_merge; -extern int panic_on_overflow; - extern const struct dma_map_ops *dma_ops; static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h index dba89ed40d38..0bef44d30a27 100644 --- a/arch/x86/include/asm/iommu.h +++ b/arch/x86/include/asm/iommu.h @@ -8,6 +8,8 @@ extern int force_iommu, no_iommu; extern int iommu_detected; +extern int iommu_merge; +extern int panic_on_overflow; #ifdef CONFIG_SWIOTLB extern bool x86_swiotlb_enable; -- cgit v1.2.3-59-g8ed1b From 2bf93ffbb97e0614cfc431d2ea33b7eae7481eb2 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Wed, 20 Apr 2022 09:14:13 -0500 Subject: virt: sevguest: Change driver name to reflect generic SEV support During patch review, it was decided the SNP guest driver name should not be SEV-SNP specific, but should be generic for use with anything SEV. However, this feedback was missed and the driver name, and many of the driver functions and structures, are SEV-SNP name specific. Rename the driver to "sev-guest" (to match the misc device that is created) and update some of the function and structure names, too. While in the file, adjust the one pr_err() message to be a dev_err() message so that the message, if issued, uses the driver name. Signed-off-by: Tom Lendacky Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/307710bb5515c9088a19fd0b930268c7300479b2.1650464054.git.thomas.lendacky@amd.com --- arch/x86/include/asm/sev.h | 2 +- arch/x86/kernel/sev.c | 10 ++++----- drivers/virt/coco/sevguest/sevguest.c | 39 +++++++++++++++++++---------------- 3 files changed, 27 insertions(+), 24 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 9c2d33f1cfee..6e3dda4f82b5 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -95,7 +95,7 @@ struct snp_req_data { unsigned int data_npages; }; -struct snp_guest_platform_data { +struct sev_guest_platform_data { u64 secrets_gpa; }; diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index f01f4550e2c6..2fa87a07ab30 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -2166,8 +2166,8 @@ e_restore_irq: } EXPORT_SYMBOL_GPL(snp_issue_guest_request); -static struct platform_device guest_req_device = { - .name = "snp-guest", +static struct platform_device sev_guest_device = { + .name = "sev-guest", .id = -1, }; @@ -2197,7 +2197,7 @@ static u64 get_secrets_page(void) static int __init snp_init_platform_device(void) { - struct snp_guest_platform_data data; + struct sev_guest_platform_data data; u64 gpa; if (!cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) @@ -2208,10 +2208,10 @@ static int __init snp_init_platform_device(void) return -ENODEV; data.secrets_gpa = gpa; - if (platform_device_add_data(&guest_req_device, &data, sizeof(data))) + if (platform_device_add_data(&sev_guest_device, &data, sizeof(data))) return -ENODEV; - if (platform_device_register(&guest_req_device)) + if (platform_device_register(&sev_guest_device)) return -ENODEV; pr_info("SNP guest platform device initialized.\n"); diff --git a/drivers/virt/coco/sevguest/sevguest.c b/drivers/virt/coco/sevguest/sevguest.c index 15f069ec8f0b..18c3231a816d 100644 --- a/drivers/virt/coco/sevguest/sevguest.c +++ b/drivers/virt/coco/sevguest/sevguest.c @@ -1,14 +1,12 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * AMD Secure Encrypted Virtualization Nested Paging (SEV-SNP) guest request interface + * AMD Secure 
Encrypted Virtualization (SEV) guest driver interface * * Copyright (C) 2021 Advanced Micro Devices, Inc. * * Author: Brijesh Singh */ -#define pr_fmt(fmt) "SNP: GUEST: " fmt - #include #include #include @@ -574,7 +572,7 @@ static void free_shared_pages(void *buf, size_t sz) __free_pages(virt_to_page(buf), get_order(sz)); } -static void *alloc_shared_pages(size_t sz) +static void *alloc_shared_pages(struct device *dev, size_t sz) { unsigned int npages = PAGE_ALIGN(sz) >> PAGE_SHIFT; struct page *page; @@ -586,7 +584,7 @@ static void *alloc_shared_pages(size_t sz) ret = set_memory_decrypted((unsigned long)page_address(page), npages); if (ret) { - pr_err("failed to mark page shared, ret=%d\n", ret); + dev_err(dev, "failed to mark page shared, ret=%d\n", ret); __free_pages(page, get_order(sz)); return NULL; } @@ -627,10 +625,10 @@ static u8 *get_vmpck(int id, struct snp_secrets_page_layout *layout, u32 **seqno return key; } -static int __init snp_guest_probe(struct platform_device *pdev) +static int __init sev_guest_probe(struct platform_device *pdev) { struct snp_secrets_page_layout *layout; - struct snp_guest_platform_data *data; + struct sev_guest_platform_data *data; struct device *dev = &pdev->dev; struct snp_guest_dev *snp_dev; struct miscdevice *misc; @@ -639,7 +637,7 @@ static int __init snp_guest_probe(struct platform_device *pdev) if (!dev->platform_data) return -ENODEV; - data = (struct snp_guest_platform_data *)dev->platform_data; + data = (struct sev_guest_platform_data *)dev->platform_data; layout = (__force void *)ioremap_encrypted(data->secrets_gpa, PAGE_SIZE); if (!layout) return -ENODEV; @@ -667,15 +665,15 @@ static int __init snp_guest_probe(struct platform_device *pdev) snp_dev->layout = layout; /* Allocate the shared page used for the request and response message. */ - snp_dev->request = alloc_shared_pages(sizeof(struct snp_guest_msg)); + snp_dev->request = alloc_shared_pages(dev, sizeof(struct snp_guest_msg)); if (!snp_dev->request) goto e_unmap; - snp_dev->response = alloc_shared_pages(sizeof(struct snp_guest_msg)); + snp_dev->response = alloc_shared_pages(dev, sizeof(struct snp_guest_msg)); if (!snp_dev->response) goto e_free_request; - snp_dev->certs_data = alloc_shared_pages(SEV_FW_BLOB_MAX_SIZE); + snp_dev->certs_data = alloc_shared_pages(dev, SEV_FW_BLOB_MAX_SIZE); if (!snp_dev->certs_data) goto e_free_response; @@ -698,7 +696,7 @@ static int __init snp_guest_probe(struct platform_device *pdev) if (ret) goto e_free_cert_data; - dev_info(dev, "Initialized SNP guest driver (using vmpck_id %d)\n", vmpck_id); + dev_info(dev, "Initialized SEV guest driver (using vmpck_id %d)\n", vmpck_id); return 0; e_free_cert_data: @@ -712,7 +710,7 @@ e_unmap: return ret; } -static int __exit snp_guest_remove(struct platform_device *pdev) +static int __exit sev_guest_remove(struct platform_device *pdev) { struct snp_guest_dev *snp_dev = platform_get_drvdata(pdev); @@ -725,16 +723,21 @@ static int __exit snp_guest_remove(struct platform_device *pdev) return 0; } -static struct platform_driver snp_guest_driver = { - .remove = __exit_p(snp_guest_remove), +/* + * This driver is a common SEV guest interface driver and meant to support + * any SEV guest API. As such, even though it has been introduced along with + * the SEV-SNP support, it is named "sev-guest". 
+ */ +static struct platform_driver sev_guest_driver = { + .remove = __exit_p(sev_guest_remove), .driver = { - .name = "snp-guest", + .name = "sev-guest", }, }; -module_platform_driver_probe(snp_guest_driver, snp_guest_probe); +module_platform_driver_probe(sev_guest_driver, sev_guest_probe); MODULE_AUTHOR("Brijesh Singh "); MODULE_LICENSE("GPL"); MODULE_VERSION("1.0.0"); -MODULE_DESCRIPTION("AMD SNP Guest Driver"); +MODULE_DESCRIPTION("AMD SEV Guest Driver"); -- cgit v1.2.3-59-g8ed1b From 03f16cd020eb8bb2eb837e2090086f296a9fa91d Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Apr 2022 09:50:36 -0700 Subject: objtool: Add CONFIG_OBJTOOL Now that stack validation is an optional feature of objtool, add CONFIG_OBJTOOL and replace most usages of CONFIG_STACK_VALIDATION with it. CONFIG_STACK_VALIDATION can now be considered to be frame-pointer specific. CONFIG_UNWINDER_ORC is already inherently valid for live patching, so no need to "validate" it. Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Miroslav Benes Link: https://lkml.kernel.org/r/939bf3d85604b2a126412bf11af6e3bd3b872bcb.1650300597.git.jpoimboe@redhat.com --- Makefile | 2 +- arch/Kconfig | 8 ++++++-- arch/x86/Kconfig | 18 +++++++++++------- arch/x86/Kconfig.debug | 2 +- arch/x86/include/asm/jump_label.h | 6 +++--- arch/x86/kernel/alternative.c | 6 +++--- include/linux/compiler.h | 6 +++--- include/linux/instrumentation.h | 6 +++--- include/linux/objtool.h | 6 +++--- kernel/trace/Kconfig | 1 + lib/Kconfig.debug | 20 +++++++++++--------- lib/Kconfig.kcsan | 3 ++- lib/Kconfig.ubsan | 2 +- scripts/Makefile.build | 4 ++-- scripts/link-vmlinux.sh | 32 ++++++++++++++++++-------------- scripts/package/builddeb | 2 +- tools/include/linux/objtool.h | 6 +++--- 17 files changed, 73 insertions(+), 57 deletions(-) (limited to 'arch/x86/include') diff --git a/Makefile b/Makefile index fa5112a0ec1b..250707647359 100644 --- a/Makefile +++ b/Makefile @@ -1302,7 +1302,7 @@ install: sub_make_done := # --------------------------------------------------------------------------- # Tools -ifdef CONFIG_STACK_VALIDATION +ifdef CONFIG_OBJTOOL prepare: tools/objtool endif diff --git a/arch/Kconfig b/arch/Kconfig index 29b0167c088b..04cdef16db24 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1028,11 +1028,14 @@ config ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT depends on MMU select ARCH_HAS_ELF_RANDOMIZE +config HAVE_OBJTOOL + bool + config HAVE_STACK_VALIDATION bool help - Architecture supports the 'objtool check' host tool command, which - performs compile-time stack metadata validation. + Architecture supports objtool compile-time frame pointer rule + validation. 
config HAVE_RELIABLE_STACKTRACE bool @@ -1302,6 +1305,7 @@ config HAVE_STATIC_CALL config HAVE_STATIC_CALL_INLINE bool depends on HAVE_STATIC_CALL + select OBJTOOL config HAVE_PREEMPT_DYNAMIC bool diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4bed3abf444d..43e26ee1611e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -188,7 +188,7 @@ config X86 select HAVE_CONTEXT_TRACKING if X86_64 select HAVE_CONTEXT_TRACKING_OFFSTACK if HAVE_CONTEXT_TRACKING select HAVE_C_RECORDMCOUNT - select HAVE_OBJTOOL_MCOUNT if STACK_VALIDATION + select HAVE_OBJTOOL_MCOUNT if HAVE_OBJTOOL select HAVE_BUILDTIME_MCOUNT_SORT select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS @@ -231,6 +231,7 @@ config X86 select HAVE_MOVE_PMD select HAVE_MOVE_PUD select HAVE_NMI + select HAVE_OBJTOOL if X86_64 select HAVE_OPTPROBES select HAVE_PCSPKR_PLATFORM select HAVE_PERF_EVENTS @@ -239,17 +240,17 @@ config X86 select HAVE_PCI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP - select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT + select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT select HAVE_POSIX_CPU_TIMERS_TASK_WORK select HAVE_REGS_AND_STACK_ACCESS_API - select HAVE_RELIABLE_STACKTRACE if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION + select HAVE_RELIABLE_STACKTRACE if UNWINDER_ORC || STACK_VALIDATION select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_SETUP_PER_CPU_AREA select HAVE_SOFTIRQ_ON_OWN_STACK select HAVE_STACKPROTECTOR if CC_HAS_SANE_STACKPROTECTOR - select HAVE_STACK_VALIDATION if X86_64 + select HAVE_STACK_VALIDATION if HAVE_OBJTOOL select HAVE_STATIC_CALL - select HAVE_STATIC_CALL_INLINE if HAVE_STACK_VALIDATION + select HAVE_STATIC_CALL_INLINE if HAVE_OBJTOOL select HAVE_PREEMPT_DYNAMIC_CALL select HAVE_RSEQ select HAVE_SYSCALL_TRACEPOINTS @@ -268,7 +269,6 @@ config X86 select RTC_MC146818_LIB select SPARSE_IRQ select SRCU - select STACK_VALIDATION if HAVE_STACK_VALIDATION && (HAVE_STATIC_CALL_INLINE || RETPOLINE) select SYSCTL_EXCEPTION_TRACE select THREAD_INFO_IN_TASK select TRACE_IRQFLAGS_SUPPORT @@ -459,6 +459,7 @@ config GOLDFISH config RETPOLINE bool "Avoid speculative indirect branches in kernel" + select OBJTOOL if HAVE_OBJTOOL default y help Compile kernel with the retpoline compiler options to guard against @@ -472,6 +473,7 @@ config CC_HAS_SLS config SLS bool "Mitigate Straight-Line-Speculation" depends on CC_HAS_SLS && X86_64 + select OBJTOOL if HAVE_OBJTOOL default n help Compile the kernel with straight-line-speculation options to guard @@ -1819,6 +1821,7 @@ config ARCH_RANDOM config X86_SMAP def_bool y prompt "Supervisor Mode Access Prevention" if EXPERT + select OBJTOOL if HAVE_OBJTOOL help Supervisor Mode Access Prevention (SMAP) is a security feature in newer Intel processors. 
There is a small @@ -1855,9 +1858,10 @@ config CC_HAS_IBT config X86_KERNEL_IBT prompt "Indirect Branch Tracking" bool - depends on X86_64 && CC_HAS_IBT && STACK_VALIDATION + depends on X86_64 && CC_HAS_IBT && HAVE_OBJTOOL # https://github.com/llvm/llvm-project/commit/9d7001eba9c4cb311e03cd8cdc231f9e579f2d0f depends on !LD_IS_LLD || LLD_VERSION >= 140000 + select OBJTOOL help Build the kernel with support for Indirect Branch Tracking, a hardware support course-grain forward-edge Control Flow Integrity diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index d3a6f74a94bd..d872a7522e55 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -237,7 +237,7 @@ choice config UNWINDER_ORC bool "ORC unwinder" depends on X86_64 - select STACK_VALIDATION + select OBJTOOL help This option enables the ORC (Oops Rewind Capability) unwinder for unwinding kernel stack traces. It uses a custom data format which is diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 0449b125d27f..3ce0e67c579c 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -20,7 +20,7 @@ _ASM_PTR "%c0 + %c1 - .\n\t" \ ".popsection \n\t" -#ifdef CONFIG_STACK_VALIDATION +#ifdef CONFIG_OBJTOOL static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { @@ -34,7 +34,7 @@ l_yes: return true; } -#else +#else /* !CONFIG_OBJTOOL */ static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch) { @@ -48,7 +48,7 @@ l_yes: return true; } -#endif /* STACK_VALIDATION */ +#endif /* CONFIG_OBJTOOL */ static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch) { diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index d374cb3cf024..3c66073e7645 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -338,7 +338,7 @@ next: } } -#if defined(CONFIG_RETPOLINE) && defined(CONFIG_STACK_VALIDATION) +#if defined(CONFIG_RETPOLINE) && defined(CONFIG_OBJTOOL) /* * CALL/JMP *%\reg @@ -507,11 +507,11 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) } } -#else /* !RETPOLINES || !CONFIG_STACK_VALIDATION */ +#else /* !CONFIG_RETPOLINE || !CONFIG_OBJTOOL */ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } -#endif /* CONFIG_RETPOLINE && CONFIG_STACK_VALIDATION */ +#endif /* CONFIG_RETPOLINE && CONFIG_OBJTOOL */ #ifdef CONFIG_X86_KERNEL_IBT diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 219aa5ddbc73..01ce94b58b42 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -109,7 +109,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, #endif /* Unreachable code */ -#ifdef CONFIG_STACK_VALIDATION +#ifdef CONFIG_OBJTOOL /* * These macros help objtool understand GCC code flow for unreachable code. 
* The __COUNTER__ based labels are a hack to make each instance of the macros @@ -128,10 +128,10 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, /* Annotate a C jump table to allow objtool to follow the code flow */ #define __annotate_jump_table __section(".rodata..c_jump_table") -#else +#else /* !CONFIG_OBJTOOL */ #define annotate_unreachable() #define __annotate_jump_table -#endif +#endif /* CONFIG_OBJTOOL */ #ifndef unreachable # define unreachable() do { \ diff --git a/include/linux/instrumentation.h b/include/linux/instrumentation.h index 24359b4a9605..9111a3704072 100644 --- a/include/linux/instrumentation.h +++ b/include/linux/instrumentation.h @@ -2,7 +2,7 @@ #ifndef __LINUX_INSTRUMENTATION_H #define __LINUX_INSTRUMENTATION_H -#if defined(CONFIG_DEBUG_ENTRY) && defined(CONFIG_STACK_VALIDATION) +#ifdef CONFIG_VMLINUX_VALIDATION #include @@ -53,9 +53,9 @@ ".popsection\n\t" : : "i" (c)); \ }) #define instrumentation_end() __instrumentation_end(__COUNTER__) -#else +#else /* !CONFIG_VMLINUX_VALIDATION */ # define instrumentation_begin() do { } while(0) # define instrumentation_end() do { } while(0) -#endif +#endif /* CONFIG_VMLINUX_VALIDATION */ #endif /* __LINUX_INSTRUMENTATION_H */ diff --git a/include/linux/objtool.h b/include/linux/objtool.h index 586d35720f13..977d90ba642d 100644 --- a/include/linux/objtool.h +++ b/include/linux/objtool.h @@ -38,7 +38,7 @@ struct unwind_hint { #define UNWIND_HINT_TYPE_REGS_PARTIAL 2 #define UNWIND_HINT_TYPE_FUNC 3 -#ifdef CONFIG_STACK_VALIDATION +#ifdef CONFIG_OBJTOOL #ifndef __ASSEMBLY__ @@ -157,7 +157,7 @@ struct unwind_hint { #endif /* __ASSEMBLY__ */ -#else /* !CONFIG_STACK_VALIDATION */ +#else /* !CONFIG_OBJTOOL */ #ifndef __ASSEMBLY__ @@ -179,6 +179,6 @@ struct unwind_hint { .endm #endif -#endif /* CONFIG_STACK_VALIDATION */ +#endif /* CONFIG_OBJTOOL */ #endif /* _LINUX_OBJTOOL_H */ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 2c43e327a619..2956bc277150 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -728,6 +728,7 @@ config FTRACE_MCOUNT_USE_OBJTOOL depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY depends on !FTRACE_MCOUNT_USE_CC depends on FTRACE_MCOUNT_RECORD + select OBJTOOL config FTRACE_MCOUNT_USE_RECORDMCOUNT def_bool y diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 075cd25363ac..c0e4e47f3ce3 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -485,24 +485,25 @@ config FRAME_POINTER larger and slower, but it gives very useful debugging information in case of kernel bugs. (precise oopses/stacktraces/warnings) +config OBJTOOL + bool + config STACK_VALIDATION bool "Compile-time stack metadata validation" - depends on HAVE_STACK_VALIDATION + depends on HAVE_STACK_VALIDATION && UNWINDER_FRAME_POINTER + select OBJTOOL default n help - Add compile-time checks to validate stack metadata, including frame - pointers (if CONFIG_FRAME_POINTER is enabled). This helps ensure - that runtime stack traces are more reliable. - - This is also a prerequisite for generation of ORC unwind data, which - is needed for CONFIG_UNWINDER_ORC. + Validate frame pointer rules at compile-time. This helps ensure that + runtime stack traces are more reliable. For more information, see tools/objtool/Documentation/stack-validation.txt. 
config VMLINUX_VALIDATION bool - depends on STACK_VALIDATION && DEBUG_ENTRY + depends on HAVE_OBJTOOL && DEBUG_ENTRY + select OBJTOOL default y config VMLINUX_MAP @@ -2035,10 +2036,11 @@ config KCOV bool "Code coverage for fuzzing" depends on ARCH_HAS_KCOV depends on CC_HAS_SANCOV_TRACE_PC || GCC_PLUGINS - depends on !ARCH_WANTS_NO_INSTR || STACK_VALIDATION || \ + depends on !ARCH_WANTS_NO_INSTR || HAVE_OBJTOOL || \ GCC_VERSION >= 120000 || CLANG_VERSION >= 130000 select DEBUG_FS select GCC_PLUGIN_SANCOV if !CC_HAS_SANCOV_TRACE_PC + select OBJTOOL if HAVE_OBJTOOL help KCOV exposes kernel code coverage information in a form suitable for coverage-guided fuzzing (randomized testing). diff --git a/lib/Kconfig.kcsan b/lib/Kconfig.kcsan index de022445fbba..901c3b509aca 100644 --- a/lib/Kconfig.kcsan +++ b/lib/Kconfig.kcsan @@ -187,7 +187,8 @@ config KCSAN_WEAK_MEMORY # We can either let objtool nop __tsan_func_{entry,exit}() and builtin # atomics instrumentation in .noinstr.text, or use a compiler that can # implement __no_kcsan to really remove all instrumentation. - depends on STACK_VALIDATION || CC_IS_GCC || CLANG_VERSION >= 140000 + depends on HAVE_OBJTOOL || CC_IS_GCC || CLANG_VERSION >= 140000 + select OBJTOOL if HAVE_OBJTOOL help Enable support for modeling a subset of weak memory, which allows detecting a subset of data races due to missing memory barriers. diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan index f3c57ed51838..c4fe15d38b60 100644 --- a/lib/Kconfig.ubsan +++ b/lib/Kconfig.ubsan @@ -94,7 +94,7 @@ config UBSAN_UNREACHABLE bool "Perform checking for unreachable code" # objtool already handles unreachable checking and gets angry about # seeing UBSan instrumentation located in unreachable places. - depends on !STACK_VALIDATION + depends on !(OBJTOOL && (STACK_VALIDATION || UNWINDER_ORC || X86_SMAP)) depends on $(cc-option,-fsanitize=unreachable) help This option enables -fsanitize=unreachable which checks for control diff --git a/scripts/Makefile.build b/scripts/Makefile.build index d5e15ae29156..0f73e02b7cf1 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -222,7 +222,7 @@ cmd_record_mcount = $(if $(findstring $(strip $(CC_FLAGS_FTRACE)),$(_c_flags)), $(sub_cmd_record_mcount)) endif # CONFIG_FTRACE_MCOUNT_USE_RECORDMCOUNT -ifdef CONFIG_STACK_VALIDATION +ifdef CONFIG_OBJTOOL objtool := $(objtree)/tools/objtool/objtool @@ -241,7 +241,7 @@ objtool_args = \ cmd_objtool = $(if $(objtool-enabled), ; $(objtool) $(objtool_args) $@) cmd_gen_objtooldep = $(if $(objtool-enabled), { echo ; echo '$@: $$(wildcard $(objtool))' ; } >> $(dot-target).cmd) -endif # CONFIG_STACK_VALIDATION +endif # CONFIG_OBJTOOL ifneq ($(CONFIG_LTO_CLANG)$(CONFIG_X86_KERNEL_IBT),) diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 0140bfa32c0c..5101a7fbfaaf 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -108,8 +108,11 @@ objtool_link() local objtoolcmd; local objtoolopt; - if is_enabled CONFIG_STACK_VALIDATION && \ - ( is_enabled CONFIG_LTO_CLANG || is_enabled CONFIG_X86_KERNEL_IBT ); then + if ! is_enabled CONFIG_OBJTOOL; then + return; + fi + + if is_enabled CONFIG_LTO_CLANG || is_enabled CONFIG_X86_KERNEL_IBT; then # Don't perform vmlinux validation unless explicitly requested, # but run objtool on vmlinux.o now that we have an object file. 
@@ -126,10 +129,23 @@ objtool_link() objtoolopt="${objtoolopt} --orc" fi + if is_enabled CONFIG_RETPOLINE; then + objtoolopt="${objtoolopt} --retpoline" + fi + + if is_enabled CONFIG_SLS; then + objtoolopt="${objtoolopt} --sls" + fi + if is_enabled CONFIG_STACK_VALIDATION; then objtoolopt="${objtoolopt} --stackval" fi + if is_enabled CONFIG_X86_SMAP; then + objtoolopt="${objtoolopt} --uaccess" + fi + + objtoolopt="${objtoolopt} --lto" fi @@ -139,18 +155,6 @@ objtool_link() if [ -n "${objtoolopt}" ]; then - if is_enabled CONFIG_RETPOLINE; then - objtoolopt="${objtoolopt} --retpoline" - fi - - if is_enabled CONFIG_SLS; then - objtoolopt="${objtoolopt} --sls" - fi - - if is_enabled CONFIG_X86_SMAP; then - objtoolopt="${objtoolopt} --uaccess" - fi - if ! is_enabled CONFIG_FRAME_POINTER; then objtoolopt="${objtoolopt} --no-fp" fi diff --git a/scripts/package/builddeb b/scripts/package/builddeb index 91a502bb97e8..67cd420dcf89 100755 --- a/scripts/package/builddeb +++ b/scripts/package/builddeb @@ -67,7 +67,7 @@ deploy_kernel_headers () { ) > debian/hdrsrcfiles { - if is_enabled CONFIG_STACK_VALIDATION; then + if is_enabled CONFIG_OBJTOOL; then echo tools/objtool/objtool fi diff --git a/tools/include/linux/objtool.h b/tools/include/linux/objtool.h index 586d35720f13..977d90ba642d 100644 --- a/tools/include/linux/objtool.h +++ b/tools/include/linux/objtool.h @@ -38,7 +38,7 @@ struct unwind_hint { #define UNWIND_HINT_TYPE_REGS_PARTIAL 2 #define UNWIND_HINT_TYPE_FUNC 3 -#ifdef CONFIG_STACK_VALIDATION +#ifdef CONFIG_OBJTOOL #ifndef __ASSEMBLY__ @@ -157,7 +157,7 @@ struct unwind_hint { #endif /* __ASSEMBLY__ */ -#else /* !CONFIG_STACK_VALIDATION */ +#else /* !CONFIG_OBJTOOL */ #ifndef __ASSEMBLY__ @@ -179,6 +179,6 @@ struct unwind_hint { .endm #endif -#endif /* CONFIG_STACK_VALIDATION */ +#endif /* CONFIG_OBJTOOL */ #endif /* _LINUX_OBJTOOL_H */ -- cgit v1.2.3-59-g8ed1b From 4ab7674f5951ac6a8ac4fa8828090edb64a4771f Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 18 Apr 2022 09:50:39 -0700 Subject: objtool: Make jump label hack optional Objtool secretly does a jump label hack to overcome the limitations of the toolchain. Make the hack explicit (and optional for other arches) by turning it into a cmdline option and kernel config option. 
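As a side note (illustration only, not part of the patch), the comma-separated --hacks= option is parsed with plain strstr() so that further toolchain workarounds can be added the same way. In the sketch below the "noreturn" hack name and opts.hack_noreturn are purely hypothetical and only show the extension pattern of parse_hacks():

	static int parse_hacks_example(const struct option *opt, const char *str,
				       int unset)
	{
		bool found = false;

		/* no argument given == enable every known hack */
		if (!str || strstr(str, "jump_label")) {
			opts.hack_jump_label = true;
			found = true;
		}

		/* hypothetical second workaround, shown only for the pattern */
		if (!str || strstr(str, "noreturn")) {
			/* opts.hack_noreturn = true; */
			found = true;
		}

		return found ? 0 : -1;
	}
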
Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Miroslav Benes Link: https://lkml.kernel.org/r/3bdcbfdd27ecb01ddec13c04bdf756a583b13d24.1650300597.git.jpoimboe@redhat.com --- arch/Kconfig | 4 ++++ arch/x86/Kconfig | 1 + arch/x86/include/asm/jump_label.h | 6 +++--- scripts/Makefile.build | 1 + scripts/link-vmlinux.sh | 4 ++++ tools/objtool/builtin-check.c | 37 ++++++++++++++++++++++++++------- tools/objtool/check.c | 2 +- tools/objtool/include/objtool/builtin.h | 1 + 8 files changed, 44 insertions(+), 12 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/Kconfig b/arch/Kconfig index 04cdef16db24..9dce6d6e3bc3 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -46,6 +46,7 @@ config JUMP_LABEL bool "Optimize very unlikely/likely branches" depends on HAVE_ARCH_JUMP_LABEL depends on CC_HAS_ASM_GOTO + select OBJTOOL if HAVE_JUMP_LABEL_HACK help This option enables a transparent branch optimization that makes certain almost-always-true or almost-always-false branch @@ -1031,6 +1032,9 @@ config ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT config HAVE_OBJTOOL bool +config HAVE_JUMP_LABEL_HACK + bool + config HAVE_STACK_VALIDATION bool help diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 43e26ee1611e..26d012f1eeb8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -212,6 +212,7 @@ config X86 select HAVE_IOREMAP_PROT select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64 select HAVE_IRQ_TIME_ACCOUNTING + select HAVE_JUMP_LABEL_HACK if HAVE_OBJTOOL select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZ4 diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 3ce0e67c579c..071572e23d3a 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -20,7 +20,7 @@ _ASM_PTR "%c0 + %c1 - .\n\t" \ ".popsection \n\t" -#ifdef CONFIG_OBJTOOL +#ifdef CONFIG_HAVE_JUMP_LABEL_HACK static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { @@ -34,7 +34,7 @@ l_yes: return true; } -#else /* !CONFIG_OBJTOOL */ +#else /* !CONFIG_HAVE_JUMP_LABEL_HACK */ static __always_inline bool arch_static_branch(struct static_key * const key, const bool branch) { @@ -48,7 +48,7 @@ l_yes: return true; } -#endif /* CONFIG_OBJTOOL */ +#endif /* CONFIG_HAVE_JUMP_LABEL_HACK */ static __always_inline bool arch_static_branch_jump(struct static_key * const key, const bool branch) { diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 3f20d565733c..f1d2c2e4f15b 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -227,6 +227,7 @@ ifdef CONFIG_OBJTOOL objtool := $(objtree)/tools/objtool/objtool objtool_args = \ + $(if $(CONFIG_HAVE_JUMP_LABEL_HACK), --hacks=jump_label) \ $(if $(CONFIG_X86_KERNEL_IBT), --lto --ibt) \ $(if $(CONFIG_FTRACE_MCOUNT_USE_OBJTOOL), --mcount) \ $(if $(CONFIG_UNWINDER_ORC), --orc) \ diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 33f14fe1ddde..fa1f16840e76 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -117,6 +117,10 @@ objtool_link() # Don't perform vmlinux validation unless explicitly requested, # but run objtool on vmlinux.o now that we have an object file. 
+ if is_enabled CONFIG_HAVE_JUMP_LABEL_HACK; then + objtoolopt="${objtoolopt} --hacks=jump_label" + fi + if is_enabled CONFIG_X86_KERNEL_IBT; then objtoolopt="${objtoolopt} --ibt" fi diff --git a/tools/objtool/builtin-check.c b/tools/objtool/builtin-check.c index c8c4d2bab42f..b2c626d9e2bf 100644 --- a/tools/objtool/builtin-check.c +++ b/tools/objtool/builtin-check.c @@ -31,8 +31,28 @@ static int parse_dump(const struct option *opt, const char *str, int unset) return -1; } +static int parse_hacks(const struct option *opt, const char *str, int unset) +{ + bool found = false; + + /* + * Use strstr() as a lazy method of checking for comma-separated + * options. + * + * No string provided == enable all options. + */ + + if (!str || strstr(str, "jump_label")) { + opts.hack_jump_label = true; + found = true; + } + + return found ? 0 : -1; +} + const struct option check_options[] = { OPT_GROUP("Actions:"), + OPT_CALLBACK_OPTARG('h', "hacks", NULL, NULL, "jump_label", "patch toolchain bugs/limitations", parse_hacks), OPT_BOOLEAN('i', "ibt", &opts.ibt, "validate and annotate IBT"), OPT_BOOLEAN('m', "mcount", &opts.mcount, "annotate mcount/fentry calls for ftrace"), OPT_BOOLEAN('n', "noinstr", &opts.noinstr, "validate noinstr rules"), @@ -87,14 +107,15 @@ int cmd_parse_options(int argc, const char **argv, const char * const usage[]) static bool opts_valid(void) { - if (opts.ibt || - opts.mcount || - opts.noinstr || - opts.orc || - opts.retpoline || - opts.sls || - opts.stackval || - opts.static_call || + if (opts.hack_jump_label || + opts.ibt || + opts.mcount || + opts.noinstr || + opts.orc || + opts.retpoline || + opts.sls || + opts.stackval || + opts.static_call || opts.uaccess) { if (opts.dump_orc) { fprintf(stderr, "--dump can't be combined with other options\n"); diff --git a/tools/objtool/check.c b/tools/objtool/check.c index b9ac351ea08b..d157978c58b3 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1592,7 +1592,7 @@ static int handle_jump_alt(struct objtool_file *file, return -1; } - if (special_alt->key_addend & 2) { + if (opts.hack_jump_label && special_alt->key_addend & 2) { struct reloc *reloc = insn_reloc(file, orig_insn); if (reloc) { diff --git a/tools/objtool/include/objtool/builtin.h b/tools/objtool/include/objtool/builtin.h index dc4757205b8d..c6acf05ec859 100644 --- a/tools/objtool/include/objtool/builtin.h +++ b/tools/objtool/include/objtool/builtin.h @@ -12,6 +12,7 @@ extern const struct option check_options[]; struct opts { /* actions: */ bool dump_orc; + bool hack_jump_label; bool ibt; bool mcount; bool noinstr; -- cgit v1.2.3-59-g8ed1b From 8ad7e8f696951f192c6629a0cbda9ac94c773159 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Apr 2022 14:11:25 +0200 Subject: x86/fpu/xsave: Support XSAVEC in the kernel XSAVEC is the user space counterpart of XSAVES which cannot save supervisor state. In virtualization scenarios the hypervisor does not expose XSAVES but XSAVEC to the guest, though the kernel does not make use of it. That's unfortunate because XSAVEC uses the compacted format of saving the XSTATE. This is more efficient in terms of storage space vs. XSAVE[OPT] as it does not create holes for XSTATE components which are not supported or enabled by the kernel but are available in hardware. There is room for further optimizations when XSAVEC/S and XGETBV1 are supported. In order to support XSAVEC: - Define the XSAVEC ASM macro as it's not yet supported by the required minimal toolchain. 
- Create a software defined X86_FEATURE_XCOMPACTED to select the compacted XSTATE buffer format for both XSAVEC and XSAVES. - Make XSAVEC an option in the 'XSAVE' ASM alternatives Requested-by: Andrew Cooper Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220404104820.598704095@linutronix.de --- arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/kernel/fpu/xstate.c | 58 +++++++++++++++++++++++++------------- arch/x86/kernel/fpu/xstate.h | 14 +++++---- 3 files changed, 48 insertions(+), 26 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 73e643ae94b6..ff08da857847 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -201,7 +201,7 @@ #define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ -/* FREE! ( 7*32+10) */ +#define X86_FEATURE_XCOMPACTED ( 7*32+10) /* "" Use compacted XSTATE (XSAVES or XSAVEC) */ #define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ #define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ #define X86_FEATURE_RETPOLINE_LFENCE ( 7*32+13) /* "" Use LFENCE for Spectre variant 2 */ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 39e1c8626ab9..31c12f4d0770 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -142,7 +142,8 @@ static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature) * Non-compacted format and legacy features use the cached fixed * offsets. */ - if (!cpu_feature_enabled(X86_FEATURE_XSAVES) || xfeature <= XFEATURE_SSE) + if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) || + xfeature <= XFEATURE_SSE) return xstate_offsets[xfeature]; /* @@ -369,12 +370,12 @@ static void __init setup_init_fpu_buf(void) /* * All components are now in init state. Read the state back so * that init_fpstate contains all non-zero init state. This only - * works with XSAVE, but not with XSAVEOPT and XSAVES because + * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because * those use the init optimization which skips writing data for * components in init state. * * XSAVE could be used, but that would require to reshuffle the - * data when XSAVES is available because XSAVES uses xstate + * data when XSAVEC/S is available because XSAVEC/S uses xstate * compaction. But doing so is a pointless exercise because most * components have an all zeros init state except for the legacy * ones (FP and SSE). Those can be saved with FXSAVE into the @@ -584,7 +585,8 @@ static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) */ static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) { - bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); + bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); + bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES); unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE; int i; @@ -595,7 +597,7 @@ static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) * Supervisor state components can be managed only by * XSAVES. */ - if (!compacted && xfeature_is_supervisor(i)) { + if (!xsaves && xfeature_is_supervisor(i)) { XSTATE_WARN_ON(1); return false; } @@ -612,8 +614,11 @@ static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) * the size of the *user* states. 
If we use it to size a buffer * that we use 'XSAVES' on, we could potentially overflow the * buffer because 'XSAVES' saves system states too. + * + * This also takes compaction into account. So this works for + * XSAVEC as well. */ -static unsigned int __init get_xsaves_size(void) +static unsigned int __init get_compacted_size(void) { unsigned int eax, ebx, ecx, edx; /* @@ -623,6 +628,10 @@ static unsigned int __init get_xsaves_size(void) * containing all the state components * corresponding to bits currently set in * XCR0 | IA32_XSS. + * + * When XSAVES is not available but XSAVEC is (virt), then there + * are no supervisor states, but XSAVEC still uses compacted + * format. */ cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); return ebx; @@ -632,13 +641,13 @@ static unsigned int __init get_xsaves_size(void) * Get the total size of the enabled xstates without the independent supervisor * features. */ -static unsigned int __init get_xsaves_size_no_independent(void) +static unsigned int __init get_xsave_compacted_size(void) { u64 mask = xfeatures_mask_independent(); unsigned int size; if (!mask) - return get_xsaves_size(); + return get_compacted_size(); /* Disable independent features. */ wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); @@ -647,7 +656,7 @@ static unsigned int __init get_xsaves_size_no_independent(void) * Ask the hardware what size is required of the buffer. * This is the size required for the task->fpu buffer. */ - size = get_xsaves_size(); + size = get_compacted_size(); /* Re-enable independent features so XSAVES will work on them again. */ wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask); @@ -687,20 +696,21 @@ static int __init init_xstate_size(void) { /* Recompute the context size for enabled features: */ unsigned int user_size, kernel_size, kernel_default_size; - bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); + bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); /* Uncompacted user space size */ user_size = get_xsave_size_user(); /* - * XSAVES kernel size includes supervisor states and - * uses compacted format when available. + * XSAVES kernel size includes supervisor states and uses compacted + * format. XSAVEC uses compacted format, but does not save + * supervisor states. * - * XSAVE does not support supervisor states so - * kernel and user size is identical. + * XSAVE[OPT] do not support supervisor states so kernel and user + * size is identical. 
*/ if (compacted) - kernel_size = get_xsaves_size_no_independent(); + kernel_size = get_xsave_compacted_size(); else kernel_size = user_size; @@ -813,8 +823,11 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) if (!cpu_feature_enabled(X86_FEATURE_XFD)) fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC; - fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED | - XFEATURE_MASK_SUPERVISOR_SUPPORTED; + if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) + fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; + else + fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED | + XFEATURE_MASK_SUPERVISOR_SUPPORTED; fpu_user_cfg.max_features = fpu_kernel_cfg.max_features; fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; @@ -837,6 +850,11 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) */ init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC; + /* Set up compaction feature bit */ + if (cpu_feature_enabled(X86_FEATURE_XSAVEC) || + cpu_feature_enabled(X86_FEATURE_XSAVES)) + setup_force_cpu_cap(X86_FEATURE_XCOMPACTED); + /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); @@ -873,7 +891,7 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", fpu_kernel_cfg.max_features, fpu_kernel_cfg.max_size, - boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard"); + boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard"); return; out_disable: @@ -917,7 +935,7 @@ static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr) if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) return NULL; - if (cpu_feature_enabled(X86_FEATURE_XSAVES)) { + if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) { if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr)))) return NULL; } @@ -1525,7 +1543,7 @@ static int __xstate_request_perm(u64 permitted, u64 requested, bool guest) * vendors into extending XFD for the pre AMX states, especially * AVX512. */ - bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); + bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); struct fpu *fpu = ¤t->group_leader->thread.fpu; struct fpu_state_perm *perm; unsigned int ksize, usize; diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index d22ace092ca2..5ad47031383b 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -16,7 +16,7 @@ static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask) * XRSTORS requires these bits set in xcomp_bv, or it will * trigger #GP: */ - if (cpu_feature_enabled(X86_FEATURE_XSAVES)) + if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT; } @@ -79,6 +79,7 @@ static inline u64 xfeatures_mask_independent(void) /* These macros all use (%edi)/(%rdi) as the single memory argument. */ #define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" #define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" +#define XSAVEC ".byte " REX_PREFIX "0x0f,0xc7,0x27" #define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" #define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" #define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" @@ -97,9 +98,11 @@ static inline u64 xfeatures_mask_independent(void) : "memory") /* - * If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact - * format and supervisor states in addition to modified optimization in - * XSAVEOPT. 
+ * If XSAVES is enabled, it replaces XSAVEC because it supports supervisor + * states in addition to XSAVEC. + * + * Otherwise if XSAVEC is enabled, it replaces XSAVEOPT because it supports + * compacted storage format in addition to XSAVEOPT. * * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT * supports modified optimization which is not supported by XSAVE. @@ -111,8 +114,9 @@ static inline u64 xfeatures_mask_independent(void) * address of the instruction where we might get an exception at. */ #define XSTATE_XSAVE(st, lmask, hmask, err) \ - asm volatile(ALTERNATIVE_2(XSAVE, \ + asm volatile(ALTERNATIVE_3(XSAVE, \ XSAVEOPT, X86_FEATURE_XSAVEOPT, \ + XSAVEC, X86_FEATURE_XSAVEC, \ XSAVES, X86_FEATURE_XSAVES) \ "\n" \ "xor %[err], %[err]\n" \ -- cgit v1.2.3-59-g8ed1b From 306f7cc1e9061313c46d19e9bdffc819880794c1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 5 Apr 2022 15:12:56 +0800 Subject: uapi: always define F_GETLK64/F_SETLK64/F_SETLKW64 in fcntl.h The F_GETLK64/F_SETLK64/F_SETLKW64 fcntl opcodes are only implemented for the 32-bit syscall APIs, but are also needed for compat handling on 64-bit kernels. Consolidate them in unistd.h instead of definining the internal compat definitions in compat.h, which is rather error prone (e.g. parisc gets the values wrong currently). Note that before this change they were never visible to userspace due to the fact that CONFIG_64BIT is only set for kernel builds. Signed-off-by: Christoph Hellwig Signed-off-by: Guo Ren Reviewed-by: Arnd Bergmann Tested-by: Heiko Stuebner Link: https://lore.kernel.org/r/20220405071314.3225832-3-guoren@kernel.org Signed-off-by: Palmer Dabbelt --- arch/arm64/include/asm/compat.h | 4 ---- arch/mips/include/asm/compat.h | 4 ---- arch/mips/include/uapi/asm/fcntl.h | 4 ++-- arch/powerpc/include/asm/compat.h | 4 ---- arch/s390/include/asm/compat.h | 4 ---- arch/sparc/include/asm/compat.h | 4 ---- arch/x86/include/asm/compat.h | 4 ---- include/uapi/asm-generic/fcntl.h | 4 ++-- tools/include/uapi/asm-generic/fcntl.h | 2 -- 9 files changed, 4 insertions(+), 30 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/arm64/include/asm/compat.h b/arch/arm64/include/asm/compat.h index eaa6ca062d89..276328765408 100644 --- a/arch/arm64/include/asm/compat.h +++ b/arch/arm64/include/asm/compat.h @@ -73,10 +73,6 @@ struct compat_flock { compat_pid_t l_pid; }; -#define F_GETLK64 12 /* using 'struct flock64' */ -#define F_SETLK64 13 -#define F_SETLKW64 14 - struct compat_flock64 { short l_type; short l_whence; diff --git a/arch/mips/include/asm/compat.h b/arch/mips/include/asm/compat.h index bbb3bc5a42fd..6a350c1f70d7 100644 --- a/arch/mips/include/asm/compat.h +++ b/arch/mips/include/asm/compat.h @@ -65,10 +65,6 @@ struct compat_flock { s32 pad[4]; }; -#define F_GETLK64 33 -#define F_SETLK64 34 -#define F_SETLKW64 35 - struct compat_flock64 { short l_type; short l_whence; diff --git a/arch/mips/include/uapi/asm/fcntl.h b/arch/mips/include/uapi/asm/fcntl.h index 9e44ac810db9..0369a38e3d4f 100644 --- a/arch/mips/include/uapi/asm/fcntl.h +++ b/arch/mips/include/uapi/asm/fcntl.h @@ -44,11 +44,11 @@ #define F_SETOWN 24 /* for sockets. */ #define F_GETOWN 23 /* for sockets. 
*/ -#ifndef __mips64 +#if __BITS_PER_LONG == 32 || defined(__KERNEL__) #define F_GETLK64 33 /* using 'struct flock64' */ #define F_SETLK64 34 #define F_SETLKW64 35 -#endif +#endif /* __BITS_PER_LONG == 32 || defined(__KERNEL__) */ #if _MIPS_SIM != _MIPS_SIM_ABI64 #define __ARCH_FLOCK_EXTRA_SYSID long l_sysid; diff --git a/arch/powerpc/include/asm/compat.h b/arch/powerpc/include/asm/compat.h index 7afc96fb6524..83d8f70779cb 100644 --- a/arch/powerpc/include/asm/compat.h +++ b/arch/powerpc/include/asm/compat.h @@ -52,10 +52,6 @@ struct compat_flock { compat_pid_t l_pid; }; -#define F_GETLK64 12 /* using 'struct flock64' */ -#define F_SETLK64 13 -#define F_SETLKW64 14 - struct compat_flock64 { short l_type; short l_whence; diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h index cdc7ae72529d..0f14b3188b1b 100644 --- a/arch/s390/include/asm/compat.h +++ b/arch/s390/include/asm/compat.h @@ -110,10 +110,6 @@ struct compat_flock { compat_pid_t l_pid; }; -#define F_GETLK64 12 -#define F_SETLK64 13 -#define F_SETLKW64 14 - struct compat_flock64 { short l_type; short l_whence; diff --git a/arch/sparc/include/asm/compat.h b/arch/sparc/include/asm/compat.h index bd949fcf9d63..108078751bb5 100644 --- a/arch/sparc/include/asm/compat.h +++ b/arch/sparc/include/asm/compat.h @@ -84,10 +84,6 @@ struct compat_flock { short __unused; }; -#define F_GETLK64 12 -#define F_SETLK64 13 -#define F_SETLKW64 14 - struct compat_flock64 { short l_type; short l_whence; diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 7516e4199b3c..8d19a212f4f2 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -58,10 +58,6 @@ struct compat_flock { compat_pid_t l_pid; }; -#define F_GETLK64 12 /* using 'struct flock64' */ -#define F_SETLK64 13 -#define F_SETLKW64 14 - /* * IA32 uses 4 byte alignment for 64 bit quantities, * so we need to pack this structure. diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h index 77aa9f2ff98d..f13d37b60775 100644 --- a/include/uapi/asm-generic/fcntl.h +++ b/include/uapi/asm-generic/fcntl.h @@ -116,13 +116,13 @@ #define F_GETSIG 11 /* for sockets. */ #endif -#ifndef CONFIG_64BIT +#if __BITS_PER_LONG == 32 || defined(__KERNEL__) #ifndef F_GETLK64 #define F_GETLK64 12 /* using 'struct flock64' */ #define F_SETLK64 13 #define F_SETLKW64 14 #endif -#endif +#endif /* __BITS_PER_LONG == 32 || defined(__KERNEL__) */ #ifndef F_SETOWN_EX #define F_SETOWN_EX 15 diff --git a/tools/include/uapi/asm-generic/fcntl.h b/tools/include/uapi/asm-generic/fcntl.h index 99bc9b15ce2b..0197042b7dfb 100644 --- a/tools/include/uapi/asm-generic/fcntl.h +++ b/tools/include/uapi/asm-generic/fcntl.h @@ -115,13 +115,11 @@ #define F_GETSIG 11 /* for sockets. */ #endif -#ifndef CONFIG_64BIT #ifndef F_GETLK64 #define F_GETLK64 12 /* using 'struct flock64' */ #define F_SETLK64 13 #define F_SETLKW64 14 #endif -#endif #ifndef F_SETOWN_EX #define F_SETOWN_EX 15 -- cgit v1.2.3-59-g8ed1b From 3ce0f2373f7073f04715b1ffd7b1812d3183f79a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 5 Apr 2022 15:12:57 +0800 Subject: compat: consolidate the compat_flock{,64} definition Provide a single common definition for the compat_flock and compat_flock64 structures using the same tricks as for the native variants. Another extra define is added for the packing required on x86. 
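For illustration only (not part of the patch), this is roughly what the shared definitions expand to for two sample configurations; the *_expanded type names are invented for this example:

	/* sparc: __ARCH_COMPAT_FLOCK_PAD is "short __unused;" */
	struct compat_flock_sparc_expanded {
		short		l_type;
		short		l_whence;
		compat_off_t	l_start;
		compat_off_t	l_len;
		compat_pid_t	l_pid;
		short		__unused;
	};

	/* x86: __ARCH_NEED_COMPAT_FLOCK64_PACKED adds __attribute__((packed))
	 * to keep the ia32 4-byte alignment of the 64-bit members */
	struct compat_flock64_x86_expanded {
		short		l_type;
		short		l_whence;
		compat_loff_t	l_start;
		compat_loff_t	l_len;
		compat_pid_t	l_pid;
	} __attribute__((packed));
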
Signed-off-by: Christoph Hellwig Signed-off-by: Guo Ren Reviewed-by: Arnd Bergmann Tested-by: Heiko Stuebner Acked-by: Helge Deller # parisc Link: https://lore.kernel.org/r/20220405071314.3225832-4-guoren@kernel.org Signed-off-by: Palmer Dabbelt --- arch/arm64/include/asm/compat.h | 16 ---------------- arch/mips/include/asm/compat.h | 19 ++----------------- arch/parisc/include/asm/compat.h | 16 ---------------- arch/powerpc/include/asm/compat.h | 16 ---------------- arch/s390/include/asm/compat.h | 16 ---------------- arch/sparc/include/asm/compat.h | 18 +----------------- arch/x86/include/asm/compat.h | 20 +++----------------- include/linux/compat.h | 31 +++++++++++++++++++++++++++++++ 8 files changed, 37 insertions(+), 115 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/arm64/include/asm/compat.h b/arch/arm64/include/asm/compat.h index 276328765408..e0faec1984a1 100644 --- a/arch/arm64/include/asm/compat.h +++ b/arch/arm64/include/asm/compat.h @@ -65,22 +65,6 @@ struct compat_stat { compat_ulong_t __unused4[2]; }; -struct compat_flock { - short l_type; - short l_whence; - compat_off_t l_start; - compat_off_t l_len; - compat_pid_t l_pid; -}; - -struct compat_flock64 { - short l_type; - short l_whence; - compat_loff_t l_start; - compat_loff_t l_len; - compat_pid_t l_pid; -}; - struct compat_statfs { int f_type; int f_bsize; diff --git a/arch/mips/include/asm/compat.h b/arch/mips/include/asm/compat.h index 6a350c1f70d7..6d6e5a451f4d 100644 --- a/arch/mips/include/asm/compat.h +++ b/arch/mips/include/asm/compat.h @@ -55,23 +55,8 @@ struct compat_stat { s32 st_pad4[14]; }; -struct compat_flock { - short l_type; - short l_whence; - compat_off_t l_start; - compat_off_t l_len; - s32 l_sysid; - compat_pid_t l_pid; - s32 pad[4]; -}; - -struct compat_flock64 { - short l_type; - short l_whence; - compat_loff_t l_start; - compat_loff_t l_len; - compat_pid_t l_pid; -}; +#define __ARCH_COMPAT_FLOCK_EXTRA_SYSID s32 l_sysid; +#define __ARCH_COMPAT_FLOCK_PAD s32 pad[4]; struct compat_statfs { int f_type; diff --git a/arch/parisc/include/asm/compat.h b/arch/parisc/include/asm/compat.h index c04f5a637c39..a1e4534d8050 100644 --- a/arch/parisc/include/asm/compat.h +++ b/arch/parisc/include/asm/compat.h @@ -53,22 +53,6 @@ struct compat_stat { u32 st_spare4[3]; }; -struct compat_flock { - short l_type; - short l_whence; - compat_off_t l_start; - compat_off_t l_len; - compat_pid_t l_pid; -}; - -struct compat_flock64 { - short l_type; - short l_whence; - compat_loff_t l_start; - compat_loff_t l_len; - compat_pid_t l_pid; -}; - struct compat_statfs { s32 f_type; s32 f_bsize; diff --git a/arch/powerpc/include/asm/compat.h b/arch/powerpc/include/asm/compat.h index 83d8f70779cb..5ef3c7c83c34 100644 --- a/arch/powerpc/include/asm/compat.h +++ b/arch/powerpc/include/asm/compat.h @@ -44,22 +44,6 @@ struct compat_stat { u32 __unused4[2]; }; -struct compat_flock { - short l_type; - short l_whence; - compat_off_t l_start; - compat_off_t l_len; - compat_pid_t l_pid; -}; - -struct compat_flock64 { - short l_type; - short l_whence; - compat_loff_t l_start; - compat_loff_t l_len; - compat_pid_t l_pid; -}; - struct compat_statfs { int f_type; int f_bsize; diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h index 0f14b3188b1b..07f04d37068b 100644 --- a/arch/s390/include/asm/compat.h +++ b/arch/s390/include/asm/compat.h @@ -102,22 +102,6 @@ struct compat_stat { u32 __unused5; }; -struct compat_flock { - short l_type; - short l_whence; - compat_off_t l_start; - compat_off_t l_len; - 
compat_pid_t l_pid; -}; - -struct compat_flock64 { - short l_type; - short l_whence; - compat_loff_t l_start; - compat_loff_t l_len; - compat_pid_t l_pid; -}; - struct compat_statfs { u32 f_type; u32 f_bsize; diff --git a/arch/sparc/include/asm/compat.h b/arch/sparc/include/asm/compat.h index 108078751bb5..d78fb44942e0 100644 --- a/arch/sparc/include/asm/compat.h +++ b/arch/sparc/include/asm/compat.h @@ -75,23 +75,7 @@ struct compat_stat64 { unsigned int __unused5; }; -struct compat_flock { - short l_type; - short l_whence; - compat_off_t l_start; - compat_off_t l_len; - compat_pid_t l_pid; - short __unused; -}; - -struct compat_flock64 { - short l_type; - short l_whence; - compat_loff_t l_start; - compat_loff_t l_len; - compat_pid_t l_pid; - short __unused; -}; +#define __ARCH_COMPAT_FLOCK_PAD short __unused; struct compat_statfs { int f_type; diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 8d19a212f4f2..de794d895866 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -50,25 +50,11 @@ struct compat_stat { u32 __unused5; }; -struct compat_flock { - short l_type; - short l_whence; - compat_off_t l_start; - compat_off_t l_len; - compat_pid_t l_pid; -}; - /* - * IA32 uses 4 byte alignment for 64 bit quantities, - * so we need to pack this structure. + * IA32 uses 4 byte alignment for 64 bit quantities, so we need to pack the + * compat flock64 structure. */ -struct compat_flock64 { - short l_type; - short l_whence; - compat_loff_t l_start; - compat_loff_t l_len; - compat_pid_t l_pid; -} __attribute__((packed)); +#define __ARCH_NEED_COMPAT_FLOCK64_PACKED struct compat_statfs { int f_type; diff --git a/include/linux/compat.h b/include/linux/compat.h index 1c758b0e0359..a0481fe6c5d5 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -258,6 +258,37 @@ struct compat_rlimit { compat_ulong_t rlim_max; }; +#ifdef __ARCH_NEED_COMPAT_FLOCK64_PACKED +#define __ARCH_COMPAT_FLOCK64_PACK __attribute__((packed)) +#else +#define __ARCH_COMPAT_FLOCK64_PACK +#endif + +struct compat_flock { + short l_type; + short l_whence; + compat_off_t l_start; + compat_off_t l_len; +#ifdef __ARCH_COMPAT_FLOCK_EXTRA_SYSID + __ARCH_COMPAT_FLOCK_EXTRA_SYSID +#endif + compat_pid_t l_pid; +#ifdef __ARCH_COMPAT_FLOCK_PAD + __ARCH_COMPAT_FLOCK_PAD +#endif +}; + +struct compat_flock64 { + short l_type; + short l_whence; + compat_loff_t l_start; + compat_loff_t l_len; + compat_pid_t l_pid; +#ifdef __ARCH_COMPAT_FLOCK64_PAD + __ARCH_COMPAT_FLOCK64_PAD +#endif +} __ARCH_COMPAT_FLOCK64_PACK; + struct compat_rusage { struct old_timeval32 ru_utime; struct old_timeval32 ru_stime; -- cgit v1.2.3-59-g8ed1b From f18ed30db299458f809aec55bf1800dbeebeb953 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 5 Apr 2022 15:12:59 +0800 Subject: fs: stat: compat: Add __ARCH_WANT_COMPAT_STAT RISC-V doesn't neeed compat_stat, so using __ARCH_WANT_COMPAT_STAT to exclude unnecessary SYSCALL functions. 
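A minimal sketch (illustration only) of the opt-in pattern: the define and the preprocessor guard match the hunks below, while the function body is elided:

	/* arch side: opt in from asm/unistd.h (done below for arm64, 64-bit
	 * mips, parisc, powerpc, s390, sparc and x86, but not for RISC-V) */
	#define __ARCH_WANT_COMPAT_STAT

	/* fs/stat.c side: compat stat handlers are built only when both
	 * compat support and the arch opt-in are present */
	#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_STAT)
	static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
	{
		/* ... fill a struct compat_stat and copy it to ubuf ... */
		return 0;
	}
	#endif
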
Signed-off-by: Guo Ren Signed-off-by: Guo Ren Reviewed-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Tested-by: Heiko Stuebner Acked-by: Helge Deller # parisc Link: https://lore.kernel.org/r/20220405071314.3225832-6-guoren@kernel.org Signed-off-by: Palmer Dabbelt --- arch/arm64/include/asm/unistd.h | 1 + arch/mips/include/asm/unistd.h | 2 ++ arch/parisc/include/asm/unistd.h | 1 + arch/powerpc/include/asm/unistd.h | 1 + arch/s390/include/asm/unistd.h | 1 + arch/sparc/include/asm/unistd.h | 1 + arch/x86/include/asm/unistd.h | 1 + fs/stat.c | 2 +- 8 files changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 4e65da3445c7..037feba03a51 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -3,6 +3,7 @@ * Copyright (C) 2012 ARM Ltd. */ #ifdef CONFIG_COMPAT +#define __ARCH_WANT_COMPAT_STAT #define __ARCH_WANT_COMPAT_STAT64 #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_PAUSE diff --git a/arch/mips/include/asm/unistd.h b/arch/mips/include/asm/unistd.h index c2196b1b6604..25a5253db7f4 100644 --- a/arch/mips/include/asm/unistd.h +++ b/arch/mips/include/asm/unistd.h @@ -50,6 +50,8 @@ # ifdef CONFIG_32BIT # define __ARCH_WANT_STAT64 # define __ARCH_WANT_SYS_TIME32 +# else +# define __ARCH_WANT_COMPAT_STAT # endif # ifdef CONFIG_MIPS32_O32 # define __ARCH_WANT_SYS_TIME32 diff --git a/arch/parisc/include/asm/unistd.h b/arch/parisc/include/asm/unistd.h index 7708a5806f09..81cbad73b517 100644 --- a/arch/parisc/include/asm/unistd.h +++ b/arch/parisc/include/asm/unistd.h @@ -164,6 +164,7 @@ type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5) \ #define __ARCH_WANT_SYS_CLONE #define __ARCH_WANT_SYS_CLONE3 #define __ARCH_WANT_COMPAT_SYS_SENDFILE +#define __ARCH_WANT_COMPAT_STAT #ifdef CONFIG_64BIT #define __ARCH_WANT_SYS_TIME diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index 5eb462af6766..b1129b4ef57d 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -44,6 +44,7 @@ #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_UTIME #define __ARCH_WANT_SYS_NEWFSTATAT +#define __ARCH_WANT_COMPAT_STAT #define __ARCH_WANT_COMPAT_SYS_SENDFILE #endif #define __ARCH_WANT_SYS_FORK diff --git a/arch/s390/include/asm/unistd.h b/arch/s390/include/asm/unistd.h index 9e9f75ef046a..4260bc5ce7f8 100644 --- a/arch/s390/include/asm/unistd.h +++ b/arch/s390/include/asm/unistd.h @@ -28,6 +28,7 @@ #define __ARCH_WANT_SYS_SIGPENDING #define __ARCH_WANT_SYS_SIGPROCMASK # ifdef CONFIG_COMPAT +# define __ARCH_WANT_COMPAT_STAT # define __ARCH_WANT_SYS_TIME32 # define __ARCH_WANT_SYS_UTIME32 # endif diff --git a/arch/sparc/include/asm/unistd.h b/arch/sparc/include/asm/unistd.h index 1e66278ba4a5..d6bc76706a7a 100644 --- a/arch/sparc/include/asm/unistd.h +++ b/arch/sparc/include/asm/unistd.h @@ -46,6 +46,7 @@ #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_UTIME #define __ARCH_WANT_COMPAT_SYS_SENDFILE +#define __ARCH_WANT_COMPAT_STAT #endif #ifdef __32bit_syscall_numbers__ diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index 80e9d5206a71..761173ccc33c 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -22,6 +22,7 @@ # include # define __ARCH_WANT_SYS_TIME # define __ARCH_WANT_SYS_UTIME +# define __ARCH_WANT_COMPAT_STAT # define __ARCH_WANT_COMPAT_SYS_PREADV64 # define __ARCH_WANT_COMPAT_SYS_PWRITEV64 # define 
__ARCH_WANT_COMPAT_SYS_PREADV64V2 diff --git a/fs/stat.c b/fs/stat.c index 7f734be0e57e..5d8f723c1e0b 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -660,7 +660,7 @@ SYSCALL_DEFINE5(statx, return ret; } -#ifdef CONFIG_COMPAT +#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_STAT) static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) { struct compat_stat tmp; -- cgit v1.2.3-59-g8ed1b From 84a0c977ab9821a4b57f69d9a6108f64188d1e71 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 5 Apr 2022 15:13:00 +0800 Subject: asm-generic: compat: Cleanup duplicate definitions There are 7 64bit architectures that support Linux COMPAT mode to run 32bit applications. A lot of definitions are duplicate: - COMPAT_USER_HZ - COMPAT_RLIM_INFINITY - COMPAT_OFF_T_MAX - __compat_uid_t, __compat_uid_t - compat_dev_t - compat_ipc_pid_t - struct compat_flock - struct compat_flock64 - struct compat_statfs - struct compat_ipc64_perm, compat_semid64_ds, compat_msqid64_ds, compat_shmid64_ds Cleanup duplicate definitions and merge them into asm-generic. Signed-off-by: Guo Ren Signed-off-by: Guo Ren Reviewed-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Tested-by: Heiko Stuebner Acked-by: Helge Deller # parisc Link: https://lore.kernel.org/r/20220405071314.3225832-7-guoren@kernel.org Signed-off-by: Palmer Dabbelt --- arch/arm64/include/asm/compat.h | 73 ++++---------------------- arch/mips/include/asm/compat.h | 18 +++---- arch/parisc/include/asm/compat.h | 29 ++--------- arch/powerpc/include/asm/compat.h | 30 ++--------- arch/s390/include/asm/compat.h | 79 +++++----------------------- arch/sparc/include/asm/compat.h | 39 +++++--------- arch/x86/include/asm/compat.h | 80 +++++----------------------- include/asm-generic/compat.h | 106 ++++++++++++++++++++++++++++++++++++++ 8 files changed, 170 insertions(+), 284 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/arm64/include/asm/compat.h b/arch/arm64/include/asm/compat.h index e0faec1984a1..9f362274a4f7 100644 --- a/arch/arm64/include/asm/compat.h +++ b/arch/arm64/include/asm/compat.h @@ -8,6 +8,15 @@ #define compat_mode_t compat_mode_t typedef u16 compat_mode_t; +#define __compat_uid_t __compat_uid_t +typedef u16 __compat_uid_t; +typedef u16 __compat_gid_t; + +#define compat_ipc_pid_t compat_ipc_pid_t +typedef u16 compat_ipc_pid_t; + +#define compat_statfs compat_statfs + #include #ifdef CONFIG_COMPAT @@ -19,21 +28,15 @@ typedef u16 compat_mode_t; #include #include -#define COMPAT_USER_HZ 100 #ifdef __AARCH64EB__ #define COMPAT_UTS_MACHINE "armv8b\0\0" #else #define COMPAT_UTS_MACHINE "armv8l\0\0" #endif -typedef u16 __compat_uid_t; -typedef u16 __compat_gid_t; typedef u16 __compat_uid16_t; typedef u16 __compat_gid16_t; -typedef u32 compat_dev_t; typedef s32 compat_nlink_t; -typedef u16 compat_ipc_pid_t; -typedef __kernel_fsid_t compat_fsid_t; struct compat_stat { #ifdef __AARCH64EB__ @@ -87,64 +90,6 @@ struct compat_statfs { #define compat_user_stack_pointer() (user_stack_pointer(task_pt_regs(current))) #define COMPAT_MINSIGSTKSZ 2048 -struct compat_ipc64_perm { - compat_key_t key; - __compat_uid32_t uid; - __compat_gid32_t gid; - __compat_uid32_t cuid; - __compat_gid32_t cgid; - unsigned short mode; - unsigned short __pad1; - unsigned short seq; - unsigned short __pad2; - compat_ulong_t unused1; - compat_ulong_t unused2; -}; - -struct compat_semid64_ds { - struct compat_ipc64_perm sem_perm; - compat_ulong_t sem_otime; - compat_ulong_t sem_otime_high; - compat_ulong_t sem_ctime; - compat_ulong_t sem_ctime_high; - compat_ulong_t 
sem_nsems; - compat_ulong_t __unused3; - compat_ulong_t __unused4; -}; - -struct compat_msqid64_ds { - struct compat_ipc64_perm msg_perm; - compat_ulong_t msg_stime; - compat_ulong_t msg_stime_high; - compat_ulong_t msg_rtime; - compat_ulong_t msg_rtime_high; - compat_ulong_t msg_ctime; - compat_ulong_t msg_ctime_high; - compat_ulong_t msg_cbytes; - compat_ulong_t msg_qnum; - compat_ulong_t msg_qbytes; - compat_pid_t msg_lspid; - compat_pid_t msg_lrpid; - compat_ulong_t __unused4; - compat_ulong_t __unused5; -}; - -struct compat_shmid64_ds { - struct compat_ipc64_perm shm_perm; - compat_size_t shm_segsz; - compat_ulong_t shm_atime; - compat_ulong_t shm_atime_high; - compat_ulong_t shm_dtime; - compat_ulong_t shm_dtime_high; - compat_ulong_t shm_ctime; - compat_ulong_t shm_ctime_high; - compat_pid_t shm_cpid; - compat_pid_t shm_lpid; - compat_ulong_t shm_nattch; - compat_ulong_t __unused4; - compat_ulong_t __unused5; -}; - static inline int is_compat_task(void) { return test_thread_flag(TIF_32BIT); diff --git a/arch/mips/include/asm/compat.h b/arch/mips/include/asm/compat.h index 6d6e5a451f4d..ec01dc000a41 100644 --- a/arch/mips/include/asm/compat.h +++ b/arch/mips/include/asm/compat.h @@ -9,28 +9,28 @@ #include #include +#define __compat_uid_t __compat_uid_t typedef s32 __compat_uid_t; typedef s32 __compat_gid_t; + typedef __compat_uid_t __compat_uid32_t; typedef __compat_gid_t __compat_gid32_t; #define __compat_uid32_t __compat_uid32_t -#define __compat_gid32_t __compat_gid32_t + +#define compat_statfs compat_statfs +#define compat_ipc64_perm compat_ipc64_perm #define _COMPAT_NSIG 128 /* Don't ask !$@#% ... */ #define _COMPAT_NSIG_BPW 32 typedef u32 compat_sigset_word; +#define COMPAT_RLIM_INFINITY 0x7fffffffUL + #include -#define COMPAT_USER_HZ 100 #define COMPAT_UTS_MACHINE "mips\0\0\0" -typedef u32 compat_dev_t; typedef u32 compat_nlink_t; -typedef s32 compat_ipc_pid_t; -typedef struct { - s32 val[2]; -} compat_fsid_t; struct compat_stat { compat_dev_t st_dev; @@ -73,10 +73,6 @@ struct compat_statfs { int f_spare[5]; }; -#define COMPAT_RLIM_INFINITY 0x7fffffffUL - -#define COMPAT_OFF_T_MAX 0x7fffffff - struct compat_ipc64_perm { compat_key_t key; __compat_uid32_t uid; diff --git a/arch/parisc/include/asm/compat.h b/arch/parisc/include/asm/compat.h index a1e4534d8050..339d1b833fa7 100644 --- a/arch/parisc/include/asm/compat.h +++ b/arch/parisc/include/asm/compat.h @@ -11,16 +11,16 @@ #define compat_mode_t compat_mode_t typedef u16 compat_mode_t; +#define compat_ipc_pid_t compat_ipc_pid_t +typedef u16 compat_ipc_pid_t; + +#define compat_ipc64_perm compat_ipc64_perm + #include -#define COMPAT_USER_HZ 100 #define COMPAT_UTS_MACHINE "parisc\0\0" -typedef u32 __compat_uid_t; -typedef u32 __compat_gid_t; -typedef u32 compat_dev_t; typedef u16 compat_nlink_t; -typedef u16 compat_ipc_pid_t; struct compat_stat { compat_dev_t st_dev; /* dev_t is 32 bits on parisc */ @@ -53,21 +53,6 @@ struct compat_stat { u32 st_spare4[3]; }; -struct compat_statfs { - s32 f_type; - s32 f_bsize; - s32 f_blocks; - s32 f_bfree; - s32 f_bavail; - s32 f_files; - s32 f_ffree; - __kernel_fsid_t f_fsid; - s32 f_namelen; - s32 f_frsize; - s32 f_flags; - s32 f_spare[4]; -}; - struct compat_sigcontext { compat_int_t sc_flags; compat_int_t sc_gr[32]; /* PSW in sc_gr[0] */ @@ -77,10 +62,6 @@ struct compat_sigcontext { compat_int_t sc_sar; /* cr11 */ }; -#define COMPAT_RLIM_INFINITY 0xffffffff - -#define COMPAT_OFF_T_MAX 0x7fffffff - struct compat_ipc64_perm { compat_key_t key; __compat_uid_t uid; diff --git 
a/arch/powerpc/include/asm/compat.h b/arch/powerpc/include/asm/compat.h index 5ef3c7c83c34..dda4091fd012 100644 --- a/arch/powerpc/include/asm/compat.h +++ b/arch/powerpc/include/asm/compat.h @@ -8,21 +8,20 @@ #include #include +#define compat_ipc_pid_t compat_ipc_pid_t +typedef u16 compat_ipc_pid_t; + +#define compat_ipc64_perm compat_ipc64_perm + #include -#define COMPAT_USER_HZ 100 #ifdef __BIG_ENDIAN__ #define COMPAT_UTS_MACHINE "ppc\0\0" #else #define COMPAT_UTS_MACHINE "ppcle\0\0" #endif -typedef u32 __compat_uid_t; -typedef u32 __compat_gid_t; -typedef u32 compat_dev_t; typedef s16 compat_nlink_t; -typedef u16 compat_ipc_pid_t; -typedef __kernel_fsid_t compat_fsid_t; struct compat_stat { compat_dev_t st_dev; @@ -44,25 +43,6 @@ struct compat_stat { u32 __unused4[2]; }; -struct compat_statfs { - int f_type; - int f_bsize; - int f_blocks; - int f_bfree; - int f_bavail; - int f_files; - int f_ffree; - compat_fsid_t f_fsid; - int f_namelen; /* SunOS ignores this field. */ - int f_frsize; - int f_flags; - int f_spare[4]; -}; - -#define COMPAT_RLIM_INFINITY 0xffffffff - -#define COMPAT_OFF_T_MAX 0x7fffffff - /* * ipc64_perm is actually 32/64bit clean but since the compat layer refers to * it we may as well define it. diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h index 07f04d37068b..ad809cf3dd97 100644 --- a/arch/s390/include/asm/compat.h +++ b/arch/s390/include/asm/compat.h @@ -12,6 +12,18 @@ #define compat_mode_t compat_mode_t typedef u16 compat_mode_t; +#define __compat_uid_t __compat_uid_t +typedef u16 __compat_uid_t; +typedef u16 __compat_gid_t; + +#define compat_dev_t compat_dev_t +typedef u16 compat_dev_t; + +#define compat_ipc_pid_t compat_ipc_pid_t +typedef u16 compat_ipc_pid_t; + +#define compat_statfs compat_statfs + #include #define __TYPE_IS_PTR(t) (!__builtin_types_compatible_p( \ @@ -53,15 +65,9 @@ typedef u16 compat_mode_t; PSW32_MASK_MCHECK | PSW32_MASK_PSTATE | \ PSW32_ASC_PRIMARY) -#define COMPAT_USER_HZ 100 #define COMPAT_UTS_MACHINE "s390\0\0\0\0" -typedef u16 __compat_uid_t; -typedef u16 __compat_gid_t; -typedef u16 compat_dev_t; typedef u16 compat_nlink_t; -typedef u16 compat_ipc_pid_t; -typedef __kernel_fsid_t compat_fsid_t; typedef struct { u32 mask; @@ -132,10 +138,6 @@ struct compat_statfs64 { u32 f_spare[4]; }; -#define COMPAT_RLIM_INFINITY 0xffffffff - -#define COMPAT_OFF_T_MAX 0x7fffffff - /* * A pointer passed in from user mode. 
This should not * be used for syscall parameters, just declare them @@ -158,61 +160,4 @@ static inline int is_compat_task(void) #endif -struct compat_ipc64_perm { - compat_key_t key; - __compat_uid32_t uid; - __compat_gid32_t gid; - __compat_uid32_t cuid; - __compat_gid32_t cgid; - compat_mode_t mode; - unsigned short __pad1; - unsigned short seq; - unsigned short __pad2; - unsigned int __unused1; - unsigned int __unused2; -}; - -struct compat_semid64_ds { - struct compat_ipc64_perm sem_perm; - compat_ulong_t sem_otime; - compat_ulong_t sem_otime_high; - compat_ulong_t sem_ctime; - compat_ulong_t sem_ctime_high; - compat_ulong_t sem_nsems; - compat_ulong_t __unused1; - compat_ulong_t __unused2; -}; - -struct compat_msqid64_ds { - struct compat_ipc64_perm msg_perm; - compat_ulong_t msg_stime; - compat_ulong_t msg_stime_high; - compat_ulong_t msg_rtime; - compat_ulong_t msg_rtime_high; - compat_ulong_t msg_ctime; - compat_ulong_t msg_ctime_high; - compat_ulong_t msg_cbytes; - compat_ulong_t msg_qnum; - compat_ulong_t msg_qbytes; - compat_pid_t msg_lspid; - compat_pid_t msg_lrpid; - compat_ulong_t __unused1; - compat_ulong_t __unused2; -}; - -struct compat_shmid64_ds { - struct compat_ipc64_perm shm_perm; - compat_size_t shm_segsz; - compat_ulong_t shm_atime; - compat_ulong_t shm_atime_high; - compat_ulong_t shm_dtime; - compat_ulong_t shm_dtime_high; - compat_ulong_t shm_ctime; - compat_ulong_t shm_ctime_high; - compat_pid_t shm_cpid; - compat_pid_t shm_lpid; - compat_ulong_t shm_nattch; - compat_ulong_t __unused1; - compat_ulong_t __unused2; -}; #endif /* _ASM_S390X_COMPAT_H */ diff --git a/arch/sparc/include/asm/compat.h b/arch/sparc/include/asm/compat.h index d78fb44942e0..e4382d2efa56 100644 --- a/arch/sparc/include/asm/compat.h +++ b/arch/sparc/include/asm/compat.h @@ -9,17 +9,25 @@ #define compat_mode_t compat_mode_t typedef u16 compat_mode_t; +#define __compat_uid_t __compat_uid_t +typedef u16 __compat_uid_t; +typedef u16 __compat_gid_t; + +#define compat_dev_t compat_dev_t +typedef u16 compat_dev_t; + +#define compat_ipc_pid_t compat_ipc_pid_t +typedef u16 compat_ipc_pid_t; + +#define compat_ipc64_perm compat_ipc64_perm + +#define COMPAT_RLIM_INFINITY 0x7fffffff + #include -#define COMPAT_USER_HZ 100 #define COMPAT_UTS_MACHINE "sparc\0\0" -typedef u16 __compat_uid_t; -typedef u16 __compat_gid_t; -typedef u16 compat_dev_t; typedef s16 compat_nlink_t; -typedef u16 compat_ipc_pid_t; -typedef __kernel_fsid_t compat_fsid_t; struct compat_stat { compat_dev_t st_dev; @@ -77,25 +85,6 @@ struct compat_stat64 { #define __ARCH_COMPAT_FLOCK_PAD short __unused; -struct compat_statfs { - int f_type; - int f_bsize; - int f_blocks; - int f_bfree; - int f_bavail; - int f_files; - int f_ffree; - compat_fsid_t f_fsid; - int f_namelen; /* SunOS ignores this field. 
*/ - int f_frsize; - int f_flags; - int f_spare[4]; -}; - -#define COMPAT_RLIM_INFINITY 0x7fffffff - -#define COMPAT_OFF_T_MAX 0x7fffffff - struct compat_ipc64_perm { compat_key_t key; __compat_uid32_t uid; diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index de794d895866..e74a107de0d0 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -15,17 +15,23 @@ #define compat_mode_t compat_mode_t typedef u16 compat_mode_t; +#define __compat_uid_t __compat_uid_t +typedef u16 __compat_uid_t; +typedef u16 __compat_gid_t; + +#define compat_dev_t compat_dev_t +typedef u16 compat_dev_t; + +#define compat_ipc_pid_t compat_ipc_pid_t +typedef u16 compat_ipc_pid_t; + +#define compat_statfs compat_statfs + #include -#define COMPAT_USER_HZ 100 #define COMPAT_UTS_MACHINE "i686\0\0" -typedef u16 __compat_uid_t; -typedef u16 __compat_gid_t; -typedef u16 compat_dev_t; typedef u16 compat_nlink_t; -typedef u16 compat_ipc_pid_t; -typedef __kernel_fsid_t compat_fsid_t; struct compat_stat { compat_dev_t st_dev; @@ -71,68 +77,6 @@ struct compat_statfs { int f_spare[4]; }; -#define COMPAT_RLIM_INFINITY 0xffffffff - -#define COMPAT_OFF_T_MAX 0x7fffffff - -struct compat_ipc64_perm { - compat_key_t key; - __compat_uid32_t uid; - __compat_gid32_t gid; - __compat_uid32_t cuid; - __compat_gid32_t cgid; - unsigned short mode; - unsigned short __pad1; - unsigned short seq; - unsigned short __pad2; - compat_ulong_t unused1; - compat_ulong_t unused2; -}; - -struct compat_semid64_ds { - struct compat_ipc64_perm sem_perm; - compat_ulong_t sem_otime; - compat_ulong_t sem_otime_high; - compat_ulong_t sem_ctime; - compat_ulong_t sem_ctime_high; - compat_ulong_t sem_nsems; - compat_ulong_t __unused3; - compat_ulong_t __unused4; -}; - -struct compat_msqid64_ds { - struct compat_ipc64_perm msg_perm; - compat_ulong_t msg_stime; - compat_ulong_t msg_stime_high; - compat_ulong_t msg_rtime; - compat_ulong_t msg_rtime_high; - compat_ulong_t msg_ctime; - compat_ulong_t msg_ctime_high; - compat_ulong_t msg_cbytes; - compat_ulong_t msg_qnum; - compat_ulong_t msg_qbytes; - compat_pid_t msg_lspid; - compat_pid_t msg_lrpid; - compat_ulong_t __unused4; - compat_ulong_t __unused5; -}; - -struct compat_shmid64_ds { - struct compat_ipc64_perm shm_perm; - compat_size_t shm_segsz; - compat_ulong_t shm_atime; - compat_ulong_t shm_atime_high; - compat_ulong_t shm_dtime; - compat_ulong_t shm_dtime_high; - compat_ulong_t shm_ctime; - compat_ulong_t shm_ctime_high; - compat_pid_t shm_cpid; - compat_pid_t shm_lpid; - compat_ulong_t shm_nattch; - compat_ulong_t __unused4; - compat_ulong_t __unused5; -}; - #ifdef CONFIG_X86_X32_ABI #define COMPAT_USE_64BIT_TIME \ (!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT)) diff --git a/include/asm-generic/compat.h b/include/asm-generic/compat.h index d46c0201cc34..11653d6846cc 100644 --- a/include/asm-generic/compat.h +++ b/include/asm-generic/compat.h @@ -2,6 +2,18 @@ #ifndef __ASM_GENERIC_COMPAT_H #define __ASM_GENERIC_COMPAT_H +#ifndef COMPAT_USER_HZ +#define COMPAT_USER_HZ 100 +#endif + +#ifndef COMPAT_RLIM_INFINITY +#define COMPAT_RLIM_INFINITY 0xffffffff +#endif + +#ifndef COMPAT_OFF_T_MAX +#define COMPAT_OFF_T_MAX 0x7fffffff +#endif + /* These types are common across all compat ABIs */ typedef u32 compat_size_t; typedef s32 compat_ssize_t; @@ -24,6 +36,11 @@ typedef u32 compat_caddr_t; typedef u32 compat_aio_context_t; typedef u32 compat_old_sigset_t; +#ifndef __compat_uid_t +typedef u32 __compat_uid_t; +typedef u32 __compat_gid_t; +#endif + #ifndef 
__compat_uid32_t typedef u32 __compat_uid32_t; typedef u32 __compat_gid32_t; @@ -47,4 +64,93 @@ typedef u32 compat_sigset_word; #define _COMPAT_NSIG_BPW 32 #endif +#ifndef compat_dev_t +typedef u32 compat_dev_t; +#endif + +#ifndef compat_ipc_pid_t +typedef s32 compat_ipc_pid_t; +#endif + +#ifndef compat_fsid_t +typedef __kernel_fsid_t compat_fsid_t; +#endif + +#ifndef compat_statfs +struct compat_statfs { + compat_int_t f_type; + compat_int_t f_bsize; + compat_int_t f_blocks; + compat_int_t f_bfree; + compat_int_t f_bavail; + compat_int_t f_files; + compat_int_t f_ffree; + compat_fsid_t f_fsid; + compat_int_t f_namelen; + compat_int_t f_frsize; + compat_int_t f_flags; + compat_int_t f_spare[4]; +}; +#endif + +#ifndef compat_ipc64_perm +struct compat_ipc64_perm { + compat_key_t key; + __compat_uid32_t uid; + __compat_gid32_t gid; + __compat_uid32_t cuid; + __compat_gid32_t cgid; + compat_mode_t mode; + unsigned char __pad1[4 - sizeof(compat_mode_t)]; + compat_ushort_t seq; + compat_ushort_t __pad2; + compat_ulong_t unused1; + compat_ulong_t unused2; +}; + +struct compat_semid64_ds { + struct compat_ipc64_perm sem_perm; + compat_ulong_t sem_otime; + compat_ulong_t sem_otime_high; + compat_ulong_t sem_ctime; + compat_ulong_t sem_ctime_high; + compat_ulong_t sem_nsems; + compat_ulong_t __unused3; + compat_ulong_t __unused4; +}; + +struct compat_msqid64_ds { + struct compat_ipc64_perm msg_perm; + compat_ulong_t msg_stime; + compat_ulong_t msg_stime_high; + compat_ulong_t msg_rtime; + compat_ulong_t msg_rtime_high; + compat_ulong_t msg_ctime; + compat_ulong_t msg_ctime_high; + compat_ulong_t msg_cbytes; + compat_ulong_t msg_qnum; + compat_ulong_t msg_qbytes; + compat_pid_t msg_lspid; + compat_pid_t msg_lrpid; + compat_ulong_t __unused4; + compat_ulong_t __unused5; +}; + +struct compat_shmid64_ds { + struct compat_ipc64_perm shm_perm; + compat_size_t shm_segsz; + compat_ulong_t shm_atime; + compat_ulong_t shm_atime_high; + compat_ulong_t shm_dtime; + compat_ulong_t shm_dtime_high; + compat_ulong_t shm_ctime; + compat_ulong_t shm_ctime_high; + compat_pid_t shm_cpid; + compat_pid_t shm_lpid; + compat_ulong_t shm_nattch; + compat_ulong_t __unused4; + compat_ulong_t __unused5; +}; +#endif + #endif -- cgit v1.2.3-59-g8ed1b From c2106a231c2ba36ff9af50cdf2867b9a5f8150a6 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Fri, 22 Apr 2022 08:56:24 -0500 Subject: x86/sev: Get the AP jump table address from secrets page The GHCB specification section 2.7 states that when SEV-SNP is enabled, a guest should not rely on the hypervisor to provide the address of the AP jump table. Instead, if a guest BIOS wants to provide an AP jump table, it should record the address in the SNP secrets page so the guest operating system can obtain it directly from there. Fix this on the guest kernel side by having SNP guests use the AP jump table address published in the secrets page rather than issuing a GHCB request to get it. 
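Boiled down, the guest-side lookup added by the hunks below looks roughly like this; it is a condensed reading of the patch for orientation only (error reporting and the CC-blob sanity checks are trimmed), not code on top of it.

static u64 __init get_snp_jump_table_addr(void)
{
	struct snp_secrets_page_layout *layout;
	u64 pa, addr;

	/* Physical address of the SNP secrets page, taken from the
	 * Confidential Computing blob rather than from the hypervisor.
	 */
	pa = get_secrets_page();
	if (!pa)
		return 0;

	layout = (__force void *)ioremap_encrypted(pa, PAGE_SIZE);
	if (!layout)
		return 0;

	/* The guest BIOS publishes the AP jump table address here. */
	addr = layout->os_area.ap_jump_table_pa;
	iounmap(layout);

	return addr;
}

get_jump_table_addr() then short-circuits to this helper when cc_platform_has(CC_ATTR_GUEST_SEV_SNP) is true, instead of issuing the GHCB AP jump table request.
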
[ mroth: - Improve error handling when ioremap()/memremap() return NULL - Don't mix function calls with declarations - Add missing __init - Tweak commit message ] Fixes: 0afb6b660a6b ("x86/sev: Use SEV-SNP AP creation to start secondary CPUs") Signed-off-by: Brijesh Singh Signed-off-by: Michael Roth Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220422135624.114172-3-michael.roth@amd.com --- arch/x86/include/asm/sev.h | 35 +++++++++++++++ arch/x86/kernel/sev.c | 76 ++++++++++++++++++++++----------- drivers/virt/coco/sev-guest/sev-guest.h | 35 --------------- 3 files changed, 87 insertions(+), 59 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/sev.h b/arch/x86/include/asm/sev.h index 6e3dda4f82b5..19514524f0f8 100644 --- a/arch/x86/include/asm/sev.h +++ b/arch/x86/include/asm/sev.h @@ -99,6 +99,41 @@ struct sev_guest_platform_data { u64 secrets_gpa; }; +/* + * The secrets page contains 96-bytes of reserved field that can be used by + * the guest OS. The guest OS uses the area to save the message sequence + * number for each VMPCK. + * + * See the GHCB spec section Secret page layout for the format for this area. + */ +struct secrets_os_area { + u32 msg_seqno_0; + u32 msg_seqno_1; + u32 msg_seqno_2; + u32 msg_seqno_3; + u64 ap_jump_table_pa; + u8 rsvd[40]; + u8 guest_usage[32]; +} __packed; + +#define VMPCK_KEY_LEN 32 + +/* See the SNP spec version 0.9 for secrets page format */ +struct snp_secrets_page_layout { + u32 version; + u32 imien : 1, + rsvd1 : 31; + u32 fms; + u32 rsvd2; + u8 gosvw[16]; + u8 vmpck0[VMPCK_KEY_LEN]; + u8 vmpck1[VMPCK_KEY_LEN]; + u8 vmpck2[VMPCK_KEY_LEN]; + u8 vmpck3[VMPCK_KEY_LEN]; + struct secrets_os_area os_area; + u8 rsvd3[3840]; +} __packed; + #ifdef CONFIG_AMD_MEM_ENCRYPT extern struct static_key_false sev_es_enable_key; extern void __sev_es_ist_enter(struct pt_regs *regs); diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c index b7fd1915560d..166375084b1f 100644 --- a/arch/x86/kernel/sev.c +++ b/arch/x86/kernel/sev.c @@ -558,6 +558,55 @@ void noinstr __sev_es_nmi_complete(void) __sev_put_ghcb(&state); } +static u64 __init get_secrets_page(void) +{ + u64 pa_data = boot_params.cc_blob_address; + struct cc_blob_sev_info info; + void *map; + + /* + * The CC blob contains the address of the secrets page, check if the + * blob is present. 
+ */ + if (!pa_data) + return 0; + + map = early_memremap(pa_data, sizeof(info)); + if (!map) { + pr_err("Unable to locate SNP secrets page: failed to map the Confidential Computing blob.\n"); + return 0; + } + memcpy(&info, map, sizeof(info)); + early_memunmap(map, sizeof(info)); + + /* smoke-test the secrets page passed */ + if (!info.secrets_phys || info.secrets_len != PAGE_SIZE) + return 0; + + return info.secrets_phys; +} + +static u64 __init get_snp_jump_table_addr(void) +{ + struct snp_secrets_page_layout *layout; + u64 pa, addr; + + pa = get_secrets_page(); + if (!pa) + return 0; + + layout = (__force void *)ioremap_encrypted(pa, PAGE_SIZE); + if (!layout) { + pr_err("Unable to locate AP jump table address: failed to map the SNP secrets page.\n"); + return 0; + } + + addr = layout->os_area.ap_jump_table_pa; + iounmap(layout); + + return addr; +} + static u64 __init get_jump_table_addr(void) { struct ghcb_state state; @@ -565,6 +614,9 @@ static u64 __init get_jump_table_addr(void) struct ghcb *ghcb; u64 ret = 0; + if (cc_platform_has(CC_ATTR_GUEST_SEV_SNP)) + return get_snp_jump_table_addr(); + local_irq_save(flags); ghcb = __sev_get_ghcb(&state); @@ -2171,30 +2223,6 @@ static struct platform_device sev_guest_device = { .id = -1, }; -static u64 __init get_secrets_page(void) -{ - u64 pa_data = boot_params.cc_blob_address; - struct cc_blob_sev_info info; - void *map; - - /* - * The CC blob contains the address of the secrets page, check if the - * blob is present. - */ - if (!pa_data) - return 0; - - map = early_memremap(pa_data, sizeof(info)); - memcpy(&info, map, sizeof(info)); - early_memunmap(map, sizeof(info)); - - /* smoke-test the secrets page passed */ - if (!info.secrets_phys || info.secrets_len != PAGE_SIZE) - return 0; - - return info.secrets_phys; -} - static int __init snp_init_platform_device(void) { struct sev_guest_platform_data data; diff --git a/drivers/virt/coco/sev-guest/sev-guest.h b/drivers/virt/coco/sev-guest/sev-guest.h index d39bdd013765..21bda26fdb95 100644 --- a/drivers/virt/coco/sev-guest/sev-guest.h +++ b/drivers/virt/coco/sev-guest/sev-guest.h @@ -60,39 +60,4 @@ struct snp_guest_msg { u8 payload[4000]; } __packed; -/* - * The secrets page contains 96-bytes of reserved field that can be used by - * the guest OS. The guest OS uses the area to save the message sequence - * number for each VMPCK. - * - * See the GHCB spec section Secret page layout for the format for this area. 
- */ -struct secrets_os_area { - u32 msg_seqno_0; - u32 msg_seqno_1; - u32 msg_seqno_2; - u32 msg_seqno_3; - u64 ap_jump_table_pa; - u8 rsvd[40]; - u8 guest_usage[32]; -} __packed; - -#define VMPCK_KEY_LEN 32 - -/* See the SNP spec version 0.9 for secrets page format */ -struct snp_secrets_page_layout { - u32 version; - u32 imien : 1, - rsvd1 : 31; - u32 fms; - u32 rsvd2; - u8 gosvw[16]; - u8 vmpck0[VMPCK_KEY_LEN]; - u8 vmpck1[VMPCK_KEY_LEN]; - u8 vmpck2[VMPCK_KEY_LEN]; - u8 vmpck3[VMPCK_KEY_LEN]; - struct secrets_os_area os_area; - u8 rsvd3[3840]; -} __packed; - #endif /* __VIRT_SEVGUEST_H__ */ -- cgit v1.2.3-59-g8ed1b From b0b592cf08367719e1d1ef07c9f136e8c17f7ec3 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Sat, 23 Apr 2022 20:24:10 +0200 Subject: x86/pm: Fix false positive kmemleak report in msr_build_context() Since e2a1256b17b1 ("x86/speculation: Restore speculation related MSRs during S3 resume") kmemleak reports this issue: unreferenced object 0xffff888009cedc00 (size 256): comm "swapper/0", pid 1, jiffies 4294693823 (age 73.764s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 48 00 00 00 00 00 00 00 ........H....... 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: msr_build_context (include/linux/slab.h:621) pm_check_save_msr (arch/x86/power/cpu.c:520) do_one_initcall (init/main.c:1298) kernel_init_freeable (init/main.c:1370) kernel_init (init/main.c:1504) ret_from_fork (arch/x86/entry/entry_64.S:304) Reproducer: - boot the VM with a debug kernel config (see https://github.com/multipath-tcp/mptcp_net-next/issues/268) - wait ~1 minute - start a kmemleak scan The root cause here is alignment within the packed struct saved_context (from suspend_64.h). Kmemleak only searches for pointers that are aligned (see how pointers are scanned in kmemleak.c), but pahole shows that the saved_msrs struct member and all members after it in the structure are unaligned: struct saved_context { struct pt_regs regs; /* 0 168 */ /* --- cacheline 2 boundary (128 bytes) was 40 bytes ago --- */ u16 ds; /* 168 2 */ ... u64 misc_enable; /* 232 8 */ bool misc_enable_saved; /* 240 1 */ /* Note below odd offset values for the remainder of this struct */ struct saved_msrs saved_msrs; /* 241 16 */ /* --- cacheline 4 boundary (256 bytes) was 1 bytes ago --- */ long unsigned int efer; /* 257 8 */ u16 gdt_pad; /* 265 2 */ struct desc_ptr gdt_desc; /* 267 10 */ u16 idt_pad; /* 277 2 */ struct desc_ptr idt; /* 279 10 */ u16 ldt; /* 289 2 */ u16 tss; /* 291 2 */ long unsigned int tr; /* 293 8 */ long unsigned int safety; /* 301 8 */ long unsigned int return_address; /* 309 8 */ /* size: 317, cachelines: 5, members: 25 */ /* last cacheline: 61 bytes */ } __attribute__((__packed__)); Move misc_enable_saved to the end of the struct declaration so that saved_msrs fits in before the cacheline 4 boundary. The comment above the saved_context declaration says to fix wakeup_64.S file and __save/__restore_processor_state() if the struct is modified: it looks like all the accesses in wakeup_64.S are done through offsets which are computed at build-time. Update that comment accordingly. At the end, the false positive kmemleak report is due to a limitation from kmemleak but it is always good to avoid unaligned members for optimisation purposes. Please note that it looks like this issue is not new, e.g. 
https://lore.kernel.org/all/9f1bb619-c4ee-21c4-a251-870bd4db04fa@lwfinger.net/ https://lore.kernel.org/all/94e48fcd-1dbd-ebd2-4c91-f39941735909@molgen.mpg.de/ [ bp: Massage + cleanup commit message. ] Fixes: 7a9c2dd08ead ("x86/pm: Introduce quirk framework to save/restore extra MSR registers around suspend/resume") Suggested-by: Mat Martineau Signed-off-by: Matthieu Baerts Signed-off-by: Borislav Petkov Reviewed-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20220426202138.498310-1-matthieu.baerts@tessares.net --- arch/x86/include/asm/suspend_32.h | 2 +- arch/x86/include/asm/suspend_64.h | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h index 7b132d0312eb..a800abb1a992 100644 --- a/arch/x86/include/asm/suspend_32.h +++ b/arch/x86/include/asm/suspend_32.h @@ -19,7 +19,6 @@ struct saved_context { u16 gs; unsigned long cr0, cr2, cr3, cr4; u64 misc_enable; - bool misc_enable_saved; struct saved_msrs saved_msrs; struct desc_ptr gdt_desc; struct desc_ptr idt; @@ -28,6 +27,7 @@ struct saved_context { unsigned long tr; unsigned long safety; unsigned long return_address; + bool misc_enable_saved; } __attribute__((packed)); /* routines for saving/restoring kernel state */ diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h index 35bb35d28733..54df06687d83 100644 --- a/arch/x86/include/asm/suspend_64.h +++ b/arch/x86/include/asm/suspend_64.h @@ -14,9 +14,13 @@ * Image of the saved processor state, used by the low level ACPI suspend to * RAM code and by the low level hibernation code. * - * If you modify it, fix arch/x86/kernel/acpi/wakeup_64.S and make sure that - * __save/__restore_processor_state(), defined in arch/x86/kernel/suspend_64.c, - * still work as required. + * If you modify it, check how it is used in arch/x86/kernel/acpi/wakeup_64.S + * and make sure that __save/__restore_processor_state(), defined in + * arch/x86/power/cpu.c, still work as required. + * + * Because the structure is packed, make sure to avoid unaligned members. For + * optimisation purposes but also because tools like kmemleak only search for + * pointers that are aligned. */ struct saved_context { struct pt_regs regs; @@ -36,7 +40,6 @@ struct saved_context { unsigned long cr0, cr2, cr3, cr4; u64 misc_enable; - bool misc_enable_saved; struct saved_msrs saved_msrs; unsigned long efer; u16 gdt_pad; /* Unused */ @@ -48,6 +51,7 @@ struct saved_context { unsigned long tr; unsigned long safety; unsigned long return_address; + bool misc_enable_saved; } __attribute__((packed)); #define loaddebug(thread,register) \ -- cgit v1.2.3-59-g8ed1b From ef79970d7ccdc4e8855aa6079fc2f4797a6807fb Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Thu, 10 Mar 2022 12:48:54 -0800 Subject: x86/split-lock: Remove unused TIF_SLD bit Changes to the "warn" mode of split lock handling mean that TIF_SLD is never set. Remove the bit, and the functions that use it. 
Signed-off-by: Tony Luck Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220310204854.31752-3-tony.luck@intel.com --- arch/x86/include/asm/cpu.h | 2 -- arch/x86/include/asm/thread_info.h | 4 +--- arch/x86/kernel/cpu/intel.c | 12 ------------ arch/x86/kernel/process.c | 3 --- 4 files changed, 1 insertion(+), 20 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index 86e5e4e26fcb..d1c86b221bb9 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -43,14 +43,12 @@ unsigned int x86_model(unsigned int sig); unsigned int x86_stepping(unsigned int sig); #ifdef CONFIG_CPU_SUP_INTEL extern void __init sld_setup(struct cpuinfo_x86 *c); -extern void switch_to_sld(unsigned long tifn); extern bool handle_user_split_lock(struct pt_regs *regs, long error_code); extern bool handle_guest_split_lock(unsigned long ip); extern void handle_bus_lock(struct pt_regs *regs); u8 get_this_hybrid_cpu_type(void); #else static inline void __init sld_setup(struct cpuinfo_x86 *c) {} -static inline void switch_to_sld(unsigned long tifn) {} static inline bool handle_user_split_lock(struct pt_regs *regs, long error_code) { return false; diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index ebec69c35e95..f0cb881c1d69 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -92,7 +92,6 @@ struct thread_info { #define TIF_NOCPUID 15 /* CPUID is not accessible in userland */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_NOTIFY_SIGNAL 17 /* signal notifications exist */ -#define TIF_SLD 18 /* Restore split lock detection on context switch */ #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ #define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ @@ -116,7 +115,6 @@ struct thread_info { #define _TIF_NOCPUID (1 << TIF_NOCPUID) #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) -#define _TIF_SLD (1 << TIF_SLD) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE) @@ -128,7 +126,7 @@ struct thread_info { /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW_BASE \ (_TIF_NOCPUID | _TIF_NOTSC | _TIF_BLOCKSTEP | \ - _TIF_SSBD | _TIF_SPEC_FORCE_UPDATE | _TIF_SLD) + _TIF_SSBD | _TIF_SPEC_FORCE_UPDATE) /* * Avoid calls to __switch_to_xtra() on UP as STIBP is not evaluated. diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index be2a0bdb9527..672e253a58b9 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -1232,18 +1232,6 @@ void handle_bus_lock(struct pt_regs *regs) } } -/* - * This function is called only when switching between tasks with - * different split-lock detection modes. It sets the MSR for the - * mode of the new task. This is right most of the time, but since - * the MSR is shared by hyperthreads on a physical core there can - * be glitches when the two threads need different modes. 
- */ -void switch_to_sld(unsigned long tifn) -{ - sld_update_msr(!(tifn & _TIF_SLD)); -} - /* * Bits in the IA32_CORE_CAPABILITIES are not architectural, so they should * only be trusted if it is confirmed that a CPU model implements a diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index b370767f5b19..bcc76c187e11 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -686,9 +686,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) /* Enforce MSR update to ensure consistent state */ __speculation_ctrl_update(~tifn, tifn); } - - if ((tifp ^ tifn) & _TIF_SLD) - switch_to_sld(tifn); } /* -- cgit v1.2.3-59-g8ed1b From 138a7f9c6beae8d652113b8e7a44994b4200bbcd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Apr 2022 21:19:53 +0200 Subject: x86/aperfmperf: Separate AP/BP frequency invariance init This code is convoluted and because it can be invoked post init via the ACPI/CPPC code, all of the initialization functionality is built in instead of being part of init text and init data. As a first step create separate calls for the boot and the application processors. Signed-off-by: Thomas Gleixner Reviewed-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Acked-by: Paul E. McKenney Link: https://lore.kernel.org/r/20220415161206.536733494@linutronix.de --- arch/x86/include/asm/topology.h | 12 +++++------- arch/x86/kernel/acpi/cppc.c | 3 ++- arch/x86/kernel/cpu/aperfmperf.c | 23 +++++++++++------------ arch/x86/kernel/smpboot.c | 4 ++-- 4 files changed, 20 insertions(+), 22 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 9619385bf749..e2faedc6e793 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -216,14 +216,12 @@ extern void arch_scale_freq_tick(void); #define arch_scale_freq_tick arch_scale_freq_tick extern void arch_set_max_freq_ratio(bool turbo_disabled); -void init_freq_invariance(bool secondary, bool cppc_ready); +extern void bp_init_freq_invariance(bool cppc_ready); +extern void ap_init_freq_invariance(void); #else -static inline void arch_set_max_freq_ratio(bool turbo_disabled) -{ -} -static inline void init_freq_invariance(bool secondary, bool cppc_ready) -{ -} +static inline void arch_set_max_freq_ratio(bool turbo_disabled) { } +static inline void bp_init_freq_invariance(bool cppc_ready) { } +static inline void ap_init_freq_invariance(void) { } #endif #ifdef CONFIG_ACPI_CPPC_LIB diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c index df1644d9b3b6..06109d927a18 100644 --- a/arch/x86/kernel/acpi/cppc.c +++ b/arch/x86/kernel/acpi/cppc.c @@ -96,7 +96,8 @@ void init_freq_invariance_cppc(void) mutex_lock(&freq_invariance_lock); - init_freq_invariance(secondary, true); + if (!secondary) + bp_init_freq_invariance(true); secondary = true; mutex_unlock(&freq_invariance_lock); diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index 35fff01e87b4..87f34f23a974 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -428,31 +428,24 @@ static void register_freq_invariance_syscore_ops(void) static inline void register_freq_invariance_syscore_ops(void) {} #endif -void init_freq_invariance(bool secondary, bool cppc_ready) +void bp_init_freq_invariance(bool cppc_ready) { - bool ret = false; + bool ret; - if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) + if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) return; - if (secondary) { - 
if (static_branch_likely(&arch_scale_freq_key)) { - init_counter_refs(); - } - return; - } + init_counter_refs(); if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) ret = intel_set_max_freq_ratio(); else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { - if (!cppc_ready) { + if (!cppc_ready) return; - } ret = amd_set_max_freq_ratio(&arch_turbo_freq_ratio); } if (ret) { - init_counter_refs(); static_branch_enable(&arch_scale_freq_key); register_freq_invariance_syscore_ops(); pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio); @@ -461,6 +454,12 @@ void init_freq_invariance(bool secondary, bool cppc_ready) } } +void ap_init_freq_invariance(void) +{ + if (cpu_feature_enabled(X86_FEATURE_APERFMPERF)) + init_counter_refs(); +} + static void disable_freq_invariance_workfn(struct work_struct *work) { static_branch_disable(&arch_scale_freq_key); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index a9fc16a9c408..023feb40f5c0 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -186,7 +186,7 @@ static void smp_callin(void) */ set_cpu_sibling_map(raw_smp_processor_id()); - init_freq_invariance(true, false); + ap_init_freq_invariance(); /* * Get our bogomips. @@ -1396,7 +1396,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) { smp_prepare_cpus_common(); - init_freq_invariance(false, false); + bp_init_freq_invariance(false); smp_sanity_check(); switch (apic_intr_mode) { -- cgit v1.2.3-59-g8ed1b From 0dfaf3f6ecc0c7f4f876255aa82e8959d3721365 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Apr 2022 21:19:54 +0200 Subject: x86/aperfmperf: Untangle Intel and AMD frequency invariance init AMD boot CPU initialization happens late via ACPI/CPPC which prevents the Intel parts from being marked __init. Split out the common code and provide a dedicated interface for the AMD initialization and mark the Intel specific code and data __init. The remaining text size is almost cut in half: text: 2614 -> 1350 init.text: 0 -> 786 Signed-off-by: Thomas Gleixner Reviewed-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Acked-by: Paul E. 
McKenney Link: https://lore.kernel.org/r/20220415161206.592465719@linutronix.de --- arch/x86/include/asm/topology.h | 13 +++------ arch/x86/kernel/acpi/cppc.c | 30 +++++++++---------- arch/x86/kernel/cpu/aperfmperf.c | 62 +++++++++++++++++++++------------------- arch/x86/kernel/smpboot.c | 2 +- 4 files changed, 51 insertions(+), 56 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index e2faedc6e793..cc317077e73e 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -216,24 +216,19 @@ extern void arch_scale_freq_tick(void); #define arch_scale_freq_tick arch_scale_freq_tick extern void arch_set_max_freq_ratio(bool turbo_disabled); -extern void bp_init_freq_invariance(bool cppc_ready); +extern void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled); +extern void bp_init_freq_invariance(void); extern void ap_init_freq_invariance(void); #else static inline void arch_set_max_freq_ratio(bool turbo_disabled) { } -static inline void bp_init_freq_invariance(bool cppc_ready) { } +static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) { } +static inline void bp_init_freq_invariance(void) { } static inline void ap_init_freq_invariance(void) { } #endif #ifdef CONFIG_ACPI_CPPC_LIB void init_freq_invariance_cppc(void); #define arch_init_invariance_cppc init_freq_invariance_cppc - -bool amd_set_max_freq_ratio(u64 *ratio); -#else -static inline bool amd_set_max_freq_ratio(u64 *ratio) -{ - return false; -} #endif #endif /* _ASM_X86_TOPOLOGY_H */ diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c index 06109d927a18..8b8cbf22461a 100644 --- a/arch/x86/kernel/acpi/cppc.c +++ b/arch/x86/kernel/acpi/cppc.c @@ -50,20 +50,17 @@ int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) return err; } -bool amd_set_max_freq_ratio(u64 *ratio) +static void amd_set_max_freq_ratio(void) { struct cppc_perf_caps perf_caps; u64 highest_perf, nominal_perf; u64 perf_ratio; int rc; - if (!ratio) - return false; - rc = cppc_get_perf_caps(0, &perf_caps); if (rc) { pr_debug("Could not retrieve perf counters (%d)\n", rc); - return false; + return; } highest_perf = amd_get_highest_perf(); @@ -71,7 +68,7 @@ bool amd_set_max_freq_ratio(u64 *ratio) if (!highest_perf || !nominal_perf) { pr_debug("Could not retrieve highest or nominal performance\n"); - return false; + return; } perf_ratio = div_u64(highest_perf * SCHED_CAPACITY_SCALE, nominal_perf); @@ -79,26 +76,27 @@ bool amd_set_max_freq_ratio(u64 *ratio) perf_ratio = (perf_ratio + SCHED_CAPACITY_SCALE) >> 1; if (!perf_ratio) { pr_debug("Non-zero highest/nominal perf values led to a 0 ratio\n"); - return false; + return; } - *ratio = perf_ratio; - arch_set_max_freq_ratio(false); - - return true; + freq_invariance_set_perf_ratio(perf_ratio, false); } static DEFINE_MUTEX(freq_invariance_lock); void init_freq_invariance_cppc(void) { - static bool secondary; + static bool init_done; - mutex_lock(&freq_invariance_lock); + if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) + return; - if (!secondary) - bp_init_freq_invariance(true); - secondary = true; + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return; + mutex_lock(&freq_invariance_lock); + if (!init_done) + amd_set_max_freq_ratio(); + init_done = true; mutex_unlock(&freq_invariance_lock); } diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index 87f34f23a974..b4f4ea529f2d 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ 
b/arch/x86/kernel/cpu/aperfmperf.c @@ -206,7 +206,7 @@ void arch_set_max_freq_ratio(bool turbo_disabled) } EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio); -static bool turbo_disabled(void) +static bool __init turbo_disabled(void) { u64 misc_en; int err; @@ -218,7 +218,7 @@ static bool turbo_disabled(void) return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE); } -static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) +static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) { int err; @@ -240,26 +240,26 @@ static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \ INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL) -static const struct x86_cpu_id has_knl_turbo_ratio_limits[] = { +static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = { X86_MATCH(XEON_PHI_KNL), X86_MATCH(XEON_PHI_KNM), {} }; -static const struct x86_cpu_id has_skx_turbo_ratio_limits[] = { +static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = { X86_MATCH(SKYLAKE_X), {} }; -static const struct x86_cpu_id has_glm_turbo_ratio_limits[] = { +static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = { X86_MATCH(ATOM_GOLDMONT), X86_MATCH(ATOM_GOLDMONT_D), X86_MATCH(ATOM_GOLDMONT_PLUS), {} }; -static bool knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, - int num_delta_fratio) +static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, + int num_delta_fratio) { int fratio, delta_fratio, found; int err, i; @@ -297,7 +297,7 @@ static bool knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, return true; } -static bool skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size) +static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size) { u64 ratios, counts; u32 group_size; @@ -328,7 +328,7 @@ static bool skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size) return false; } -static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) +static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) { u64 msr; int err; @@ -351,7 +351,7 @@ static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) return true; } -static bool intel_set_max_freq_ratio(void) +static bool __init intel_set_max_freq_ratio(void) { u64 base_freq, turbo_freq; u64 turbo_ratio; @@ -418,40 +418,42 @@ static struct syscore_ops freq_invariance_syscore_ops = { static void register_freq_invariance_syscore_ops(void) { - /* Bail out if registered already. 
*/ - if (freq_invariance_syscore_ops.node.prev) - return; - register_syscore_ops(&freq_invariance_syscore_ops); } #else static inline void register_freq_invariance_syscore_ops(void) {} #endif -void bp_init_freq_invariance(bool cppc_ready) +static void freq_invariance_enable(void) +{ + if (static_branch_unlikely(&arch_scale_freq_key)) { + WARN_ON_ONCE(1); + return; + } + static_branch_enable(&arch_scale_freq_key); + register_freq_invariance_syscore_ops(); + pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio); +} + +void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) { - bool ret; + arch_turbo_freq_ratio = ratio; + arch_set_max_freq_ratio(turbo_disabled); + freq_invariance_enable(); +} +void __init bp_init_freq_invariance(void) +{ if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) return; init_counter_refs(); - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - ret = intel_set_max_freq_ratio(); - else if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { - if (!cppc_ready) - return; - ret = amd_set_max_freq_ratio(&arch_turbo_freq_ratio); - } + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return; - if (ret) { - static_branch_enable(&arch_scale_freq_key); - register_freq_invariance_syscore_ops(); - pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio); - } else { - pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n"); - } + if (intel_set_max_freq_ratio()) + freq_invariance_enable(); } void ap_init_freq_invariance(void) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 023feb40f5c0..b1ba7ddfe930 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1396,7 +1396,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) { smp_prepare_cpus_common(); - bp_init_freq_invariance(false); + bp_init_freq_invariance(); smp_sanity_check(); switch (apic_intr_mode) { -- cgit v1.2.3-59-g8ed1b From bb6e89df9028b2fab0ce6ac71cd9ef25b6ada32d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 15 Apr 2022 21:19:59 +0200 Subject: x86/aperfmperf: Make parts of the frequency invariance code unconditional The frequency invariance support is currently limited to x86/64 and SMP, which is the vast majority of machines. arch_scale_freq_tick() is called every tick on all CPUs and reads the APERF and MPERF MSRs. The CPU frequency getters function do the same via dedicated IPIs. While it could be argued that on systems where frequency invariance support is disabled (32bit, !SMP) the per tick read of the APERF and MPERF MSRs can be avoided, it does not make sense to keep the extra code and the resulting runtime issues of mass IPIs around. As a first step split out the non frequency invariance specific initialization code and the read MSR portion of arch_scale_freq_tick(). The rest of the code is still conditional and guarded with a static key. Signed-off-by: Thomas Gleixner Reviewed-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Acked-by: Paul E. 
McKenney Link: https://lore.kernel.org/r/20220415161206.761988704@linutronix.de --- arch/x86/include/asm/cpu.h | 2 ++ arch/x86/include/asm/topology.h | 4 --- arch/x86/kernel/cpu/aperfmperf.c | 63 ++++++++++++++++++++++++---------------- arch/x86/kernel/smpboot.c | 3 +- 4 files changed, 41 insertions(+), 31 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index 86e5e4e26fcb..e89772dc17f1 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -36,6 +36,8 @@ extern int _debug_hotplug_cpu(int cpu, int action); #endif #endif +extern void ap_init_aperfmperf(void); + int mwait_usable(const struct cpuinfo_x86 *); unsigned int x86_family(unsigned int sig); diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index cc317077e73e..1b2553dd3c64 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -217,13 +217,9 @@ extern void arch_scale_freq_tick(void); extern void arch_set_max_freq_ratio(bool turbo_disabled); extern void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled); -extern void bp_init_freq_invariance(void); -extern void ap_init_freq_invariance(void); #else static inline void arch_set_max_freq_ratio(bool turbo_disabled) { } static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) { } -static inline void bp_init_freq_invariance(void) { } -static inline void ap_init_freq_invariance(void) { } #endif #ifdef CONFIG_ACPI_CPPC_LIB diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index 6220503af26a..df528a4f6de3 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -164,6 +165,17 @@ unsigned int arch_freq_get_on_cpu(int cpu) return per_cpu(samples.khz, cpu); } +static void init_counter_refs(void) +{ + u64 aperf, mperf; + + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + + this_cpu_write(cpu_samples.aperf, aperf); + this_cpu_write(cpu_samples.mperf, mperf); +} + #if defined(CONFIG_X86_64) && defined(CONFIG_SMP) /* * APERF/MPERF frequency ratio computation. 
@@ -405,17 +417,6 @@ out: return true; } -static void init_counter_refs(void) -{ - u64 aperf, mperf; - - rdmsrl(MSR_IA32_APERF, aperf); - rdmsrl(MSR_IA32_MPERF, mperf); - - this_cpu_write(cpu_samples.aperf, aperf); - this_cpu_write(cpu_samples.mperf, mperf); -} - #ifdef CONFIG_PM_SLEEP static struct syscore_ops freq_invariance_syscore_ops = { .resume = init_counter_refs, @@ -447,13 +448,8 @@ void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) freq_invariance_enable(); } -void __init bp_init_freq_invariance(void) +static void __init bp_init_freq_invariance(void) { - if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) - return; - - init_counter_refs(); - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return; @@ -461,12 +457,6 @@ void __init bp_init_freq_invariance(void) freq_invariance_enable(); } -void ap_init_freq_invariance(void) -{ - if (cpu_feature_enabled(X86_FEATURE_APERFMPERF)) - init_counter_refs(); -} - static void disable_freq_invariance_workfn(struct work_struct *work) { static_branch_disable(&arch_scale_freq_key); @@ -481,6 +471,9 @@ static void scale_freq_tick(u64 acnt, u64 mcnt) { u64 freq_scale; + if (!arch_scale_freq_invariant()) + return; + if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt)) goto error; @@ -501,13 +494,17 @@ error: pr_warn("Scheduler frequency invariance went wobbly, disabling!\n"); schedule_work(&disable_freq_invariance_work); } +#else +static inline void bp_init_freq_invariance(void) { } +static inline void scale_freq_tick(u64 acnt, u64 mcnt) { } +#endif /* CONFIG_X86_64 && CONFIG_SMP */ void arch_scale_freq_tick(void) { struct aperfmperf *s = this_cpu_ptr(&cpu_samples); u64 acnt, mcnt, aperf, mperf; - if (!arch_scale_freq_invariant()) + if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) return; rdmsrl(MSR_IA32_APERF, aperf); @@ -520,4 +517,20 @@ void arch_scale_freq_tick(void) scale_freq_tick(acnt, mcnt); } -#endif /* CONFIG_X86_64 && CONFIG_SMP */ + +static int __init bp_init_aperfmperf(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) + return 0; + + init_counter_refs(); + bp_init_freq_invariance(); + return 0; +} +early_initcall(bp_init_aperfmperf); + +void ap_init_aperfmperf(void) +{ + if (cpu_feature_enabled(X86_FEATURE_APERFMPERF)) + init_counter_refs(); +} diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index b1ba7ddfe930..eb7de776a2a6 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -186,7 +186,7 @@ static void smp_callin(void) */ set_cpu_sibling_map(raw_smp_processor_id()); - ap_init_freq_invariance(); + ap_init_aperfmperf(); /* * Get our bogomips. @@ -1396,7 +1396,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) { smp_prepare_cpus_common(); - bp_init_freq_invariance(); smp_sanity_check(); switch (apic_intr_mode) { -- cgit v1.2.3-59-g8ed1b From 830fe3c30dffe0b9f9485772070c29fcd8c2473d Mon Sep 17 00:00:00 2001 From: Suma Hegde Date: Wed, 27 Apr 2022 20:52:48 +0530 Subject: amd_hsmp: Add HSMP protocol version 5 messages HSMP protocol version 5 is supported on AMD family 19h model 10h EPYC processors. 
This version brings new features such as -- DIMM statistics -- Bandwidth for IO and xGMI links -- Monitor socket and core frequency limits -- Configure power efficiency modes, DF pstate range etc Signed-off-by: Suma Hegde Reviewed-by: Carlos Bilbao Signed-off-by: Naveen Krishna Chatradhi Link: https://lore.kernel.org/r/20220427152248.25643-1-nchatrad@amd.com Reviewed-by: Hans de Goede Signed-off-by: Hans de Goede --- arch/x86/include/uapi/asm/amd_hsmp.h | 114 +++++++++++++++++++++++++++++++++-- 1 file changed, 109 insertions(+), 5 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/uapi/asm/amd_hsmp.h b/arch/x86/include/uapi/asm/amd_hsmp.h index 7ee7ba0d63a3..769b939444ae 100644 --- a/arch/x86/include/uapi/asm/amd_hsmp.h +++ b/arch/x86/include/uapi/asm/amd_hsmp.h @@ -31,9 +31,22 @@ enum hsmp_message_ids { HSMP_GET_CCLK_THROTTLE_LIMIT, /* 10h Get CCLK frequency limit in socket */ HSMP_GET_C0_PERCENT, /* 11h Get average C0 residency in socket */ HSMP_SET_NBIO_DPM_LEVEL, /* 12h Set max/min LCLK DPM Level for a given NBIO */ - /* 13h Reserved */ - HSMP_GET_DDR_BANDWIDTH = 0x14, /* 14h Get theoretical maximum and current DDR Bandwidth */ - HSMP_GET_TEMP_MONITOR, /* 15h Get per-DIMM temperature and refresh rates */ + HSMP_GET_NBIO_DPM_LEVEL, /* 13h Get LCLK DPM level min and max for a given NBIO */ + HSMP_GET_DDR_BANDWIDTH, /* 14h Get theoretical maximum and current DDR Bandwidth */ + HSMP_GET_TEMP_MONITOR, /* 15h Get socket temperature */ + HSMP_GET_DIMM_TEMP_RANGE, /* 16h Get per-DIMM temperature range and refresh rate */ + HSMP_GET_DIMM_POWER, /* 17h Get per-DIMM power consumption */ + HSMP_GET_DIMM_THERMAL, /* 18h Get per-DIMM thermal sensors */ + HSMP_GET_SOCKET_FREQ_LIMIT, /* 19h Get current active frequency per socket */ + HSMP_GET_CCLK_CORE_LIMIT, /* 1Ah Get CCLK frequency limit per core */ + HSMP_GET_RAILS_SVI, /* 1Bh Get SVI-based Telemetry for all rails */ + HSMP_GET_SOCKET_FMAX_FMIN, /* 1Ch Get Fmax and Fmin per socket */ + HSMP_GET_IOLINK_BANDWITH, /* 1Dh Get current bandwidth on IO Link */ + HSMP_GET_XGMI_BANDWITH, /* 1Eh Get current bandwidth on xGMI Link */ + HSMP_SET_GMI3_WIDTH, /* 1Fh Set max and min GMI3 Link width */ + HSMP_SET_PCI_RATE, /* 20h Control link rate on PCIe devices */ + HSMP_SET_POWER_MODE, /* 21h Select power efficiency profile policy */ + HSMP_SET_PSTATE_MAX_MIN, /* 22h Set the max and min DF P-State */ HSMP_MSG_ID_MAX, }; @@ -175,8 +188,12 @@ static const struct hsmp_msg_desc hsmp_msg_desc_table[] = { */ {1, 0, HSMP_SET}, - /* RESERVED message */ - {0, 0, HSMP_RSVD}, + /* + * HSMP_GET_NBIO_DPM_LEVEL, num_args = 1, response_sz = 1 + * input: args[0] = nbioid[23:16] + * output: args[0] = max dpm level[15:8] + min dpm level[7:0] + */ + {1, 1, HSMP_GET}, /* * HSMP_GET_DDR_BANDWIDTH, num_args = 0, response_sz = 1 @@ -191,6 +208,93 @@ static const struct hsmp_msg_desc hsmp_msg_desc_table[] = { * [7:5] fractional part */ {0, 1, HSMP_GET}, + + /* + * HSMP_GET_DIMM_TEMP_RANGE, num_args = 1, response_sz = 1 + * input: args[0] = DIMM address[7:0] + * output: args[0] = refresh rate[3] + temperature range[2:0] + */ + {1, 1, HSMP_GET}, + + /* + * HSMP_GET_DIMM_POWER, num_args = 1, response_sz = 1 + * input: args[0] = DIMM address[7:0] + * output: args[0] = DIMM power in mW[31:17] + update rate in ms[16:8] + + * DIMM address[7:0] + */ + {1, 1, HSMP_GET}, + + /* + * HSMP_GET_DIMM_THERMAL, num_args = 1, response_sz = 1 + * input: args[0] = DIMM address[7:0] + * output: args[0] = temperature in degree celcius[31:21] + update rate in ms[16:8] + + * DIMM 
address[7:0] + */ + {1, 1, HSMP_GET}, + + /* + * HSMP_GET_SOCKET_FREQ_LIMIT, num_args = 0, response_sz = 1 + * output: args[0] = frequency in MHz[31:16] + frequency source[15:0] + */ + {0, 1, HSMP_GET}, + + /* + * HSMP_GET_CCLK_CORE_LIMIT, num_args = 1, response_sz = 1 + * input: args[0] = apic id [31:0] + * output: args[0] = frequency in MHz[31:0] + */ + {1, 1, HSMP_GET}, + + /* + * HSMP_GET_RAILS_SVI, num_args = 0, response_sz = 1 + * output: args[0] = power in mW[31:0] + */ + {0, 1, HSMP_GET}, + + /* + * HSMP_GET_SOCKET_FMAX_FMIN, num_args = 0, response_sz = 1 + * output: args[0] = fmax in MHz[31:16] + fmin in MHz[15:0] + */ + {0, 1, HSMP_GET}, + + /* + * HSMP_GET_IOLINK_BANDWITH, num_args = 1, response_sz = 1 + * input: args[0] = link id[15:8] + bw type[2:0] + * output: args[0] = io bandwidth in Mbps[31:0] + */ + {1, 1, HSMP_GET}, + + /* + * HSMP_GET_XGMI_BANDWITH, num_args = 1, response_sz = 1 + * input: args[0] = link id[15:8] + bw type[2:0] + * output: args[0] = xgmi bandwidth in Mbps[31:0] + */ + {1, 1, HSMP_GET}, + + /* + * HSMP_SET_GMI3_WIDTH, num_args = 1, response_sz = 0 + * input: args[0] = min link width[15:8] + max link width[7:0] + */ + {1, 0, HSMP_SET}, + + /* + * HSMP_SET_PCI_RATE, num_args = 1, response_sz = 1 + * input: args[0] = link rate control value + * output: args[0] = previous link rate control value + */ + {1, 1, HSMP_SET}, + + /* + * HSMP_SET_POWER_MODE, num_args = 1, response_sz = 0 + * input: args[0] = power efficiency mode[2:0] + */ + {1, 0, HSMP_SET}, + + /* + * HSMP_SET_PSTATE_MAX_MIN, num_args = 1, response_sz = 0 + * input: args[0] = min df pstate[15:8] + max df pstate[7:0] + */ + {1, 0, HSMP_SET}, }; /* Reset to default packing */ -- cgit v1.2.3-59-g8ed1b From e10cd4b00904db127b178859d81f6b5d05d16c67 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 Apr 2022 23:16:13 -0700 Subject: x86/mm: enable ARCH_HAS_VM_GET_PAGE_PROT This defines and exports a platform specific custom vm_get_page_prot() via subscribing ARCH_HAS_VM_GET_PAGE_PROT. This also unsubscribes from config ARCH_HAS_FILTER_PGPROT, after dropping off arch_filter_pgprot() and arch_vm_get_page_prot(). Link: https://lkml.kernel.org/r/20220414062125.609297-6-anshuman.khandual@arm.com Signed-off-by: Christoph Hellwig Signed-off-by: Anshuman Khandual Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Catalin Marinas Cc: Christophe Leroy Cc: David S. 
Miller Cc: Khalid Aziz Cc: Michael Ellerman Cc: Paul Mackerras Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/x86/Kconfig | 2 +- arch/x86/include/asm/pgtable.h | 5 ----- arch/x86/include/uapi/asm/mman.h | 14 -------------- arch/x86/mm/Makefile | 2 +- arch/x86/mm/pgprot.c | 35 +++++++++++++++++++++++++++++++++++ 5 files changed, 37 insertions(+), 21 deletions(-) create mode 100644 arch/x86/mm/pgprot.c (limited to 'arch/x86/include') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ae4e1a79a1e2..5b77b4301d36 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -76,7 +76,6 @@ config X86 select ARCH_HAS_EARLY_DEBUG if KGDB select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER - select ARCH_HAS_FILTER_PGPROT select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_KCOV if X86_64 @@ -95,6 +94,7 @@ config X86 select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE select ARCH_HAS_SYSCALL_WRAPPER select ARCH_HAS_UBSAN_SANITIZE_ALL + select ARCH_HAS_VM_GET_PAGE_PROT select ARCH_HAS_DEBUG_WX select ARCH_HAS_ZONE_DMA_SET if EXPERT select ARCH_HAVE_NMI_SAFE_CMPXCHG diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 62ab07e24aef..3563f4645fa1 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -649,11 +649,6 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define canon_pgprot(p) __pgprot(massage_pgprot(p)) -static inline pgprot_t arch_filter_pgprot(pgprot_t prot) -{ - return canon_pgprot(prot); -} - static inline int is_new_memtype_allowed(u64 paddr, unsigned long size, enum page_cache_mode pcm, enum page_cache_mode new_pcm) diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h index d4a8d0424bfb..775dbd3aff73 100644 --- a/arch/x86/include/uapi/asm/mman.h +++ b/arch/x86/include/uapi/asm/mman.h @@ -5,20 +5,6 @@ #define MAP_32BIT 0x40 /* only give out 32bit addresses */ #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS -/* - * Take the 4 protection key bits out of the vma->vm_flags - * value and turn them in to the bits that we can put in - * to a pte. - * - * Only override these if Protection Keys are available - * (which is only on 64-bit). - */ -#define arch_vm_get_page_prot(vm_flags) __pgprot( \ - ((vm_flags) & VM_PKEY_BIT0 ? _PAGE_PKEY_BIT0 : 0) | \ - ((vm_flags) & VM_PKEY_BIT1 ? _PAGE_PKEY_BIT1 : 0) | \ - ((vm_flags) & VM_PKEY_BIT2 ? _PAGE_PKEY_BIT2 : 0) | \ - ((vm_flags) & VM_PKEY_BIT3 ? _PAGE_PKEY_BIT3 : 0)) - #define arch_calc_vm_prot_bits(prot, key) ( \ ((key) & 0x1 ? VM_PKEY_BIT0 : 0) | \ ((key) & 0x2 ? 
VM_PKEY_BIT1 : 0) | \ diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index fe3d3061fc11..fb6b41a48ae5 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -20,7 +20,7 @@ CFLAGS_REMOVE_mem_encrypt_identity.o = -pg endif obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o mmap.o \ - pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o maccess.o + pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o maccess.o pgprot.o obj-y += pat/ diff --git a/arch/x86/mm/pgprot.c b/arch/x86/mm/pgprot.c new file mode 100644 index 000000000000..763742782286 --- /dev/null +++ b/arch/x86/mm/pgprot.c @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include + +pgprot_t vm_get_page_prot(unsigned long vm_flags) +{ + unsigned long val = pgprot_val(protection_map[vm_flags & + (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]); + +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS + /* + * Take the 4 protection key bits out of the vma->vm_flags value and + * turn them in to the bits that we can put in to a pte. + * + * Only override these if Protection Keys are available (which is only + * on 64-bit). + */ + if (vm_flags & VM_PKEY_BIT0) + val |= _PAGE_PKEY_BIT0; + if (vm_flags & VM_PKEY_BIT1) + val |= _PAGE_PKEY_BIT1; + if (vm_flags & VM_PKEY_BIT2) + val |= _PAGE_PKEY_BIT2; + if (vm_flags & VM_PKEY_BIT3) + val |= _PAGE_PKEY_BIT3; +#endif + + val = __sme_set(val); + if (val & _PAGE_PRESENT) + val &= __supported_pte_mask; + return __pgprot(val); +} +EXPORT_SYMBOL(vm_get_page_prot); -- cgit v1.2.3-59-g8ed1b From 1ff2fb982c52ed6c3478adc944441d6ea065d8fb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 2 May 2022 09:01:55 +0200 Subject: x86/aperfperf: Make it correct on 32bit and UP kernels The utilization of arch_scale_freq_tick() for CPU frequency readouts is incomplete as it failed to move the function prototype and the define out of the CONFIG_SMP && CONFIG_X86_64 #ifdef. Make them unconditionally available. 
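[ Illustration, not part of this patch: the define being moved participates in the usual arch-override idiom. Generic code falls back to an empty stub unless the arch header advertises its own implementation, roughly like the following sketch (assumed common pattern, not quoted from the scheduler sources):

	#ifndef arch_scale_freq_tick
	/* default: no arch support for tick-time frequency sampling */
	static inline void arch_scale_freq_tick(void) { }
	#endif

  so with the prototype and the "#define arch_scale_freq_tick arch_scale_freq_tick" hidden behind CONFIG_SMP && CONFIG_X86_64, 32-bit and UP kernels silently keep the stub instead of the APERF/MPERF-based readout. ]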
Fixes: bb6e89df9028 ("x86/aperfmperf: Make parts of the frequency invariance code unconditional") Reported-by: kernel test robot Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/lkml/202205010106.06xRBR2C-lkp@intel.com --- arch/x86/include/asm/topology.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 1b2553dd3c64..458c891a8273 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -212,9 +212,6 @@ static inline long arch_scale_freq_capacity(int cpu) } #define arch_scale_freq_capacity arch_scale_freq_capacity -extern void arch_scale_freq_tick(void); -#define arch_scale_freq_tick arch_scale_freq_tick - extern void arch_set_max_freq_ratio(bool turbo_disabled); extern void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled); #else @@ -222,6 +219,9 @@ static inline void arch_set_max_freq_ratio(bool turbo_disabled) { } static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled) { } #endif +extern void arch_scale_freq_tick(void); +#define arch_scale_freq_tick arch_scale_freq_tick + #ifdef CONFIG_ACPI_CPPC_LIB void init_freq_invariance_cppc(void); #define arch_init_invariance_cppc init_freq_invariance_cppc -- cgit v1.2.3-59-g8ed1b From 0aca53c6b522f8d6e2681ca875acbbe105f5fdcf Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 21 Apr 2022 22:10:48 +0800 Subject: x86/traps: Use pt_regs directly in fixup_bad_iret() Always stash the address error_entry() is going to return to, in %r12 and get rid of the void *error_entry_ret; slot in struct bad_iret_stack which was supposed to account for it and pt_regs pushed on the stack. After this, both fixup_bad_iret() and sync_regs() can work on a struct pt_regs pointer directly. [ bp: Rewrite commit message, touch ups. ] Signed-off-by: Lai Jiangshan Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220503032107.680190-2-jiangshanlai@gmail.com --- arch/x86/entry/entry_64.S | 5 ++++- arch/x86/include/asm/traps.h | 2 +- arch/x86/kernel/traps.c | 19 +++++++------------ 3 files changed, 12 insertions(+), 14 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 73d958522b6a..ecbfca3cc18c 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1061,9 +1061,12 @@ SYM_CODE_START_LOCAL(error_entry) * Pretend that the exception came from user mode: set up pt_regs * as if we faulted immediately after IRET. 
*/ - mov %rsp, %rdi + popq %r12 /* save return addr in %12 */ + movq %rsp, %rdi /* arg0 = pt_regs pointer */ call fixup_bad_iret mov %rax, %rsp + ENCODE_FRAME_POINTER + pushq %r12 jmp .Lerror_entry_from_usermode_after_swapgs SYM_CODE_END(error_entry) diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 35317c5c551d..47ecfff2c83d 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -13,7 +13,7 @@ #ifdef CONFIG_X86_64 asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs); asmlinkage __visible notrace -struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s); +struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs); void __init trap_init(void); asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *eregs); #endif diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 1563fb995005..4167215333fd 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -892,14 +892,10 @@ sync: } #endif -struct bad_iret_stack { - void *error_entry_ret; - struct pt_regs regs; -}; - -asmlinkage __visible noinstr -struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) +asmlinkage __visible noinstr struct pt_regs *fixup_bad_iret(struct pt_regs *bad_regs) { + struct pt_regs tmp, *new_stack; + /* * This is called from entry_64.S early in handling a fault * caused by a bad iret to user mode. To handle the fault @@ -908,19 +904,18 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) * just below the IRET frame) and we want to pretend that the * exception came from the IRET target. */ - struct bad_iret_stack tmp, *new_stack = - (struct bad_iret_stack *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; + new_stack = (struct pt_regs *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1; /* Copy the IRET target to the temporary storage. */ - __memcpy(&tmp.regs.ip, (void *)s->regs.sp, 5*8); + __memcpy(&tmp.ip, (void *)bad_regs->sp, 5*8); /* Copy the remainder of the stack from the current stack. */ - __memcpy(&tmp, s, offsetof(struct bad_iret_stack, regs.ip)); + __memcpy(&tmp, bad_regs, offsetof(struct pt_regs, ip)); /* Update the entry stack */ __memcpy(new_stack, &tmp, sizeof(tmp)); - BUG_ON(!user_mode(&new_stack->regs)); + BUG_ON(!user_mode(new_stack)); return new_stack; } #endif -- cgit v1.2.3-59-g8ed1b From c89191ce67efa4e5353db6a67f7287c28e673740 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 3 May 2022 11:21:07 +0800 Subject: x86/entry: Convert SWAPGS to swapgs and remove the definition of SWAPGS XENPV doesn't use swapgs_restore_regs_and_return_to_usermode(), error_entry() and the code between entry_SYSENTER_compat() and entry_SYSENTER_compat_after_hwframe. Change the PV-compatible SWAPGS to the ASM instruction swapgs in these places. Also remove the definition of SWAPGS since no more users. 
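[ Background sketch, not part of this patch: the macro being removed relies on boot-time alternatives patching. ALTERNATIVE "old", "new", feature replaces "old" with "new" when the CPU sets the feature flag, so the PV-compatible SWAPGS became a no-op on Xen PV guests and a real swapgs everywhere else. A hypothetical .S fragment showing the same idiom (the macro name below is made up):

	#include <asm/alternative.h>
	#include <asm/cpufeatures.h>

	.macro MAYBE_SWAPGS
		/* patched to nothing when X86_FEATURE_XENPV is set */
		ALTERNATIVE "swapgs", "", X86_FEATURE_XENPV
	.endm

  The entry paths converted here are never used by Xen PV, so the bare instruction is sufficient. ]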
Signed-off-by: Lai Jiangshan Signed-off-by: Borislav Petkov Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20220503032107.680190-7-jiangshanlai@gmail.com --- arch/x86/entry/entry_64.S | 6 +++--- arch/x86/entry/entry_64_compat.S | 2 +- arch/x86/include/asm/irqflags.h | 8 -------- 3 files changed, 4 insertions(+), 12 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 062aa9d95961..312186612f4e 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1019,7 +1019,7 @@ SYM_CODE_START_LOCAL(error_entry) * We entered from user mode or we're pretending to have entered * from user mode due to an IRET fault. */ - SWAPGS + swapgs FENCE_SWAPGS_USER_ENTRY /* We have user CR3. Change to kernel CR3. */ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax @@ -1051,7 +1051,7 @@ SYM_CODE_START_LOCAL(error_entry) * gsbase and proceed. We'll fix up the exception and land in * .Lgs_change's error handler with kernel gsbase. */ - SWAPGS + swapgs /* * Issue an LFENCE to prevent GS speculation, regardless of whether it is a @@ -1072,7 +1072,7 @@ SYM_CODE_START_LOCAL(error_entry) * We came from an IRET to user mode, so we have user * gsbase and CR3. Switch to kernel gsbase and CR3: */ - SWAPGS + swapgs FENCE_SWAPGS_USER_ENTRY SWITCH_TO_KERNEL_CR3 scratch_reg=%rax diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 4fdb007cddbd..c5aeb0819707 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -50,7 +50,7 @@ SYM_CODE_START(entry_SYSENTER_compat) UNWIND_HINT_EMPTY ENDBR /* Interrupts are off on entry. */ - SWAPGS + swapgs pushq %rax SWITCH_TO_KERNEL_CR3 scratch_reg=%rax diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 111104d1c2cd..7793e52d6237 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -137,14 +137,6 @@ static __always_inline void arch_local_irq_restore(unsigned long flags) if (!arch_irqs_disabled_flags(flags)) arch_local_irq_enable(); } -#else -#ifdef CONFIG_X86_64 -#ifdef CONFIG_XEN_PV -#define SWAPGS ALTERNATIVE "swapgs", "", X86_FEATURE_XENPV -#else -#define SWAPGS swapgs -#endif -#endif #endif /* !__ASSEMBLY__ */ #endif -- cgit v1.2.3-59-g8ed1b From 3ba75c1316390b2bc39c19cb8f0f85922ab3f9ed Mon Sep 17 00:00:00 2001 From: Baskov Evgeniy Date: Thu, 3 Mar 2022 17:21:19 +0300 Subject: efi: libstub: declare DXE services table UEFI DXE services are not yet used in kernel code but are required to manipulate page table memory protection flags. Add required declarations to use DXE services functions. Signed-off-by: Baskov Evgeniy Link: https://lore.kernel.org/r/20220303142120.1975-2-baskov@ispras.ru [ardb: ignore absent DXE table but warn if the signature check fails] Signed-off-by: Ard Biesheuvel --- arch/x86/include/asm/efi.h | 5 +++ drivers/firmware/efi/libstub/efistub.h | 74 +++++++++++++++++++++++++++++++++ drivers/firmware/efi/libstub/x86-stub.c | 9 +++- include/linux/efi.h | 2 + 4 files changed, 89 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 98938a68251c..bed74a0f2932 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -357,6 +357,11 @@ static inline u32 efi64_convert_status(efi_status_t status) runtime), \ func, __VA_ARGS__)) +#define efi_dxe_call(func, ...) \ + (efi_is_native() \ + ? 
efi_dxe_table->func(__VA_ARGS__) \ + : __efi64_thunk_map(efi_dxe_table, func, __VA_ARGS__)) + #else /* CONFIG_EFI_MIXED */ static inline bool efi_is_64bit(void) diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index edb77b0621ea..2dc24776899a 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -36,6 +36,9 @@ extern bool efi_novamap; extern const efi_system_table_t *efi_system_table; +typedef union efi_dxe_services_table efi_dxe_services_table_t; +extern const efi_dxe_services_table_t *efi_dxe_table; + efi_status_t __efiapi efi_pe_entry(efi_handle_t handle, efi_system_table_t *sys_table_arg); @@ -44,6 +47,7 @@ efi_status_t __efiapi efi_pe_entry(efi_handle_t handle, #define efi_is_native() (true) #define efi_bs_call(func, ...) efi_system_table->boottime->func(__VA_ARGS__) #define efi_rt_call(func, ...) efi_system_table->runtime->func(__VA_ARGS__) +#define efi_dxe_call(func, ...) efi_dxe_table->func(__VA_ARGS__) #define efi_table_attr(inst, attr) (inst->attr) #define efi_call_proto(inst, func, ...) inst->func(inst, ##__VA_ARGS__) @@ -329,6 +333,76 @@ union efi_boot_services { } mixed_mode; }; +typedef enum { + EfiGcdMemoryTypeNonExistent, + EfiGcdMemoryTypeReserved, + EfiGcdMemoryTypeSystemMemory, + EfiGcdMemoryTypeMemoryMappedIo, + EfiGcdMemoryTypePersistent, + EfiGcdMemoryTypeMoreReliable, + EfiGcdMemoryTypeMaximum +} efi_gcd_memory_type_t; + +typedef struct { + efi_physical_addr_t base_address; + u64 length; + u64 capabilities; + u64 attributes; + efi_gcd_memory_type_t gcd_memory_type; + void *image_handle; + void *device_handle; +} efi_gcd_memory_space_desc_t; + +/* + * EFI DXE Services table + */ +union efi_dxe_services_table { + struct { + efi_table_hdr_t hdr; + void *add_memory_space; + void *allocate_memory_space; + void *free_memory_space; + void *remove_memory_space; + efi_status_t (__efiapi *get_memory_space_descriptor)(efi_physical_addr_t, + efi_gcd_memory_space_desc_t *); + efi_status_t (__efiapi *set_memory_space_attributes)(efi_physical_addr_t, + u64, u64); + void *get_memory_space_map; + void *add_io_space; + void *allocate_io_space; + void *free_io_space; + void *remove_io_space; + void *get_io_space_descriptor; + void *get_io_space_map; + void *dispatch; + void *schedule; + void *trust; + void *process_firmware_volume; + void *set_memory_space_capabilities; + }; + struct { + efi_table_hdr_t hdr; + u32 add_memory_space; + u32 allocate_memory_space; + u32 free_memory_space; + u32 remove_memory_space; + u32 get_memory_space_descriptor; + u32 set_memory_space_attributes; + u32 get_memory_space_map; + u32 add_io_space; + u32 allocate_io_space; + u32 free_io_space; + u32 remove_io_space; + u32 get_io_space_descriptor; + u32 get_io_space_map; + u32 dispatch; + u32 schedule; + u32 trust; + u32 process_firmware_volume; + u32 set_memory_space_capabilities; + } mixed_mode; +}; + typedef union efi_uga_draw_protocol efi_uga_draw_protocol_t; union efi_uga_draw_protocol { diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c index 01ddd4502e28..cda60aba2f12 100644 --- a/drivers/firmware/efi/libstub/x86-stub.c +++ b/drivers/firmware/efi/libstub/x86-stub.c @@ -22,6 +22,7 @@ #define MAXMEM_X86_64_4LEVEL (1ull << 46) const efi_system_table_t *efi_system_table; +const efi_dxe_services_table_t *efi_dxe_table; extern u32 image_offset; static efi_loaded_image_t *image = NULL; @@ -677,11 +678,17 @@ unsigned long efi_main(efi_handle_t handle, efi_status_t status; 
efi_system_table = sys_table_arg; - /* Check if we were booted by the EFI firmware */ if (efi_system_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) efi_exit(handle, EFI_INVALID_PARAMETER); + efi_dxe_table = get_efi_config_table(EFI_DXE_SERVICES_TABLE_GUID); + if (efi_dxe_table && + efi_dxe_table->hdr.signature != EFI_DXE_SERVICES_TABLE_SIGNATURE) { + efi_warn("Ignoring DXE services table: invalid signature\n"); + efi_dxe_table = NULL; + } + /* * If the kernel isn't already loaded at a suitable address, * relocate it. diff --git a/include/linux/efi.h b/include/linux/efi.h index fd266198f9d8..b1f7c6a3e5bf 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -385,6 +385,7 @@ void efi_native_runtime_setup(void); #define EFI_LOAD_FILE_PROTOCOL_GUID EFI_GUID(0x56ec3091, 0x954c, 0x11d2, 0x8e, 0x3f, 0x00, 0xa0, 0xc9, 0x69, 0x72, 0x3b) #define EFI_LOAD_FILE2_PROTOCOL_GUID EFI_GUID(0x4006c0c1, 0xfcb3, 0x403e, 0x99, 0x6d, 0x4a, 0x6c, 0x87, 0x24, 0xe0, 0x6d) #define EFI_RT_PROPERTIES_TABLE_GUID EFI_GUID(0xeb66918a, 0x7eef, 0x402a, 0x84, 0x2e, 0x93, 0x1d, 0x21, 0xc3, 0x8a, 0xe9) +#define EFI_DXE_SERVICES_TABLE_GUID EFI_GUID(0x05ad34ba, 0x6f02, 0x4214, 0x95, 0x2e, 0x4d, 0xa0, 0x39, 0x8e, 0x2b, 0xb9) #define EFI_IMAGE_SECURITY_DATABASE_GUID EFI_GUID(0xd719b2cb, 0x3d3a, 0x4596, 0xa3, 0xbc, 0xda, 0xd0, 0x0e, 0x67, 0x65, 0x6f) #define EFI_SHIM_LOCK_GUID EFI_GUID(0x605dab50, 0xe046, 0x4300, 0xab, 0xb6, 0x3d, 0xd8, 0x10, 0xdd, 0x8b, 0x23) @@ -438,6 +439,7 @@ typedef struct { } efi_config_table_type_t; #define EFI_SYSTEM_TABLE_SIGNATURE ((u64)0x5453595320494249ULL) +#define EFI_DXE_SERVICES_TABLE_SIGNATURE ((u64)0x565245535f455844ULL) #define EFI_2_30_SYSTEM_TABLE_REVISION ((2 << 16) | (30)) #define EFI_2_20_SYSTEM_TABLE_REVISION ((2 << 16) | (20)) -- cgit v1.2.3-59-g8ed1b From d6d0c7f681fda1d07e005c8f653e578b77a0eb40 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 21 Apr 2022 11:16:53 +0530 Subject: x86/cpufeatures: Add PerfMonV2 feature bit CPUID leaf 0x80000022 i.e. ExtPerfMonAndDbg advertises some new performance monitoring features for AMD processors. Bit 0 of EAX indicates support for Performance Monitoring Version 2 (PerfMonV2) features. If found to be set during PMU initialization, the EBX bits of the same CPUID function can be used to determine the number of available PMCs for different PMU types. Additionally, Core PMCs can be managed using new global control and status registers. For better utilization of feature words, PerfMonV2 is added as a scattered feature bit. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/c70e497e22f18e7f05b025bb64ca21cc12b17792.1650515382.git.sandipan.das@amd.com --- arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/kernel/cpu/scattered.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 0d62afd525e3..b50e0872ad1e 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -211,7 +211,7 @@ #define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */ #define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ #define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */ -/* FREE! 
( 7*32+20) */ +#define X86_FEATURE_PERFMON_V2 ( 7*32+20) /* AMD Performance Monitoring Version 2 */ #define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */ #define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */ #define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. */ diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 4143b1e4c5c6..dbaa8326d6f2 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -43,6 +43,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, + { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, { 0, 0, 0, 0, 0 } }; -- cgit v1.2.3-59-g8ed1b From 089be16d5992dd0bc6df15ef12042fd1023ded9a Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 21 Apr 2022 11:16:54 +0530 Subject: x86/msr: Add PerfCntrGlobal* registers Add MSR definitions that will be used to enable the new AMD Performance Monitoring Version 2 (PerfMonV2) features. These include: * Performance Counter Global Control (PerfCntrGlobalCtl) * Performance Counter Global Status (PerfCntrGlobalStatus) * Performance Counter Global Status Clear (PerfCntrGlobalStatusClr) The new Performance Counter Global Control and Status MSRs provide an interface for enabling or disabling multiple counters at the same time and for testing overflow without probing the individual registers for each PMC. The availability of these registers is indicated through the PerfMonV2 feature bit of CPUID leaf 0x80000022 EAX. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/cdc0d8f75bd519848731b5c64d924f5a0619a573.1650515382.git.sandipan.das@amd.com --- arch/x86/include/asm/msr-index.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 8179ea351bd8..58a44dceef9a 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -524,6 +524,11 @@ #define AMD_CPPC_DES_PERF(x) (((x) & 0xff) << 16) #define AMD_CPPC_ENERGY_PERF_PREF(x) (((x) & 0xff) << 24) +/* AMD Performance Counter Global Status and Control MSRs */ +#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS 0xc0000300 +#define MSR_AMD64_PERF_CNTR_GLOBAL_CTL 0xc0000301 +#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR 0xc0000302 + /* Fam 17h MSRs */ #define MSR_F17H_IRPERF 0xc00000e9 -- cgit v1.2.3-59-g8ed1b From 56e026a7ca3f92b8e44359e1f705febd1833f701 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 21 Apr 2022 11:16:56 +0530 Subject: perf/x86/amd/core: Detect available counters If AMD Performance Monitoring Version 2 (PerfMonV2) is supported, use CPUID leaf 0x80000022 EBX to detect the number of Core PMCs. This offers more flexibility if the counts change in later processor families. 
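[ Illustration, not part of this patch: a rough user-space sketch of the enumeration this relies on, using GCC's <cpuid.h> helper (hypothetical standalone program; assumes the CPU exposes leaf 0x80000022):

	#include <stdio.h>
	#include <cpuid.h>

	int main(void)
	{
		unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;

		/* ExtPerfMonAndDbg leaf, subleaf 0 */
		if (!__get_cpuid_count(0x80000022, 0, &eax, &ebx, &ecx, &edx))
			return 1;	/* leaf not available */

		/* EAX[0]: PerfMonV2 supported; EBX[3:0]: number of Core PMCs */
		printf("PerfMonV2: %s, core PMCs: %u\n",
		       (eax & 1) ? "yes" : "no", ebx & 0xf);
		return 0;
	}

  The kernel side reads the same EBX field through the cpuid_0x80000022_ebx union added below. ]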
Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/68a6d9688df189267db26530378870edd34f7b06.1650515382.git.sandipan.das@amd.com --- arch/x86/events/amd/core.c | 6 ++++++ arch/x86/include/asm/perf_event.h | 17 +++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index b70dfa028ba5..52fd7941a724 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1186,6 +1186,7 @@ static const struct attribute_group *amd_attr_update[] = { static int __init amd_core_pmu_init(void) { + union cpuid_0x80000022_ebx ebx; u64 even_ctr_mask = 0ULL; int i; @@ -1206,9 +1207,14 @@ static int __init amd_core_pmu_init(void) /* Check for Performance Monitoring v2 support */ if (boot_cpu_has(X86_FEATURE_PERFMON_V2)) { + ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES); + /* Update PMU version for later usage */ x86_pmu.version = 2; + /* Find the number of available Core PMCs */ + x86_pmu.num_counters = ebx.split.num_core_pmc; + amd_pmu_global_cntr_mask = (1ULL << x86_pmu.num_counters) - 1; } diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 8199fc5a37ea..c6cc07f46556 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -186,6 +186,18 @@ union cpuid28_ecx { unsigned int full; }; +/* + * AMD "Extended Performance Monitoring and Debug" CPUID + * detection/enumeration details: + */ +union cpuid_0x80000022_ebx { + struct { + /* Number of Core Performance Counters */ + unsigned int num_core_pmc:4; + } split; + unsigned int full; +}; + struct x86_pmu_capability { int version; int num_counters_gp; @@ -367,6 +379,11 @@ struct pebs_xmm { u64 xmm[16*2]; /* two entries for each register */ }; +/* + * AMD Extended Performance Monitoring and Debug cpuid feature detection + */ +#define EXT_PERFMON_DEBUG_FEATURES 0x80000022 + /* * IBS cpuid feature detection */ -- cgit v1.2.3-59-g8ed1b From a1e2c031ec3949b8c039b739c0b5bf9c30007b00 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 6 May 2022 14:14:32 +0200 Subject: x86/mm: Simplify RESERVE_BRK() RESERVE_BRK() reserves data in the .brk_reservation section. The data is initialized to zero, like BSS, so the macro specifies 'nobits' to prevent the data from taking up space in the vmlinux binary. The only way to get the compiler to do that (without putting the variable in .bss proper) is to use inline asm. The macro also has a hack which encloses the inline asm in a discarded function, which allows the size to be passed (global inline asm doesn't allow inputs). Remove the need for the discarded function hack by just stringifying the size rather than supplying it as an input to the inline asm. Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Reviewed-by: Borislav Petkov Link: https://lore.kernel.org/r/20220506121631.133110232@infradead.org --- arch/x86/include/asm/setup.h | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 896e48d45828..bec5ff4d6d98 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -109,27 +109,19 @@ extern unsigned long _brk_end; void *extend_brk(size_t size, size_t align); /* - * Reserve space in the brk section. The name must be unique within - * the file, and somewhat descriptive. The size is in bytes. 
Must be - * used at file scope. + * Reserve space in the brk section. The name must be unique within the file, + * and somewhat descriptive. The size is in bytes. * - * (This uses a temp function to wrap the asm so we can pass it the - * size parameter; otherwise we wouldn't be able to. We can't use a - * "section" attribute on a normal variable because it always ends up - * being @progbits, which ends up allocating space in the vmlinux - * executable.) + * The allocation is done using inline asm (rather than using a section + * attribute on a normal variable) in order to allow the use of @nobits, so + * that it doesn't take up any space in the vmlinux file. */ -#define RESERVE_BRK(name,sz) \ - static void __section(".discard.text") __noendbr __used notrace \ - __brk_reservation_fn_##name##__(void) { \ - asm volatile ( \ - ".pushsection .brk_reservation,\"aw\",@nobits;" \ - ".brk." #name ":" \ - " 1:.skip %c0;" \ - " .size .brk." #name ", . - 1b;" \ - " .popsection" \ - : : "i" (sz)); \ - } +#define RESERVE_BRK(name, size) \ + asm(".pushsection .brk_reservation,\"aw\",@nobits\n\t" \ + ".brk." #name ":\n\t" \ + ".skip " __stringify(size) "\n\t" \ + ".size .brk." #name ", " __stringify(size) "\n\t" \ + ".popsection\n\t") extern void probe_roms(void); #ifdef __i386__ -- cgit v1.2.3-59-g8ed1b From 5bd2e97c868a8a44470950ed01846cab6328e540 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 12 Apr 2022 10:18:48 -0500 Subject: fork: Generalize PF_IO_WORKER handling Add fn and fn_arg members into struct kernel_clone_args and test for them in copy_thread (instead of testing for PF_KTHREAD | PF_IO_WORKER). This allows any task that wants to be a user space task that only runs in kernel mode to use this functionality. The code on x86 is an exception and still retains a PF_KTHREAD test because x86, unlike everything else, handles kthreads slightly differently than user space tasks that start with a function. The functions that created tasks that start with a function have been updated to set ".fn" and ".fn_arg" instead of ".stack" and ".stack_size". These functions are fork_idle(), create_io_thread(), kernel_thread(), and user_mode_thread(). Link: https://lkml.kernel.org/r/20220506141512.516114-4-ebiederm@xmission.com Signed-off-by: "Eric W.
Biederman" --- arch/alpha/kernel/process.c | 7 +++---- arch/arc/kernel/process.c | 7 +++---- arch/arm/kernel/process.c | 7 +++---- arch/arm64/kernel/process.c | 7 +++---- arch/csky/kernel/process.c | 7 +++---- arch/h8300/kernel/process.c | 7 +++---- arch/hexagon/kernel/process.c | 7 +++---- arch/ia64/kernel/process.c | 6 +++--- arch/m68k/kernel/process.c | 7 +++---- arch/microblaze/kernel/process.c | 7 +++---- arch/mips/kernel/process.c | 7 +++---- arch/nios2/kernel/process.c | 7 +++---- arch/openrisc/kernel/process.c | 7 +++---- arch/parisc/kernel/process.c | 11 +++++------ arch/powerpc/kernel/process.c | 9 ++++----- arch/riscv/kernel/process.c | 7 +++---- arch/s390/kernel/process.c | 7 +++---- arch/sh/kernel/process_32.c | 7 +++---- arch/sparc/kernel/process_32.c | 7 +++---- arch/sparc/kernel/process_64.c | 7 +++---- arch/um/kernel/process.c | 10 ++++------ arch/x86/include/asm/fpu/sched.h | 2 +- arch/x86/include/asm/switch_to.h | 8 ++++---- arch/x86/kernel/fpu/core.c | 4 ++-- arch/x86/kernel/process.c | 13 ++++++------- arch/xtensa/kernel/process.c | 11 +++++------ include/linux/sched/task.h | 2 ++ kernel/fork.c | 16 ++++++++-------- 28 files changed, 95 insertions(+), 116 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 732e39217c7f..6cbba7370b4e 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -237,7 +237,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long usp = args->stack; - unsigned long kthread_arg = args->stack_size; unsigned long tls = args->tls; extern void ret_from_fork(void); extern void ret_from_kernel_thread(void); @@ -251,13 +250,13 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) childti->pcb.ksp = (unsigned long) childstack; childti->pcb.flags = 1; /* set FEN, clear everything else */ - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { /* kernel thread */ memset(childstack, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs)); childstack->r26 = (unsigned long) ret_from_kernel_thread; - childstack->r9 = usp; /* function */ - childstack->r10 = kthread_arg; + childstack->r9 = (unsigned long) args->fn; + childstack->r10 = (unsigned long) args->fn_arg; childregs->hae = alpha_mv.hae_cache; childti->pcb.usp = 0; return 0; diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c index caf948ba647c..3369f0700702 100644 --- a/arch/arc/kernel/process.c +++ b/arch/arc/kernel/process.c @@ -166,7 +166,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long usp = args->stack; - unsigned long kthread_arg = args->stack_size; unsigned long tls = args->tls; struct pt_regs *c_regs; /* child's pt_regs */ unsigned long *childksp; /* to unwind out of __switch_to() */ @@ -193,11 +192,11 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) childksp[0] = 0; /* fp */ childksp[1] = (unsigned long)ret_from_fork; /* blink */ - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { memset(c_regs, 0, sizeof(struct pt_regs)); - c_callee->r13 = kthread_arg; - c_callee->r14 = usp; /* function */ + c_callee->r13 = (unsigned long)args->fn_arg; + c_callee->r14 = (unsigned long)args->fn; return 0; } diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 8e13b426dd26..3d9cace63884 100644 --- 
a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -242,7 +242,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long stack_start = args->stack; - unsigned long stk_sz = args->stack_size; unsigned long tls = args->tls; struct thread_info *thread = task_thread_info(p); struct pt_regs *childregs = task_pt_regs(p); @@ -259,15 +258,15 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) thread->cpu_domain = get_domain(); #endif - if (likely(!(p->flags & (PF_KTHREAD | PF_IO_WORKER)))) { + if (likely(!args->fn)) { *childregs = *current_pt_regs(); childregs->ARM_r0 = 0; if (stack_start) childregs->ARM_sp = stack_start; } else { memset(childregs, 0, sizeof(struct pt_regs)); - thread->cpu_context.r4 = stk_sz; - thread->cpu_context.r5 = stack_start; + thread->cpu_context.r4 = (unsigned long)args->fn_arg; + thread->cpu_context.r5 = (unsigned long)args->fn; childregs->ARM_cpsr = SVC_MODE; } thread->cpu_context.pc = (unsigned long)ret_from_fork; diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index e002f6681c8d..d0ef05c661b0 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -320,7 +320,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long stack_start = args->stack; - unsigned long stk_sz = args->stack_size; unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); @@ -337,7 +336,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) ptrauth_thread_init_kernel(p); - if (likely(!(p->flags & (PF_KTHREAD | PF_IO_WORKER)))) { + if (likely(!args->fn)) { *childregs = *current_pt_regs(); childregs->regs[0] = 0; @@ -371,8 +370,8 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) memset(childregs, 0, sizeof(struct pt_regs)); childregs->pstate = PSR_MODE_EL1h | PSR_IL_BIT; - p->thread.cpu_context.x19 = stack_start; - p->thread.cpu_context.x20 = stk_sz; + p->thread.cpu_context.x19 = (unsigned long)args->fn; + p->thread.cpu_context.x20 = (unsigned long)args->fn_arg; } p->thread.cpu_context.pc = (unsigned long)ret_from_fork; p->thread.cpu_context.sp = (unsigned long)childregs; diff --git a/arch/csky/kernel/process.c b/arch/csky/kernel/process.c index 7dba33d37e1a..9af49aea1c3b 100644 --- a/arch/csky/kernel/process.c +++ b/arch/csky/kernel/process.c @@ -34,7 +34,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long usp = args->stack; - unsigned long kthread_arg = args->stack_size; unsigned long tls = args->tls; struct switch_stack *childstack; struct pt_regs *childregs = task_pt_regs(p); @@ -49,11 +48,11 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) /* setup thread.sp for switch_to !!! 
*/ p->thread.sp = (unsigned long)childstack; - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { memset(childregs, 0, sizeof(struct pt_regs)); childstack->r15 = (unsigned long) ret_from_kernel_thread; - childstack->r10 = kthread_arg; - childstack->r9 = usp; + childstack->r10 = (unsigned long) args->fn_arg; + childstack->r9 = (unsigned long) args->fn; childregs->sr = mfcr("psr"); } else { *childregs = *(current_pt_regs()); diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c index 752cbd9b0bf6..9028262c96a9 100644 --- a/arch/h8300/kernel/process.c +++ b/arch/h8300/kernel/process.c @@ -108,16 +108,15 @@ void flush_thread(void) int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long usp = args->stack; - unsigned long topstk = args->stack_size; struct pt_regs *childregs; childregs = (struct pt_regs *) (THREAD_SIZE + task_stack_page(p)) - 1; - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { memset(childregs, 0, sizeof(struct pt_regs)); childregs->retpc = (unsigned long) ret_from_kernel_thread; - childregs->er4 = topstk; /* arg */ - childregs->er5 = usp; /* fn */ + childregs->er4 = (unsigned long) args->fn_arg; + childregs->er5 = (unsigned long) args->fn; } else { *childregs = *current_pt_regs(); childregs->er0 = 0; diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c index f1c1f6f21941..f0552f98a7ba 100644 --- a/arch/hexagon/kernel/process.c +++ b/arch/hexagon/kernel/process.c @@ -54,7 +54,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long usp = args->stack; - unsigned long arg = args->stack_size; unsigned long tls = args->tls; struct thread_info *ti = task_thread_info(p); struct hexagon_switch_stack *ss; @@ -76,11 +75,11 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) sizeof(*ss)); ss->lr = (unsigned long)ret_from_fork; p->thread.switch_sp = ss; - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { memset(childregs, 0, sizeof(struct pt_regs)); /* r24 <- fn, r25 <- arg */ - ss->r24 = usp; - ss->r25 = arg; + ss->r24 = (unsigned long)args->fn; + ss->r25 = (unsigned long)args->fn_arg; pt_set_kmode(childregs); return 0; } diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 8f010ae818bc..167b1765bea1 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -341,14 +341,14 @@ copy_thread(struct task_struct *p, const struct kernel_clone_args *args) ia64_drop_fpu(p); /* don't pick up stale state from a CPU's fph */ - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { if (unlikely(args->idle)) { /* fork_idle() called us */ return 0; } memset(child_stack, 0, sizeof(*child_ptregs) + sizeof(*child_stack)); - child_stack->r4 = user_stack_base; /* payload */ - child_stack->r5 = user_stack_size; /* argument */ + child_stack->r4 = (unsigned long) args->fn; + child_stack->r5 = (unsigned long) args->fn_arg; /* * Preserve PSR bits, except for bits 32-34 and 37-45, * which we can't read. 
diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index 8ac575656fc4..221feb0269f1 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -142,7 +142,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long usp = args->stack; - unsigned long arg = args->stack_size; unsigned long tls = args->tls; struct fork_frame { struct switch_stack sw; @@ -160,12 +159,12 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) */ p->thread.fc = USER_DATA; - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { /* kernel thread */ memset(frame, 0, sizeof(struct fork_frame)); frame->regs.sr = PS_S; - frame->sw.a3 = usp; /* function */ - frame->sw.d7 = arg; + frame->sw.a3 = (unsigned long)args->fn; + frame->sw.d7 = (unsigned long)args->fn_arg; frame->sw.retpc = (unsigned long)ret_from_kernel_thread; p->thread.usp = 0; return 0; diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c index b5f549125c6a..3c6241bcaea8 100644 --- a/arch/microblaze/kernel/process.c +++ b/arch/microblaze/kernel/process.c @@ -56,19 +56,18 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long usp = args->stack; - unsigned long arg = args->stack_size; unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); struct thread_info *ti = task_thread_info(p); - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { /* if we're creating a new kernel thread then just zeroing all * the registers. That's OK for a brand new thread.*/ memset(childregs, 0, sizeof(struct pt_regs)); memset(&ti->cpu_context, 0, sizeof(struct cpu_context)); ti->cpu_context.r1 = (unsigned long)childregs; - ti->cpu_context.r20 = (unsigned long)usp; /* fn */ - ti->cpu_context.r19 = (unsigned long)arg; + ti->cpu_context.r20 = (unsigned long)args->fn; + ti->cpu_context.r19 = (unsigned long)args->fn_arg; childregs->pt_mode = 1; local_save_flags(childregs->msr); ti->cpu_context.msr = childregs->msr & ~MSR_IE; diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index a572d097b16b..35b912bce429 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -109,7 +109,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long usp = args->stack; - unsigned long kthread_arg = args->stack_size; unsigned long tls = args->tls; struct thread_info *ti = task_thread_info(p); struct pt_regs *childregs, *regs = current_pt_regs(); @@ -122,12 +121,12 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) /* Put the stack after the struct pt_regs. 
*/ childksp = (unsigned long) childregs; p->thread.cp0_status = (read_c0_status() & ~(ST0_CU2|ST0_CU1)) | ST0_KERNEL_CUMASK; - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { /* kernel thread */ unsigned long status = p->thread.cp0_status; memset(childregs, 0, sizeof(struct pt_regs)); - p->thread.reg16 = usp; /* fn */ - p->thread.reg17 = kthread_arg; + p->thread.reg16 = (unsigned long)args->fn; + p->thread.reg17 = (unsigned long)args->fn_arg; p->thread.reg29 = childksp; p->thread.reg31 = (unsigned long) ret_from_kernel_thread; #if defined(CONFIG_CPU_R3000) diff --git a/arch/nios2/kernel/process.c b/arch/nios2/kernel/process.c index 98c4bfe972e0..29593b98567d 100644 --- a/arch/nios2/kernel/process.c +++ b/arch/nios2/kernel/process.c @@ -104,7 +104,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long usp = args->stack; - unsigned long arg = args->stack_size; unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); struct pt_regs *regs; @@ -112,12 +111,12 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) struct switch_stack *childstack = ((struct switch_stack *)childregs) - 1; - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { memset(childstack, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs)); - childstack->r16 = usp; /* fn */ - childstack->r17 = arg; + childstack->r16 = (unsigned long) args->fn; + childstack->r17 = (unsigned long) args->fn_arg; childstack->ra = (unsigned long) ret_from_kernel_thread; childregs->estatus = STATUS_PIE; childregs->sp = (unsigned long) childstack; diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c index 486e46dd5883..d9697cc9bc4d 100644 --- a/arch/openrisc/kernel/process.c +++ b/arch/openrisc/kernel/process.c @@ -156,7 +156,6 @@ copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long usp = args->stack; - unsigned long arg = args->stack_size; unsigned long tls = args->tls; struct pt_regs *userregs; struct pt_regs *kregs; @@ -175,10 +174,10 @@ copy_thread(struct task_struct *p, const struct kernel_clone_args *args) sp -= sizeof(struct pt_regs); kregs = (struct pt_regs *)sp; - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { memset(kregs, 0, sizeof(struct pt_regs)); - kregs->gpr[20] = usp; /* fn, kernel thread */ - kregs->gpr[22] = arg; + kregs->gpr[20] = (unsigned long)args->fn; + kregs->gpr[22] = (unsigned long)args->fn_arg; } else { *userregs = *current_pt_regs(); diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 30a5874ca845..a6a2a558fc5b 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -210,7 +210,6 @@ copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long usp = args->stack; - unsigned long kthread_arg = args->stack_size; unsigned long tls = args->tls; struct pt_regs *cregs = &(p->thread.regs); void *stack = task_stack_page(p); @@ -221,7 +220,7 @@ copy_thread(struct task_struct *p, const struct kernel_clone_args *args) extern void * const ret_from_kernel_thread; extern void * const child_return; - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { /* kernel thread */ memset(cregs, 0, sizeof(struct pt_regs)); if (args->idle) /* idle thread */ @@ -236,12 
+235,12 @@ copy_thread(struct task_struct *p, const struct kernel_clone_args *args) * ret_from_kernel_thread. */ #ifdef CONFIG_64BIT - cregs->gr[27] = ((unsigned long *)usp)[3]; - cregs->gr[26] = ((unsigned long *)usp)[2]; + cregs->gr[27] = ((unsigned long *)args->fn)[3]; + cregs->gr[26] = ((unsigned long *)args->fn)[2]; #else - cregs->gr[26] = usp; + cregs->gr[26] = (unsigned long) args->fn; #endif - cregs->gr[25] = kthread_arg; + cregs->gr[25] = (unsigned long) args->fn_arg; } else { /* user thread */ /* usp must be word aligned. This also prevents users from diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 3fd67c861d54..4f367bb68906 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1720,7 +1720,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long usp = args->stack; - unsigned long kthread_arg = args->stack_size; unsigned long tls = args->tls; struct pt_regs *childregs, *kregs; extern void ret_from_fork(void); @@ -1738,18 +1737,18 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) /* Copy registers */ sp -= sizeof(struct pt_regs); childregs = (struct pt_regs *) sp; - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { /* kernel thread */ memset(childregs, 0, sizeof(struct pt_regs)); childregs->gpr[1] = sp + sizeof(struct pt_regs); /* function */ - if (usp) - childregs->gpr[14] = ppc_function_entry((void *)usp); + if (args->fn) + childregs->gpr[14] = ppc_function_entry((void *)args->fn); #ifdef CONFIG_PPC64 clear_tsk_thread_flag(p, TIF_32BIT); childregs->softe = IRQS_ENABLED; #endif - childregs->gpr[15] = kthread_arg; + childregs->gpr[15] = (unsigned long)args->fn_arg; p->thread.regs = NULL; /* no user register state */ ti->flags |= _TIF_RESTOREALL; f = ret_from_kernel_thread; diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index 334382731725..24efabdbc551 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -124,12 +124,11 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long usp = args->stack; - unsigned long arg = args->stack_size; unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); /* p->thread holds context to be restored by __switch_to() */ - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { /* Kernel thread */ memset(childregs, 0, sizeof(struct pt_regs)); childregs->gp = gp_in_global; @@ -137,8 +136,8 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) childregs->status = SR_PP | SR_PIE; p->thread.ra = (unsigned long)ret_from_kernel_thread; - p->thread.s[0] = usp; /* fn */ - p->thread.s[1] = arg; + p->thread.s[0] = (unsigned long)args->fn; + p->thread.s[1] = (unsigned long)args->fn_arg; } else { *childregs = *(current_pt_regs()); if (usp) /* User fork */ diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index bb5daec39516..89949b9f3cf8 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -98,7 +98,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long new_stackp = args->stack; - unsigned long arg = args->stack_size; unsigned long tls = args->tls; struct fake_frame { @@ -133,15 +132,15 @@ int copy_thread(struct task_struct *p, const 
struct kernel_clone_args *args) frame->sf.gprs[9] = (unsigned long)frame; /* Store access registers to kernel stack of new process. */ - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { /* kernel thread */ memset(&frame->childregs, 0, sizeof(struct pt_regs)); frame->childregs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_DAT | PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; frame->childregs.psw.addr = (unsigned long)__ret_from_fork; - frame->childregs.gprs[9] = new_stackp; /* function */ - frame->childregs.gprs[10] = arg; + frame->childregs.gprs[9] = (unsigned long)args->fn; + frame->childregs.gprs[10] = (unsigned long)args->fn_arg; frame->childregs.orig_gpr2 = -1; frame->childregs.last_break = 1; return 0; diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index 6023399b1892..a808843375e7 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -96,7 +96,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long usp = args->stack; - unsigned long arg = args->stack_size; unsigned long tls = args->tls; struct thread_info *ti = task_thread_info(p); struct pt_regs *childregs; @@ -117,11 +116,11 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) childregs = task_pt_regs(p); p->thread.sp = (unsigned long) childregs; - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { memset(childregs, 0, sizeof(struct pt_regs)); p->thread.pc = (unsigned long) ret_from_kernel_thread; - childregs->regs[4] = arg; - childregs->regs[5] = usp; + childregs->regs[4] = (unsigned long) args->fn_arg; + childregs->regs[5] = (unsigned long) args->fn; childregs->sr = SR_MD; #if defined(CONFIG_SH_FPU) childregs->sr |= SR_FD; diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index 80e6775e18c0..33b0215a4182 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -263,7 +263,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long sp = args->stack; - unsigned long arg = args->stack_size; unsigned long tls = args->tls; struct thread_info *ti = task_thread_info(p); struct pt_regs *childregs, *regs = current_pt_regs(); @@ -299,13 +298,13 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) ti->ksp = (unsigned long) new_stack; p->thread.kregs = childregs; - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { extern int nwindows; unsigned long psr; memset(new_stack, 0, STACKFRAME_SZ + TRACEREG_SZ); ti->kpc = (((unsigned long) ret_from_kernel_thread) - 0x8); - childregs->u_regs[UREG_G1] = sp; /* function */ - childregs->u_regs[UREG_G2] = arg; + childregs->u_regs[UREG_G1] = (unsigned long) args->fn; + childregs->u_regs[UREG_G2] = (unsigned long) args->fn_arg; psr = childregs->psr = get_psr(); ti->kpsr = psr | PSR_PIL; ti->kwim = 1 << (((psr & PSR_CWP) + 1) % nwindows); diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 38c46ca826d9..6335b698a4b4 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -568,7 +568,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long sp = args->stack; - unsigned long arg = args->stack_size; unsigned long tls = args->tls; struct thread_info *t = 
task_thread_info(p); struct pt_regs *regs = current_pt_regs(); @@ -587,12 +586,12 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) sizeof(struct sparc_stackf)); t->fpsaved[0] = 0; - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(args->fn)) { memset(child_trap_frame, 0, child_stack_sz); __thread_flag_byte_ptr(t)[TI_FLAG_BYTE_CWP] = (current_pt_regs()->tstate + 1) & TSTATE_CWP; - t->kregs->u_regs[UREG_G1] = sp; /* function */ - t->kregs->u_regs[UREG_G2] = arg; + t->kregs->u_regs[UREG_G1] = (unsigned long) args->fn; + t->kregs->u_regs[UREG_G2] = (unsigned long) args->fn_arg; return 0; } diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index fd2d2361484d..181cc9aafb25 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -158,15 +158,13 @@ int copy_thread(struct task_struct * p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long sp = args->stack; - unsigned long arg = args->stack_size; unsigned long tls = args->tls; void (*handler)(void); - int kthread = current->flags & (PF_KTHREAD | PF_IO_WORKER); int ret = 0; p->thread = (struct thread_struct) INIT_THREAD; - if (!kthread) { + if (!args->fn) { memcpy(&p->thread.regs.regs, current_pt_regs(), sizeof(p->thread.regs.regs)); PT_REGS_SET_SYSCALL_RETURN(&p->thread.regs, 0); @@ -178,14 +176,14 @@ int copy_thread(struct task_struct * p, const struct kernel_clone_args *args) arch_copy_thread(¤t->thread.arch, &p->thread.arch); } else { get_safe_registers(p->thread.regs.regs.gp, p->thread.regs.regs.fp); - p->thread.request.u.thread.proc = (int (*)(void *))sp; - p->thread.request.u.thread.arg = (void *)arg; + p->thread.request.u.thread.proc = args->fn; + p->thread.request.u.thread.arg = args->fn_arg; handler = new_thread_handler; } new_thread(task_stack_page(p), &p->thread.switch_buf, handler); - if (!kthread) { + if (!args->fn) { clear_flushed_tls(p); /* diff --git a/arch/x86/include/asm/fpu/sched.h b/arch/x86/include/asm/fpu/sched.h index 99a8820e8cc4..b2486b2cbc6e 100644 --- a/arch/x86/include/asm/fpu/sched.h +++ b/arch/x86/include/asm/fpu/sched.h @@ -11,7 +11,7 @@ extern void save_fpregs_to_fpstate(struct fpu *fpu); extern void fpu__drop(struct fpu *fpu); -extern int fpu_clone(struct task_struct *dst, unsigned long clone_flags); +extern int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal); extern void fpu_flush_thread(void); /* diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index b5f0d2ff47e4..c08eb0fdd11f 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -78,13 +78,13 @@ static inline void update_task_stack(struct task_struct *task) } static inline void kthread_frame_init(struct inactive_task_frame *frame, - unsigned long fun, unsigned long arg) + int (*fun)(void *), void *arg) { - frame->bx = fun; + frame->bx = (unsigned long)fun; #ifdef CONFIG_X86_32 - frame->di = arg; + frame->di = (unsigned long)arg; #else - frame->r12 = arg; + frame->r12 = (unsigned long)arg; #endif } diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index c049561f373a..fbade5a3975b 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -556,7 +556,7 @@ static inline void fpu_inherit_perms(struct fpu *dst_fpu) } /* Clone current's FPU state on fork */ -int fpu_clone(struct task_struct *dst, unsigned long clone_flags) +int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal) { struct fpu 
*src_fpu = ¤t->thread.fpu; struct fpu *dst_fpu = &dst->thread.fpu; @@ -579,7 +579,7 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags) * No FPU state inheritance for kernel threads and IO * worker threads. */ - if (dst->flags & (PF_KTHREAD | PF_IO_WORKER)) { + if (minimal) { /* Clear out the minimal state */ memcpy(&dst_fpu->fpstate->regs, &init_fpstate.regs, init_fpstate_copy_size()); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 0fce52b10dc4..d20eaad52a85 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -134,7 +134,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long sp = args->stack; - unsigned long arg = args->stack_size; unsigned long tls = args->tls; struct inactive_task_frame *frame; struct fork_frame *fork_frame; @@ -172,13 +171,13 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) frame->flags = X86_EFLAGS_FIXED; #endif - fpu_clone(p, clone_flags); + fpu_clone(p, clone_flags, args->fn); /* Kernel thread ? */ if (unlikely(p->flags & PF_KTHREAD)) { p->thread.pkru = pkru_get_init_value(); memset(childregs, 0, sizeof(struct pt_regs)); - kthread_frame_init(frame, sp, arg); + kthread_frame_init(frame, args->fn, args->fn_arg); return 0; } @@ -198,10 +197,10 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) task_user_gs(p) = get_user_gs(current_pt_regs()); #endif - if (unlikely(p->flags & PF_IO_WORKER)) { + if (unlikely(args->fn)) { /* - * An IO thread is a user space thread, but it doesn't - * return to ret_after_fork(). + * A user space thread, but it doesn't return to + * ret_after_fork(). * * In order to indicate that to tools like gdb, * we reset the stack and instruction pointers. @@ -211,7 +210,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) */ childregs->sp = 0; childregs->ip = 0; - kthread_frame_init(frame, sp, arg); + kthread_frame_init(frame, args->fn, args->fn_arg); return 0; } diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index 15ce25073142..c3751cc88e5d 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -205,7 +205,6 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; unsigned long usp_thread_fn = args->stack; - unsigned long thread_fn_arg = args->stack_size; unsigned long tls = args->tls; struct pt_regs *childregs = task_pt_regs(p); @@ -226,7 +225,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) #error Unsupported Xtensa ABI #endif - if (!(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (!args->fn) { struct pt_regs *regs = current_pt_regs(); unsigned long usp = usp_thread_fn ? usp_thread_fn : regs->areg[1]; @@ -278,15 +277,15 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) * Window underflow will load registers from the * spill slots on the stack on return from _switch_to. */ - SPILL_SLOT(childregs, 2) = usp_thread_fn; - SPILL_SLOT(childregs, 3) = thread_fn_arg; + SPILL_SLOT(childregs, 2) = (unsigned long)args->fn; + SPILL_SLOT(childregs, 3) = (unsigned long)args->fn_arg; #elif defined(__XTENSA_CALL0_ABI__) /* * a12 = thread_fn, a13 = thread_fn arg. * _switch_to epilogue will load registers from the stack. 
*/ - ((unsigned long *)p->thread.sp)[0] = usp_thread_fn; - ((unsigned long *)p->thread.sp)[1] = thread_fn_arg; + ((unsigned long *)p->thread.sp)[0] = (unsigned long)args->fn; + ((unsigned long *)p->thread.sp)[1] = (unsigned long)args->fn_arg; #else #error Unsupported Xtensa ABI #endif diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 3d6b99ce5408..505aaf9fe477 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -34,6 +34,8 @@ struct kernel_clone_args { int io_thread; int kthread; int idle; + int (*fn)(void *); + void *fn_arg; struct cgroup *cgrp; struct css_set *cset; }; diff --git a/kernel/fork.c b/kernel/fork.c index 93d77ee921ff..8e17c3fbce42 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2555,8 +2555,8 @@ struct task_struct * __init fork_idle(int cpu) struct task_struct *task; struct kernel_clone_args args = { .flags = CLONE_VM, - .stack = (unsigned long)&idle_dummy, - .stack_size = (unsigned long)NULL, + .fn = &idle_dummy, + .fn_arg = NULL, .kthread = 1, .idle = 1, }; @@ -2589,8 +2589,8 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) .flags = ((lower_32_bits(flags) | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), .exit_signal = (lower_32_bits(flags) & CSIGNAL), - .stack = (unsigned long)fn, - .stack_size = (unsigned long)arg, + .fn = fn, + .fn_arg = arg, .io_thread = 1, }; @@ -2694,8 +2694,8 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) .flags = ((lower_32_bits(flags) | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), .exit_signal = (lower_32_bits(flags) & CSIGNAL), - .stack = (unsigned long)fn, - .stack_size = (unsigned long)arg, + .fn = fn, + .fn_arg = arg, .kthread = 1, }; @@ -2711,8 +2711,8 @@ pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags) .flags = ((lower_32_bits(flags) | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), .exit_signal = (lower_32_bits(flags) & CSIGNAL), - .stack = (unsigned long)fn, - .stack_size = (unsigned long)arg, + .fn = fn, + .fn_arg = arg, }; return kernel_clone(&args); -- cgit v1.2.3-59-g8ed1b From 6d97af487dee3176cb1342d4ab16637e495440ad Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Wed, 4 May 2022 08:23:50 +0200 Subject: entry: Rename arch_check_user_regs() to arch_enter_from_user_mode() When entering the kernel from userspace, arch_check_user_regs() is used to verify that struct pt_regs contains valid values; note that the NMI codepath doesn't call this function. s390 needs a place in the generic entry code to modify a cpu data structure when switching from userspace to kernel mode. As arch_check_user_regs() is exactly this, rename it to arch_enter_from_user_mode(). 
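[ Editorial note, not part of the original patch: as a minimal sketch of the renamed hook, an architecture that needs entry-time work opts in from its asm/entry-common.h by defining both the function and the matching macro. The pattern below mirrors the s390 and x86 hunks further down; the body is purely illustrative.

	static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs)
	{
		/* arch-specific bookkeeping or sanity checks on entry from user space */
	}
	#define arch_enter_from_user_mode arch_enter_from_user_mode

Architectures that do not define the macro keep the empty __always_inline stub from include/linux/entry-common.h, so the rename has no cost for them. ]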
Signed-off-by: Sven Schnelle Reviewed-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Link: https://lore.kernel.org/r/20220504062351.2954280-2-tmricht@linux.ibm.com Signed-off-by: Heiko Carstens --- arch/s390/include/asm/entry-common.h | 4 ++-- arch/x86/include/asm/entry-common.h | 4 ++-- include/linux/entry-common.h | 8 ++++---- kernel/entry/common.c | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/s390/include/asm/entry-common.h b/arch/s390/include/asm/entry-common.h index 2f0a1cacdf85..99d654ccd3db 100644 --- a/arch/s390/include/asm/entry-common.h +++ b/arch/s390/include/asm/entry-common.h @@ -15,12 +15,12 @@ void do_per_trap(struct pt_regs *regs); #ifdef CONFIG_DEBUG_ENTRY -static __always_inline void arch_check_user_regs(struct pt_regs *regs) +static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs) { debug_user_asce(0); } -#define arch_check_user_regs arch_check_user_regs +#define arch_enter_from_user_mode arch_enter_from_user_mode #endif /* CONFIG_DEBUG_ENTRY */ static __always_inline void arch_exit_to_user_mode_work(struct pt_regs *regs, diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h index 43184640b579..674ed46d3ced 100644 --- a/arch/x86/include/asm/entry-common.h +++ b/arch/x86/include/asm/entry-common.h @@ -10,7 +10,7 @@ #include /* Check that the stack and regs on entry from user mode are sane. */ -static __always_inline void arch_check_user_regs(struct pt_regs *regs) +static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs) { if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) { /* @@ -42,7 +42,7 @@ static __always_inline void arch_check_user_regs(struct pt_regs *regs) WARN_ON_ONCE(regs != task_pt_regs(current)); } } -#define arch_check_user_regs arch_check_user_regs +#define arch_enter_from_user_mode arch_enter_from_user_mode static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, unsigned long ti_work) diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index ab78bd4c2eb0..c92ac75d6556 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -63,7 +63,7 @@ ARCH_EXIT_TO_USER_MODE_WORK) /** - * arch_check_user_regs - Architecture specific sanity check for user mode regs + * arch_enter_from_user_mode - Architecture specific sanity check for user mode regs * @regs: Pointer to currents pt_regs * * Defaults to an empty implementation. Can be replaced by architecture @@ -73,10 +73,10 @@ * section. Use __always_inline so the compiler cannot push it out of line * and make it instrumentable. 
*/ -static __always_inline void arch_check_user_regs(struct pt_regs *regs); +static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs); -#ifndef arch_check_user_regs -static __always_inline void arch_check_user_regs(struct pt_regs *regs) {} +#ifndef arch_enter_from_user_mode +static __always_inline void arch_enter_from_user_mode(struct pt_regs *regs) {} #endif /** diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 93c3b86e781c..9e63923c5a0f 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -17,7 +17,7 @@ /* See comment for enter_from_user_mode() in entry-common.h */ static __always_inline void __enter_from_user_mode(struct pt_regs *regs) { - arch_check_user_regs(regs); + arch_enter_from_user_mode(regs); lockdep_hardirqs_off(CALLER_ADDR0); CT_WARN_ON(ct_state() != CONTEXT_USER); -- cgit v1.2.3-59-g8ed1b From 3e20889cfbee1e9716544a14c5d23be598412ddf Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 9 May 2022 18:20:46 -0700 Subject: x86/pgtable: support __HAVE_ARCH_PTE_SWP_EXCLUSIVE Let's use bit 3 to remember PG_anon_exclusive in swap ptes. [david@redhat.com: fix 32-bit swap layout] Link: https://lkml.kernel.org/r/d875c292-46b3-f281-65ae-71d0b0c6f592@redhat.com Link: https://lkml.kernel.org/r/20220329164329.208407-4-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christoph Hellwig Cc: Dave Hansen Cc: Don Dutile Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: John Hubbard Cc: "Kirill A. Shutemov" Cc: Liang Zhang Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Kravetz Cc: Mike Rapoport Cc: Nadav Amit Cc: Oded Gabbay Cc: Oleg Nesterov Cc: Paul Mackerras Cc: Pedro Demarchi Gomes Cc: Peter Xu Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable.h | 17 +++++++++++++++++ arch/x86/include/asm/pgtable_64.h | 4 +++- arch/x86/include/asm/pgtable_64_types.h | 5 +++++ 3 files changed, 25 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 3563f4645fa1..1e09519af143 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1286,6 +1286,23 @@ static inline void update_mmu_cache_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud) { } +#ifdef _PAGE_SWP_EXCLUSIVE +#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE +static inline pte_t pte_swp_mkexclusive(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_SWP_EXCLUSIVE); +} + +static inline int pte_swp_exclusive(pte_t pte) +{ + return pte_flags(pte) & _PAGE_SWP_EXCLUSIVE; +} + +static inline pte_t pte_swp_clear_exclusive(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_SWP_EXCLUSIVE); +} +#endif /* _PAGE_SWP_EXCLUSIVE */ #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline pte_t pte_swp_mksoft_dirty(pte_t pte) diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 56d0399a0cd1..e479491da8d5 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -186,7 +186,7 @@ static inline void native_pgd_clear(pgd_t *pgd) * * | ... | 11| 10| 9|8|7|6|5| 4| 3|2| 1|0| <- bit number * | ... 
|SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names - * | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| X|F|SD|0| <- swp entry + * | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| E|F|SD|0| <- swp entry * * G (8) is aliased and used as a PROT_NONE indicator for * !present ptes. We need to start storing swap entries above @@ -203,6 +203,8 @@ static inline void native_pgd_clear(pgd_t *pgd) * F (2) in swp entry is used to record when a pagetable is * writeprotected by userfaultfd WP support. * + * E (3) in swp entry is used to rememeber PG_anon_exclusive. + * * Bit 7 in swp entry should be 0 because pmd_present checks not only P, * but also L and G. * diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 91ac10654570..70e360a2e5fb 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -163,4 +163,9 @@ extern unsigned int ptrs_per_p4d; #define PGD_KERNEL_START ((PAGE_SIZE / 2) / sizeof(pgd_t)) +/* + * We borrow bit 3 to remember PG_anon_exclusive. + */ +#define _PAGE_SWP_EXCLUSIVE _PAGE_PWT + #endif /* _ASM_X86_PGTABLE_64_DEFS_H */ -- cgit v1.2.3-59-g8ed1b From ba5d35b442c65f32d38ef61f732218274c6dcf4c Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Mon, 9 May 2022 10:19:09 +0530 Subject: perf/amd/ibs: Add support for L3 miss filtering IBS L3 miss filtering works by tagging an instruction on IBS counter overflow and generating an NMI if the tagged instruction causes an L3 miss. Samples without an L3 miss are discarded and counter is reset with random value (between 1-15 for fetch pmu and 1-127 for op pmu). This helps in reducing sampling overhead when user is interested only in such samples. One of the use case of such filtered samples is to feed data to page-migration daemon in tiered memory systems. Add support for L3 miss filtering in IBS driver via new pmu attribute "l3missonly". Example usage: # perf record -a -e ibs_op/l3missonly=1/ --raw-samples sleep 5 Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220509044914.1473-4-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 67 +++++++++++++++++++++++++++++++++++---- arch/x86/include/asm/perf_event.h | 3 ++ 2 files changed, 63 insertions(+), 7 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index ece4f6a7d24b..2dc8b7ec030a 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -544,22 +544,46 @@ static const struct attribute_group *empty_attr_groups[] = { PMU_FORMAT_ATTR(rand_en, "config:57"); PMU_FORMAT_ATTR(cnt_ctl, "config:19"); +PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59"); +PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16"); + +static umode_t +zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return ibs_caps & IBS_CAPS_ZEN4 ? 
attr->mode : 0; +} static struct attribute *rand_en_attrs[] = { &format_attr_rand_en.attr, NULL, }; +static struct attribute *fetch_l3missonly_attrs[] = { + &fetch_l3missonly.attr.attr, + NULL, +}; + static struct attribute_group group_rand_en = { .name = "format", .attrs = rand_en_attrs, }; +static struct attribute_group group_fetch_l3missonly = { + .name = "format", + .attrs = fetch_l3missonly_attrs, + .is_visible = zen4_ibs_extensions_is_visible, +}; + static const struct attribute_group *fetch_attr_groups[] = { &group_rand_en, NULL, }; +static const struct attribute_group *fetch_attr_update[] = { + &group_fetch_l3missonly, + NULL, +}; + static umode_t cnt_ctl_is_visible(struct kobject *kobj, struct attribute *attr, int i) { @@ -571,14 +595,26 @@ static struct attribute *cnt_ctl_attrs[] = { NULL, }; +static struct attribute *op_l3missonly_attrs[] = { + &op_l3missonly.attr.attr, + NULL, +}; + static struct attribute_group group_cnt_ctl = { .name = "format", .attrs = cnt_ctl_attrs, .is_visible = cnt_ctl_is_visible, }; +static struct attribute_group group_op_l3missonly = { + .name = "format", + .attrs = op_l3missonly_attrs, + .is_visible = zen4_ibs_extensions_is_visible, +}; + static const struct attribute_group *op_attr_update[] = { &group_cnt_ctl, + &group_op_l3missonly, NULL, }; @@ -805,10 +841,8 @@ static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) return ret; } -static __init int perf_event_ibs_init(void) +static __init int perf_ibs_fetch_init(void) { - int ret; - /* * Some chips fail to reset the fetch count when it is written; instead * they need a 0-1 transition of IbsFetchEn. @@ -819,12 +853,17 @@ static __init int perf_event_ibs_init(void) if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model < 0x10) perf_ibs_fetch.fetch_ignore_if_zero_rip = 1; + if (ibs_caps & IBS_CAPS_ZEN4) + perf_ibs_fetch.config_mask |= IBS_FETCH_L3MISSONLY; + perf_ibs_fetch.pmu.attr_groups = fetch_attr_groups; + perf_ibs_fetch.pmu.attr_update = fetch_attr_update; - ret = perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); - if (ret) - return ret; + return perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); +} +static __init int perf_ibs_op_init(void) +{ if (ibs_caps & IBS_CAPS_OPCNT) perf_ibs_op.config_mask |= IBS_OP_CNT_CTL; @@ -834,10 +873,24 @@ static __init int perf_event_ibs_init(void) perf_ibs_op.cnt_mask |= IBS_OP_MAX_CNT_EXT_MASK; } + if (ibs_caps & IBS_CAPS_ZEN4) + perf_ibs_op.config_mask |= IBS_OP_L3MISSONLY; + perf_ibs_op.pmu.attr_groups = empty_attr_groups; perf_ibs_op.pmu.attr_update = op_attr_update; - ret = perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); + return perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); +} + +static __init int perf_event_ibs_init(void) +{ + int ret; + + ret = perf_ibs_fetch_init(); + if (ret) + return ret; + + ret = perf_ibs_op_init(); if (ret) goto err_op; diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 7aa1d420c779..409725e86f42 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -410,6 +410,7 @@ struct pebs_xmm { #define IBS_CAPS_OPBRNFUSE (1U<<8) #define IBS_CAPS_FETCHCTLEXTD (1U<<9) #define IBS_CAPS_OPDATA4 (1U<<10) +#define IBS_CAPS_ZEN4 (1U<<11) #define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \ | IBS_CAPS_FETCHSAM \ @@ -423,6 +424,7 @@ struct pebs_xmm { #define IBSCTL_LVT_OFFSET_MASK 0x0F /* IBS fetch bits/masks */ +#define IBS_FETCH_L3MISSONLY (1ULL<<59) #define IBS_FETCH_RAND_EN (1ULL<<57) #define IBS_FETCH_VAL (1ULL<<49) #define IBS_FETCH_ENABLE (1ULL<<48) @@ -439,6 +441,7 @@ 
struct pebs_xmm { #define IBS_OP_CNT_CTL (1ULL<<19) #define IBS_OP_VAL (1ULL<<18) #define IBS_OP_ENABLE (1ULL<<17) +#define IBS_OP_L3MISSONLY (1ULL<<16) #define IBS_OP_MAX_CNT 0x0000FFFFULL #define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */ #define IBS_OP_MAX_CNT_EXT_MASK (0x7FULL<<20) /* separate upper 7 bits */ -- cgit v1.2.3-59-g8ed1b From 9cb23f598c641c1dcbe18defd219cdc439bc94a8 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Mon, 9 May 2022 10:19:14 +0530 Subject: perf/ibs: Fix comment s/IBS Op Data 2/IBS Op Data 1/ for MSR 0xc0011035. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220509044914.1473-9-ravi.bangoria@amd.com --- arch/x86/include/asm/amd-ibs.h | 2 +- tools/arch/x86/include/asm/amd-ibs.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/amd-ibs.h b/arch/x86/include/asm/amd-ibs.h index 46e1df45efc0..aabdbb5ab920 100644 --- a/arch/x86/include/asm/amd-ibs.h +++ b/arch/x86/include/asm/amd-ibs.h @@ -49,7 +49,7 @@ union ibs_op_ctl { }; }; -/* MSR 0xc0011035: IBS Op Data 2 */ +/* MSR 0xc0011035: IBS Op Data 1 */ union ibs_op_data { __u64 val; struct { diff --git a/tools/arch/x86/include/asm/amd-ibs.h b/tools/arch/x86/include/asm/amd-ibs.h index 174e7d83fcbd..765e9e752d03 100644 --- a/tools/arch/x86/include/asm/amd-ibs.h +++ b/tools/arch/x86/include/asm/amd-ibs.h @@ -49,7 +49,7 @@ union ibs_op_ctl { }; }; -/* MSR 0xc0011035: IBS Op Data 2 */ +/* MSR 0xc0011035: IBS Op Data 1 */ union ibs_op_data { __u64 val; struct { -- cgit v1.2.3-59-g8ed1b From 566fb90e050dfa2132340bbdab9533b727dda6f1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 22 Apr 2022 06:37:57 +0200 Subject: swiotlb-xen: fix DMA_ATTR_NO_KERNEL_MAPPING on arm swiotlb-xen uses very different ways to allocate coherent memory on x86 vs arm. On the former it allocates memory from the page allocator, while on the latter it reuses the dma-direct allocator that handles the complexities of non-coherent DMA on arm platforms. Unfortunately the complexities of trying to deal with the two cases in the swiotlb-xen.c code lead to a bug in the handling of DMA_ATTR_NO_KERNEL_MAPPING on arm. With the DMA_ATTR_NO_KERNEL_MAPPING flag the coherent memory allocator does not actually allocate coherent memory, but just a DMA handle for some memory that is DMA addressable by the device, but which does not have to have a kernel mapping. Thus dereferencing the return value will lead to kernel crashes and memory corruption. Fix this by using the dma-direct allocator directly for arm, which works perfectly fine because on arm swiotlb-xen is only used when the domain is 1:1 mapped, and then simplifying the remaining code to only cater for the x86 case with DMA coherent devices. 
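[ Editorial note, not part of the original patch: the DMA_ATTR_NO_KERNEL_MAPPING semantics at issue can be illustrated with a hypothetical caller; dev and size are placeholders. With that attribute the returned CPU pointer is only an opaque cookie:

	void *cookie;
	dma_addr_t dma;

	/* hypothetical driver code, for illustration only */
	cookie = dma_alloc_attrs(dev, size, &dma, GFP_KERNEL,
				 DMA_ATTR_NO_KERNEL_MAPPING);
	if (!cookie)
		return -ENOMEM;
	/* program 'dma' into the device; never dereference 'cookie' */

Treating the cookie as a normal kernel pointer is exactly what the arm swiotlb-xen path ended up doing, hence the crashes described above. ]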
Reported-by: Rahul Singh Signed-off-by: Christoph Hellwig Reviewed-by: Rahul Singh Reviewed-by: Stefano Stabellini Tested-by: Rahul Singh --- arch/arm/include/asm/xen/page-coherent.h | 2 - arch/arm/xen/mm.c | 14 ----- arch/arm64/include/asm/xen/page-coherent.h | 2 - arch/x86/include/asm/xen/page-coherent.h | 24 -------- arch/x86/include/asm/xen/swiotlb-xen.h | 6 ++ arch/x86/xen/mmu_pv.c | 1 + drivers/xen/swiotlb-xen.c | 99 ++++++++++-------------------- include/xen/arm/page-coherent.h | 20 ------ include/xen/swiotlb-xen.h | 6 -- include/xen/xen-ops.h | 7 --- 10 files changed, 41 insertions(+), 140 deletions(-) delete mode 100644 arch/arm/include/asm/xen/page-coherent.h delete mode 100644 arch/arm64/include/asm/xen/page-coherent.h delete mode 100644 arch/x86/include/asm/xen/page-coherent.h delete mode 100644 include/xen/arm/page-coherent.h (limited to 'arch/x86/include') diff --git a/arch/arm/include/asm/xen/page-coherent.h b/arch/arm/include/asm/xen/page-coherent.h deleted file mode 100644 index 27e984977402..000000000000 --- a/arch/arm/include/asm/xen/page-coherent.h +++ /dev/null @@ -1,2 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#include diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c index ff05a7899cb8..3d826c0b5fee 100644 --- a/arch/arm/xen/mm.c +++ b/arch/arm/xen/mm.c @@ -116,20 +116,6 @@ bool xen_arch_need_swiotlb(struct device *dev, !dev_is_dma_coherent(dev)); } -int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, - unsigned int address_bits, - dma_addr_t *dma_handle) -{ - /* the domain is 1:1 mapped to use swiotlb-xen */ - *dma_handle = pstart; - return 0; -} - -void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) -{ - return; -} - static int __init xen_mm_init(void) { struct gnttab_cache_flush cflush; diff --git a/arch/arm64/include/asm/xen/page-coherent.h b/arch/arm64/include/asm/xen/page-coherent.h deleted file mode 100644 index 27e984977402..000000000000 --- a/arch/arm64/include/asm/xen/page-coherent.h +++ /dev/null @@ -1,2 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#include diff --git a/arch/x86/include/asm/xen/page-coherent.h b/arch/x86/include/asm/xen/page-coherent.h deleted file mode 100644 index 63cd41b2e17a..000000000000 --- a/arch/x86/include/asm/xen/page-coherent.h +++ /dev/null @@ -1,24 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_XEN_PAGE_COHERENT_H -#define _ASM_X86_XEN_PAGE_COHERENT_H - -#include -#include - -static inline void *xen_alloc_coherent_pages(struct device *hwdev, size_t size, - dma_addr_t *dma_handle, gfp_t flags, - unsigned long attrs) -{ - void *vstart = (void*)__get_free_pages(flags, get_order(size)); - *dma_handle = virt_to_phys(vstart); - return vstart; -} - -static inline void xen_free_coherent_pages(struct device *hwdev, size_t size, - void *cpu_addr, dma_addr_t dma_handle, - unsigned long attrs) -{ - free_pages((unsigned long) cpu_addr, get_order(size)); -} - -#endif /* _ASM_X86_XEN_PAGE_COHERENT_H */ diff --git a/arch/x86/include/asm/xen/swiotlb-xen.h b/arch/x86/include/asm/xen/swiotlb-xen.h index e5a90b42e4dd..77a2d19cc990 100644 --- a/arch/x86/include/asm/xen/swiotlb-xen.h +++ b/arch/x86/include/asm/xen/swiotlb-xen.h @@ -8,4 +8,10 @@ extern int pci_xen_swiotlb_init_late(void); static inline int pci_xen_swiotlb_init_late(void) { return -ENXIO; } #endif +int xen_swiotlb_fixup(void *buf, unsigned long nslabs); +int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, + unsigned int address_bits, + dma_addr_t *dma_handle); +void 
xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order); + #endif /* _ASM_X86_SWIOTLB_XEN_H */ diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 00354866921b..ee29fb558f2e 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -80,6 +80,7 @@ #include #include #include +#include #include "multicalls.h" #include "mmu.h" diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index df8085b50df1..67aa74d20162 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -36,7 +36,6 @@ #include #include -#include #include #define MAX_DMA_BITS 32 @@ -104,6 +103,7 @@ static int is_xen_swiotlb_buffer(struct device *dev, dma_addr_t dma_addr) return 0; } +#ifdef CONFIG_X86 int xen_swiotlb_fixup(void *buf, unsigned long nslabs) { int rc; @@ -131,94 +131,58 @@ int xen_swiotlb_fixup(void *buf, unsigned long nslabs) } static void * -xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, - dma_addr_t *dma_handle, gfp_t flags, - unsigned long attrs) +xen_swiotlb_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flags, unsigned long attrs) { - void *ret; + u64 dma_mask = dev->coherent_dma_mask; int order = get_order(size); - u64 dma_mask = DMA_BIT_MASK(32); phys_addr_t phys; - dma_addr_t dev_addr; - - /* - * Ignore region specifiers - the kernel's ideas of - * pseudo-phys memory layout has nothing to do with the - * machine physical layout. We can't allocate highmem - * because we can't return a pointer to it. - */ - flags &= ~(__GFP_DMA | __GFP_HIGHMEM); + void *ret; - /* Convert the size to actually allocated. */ + /* Align the allocation to the Xen page size */ size = 1UL << (order + XEN_PAGE_SHIFT); - /* On ARM this function returns an ioremap'ped virtual address for - * which virt_to_phys doesn't return the corresponding physical - * address. In fact on ARM virt_to_phys only works for kernel direct - * mapped RAM memory. Also see comment below. - */ - ret = xen_alloc_coherent_pages(hwdev, size, dma_handle, flags, attrs); - + ret = (void *)__get_free_pages(flags, get_order(size)); if (!ret) return ret; - - if (hwdev && hwdev->coherent_dma_mask) - dma_mask = hwdev->coherent_dma_mask; - - /* At this point dma_handle is the dma address, next we are - * going to set it to the machine address. - * Do not use virt_to_phys(ret) because on ARM it doesn't correspond - * to *dma_handle. 
*/ - phys = dma_to_phys(hwdev, *dma_handle); - dev_addr = xen_phys_to_dma(hwdev, phys); - if (((dev_addr + size - 1 <= dma_mask)) && - !range_straddles_page_boundary(phys, size)) - *dma_handle = dev_addr; - else { - if (xen_create_contiguous_region(phys, order, - fls64(dma_mask), dma_handle) != 0) { - xen_free_coherent_pages(hwdev, size, ret, (dma_addr_t)phys, attrs); - return NULL; - } - *dma_handle = phys_to_dma(hwdev, *dma_handle); + phys = virt_to_phys(ret); + + *dma_handle = xen_phys_to_dma(dev, phys); + if (*dma_handle + size - 1 > dma_mask || + range_straddles_page_boundary(phys, size)) { + if (xen_create_contiguous_region(phys, order, fls64(dma_mask), + dma_handle) != 0) + goto out_free_pages; SetPageXenRemapped(virt_to_page(ret)); } + memset(ret, 0, size); return ret; + +out_free_pages: + free_pages((unsigned long)ret, get_order(size)); + return NULL; } static void -xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, - dma_addr_t dev_addr, unsigned long attrs) +xen_swiotlb_free_coherent(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_handle, unsigned long attrs) { + phys_addr_t phys = virt_to_phys(vaddr); int order = get_order(size); - phys_addr_t phys; - u64 dma_mask = DMA_BIT_MASK(32); - struct page *page; - - if (hwdev && hwdev->coherent_dma_mask) - dma_mask = hwdev->coherent_dma_mask; - - /* do not use virt_to_phys because on ARM it doesn't return you the - * physical address */ - phys = xen_dma_to_phys(hwdev, dev_addr); /* Convert the size to actually allocated. */ size = 1UL << (order + XEN_PAGE_SHIFT); - if (is_vmalloc_addr(vaddr)) - page = vmalloc_to_page(vaddr); - else - page = virt_to_page(vaddr); + if (WARN_ON_ONCE(dma_handle + size - 1 > dev->coherent_dma_mask) || + WARN_ON_ONCE(range_straddles_page_boundary(phys, size))) + return; - if (!WARN_ON((dev_addr + size - 1 > dma_mask) || - range_straddles_page_boundary(phys, size)) && - TestClearPageXenRemapped(page)) + if (TestClearPageXenRemapped(virt_to_page(vaddr))) xen_destroy_contiguous_region(phys, order); - - xen_free_coherent_pages(hwdev, size, vaddr, phys_to_dma(hwdev, phys), - attrs); + free_pages((unsigned long)vaddr, get_order(size)); } +#endif /* CONFIG_X86 */ /* * Map a single buffer of the indicated size for DMA in streaming mode. 
The @@ -421,8 +385,13 @@ xen_swiotlb_dma_supported(struct device *hwdev, u64 mask) } const struct dma_map_ops xen_swiotlb_dma_ops = { +#ifdef CONFIG_X86 .alloc = xen_swiotlb_alloc_coherent, .free = xen_swiotlb_free_coherent, +#else + .alloc = dma_direct_alloc, + .free = dma_direct_free, +#endif .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu, .sync_single_for_device = xen_swiotlb_sync_single_for_device, .sync_sg_for_cpu = xen_swiotlb_sync_sg_for_cpu, diff --git a/include/xen/arm/page-coherent.h b/include/xen/arm/page-coherent.h deleted file mode 100644 index b9cc11e887ed..000000000000 --- a/include/xen/arm/page-coherent.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _XEN_ARM_PAGE_COHERENT_H -#define _XEN_ARM_PAGE_COHERENT_H - -#include -#include - -static inline void *xen_alloc_coherent_pages(struct device *hwdev, size_t size, - dma_addr_t *dma_handle, gfp_t flags, unsigned long attrs) -{ - return dma_direct_alloc(hwdev, size, dma_handle, flags, attrs); -} - -static inline void xen_free_coherent_pages(struct device *hwdev, size_t size, - void *cpu_addr, dma_addr_t dma_handle, unsigned long attrs) -{ - dma_direct_free(hwdev, size, cpu_addr, dma_handle, attrs); -} - -#endif /* _XEN_ARM_PAGE_COHERENT_H */ diff --git a/include/xen/swiotlb-xen.h b/include/xen/swiotlb-xen.h index 590ceb923f0c..808d17ad8d57 100644 --- a/include/xen/swiotlb-xen.h +++ b/include/xen/swiotlb-xen.h @@ -10,12 +10,6 @@ void xen_dma_sync_for_cpu(struct device *dev, dma_addr_t handle, void xen_dma_sync_for_device(struct device *dev, dma_addr_t handle, size_t size, enum dma_data_direction dir); -#ifdef CONFIG_SWIOTLB_XEN -int xen_swiotlb_fixup(void *buf, unsigned long nslabs); -#else -#define xen_swiotlb_fixup NULL -#endif - extern const struct dma_map_ops xen_swiotlb_dma_ops; #endif /* __LINUX_SWIOTLB_XEN_H */ diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index a3584a357f35..c7c1b46ff4cd 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -42,13 +42,6 @@ int xen_setup_shutdown_event(void); extern unsigned long *xen_contiguous_bitmap; -#if defined(CONFIG_XEN_PV) || defined(CONFIG_ARM) || defined(CONFIG_ARM64) -int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, - unsigned int address_bits, - dma_addr_t *dma_handle); -void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order); -#endif - #if defined(CONFIG_XEN_PV) int xen_remap_pfn(struct vm_area_struct *vma, unsigned long addr, xen_pfn_t *pfn, int nr, int *err_ptr, pgprot_t prot, -- cgit v1.2.3-59-g8ed1b From d3287fb0d3c8afdfd4870a6cd4a852abc9008b3b Mon Sep 17 00:00:00 2001 From: Jithu Joseph Date: Fri, 6 May 2022 15:53:59 -0700 Subject: x86/microcode/intel: Expose collect_cpu_info_early() for IFS IFS is a CPU feature that allows a binary blob, similar to microcode, to be loaded and consumed to perform low level validation of CPU circuitry. In fact, it carries the same Processor Signature (family/model/stepping) details that are contained in Intel microcode blobs. In support of an IFS driver to trigger loading, validation, and running of these test blobs, make the functionality of cpu_signatures_match() and collect_cpu_info_early() available outside of the microcode driver. Add an "intel_" prefix and drop the "_early" suffix from collect_cpu_info_early() and EXPORT_SYMBOL_GPL() it. Add the declaration to x86 <asm/cpu.h>. Make cpu_signatures_match() an inline function in x86 <asm/cpu.h>, and also give it an "intel_" prefix. No functional change intended. 
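[ Editorial illustration, not part of the original patch: a consumer such as the planned IFS driver could use the newly exported helpers roughly as follows, where blob_sig and blob_pf are hypothetical names for the signature and processor-flags fields read from a test blob header:

	struct ucode_cpu_info uci;

	intel_cpu_collect_info(&uci);
	if (!intel_cpu_signatures_match(uci.cpu_sig.sig, uci.cpu_sig.pf,
					blob_sig, blob_pf))
		return -EINVAL;	/* blob was built for a different CPU */

The calling convention matches the declarations added to asm/cpu.h below. ]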
Reviewed-by: Dan Williams Signed-off-by: Jithu Joseph Co-developed-by: Tony Luck Signed-off-by: Tony Luck Reviewed-by: Thomas Gleixner Acked-by: Borislav Petkov Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/r/20220506225410.1652287-2-tony.luck@intel.com Signed-off-by: Hans de Goede --- arch/x86/include/asm/cpu.h | 18 +++++++++++ arch/x86/kernel/cpu/intel.c | 32 +++++++++++++++++++ arch/x86/kernel/cpu/microcode/intel.c | 59 +++++------------------------------ 3 files changed, 57 insertions(+), 52 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index 86e5e4e26fcb..990167357c34 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -76,4 +76,22 @@ static inline void init_ia32_feat_ctl(struct cpuinfo_x86 *c) {} extern __noendbr void cet_disable(void); +struct ucode_cpu_info; + +int intel_cpu_collect_info(struct ucode_cpu_info *uci); + +static inline bool intel_cpu_signatures_match(unsigned int s1, unsigned int p1, + unsigned int s2, unsigned int p2) +{ + if (s1 != s2) + return false; + + /* Processor flags are either both 0 ... */ + if (!p1 && !p2) + return true; + + /* ... or they intersect. */ + return p1 & p2; +} + #endif /* _ASM_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 8321c43554a1..29d152219cb6 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -181,6 +181,38 @@ static bool bad_spectre_microcode(struct cpuinfo_x86 *c) return false; } +int intel_cpu_collect_info(struct ucode_cpu_info *uci) +{ + unsigned int val[2]; + unsigned int family, model; + struct cpu_signature csig = { 0 }; + unsigned int eax, ebx, ecx, edx; + + memset(uci, 0, sizeof(*uci)); + + eax = 0x00000001; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + csig.sig = eax; + + family = x86_family(eax); + model = x86_model(eax); + + if (model >= 5 || family > 6) { + /* get processor flags from MSR 0x17 */ + native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); + csig.pf = 1 << ((val[1] >> 18) & 7); + } + + csig.rev = intel_get_microcode_revision(); + + uci->cpu_sig = csig; + uci->valid = 1; + + return 0; +} +EXPORT_SYMBOL_GPL(intel_cpu_collect_info); + static void early_init_intel(struct cpuinfo_x86 *c) { u64 misc_enable; diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index d28a9f8f3fec..025c8f0cd948 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -45,20 +45,6 @@ static struct microcode_intel *intel_ucode_patch; /* last level cache size per core */ static int llc_size_per_core; -static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1, - unsigned int s2, unsigned int p2) -{ - if (s1 != s2) - return false; - - /* Processor flags are either both 0 ... */ - if (!p1 && !p2) - return true; - - /* ... or they intersect. */ - return p1 & p2; -} - /* * Returns 1 if update has been found, 0 otherwise. */ @@ -69,7 +55,7 @@ static int find_matching_signature(void *mc, unsigned int csig, int cpf) struct extended_signature *ext_sig; int i; - if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) + if (intel_cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) return 1; /* Look for ext. 
headers: */ @@ -80,7 +66,7 @@ static int find_matching_signature(void *mc, unsigned int csig, int cpf) ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE; for (i = 0; i < ext_hdr->count; i++) { - if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) + if (intel_cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) return 1; ext_sig++; } @@ -342,37 +328,6 @@ next: return patch; } -static int collect_cpu_info_early(struct ucode_cpu_info *uci) -{ - unsigned int val[2]; - unsigned int family, model; - struct cpu_signature csig = { 0 }; - unsigned int eax, ebx, ecx, edx; - - memset(uci, 0, sizeof(*uci)); - - eax = 0x00000001; - ecx = 0; - native_cpuid(&eax, &ebx, &ecx, &edx); - csig.sig = eax; - - family = x86_family(eax); - model = x86_model(eax); - - if ((model >= 5) || (family > 6)) { - /* get processor flags from MSR 0x17 */ - native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); - csig.pf = 1 << ((val[1] >> 18) & 7); - } - - csig.rev = intel_get_microcode_revision(); - - uci->cpu_sig = csig; - uci->valid = 1; - - return 0; -} - static void show_saved_mc(void) { #ifdef DEBUG @@ -386,7 +341,7 @@ static void show_saved_mc(void) return; } - collect_cpu_info_early(&uci); + intel_cpu_collect_info(&uci); sig = uci.cpu_sig.sig; pf = uci.cpu_sig.pf; @@ -502,7 +457,7 @@ void show_ucode_info_early(void) struct ucode_cpu_info uci; if (delay_ucode_info) { - collect_cpu_info_early(&uci); + intel_cpu_collect_info(&uci); print_ucode_info(&uci, current_mc_date); delay_ucode_info = 0; } @@ -604,7 +559,7 @@ int __init save_microcode_in_initrd_intel(void) if (!(cp.data && cp.size)) return 0; - collect_cpu_info_early(&uci); + intel_cpu_collect_info(&uci); scan_microcode(cp.data, cp.size, &uci, true); @@ -637,7 +592,7 @@ static struct microcode_intel *__load_ucode_intel(struct ucode_cpu_info *uci) if (!(cp.data && cp.size)) return NULL; - collect_cpu_info_early(uci); + intel_cpu_collect_info(uci); return scan_microcode(cp.data, cp.size, uci, false); } @@ -712,7 +667,7 @@ void reload_ucode_intel(void) struct microcode_intel *p; struct ucode_cpu_info uci; - collect_cpu_info_early(&uci); + intel_cpu_collect_info(&uci); p = find_patch(&uci); if (!p) -- cgit v1.2.3-59-g8ed1b From db1af12929c99d15fc04cc5c4447b87ab51eab0a Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Fri, 6 May 2022 15:54:00 -0700 Subject: x86/msr-index: Define INTEGRITY_CAPABILITIES MSR The INTEGRITY_CAPABILITIES MSR is enumerated by bit 2 of the CORE_CAPABILITIES MSR. Add defines for the CORE_CAPS enumeration as well as for the integrity MSR. 
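[ Editorial illustration, not part of the original patch: the expected enumeration flow is to walk the capability chain before reading the new MSR. A minimal sketch, assuming the existing X86_FEATURE_CORE_CAPABILITIES flag gates the first read; the helper name is hypothetical:

	static bool integrity_caps_supported(u64 *caps)
	{
		u64 core_caps;

		if (!cpu_feature_enabled(X86_FEATURE_CORE_CAPABILITIES))
			return false;
		rdmsrl(MSR_IA32_CORE_CAPS, core_caps);
		if (!(core_caps & MSR_IA32_CORE_CAPS_INTEGRITY_CAPS))
			return false;
		rdmsrl(MSR_INTEGRITY_CAPS, *caps);
		/* callers can then test e.g. MSR_INTEGRITY_CAPS_PERIODIC_BIST */
		return true;
	}

]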
Reviewed-by: Dan Williams Signed-off-by: Tony Luck Reviewed-by: Greg Kroah-Hartman Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/20220506225410.1652287-3-tony.luck@intel.com Signed-off-by: Hans de Goede --- arch/x86/include/asm/msr-index.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 0eb90d21049e..6fc5de0c61a6 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -76,6 +76,8 @@ /* Abbreviated from Intel SDM name IA32_CORE_CAPABILITIES */ #define MSR_IA32_CORE_CAPS 0x000000cf +#define MSR_IA32_CORE_CAPS_INTEGRITY_CAPS_BIT 2 +#define MSR_IA32_CORE_CAPS_INTEGRITY_CAPS BIT(MSR_IA32_CORE_CAPS_INTEGRITY_CAPS_BIT) #define MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT_BIT 5 #define MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT BIT(MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT_BIT) @@ -154,6 +156,11 @@ #define MSR_IA32_POWER_CTL 0x000001fc #define MSR_IA32_POWER_CTL_BIT_EE 19 +/* Abbreviated from Intel SDM name IA32_INTEGRITY_CAPABILITIES */ +#define MSR_INTEGRITY_CAPS 0x000002d9 +#define MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT 4 +#define MSR_INTEGRITY_CAPS_PERIODIC_BIST BIT(MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT) + #define MSR_LBR_NHM_FROM 0x00000680 #define MSR_LBR_NHM_TO 0x000006c0 #define MSR_LBR_CORE_FROM 0x00000040 -- cgit v1.2.3-59-g8ed1b From f5c0b4f30416c670408a77be94703d04d22b57df Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 12 May 2022 14:04:08 +0200 Subject: x86/prctl: Remove pointless task argument The functions invoked via do_arch_prctl_common() can only operate on the current task and none of these function uses the task argument. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/87lev7vtxj.ffs@tglx --- arch/x86/include/asm/fpu/api.h | 3 +-- arch/x86/include/asm/proto.h | 3 +-- arch/x86/kernel/fpu/xstate.c | 5 +---- arch/x86/kernel/process.c | 9 ++++----- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 4 ++-- 6 files changed, 10 insertions(+), 16 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index c83b3020350a..6b0f31fb53f7 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -162,7 +162,6 @@ static inline bool fpstate_is_confidential(struct fpu_guest *gfpu) } /* prctl */ -struct task_struct; -extern long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2); +extern long fpu_xstate_prctl(int option, unsigned long arg2); #endif /* _ASM_X86_FPU_API_H */ diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index feed36d44d04..80be803b96d2 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -39,7 +39,6 @@ void x86_report_nx(void); extern int reboot_force; -long do_arch_prctl_common(struct task_struct *task, int option, - unsigned long arg2); +long do_arch_prctl_common(int option, unsigned long arg2); #endif /* _ASM_X86_PROTO_H */ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 39e1c8626ab9..1b016a186c11 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -1687,16 +1687,13 @@ EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm); * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18). 
*/ -long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2) +long fpu_xstate_prctl(int option, unsigned long arg2) { u64 __user *uptr = (u64 __user *)arg2; u64 permitted, supported; unsigned long idx = arg2; bool guest = false; - if (tsk != current) - return -EPERM; - switch (option) { case ARCH_GET_XCOMP_SUPP: supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index b3d2d414cfad..e86d09a6aac7 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -334,7 +334,7 @@ static int get_cpuid_mode(void) return !test_thread_flag(TIF_NOCPUID); } -static int set_cpuid_mode(struct task_struct *task, unsigned long cpuid_enabled) +static int set_cpuid_mode(unsigned long cpuid_enabled) { if (!boot_cpu_has(X86_FEATURE_CPUID_FAULT)) return -ENODEV; @@ -985,20 +985,19 @@ unsigned long __get_wchan(struct task_struct *p) return addr; } -long do_arch_prctl_common(struct task_struct *task, int option, - unsigned long arg2) +long do_arch_prctl_common(int option, unsigned long arg2) { switch (option) { case ARCH_GET_CPUID: return get_cpuid_mode(); case ARCH_SET_CPUID: - return set_cpuid_mode(task, arg2); + return set_cpuid_mode(arg2); case ARCH_GET_XCOMP_SUPP: case ARCH_GET_XCOMP_PERM: case ARCH_REQ_XCOMP_PERM: case ARCH_GET_XCOMP_GUEST_PERM: case ARCH_REQ_XCOMP_GUEST_PERM: - return fpu_xstate_prctl(task, option, arg2); + return fpu_xstate_prctl(option, arg2); } return -EINVAL; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 26edb1cd07a4..0faa5e28dd64 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -222,5 +222,5 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) { - return do_arch_prctl_common(current, option, arg2); + return do_arch_prctl_common(option, arg2); } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index e459253649be..1962008fe743 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -844,7 +844,7 @@ SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) ret = do_arch_prctl_64(current, option, arg2); if (ret == -EINVAL) - ret = do_arch_prctl_common(current, option, arg2); + ret = do_arch_prctl_common(option, arg2); return ret; } @@ -852,7 +852,7 @@ SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) #ifdef CONFIG_IA32_EMULATION COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) { - return do_arch_prctl_common(current, option, arg2); + return do_arch_prctl_common(option, arg2); } #endif -- cgit v1.2.3-59-g8ed1b From c9fe66560bf2dc7d109754414e309888cb8c9ba9 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 9 May 2022 18:20:50 -0700 Subject: mm/mprotect: do not flush when not required architecturally Currently, using mprotect() to unprotect a memory region or uffd to unprotect a memory region causes a TLB flush. However, in such cases the PTE is often not modified (i.e., remain RO) and therefore not TLB flush is needed. Add an arch-specific pte_needs_flush() which tells whether a TLB flush is needed based on the old PTE and the new one. Implement an x86 pte_needs_flush(). Always flush the TLB when it is architecturally needed even when skipping a TLB flush might only result in a spurious page-faults by skipping the flush. 
Even with such a conservative approach, we can in the future further refine the checks to test whether a PTE is present by only considering the architectural _PAGE_PRESENT flag instead of {pte|pmd}_present(). For now, be careful and use the latter. Link: https://lkml.kernel.org/r/20220401180821.1986781-3-namit@vmware.com Signed-off-by: Nadav Amit Cc: Andrea Arcangeli Cc: Andy Lutomirski Cc: Dave Hansen Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: Yu Zhao Cc: Nick Piggin Cc: Andrew Cooper Cc: Peter Xu Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable_types.h | 2 + arch/x86/include/asm/tlbflush.h | 97 ++++++++++++++++++++++++++++++++++++ include/asm-generic/tlb.h | 14 ++++++ mm/huge_memory.c | 9 ++-- mm/mprotect.c | 3 +- 5 files changed, 120 insertions(+), 5 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 40497a9020c6..8668bc661026 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -110,9 +110,11 @@ #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) #define _PAGE_DEVMAP (_AT(u64, 1) << _PAGE_BIT_DEVMAP) +#define _PAGE_SOFTW4 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW4) #else #define _PAGE_NX (_AT(pteval_t, 0)) #define _PAGE_DEVMAP (_AT(pteval_t, 0)) +#define _PAGE_SOFTW4 (_AT(pteval_t, 0)) #endif #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 98fa0a114074..4af5579c7ef7 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -259,6 +259,103 @@ static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch); +static inline bool pte_flags_need_flush(unsigned long oldflags, + unsigned long newflags, + bool ignore_access) +{ + /* + * Flags that require a flush when cleared but not when they are set. + * Only include flags that would not trigger spurious page-faults. + * Non-present entries are not cached. Hardware would set the + * dirty/access bit if needed without a fault. + */ + const pteval_t flush_on_clear = _PAGE_DIRTY | _PAGE_PRESENT | + _PAGE_ACCESSED; + const pteval_t software_flags = _PAGE_SOFTW1 | _PAGE_SOFTW2 | + _PAGE_SOFTW3 | _PAGE_SOFTW4; + const pteval_t flush_on_change = _PAGE_RW | _PAGE_USER | _PAGE_PWT | + _PAGE_PCD | _PAGE_PSE | _PAGE_GLOBAL | _PAGE_PAT | + _PAGE_PAT_LARGE | _PAGE_PKEY_BIT0 | _PAGE_PKEY_BIT1 | + _PAGE_PKEY_BIT2 | _PAGE_PKEY_BIT3 | _PAGE_NX; + unsigned long diff = oldflags ^ newflags; + + BUILD_BUG_ON(flush_on_clear & software_flags); + BUILD_BUG_ON(flush_on_clear & flush_on_change); + BUILD_BUG_ON(flush_on_change & software_flags); + + /* Ignore software flags */ + diff &= ~software_flags; + + if (ignore_access) + diff &= ~_PAGE_ACCESSED; + + /* + * Did any of the 'flush_on_clear' flags was clleared set from between + * 'oldflags' and 'newflags'? + */ + if (diff & oldflags & flush_on_clear) + return true; + + /* Flush on modified flags. */ + if (diff & flush_on_change) + return true; + + /* Ensure there are no flags that were left behind */ + if (IS_ENABLED(CONFIG_DEBUG_VM) && + (diff & ~(flush_on_clear | software_flags | flush_on_change))) { + VM_WARN_ON_ONCE(1); + return true; + } + + return false; +} + +/* + * pte_needs_flush() checks whether permissions were demoted and require a + * flush. It should only be used for userspace PTEs. 
+ */ +static inline bool pte_needs_flush(pte_t oldpte, pte_t newpte) +{ + /* !PRESENT -> * ; no need for flush */ + if (!(pte_flags(oldpte) & _PAGE_PRESENT)) + return false; + + /* PFN changed ; needs flush */ + if (pte_pfn(oldpte) != pte_pfn(newpte)) + return true; + + /* + * check PTE flags; ignore access-bit; see comment in + * ptep_clear_flush_young(). + */ + return pte_flags_need_flush(pte_flags(oldpte), pte_flags(newpte), + true); +} +#define pte_needs_flush pte_needs_flush + +/* + * huge_pmd_needs_flush() checks whether permissions were demoted and require a + * flush. It should only be used for userspace huge PMDs. + */ +static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd) +{ + /* !PRESENT -> * ; no need for flush */ + if (!(pmd_flags(oldpmd) & _PAGE_PRESENT)) + return false; + + /* PFN changed ; needs flush */ + if (pmd_pfn(oldpmd) != pmd_pfn(newpmd)) + return true; + + /* + * check PMD flags; do not ignore access-bit; see + * pmdp_clear_flush_young(). + */ + return pte_flags_need_flush(pmd_flags(oldpmd), pmd_flags(newpmd), + false); +} +#define huge_pmd_needs_flush huge_pmd_needs_flush + #endif /* !MODULE */ static inline void __native_tlb_flush_global(unsigned long cr4) diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index eee6f7763a39..ff3e82553a76 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -658,6 +658,20 @@ static inline void tlb_flush_p4d_range(struct mmu_gather *tlb, } while (0) #endif +#ifndef pte_needs_flush +static inline bool pte_needs_flush(pte_t oldpte, pte_t newpte) +{ + return true; +} +#endif + +#ifndef huge_pmd_needs_flush +static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd) +{ + return true; +} +#endif + #endif /* CONFIG_MMU */ #endif /* _ASM_GENERIC__TLB_H */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 8db17c042aed..2befa9cfb46e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1715,7 +1715,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; - pmd_t entry; + pmd_t oldpmd, entry; bool preserve_write; int ret; bool prot_numa = cp_flags & MM_CP_PROT_NUMA; @@ -1804,9 +1804,9 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, * pmdp_invalidate() is required to make sure we don't miss * dirty/young flags set by hardware. 
*/ - entry = pmdp_invalidate(vma, addr, pmd); + oldpmd = pmdp_invalidate(vma, addr, pmd); - entry = pmd_modify(entry, newprot); + entry = pmd_modify(oldpmd, newprot); if (preserve_write) entry = pmd_mk_savedwrite(entry); if (uffd_wp) { @@ -1823,7 +1823,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, ret = HPAGE_PMD_NR; set_pmd_at(mm, addr, pmd, entry); - tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE); + if (huge_pmd_needs_flush(oldpmd, entry)) + tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE); BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry)); unlock: diff --git a/mm/mprotect.c b/mm/mprotect.c index 420be0201118..20a46f21cca8 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -152,7 +152,8 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, ptent = pte_mkwrite(ptent); } ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent); - tlb_flush_pte_range(tlb, addr, PAGE_SIZE); + if (pte_needs_flush(oldpte, ptent)) + tlb_flush_pte_range(tlb, addr, PAGE_SIZE); pages++; } else if (is_swap_pte(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); -- cgit v1.2.3-59-g8ed1b From 4f83145721f362c2f4d312edc4755269a2069488 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 9 May 2022 18:20:50 -0700 Subject: mm: avoid unnecessary flush on change_huge_pmd() Calls to change_protection_range() on THP can trigger, at least on x86, two TLB flushes for one page: one immediately, when pmdp_invalidate() is called by change_huge_pmd(), and then another one later (that can be batched) when change_protection_range() finishes. The first TLB flush is only necessary to prevent the dirty bit (and with a lesser importance the access bit) from changing while the PTE is modified. However, this is not necessary as the x86 CPUs set the dirty-bit atomically with an additional check that the PTE is (still) present. One caveat is Intel's Knights Landing that has a bug and does not do so. Leverage this behavior to eliminate the unnecessary TLB flush in change_huge_pmd(). Introduce a new arch specific pmdp_invalidate_ad() that only invalidates the access and dirty bit from further changes. Link: https://lkml.kernel.org/r/20220401180821.1986781-4-namit@vmware.com Signed-off-by: Nadav Amit Cc: Andrea Arcangeli Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Dave Hansen Cc: Peter Xu Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: Yu Zhao Cc: Nick Piggin Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable.h | 5 +++++ arch/x86/mm/pgtable.c | 10 ++++++++++ include/linux/pgtable.h | 20 ++++++++++++++++++++ mm/huge_memory.c | 4 ++-- mm/pgtable-generic.c | 8 ++++++++ 5 files changed, 45 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 1e09519af143..0821f87d495f 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1168,6 +1168,11 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, } } #endif + +#define __HAVE_ARCH_PMDP_INVALIDATE_AD +extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); + /* * Page table pages are page-aligned. The lower half of the top * level is used for userspace and the top half for the kernel. 
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 3481b35cb4ec..f16059e9a85e 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -608,6 +608,16 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma, return young; } + +pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + /* + * No flush is necessary. Once an invalid PTE is established, the PTE's + * access and dirty bits cannot be updated. + */ + return pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp)); +} #endif /** diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 53750224e176..530b1817b58c 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -570,6 +570,26 @@ extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #endif +#ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD + +/* + * pmdp_invalidate_ad() invalidates the PMD while changing a transparent + * hugepage mapping in the page tables. This function is similar to + * pmdp_invalidate(), but should only be used if the access and dirty bits would + * not be cleared by the software in the new PMD value. The function ensures + * that hardware changes of the access and dirty bits updates would not be lost. + * + * Doing so can allow in certain architectures to avoid a TLB flush in most + * cases. Yet, another TLB flush might be necessary later if the PMD update + * itself requires such flush (e.g., if protection was set to be stricter). Yet, + * even when a TLB flush is needed because of the update, the caller may be able + * to batch these TLB flushing operations, so fewer TLB flush operations are + * needed. + */ +extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); +#endif + #ifndef __HAVE_ARCH_PTE_SAME static inline int pte_same(pte_t pte_a, pte_t pte_b) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2befa9cfb46e..6f37f77eb48c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1801,10 +1801,10 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, * The race makes MADV_DONTNEED miss the huge pmd and don't clear it * which may break userspace. * - * pmdp_invalidate() is required to make sure we don't miss + * pmdp_invalidate_ad() is required to make sure we don't miss * dirty/young flags set by hardware. */ - oldpmd = pmdp_invalidate(vma, addr, pmd); + oldpmd = pmdp_invalidate_ad(vma, addr, pmd); entry = pmd_modify(oldpmd, newprot); if (preserve_write) diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 6523fda274e5..90ab721a12a8 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -201,6 +201,14 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, } #endif +#ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD +pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + return pmdp_invalidate(vma, address, pmdp); +} +#endif + #ifndef pmdp_collapse_flush pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) -- cgit v1.2.3-59-g8ed1b From e5a554014618308f046af99ab9c950165ed6cb11 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 12 May 2022 20:23:06 -0700 Subject: mm: page_table_check: move pxx_user_accessible_page into x86 The pxx_user_accessible_page() checks the PTE bit, it's architecture-specific code, move them into x86's pgtable.h. These helpers are being moved out to make the page table check framework platform independent. 
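An illustrative sketch (not part of the patch): once these helpers live in the architecture header, a new port that wants CONFIG_PAGE_TABLE_CHECK only has to provide the same three predicates in its own asm/pgtable.h. Assuming the port has _PAGE_PRESENT/_PAGE_USER-style bits and pmd_leaf()/pud_leaf() helpers, that could look like:

#ifdef CONFIG_PAGE_TABLE_CHECK
/* Sketch for a hypothetical port; the bit and helper names are placeholders. */
static inline bool pte_user_accessible_page(pte_t pte)
{
        return (pte_val(pte) & _PAGE_PRESENT) && (pte_val(pte) & _PAGE_USER);
}

static inline bool pmd_user_accessible_page(pmd_t pmd)
{
        return pmd_leaf(pmd) && (pmd_val(pmd) & _PAGE_PRESENT) &&
               (pmd_val(pmd) & _PAGE_USER);
}

static inline bool pud_user_accessible_page(pud_t pud)
{
        return pud_leaf(pud) && (pud_val(pud) & _PAGE_PRESENT) &&
               (pud_val(pud) & _PAGE_USER);
}
#endif
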
Link: https://lkml.kernel.org/r/20220507110114.4128854-3-tongtiangen@huawei.com Signed-off-by: Kefeng Wang Signed-off-by: Tong Tiangen Acked-by: Pasha Tatashin Reviewed-by: Anshuman Khandual Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable.h | 17 +++++++++++++++++ mm/page_table_check.c | 17 ----------------- 2 files changed, 17 insertions(+), 17 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 0821f87d495f..46fa65d818bd 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1447,6 +1447,23 @@ static inline bool arch_faults_on_old_pte(void) return false; } +#ifdef CONFIG_PAGE_TABLE_CHECK +static inline bool pte_user_accessible_page(pte_t pte) +{ + return (pte_val(pte) & _PAGE_PRESENT) && (pte_val(pte) & _PAGE_USER); +} + +static inline bool pmd_user_accessible_page(pmd_t pmd) +{ + return pmd_leaf(pmd) && (pmd_val(pmd) & _PAGE_PRESENT) && (pmd_val(pmd) & _PAGE_USER); +} + +static inline bool pud_user_accessible_page(pud_t pud) +{ + return pud_leaf(pud) && (pud_val(pud) & _PAGE_PRESENT) && (pud_val(pud) & _PAGE_USER); +} +#endif + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_H */ diff --git a/mm/page_table_check.c b/mm/page_table_check.c index eb0d0b71cdf6..3692bea2ea2c 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -52,23 +52,6 @@ static struct page_table_check *get_page_table_check(struct page_ext *page_ext) return (void *)(page_ext) + page_table_check_ops.offset; } -static inline bool pte_user_accessible_page(pte_t pte) -{ - return (pte_val(pte) & _PAGE_PRESENT) && (pte_val(pte) & _PAGE_USER); -} - -static inline bool pmd_user_accessible_page(pmd_t pmd) -{ - return pmd_leaf(pmd) && (pmd_val(pmd) & _PAGE_PRESENT) && - (pmd_val(pmd) & _PAGE_USER); -} - -static inline bool pud_user_accessible_page(pud_t pud) -{ - return pud_leaf(pud) && (pud_val(pud) & _PAGE_PRESENT) && - (pud_val(pud) & _PAGE_USER); -} - /* * An enty is removed from the page table, decrement the counters for that page * verify that it is of correct type and counters do not become negative. -- cgit v1.2.3-59-g8ed1b From de8c8e52836d0082188508548d0b939f49f7f0e6 Mon Sep 17 00:00:00 2001 From: Tong Tiangen Date: Thu, 12 May 2022 20:23:06 -0700 Subject: mm: page_table_check: add hooks to public helpers Move ptep_clear() to the include/linux/pgtable.h and add page table check relate hooks to some helpers, it's prepare for support page table check feature on new architecture. Optimize the implementation of ptep_clear(), page table hooks added page table check stubs, the interface control should be at stubs, there is no rationale for doing a IS_ENABLED() check here. For architectures that do not enable CONFIG_PAGE_TABLE_CHECK, they will call a fallback page table check stubs[1] when getting their page table helpers[2] in include/linux/pgtable.h. [1] page table check stubs defined in include/linux/page_table_check.h [2] ptep_clear() ptep_get_and_clear() pmdp_huge_get_and_clear() pudp_huge_get_and_clear() Link: https://lkml.kernel.org/r/20220507110114.4128854-4-tongtiangen@huawei.com Signed-off-by: Tong Tiangen Acked-by: Pasha Tatashin Cc: Anshuman Khandual Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Kefeng Wang Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable.h | 10 ---------- include/linux/pgtable.h | 23 +++++++++++++++-------- 2 files changed, 15 insertions(+), 18 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 46fa65d818bd..44e2d6f1dbaa 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1072,16 +1072,6 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, return pte; } -#define __HAVE_ARCH_PTEP_CLEAR -static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - if (IS_ENABLED(CONFIG_PAGE_TABLE_CHECK)) - ptep_get_and_clear(mm, addr, ptep); - else - pte_clear(mm, addr, ptep); -} - #define __HAVE_ARCH_PTEP_SET_WRPROTECT static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 39fa93e94b80..e0a449b477c5 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -12,6 +12,7 @@ #include #include #include +#include #if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \ defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS @@ -259,14 +260,6 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif -#ifndef __HAVE_ARCH_PTEP_CLEAR -static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - pte_clear(mm, addr, ptep); -} -#endif - #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, @@ -274,10 +267,19 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, { pte_t pte = *ptep; pte_clear(mm, address, ptep); + page_table_check_pte_clear(mm, address, pte); return pte; } #endif +#ifndef __HAVE_ARCH_PTEP_CLEAR +static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + ptep_get_and_clear(mm, addr, ptep); +} +#endif + #ifndef __HAVE_ARCH_PTEP_GET static inline pte_t ptep_get(pte_t *ptep) { @@ -347,7 +349,10 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, pmd_t *pmdp) { pmd_t pmd = *pmdp; + pmd_clear(pmdp); + page_table_check_pmd_clear(mm, address, pmd); + return pmd; } #endif /* __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR */ @@ -359,6 +364,8 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, pud_t pud = *pudp; pud_clear(pudp); + page_table_check_pud_clear(mm, address, pud); + return pud; } #endif /* __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR */ -- cgit v1.2.3-59-g8ed1b From 3bd4abc07a267e6a8b33d7f8717136e18f921c53 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Fri, 8 Apr 2022 18:03:13 +0200 Subject: x86/tsc: Use fallback for random_get_entropy() instead of zero In the event that random_get_entropy() can't access a cycle counter or similar, falling back to returning 0 is suboptimal. Instead, fallback to calling random_get_entropy_fallback(), which isn't extremely high precision or guaranteed to be entropic, but is certainly better than returning zero all the time. If CONFIG_X86_TSC=n, then it's possible for the kernel to run on systems without RDTSC, such as 486 and certain 586, so the fallback code is only required for that case. 
As well, fix up both the new function and the get_cycles() function from which it was derived to use cpu_feature_enabled() rather than boot_cpu_has(), and use !IS_ENABLED() instead of #ifndef. Signed-off-by: Jason A. Donenfeld Reviewed-by: Thomas Gleixner Cc: Thomas Gleixner Cc: Arnd Bergmann Cc: Borislav Petkov Cc: x86@kernel.org --- arch/x86/include/asm/timex.h | 9 +++++++++ arch/x86/include/asm/tsc.h | 7 +++---- 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/timex.h b/arch/x86/include/asm/timex.h index a4a8b1b16c0c..956e4145311b 100644 --- a/arch/x86/include/asm/timex.h +++ b/arch/x86/include/asm/timex.h @@ -5,6 +5,15 @@ #include #include +static inline unsigned long random_get_entropy(void) +{ + if (!IS_ENABLED(CONFIG_X86_TSC) && + !cpu_feature_enabled(X86_FEATURE_TSC)) + return random_get_entropy_fallback(); + return rdtsc(); +} +#define random_get_entropy random_get_entropy + /* Assume we use the PIT time source for the clock tick */ #define CLOCK_TICK_RATE PIT_TICK_RATE diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 01a300a9700b..fbdc3d951494 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -20,13 +20,12 @@ extern void disable_TSC(void); static inline cycles_t get_cycles(void) { -#ifndef CONFIG_X86_TSC - if (!boot_cpu_has(X86_FEATURE_TSC)) + if (!IS_ENABLED(CONFIG_X86_TSC) && + !cpu_feature_enabled(X86_FEATURE_TSC)) return 0; -#endif - return rdtsc(); } +#define get_cycles get_cycles extern struct system_counterval_t convert_art_to_tsc(u64 art); extern struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns); -- cgit v1.2.3-59-g8ed1b From b3fdf9398a16f01dc013967a4ab25e99c3f4fc12 Mon Sep 17 00:00:00 2001 From: Jane Chu Date: Mon, 16 May 2022 11:21:46 -0700 Subject: x86/mce: relocate set{clear}_mce_nospec() functions Relocate the twin mce functions to arch/x86/mm/pat/set_memory.c file where they belong. While at it, fixup a function name in a comment. Reviewed-by: Christoph Hellwig Reviewed-by: Dan Williams Signed-off-by: Jane Chu Acked-by: Borislav Petkov Cc: Stephen Rothwell [sfr: gate {set,clear}_mce_nospec() by CONFIG_X86_64] Link: https://lore.kernel.org/r/165272527328.90175.8336008202048685278.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Dan Williams --- arch/x86/include/asm/set_memory.h | 52 --------------------------------------- arch/x86/mm/pat/set_memory.c | 50 +++++++++++++++++++++++++++++++++++-- include/linux/set_memory.h | 8 +++--- 3 files changed, 52 insertions(+), 58 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index 78ca53512486..b45c4d27fd46 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -86,56 +86,4 @@ bool kernel_page_present(struct page *page); extern int kernel_set_to_readonly; -#ifdef CONFIG_X86_64 -/* - * Prevent speculative access to the page by either unmapping - * it (if we do not require access to any part of the page) or - * marking it uncacheable (if we want to try to retrieve data - * from non-poisoned lines in the page). 
- */ -static inline int set_mce_nospec(unsigned long pfn, bool unmap) -{ - unsigned long decoy_addr; - int rc; - - /* SGX pages are not in the 1:1 map */ - if (arch_is_platform_page(pfn << PAGE_SHIFT)) - return 0; - /* - * We would like to just call: - * set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1); - * but doing that would radically increase the odds of a - * speculative access to the poison page because we'd have - * the virtual address of the kernel 1:1 mapping sitting - * around in registers. - * Instead we get tricky. We create a non-canonical address - * that looks just like the one we want, but has bit 63 flipped. - * This relies on set_memory_XX() properly sanitizing any __pa() - * results with __PHYSICAL_MASK or PTE_PFN_MASK. - */ - decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); - - if (unmap) - rc = set_memory_np(decoy_addr, 1); - else - rc = set_memory_uc(decoy_addr, 1); - if (rc) - pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); - return rc; -} -#define set_mce_nospec set_mce_nospec - -/* Restore full speculative operation to the pfn. */ -static inline int clear_mce_nospec(unsigned long pfn) -{ - return set_memory_wb((unsigned long) pfn_to_kaddr(pfn), 1); -} -#define clear_mce_nospec clear_mce_nospec -#else -/* - * Few people would run a 32-bit kernel on a machine that supports - * recoverable errors because they have too much memory to boot 32-bit. - */ -#endif - #endif /* _ASM_X86_SET_MEMORY_H */ diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index abf5ed76e4b7..0caf4b0edcbc 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -29,7 +30,6 @@ #include #include #include -#include #include #include @@ -1816,7 +1816,7 @@ static inline int cpa_clear_pages_array(struct page **pages, int numpages, } /* - * _set_memory_prot is an internal helper for callers that have been passed + * __set_memory_prot is an internal helper for callers that have been passed * a pgprot_t value from upper layers and a reservation has already been taken. * If you want to set the pgprot to a specific page protocol, use the * set_memory_xx() functions. @@ -1925,6 +1925,52 @@ int set_memory_wb(unsigned long addr, int numpages) } EXPORT_SYMBOL(set_memory_wb); +/* + * Prevent speculative access to the page by either unmapping + * it (if we do not require access to any part of the page) or + * marking it uncacheable (if we want to try to retrieve data + * from non-poisoned lines in the page). + */ +#ifdef CONFIG_X86_64 +int set_mce_nospec(unsigned long pfn, bool unmap) +{ + unsigned long decoy_addr; + int rc; + + /* SGX pages are not in the 1:1 map */ + if (arch_is_platform_page(pfn << PAGE_SHIFT)) + return 0; + /* + * We would like to just call: + * set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1); + * but doing that would radically increase the odds of a + * speculative access to the poison page because we'd have + * the virtual address of the kernel 1:1 mapping sitting + * around in registers. + * Instead we get tricky. We create a non-canonical address + * that looks just like the one we want, but has bit 63 flipped. + * This relies on set_memory_XX() properly sanitizing any __pa() + * results with __PHYSICAL_MASK or PTE_PFN_MASK. 
+ */ + decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); + + if (unmap) + rc = set_memory_np(decoy_addr, 1); + else + rc = set_memory_uc(decoy_addr, 1); + if (rc) + pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); + return rc; +} + +/* Restore full speculative operation to the pfn. */ +int clear_mce_nospec(unsigned long pfn) +{ + return set_memory_wb((unsigned long) pfn_to_kaddr(pfn), 1); +} +EXPORT_SYMBOL_GPL(clear_mce_nospec); +#endif /* CONFIG_X86_64 */ + int set_memory_x(unsigned long addr, int numpages) { if (!(__supported_pte_mask & _PAGE_NX)) diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h index f36be5166c19..683a6c3f7179 100644 --- a/include/linux/set_memory.h +++ b/include/linux/set_memory.h @@ -42,14 +42,14 @@ static inline bool can_set_direct_map(void) #endif #endif /* CONFIG_ARCH_HAS_SET_DIRECT_MAP */ -#ifndef set_mce_nospec +#ifdef CONFIG_X86_64 +int set_mce_nospec(unsigned long pfn, bool unmap); +int clear_mce_nospec(unsigned long pfn); +#else static inline int set_mce_nospec(unsigned long pfn, bool unmap) { return 0; } -#endif - -#ifndef clear_mce_nospec static inline int clear_mce_nospec(unsigned long pfn) { return 0; -- cgit v1.2.3-59-g8ed1b From a7fed5c0431dbfa707037848830f980e0f93cfb3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 May 2022 13:39:34 +0200 Subject: x86/nmi: Make register_nmi_handler() more robust register_nmi_handler() has no sanity check whether a handler has been registered already. Such an unintended double-add leads to list corruption and hard to diagnose problems during the next NMI handling. Init the list head in the static NMI action struct and check it for being empty in register_nmi_handler(). [ bp: Fixups. ] Reported-by: Sean Christopherson Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/lkml/20220511234332.3654455-1-seanjc@google.com --- arch/x86/include/asm/nmi.h | 1 + arch/x86/kernel/nmi.c | 12 ++++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 1cb9c17a4cb4..5c5f1e56c404 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -47,6 +47,7 @@ struct nmiaction { #define register_nmi_handler(t, fn, fg, n, init...) 
\ ({ \ static struct nmiaction init fn##_na = { \ + .list = LIST_HEAD_INIT(fn##_na.list), \ .handler = (fn), \ .name = (n), \ .flags = (fg), \ diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index e73f7df362f5..cec0bfa3bc04 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -157,7 +157,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action) struct nmi_desc *desc = nmi_to_desc(type); unsigned long flags; - if (!action->handler) + if (WARN_ON_ONCE(!action->handler || !list_empty(&action->list))) return -EINVAL; raw_spin_lock_irqsave(&desc->lock, flags); @@ -177,7 +177,7 @@ int __register_nmi_handler(unsigned int type, struct nmiaction *action) list_add_rcu(&action->list, &desc->head); else list_add_tail_rcu(&action->list, &desc->head); - + raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } @@ -186,7 +186,7 @@ EXPORT_SYMBOL(__register_nmi_handler); void unregister_nmi_handler(unsigned int type, const char *name) { struct nmi_desc *desc = nmi_to_desc(type); - struct nmiaction *n; + struct nmiaction *n, *found = NULL; unsigned long flags; raw_spin_lock_irqsave(&desc->lock, flags); @@ -200,12 +200,16 @@ void unregister_nmi_handler(unsigned int type, const char *name) WARN(in_nmi(), "Trying to free NMI (%s) from NMI context!\n", n->name); list_del_rcu(&n->list); + found = n; break; } } raw_spin_unlock_irqrestore(&desc->lock, flags); - synchronize_rcu(); + if (found) { + synchronize_rcu(); + INIT_LIST_HEAD(&found->list); + } } EXPORT_SYMBOL_GPL(unregister_nmi_handler); -- cgit v1.2.3-59-g8ed1b From c2df0a6af177b6c06a859806a876f92b072dc624 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Sun, 15 May 2022 20:42:04 +0200 Subject: locking/atomic/x86: Introduce arch_try_cmpxchg64 Introduce arch_try_cmpxchg64 for 64-bit and 32-bit targets to improve code using cmpxchg64. On 64-bit targets, the generated assembly improves from: ab: 89 c8 mov %ecx,%eax ad: 48 89 4c 24 60 mov %rcx,0x60(%rsp) b2: 83 e0 fd and $0xfffffffd,%eax b5: 89 54 24 64 mov %edx,0x64(%rsp) b9: 88 44 24 60 mov %al,0x60(%rsp) bd: 48 89 c8 mov %rcx,%rax c0: c6 44 24 62 f2 movb $0xf2,0x62(%rsp) c5: 48 8b 74 24 60 mov 0x60(%rsp),%rsi ca: f0 49 0f b1 34 24 lock cmpxchg %rsi,(%r12) d0: 48 39 c1 cmp %rax,%rcx d3: 75 cf jne a4 to: b3: 89 c2 mov %eax,%edx b5: 48 89 44 24 60 mov %rax,0x60(%rsp) ba: 83 e2 fd and $0xfffffffd,%edx bd: 89 4c 24 64 mov %ecx,0x64(%rsp) c1: 88 54 24 60 mov %dl,0x60(%rsp) c5: c6 44 24 62 f2 movb $0xf2,0x62(%rsp) ca: 48 8b 54 24 60 mov 0x60(%rsp),%rdx cf: f0 48 0f b1 13 lock cmpxchg %rdx,(%rbx) d4: 75 d5 jne ab where a move and a compare after cmpxchg is saved. The improvements for 32-bit targets are even more noticeable, because dual-word compare after cmpxchg8b gets eliminated. 
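An illustrative sketch (not from the patch) of why the new primitive helps: try_cmpxchg64() reports success, and on failure it rewrites the expected value through the pointer, so a typical update loop no longer needs its own reload-and-compare. Using a hypothetical 64-bit counter:

/* Illustrative only; 'counter' is a made-up example variable. */
static void counter_add(u64 *counter, u64 delta)
{
        u64 old = READ_ONCE(*counter);

        /* Before: compare the returned value by hand and retry. */
        for (;;) {
                u64 cur = arch_cmpxchg64(counter, old, old + delta);

                if (cur == old)         /* extra compare (and move) per pass */
                        break;
                old = cur;
        }
}

static void counter_add_try(u64 *counter, u64 delta)
{
        u64 old = READ_ONCE(*counter);

        /* After: failure updates 'old' in place, success ends the loop. */
        do {
        } while (!arch_try_cmpxchg64(counter, &old, old + delta));
}
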
Signed-off-by: Uros Bizjak Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220515184205.103089-3-ubizjak@gmail.com --- arch/x86/include/asm/cmpxchg_32.h | 21 +++++++++++++++++++++ arch/x86/include/asm/cmpxchg_64.h | 6 ++++++ 2 files changed, 27 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index 0a7fe0321613..215f5a65790f 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -42,6 +42,9 @@ static inline void set_64bit(volatile u64 *ptr, u64 value) #define arch_cmpxchg64_local(ptr, o, n) \ ((__typeof__(*(ptr)))__cmpxchg64_local((ptr), (unsigned long long)(o), \ (unsigned long long)(n))) +#define arch_try_cmpxchg64(ptr, po, n) \ + __try_cmpxchg64((ptr), (unsigned long long *)(po), \ + (unsigned long long)(n)) #endif static inline u64 __cmpxchg64(volatile u64 *ptr, u64 old, u64 new) @@ -70,6 +73,24 @@ static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new) return prev; } +static inline bool __try_cmpxchg64(volatile u64 *ptr, u64 *pold, u64 new) +{ + bool success; + u64 old = *pold; + asm volatile(LOCK_PREFIX "cmpxchg8b %[ptr]" + CC_SET(z) + : CC_OUT(z) (success), + [ptr] "+m" (*ptr), + "+A" (old) + : "b" ((u32)new), + "c" ((u32)(new >> 32)) + : "memory"); + + if (unlikely(!success)) + *pold = old; + return success; +} + #ifndef CONFIG_X86_CMPXCHG64 /* * Building a kernel capable running on 80386 and 80486. It may be necessary diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index 072e5459fe2f..250187ac8248 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -19,6 +19,12 @@ static inline void set_64bit(volatile u64 *ptr, u64 val) arch_cmpxchg_local((ptr), (o), (n)); \ }) +#define arch_try_cmpxchg64(ptr, po, n) \ +({ \ + BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ + arch_try_cmpxchg((ptr), (po), (n)); \ +}) + #define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX16) #endif /* _ASM_X86_CMPXCHG_64_H */ -- cgit v1.2.3-59-g8ed1b From 47f33de4aafb2f5e43d480d590a939d0f1d566a9 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 12 Apr 2022 20:49:08 +0800 Subject: x86/sev: Mark the code returning to user space as syscall gap When returning to user space, %rsp is user-controlled value. If it is a SNP-guest and the hypervisor decides to mess with the code-page for this path while a CPU is executing it, a potential #VC could hit in the syscall return path and mislead the #VC handler. So make ip_within_syscall_gap() return true in this case. 
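A hedged sketch (not part of the patch) of how the predicate is consumed: code handling an exception that may hit inside this window treats regs->sp as untrusted and moves to a known-good stack before doing anything else. Placeholder helper names are used here:

/* Sketch only: switch_to_trusted_stack() is a placeholder, not a kernel API. */
static void handle_exception_like_vc(struct pt_regs *regs)
{
        /*
         * In the gap around SYSCALL entry / SYSRET exit, regs->sp still
         * holds a user-controlled value, so nothing may run on it.
         */
        if (user_mode(regs) || ip_within_syscall_gap(regs))
                switch_to_trusted_stack();

        /* ... regular handling ... */
}
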
Signed-off-by: Lai Jiangshan Signed-off-by: Borislav Petkov Acked-by: Joerg Roedel Link: https://lore.kernel.org/r/20220412124909.10467-1-jiangshanlai@gmail.com --- arch/x86/entry/entry_64.S | 2 ++ arch/x86/entry/entry_64_compat.S | 2 ++ arch/x86/include/asm/proto.h | 4 ++++ arch/x86/include/asm/ptrace.h | 4 ++++ 4 files changed, 12 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f7bd8001bf07..2fd8a5cad853 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -215,8 +215,10 @@ syscall_return_via_sysret: popq %rdi popq %rsp +SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL) swapgs sysretq +SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL) SYM_CODE_END(entry_SYSCALL_64) /* diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 4fdb007cddbd..3c0e14960e2b 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -297,6 +297,7 @@ sysret32_from_system_call: * code. We zero R8-R10 to avoid info leaks. */ movq RSP-ORIG_RAX(%rsp), %rsp +SYM_INNER_LABEL(entry_SYSRETL_compat_unsafe_stack, SYM_L_GLOBAL) /* * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored @@ -314,6 +315,7 @@ sysret32_from_system_call: xorl %r10d, %r10d swapgs sysretl +SYM_INNER_LABEL(entry_SYSRETL_compat_end, SYM_L_GLOBAL) SYM_CODE_END(entry_SYSCALL_compat) /* diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index feed36d44d04..f042cfc9938f 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -13,6 +13,8 @@ void syscall_init(void); #ifdef CONFIG_X86_64 void entry_SYSCALL_64(void); void entry_SYSCALL_64_safe_stack(void); +void entry_SYSRETQ_unsafe_stack(void); +void entry_SYSRETQ_end(void); long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2); #endif @@ -28,6 +30,8 @@ void entry_SYSENTER_compat(void); void __end_entry_SYSENTER_compat(void); void entry_SYSCALL_compat(void); void entry_SYSCALL_compat_safe_stack(void); +void entry_SYSRETL_compat_unsafe_stack(void); +void entry_SYSRETL_compat_end(void); void entry_INT80_compat(void); #ifdef CONFIG_XEN_PV void xen_entry_INT80_compat(void); diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 4357e0f2cd5f..f4db78b09c8f 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -186,9 +186,13 @@ static __always_inline bool ip_within_syscall_gap(struct pt_regs *regs) bool ret = (regs->ip >= (unsigned long)entry_SYSCALL_64 && regs->ip < (unsigned long)entry_SYSCALL_64_safe_stack); + ret = ret || (regs->ip >= (unsigned long)entry_SYSRETQ_unsafe_stack && + regs->ip < (unsigned long)entry_SYSRETQ_end); #ifdef CONFIG_IA32_EMULATION ret = ret || (regs->ip >= (unsigned long)entry_SYSCALL_compat && regs->ip < (unsigned long)entry_SYSCALL_compat_safe_stack); + ret = ret || (regs->ip >= (unsigned long)entry_SYSRETL_compat_unsafe_stack && + regs->ip < (unsigned long)entry_SYSRETL_compat_end); #endif return ret; -- cgit v1.2.3-59-g8ed1b From fa6dae5d82081e8d9f8e6a2baf7149442a6c1ba5 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 19 May 2022 17:21:48 +0200 Subject: x86/PCI: Add kernel cmdline options to use/ignore E820 reserved regions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some firmware supplies PCI host bridge _CRS that includes address space unusable by PCI devices, e.g., space occupied by host bridge registers or used by hidden PCI devices. 
To avoid this unusable space, Linux currently excludes E820 reserved regions from _CRS windows; see 4dc2287c1805 ("x86: avoid E820 regions when allocating address space"). However, this use of E820 reserved regions to clip things out of _CRS is not supported by ACPI, UEFI, or PCI Firmware specs, and some systems have E820 reserved regions that cover the entire memory window from _CRS. 4dc2287c1805 clips the entire window, leaving no space for hot-added or uninitialized PCI devices. For example, from a Lenovo IdeaPad 3 15IIL 81WE: BIOS-e820: [mem 0x4bc50000-0xcfffffff] reserved pci_bus 0000:00: root bus resource [mem 0x65400000-0xbfffffff window] pci 0000:00:15.0: BAR 0: [mem 0x00000000-0x00000fff 64bit] pci 0000:00:15.0: BAR 0: no space for [mem size 0x00001000 64bit] Future patches will add quirks to enable/disable E820 clipping automatically. Add a "pci=no_e820" kernel command line option to disable clipping with E820 reserved regions. Also add a matching "pci=use_e820" option to enable clipping with E820 reserved regions if that has been disabled by default by further patches in this patch-set. Both options taint the kernel because they are intended for debugging and workaround purposes until a quirk can set them automatically. [bhelgaas: commit log, add printk] Link: https://bugzilla.redhat.com/show_bug.cgi?id=1868899 Lenovo IdeaPad 3 Link: https://lore.kernel.org/r/20220519152150.6135-2-hdegoede@redhat.com Signed-off-by: Hans de Goede Signed-off-by: Bjorn Helgaas Acked-by: Rafael J. Wysocki Cc: Benoit GrĂ©goire Cc: Hui Wang --- Documentation/admin-guide/kernel-parameters.txt | 9 +++++++++ arch/x86/include/asm/pci_x86.h | 2 ++ arch/x86/pci/acpi.c | 18 ++++++++++++++++-- arch/x86/pci/common.c | 8 ++++++++ 4 files changed, 35 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3f1cc5e317ed..2477b639d5c4 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4066,6 +4066,15 @@ please report a bug. nocrs [X86] Ignore PCI host bridge windows from ACPI. If you need to use this, please report a bug. + use_e820 [X86] Use E820 reservations to exclude parts of + PCI host bridge windows. This is a workaround + for BIOS defects in host bridge _CRS methods. + If you need to use this, please report a bug to + . + no_e820 [X86] Ignore E820 reservations for PCI host + bridge windows. This is the default on modern + hardware. If you need to use this, please report + a bug to . routeirq Do IRQ routing for all PCI devices. 
This is normally done in pci_enable_device(), so this option is a temporary workaround diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index a0627dfae541..ce3fd3311772 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -42,6 +42,8 @@ do { \ #define PCI_ROOT_NO_CRS 0x100000 #define PCI_NOASSIGN_BARS 0x200000 #define PCI_BIG_ROOT_WINDOW 0x400000 +#define PCI_USE_E820 0x800000 +#define PCI_NO_E820 0x1000000 extern unsigned int pci_probe; extern unsigned long pirq_table_addr; diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 562c81a51ea0..c61c815efedb 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -20,6 +20,7 @@ struct pci_root_info { #endif }; +static bool pci_use_e820 = true; static bool pci_use_crs = true; static bool pci_ignore_seg; @@ -161,6 +162,17 @@ void __init pci_acpi_crs_quirks(void) "if necessary, use \"pci=%s\" and report a bug\n", pci_use_crs ? "Using" : "Ignoring", pci_use_crs ? "nocrs" : "use_crs"); + + /* "pci=use_e820"/"pci=no_e820" on the kernel cmdline takes precedence */ + if (pci_probe & PCI_NO_E820) + pci_use_e820 = false; + else if (pci_probe & PCI_USE_E820) + pci_use_e820 = true; + + printk(KERN_INFO "PCI: %s E820 reservations for host bridge windows\n", + pci_use_e820 ? "Using" : "Ignoring"); + if (pci_probe & (PCI_NO_E820 | PCI_USE_E820)) + printk(KERN_INFO "PCI: Please notify linux-pci@vger.kernel.org so future kernels can this automatically\n"); } #ifdef CONFIG_PCI_MMCONFIG @@ -301,8 +313,10 @@ static int pci_acpi_root_prepare_resources(struct acpi_pci_root_info *ci) status = acpi_pci_probe_root_resources(ci); - resource_list_for_each_entry(entry, &ci->resources) - remove_e820_regions(&device->dev, entry->res); + if (pci_use_e820) { + resource_list_for_each_entry(entry, &ci->resources) + remove_e820_regions(&device->dev, entry->res); + } if (pci_use_crs) { resource_list_for_each_entry_safe(entry, tmp, &ci->resources) diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 9e1e6b8d8876..ddb798603201 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -595,6 +595,14 @@ char *__init pcibios_setup(char *str) } else if (!strcmp(str, "nocrs")) { pci_probe |= PCI_ROOT_NO_CRS; return NULL; + } else if (!strcmp(str, "use_e820")) { + pci_probe |= PCI_USE_E820; + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); + return NULL; + } else if (!strcmp(str, "no_e820")) { + pci_probe |= PCI_NO_E820; + add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK); + return NULL; #ifdef CONFIG_PHYS_ADDR_T_64BIT } else if (!strcmp(str, "big_root_window")) { pci_probe |= PCI_BIG_ROOT_WINDOW; -- cgit v1.2.3-59-g8ed1b From 69505e3d9a39a988aaed9b58aa6b3482238f6516 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 12 May 2022 06:56:23 -0700 Subject: bug: Use normal relative pointers in 'struct bug_entry' With CONFIG_GENERIC_BUG_RELATIVE_POINTERS, the addr/file relative pointers are calculated weirdly: based on the beginning of the bug_entry struct address, rather than their respective pointer addresses. Make the relative pointers less surprising to both humans and tools by calculating them the normal way. 
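As a small illustration (not from the patch): a '.'-relative entry stores the distance from the field itself to the target, so decoding adds the displacement to the field's own address instead of to the start of the struct. With an illustrative layout:

/* Sketch; the real struct bug_entry lives in include/asm-generic/bug.h. */
struct bug_entry_sketch {
        signed int      bug_addr_disp;  /* emitted as "target - ." */
        signed int      file_disp;
        unsigned short  line;
        unsigned short  flags;
};

/* Old scheme: displacement was relative to the start of the entry. */
static unsigned long old_bug_addr(const struct bug_entry_sketch *bug)
{
        return (unsigned long)bug + bug->bug_addr_disp;
}

/* New scheme: displacement is relative to the field holding it. */
static unsigned long new_bug_addr(const struct bug_entry_sketch *bug)
{
        return (unsigned long)&bug->bug_addr_disp + bug->bug_addr_disp;
}
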
Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Mark Rutland Acked-by: Peter Zijlstra (Intel) Acked-by: Sven Schnelle # s390 Acked-by: Michael Ellerman (powerpc) Acked-by: Catalin Marinas Tested-by: Mark Rutland [arm64] Link: https://lkml.kernel.org/r/f0e05be797a16f4fc2401eeb88c8450dcbe61df6.1652362951.git.jpoimboe@kernel.org --- arch/arm64/include/asm/asm-bug.h | 4 ++-- arch/powerpc/include/asm/bug.h | 14 ++++++++------ arch/riscv/include/asm/bug.h | 4 ++-- arch/s390/include/asm/bug.h | 5 +++-- arch/x86/include/asm/bug.h | 2 +- lib/bug.c | 15 +++++++-------- 6 files changed, 23 insertions(+), 21 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/arm64/include/asm/asm-bug.h b/arch/arm64/include/asm/asm-bug.h index 03f52f84a4f3..c762038ba400 100644 --- a/arch/arm64/include/asm/asm-bug.h +++ b/arch/arm64/include/asm/asm-bug.h @@ -14,7 +14,7 @@ 14472: .string file; \ .popsection; \ \ - .long 14472b - 14470b; \ + .long 14472b - .; \ .short line; #else #define _BUGVERBOSE_LOCATION(file, line) @@ -25,7 +25,7 @@ #define __BUG_ENTRY(flags) \ .pushsection __bug_table,"aw"; \ .align 2; \ - 14470: .long 14471f - 14470b; \ + 14470: .long 14471f - .; \ _BUGVERBOSE_LOCATION(__FILE__, __LINE__) \ .short flags; \ .popsection; \ diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h index ecbae1832de3..61a4736355c2 100644 --- a/arch/powerpc/include/asm/bug.h +++ b/arch/powerpc/include/asm/bug.h @@ -13,7 +13,8 @@ #ifdef CONFIG_DEBUG_BUGVERBOSE .macro __EMIT_BUG_ENTRY addr,file,line,flags .section __bug_table,"aw" -5001: .4byte \addr - 5001b, 5002f - 5001b +5001: .4byte \addr - . + .4byte 5002f - . .short \line, \flags .org 5001b+BUG_ENTRY_SIZE .previous @@ -24,7 +25,7 @@ #else .macro __EMIT_BUG_ENTRY addr,file,line,flags .section __bug_table,"aw" -5001: .4byte \addr - 5001b +5001: .4byte \addr - . .short \flags .org 5001b+BUG_ENTRY_SIZE .previous @@ -49,15 +50,16 @@ #ifdef CONFIG_DEBUG_BUGVERBOSE #define _EMIT_BUG_ENTRY \ ".section __bug_table,\"aw\"\n" \ - "2:\t.4byte 1b - 2b, %0 - 2b\n" \ - "\t.short %1, %2\n" \ + "2: .4byte 1b - .\n" \ + " .4byte %0 - .\n" \ + " .short %1, %2\n" \ ".org 2b+%3\n" \ ".previous\n" #else #define _EMIT_BUG_ENTRY \ ".section __bug_table,\"aw\"\n" \ - "2:\t.4byte 1b - 2b\n" \ - "\t.short %2\n" \ + "2: .4byte 1b - .\n" \ + " .short %2\n" \ ".org 2b+%3\n" \ ".previous\n" #endif diff --git a/arch/riscv/include/asm/bug.h b/arch/riscv/include/asm/bug.h index d3804a2f9aad..1aaea81fb141 100644 --- a/arch/riscv/include/asm/bug.h +++ b/arch/riscv/include/asm/bug.h @@ -30,8 +30,8 @@ typedef u32 bug_insn_t; #ifdef CONFIG_GENERIC_BUG_RELATIVE_POINTERS -#define __BUG_ENTRY_ADDR RISCV_INT " 1b - 2b" -#define __BUG_ENTRY_FILE RISCV_INT " %0 - 2b" +#define __BUG_ENTRY_ADDR RISCV_INT " 1b - ." +#define __BUG_ENTRY_FILE RISCV_INT " %0 - ." 
#else #define __BUG_ENTRY_ADDR RISCV_PTR " 1b" #define __BUG_ENTRY_FILE RISCV_PTR " %0" diff --git a/arch/s390/include/asm/bug.h b/arch/s390/include/asm/bug.h index 0b25f28351ed..aebe1e22c7be 100644 --- a/arch/s390/include/asm/bug.h +++ b/arch/s390/include/asm/bug.h @@ -15,7 +15,8 @@ "1: .asciz \""__FILE__"\"\n" \ ".previous\n" \ ".section __bug_table,\"awM\",@progbits,%2\n" \ - "2: .long 0b-2b,1b-2b\n" \ + "2: .long 0b-.\n" \ + " .long 1b-.\n" \ " .short %0,%1\n" \ " .org 2b+%2\n" \ ".previous\n" \ @@ -30,7 +31,7 @@ asm_inline volatile( \ "0: mc 0,0\n" \ ".section __bug_table,\"awM\",@progbits,%1\n" \ - "1: .long 0b-1b\n" \ + "1: .long 0b-.\n" \ " .short %0\n" \ " .org 1b+%1\n" \ ".previous\n" \ diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 4d20a293c6fd..76fbe24cccf9 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -18,7 +18,7 @@ #ifdef CONFIG_X86_32 # define __BUG_REL(val) ".long " __stringify(val) #else -# define __BUG_REL(val) ".long " __stringify(val) " - 2b" +# define __BUG_REL(val) ".long " __stringify(val) " - ." #endif #ifdef CONFIG_DEBUG_BUGVERBOSE diff --git a/lib/bug.c b/lib/bug.c index 45a0584f6541..c223a2575b72 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -6,8 +6,7 @@ CONFIG_BUG - emit BUG traps. Nothing happens without this. CONFIG_GENERIC_BUG - enable this code. - CONFIG_GENERIC_BUG_RELATIVE_POINTERS - use 32-bit pointers relative to - the containing struct bug_entry for bug_addr and file. + CONFIG_GENERIC_BUG_RELATIVE_POINTERS - use 32-bit relative pointers for bug_addr and file CONFIG_DEBUG_BUGVERBOSE - emit full file+line information for each BUG CONFIG_BUG and CONFIG_DEBUG_BUGVERBOSE are potentially user-settable @@ -53,10 +52,10 @@ extern struct bug_entry __start___bug_table[], __stop___bug_table[]; static inline unsigned long bug_addr(const struct bug_entry *bug) { -#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS - return bug->bug_addr; +#ifdef CONFIG_GENERIC_BUG_RELATIVE_POINTERS + return (unsigned long)&bug->bug_addr_disp + bug->bug_addr_disp; #else - return (unsigned long)bug + bug->bug_addr_disp; + return bug->bug_addr; #endif } @@ -131,10 +130,10 @@ void bug_get_file_line(struct bug_entry *bug, const char **file, unsigned int *line) { #ifdef CONFIG_DEBUG_BUGVERBOSE -#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS - *file = bug->file; +#ifdef CONFIG_GENERIC_BUG_RELATIVE_POINTERS + *file = (const char *)&bug->file_disp + bug->file_disp; #else - *file = (const char *)bug + bug->file_disp; + *file = bug->file; #endif *line = bug->line; #else -- cgit v1.2.3-59-g8ed1b From d936411dc9caeb3edb992e39c33d4d1d81ca8c08 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 20 May 2022 12:12:30 +0200 Subject: x86: Remove empty files Remove empty files which were supposed to get removed with the respective commits removing the functionality in them: $ find arch/x86/ -empty arch/x86/lib/mmx_32.c arch/x86/include/asm/fpu/internal.h arch/x86/include/asm/mmx.h Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220520101723.12006-1-bp@alien8.de --- arch/x86/include/asm/fpu/internal.h | 0 arch/x86/include/asm/mmx.h | 0 arch/x86/lib/mmx_32.c | 0 3 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 arch/x86/include/asm/fpu/internal.h delete mode 100644 arch/x86/include/asm/mmx.h delete mode 100644 arch/x86/lib/mmx_32.c (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git 
a/arch/x86/include/asm/mmx.h b/arch/x86/include/asm/mmx.h deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c deleted file mode 100644 index e69de29bb2d1..000000000000 -- cgit v1.2.3-59-g8ed1b From aadd1b678ebef81cc3ed23663253ae3749cdc673 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 20 May 2022 16:57:52 -0700 Subject: x86/alternative: Introduce text_poke_set Introduce a memset like API for text_poke. This will be used to fill the unused RX memory with illegal instructions. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/bpf/20220520235758.1858153-3-song@kernel.org --- arch/x86/include/asm/text-patching.h | 1 + arch/x86/kernel/alternative.c | 67 ++++++++++++++++++++++++++++++------ 2 files changed, 58 insertions(+), 10 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index d20ab0921480..1cc15528ce29 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -45,6 +45,7 @@ extern void *text_poke(void *addr, const void *opcode, size_t len); extern void text_poke_sync(void); extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); extern void *text_poke_copy(void *addr, const void *opcode, size_t len); +extern void *text_poke_set(void *addr, int c, size_t len); extern int poke_int3_handler(struct pt_regs *regs); extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index d374cb3cf024..7563b5bc8328 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -994,7 +994,21 @@ static inline void unuse_temporary_mm(temp_mm_state_t prev_state) __ro_after_init struct mm_struct *poking_mm; __ro_after_init unsigned long poking_addr; -static void *__text_poke(void *addr, const void *opcode, size_t len) +static void text_poke_memcpy(void *dst, const void *src, size_t len) +{ + memcpy(dst, src, len); +} + +static void text_poke_memset(void *dst, const void *src, size_t len) +{ + int c = *(const int *)src; + + memset(dst, c, len); +} + +typedef void text_poke_f(void *dst, const void *src, size_t len); + +static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len) { bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE; struct page *pages[2] = {NULL}; @@ -1059,7 +1073,7 @@ static void *__text_poke(void *addr, const void *opcode, size_t len) prev = use_temporary_mm(poking_mm); kasan_disable_current(); - memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len); + func((u8 *)poking_addr + offset_in_page(addr), src, len); kasan_enable_current(); /* @@ -1087,11 +1101,13 @@ static void *__text_poke(void *addr, const void *opcode, size_t len) (cross_page_boundary ? 2 : 1) * PAGE_SIZE, PAGE_SHIFT, false); - /* - * If the text does not match what we just wrote then something is - * fundamentally screwy; there's nothing we can really do about that. - */ - BUG_ON(memcmp(addr, opcode, len)); + if (func == text_poke_memcpy) { + /* + * If the text does not match what we just wrote then something is + * fundamentally screwy; there's nothing we can really do about that. 
+ */ + BUG_ON(memcmp(addr, src, len)); + } local_irq_restore(flags); pte_unmap_unlock(ptep, ptl); @@ -1118,7 +1134,7 @@ void *text_poke(void *addr, const void *opcode, size_t len) { lockdep_assert_held(&text_mutex); - return __text_poke(addr, opcode, len); + return __text_poke(text_poke_memcpy, addr, opcode, len); } /** @@ -1137,7 +1153,7 @@ void *text_poke(void *addr, const void *opcode, size_t len) */ void *text_poke_kgdb(void *addr, const void *opcode, size_t len) { - return __text_poke(addr, opcode, len); + return __text_poke(text_poke_memcpy, addr, opcode, len); } /** @@ -1167,7 +1183,38 @@ void *text_poke_copy(void *addr, const void *opcode, size_t len) s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched); - __text_poke((void *)ptr, opcode + patched, s); + __text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s); + patched += s; + } + mutex_unlock(&text_mutex); + return addr; +} + +/** + * text_poke_set - memset into (an unused part of) RX memory + * @addr: address to modify + * @c: the byte to fill the area with + * @len: length to copy, could be more than 2x PAGE_SIZE + * + * This is useful to overwrite unused regions of RX memory with illegal + * instructions. + */ +void *text_poke_set(void *addr, int c, size_t len) +{ + unsigned long start = (unsigned long)addr; + size_t patched = 0; + + if (WARN_ON_ONCE(core_kernel_text(start))) + return NULL; + + mutex_lock(&text_mutex); + while (patched < len) { + unsigned long ptr = start + patched; + size_t s; + + s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched); + + __text_poke(text_poke_memset, (void *)ptr, (void *)&c, s); patched += s; } mutex_unlock(&text_mutex); -- cgit v1.2.3-59-g8ed1b From 5d7c854593a460706dacf8e1b16c9bdcb1c2d7bb Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 28 Mar 2022 08:26:48 +0200 Subject: livepatch: Remove klp_arch_set_pc() and asm/livepatch.h All three versions of klp_arch_set_pc() do exactly the same: they call ftrace_instruction_pointer_set(). Call ftrace_instruction_pointer_set() directly and remove klp_arch_set_pc(). As klp_arch_set_pc() was the only thing remaining in asm/livepatch.h on x86 and s390, remove asm/livepatch.h livepatch.h remains on powerpc but its content is exclusively used by powerpc specific code. 
Signed-off-by: Christophe Leroy Acked-by: Petr Mladek Acked-by: Peter Zijlstra (Intel) Acked-by: Miroslav Benes Signed-off-by: Petr Mladek --- MAINTAINERS | 2 -- arch/powerpc/include/asm/livepatch.h | 10 +--------- arch/powerpc/kernel/irq.c | 1 - arch/powerpc/kernel/setup_64.c | 2 +- arch/s390/include/asm/livepatch.h | 22 ---------------------- arch/x86/include/asm/livepatch.h | 20 -------------------- include/linux/livepatch.h | 2 -- kernel/livepatch/patch.c | 2 +- 8 files changed, 3 insertions(+), 58 deletions(-) delete mode 100644 arch/s390/include/asm/livepatch.h delete mode 100644 arch/x86/include/asm/livepatch.h (limited to 'arch/x86/include') diff --git a/MAINTAINERS b/MAINTAINERS index 741825cff43c..589c61412e3b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11338,8 +11338,6 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/livepatching/livepatching.g F: Documentation/ABI/testing/sysfs-kernel-livepatch F: Documentation/livepatch/ F: arch/powerpc/include/asm/livepatch.h -F: arch/s390/include/asm/livepatch.h -F: arch/x86/include/asm/livepatch.h F: include/linux/livepatch.h F: kernel/livepatch/ F: lib/livepatch/ diff --git a/arch/powerpc/include/asm/livepatch.h b/arch/powerpc/include/asm/livepatch.h index 1c60094ea0cd..d044a1fd4f44 100644 --- a/arch/powerpc/include/asm/livepatch.h +++ b/arch/powerpc/include/asm/livepatch.h @@ -7,17 +7,9 @@ #ifndef _ASM_POWERPC_LIVEPATCH_H #define _ASM_POWERPC_LIVEPATCH_H -#include -#include +#include #include -#ifdef CONFIG_LIVEPATCH -static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) -{ - ftrace_instruction_pointer_set(fregs, ip); -} -#endif /* CONFIG_LIVEPATCH */ - #ifdef CONFIG_LIVEPATCH_64 static inline void klp_init_thread_info(struct task_struct *p) { diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 752fb182eacb..fb99e3fa034d 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -63,7 +63,6 @@ #include #include #include -#include #include #include diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index e547066a06aa..2c1e9a5ac6ca 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -59,7 +59,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/s390/include/asm/livepatch.h b/arch/s390/include/asm/livepatch.h deleted file mode 100644 index 5209f223331a..000000000000 --- a/arch/s390/include/asm/livepatch.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ -/* - * livepatch.h - s390-specific Kernel Live Patching Core - * - * Copyright (c) 2013-2015 SUSE - * Authors: Jiri Kosina - * Vojtech Pavlik - * Jiri Slaby - */ - -#ifndef ASM_LIVEPATCH_H -#define ASM_LIVEPATCH_H - -#include -#include - -static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) -{ - ftrace_instruction_pointer_set(fregs, ip); -} - -#endif diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h deleted file mode 100644 index 7c5cc6660e4b..000000000000 --- a/arch/x86/include/asm/livepatch.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * livepatch.h - x86-specific Kernel Live Patching Core - * - * Copyright (C) 2014 Seth Jennings - * Copyright (C) 2014 SUSE - */ - -#ifndef _ASM_X86_LIVEPATCH_H -#define _ASM_X86_LIVEPATCH_H - -#include -#include - -static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) -{ - ftrace_instruction_pointer_set(fregs, ip); -} - -#endif /* 
_ASM_X86_LIVEPATCH_H */ diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index 2614247a9781..293e29960c6e 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -16,8 +16,6 @@ #if IS_ENABLED(CONFIG_LIVEPATCH) -#include - /* task patch states */ #define KLP_UNDEFINED -1 #define KLP_UNPATCHED 0 diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index c172bf92b576..4c4f5a776d80 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -118,7 +118,7 @@ static void notrace klp_ftrace_handler(unsigned long ip, if (func->nop) goto unlock; - klp_arch_set_pc(fregs, (unsigned long)func->new_func); + ftrace_instruction_pointer_set(fregs, (unsigned long)func->new_func); unlock: ftrace_test_recursion_unlock(bit); -- cgit v1.2.3-59-g8ed1b From a6a5eb269f6f3a2fe392f725a8d9052190c731e2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 2 May 2022 12:15:23 +0200 Subject: x86/cpu: Elide KCSAN for cpu_has() and friends As x86 uses the headers, the regular forms of all bitops are instrumented with explicit calls to KASAN and KCSAN checks. As these are explicit calls, these are not suppressed by the noinstr function attribute. This can result in calls to those check functions in noinstr code, which objtool warns about: vmlinux.o: warning: objtool: enter_from_user_mode+0x24: call to __kcsan_check_access() leaves .noinstr.text section vmlinux.o: warning: objtool: syscall_enter_from_user_mode+0x28: call to __kcsan_check_access() leaves .noinstr.text section vmlinux.o: warning: objtool: syscall_enter_from_user_mode_prepare+0x24: call to __kcsan_check_access() leaves .noinstr.text section vmlinux.o: warning: objtool: irqentry_enter_from_user_mode+0x24: call to __kcsan_check_access() leaves .noinstr.text section Prevent this by using the arch_*() bitops, which are the underlying bitops without explciit instrumentation. [null: Changelog] Reported-by: kernel test robot Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220502111216.290518605@infradead.org --- arch/x86/include/asm/cpufeature.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 66d3e3b1d24d..ea34cc31b047 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -54,7 +54,7 @@ extern const char * const x86_power_flags[32]; extern const char * const x86_bug_flags[NBUGINTS*32]; #define test_cpu_cap(c, bit) \ - test_bit(bit, (unsigned long *)((c)->x86_capability)) + arch_test_bit(bit, (unsigned long *)((c)->x86_capability)) /* * There are 32 bits/features in each mask word. The high bits -- cgit v1.2.3-59-g8ed1b From 1894a4030582472336c2873cb07911ce67e0d14e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 7 May 2022 13:37:45 +0200 Subject: x86: Always inline on_thread_stack() and current_top_of_stack() Becaues GCC clearly lost it's marbles again... 
vmlinux.o: warning: objtool: enter_from_user_mode+0x4e: call to on_thread_stack() leaves .noinstr.text section vmlinux.o: warning: objtool: syscall_enter_from_user_mode+0x53: call to on_thread_stack() leaves .noinstr.text section vmlinux.o: warning: objtool: syscall_enter_from_user_mode_prepare+0x4e: call to on_thread_stack() leaves .noinstr.text section vmlinux.o: warning: objtool: irqentry_enter_from_user_mode+0x4e: call to on_thread_stack() leaves .noinstr.text section vmlinux.o: warning: objtool: enter_from_user_mode+0x4e: call to current_top_of_stack() leaves .noinstr.text section vmlinux.o: warning: objtool: syscall_enter_from_user_mode+0x53: call to current_top_of_stack() leaves .noinstr.text section vmlinux.o: warning: objtool: syscall_enter_from_user_mode_prepare+0x4e: call to current_top_of_stack() leaves .noinstr.text section vmlinux.o: warning: objtool: irqentry_enter_from_user_mode+0x4e: call to current_top_of_stack() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220526105958.071435483@infradead.org --- arch/x86/include/asm/processor.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 91d0f93a00c7..356308c73951 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -559,7 +559,7 @@ static __always_inline void native_swapgs(void) #endif } -static inline unsigned long current_top_of_stack(void) +static __always_inline unsigned long current_top_of_stack(void) { /* * We can't read directly from tss.sp0: sp0 on x86_32 is special in @@ -569,7 +569,7 @@ static inline unsigned long current_top_of_stack(void) return this_cpu_read_stable(cpu_current_top_of_stack); } -static inline bool on_thread_stack(void) +static __always_inline bool on_thread_stack(void) { return (unsigned long)(current_top_of_stack() - current_stack_pointer) < THREAD_SIZE; -- cgit v1.2.3-59-g8ed1b From 2028a255f4df3af9e759f01f958d3237f825f256 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 20 May 2022 21:27:29 +0200 Subject: x86/extable: Annotate ex_handler_msr_mce() as a dead end Fix vmlinux.o: warning: objtool: fixup_exception+0x2d6: unreachable instruction Signed-off-by: Borislav Petkov Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220520192729.23969-1-bp@alien8.de --- arch/x86/include/asm/extable.h | 8 ++++++-- tools/objtool/check.c | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/extable.h b/arch/x86/include/asm/extable.h index 155c991ba95e..eeed395c3177 100644 --- a/arch/x86/include/asm/extable.h +++ b/arch/x86/include/asm/extable.h @@ -42,9 +42,13 @@ extern int ex_get_fixup_type(unsigned long ip); extern void early_fixup_exception(struct pt_regs *regs, int trapnr); #ifdef CONFIG_X86_MCE -extern void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr); +extern void __noreturn ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr); #else -static inline void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr) { } +static inline void __noreturn ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr) +{ + for (;;) + cpu_relax(); +} #endif #if defined(CONFIG_BPF_JIT) && defined(CONFIG_X86_64) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 7a187dae2fcf..864bb9dd3584 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -187,6 +187,7 @@ static bool __dead_end_function(struct 
objtool_file *file, struct symbol *func, "__invalid_creds", "cpu_startup_entry", "__ubsan_handle_builtin_unreachable", + "ex_handler_msr_mce", }; if (!func) -- cgit v1.2.3-59-g8ed1b From 3e35142ef99fe6b4fe5d834ad43ee13cca10a2dc Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 19 May 2022 14:42:37 +0530 Subject: kexec_file: drop weak attribute from arch_kexec_apply_relocations[_add] Since commit d1bcae833b32f1 ("ELF: Don't generate unused section symbols") [1], binutils (v2.36+) started dropping section symbols that it thought were unused. This isn't an issue in general, but with kexec_file.c, gcc is placing kexec_arch_apply_relocations[_add] into a separate .text.unlikely section and the section symbol ".text.unlikely" is being dropped. Due to this, recordmcount is unable to find a non-weak symbol in .text.unlikely to generate a relocation record against. Address this by dropping the weak attribute from these functions. Instead, follow the existing pattern of having architectures #define the name of the function they want to override in their headers. [1] https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=d1bcae833b32f1 [akpm@linux-foundation.org: arch/s390/include/asm/kexec.h needs linux/module.h] Link: https://lkml.kernel.org/r/20220519091237.676736-1-naveen.n.rao@linux.vnet.ibm.com Signed-off-by: Michael Ellerman Signed-off-by: Naveen N. Rao Cc: "Eric W. Biederman" Cc: Signed-off-by: Andrew Morton --- arch/s390/include/asm/kexec.h | 10 ++++++++++ arch/x86/include/asm/kexec.h | 8 ++++++++ include/linux/kexec.h | 46 +++++++++++++++++++++++++++++++++++-------- kernel/kexec_file.c | 34 -------------------------------- 4 files changed, 56 insertions(+), 42 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/s390/include/asm/kexec.h b/arch/s390/include/asm/kexec.h index 7f3c9ac34bd8..63098df81c9f 100644 --- a/arch/s390/include/asm/kexec.h +++ b/arch/s390/include/asm/kexec.h @@ -9,6 +9,8 @@ #ifndef _S390_KEXEC_H #define _S390_KEXEC_H +#include + #include #include #include @@ -83,4 +85,12 @@ struct kimage_arch { extern const struct kexec_file_ops s390_kexec_image_ops; extern const struct kexec_file_ops s390_kexec_elf_ops; +#ifdef CONFIG_KEXEC_FILE +struct purgatory_info; +int arch_kexec_apply_relocations_add(struct purgatory_info *pi, + Elf_Shdr *section, + const Elf_Shdr *relsec, + const Elf_Shdr *symtab); +#define arch_kexec_apply_relocations_add arch_kexec_apply_relocations_add +#endif #endif /*_S390_KEXEC_H */ diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 11b7c06e2828..6ad8d946cd3e 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -186,6 +186,14 @@ extern int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, extern void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages); #define arch_kexec_pre_free_pages arch_kexec_pre_free_pages +#ifdef CONFIG_KEXEC_FILE +struct purgatory_info; +int arch_kexec_apply_relocations_add(struct purgatory_info *pi, + Elf_Shdr *section, + const Elf_Shdr *relsec, + const Elf_Shdr *symtab); +#define arch_kexec_apply_relocations_add arch_kexec_apply_relocations_add +#endif #endif typedef void crash_vmclear_fn(void); diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 58d1b58a971e..fcd5035209f1 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -193,14 +193,6 @@ void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name); int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, unsigned long buf_len); void 
*arch_kexec_kernel_image_load(struct kimage *image); -int arch_kexec_apply_relocations_add(struct purgatory_info *pi, - Elf_Shdr *section, - const Elf_Shdr *relsec, - const Elf_Shdr *symtab); -int arch_kexec_apply_relocations(struct purgatory_info *pi, - Elf_Shdr *section, - const Elf_Shdr *relsec, - const Elf_Shdr *symtab); int arch_kimage_file_post_load_cleanup(struct kimage *image); #ifdef CONFIG_KEXEC_SIG int arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, @@ -229,6 +221,44 @@ extern int crash_exclude_mem_range(struct crash_mem *mem, unsigned long long mend); extern int crash_prepare_elf64_headers(struct crash_mem *mem, int kernel_map, void **addr, unsigned long *sz); + +#ifndef arch_kexec_apply_relocations_add +/* + * arch_kexec_apply_relocations_add - apply relocations of type RELA + * @pi: Purgatory to be relocated. + * @section: Section relocations applying to. + * @relsec: Section containing RELAs. + * @symtab: Corresponding symtab. + * + * Return: 0 on success, negative errno on error. + */ +static inline int +arch_kexec_apply_relocations_add(struct purgatory_info *pi, Elf_Shdr *section, + const Elf_Shdr *relsec, const Elf_Shdr *symtab) +{ + pr_err("RELA relocation unsupported.\n"); + return -ENOEXEC; +} +#endif + +#ifndef arch_kexec_apply_relocations +/* + * arch_kexec_apply_relocations - apply relocations of type REL + * @pi: Purgatory to be relocated. + * @section: Section relocations applying to. + * @relsec: Section containing RELs. + * @symtab: Corresponding symtab. + * + * Return: 0 on success, negative errno on error. + */ +static inline int +arch_kexec_apply_relocations(struct purgatory_info *pi, Elf_Shdr *section, + const Elf_Shdr *relsec, const Elf_Shdr *symtab) +{ + pr_err("REL relocation unsupported.\n"); + return -ENOEXEC; +} +#endif #endif /* CONFIG_KEXEC_FILE */ #ifdef CONFIG_KEXEC_ELF diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 8347fc158d2b..c108a2a88754 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -108,40 +108,6 @@ int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, } #endif -/* - * arch_kexec_apply_relocations_add - apply relocations of type RELA - * @pi: Purgatory to be relocated. - * @section: Section relocations applying to. - * @relsec: Section containing RELAs. - * @symtab: Corresponding symtab. - * - * Return: 0 on success, negative errno on error. - */ -int __weak -arch_kexec_apply_relocations_add(struct purgatory_info *pi, Elf_Shdr *section, - const Elf_Shdr *relsec, const Elf_Shdr *symtab) -{ - pr_err("RELA relocation unsupported.\n"); - return -ENOEXEC; -} - -/* - * arch_kexec_apply_relocations - apply relocations of type REL - * @pi: Purgatory to be relocated. - * @section: Section relocations applying to. - * @relsec: Section containing RELs. - * @symtab: Corresponding symtab. - * - * Return: 0 on success, negative errno on error. - */ -int __weak -arch_kexec_apply_relocations(struct purgatory_info *pi, Elf_Shdr *section, - const Elf_Shdr *relsec, const Elf_Shdr *symtab) -{ - pr_err("REL relocation unsupported.\n"); - return -ENOEXEC; -} - /* * Free up memory used by kernel, initrd, and command line. 
This is temporary * memory allocation which is not needed any more after these buffers have
-- cgit v1.2.3-59-g8ed1b
From b39181f7c6907dc66ff937b74758671fa6ba430c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Thu, 26 May 2022 14:19:12 -0400 Subject: ftrace: Add FTRACE_MCOUNT_MAX_OFFSET to avoid adding weak function
If an unused weak function was traced, its call to fentry will still exist, and it gets added into the __mcount_loc table. Ftrace uses kallsyms to retrieve the name for each location in __mcount_loc, both to display it in available_filter_functions and to enable functions via name matching in set_ftrace_filter/notrace. Enabling these functions does nothing but enable an unused call to ftrace_caller.
If a traced weak function is overridden, the symbol of the preceding function is used for its mcount location, which will either create a duplicate name or, if the preceding function was not traced, incorrectly list it in available_filter_functions as a function that can be traced. This became an issue with BPF[1], as there is tooling that enables the direct callers via ftrace and then checks whether the functions were actually enabled. The case in question was one function that was marked notrace but was followed by an unused weak function that was traced. The unused function's call to fentry was added to the __mcount_loc section, and kallsyms returned the untraced function's symbol because the weak function was overridden. Since the untraced function would not get traced, the BPF check would detect this and fail.
The real fix would be to teach kallsyms not to report addresses inside dropped weak functions as belonging to the function before them. But that would require adding code to the build to record function sizes in kallsyms so that it knows where a function ends instead of just using the start of the next known symbol. In the meantime, this is a workaround.
Add an FTRACE_MCOUNT_MAX_OFFSET macro; if it is defined, ftrace will ignore any function whose call to fentry/mcount sits at an offset from its symbol greater than FTRACE_MCOUNT_MAX_OFFSET. If CONFIG_HAVE_FENTRY is defined for x86, define FTRACE_MCOUNT_MAX_OFFSET to zero (unless IBT is enabled), which will have ftrace ignore all locations that are not at the start of a function (or one past the ENDBR instruction).
A worker is queued at boot to scan all the ftrace record entries and mark any that fail the FTRACE_MCOUNT_MAX_OFFSET test as disabled. They will still appear in the available_filter_functions file as: __ftrace_invalid_address___<offset> (showing the offset that caused the record to be invalid). This is required for tools that use libtracefs (as trace-cmd does) and scan available_filter_functions, then fill set_ftrace_filter and set_ftrace_notrace using the indexes of the functions listed in the file (this is a speedup, as enabling thousands of functions by name is an O(n^2) operation and can take minutes to complete, whereas indexing takes less than a second). The invalid functions cannot be removed from available_filter_functions, as the names there correspond to the ftrace records in the array that manages them (and the indexing depends on this).
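To make the offset check concrete, here is a minimal user-space sketch of the idea. It is an illustration only: the symbol names, table and addresses are made up, and in the kernel the lookup goes through kallsyms, with the real check implemented as test_for_valid_rec() in the patch further down.

/*
 * Illustration of the FTRACE_MCOUNT_MAX_OFFSET idea (not kernel code).
 * A fentry location that belonged to a dropped weak function gets
 * attributed to the preceding symbol with a large offset, and is rejected.
 */
#include <stdio.h>

#define FTRACE_MCOUNT_MAX_OFFSET 0	/* x86 without IBT: fentry is at offset 0 */

struct sym { const char *name; unsigned long start; };

/* Sorted by address; the dropped weak function has no symbol of its own. */
static const struct sym syms[] = {
	{ "traced_func", 0x1000 },	/* real, still-present function      */
	{ "next_func",   0x1100 },	/* next known symbol after the gap   */
};

/* kallsyms-style lookup: find the covering symbol and the offset into it. */
static const struct sym *lookup(unsigned long ip, unsigned long *offset)
{
	const struct sym *best = NULL;
	unsigned int i;

	for (i = 0; i < sizeof(syms) / sizeof(syms[0]); i++)
		if (syms[i].start <= ip)
			best = &syms[i];
	if (best)
		*offset = ip - best->start;
	return best;
}

int main(void)
{
	/*
	 * fentry locations recorded in __mcount_loc (made-up addresses);
	 * 0x1080 belonged to an unused weak function whose symbol is gone.
	 */
	unsigned long mcount_loc[] = { 0x1000, 0x1080 };
	unsigned int i;

	for (i = 0; i < 2; i++) {
		unsigned long off = 0;
		const struct sym *s = lookup(mcount_loc[i], &off);

		if (!s || off > FTRACE_MCOUNT_MAX_OFFSET)
			printf("%#lx -> __ftrace_invalid_address___%lu (disabled)\n",
			       mcount_loc[i], off);
		else
			printf("%#lx -> %s (ok)\n", mcount_loc[i], s->name);
	}
	return 0;
}

Compiled with a stock C compiler, this prints the second location as __ftrace_invalid_address___128, mirroring the placeholder name such records keep in available_filter_functions.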
[1] https://lore.kernel.org/all/20220412094923.0abe90955e5db486b7bca279@kernel.org/ Link: https://lkml.kernel.org/r/20220526141912.794c2786@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- arch/x86/include/asm/ftrace.h | 7 +++ kernel/trace/ftrace.c | 141 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 146 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 024d9797646e..b5ef474be858 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -9,6 +9,13 @@ # define MCOUNT_ADDR ((unsigned long)(__fentry__)) #define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ +/* Ignore unused weak functions which will have non zero offsets */ +#ifdef CONFIG_HAVE_FENTRY +# include +/* Add offset for endbr64 if IBT enabled */ +# define FTRACE_MCOUNT_MAX_OFFSET ENDBR_INSN_SIZE +#endif + #ifdef CONFIG_DYNAMIC_FTRACE #define ARCH_SUPPORTS_FTRACE_OPS 1 #endif diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d653ef4febc5..c5088c76a108 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -45,6 +45,8 @@ #include "trace_output.h" #include "trace_stat.h" +#define FTRACE_INVALID_FUNCTION "__ftrace_invalid_address__" + #define FTRACE_WARN_ON(cond) \ ({ \ int ___r = cond; \ @@ -3654,6 +3656,105 @@ static void add_trampoline_func(struct seq_file *m, struct ftrace_ops *ops, seq_printf(m, " ->%pS", ptr); } +#ifdef FTRACE_MCOUNT_MAX_OFFSET +/* + * Weak functions can still have an mcount/fentry that is saved in + * the __mcount_loc section. These can be detected by having a + * symbol offset of greater than FTRACE_MCOUNT_MAX_OFFSET, as the + * symbol found by kallsyms is not the function that the mcount/fentry + * is part of. The offset is much greater in these cases. + * + * Test the record to make sure that the ip points to a valid kallsyms + * and if not, mark it disabled. + */ +static int test_for_valid_rec(struct dyn_ftrace *rec) +{ + char str[KSYM_SYMBOL_LEN]; + unsigned long offset; + const char *ret; + + ret = kallsyms_lookup(rec->ip, NULL, &offset, NULL, str); + + /* Weak functions can cause invalid addresses */ + if (!ret || offset > FTRACE_MCOUNT_MAX_OFFSET) { + rec->flags |= FTRACE_FL_DISABLED; + return 0; + } + return 1; +} + +static struct workqueue_struct *ftrace_check_wq __initdata; +static struct work_struct ftrace_check_work __initdata; + +/* + * Scan all the mcount/fentry entries to make sure they are valid. 
+ */ +static __init void ftrace_check_work_func(struct work_struct *work) +{ + struct ftrace_page *pg; + struct dyn_ftrace *rec; + + mutex_lock(&ftrace_lock); + do_for_each_ftrace_rec(pg, rec) { + test_for_valid_rec(rec); + } while_for_each_ftrace_rec(); + mutex_unlock(&ftrace_lock); +} + +static int __init ftrace_check_for_weak_functions(void) +{ + INIT_WORK(&ftrace_check_work, ftrace_check_work_func); + + ftrace_check_wq = alloc_workqueue("ftrace_check_wq", WQ_UNBOUND, 0); + + queue_work(ftrace_check_wq, &ftrace_check_work); + return 0; +} + +static int __init ftrace_check_sync(void) +{ + /* Make sure the ftrace_check updates are finished */ + if (ftrace_check_wq) + destroy_workqueue(ftrace_check_wq); + return 0; +} + +late_initcall_sync(ftrace_check_sync); +subsys_initcall(ftrace_check_for_weak_functions); + +static int print_rec(struct seq_file *m, unsigned long ip) +{ + unsigned long offset; + char str[KSYM_SYMBOL_LEN]; + char *modname; + const char *ret; + + ret = kallsyms_lookup(ip, NULL, &offset, &modname, str); + /* Weak functions can cause invalid addresses */ + if (!ret || offset > FTRACE_MCOUNT_MAX_OFFSET) { + snprintf(str, KSYM_SYMBOL_LEN, "%s_%ld", + FTRACE_INVALID_FUNCTION, offset); + ret = NULL; + } + + seq_puts(m, str); + if (modname) + seq_printf(m, " [%s]", modname); + return ret == NULL ? -1 : 0; +} +#else +static inline int test_for_valid_rec(struct dyn_ftrace *rec) +{ + return 1; +} + +static inline int print_rec(struct seq_file *m, unsigned long ip) +{ + seq_printf(m, "%ps", (void *)ip); + return 0; +} +#endif + static int t_show(struct seq_file *m, void *v) { struct ftrace_iterator *iter = m->private; @@ -3678,7 +3779,13 @@ static int t_show(struct seq_file *m, void *v) if (!rec) return 0; - seq_printf(m, "%ps", (void *)rec->ip); + if (print_rec(m, rec->ip)) { + /* This should only happen when a rec is disabled */ + WARN_ON_ONCE(!(rec->flags & FTRACE_FL_DISABLED)); + seq_putc(m, '\n'); + return 0; + } + if (iter->flags & FTRACE_ITER_ENABLED) { struct ftrace_ops *ops; @@ -3996,6 +4103,24 @@ add_rec_by_index(struct ftrace_hash *hash, struct ftrace_glob *func_g, return 0; } +#ifdef FTRACE_MCOUNT_MAX_OFFSET +static int lookup_ip(unsigned long ip, char **modname, char *str) +{ + unsigned long offset; + + kallsyms_lookup(ip, NULL, &offset, modname, str); + if (offset > FTRACE_MCOUNT_MAX_OFFSET) + return -1; + return 0; +} +#else +static int lookup_ip(unsigned long ip, char **modname, char *str) +{ + kallsyms_lookup(ip, NULL, NULL, modname, str); + return 0; +} +#endif + static int ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g, struct ftrace_glob *mod_g, int exclude_mod) @@ -4003,7 +4128,12 @@ ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g, char str[KSYM_SYMBOL_LEN]; char *modname; - kallsyms_lookup(rec->ip, NULL, NULL, &modname, str); + if (lookup_ip(rec->ip, &modname, str)) { + /* This should only happen when a rec is disabled */ + WARN_ON_ONCE(system_state == SYSTEM_RUNNING && + !(rec->flags & FTRACE_FL_DISABLED)); + return 0; + } if (mod_g) { int mod_matches = (modname) ? ftrace_match(modname, mod_g) : 0; @@ -6819,6 +6949,13 @@ void ftrace_module_enable(struct module *mod) !within_module_init(rec->ip, mod)) break; + /* Weak functions should still be ignored */ + if (!test_for_valid_rec(rec)) { + /* Clear all other flags. 
Should not be enabled anyway */ + rec->flags = FTRACE_FL_DISABLED; + continue; + } + cnt = 0; /* -- cgit v1.2.3-59-g8ed1b From 41925b105e345ebc84cedb64f59d20cb14a62613 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Mon, 30 May 2022 10:26:34 +0200 Subject: xen: replace xen_remap() with memremap() xen_remap() is used to establish mappings for frames not under direct control of the kernel: for Xenstore and console ring pages, and for grant pages of non-PV guests. Today xen_remap() is defined to use ioremap() on x86 (doing uncached mappings), and ioremap_cache() on Arm (doing cached mappings). Uncached mappings for those use cases are bad for performance, so they should be avoided if possible. As all use cases of xen_remap() don't require uncached mappings (the mapped area is always physical RAM), a mapping using the standard WB cache mode is fine. As sparse is flagging some of the xen_remap() use cases to be not appropriate for iomem(), as the result is not annotated with the __iomem modifier, eliminate xen_remap() completely and replace all use cases with memremap() specifying the MEMREMAP_WB caching mode. xen_unmap() can be replaced with memunmap(). Reported-by: kernel test robot Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Acked-by: Stefano Stabellini Link: https://lore.kernel.org/r/20220530082634.6339-1-jgross@suse.com Signed-off-by: Juergen Gross --- arch/x86/include/asm/xen/page.h | 3 --- drivers/tty/hvc/hvc_xen.c | 2 +- drivers/xen/grant-table.c | 6 +++--- drivers/xen/xenbus/xenbus_probe.c | 8 ++++---- include/xen/arm/page.h | 3 --- 5 files changed, 8 insertions(+), 14 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index e989bc2269f5..777e6b21efe9 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -347,9 +347,6 @@ unsigned long arbitrary_virt_to_mfn(void *vaddr); void make_lowmem_page_readonly(void *vaddr); void make_lowmem_page_readwrite(void *vaddr); -#define xen_remap(cookie, size) ioremap((cookie), (size)) -#define xen_unmap(cookie) iounmap((cookie)) - static inline bool xen_arch_need_swiotlb(struct device *dev, phys_addr_t phys, dma_addr_t dev_addr) diff --git a/drivers/tty/hvc/hvc_xen.c b/drivers/tty/hvc/hvc_xen.c index ebaf7500f48f..7c23112dc923 100644 --- a/drivers/tty/hvc/hvc_xen.c +++ b/drivers/tty/hvc/hvc_xen.c @@ -253,7 +253,7 @@ static int xen_hvm_console_init(void) if (r < 0 || v == 0) goto err; gfn = v; - info->intf = xen_remap(gfn << XEN_PAGE_SHIFT, XEN_PAGE_SIZE); + info->intf = memremap(gfn << XEN_PAGE_SHIFT, XEN_PAGE_SIZE, MEMREMAP_WB); if (info->intf == NULL) goto err; info->vtermno = HVC_COOKIE; diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 1b9b28bc3228..7a18292540bc 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -632,7 +632,7 @@ int gnttab_setup_auto_xlat_frames(phys_addr_t addr) if (xen_auto_xlat_grant_frames.count) return -EINVAL; - vaddr = xen_remap(addr, XEN_PAGE_SIZE * max_nr_gframes); + vaddr = memremap(addr, XEN_PAGE_SIZE * max_nr_gframes, MEMREMAP_WB); if (vaddr == NULL) { pr_warn("Failed to ioremap gnttab share frames (addr=%pa)!\n", &addr); @@ -640,7 +640,7 @@ int gnttab_setup_auto_xlat_frames(phys_addr_t addr) } pfn = kcalloc(max_nr_gframes, sizeof(pfn[0]), GFP_KERNEL); if (!pfn) { - xen_unmap(vaddr); + memunmap(vaddr); return -ENOMEM; } for (i = 0; i < max_nr_gframes; i++) @@ -659,7 +659,7 @@ void gnttab_free_auto_xlat_frames(void) if (!xen_auto_xlat_grant_frames.count) 
return; kfree(xen_auto_xlat_grant_frames.pfn); - xen_unmap(xen_auto_xlat_grant_frames.vaddr); + memunmap(xen_auto_xlat_grant_frames.vaddr); xen_auto_xlat_grant_frames.pfn = NULL; xen_auto_xlat_grant_frames.count = 0; diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index d367f2bd2b93..58b732dcbfb8 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -752,8 +752,8 @@ static void xenbus_probe(void) xenstored_ready = 1; if (!xen_store_interface) { - xen_store_interface = xen_remap(xen_store_gfn << XEN_PAGE_SHIFT, - XEN_PAGE_SIZE); + xen_store_interface = memremap(xen_store_gfn << XEN_PAGE_SHIFT, + XEN_PAGE_SIZE, MEMREMAP_WB); /* * Now it is safe to free the IRQ used for xenstore late * initialization. No need to unbind: it is about to be @@ -1009,8 +1009,8 @@ static int __init xenbus_init(void) #endif xen_store_gfn = (unsigned long)v; xen_store_interface = - xen_remap(xen_store_gfn << XEN_PAGE_SHIFT, - XEN_PAGE_SIZE); + memremap(xen_store_gfn << XEN_PAGE_SHIFT, + XEN_PAGE_SIZE, MEMREMAP_WB); if (xen_store_interface->connection != XENSTORE_CONNECTED) wait = true; } diff --git a/include/xen/arm/page.h b/include/xen/arm/page.h index ac1b65470563..f831cfeca000 100644 --- a/include/xen/arm/page.h +++ b/include/xen/arm/page.h @@ -109,9 +109,6 @@ static inline bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) return __set_phys_to_machine(pfn, mfn); } -#define xen_remap(cookie, size) ioremap_cache((cookie), (size)) -#define xen_unmap(cookie) iounmap((cookie)) - bool xen_arch_need_swiotlb(struct device *dev, phys_addr_t phys, dma_addr_t dev_addr); -- cgit v1.2.3-59-g8ed1b From 31f1a0edff78c43e8a3bd3692af0db1b25c21b17 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 20 May 2022 12:07:58 +0200 Subject: efi/x86: libstub: Make DXE calls mixed mode safe The newly added DXE calls use 64-bit quantities, which means we need to marshall them explicitly when running in mixed mode. Currently, we get away without it because we just bail when GetMemorySpaceDescriptor() fails, which is guaranteed to happen due to the function argument mixup. Let's fix this properly, though, by defining the macros that describe how to marshall the arguments. While at it, drop an incorrect cast on a status variable. Signed-off-by: Ard Biesheuvel --- arch/x86/include/asm/efi.h | 9 +++++++++ drivers/firmware/efi/libstub/x86-stub.c | 4 ++-- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index bed74a0f2932..71943dce691e 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -270,6 +270,8 @@ static inline u32 efi64_convert_status(efi_status_t status) return (u32)(status | (u64)status >> 32); } +#define __efi64_split(val) (val) & U32_MAX, (u64)(val) >> 32 + #define __efi64_argmap_free_pages(addr, size) \ ((addr), 0, (size)) @@ -317,6 +319,13 @@ static inline u32 efi64_convert_status(efi_status_t status) #define __efi64_argmap_hash_log_extend_event(prot, fl, addr, size, ev) \ ((prot), (fl), 0ULL, (u64)(addr), 0ULL, (u64)(size), 0ULL, ev) +/* DXE services */ +#define __efi64_argmap_get_memory_space_descriptor(phys, desc) \ + (__efi64_split(phys), (desc)) + +#define __efi64_argmap_set_memory_space_descriptor(phys, size, flags) \ + (__efi64_split(phys), __efi64_split(size), __efi64_split(flags)) + /* * The macros below handle the plumbing for the argument mapping. 
To add a * mapping for a specific EFI method, simply define a macro diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c index b14e88ccefca..05ae8bcc9d67 100644 --- a/drivers/firmware/efi/libstub/x86-stub.c +++ b/drivers/firmware/efi/libstub/x86-stub.c @@ -260,10 +260,10 @@ adjust_memory_range_protection(unsigned long start, unsigned long size) EFI_MEMORY_WB); if (status != EFI_SUCCESS) { - efi_warn("Unable to unprotect memory range [%08lx,%08lx]: %d\n", + efi_warn("Unable to unprotect memory range [%08lx,%08lx]: %lx\n", unprotect_start, unprotect_start + unprotect_size, - (int)status); + status); } } } -- cgit v1.2.3-59-g8ed1b