Diffstat (limited to 'arch/x86')
62 files changed, 1124 insertions, 464 deletions
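A note on the recurring CALL_NOSPEC/JMP_NOSPEC changes below: the macros now take a bare register name (r11 rather than %r11) because, with retpolines enabled, the same token must serve both as a register operand and as a symbol suffix. The following is distilled from the asm-prototypes.h hunk further down; it is an illustrative sketch, not the complete header:

#include <linux/linkage.h>

/* one register list generates every thunk declaration */
#define DECL_INDIRECT_THUNK(reg) \
	extern asmlinkage void __x86_indirect_thunk_ ## reg(void);

#define DECL_RETPOLINE(reg) \
	extern asmlinkage void __x86_retpoline_ ## reg(void);

#undef GEN
#define GEN(reg) DECL_INDIRECT_THUNK(reg)
#include <asm/GEN-for-each-reg.h>

#undef GEN
#define GEN(reg) DECL_RETPOLINE(reg)
#include <asm/GEN-for-each-reg.h>

With that naming scheme, CALL_NOSPEC r11 can expand either to "call *%r11" (plain, or lfence-prefixed on AMD) or to "call __x86_retpoline_r11", which is why the callers drop the % prefix.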
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index cad6e1bfa7d5..54e7d15dbd0d 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -2758,7 +2758,7 @@ SYM_FUNC_START(aesni_xts_crypt8) pxor INC, STATE4 movdqu IV, 0x30(OUTP) - CALL_NOSPEC %r11 + CALL_NOSPEC r11 movdqu 0x00(OUTP), INC pxor INC, STATE1 @@ -2803,7 +2803,7 @@ SYM_FUNC_START(aesni_xts_crypt8) _aesni_gf128mul_x_ble() movups IV, (IVP) - CALL_NOSPEC %r11 + CALL_NOSPEC r11 movdqu 0x40(OUTP), INC pxor INC, STATE1 diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S index d01ddd73de65..ecc0a9a905c4 100644 --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S @@ -1228,7 +1228,7 @@ SYM_FUNC_START_LOCAL(camellia_xts_crypt_16way) vpxor 14 * 16(%rax), %xmm15, %xmm14; vpxor 15 * 16(%rax), %xmm15, %xmm15; - CALL_NOSPEC %r9; + CALL_NOSPEC r9; addq $(16 * 16), %rsp; diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index 563ef6e83cdd..0907243c501c 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -1339,7 +1339,7 @@ SYM_FUNC_START_LOCAL(camellia_xts_crypt_32way) vpxor 14 * 32(%rax), %ymm15, %ymm14; vpxor 15 * 32(%rax), %ymm15, %ymm15; - CALL_NOSPEC %r9; + CALL_NOSPEC r9; addq $(16 * 32), %rsp; diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index 0e6690e3618c..8501ec4532f4 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -75,7 +75,7 @@ .text SYM_FUNC_START(crc_pcl) -#define bufp %rdi +#define bufp rdi #define bufp_dw %edi #define bufp_w %di #define bufp_b %dil @@ -105,9 +105,9 @@ SYM_FUNC_START(crc_pcl) ## 1) ALIGN: ################################################################ - mov bufp, bufptmp # rdi = *buf - neg bufp - and $7, bufp # calculate the unalignment amount of + mov %bufp, bufptmp # rdi = *buf + neg %bufp + and $7, %bufp # calculate the unalignment amount of # the address je proc_block # Skip if aligned @@ -123,13 +123,13 @@ SYM_FUNC_START(crc_pcl) do_align: #### Calculate CRC of unaligned bytes of the buffer (if any) movq (bufptmp), tmp # load a quadward from the buffer - add bufp, bufptmp # align buffer pointer for quadword + add %bufp, bufptmp # align buffer pointer for quadword # processing - sub bufp, len # update buffer length + sub %bufp, len # update buffer length align_loop: crc32b %bl, crc_init_dw # compute crc32 of 1-byte shr $8, tmp # get next byte - dec bufp + dec %bufp jne align_loop proc_block: @@ -169,10 +169,10 @@ continue_block: xor crc2, crc2 ## branch into array - lea jump_table(%rip), bufp - movzxw (bufp, %rax, 2), len - lea crc_array(%rip), bufp - lea (bufp, len, 1), bufp + lea jump_table(%rip), %bufp + movzxw (%bufp, %rax, 2), len + lea crc_array(%rip), %bufp + lea (%bufp, len, 1), %bufp JMP_NOSPEC bufp ################################################################ @@ -218,9 +218,9 @@ LABEL crc_ %i ## 4) Combine three results: ################################################################ - lea (K_table-8)(%rip), bufp # first entry is for idx 1 + lea (K_table-8)(%rip), %bufp # first entry is for idx 1 shlq $3, %rax # rax *= 8 - pmovzxdq (bufp,%rax), %xmm0 # 2 consts: K1:K2 + pmovzxdq (%bufp,%rax), %xmm0 # 2 consts: K1:K2 leal (%eax,%eax,2), %eax # rax *= 3 (total *24) subq %rax, tmp # tmp -= rax*24 diff --git 
a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index a801ffc10cbb..18200135603f 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -21,7 +21,6 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/mm.h> -#include <linux/cryptohash.h> #include <linux/types.h> #include <crypto/sha.h> #include <crypto/sha1_base.h> diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index 6394b5fe8db6..dd06249229e1 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -34,7 +34,6 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/mm.h> -#include <linux/cryptohash.h> #include <linux/types.h> #include <crypto/sha.h> #include <crypto/sha256_base.h> diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index 82cc1b3ced1d..b0b05c93409e 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -32,7 +32,6 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/mm.h> -#include <linux/cryptohash.h> #include <linux/string.h> #include <linux/types.h> #include <crypto/sha.h> diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index b67bae7091d7..7e7ffb7a5147 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -816,7 +816,7 @@ SYM_CODE_START(ret_from_fork) /* kernel thread */ 1: movl %edi, %eax - CALL_NOSPEC %ebx + CALL_NOSPEC ebx /* * A kernel thread is allowed to return here after successfully * calling do_execve(). Exit to userspace to complete the execve() @@ -1501,7 +1501,7 @@ SYM_CODE_START_LOCAL_NOALIGN(common_exception_read_cr2) TRACE_IRQS_OFF movl %esp, %eax # pt_regs pointer - CALL_NOSPEC %edi + CALL_NOSPEC edi jmp ret_from_exception SYM_CODE_END(common_exception_read_cr2) @@ -1522,7 +1522,7 @@ SYM_CODE_START_LOCAL_NOALIGN(common_exception) TRACE_IRQS_OFF movl %esp, %eax # pt_regs pointer - CALL_NOSPEC %edi + CALL_NOSPEC edi jmp ret_from_exception SYM_CODE_END(common_exception) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 3063aa9090f9..64fe3d82157e 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -348,7 +348,7 @@ SYM_CODE_START(ret_from_fork) /* kernel thread */ UNWIND_HINT_EMPTY movq %r12, %rdi - CALL_NOSPEC %rbx + CALL_NOSPEC rbx /* * A kernel thread is allowed to return here after successfully * calling do_execve(). Exit to userspace to complete the execve() diff --git a/arch/x86/events/Kconfig b/arch/x86/events/Kconfig index 9a7a1446cb3a..4a809c6cbd2f 100644 --- a/arch/x86/events/Kconfig +++ b/arch/x86/events/Kconfig @@ -10,11 +10,11 @@ config PERF_EVENTS_INTEL_UNCORE available on NehalemEX and more modern processors. config PERF_EVENTS_INTEL_RAPL - tristate "Intel rapl performance events" - depends on PERF_EVENTS && CPU_SUP_INTEL && PCI + tristate "Intel/AMD rapl performance events" + depends on PERF_EVENTS && (CPU_SUP_INTEL || CPU_SUP_AMD) && PCI default y ---help--- - Include support for Intel rapl performance events for power + Include support for Intel and AMD rapl performance events for power monitoring on modern processors. 
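For context on what these RAPL events report: an energy-status MSR is a free-running accumulator whose count weight is 1/2^ESU joules, with ESU taken from bits 12:8 of the power-unit MSR (MSR_RAPL_POWER_UNIT, or the MSR_AMD_RAPL_POWER_UNIT added later in this series). A minimal sketch of the scaling, assuming the usual 32-bit counter width; the helper name is invented for illustration and is not the driver's code:

#include <linux/types.h>
#include <asm/msr.h>

static u64 rapl_delta_to_microjoules(unsigned int unit_msr, u64 prev, u64 cur)
{
	u64 units, esu;

	if (rdmsrl_safe(unit_msr, &units))	/* may fault under virtualization */
		return 0;

	/* energy status unit, read the same way as in rapl_check_hw_unit() */
	esu = (units >> 8) & 0x1f;

	/* one count = 1/2^esu J; the mask handles 32-bit counter wrap */
	return (((cur - prev) & 0xffffffffULL) * 1000000ULL) >> esu;
}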
config PERF_EVENTS_INTEL_CSTATE diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 9e07f554333f..12c42eba77ec 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -1,5 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y += core.o probe.o +obj-$(PERF_EVENTS_INTEL_RAPL) += rapl.o obj-y += amd/ obj-$(CONFIG_X86_LOCAL_APIC) += msr.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/ +obj-$(CONFIG_CPU_SUP_CENTAUR) += zhaoxin/ +obj-$(CONFIG_CPU_SUP_ZHAOXIN) += zhaoxin/ diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index a619763e96e1..9e63ee50b19a 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1839,6 +1839,10 @@ static int __init init_hw_perf_events(void) err = amd_pmu_init(); x86_pmu.name = "HYGON"; break; + case X86_VENDOR_ZHAOXIN: + case X86_VENDOR_CENTAUR: + err = zhaoxin_pmu_init(); + break; default: err = -ENOTSUPP; } diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile index 3468b0c1dc7c..e67a5886336c 100644 --- a/arch/x86/events/intel/Makefile +++ b/arch/x86/events/intel/Makefile @@ -2,8 +2,6 @@ obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o obj-$(CONFIG_CPU_SUP_INTEL) += ds.o knc.o obj-$(CONFIG_CPU_SUP_INTEL) += lbr.o p4.o p6.o pt.o -obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += intel-rapl-perf.o -intel-rapl-perf-objs := rapl.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel-uncore.o intel-uncore-objs := uncore.o uncore_nhmex.o uncore_snb.o uncore_snbep.o obj-$(CONFIG_PERF_EVENTS_INTEL_CSTATE) += intel-cstate.o diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 6a3b599ee0fe..731dd8d0dbb1 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -58,7 +58,7 @@ struct bts_buffer { local_t head; unsigned long end; void **data_pages; - struct bts_phys buf[0]; + struct bts_phys buf[]; }; static struct pmu bts_pmu; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 332954cccece..ca35c8b5ee10 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -1892,8 +1892,8 @@ static __initconst const u64 tnt_hw_cache_extra_regs static struct extra_reg intel_tnt_extra_regs[] __read_mostly = { /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ - INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffffff9fffull, RSP_0), - INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0xffffff9fffull, RSP_1), + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x800ff0ffffff9fffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0xff0ffffff9fffull, RSP_1), EVENT_EXTRA_END }; diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 1db7a51d9792..e94af4a54d0d 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -226,8 +226,6 @@ static int __init pt_pmu_hw_init(void) pt_pmu.vmx = true; } - attrs = NULL; - for (i = 0; i < PT_CPUID_LEAVES; i++) { cpuid_count(20, i, &pt_pmu.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM], diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 0da4a4605536..b469ddd45515 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -130,7 +130,7 @@ struct intel_uncore_box { struct list_head list; struct list_head active_list; void __iomem *io_addr; - struct intel_uncore_extra_reg shared_regs[0]; + struct intel_uncore_extra_reg shared_regs[]; }; /* CFL uncore 8th cbox MSRs */ diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index f1cd1ca1a77b..e17a3d8a47ed 100644 --- a/arch/x86/events/perf_event.h 
+++ b/arch/x86/events/perf_event.h @@ -618,6 +618,7 @@ struct x86_pmu { /* PMI handler bits */ unsigned int late_ack :1, + enabled_ack :1, counter_freezing :1; /* * sysfs attrs @@ -1133,3 +1134,12 @@ static inline int is_ht_workaround_enabled(void) return 0; } #endif /* CONFIG_CPU_SUP_INTEL */ + +#if ((defined CONFIG_CPU_SUP_CENTAUR) || (defined CONFIG_CPU_SUP_ZHAOXIN)) +int zhaoxin_pmu_init(void); +#else +static inline int zhaoxin_pmu_init(void) +{ + return 0; +} +#endif /*CONFIG_CPU_SUP_CENTAUR or CONFIG_CPU_SUP_ZHAOXIN*/ diff --git a/arch/x86/events/probe.c b/arch/x86/events/probe.c index c2ede2f3b277..136a1e847254 100644 --- a/arch/x86/events/probe.c +++ b/arch/x86/events/probe.c @@ -10,6 +10,11 @@ not_visible(struct kobject *kobj, struct attribute *attr, int i) return 0; } +/* + * Accepts msr[] array with non populated entries as long as either + * msr[i].msr is 0 or msr[i].grp is NULL. Note that the default sysfs + * visibility is visible when group->is_visible callback is set. + */ unsigned long perf_msr_probe(struct perf_msr *msr, int cnt, bool zero, void *data) { @@ -24,8 +29,16 @@ perf_msr_probe(struct perf_msr *msr, int cnt, bool zero, void *data) if (!msr[bit].no_check) { struct attribute_group *grp = msr[bit].grp; + /* skip entry with no group */ + if (!grp) + continue; + grp->is_visible = not_visible; + /* skip unpopulated entry */ + if (!msr[bit].msr) + continue; + if (msr[bit].test && !msr[bit].test(bit, data)) continue; /* Virt sucks; you cannot tell if a R/O MSR is present :/ */ diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/rapl.c index a5dbd25852cb..0f2bf59f4354 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/rapl.c @@ -1,11 +1,14 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Support Intel RAPL energy consumption counters + * Support Intel/AMD RAPL energy consumption counters * Copyright (C) 2013 Google, Inc., Stephane Eranian * * Intel RAPL interface is specified in the IA-32 Manual Vol3b * section 14.7.1 (September 2013) * + * AMD RAPL interface for Fam17h is described in the public PPR: + * https://bugzilla.kernel.org/show_bug.cgi?id=206537 + * * RAPL provides more controls than just reporting energy consumption * however here we only expose the 3 energy consumption free running * counters (pp0, pkg, dram). 
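The probe.c comment above is the contract that lets an MSR table stay sparse. A usage sketch with a hypothetical caller (the perf_msr_probe() signature and table layout are as quoted in this patch): size the array to the full event range, populate only what the part implements, and use the returned bitmask as the counter mask, exactly as the AMD table added below does.

static struct perf_msr sketch_rapl_msrs[PERF_RAPL_MAX] = {
	/* only the package domain exists on this hypothetical part */
	[PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr },
};

static unsigned long sketch_cntr_mask(unsigned long wanted_events)
{
	/* unpopulated slots are skipped: their .grp is NULL and .msr is 0 */
	return perf_msr_probe(sketch_rapl_msrs, PERF_RAPL_MAX, false,
			      (void *)&wanted_events);
}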
@@ -58,8 +61,8 @@ #include <linux/nospec.h> #include <asm/cpu_device_id.h> #include <asm/intel-family.h> -#include "../perf_event.h" -#include "../probe.h" +#include "perf_event.h" +#include "probe.h" MODULE_LICENSE("GPL"); @@ -128,7 +131,9 @@ struct rapl_pmus { }; struct rapl_model { + struct perf_msr *rapl_msrs; unsigned long events; + unsigned int msr_power_unit; bool apply_quirk; }; @@ -138,7 +143,7 @@ static struct rapl_pmus *rapl_pmus; static cpumask_t rapl_cpu_mask; static unsigned int rapl_cntr_mask; static u64 rapl_timer_ms; -static struct perf_msr rapl_msrs[]; +static struct perf_msr *rapl_msrs; static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) { @@ -455,9 +460,16 @@ static struct attribute *rapl_events_cores[] = { NULL, }; +static umode_t +rapl_not_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return 0; +} + static struct attribute_group rapl_events_cores_group = { .name = "events", .attrs = rapl_events_cores, + .is_visible = rapl_not_visible, }; static struct attribute *rapl_events_pkg[] = { @@ -470,6 +482,7 @@ static struct attribute *rapl_events_pkg[] = { static struct attribute_group rapl_events_pkg_group = { .name = "events", .attrs = rapl_events_pkg, + .is_visible = rapl_not_visible, }; static struct attribute *rapl_events_ram[] = { @@ -482,6 +495,7 @@ static struct attribute *rapl_events_ram[] = { static struct attribute_group rapl_events_ram_group = { .name = "events", .attrs = rapl_events_ram, + .is_visible = rapl_not_visible, }; static struct attribute *rapl_events_gpu[] = { @@ -494,6 +508,7 @@ static struct attribute *rapl_events_gpu[] = { static struct attribute_group rapl_events_gpu_group = { .name = "events", .attrs = rapl_events_gpu, + .is_visible = rapl_not_visible, }; static struct attribute *rapl_events_psys[] = { @@ -506,6 +521,7 @@ static struct attribute *rapl_events_psys[] = { static struct attribute_group rapl_events_psys_group = { .name = "events", .attrs = rapl_events_psys, + .is_visible = rapl_not_visible, }; static bool test_msr(int idx, void *data) @@ -513,7 +529,7 @@ static bool test_msr(int idx, void *data) return test_bit(idx, (unsigned long *) data); } -static struct perf_msr rapl_msrs[] = { +static struct perf_msr intel_rapl_msrs[] = { [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr }, [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr }, [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr }, @@ -521,6 +537,16 @@ static struct perf_msr rapl_msrs[] = { [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr }, }; +/* + * Force to PERF_RAPL_MAX size due to: + * - perf_msr_probe(PERF_RAPL_MAX) + * - want to use same event codes across both architectures + */ +static struct perf_msr amd_rapl_msrs[PERF_RAPL_MAX] = { + [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr }, +}; + + static int rapl_cpu_offline(unsigned int cpu) { struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); @@ -575,13 +601,13 @@ static int rapl_cpu_online(unsigned int cpu) return 0; } -static int rapl_check_hw_unit(bool apply_quirk) +static int rapl_check_hw_unit(struct rapl_model *rm) { u64 msr_rapl_power_unit_bits; int i; /* protect rdmsrl() to handle virtualization */ - if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits)) + if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits)) return -1; for (i = 0; i < NR_RAPL_DOMAINS; i++) rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; @@ -592,7 
+618,7 @@ static int rapl_check_hw_unit(bool apply_quirk) * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2 * of 2. Datasheet, September 2014, Reference Number: 330784-001 " */ - if (apply_quirk) + if (rm->apply_quirk) rapl_hw_unit[PERF_RAPL_RAM] = 16; /* @@ -673,6 +699,8 @@ static struct rapl_model model_snb = { BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_PP1), .apply_quirk = false, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static struct rapl_model model_snbep = { @@ -680,6 +708,8 @@ static struct rapl_model model_snbep = { BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM), .apply_quirk = false, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static struct rapl_model model_hsw = { @@ -688,6 +718,8 @@ static struct rapl_model model_hsw = { BIT(PERF_RAPL_RAM) | BIT(PERF_RAPL_PP1), .apply_quirk = false, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static struct rapl_model model_hsx = { @@ -695,12 +727,16 @@ static struct rapl_model model_hsx = { BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM), .apply_quirk = true, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static struct rapl_model model_knl = { .events = BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM), .apply_quirk = true, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static struct rapl_model model_skl = { @@ -710,6 +746,15 @@ static struct rapl_model model_skl = { BIT(PERF_RAPL_PP1) | BIT(PERF_RAPL_PSYS), .apply_quirk = false, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, +}; + +static struct rapl_model model_amd_fam17h = { + .events = BIT(PERF_RAPL_PKG), + .apply_quirk = false, + .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT, + .rapl_msrs = amd_rapl_msrs, }; static const struct x86_cpu_id rapl_model_match[] __initconst = { @@ -738,8 +783,11 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS, &model_hsw), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &model_hsx), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &model_hsx), X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &model_skl), + X86_MATCH_VENDOR_FAM(AMD, 0x17, &model_amd_fam17h), {}, }; MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); @@ -755,10 +803,13 @@ static int __init rapl_pmu_init(void) return -ENODEV; rm = (struct rapl_model *) id->driver_data; + + rapl_msrs = rm->rapl_msrs; + rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX, false, (void *) &rm->events); - ret = rapl_check_hw_unit(rm->apply_quirk); + ret = rapl_check_hw_unit(rm); if (ret) return ret; diff --git a/arch/x86/events/zhaoxin/Makefile b/arch/x86/events/zhaoxin/Makefile new file mode 100644 index 000000000000..642c1174d662 --- /dev/null +++ b/arch/x86/events/zhaoxin/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-y += core.o diff --git a/arch/x86/events/zhaoxin/core.c b/arch/x86/events/zhaoxin/core.c new file mode 100644 index 000000000000..898fa1ae9ceb --- /dev/null +++ b/arch/x86/events/zhaoxin/core.c @@ -0,0 +1,613 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Zhoaxin PMU; like Intel Architectural PerfMon-v2 + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/stddef.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/export.h> +#include <linux/nmi.h> + +#include 
<asm/cpufeature.h> +#include <asm/hardirq.h> +#include <asm/apic.h> + +#include "../perf_event.h" + +/* + * Zhaoxin PerfMon, used on zxc and later. + */ +static u64 zx_pmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = { + + [PERF_COUNT_HW_CPU_CYCLES] = 0x0082, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0515, + [PERF_COUNT_HW_CACHE_MISSES] = 0x051a, + [PERF_COUNT_HW_BUS_CYCLES] = 0x0083, +}; + +static struct event_constraint zxc_event_constraints[] __read_mostly = { + + FIXED_EVENT_CONSTRAINT(0x0082, 1), /* unhalted core clock cycles */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint zxd_event_constraints[] __read_mostly = { + + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* retired instructions */ + FIXED_EVENT_CONSTRAINT(0x0082, 1), /* unhalted core clock cycles */ + FIXED_EVENT_CONSTRAINT(0x0083, 2), /* unhalted bus clock cycles */ + EVENT_CONSTRAINT_END +}; + +static __initconst const u64 zxd_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { +[C(L1D)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0042, + [C(RESULT_MISS)] = 0x0538, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x0043, + [C(RESULT_MISS)] = 0x0562, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(L1I)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0300, + [C(RESULT_MISS)] = 0x0301, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x030a, + [C(RESULT_MISS)] = 0x030b, + }, +}, +[C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(DTLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0042, + [C(RESULT_MISS)] = 0x052c, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x0043, + [C(RESULT_MISS)] = 0x0530, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0564, + [C(RESULT_MISS)] = 0x0565, + }, +}, +[C(ITLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x00c0, + [C(RESULT_MISS)] = 0x0534, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(BPU)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0700, + [C(RESULT_MISS)] = 0x0709, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(NODE)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +}; + +static __initconst const u64 zxe_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { +[C(L1D)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0568, + [C(RESULT_MISS)] = 0x054b, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x0669, + [C(RESULT_MISS)] = 0x0562, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(L1I)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0300, + [C(RESULT_MISS)] = 0x0301, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x030a, + 
[C(RESULT_MISS)] = 0x030b, + }, +}, +[C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0, + [C(RESULT_MISS)] = 0x0, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x0, + [C(RESULT_MISS)] = 0x0, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0, + [C(RESULT_MISS)] = 0x0, + }, +}, +[C(DTLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0568, + [C(RESULT_MISS)] = 0x052c, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x0669, + [C(RESULT_MISS)] = 0x0530, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0564, + [C(RESULT_MISS)] = 0x0565, + }, +}, +[C(ITLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x00c0, + [C(RESULT_MISS)] = 0x0534, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(BPU)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0028, + [C(RESULT_MISS)] = 0x0029, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(NODE)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +}; + +static void zhaoxin_pmu_disable_all(void) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); +} + +static void zhaoxin_pmu_enable_all(int added) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); +} + +static inline u64 zhaoxin_pmu_get_status(void) +{ + u64 status; + + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + + return status; +} + +static inline void zhaoxin_pmu_ack_status(u64 ack) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); +} + +static inline void zxc_pmu_ack_status(u64 ack) +{ + /* + * ZXC needs global control enabled in order to clear status bits. 
+ */ + zhaoxin_pmu_enable_all(0); + zhaoxin_pmu_ack_status(ack); + zhaoxin_pmu_disable_all(); +} + +static void zhaoxin_pmu_disable_fixed(struct hw_perf_event *hwc) +{ + int idx = hwc->idx - INTEL_PMC_IDX_FIXED; + u64 ctrl_val, mask; + + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + wrmsrl(hwc->config_base, ctrl_val); +} + +static void zhaoxin_pmu_disable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + zhaoxin_pmu_disable_fixed(hwc); + return; + } + + x86_pmu_disable_event(event); +} + +static void zhaoxin_pmu_enable_fixed(struct hw_perf_event *hwc) +{ + int idx = hwc->idx - INTEL_PMC_IDX_FIXED; + u64 ctrl_val, bits, mask; + + /* + * Enable IRQ generation (0x8), + * and enable ring-3 counting (0x2) and ring-0 counting (0x1) + * if requested: + */ + bits = 0x8ULL; + if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) + bits |= 0x2; + if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) + bits |= 0x1; + + bits <<= (idx * 4); + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + ctrl_val |= bits; + wrmsrl(hwc->config_base, ctrl_val); +} + +static void zhaoxin_pmu_enable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + zhaoxin_pmu_enable_fixed(hwc); + return; + } + + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); +} + +/* + * This handler is triggered by the local APIC, so the APIC IRQ handling + * rules apply: + */ +static int zhaoxin_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + int handled = 0; + u64 status; + int bit; + + cpuc = this_cpu_ptr(&cpu_hw_events); + apic_write(APIC_LVTPC, APIC_DM_NMI); + zhaoxin_pmu_disable_all(); + status = zhaoxin_pmu_get_status(); + if (!status) + goto done; + +again: + if (x86_pmu.enabled_ack) + zxc_pmu_ack_status(status); + else + zhaoxin_pmu_ack_status(status); + + inc_irq_stat(apic_perf_irqs); + + /* + * CondChgd bit 63 doesn't mean any overflow status. Ignore + * and clear the bit. 
+ */ + if (__test_and_clear_bit(63, (unsigned long *)&status)) { + if (!status) + goto done; + } + + for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { + struct perf_event *event = cpuc->events[bit]; + + handled++; + + if (!test_bit(bit, cpuc->active_mask)) + continue; + + x86_perf_event_update(event); + perf_sample_data_init(&data, 0, event->hw.last_period); + + if (!x86_perf_event_set_period(event)) + continue; + + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); + } + + /* + * Repeat if there is more work to be done: + */ + status = zhaoxin_pmu_get_status(); + if (status) + goto again; + +done: + zhaoxin_pmu_enable_all(0); + return handled; +} + +static u64 zhaoxin_pmu_event_map(int hw_event) +{ + return zx_pmon_event_map[hw_event]; +} + +static struct event_constraint * +zhaoxin_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct event_constraint *c; + + if (x86_pmu.event_constraints) { + for_each_event_constraint(c, x86_pmu.event_constraints) { + if ((event->hw.config & c->cmask) == c->code) + return c; + } + } + + return &unconstrained; +} + +PMU_FORMAT_ATTR(event, "config:0-7"); +PMU_FORMAT_ATTR(umask, "config:8-15"); +PMU_FORMAT_ATTR(edge, "config:18"); +PMU_FORMAT_ATTR(inv, "config:23"); +PMU_FORMAT_ATTR(cmask, "config:24-31"); + +static struct attribute *zx_arch_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + NULL, +}; + +static ssize_t zhaoxin_event_sysfs_show(char *page, u64 config) +{ + u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT); + + return x86_event_sysfs_show(page, config, event); +} + +static const struct x86_pmu zhaoxin_pmu __initconst = { + .name = "zhaoxin", + .handle_irq = zhaoxin_pmu_handle_irq, + .disable_all = zhaoxin_pmu_disable_all, + .enable_all = zhaoxin_pmu_enable_all, + .enable = zhaoxin_pmu_enable_event, + .disable = zhaoxin_pmu_disable_event, + .hw_config = x86_pmu_hw_config, + .schedule_events = x86_schedule_events, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = zhaoxin_pmu_event_map, + .max_events = ARRAY_SIZE(zx_pmon_event_map), + .apic = 1, + /* + * For zxd/zxe, read/write operation for PMCx MSR is 48 bits. 
+ */ + .max_period = (1ULL << 47) - 1, + .get_event_constraints = zhaoxin_get_event_constraints, + + .format_attrs = zx_arch_formats_attr, + .events_sysfs_show = zhaoxin_event_sysfs_show, +}; + +static const struct { int id; char *name; } zx_arch_events_map[] __initconst = { + { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" }, + { PERF_COUNT_HW_INSTRUCTIONS, "instructions" }, + { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" }, + { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" }, + { PERF_COUNT_HW_CACHE_MISSES, "cache misses" }, + { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" }, + { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" }, +}; + +static __init void zhaoxin_arch_events_quirk(void) +{ + int bit; + + /* disable events reported as not present by cpuid */ + for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(zx_arch_events_map)) { + zx_pmon_event_map[zx_arch_events_map[bit].id] = 0; + pr_warn("CPUID marked event: \'%s\' unavailable\n", + zx_arch_events_map[bit].name); + } +} + +__init int zhaoxin_pmu_init(void) +{ + union cpuid10_edx edx; + union cpuid10_eax eax; + union cpuid10_ebx ebx; + struct event_constraint *c; + unsigned int unused; + int version; + + pr_info("Welcome to zhaoxin pmu!\n"); + + /* + * Check whether the Architectural PerfMon supports + * hw_event or not. + */ + cpuid(10, &eax.full, &ebx.full, &unused, &edx.full); + + if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT - 1) + return -ENODEV; + + version = eax.split.version_id; + if (version != 2) + return -ENODEV; + + x86_pmu = zhaoxin_pmu; + pr_info("Version check pass!\n"); + + x86_pmu.version = version; + x86_pmu.num_counters = eax.split.num_counters; + x86_pmu.cntval_bits = eax.split.bit_width; + x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1; + x86_pmu.events_maskl = ebx.full; + x86_pmu.events_mask_len = eax.split.mask_length; + + x86_pmu.num_counters_fixed = edx.split.num_counters_fixed; + x86_add_quirk(zhaoxin_arch_events_quirk); + + switch (boot_cpu_data.x86) { + case 0x06: + if (boot_cpu_data.x86_model == 0x0f || boot_cpu_data.x86_model == 0x19) { + + x86_pmu.max_period = x86_pmu.cntval_mask >> 1; + + /* Clearing status works only if the global control is enabled on zxc.
*/ + x86_pmu.enabled_ack = 1; + + x86_pmu.event_constraints = zxc_event_constraints; + zx_pmon_event_map[PERF_COUNT_HW_INSTRUCTIONS] = 0; + zx_pmon_event_map[PERF_COUNT_HW_CACHE_REFERENCES] = 0; + zx_pmon_event_map[PERF_COUNT_HW_CACHE_MISSES] = 0; + zx_pmon_event_map[PERF_COUNT_HW_BUS_CYCLES] = 0; + + pr_cont("ZXC events, "); + break; + } + return -ENODEV; + + case 0x07: + zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = + X86_CONFIG(.event = 0x01, .umask = 0x01, .inv = 0x01, .cmask = 0x01); + + zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = + X86_CONFIG(.event = 0x0f, .umask = 0x04, .inv = 0, .cmask = 0); + + switch (boot_cpu_data.x86_model) { + case 0x1b: + memcpy(hw_cache_event_ids, zxd_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + x86_pmu.event_constraints = zxd_event_constraints; + + zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0700; + zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x0709; + + pr_cont("ZXD events, "); + break; + case 0x3b: + memcpy(hw_cache_event_ids, zxe_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + x86_pmu.event_constraints = zxd_event_constraints; + + zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0028; + zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x0029; + + pr_cont("ZXE events, "); + break; + default: + return -ENODEV; + } + break; + + default: + return -ENODEV; + } + + x86_pmu.intel_ctrl = (1 << (x86_pmu.num_counters)) - 1; + x86_pmu.intel_ctrl |= ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED; + + if (x86_pmu.event_constraints) { + for_each_event_constraint(c, x86_pmu.event_constraints) { + c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; + c->weight += x86_pmu.num_counters; + } + } + + return 0; +} + diff --git a/arch/x86/include/asm/GEN-for-each-reg.h b/arch/x86/include/asm/GEN-for-each-reg.h new file mode 100644 index 000000000000..1b07fb102c4e --- /dev/null +++ b/arch/x86/include/asm/GEN-for-each-reg.h @@ -0,0 +1,25 @@ +#ifdef CONFIG_64BIT +GEN(rax) +GEN(rbx) +GEN(rcx) +GEN(rdx) +GEN(rsi) +GEN(rdi) +GEN(rbp) +GEN(r8) +GEN(r9) +GEN(r10) +GEN(r11) +GEN(r12) +GEN(r13) +GEN(r14) +GEN(r15) +#else +GEN(eax) +GEN(ebx) +GEN(ecx) +GEN(edx) +GEN(esi) +GEN(edi) +GEN(ebp) +#endif diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index ce92c4acc913..9bf2620ce817 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -17,24 +17,19 @@ extern void cmpxchg8b_emu(void); #endif #ifdef CONFIG_RETPOLINE -#ifdef CONFIG_X86_32 -#define INDIRECT_THUNK(reg) extern asmlinkage void __x86_indirect_thunk_e ## reg(void); -#else -#define INDIRECT_THUNK(reg) extern asmlinkage void __x86_indirect_thunk_r ## reg(void); -INDIRECT_THUNK(8) -INDIRECT_THUNK(9) -INDIRECT_THUNK(10) -INDIRECT_THUNK(11) -INDIRECT_THUNK(12) -INDIRECT_THUNK(13) -INDIRECT_THUNK(14) -INDIRECT_THUNK(15) -#endif -INDIRECT_THUNK(ax) -INDIRECT_THUNK(bx) -INDIRECT_THUNK(cx) -INDIRECT_THUNK(dx) -INDIRECT_THUNK(si) -INDIRECT_THUNK(di) -INDIRECT_THUNK(bp) + +#define DECL_INDIRECT_THUNK(reg) \ + extern asmlinkage void __x86_indirect_thunk_ ## reg (void); + +#define DECL_RETPOLINE(reg) \ + extern asmlinkage void __x86_retpoline_ ## reg (void); + +#undef GEN +#define GEN(reg) DECL_INDIRECT_THUNK(reg) +#include <asm/GEN-for-each-reg.h> + +#undef GEN +#define GEN(reg) DECL_RETPOLINE(reg) +#include <asm/GEN-for-each-reg.h> + #endif /* CONFIG_RETPOLINE */ diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h index 7e31f7f1bb06..49bd6cf3eec9 
100644 --- a/arch/x86/include/asm/device.h +++ b/arch/x86/include/asm/device.h @@ -3,7 +3,7 @@ #define _ASM_X86_DEVICE_H struct dev_archdata { -#if defined(CONFIG_INTEL_IOMMU) || defined(CONFIG_AMD_IOMMU) +#ifdef CONFIG_IOMMU_API void *iommu; /* hook for IOMMU specific extension */ #endif }; diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h index 00f7cf45e699..8e95aa4b0d17 100644 --- a/arch/x86/include/asm/dma.h +++ b/arch/x86/include/asm/dma.h @@ -74,7 +74,7 @@ #define MAX_DMA_PFN ((16UL * 1024 * 1024) >> PAGE_SHIFT) /* 4GB broken PCI/AGP hardware bus master zone */ -#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT) +#define MAX_DMA32_PFN (1UL << (32 - PAGE_SHIFT)) #ifdef CONFIG_X86_32 /* The maximum address that we can perform a DMA transfer to on this platform */ diff --git a/arch/x86/include/asm/io_bitmap.h b/arch/x86/include/asm/io_bitmap.h index 07344d82e88e..ac1a99ffbd8d 100644 --- a/arch/x86/include/asm/io_bitmap.h +++ b/arch/x86/include/asm/io_bitmap.h @@ -17,7 +17,7 @@ struct task_struct; #ifdef CONFIG_X86_IOPL_IOPERM void io_bitmap_share(struct task_struct *tsk); -void io_bitmap_exit(void); +void io_bitmap_exit(struct task_struct *tsk); void native_tss_update_io_bitmap(void); @@ -29,7 +29,7 @@ void native_tss_update_io_bitmap(void); #else static inline void io_bitmap_share(struct task_struct *tsk) { } -static inline void io_bitmap_exit(void) { } +static inline void io_bitmap_exit(struct task_struct *tsk) { } static inline void tss_update_io_bitmap(void) { } #endif diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 12c9684d59ba..ef452b817f44 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -301,6 +301,9 @@ #define MSR_PP1_ENERGY_STATUS 0x00000641 #define MSR_PP1_POLICY 0x00000642 +#define MSR_AMD_PKG_ENERGY_STATUS 0xc001029b +#define MSR_AMD_RAPL_POWER_UNIT 0xc0010299 + /* Config TDP MSRs */ #define MSR_CONFIG_TDP_NOMINAL 0x00000648 #define MSR_CONFIG_TDP_LEVEL_1 0x00000649 diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 7e9a281e2660..d52d1aacdd97 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -4,20 +4,13 @@ #define _ASM_X86_NOSPEC_BRANCH_H_ #include <linux/static_key.h> +#include <linux/frame.h> #include <asm/alternative.h> #include <asm/alternative-asm.h> #include <asm/cpufeatures.h> #include <asm/msr-index.h> - -/* - * This should be used immediately before a retpoline alternative. It tells - * objtool where the retpolines are so that it can make sense of the control - * flow by just reading the original instruction(s) and ignoring the - * alternatives. - */ -#define ANNOTATE_NOSPEC_ALTERNATIVE \ - ANNOTATE_IGNORE_ALTERNATIVE +#include <asm/unwind_hints.h> /* * Fill the CPU return stack buffer. @@ -46,21 +39,25 @@ #define __FILL_RETURN_BUFFER(reg, nr, sp) \ mov $(nr/2), reg; \ 771: \ + ANNOTATE_INTRA_FUNCTION_CALL; \ call 772f; \ 773: /* speculation trap */ \ + UNWIND_HINT_EMPTY; \ pause; \ lfence; \ jmp 773b; \ 772: \ + ANNOTATE_INTRA_FUNCTION_CALL; \ call 774f; \ 775: /* speculation trap */ \ + UNWIND_HINT_EMPTY; \ pause; \ lfence; \ jmp 775b; \ 774: \ + add $(BITS_PER_LONG/8) * 2, sp; \ dec reg; \ - jnz 771b; \ - add $(BITS_PER_LONG/8) * nr, sp; + jnz 771b; #ifdef __ASSEMBLY__ @@ -77,57 +74,27 @@ .endm /* - * These are the bare retpoline primitives for indirect jmp and call. 
- * Do not use these directly; they only exist to make the ALTERNATIVE - * invocation below less ugly. - */ -.macro RETPOLINE_JMP reg:req - call .Ldo_rop_\@ -.Lspec_trap_\@: - pause - lfence - jmp .Lspec_trap_\@ -.Ldo_rop_\@: - mov \reg, (%_ASM_SP) - ret -.endm - -/* - * This is a wrapper around RETPOLINE_JMP so the called function in reg - * returns to the instruction after the macro. - */ -.macro RETPOLINE_CALL reg:req - jmp .Ldo_call_\@ -.Ldo_retpoline_jmp_\@: - RETPOLINE_JMP \reg -.Ldo_call_\@: - call .Ldo_retpoline_jmp_\@ -.endm - -/* * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple * indirect jmp/call which may be susceptible to the Spectre variant 2 * attack. */ .macro JMP_NOSPEC reg:req #ifdef CONFIG_RETPOLINE - ANNOTATE_NOSPEC_ALTERNATIVE - ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *\reg), \ - __stringify(RETPOLINE_JMP \reg), X86_FEATURE_RETPOLINE, \ - __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *\reg), X86_FEATURE_RETPOLINE_AMD + ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \ + __stringify(jmp __x86_retpoline_\reg), X86_FEATURE_RETPOLINE, \ + __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_AMD #else - jmp *\reg + jmp *%\reg #endif .endm .macro CALL_NOSPEC reg:req #ifdef CONFIG_RETPOLINE - ANNOTATE_NOSPEC_ALTERNATIVE - ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; call *\reg), \ - __stringify(RETPOLINE_CALL \reg), X86_FEATURE_RETPOLINE,\ - __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *\reg), X86_FEATURE_RETPOLINE_AMD + ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; call *%\reg), \ + __stringify(call __x86_retpoline_\reg), X86_FEATURE_RETPOLINE, \ + __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *%\reg), X86_FEATURE_RETPOLINE_AMD #else - call *\reg + call *%\reg #endif .endm @@ -137,10 +104,8 @@ */ .macro FILL_RETURN_BUFFER reg:req nr:req ftr:req #ifdef CONFIG_RETPOLINE - ANNOTATE_NOSPEC_ALTERNATIVE - ALTERNATIVE "jmp .Lskip_rsb_\@", \ - __stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP)) \ - \ftr + ALTERNATIVE "jmp .Lskip_rsb_\@", "", \ftr + __FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP) .Lskip_rsb_\@: #endif .endm @@ -161,16 +126,16 @@ * which is ensured when CONFIG_RETPOLINE is defined. */ # define CALL_NOSPEC \ - ANNOTATE_NOSPEC_ALTERNATIVE \ ALTERNATIVE_2( \ ANNOTATE_RETPOLINE_SAFE \ "call *%[thunk_target]\n", \ - "call __x86_indirect_thunk_%V[thunk_target]\n", \ + "call __x86_retpoline_%V[thunk_target]\n", \ X86_FEATURE_RETPOLINE, \ "lfence;\n" \ ANNOTATE_RETPOLINE_SAFE \ "call *%[thunk_target]\n", \ X86_FEATURE_RETPOLINE_AMD) + # define THUNK_TARGET(addr) [thunk_target] "r" (addr) #else /* CONFIG_X86_32 */ @@ -180,7 +145,6 @@ * here, anyway. 
*/ # define CALL_NOSPEC \ - ANNOTATE_NOSPEC_ALTERNATIVE \ ALTERNATIVE_2( \ ANNOTATE_RETPOLINE_SAFE \ "call *%[thunk_target]\n", \ diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h index 6e060907c163..d25534940bde 100644 --- a/arch/x86/include/asm/orc_types.h +++ b/arch/x86/include/asm/orc_types.h @@ -58,8 +58,7 @@ #define ORC_TYPE_CALL 0 #define ORC_TYPE_REGS 1 #define ORC_TYPE_REGS_IRET 2 -#define UNWIND_HINT_TYPE_SAVE 3 -#define UNWIND_HINT_TYPE_RESTORE 4 +#define UNWIND_HINT_TYPE_RET_OFFSET 3 #ifndef __ASSEMBLY__ /* diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 3bcf27caf6c9..29ee0c088009 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -113,9 +113,10 @@ struct cpuinfo_x86 { /* in KB - valid for CPUS which support this call: */ unsigned int x86_cache_size; int x86_cache_alignment; /* In bytes */ - /* Cache QoS architectural values: */ + /* Cache QoS architectural values, valid only on the BSP: */ int x86_cache_max_rmid; /* max index */ int x86_cache_occ_scale; /* scale to bytes */ + int x86_cache_mbm_width_offset; int x86_power; unsigned long loops_per_jiffy; /* cpuid returned max cores value: */ @@ -727,7 +728,6 @@ static inline void sync_core(void) unsigned int tmp; asm volatile ( - UNWIND_HINT_SAVE "mov %%ss, %0\n\t" "pushq %q0\n\t" "pushq %%rsp\n\t" @@ -737,7 +737,6 @@ static inline void sync_core(void) "pushq %q0\n\t" "pushq $1f\n\t" "iretq\n\t" - UNWIND_HINT_RESTORE "1:" : "=&r" (tmp), ASM_CALL_CONSTRAINT : : "cc", "memory"); #endif diff --git a/arch/x86/include/asm/resctrl_sched.h b/arch/x86/include/asm/resctrl.h index f6b7fe2833cc..07603064df8f 100644 --- a/arch/x86/include/asm/resctrl_sched.h +++ b/arch/x86/include/asm/resctrl.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_RESCTRL_SCHED_H -#define _ASM_X86_RESCTRL_SCHED_H +#ifndef _ASM_X86_RESCTRL_H +#define _ASM_X86_RESCTRL_H #ifdef CONFIG_X86_CPU_RESCTRL @@ -84,10 +84,13 @@ static inline void resctrl_sched_in(void) __resctrl_sched_in(); } +void resctrl_cpu_detect(struct cpuinfo_x86 *c); + #else static inline void resctrl_sched_in(void) {} +static inline void resctrl_cpu_detect(struct cpuinfo_x86 *c) {} #endif /* CONFIG_X86_CPU_RESCTRL */ -#endif /* _ASM_X86_RESCTRL_SCHED_H */ +#endif /* _ASM_X86_RESCTRL_H */ diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index 27c47d183f4b..8b58d6975d5d 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -57,8 +57,10 @@ static __always_inline unsigned long smap_save(void) { unsigned long flags; - asm volatile (ALTERNATIVE("", "pushf; pop %0; " __ASM_CLAC, - X86_FEATURE_SMAP) + asm volatile ("# smap_save\n\t" + ALTERNATIVE("jmp 1f", "", X86_FEATURE_SMAP) + "pushf; pop %0; " __ASM_CLAC "\n\t" + "1:" : "=rm" (flags) : : "memory", "cc"); return flags; @@ -66,7 +68,10 @@ static __always_inline unsigned long smap_save(void) static __always_inline void smap_restore(unsigned long flags) { - asm volatile (ALTERNATIVE("", "push %0; popf", X86_FEATURE_SMAP) + asm volatile ("# smap_restore\n\t" + ALTERNATIVE("jmp 1f", "", X86_FEATURE_SMAP) + "push %0; popf\n\t" + "1:" : : "g" (flags) : "memory", "cc"); } diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index c26a7e1d8a2c..6f6c417e1e46 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -118,11 +118,6 @@ void smp_spurious_interrupt(struct pt_regs *regs); void smp_error_interrupt(struct pt_regs *regs); asmlinkage void 
smp_irq_move_cleanup_interrupt(void); -extern void ist_enter(struct pt_regs *regs); -extern void ist_exit(struct pt_regs *regs); -extern void ist_begin_non_atomic(struct pt_regs *regs); -extern void ist_end_non_atomic(void); - #ifdef CONFIG_VMAP_STACK void __noreturn handle_stack_overflow(const char *message, struct pt_regs *regs, diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h index f5e2eb12cb71..7d903fdb3f43 100644 --- a/arch/x86/include/asm/unwind_hints.h +++ b/arch/x86/include/asm/unwind_hints.h @@ -86,32 +86,15 @@ UNWIND_HINT sp_offset=\sp_offset .endm -.macro UNWIND_HINT_SAVE - UNWIND_HINT type=UNWIND_HINT_TYPE_SAVE -.endm - -.macro UNWIND_HINT_RESTORE - UNWIND_HINT type=UNWIND_HINT_TYPE_RESTORE +/* + * RET_OFFSET: Used on instructions that terminate a function; mostly RETURN + * and sibling calls. On these, sp_offset denotes the expected offset from + * initial_func_cfi. + */ +.macro UNWIND_HINT_RET_OFFSET sp_offset=8 + UNWIND_HINT type=UNWIND_HINT_TYPE_RET_OFFSET sp_offset=\sp_offset .endm -#else /* !__ASSEMBLY__ */ - -#define UNWIND_HINT(sp_reg, sp_offset, type, end) \ - "987: \n\t" \ - ".pushsection .discard.unwind_hints\n\t" \ - /* struct unwind_hint */ \ - ".long 987b - .\n\t" \ - ".short " __stringify(sp_offset) "\n\t" \ - ".byte " __stringify(sp_reg) "\n\t" \ - ".byte " __stringify(type) "\n\t" \ - ".byte " __stringify(end) "\n\t" \ - ".balign 4 \n\t" \ - ".popsection\n\t" - -#define UNWIND_HINT_SAVE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_SAVE, 0) - -#define UNWIND_HINT_RESTORE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_RESTORE, 0) - #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_UNWIND_HINTS_H */ diff --git a/arch/x86/include/uapi/asm/unistd.h b/arch/x86/include/uapi/asm/unistd.h index 196fdd02b8b1..be5e2e747f50 100644 --- a/arch/x86/include/uapi/asm/unistd.h +++ b/arch/x86/include/uapi/asm/unistd.h @@ -2,8 +2,15 @@ #ifndef _UAPI_ASM_X86_UNISTD_H #define _UAPI_ASM_X86_UNISTD_H -/* x32 syscall flag bit */ -#define __X32_SYSCALL_BIT 0x40000000UL +/* + * x32 syscall flag bit. Some user programs expect syscall NR macros + * and __X32_SYSCALL_BIT to have type int, even though syscall numbers + * are, for practical purposes, unsigned long. + * + * Fortunately, expressions like (nr & ~__X32_SYSCALL_BIT) do the right + * thing regardless. 
+ */ +#define __X32_SYSCALL_BIT 0x40000000 #ifndef __KERNEL__ # ifdef __i386__ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 547ad7bbf0e0..c36e89930965 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -18,6 +18,7 @@ #include <asm/pci-direct.h> #include <asm/delay.h> #include <asm/debugreg.h> +#include <asm/resctrl.h> #ifdef CONFIG_X86_64 # include <asm/mmconfig.h> @@ -597,6 +598,8 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) x86_amd_ls_cfg_ssbd_mask = 1ULL << bit; } } + + resctrl_cpu_detect(c); } static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index bed0cb83fe24..d07809286b95 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -854,30 +854,6 @@ static void init_speculation_control(struct cpuinfo_x86 *c) } } -static void init_cqm(struct cpuinfo_x86 *c) -{ - if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { - c->x86_cache_max_rmid = -1; - c->x86_cache_occ_scale = -1; - return; - } - - /* will be overridden if occupancy monitoring exists */ - c->x86_cache_max_rmid = cpuid_ebx(0xf); - - if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) || - cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) || - cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) { - u32 eax, ebx, ecx, edx; - - /* QoS sub-leaf, EAX=0Fh, ECX=1 */ - cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx); - - c->x86_cache_max_rmid = ecx; - c->x86_cache_occ_scale = ebx; - } -} - void get_cpu_cap(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; @@ -945,7 +921,6 @@ void get_cpu_cap(struct cpuinfo_x86 *c) init_scattered_cpuid_features(c); init_speculation_control(c); - init_cqm(c); /* * Clear/Set all flags overridden by options, after probe. @@ -1377,20 +1352,6 @@ static void generic_identify(struct cpuinfo_x86 *c) #endif } -static void x86_init_cache_qos(struct cpuinfo_x86 *c) -{ - /* - * The heavy lifting of max_rmid and cache_occ_scale are handled - * in get_cpu_cap(). Here we just set the max_rmid for the boot_cpu - * in case CQM bits really aren't there in this CPU. - */ - if (c != &boot_cpu_data) { - boot_cpu_data.x86_cache_max_rmid = - min(boot_cpu_data.x86_cache_max_rmid, - c->x86_cache_max_rmid); - } -} - /* * Validate that ACPI/mptables have the same information about the * effective APIC id and update the package map. 
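The __X32_SYSCALL_BIT comment in the unistd.h hunk above can be checked with a few lines of plain userspace C (a standalone illustration, not kernel code): even with the macro typed as int, masking a long syscall number clears only bit 30, because the complemented value sign-extends.

#include <assert.h>

#define __X32_SYSCALL_BIT 0x40000000	/* int, as in the hunk above */

int main(void)
{
	long nr = 0x40000001L;			/* hypothetical x32 syscall number 1 */
	long base = nr & ~__X32_SYSCALL_BIT;	/* ~ yields a negative int, sign-extended to long */

	assert(base == 1);			/* only bit 30 was cleared */
	return 0;
}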
@@ -1503,7 +1464,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) #endif x86_init_rdrand(c); - x86_init_cache_qos(c); setup_pku(c); /* diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index a19a680542ce..166d7c355896 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -22,6 +22,7 @@ #include <asm/cpu_device_id.h> #include <asm/cmdline.h> #include <asm/traps.h> +#include <asm/resctrl.h> #ifdef CONFIG_X86_64 #include <linux/topology.h> @@ -322,6 +323,11 @@ static void early_init_intel(struct cpuinfo_x86 *c) detect_ht_early(c); } +static void bsp_init_intel(struct cpuinfo_x86 *c) +{ + resctrl_cpu_detect(c); +} + #ifdef CONFIG_X86_32 /* * Early probe support logic for ppro memory erratum #50 @@ -961,6 +967,7 @@ static const struct cpu_dev intel_cpu_dev = { #endif .c_detect_tlb = intel_detect_tlb, .c_early_init = early_init_intel, + .c_bsp_init = bsp_init_intel, .c_init = init_intel, .c_x86_vendor = X86_VENDOR_INTEL, }; diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 54165f3569e8..e9265e2f28c9 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -42,6 +42,8 @@ #include <linux/export.h> #include <linux/jump_label.h> #include <linux/set_memory.h> +#include <linux/task_work.h> +#include <linux/hardirq.h> #include <asm/intel-family.h> #include <asm/processor.h> @@ -1086,23 +1088,6 @@ static void mce_clear_state(unsigned long *toclear) } } -static int do_memory_failure(struct mce *m) -{ - int flags = MF_ACTION_REQUIRED; - int ret; - - pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr); - if (!(m->mcgstatus & MCG_STATUS_RIPV)) - flags |= MF_MUST_KILL; - ret = memory_failure(m->addr >> PAGE_SHIFT, flags); - if (ret) - pr_err("Memory error not recovered"); - else - set_mce_nospec(m->addr >> PAGE_SHIFT); - return ret; -} - - /* * Cases where we avoid rendezvous handler timeout: * 1) If this CPU is offline. @@ -1204,6 +1189,29 @@ static void __mc_scan_banks(struct mce *m, struct mce *final, *m = *final; } +static void kill_me_now(struct callback_head *ch) +{ + force_sig(SIGBUS); +} + +static void kill_me_maybe(struct callback_head *cb) +{ + struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me); + int flags = MF_ACTION_REQUIRED; + + pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr); + if (!(p->mce_status & MCG_STATUS_RIPV)) + flags |= MF_MUST_KILL; + + if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags)) { + set_mce_nospec(p->mce_addr >> PAGE_SHIFT); + return; + } + + pr_err("Memory error not recovered"); + kill_me_now(cb); +} + /* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. @@ -1222,7 +1230,7 @@ static void __mc_scan_banks(struct mce *m, struct mce *final, * backing the user stack, tracing that reads the user stack will cause * potentially infinite recursion. 
*/ -void notrace do_machine_check(struct pt_regs *regs, long error_code) +void noinstr do_machine_check(struct pt_regs *regs, long error_code) { DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); DECLARE_BITMAP(toclear, MAX_NR_BANKS); @@ -1259,7 +1267,7 @@ void notrace do_machine_check(struct pt_regs *regs, long error_code) if (__mc_check_crashing_cpu(cpu)) return; - ist_enter(regs); + nmi_enter(); this_cpu_inc(mce_exception_count); @@ -1352,23 +1360,24 @@ void notrace do_machine_check(struct pt_regs *regs, long error_code) /* Fault was in user mode and we need to take some action */ if ((m.cs & 3) == 3) { - ist_begin_non_atomic(regs); - local_irq_enable(); - - if (kill_it || do_memory_failure(&m)) - force_sig(SIGBUS); - local_irq_disable(); - ist_end_non_atomic(); + /* If this triggers there is no way to recover. Die hard. */ + BUG_ON(!on_thread_stack() || !user_mode(regs)); + + current->mce_addr = m.addr; + current->mce_status = m.mcgstatus; + current->mce_kill_me.func = kill_me_maybe; + if (kill_it) + current->mce_kill_me.func = kill_me_now; + task_work_add(current, ¤t->mce_kill_me, true); } else { if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0)) mce_panic("Failed kernel mode recovery", &m, msg); } out_ist: - ist_exit(regs); + nmi_exit(); } EXPORT_SYMBOL_GPL(do_machine_check); -NOKPROBE_SYMBOL(do_machine_check); #ifndef CONFIG_MEMORY_FAILURE int memory_failure(unsigned long pfn, int flags) diff --git a/arch/x86/kernel/cpu/mce/p5.c b/arch/x86/kernel/cpu/mce/p5.c index 4ae6df556526..5ee94aa1b766 100644 --- a/arch/x86/kernel/cpu/mce/p5.c +++ b/arch/x86/kernel/cpu/mce/p5.c @@ -7,6 +7,7 @@ #include <linux/kernel.h> #include <linux/types.h> #include <linux/smp.h> +#include <linux/hardirq.h> #include <asm/processor.h> #include <asm/traps.h> @@ -24,7 +25,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code) { u32 loaddr, hi, lotype; - ist_enter(regs); + nmi_enter(); rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); @@ -39,7 +40,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code) add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - ist_exit(regs); + nmi_exit(); } /* Set up machine check reporting for processors with Intel style MCE: */ diff --git a/arch/x86/kernel/cpu/mce/winchip.c b/arch/x86/kernel/cpu/mce/winchip.c index a30ea13cccc2..b3938c195365 100644 --- a/arch/x86/kernel/cpu/mce/winchip.c +++ b/arch/x86/kernel/cpu/mce/winchip.c @@ -6,6 +6,7 @@ #include <linux/interrupt.h> #include <linux/kernel.h> #include <linux/types.h> +#include <linux/hardirq.h> #include <asm/processor.h> #include <asm/traps.h> @@ -18,12 +19,12 @@ /* Machine check handler for WinChip C6: */ static void winchip_machine_check(struct pt_regs *regs, long error_code) { - ist_enter(regs); + nmi_enter(); pr_emerg("CPU0: Machine Check Exception.\n"); add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - ist_exit(regs); + nmi_exit(); } /* Set up machine check reporting on the Winchip C6 series */ diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 7019d4b2df0c..baec68b7e010 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -545,8 +545,7 @@ static int __wait_for_cpus(atomic_t *t, long long timeout) /* * Returns: * < 0 - on error - * 0 - no update done - * 1 - microcode was updated + * 0 - success (no update done or microcode was updated) */ static int __reload_late(void *info) { @@ -573,11 +572,11 @@ static int __reload_late(void *info) else goto 
wait_for_siblings; - if (err > UCODE_NFOUND) { - pr_warn("Error reloading microcode on CPU %d\n", cpu); + if (err >= UCODE_NFOUND) { + if (err == UCODE_ERROR) + pr_warn("Error reloading microcode on CPU %d\n", cpu); + ret = -1; - } else if (err == UCODE_UPDATED || err == UCODE_OK) { - ret = 1; } wait_for_siblings: @@ -608,7 +607,7 @@ static int microcode_reload_late(void) atomic_set(&late_cpus_out, 0); ret = stop_machine_cpuslocked(__reload_late, NULL, cpu_online_mask); - if (ret > 0) + if (ret == 0) microcode_check(); pr_info("Reload completed, microcode revision: 0x%x\n", boot_cpu_data.microcode); @@ -649,7 +648,7 @@ static ssize_t reload_store(struct device *dev, put: put_online_cpus(); - if (ret >= 0) + if (ret == 0) ret = size; return ret; diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 9556930cd8c1..a5ee607a3b89 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -63,6 +63,10 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) case 15: return msr - MSR_P4_BPU_PERFCTR0; } + fallthrough; + case X86_VENDOR_ZHAOXIN: + case X86_VENDOR_CENTAUR: + return msr - MSR_ARCH_PERFMON_PERFCTR0; } return 0; } @@ -92,6 +96,10 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) case 15: return msr - MSR_P4_BSU_ESCR0; } + fallthrough; + case X86_VENDOR_ZHAOXIN: + case X86_VENDOR_CENTAUR: + return msr - MSR_ARCH_PERFMON_EVENTSEL0; } return 0; diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index d8cc5223b7ce..12f967c6b603 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -22,7 +22,7 @@ #include <linux/cpuhotplug.h> #include <asm/intel-family.h> -#include <asm/resctrl_sched.h> +#include <asm/resctrl.h> #include "internal.h" /* Mutex to protect rdtgroup access. */ @@ -958,6 +958,36 @@ static __init void rdt_init_res_defs(void) static enum cpuhp_state rdt_online; +/* Runs once on the BSP during boot. */ +void resctrl_cpu_detect(struct cpuinfo_x86 *c) +{ + if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { + c->x86_cache_max_rmid = -1; + c->x86_cache_occ_scale = -1; + c->x86_cache_mbm_width_offset = -1; + return; + } + + /* will be overridden if occupancy monitoring exists */ + c->x86_cache_max_rmid = cpuid_ebx(0xf); + + if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) || + cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) || + cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) { + u32 eax, ebx, ecx, edx; + + /* QoS sub-leaf, EAX=0Fh, ECX=1 */ + cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx); + + c->x86_cache_max_rmid = ecx; + c->x86_cache_occ_scale = ebx; + if (c->x86_vendor == X86_VENDOR_INTEL) + c->x86_cache_mbm_width_offset = eax & 0xff; + else + c->x86_cache_mbm_width_offset = -1; + } +} + static int __init resctrl_late_init(void) { struct rdt_resource *r; diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 055c8613b531..934c8fb8a64a 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -495,14 +495,16 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of, return ret; } -void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, - struct rdtgroup *rdtgrp, int evtid, int first) +void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, + struct rdt_domain *d, struct rdtgroup *rdtgrp, + int evtid, int first) { /* * setup the parameters to send to the IPI to read the data. 
*/ rr->rgrp = rdtgrp; rr->evtid = evtid; + rr->r = r; rr->d = d; rr->val = 0; rr->first = first; @@ -539,7 +541,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg) goto out; } - mon_event_read(&rr, d, rdtgrp, evtid, false); + mon_event_read(&rr, r, d, rdtgrp, evtid, false); if (rr.val & RMID_VAL_ERROR) seq_puts(m, "Error\n"); diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 3dd13f3a8b23..f20a47d120b1 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -31,7 +31,7 @@ #define CQM_LIMBOCHECK_INTERVAL 1000 -#define MBM_CNTR_WIDTH 24 +#define MBM_CNTR_WIDTH_BASE 24 #define MBM_OVERFLOW_INTERVAL 1000 #define MAX_MBA_BW 100u #define MBA_IS_LINEAR 0x4 @@ -40,6 +40,12 @@ #define RMID_VAL_ERROR BIT_ULL(63) #define RMID_VAL_UNAVAIL BIT_ULL(62) +/* + * With the above fields in use 62 bits remain in MSR_IA32_QM_CTR for + * data to be returned. The counter width is discovered from the hardware + * as an offset from MBM_CNTR_WIDTH_BASE. + */ +#define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE) struct rdt_fs_context { @@ -87,6 +93,7 @@ union mon_data_bits { struct rmid_read { struct rdtgroup *rgrp; + struct rdt_resource *r; struct rdt_domain *d; int evtid; bool first; @@ -460,6 +467,7 @@ struct rdt_resource { struct list_head evt_list; int num_rmid; unsigned int mon_scale; + unsigned int mbm_width; unsigned long fflags; }; @@ -587,8 +595,9 @@ void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id); void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, struct rdt_domain *d); -void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, - struct rdtgroup *rdtgrp, int evtid, int first); +void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, + struct rdt_domain *d, struct rdtgroup *rdtgrp, + int evtid, int first); void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms); void mbm_handle_overflow(struct work_struct *work); diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 773124b0e18a..837d7d012b7b 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -214,9 +214,9 @@ void free_rmid(u32 rmid) list_add_tail(&entry->list, &rmid_free_lru); } -static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr) +static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) { - u64 shift = 64 - MBM_CNTR_WIDTH, chunks; + u64 shift = 64 - width, chunks; chunks = (cur_msr << shift) - (prev_msr << shift); return chunks >>= shift; @@ -256,7 +256,7 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr) return 0; } - chunks = mbm_overflow_count(m->prev_msr, tval); + chunks = mbm_overflow_count(m->prev_msr, tval, rr->r->mbm_width); m->chunks += chunks; m->prev_msr = tval; @@ -278,7 +278,7 @@ static void mbm_bw_count(u32 rmid, struct rmid_read *rr) if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) return; - chunks = mbm_overflow_count(m->prev_bw_msr, tval); + chunks = mbm_overflow_count(m->prev_bw_msr, tval, rr->r->mbm_width); m->chunks_bw += chunks; m->chunks = m->chunks_bw; cur_bw = (chunks * r->mon_scale) >> 20; @@ -433,11 +433,12 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm) } } -static void mbm_update(struct rdt_domain *d, int rmid) +static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid) { struct rmid_read rr; rr.first = false; + rr.r = r; rr.d = d; /* @@ -510,6 +511,7 @@ void 
mbm_handle_overflow(struct work_struct *work) struct rdtgroup *prgrp, *crgrp; int cpu = smp_processor_id(); struct list_head *head; + struct rdt_resource *r; struct rdt_domain *d; mutex_lock(&rdtgroup_mutex); @@ -517,16 +519,18 @@ void mbm_handle_overflow(struct work_struct *work) if (!static_branch_likely(&rdt_mon_enable_key)) goto out_unlock; - d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]); + r = &rdt_resources_all[RDT_RESOURCE_L3]; + + d = get_domain_from_cpu(cpu, r); if (!d) goto out_unlock; list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - mbm_update(d, prgrp->mon.rmid); + mbm_update(r, d, prgrp->mon.rmid); head = &prgrp->mon.crdtgrp_list; list_for_each_entry(crgrp, head, mon.crdtgrp_list) - mbm_update(d, crgrp->mon.rmid); + mbm_update(r, d, crgrp->mon.rmid); if (is_mba_sc(NULL)) update_mba_bw(prgrp, d); @@ -614,11 +618,18 @@ static void l3_mon_evt_init(struct rdt_resource *r) int rdt_get_mon_l3_config(struct rdt_resource *r) { + unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; unsigned int cl_size = boot_cpu_data.x86_cache_size; int ret; r->mon_scale = boot_cpu_data.x86_cache_occ_scale; r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1; + r->mbm_width = MBM_CNTR_WIDTH_BASE; + + if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX) + r->mbm_width += mbm_offset; + else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX) + pr_warn("Ignoring impossible MBM counter offset\n"); /* * A reasonable upper limit on the max threshold is the number diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index d7623e1b927d..4bd28b388a1a 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -24,7 +24,7 @@ #include <asm/cacheflush.h> #include <asm/intel-family.h> -#include <asm/resctrl_sched.h> +#include <asm/resctrl.h> #include <asm/perf_event.h> #include "../../events/perf_event.h" /* For X86_CONFIG() */ diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 5a359d9fcc05..d7cb5ab0d1f0 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -29,7 +29,7 @@ #include <uapi/linux/magic.h> -#include <asm/resctrl_sched.h> +#include <asm/resctrl.h> #include "internal.h" DEFINE_STATIC_KEY_FALSE(rdt_enable_key); @@ -2472,7 +2472,7 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, goto out_destroy; if (is_mbm_event(mevt->evtid)) - mon_event_read(&rr, d, prgrp, mevt->evtid, true); + mon_event_read(&rr, r, d, prgrp, mevt->evtid, true); } kernfs_activate(kn); return 0; diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 32b153d38748..6a54e83d5589 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -957,18 +957,31 @@ static inline bool xfeatures_mxcsr_quirk(u64 xfeatures) return true; } -/* - * This is similar to user_regset_copyout(), but will not add offset to - * the source data pointer or increment pos, count, kbuf, and ubuf. 
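
Note: the variable-width mbm_overflow_count() introduced above is the heart of the MBM change. The counter in MSR_IA32_QM_CTR is only `width` bits wide (MBM_CNTR_WIDTH_BASE = 24 by default, 24 plus the CPUID offset otherwise, capped at 62 because bits 62/63 are the ERROR/UNAVAIL flags), so deltas must be computed modulo 2^width. A self-contained sketch of the arithmetic with made-up sample values:

    #include <stdio.h>
    #include <stdint.h>

    /* Shifting both samples into the top bits lets the ordinary unsigned
     * 64-bit subtraction absorb the wraparound; shifting back down
     * recovers the chunk delta. */
    static uint64_t mbm_overflow_count(uint64_t prev, uint64_t cur,
                                       unsigned int width)
    {
            uint64_t shift = 64 - width;

            return ((cur << shift) - (prev << shift)) >> shift;
    }

    int main(void)
    {
            /* 24-bit counter wrapped: prev near the top, cur near zero */
            uint64_t prev = 0xfffff0, cur = 0x000010;

            /* prints 32, even though cur < prev numerically */
            printf("chunks = %llu\n", (unsigned long long)
                   mbm_overflow_count(prev, cur, 24));
            return 0;
    }
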
- */ -static inline void -__copy_xstate_to_kernel(void *kbuf, const void *data, - unsigned int offset, unsigned int size, unsigned int size_total) +static void fill_gap(unsigned to, void **kbuf, unsigned *pos, unsigned *count) { - if (offset < size_total) { - unsigned int copy = min(size, size_total - offset); + if (*pos < to) { + unsigned size = to - *pos; + + if (size > *count) + size = *count; + memcpy(*kbuf, (void *)&init_fpstate.xsave + *pos, size); + *kbuf += size; + *pos += size; + *count -= size; + } +} - memcpy(kbuf + offset, data, copy); +static void copy_part(unsigned offset, unsigned size, void *from, + void **kbuf, unsigned *pos, unsigned *count) +{ + fill_gap(offset, kbuf, pos, count); + if (size > *count) + size = *count; + if (size) { + memcpy(*kbuf, from, size); + *kbuf += size; + *pos += size; + *count -= size; } } @@ -981,8 +994,9 @@ __copy_xstate_to_kernel(void *kbuf, const void *data, */ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total) { - unsigned int offset, size; struct xstate_header header; + const unsigned off_mxcsr = offsetof(struct fxregs_state, mxcsr); + unsigned count = size_total; int i; /* @@ -998,46 +1012,42 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of header.xfeatures = xsave->header.xfeatures; header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR; + if (header.xfeatures & XFEATURE_MASK_FP) + copy_part(0, off_mxcsr, + &xsave->i387, &kbuf, &offset_start, &count); + if (header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM)) + copy_part(off_mxcsr, MXCSR_AND_FLAGS_SIZE, + &xsave->i387.mxcsr, &kbuf, &offset_start, &count); + if (header.xfeatures & XFEATURE_MASK_FP) + copy_part(offsetof(struct fxregs_state, st_space), 128, + &xsave->i387.st_space, &kbuf, &offset_start, &count); + if (header.xfeatures & XFEATURE_MASK_SSE) + copy_part(xstate_offsets[XFEATURE_MASK_SSE], 256, + &xsave->i387.xmm_space, &kbuf, &offset_start, &count); + /* + * Fill xsave->i387.sw_reserved value for ptrace frame: + */ + copy_part(offsetof(struct fxregs_state, sw_reserved), 48, + xstate_fx_sw_bytes, &kbuf, &offset_start, &count); /* * Copy xregs_state->header: */ - offset = offsetof(struct xregs_state, header); - size = sizeof(header); - - __copy_xstate_to_kernel(kbuf, &header, offset, size, size_total); + copy_part(offsetof(struct xregs_state, header), sizeof(header), + &header, &kbuf, &offset_start, &count); - for (i = 0; i < XFEATURE_MAX; i++) { + for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { /* * Copy only in-use xstates: */ if ((header.xfeatures >> i) & 1) { void *src = __raw_xsave_addr(xsave, i); - offset = xstate_offsets[i]; - size = xstate_sizes[i]; - - /* The next component has to fit fully into the output buffer: */ - if (offset + size > size_total) - break; - - __copy_xstate_to_kernel(kbuf, src, offset, size, size_total); + copy_part(xstate_offsets[i], xstate_sizes[i], + src, &kbuf, &offset_start, &count); } } - - if (xfeatures_mxcsr_quirk(header.xfeatures)) { - offset = offsetof(struct fxregs_state, mxcsr); - size = MXCSR_AND_FLAGS_SIZE; - __copy_xstate_to_kernel(kbuf, &xsave->i387.mxcsr, offset, size, size_total); - } - - /* - * Fill xsave->i387.sw_reserved value for ptrace frame: - */ - offset = offsetof(struct fxregs_state, sw_reserved); - size = sizeof(xstate_fx_sw_bytes); - - __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, size_total); + fill_gap(size_total, &kbuf, &offset_start, &count); return 0; } diff --git a/arch/x86/kernel/ftrace.c 
b/arch/x86/kernel/ftrace.c index b0e641793be4..c84d28e90a58 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -282,7 +282,8 @@ static inline void tramp_free(void *tramp) { } /* Defined as markers to the end of the ftrace default trampolines */ extern void ftrace_regs_caller_end(void); -extern void ftrace_epilogue(void); +extern void ftrace_regs_caller_ret(void); +extern void ftrace_caller_end(void); extern void ftrace_caller_op_ptr(void); extern void ftrace_regs_caller_op_ptr(void); @@ -334,7 +335,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) call_offset = (unsigned long)ftrace_regs_call; } else { start_offset = (unsigned long)ftrace_caller; - end_offset = (unsigned long)ftrace_epilogue; + end_offset = (unsigned long)ftrace_caller_end; op_offset = (unsigned long)ftrace_caller_op_ptr; call_offset = (unsigned long)ftrace_call; } @@ -366,6 +367,13 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) if (WARN_ON(ret < 0)) goto fail; + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { + ip = trampoline + (ftrace_regs_caller_ret - ftrace_regs_caller); + ret = probe_kernel_read(ip, (void *)retq, RET_SIZE); + if (WARN_ON(ret < 0)) + goto fail; + } + /* * The address of the ftrace_ops that is used for this trampoline * is stored at the end of the trampoline. This will be used to @@ -433,7 +441,7 @@ void set_ftrace_ops_ro(void) end_offset = (unsigned long)ftrace_regs_caller_end; } else { start_offset = (unsigned long)ftrace_caller; - end_offset = (unsigned long)ftrace_epilogue; + end_offset = (unsigned long)ftrace_caller_end; } size = end_offset - start_offset; size = size + RET_SIZE + sizeof(void *); diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S index e8a9f8370112..e405fe1a8bf4 100644 --- a/arch/x86/kernel/ftrace_32.S +++ b/arch/x86/kernel/ftrace_32.S @@ -189,5 +189,5 @@ return_to_handler: movl %eax, %ecx popl %edx popl %eax - JMP_NOSPEC %ecx + JMP_NOSPEC ecx #endif diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 369e61faacfe..aa5d28aeb31e 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -23,7 +23,7 @@ #endif /* CONFIG_FRAME_POINTER */ /* Size of stack used to save mcount regs in save_mcount_regs */ -#define MCOUNT_REG_SIZE (SS+8 + MCOUNT_FRAME_SIZE) +#define MCOUNT_REG_SIZE (FRAME_SIZE + MCOUNT_FRAME_SIZE) /* * gcc -pg option adds a call to 'mcount' in most functions. @@ -77,7 +77,7 @@ /* * We add enough stack to save all regs. */ - subq $(MCOUNT_REG_SIZE - MCOUNT_FRAME_SIZE), %rsp + subq $(FRAME_SIZE), %rsp movq %rax, RAX(%rsp) movq %rcx, RCX(%rsp) movq %rdx, RDX(%rsp) @@ -157,8 +157,12 @@ SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) * think twice before adding any new code or changing the * layout here. 
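
Note: create_trampoline() above sizes its copy window from the addresses of asm labels (now ftrace_caller..ftrace_caller_end rather than ftrace_epilogue) and, in the SAVE_REGS case, additionally patches a return instruction at the ftrace_regs_caller_ret offset. Below is a toy user-space analogue of that copy-then-patch step; the byte template, offsets, and names are invented purely for illustration.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* invented stand-in for the asm template between start and end labels */
    static const unsigned char tmpl[] = {
            0x55,                   /* push %rbp                */
            0x90, 0x90, 0x90,       /* handler body ...         */
            0x5d,                   /* pop %rbp                 */
            0x90,                   /* ret slot, patched below  */
    };
    #define RET_OFF 5 /* analogue of ftrace_regs_caller_ret - ftrace_regs_caller */

    int main(void)
    {
            unsigned char *tramp = malloc(sizeof(tmpl));

            if (!tramp)
                    return 1;
            memcpy(tramp, tmpl, sizeof(tmpl));  /* copy the template ...    */
            tramp[RET_OFF] = 0xc3;              /* ... then patch in a retq */
            printf("built %zu-byte trampoline, ret at +%d\n",
                   sizeof(tmpl), RET_OFF);
            free(tramp);
            return 0;
    }
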
*/ -SYM_INNER_LABEL(ftrace_epilogue, SYM_L_GLOBAL) +SYM_INNER_LABEL(ftrace_caller_end, SYM_L_GLOBAL) + jmp ftrace_epilogue +SYM_FUNC_END(ftrace_caller); + +SYM_FUNC_START(ftrace_epilogue) #ifdef CONFIG_FUNCTION_GRAPH_TRACER SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL) jmp ftrace_stub @@ -170,14 +174,12 @@ SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL) */ SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK) retq -SYM_FUNC_END(ftrace_caller) +SYM_FUNC_END(ftrace_epilogue) SYM_FUNC_START(ftrace_regs_caller) /* Save the current flags before any operations that can change them */ pushfq - UNWIND_HINT_SAVE - /* added 8 bytes to save flags */ save_mcount_regs 8 /* save_mcount_regs fills in first two parameters */ @@ -233,10 +235,13 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) movq ORIG_RAX(%rsp), %rax movq %rax, MCOUNT_REG_SIZE-8(%rsp) - /* If ORIG_RAX is anything but zero, make this a call to that */ + /* + * If ORIG_RAX is anything but zero, make this a call to that. + * See arch_ftrace_set_direct_caller(). + */ movq ORIG_RAX(%rsp), %rax - cmpq $0, %rax - je 1f + testq %rax, %rax + jz 1f /* Swap the flags with orig_rax */ movq MCOUNT_REG_SIZE(%rsp), %rdi @@ -244,20 +249,14 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) movq %rax, MCOUNT_REG_SIZE(%rsp) restore_mcount_regs 8 + /* Restore flags */ + popfq - jmp 2f +SYM_INNER_LABEL(ftrace_regs_caller_ret, SYM_L_GLOBAL); + UNWIND_HINT_RET_OFFSET + jmp ftrace_epilogue 1: restore_mcount_regs - - -2: - /* - * The stack layout is nondetermistic here, depending on which path was - * taken. This confuses objtool and ORC, rightfully so. For now, - * pretend the stack always looks like the non-direct case. - */ - UNWIND_HINT_RESTORE - /* Restore flags */ popfq @@ -268,7 +267,6 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL) * to the return. */ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL) - jmp ftrace_epilogue SYM_FUNC_END(ftrace_regs_caller) @@ -303,7 +301,7 @@ trace: * function tracing is enabled. 
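
Note: the rewritten ORIG_RAX test above is the asm half of the direct-call mechanism; the new comment points at arch_ftrace_set_direct_caller() as the C half. A simplified, hypothetical sketch of that handshake follows (minimal stand-in types, not the exact upstream helper or struct pt_regs):

    #include <stdio.h>

    /* simplified stand-in for struct pt_regs; only orig_ax matters here */
    struct pt_regs { unsigned long orig_ax; };

    /* sketch of arch_ftrace_set_direct_caller(): a non-zero ORIG_RAX makes
     * the testq %rax,%rax / jz sequence above take the "call" path */
    static void set_direct_caller(struct pt_regs *regs, unsigned long addr)
    {
            regs->orig_ax = addr;
    }

    int main(void)
    {
            struct pt_regs regs = { 0 };

            set_direct_caller(&regs, 0xffffffffa0000000UL);
            puts(regs.orig_ax ? "direct call taken" : "normal return");
            return 0;
    }
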
*/ movq ftrace_trace_function, %r8 - CALL_NOSPEC %r8 + CALL_NOSPEC r8 restore_mcount_regs jmp fgraph_trace @@ -340,6 +338,6 @@ SYM_CODE_START(return_to_handler) movq 8(%rsp), %rdx movq (%rsp), %rax addq $24, %rsp - JMP_NOSPEC %rdi + JMP_NOSPEC rdi SYM_CODE_END(return_to_handler) #endif diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index a53e7b4a7419..e2fab3ceb09f 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -33,15 +33,15 @@ void io_bitmap_share(struct task_struct *tsk) set_tsk_thread_flag(tsk, TIF_IO_BITMAP); } -static void task_update_io_bitmap(void) +static void task_update_io_bitmap(struct task_struct *tsk) { - struct thread_struct *t = ¤t->thread; + struct thread_struct *t = &tsk->thread; if (t->iopl_emul == 3 || t->io_bitmap) { /* TSS update is handled on exit to user space */ - set_thread_flag(TIF_IO_BITMAP); + set_tsk_thread_flag(tsk, TIF_IO_BITMAP); } else { - clear_thread_flag(TIF_IO_BITMAP); + clear_tsk_thread_flag(tsk, TIF_IO_BITMAP); /* Invalidate TSS */ preempt_disable(); tss_update_io_bitmap(); @@ -49,12 +49,12 @@ static void task_update_io_bitmap(void) } } -void io_bitmap_exit(void) +void io_bitmap_exit(struct task_struct *tsk) { - struct io_bitmap *iobm = current->thread.io_bitmap; + struct io_bitmap *iobm = tsk->thread.io_bitmap; - current->thread.io_bitmap = NULL; - task_update_io_bitmap(); + tsk->thread.io_bitmap = NULL; + task_update_io_bitmap(tsk); if (iobm && refcount_dec_and_test(&iobm->refcnt)) kfree(iobm); } @@ -102,7 +102,7 @@ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) if (!iobm) return -ENOMEM; refcount_set(&iobm->refcnt, 1); - io_bitmap_exit(); + io_bitmap_exit(current); } /* @@ -134,7 +134,7 @@ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) } /* All permissions dropped? */ if (max_long == UINT_MAX) { - io_bitmap_exit(); + io_bitmap_exit(current); return 0; } @@ -192,7 +192,7 @@ SYSCALL_DEFINE1(iopl, unsigned int, level) } t->iopl_emul = level; - task_update_io_bitmap(); + task_update_io_bitmap(current); return 0; } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 9da70b279dad..35638f1c5791 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -96,7 +96,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) } /* - * Free current thread data structures etc.. + * Free thread data structures etc.. 
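
Note: the ioport.c hunks above convert io_bitmap_exit() and task_update_io_bitmap() from acting implicitly on current to taking an explicit task_struct, and exit_thread() below now passes its tsk argument through. A generic sketch of the pattern, with hypothetical types, showing the kind of caller that apparently motivates it (tearing down a task that is not the running one):

    #include <stdio.h>
    #include <stdlib.h>

    struct task {
            unsigned long *io_bitmap;   /* hypothetical per-task state */
    };

    /* before: acted on an implicit global `current`; now the caller
     * says explicitly which task to tear down */
    static void bitmap_exit(struct task *tsk)
    {
            free(tsk->io_bitmap);
            tsk->io_bitmap = NULL;
    }

    int main(void)
    {
            struct task child = { .io_bitmap = malloc(1024) };

            /* fork-failure style teardown: child is not the running task,
             * so an implicit-current version would touch the wrong state */
            bitmap_exit(&child);
            printf("child bitmap freed: %s\n",
                   child.io_bitmap ? "no" : "yes");
            return 0;
    }
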
*/ void exit_thread(struct task_struct *tsk) { @@ -104,7 +104,7 @@ void exit_thread(struct task_struct *tsk) struct fpu *fpu = &t->fpu; if (test_thread_flag(TIF_IO_BITMAP)) - io_bitmap_exit(); + io_bitmap_exit(tsk); free_vm86(t); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 954b013cc585..538d4e8d6589 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -52,7 +52,7 @@ #include <asm/debugreg.h> #include <asm/switch_to.h> #include <asm/vm86.h> -#include <asm/resctrl_sched.h> +#include <asm/resctrl.h> #include <asm/proto.h> #include "process.h" diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 5ef9d8f25b0e..0c169a5687e1 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -52,7 +52,7 @@ #include <asm/switch_to.h> #include <asm/xen/hypervisor.h> #include <asm/vdso.h> -#include <asm/resctrl_sched.h> +#include <asm/resctrl.h> #include <asm/unistd.h> #include <asm/fsgsbase.h> #ifdef CONFIG_IA32_EMULATION diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index d54cffdc7cac..f7cfb9d0ad02 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -37,10 +37,12 @@ #include <linux/mm.h> #include <linux/smp.h> #include <linux/io.h> +#include <linux/hardirq.h> +#include <linux/atomic.h> + #include <asm/stacktrace.h> #include <asm/processor.h> #include <asm/debugreg.h> -#include <linux/atomic.h> #include <asm/text-patching.h> #include <asm/ftrace.h> #include <asm/traps.h> @@ -82,78 +84,6 @@ static inline void cond_local_irq_disable(struct pt_regs *regs) local_irq_disable(); } -/* - * In IST context, we explicitly disable preemption. This serves two - * purposes: it makes it much less likely that we would accidentally - * schedule in IST context and it will force a warning if we somehow - * manage to schedule by accident. - */ -void ist_enter(struct pt_regs *regs) -{ - if (user_mode(regs)) { - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - } else { - /* - * We might have interrupted pretty much anything. In - * fact, if we're a machine check, we can even interrupt - * NMI processing. We don't want in_nmi() to return true, - * but we need to notify RCU. - */ - rcu_nmi_enter(); - } - - preempt_disable(); - - /* This code is a bit fragile. Test it. */ - RCU_LOCKDEP_WARN(!rcu_is_watching(), "ist_enter didn't work"); -} -NOKPROBE_SYMBOL(ist_enter); - -void ist_exit(struct pt_regs *regs) -{ - preempt_enable_no_resched(); - - if (!user_mode(regs)) - rcu_nmi_exit(); -} - -/** - * ist_begin_non_atomic() - begin a non-atomic section in an IST exception - * @regs: regs passed to the IST exception handler - * - * IST exception handlers normally cannot schedule. As a special - * exception, if the exception interrupted userspace code (i.e. - * user_mode(regs) would return true) and the exception was not - * a double fault, it can be safe to schedule. ist_begin_non_atomic() - * begins a non-atomic section within an ist_enter()/ist_exit() region. - * Callers are responsible for enabling interrupts themselves inside - * the non-atomic section, and callers must call ist_end_non_atomic() - * before ist_exit(). - */ -void ist_begin_non_atomic(struct pt_regs *regs) -{ - BUG_ON(!user_mode(regs)); - - /* - * Sanity check: we need to be on the normal thread stack. This - * will catch asm bugs and any attempt to use ist_preempt_enable - * from double_fault. 
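
Note: the series-wide conversion visible in the mce, p5.c and winchip.c hunks above, and in the traps.c hunks around this point, replaces the bespoke ist_enter()/ist_exit() brackets with the generic nmi_enter()/nmi_exit() pair from <linux/hardirq.h>. A stand-alone sketch of the bracket shape, with printf stubs standing in for the real kernel primitives:

    #include <stdio.h>

    /* stand-ins for nmi_enter()/nmi_exit() from <linux/hardirq.h> */
    #define nmi_enter() puts("nmi_enter(): NMI-like context begins")
    #define nmi_exit()  puts("nmi_exit(): NMI-like context ends")

    /* same bracket shape as winchip_machine_check() above */
    static void machine_check_sketch(void)
    {
            nmi_enter();
            /* ... read status registers, report, taint ... */
            nmi_exit();
    }

    int main(void)
    {
            machine_check_sketch();
            return 0;
    }
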
- */ - BUG_ON(!on_thread_stack()); - - preempt_enable_no_resched(); -} - -/** - * ist_end_non_atomic() - begin a non-atomic section in an IST exception - * - * Ends a non-atomic section started with ist_begin_non_atomic(). - */ -void ist_end_non_atomic(void) -{ - preempt_disable(); -} - int is_valid_bugaddr(unsigned long addr) { unsigned short ud; @@ -363,7 +293,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign * The net result is that our #GP handler will think that we * entered from usermode with the bad user context. * - * No need for ist_enter here because we don't use RCU. + * No need for nmi_enter() here because we don't use RCU. */ if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY && regs->cs == __KERNEL_CS && @@ -398,7 +328,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign } #endif - ist_enter(regs); + nmi_enter(); notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); tsk->thread.error_code = error_code; @@ -592,19 +522,13 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) return; /* - * Unlike any other non-IST entry, we can be called from a kprobe in - * non-CONTEXT_KERNEL kernel mode or even during context tracking - * state changes. Make sure that we wake up RCU even if we're coming - * from kernel code. - * - * This means that we can't schedule even if we came from a - * preemptible kernel context. That's okay. + * Unlike any other non-IST entry, we can be called from pretty much + * any location in the kernel through kprobes -- text_poke() will most + * likely be handled by poke_int3_handler() above. This means this + * handler is effectively NMI-like. */ - if (!user_mode(regs)) { - rcu_nmi_enter(); - preempt_disable(); - } - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); + if (!user_mode(regs)) + nmi_enter(); #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, @@ -626,10 +550,8 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) cond_local_irq_disable(regs); exit: - if (!user_mode(regs)) { - preempt_enable_no_resched(); - rcu_nmi_exit(); - } + if (!user_mode(regs)) + nmi_exit(); } NOKPROBE_SYMBOL(do_int3); @@ -733,7 +655,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) unsigned long dr6; int si_code; - ist_enter(regs); + nmi_enter(); get_debugreg(dr6, 6); /* @@ -826,7 +748,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) debug_stack_usage_dec(); exit: - ist_exit(regs); + nmi_exit(); } NOKPROBE_SYMBOL(do_debug); diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S index 4742e8fa7ee7..d1d768912368 100644 --- a/arch/x86/lib/checksum_32.S +++ b/arch/x86/lib/checksum_32.S @@ -153,7 +153,7 @@ SYM_FUNC_START(csum_partial) negl %ebx lea 45f(%ebx,%ebx,2), %ebx testl %esi, %esi - JMP_NOSPEC %ebx + JMP_NOSPEC ebx # Handle 2-byte-aligned regions 20: addw (%esi), %ax @@ -436,7 +436,7 @@ SYM_FUNC_START(csum_partial_copy_generic) andl $-32,%edx lea 3f(%ebx,%ebx), %ebx testl %esi, %esi - JMP_NOSPEC %ebx + JMP_NOSPEC ebx 1: addl $64,%esi addl $64,%edi SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl) diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 363ec132df7e..b4c43a9b1483 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -7,15 +7,31 @@ #include <asm/alternative-asm.h> #include <asm/export.h> #include <asm/nospec-branch.h> +#include <asm/unwind_hints.h> +#include <asm/frame.h> .macro 
THUNK reg .section .text.__x86.indirect_thunk + .align 32 SYM_FUNC_START(__x86_indirect_thunk_\reg) - CFI_STARTPROC - JMP_NOSPEC %\reg - CFI_ENDPROC + JMP_NOSPEC \reg SYM_FUNC_END(__x86_indirect_thunk_\reg) + +SYM_FUNC_START_NOALIGN(__x86_retpoline_\reg) + ANNOTATE_INTRA_FUNCTION_CALL + call .Ldo_rop_\@ +.Lspec_trap_\@: + UNWIND_HINT_EMPTY + pause + lfence + jmp .Lspec_trap_\@ +.Ldo_rop_\@: + mov %\reg, (%_ASM_SP) + UNWIND_HINT_RET_OFFSET + ret +SYM_FUNC_END(__x86_retpoline_\reg) + .endm /* @@ -24,25 +40,24 @@ SYM_FUNC_END(__x86_indirect_thunk_\reg) * only see one instance of "__x86_indirect_thunk_\reg" rather * than one per register with the correct names. So we do it * the simple and nasty way... + * + * Worse, you can only have a single EXPORT_SYMBOL per line, + * and CPP can't insert newlines, so we have to repeat everything + * at least twice. */ -#define __EXPORT_THUNK(sym) _ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym) -#define EXPORT_THUNK(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg) -#define GENERATE_THUNK(reg) THUNK reg ; EXPORT_THUNK(reg) - -GENERATE_THUNK(_ASM_AX) -GENERATE_THUNK(_ASM_BX) -GENERATE_THUNK(_ASM_CX) -GENERATE_THUNK(_ASM_DX) -GENERATE_THUNK(_ASM_SI) -GENERATE_THUNK(_ASM_DI) -GENERATE_THUNK(_ASM_BP) -#ifdef CONFIG_64BIT -GENERATE_THUNK(r8) -GENERATE_THUNK(r9) -GENERATE_THUNK(r10) -GENERATE_THUNK(r11) -GENERATE_THUNK(r12) -GENERATE_THUNK(r13) -GENERATE_THUNK(r14) -GENERATE_THUNK(r15) -#endif + +#define __EXPORT_THUNK(sym) _ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym) +#define EXPORT_THUNK(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg) +#define EXPORT_RETPOLINE(reg) __EXPORT_THUNK(__x86_retpoline_ ## reg) + +#undef GEN +#define GEN(reg) THUNK reg +#include <asm/GEN-for-each-reg.h> + +#undef GEN +#define GEN(reg) EXPORT_THUNK(reg) +#include <asm/GEN-for-each-reg.h> + +#undef GEN +#define GEN(reg) EXPORT_RETPOLINE(reg) +#include <asm/GEN-for-each-reg.h> diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S index 15da118f04f0..90380a17ab23 100644 --- a/arch/x86/platform/efi/efi_stub_64.S +++ b/arch/x86/platform/efi/efi_stub_64.S @@ -21,7 +21,7 @@ SYM_FUNC_START(__efi_call) mov %r8, %r9 mov %rcx, %r8 mov %rsi, %rcx - CALL_NOSPEC %rdi + CALL_NOSPEC rdi leave ret SYM_FUNC_END(__efi_call)
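
Note: the retpoline.S rewrite above replaces the hand-rolled GENERATE_THUNK list with an X-macro: each consumer redefines GEN() and re-includes asm/GEN-for-each-reg.h, which expands GEN(reg) once per general-purpose register. A miniature, compilable illustration of the idiom follows; since a separate header is impractical in one snippet, a FOR_EACH_REG macro with three fake registers stands in for the repeated #include.

    #include <stdio.h>

    /* stand-in for the repeated #include <asm/GEN-for-each-reg.h> */
    #define FOR_EACH_REG \
            GEN(ax)      \
            GEN(cx)      \
            GEN(dx)

    /* first expansion: emit one thunk per register */
    #define GEN(reg) static void thunk_##reg(void) { printf("thunk_%s\n", #reg); }
    FOR_EACH_REG
    #undef GEN

    int main(void)
    {
            /* second expansion of the same list: one call per register */
    #define GEN(reg) thunk_##reg();
            FOR_EACH_REG
    #undef GEN
            return 0;
    }
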