diff options
Diffstat (limited to 'arch/x86/events')
35 files changed, 11263 insertions, 2033 deletions
diff --git a/arch/x86/events/Kconfig b/arch/x86/events/Kconfig index 9a7a1446cb3a..dabdf3d7bf84 100644 --- a/arch/x86/events/Kconfig +++ b/arch/x86/events/Kconfig @@ -5,33 +5,51 @@ config PERF_EVENTS_INTEL_UNCORE tristate "Intel uncore performance events" depends on PERF_EVENTS && CPU_SUP_INTEL && PCI default y - ---help--- - Include support for Intel uncore performance events. These are - available on NehalemEX and more modern processors. + help + Include support for Intel uncore performance events. These are + available on NehalemEX and more modern processors. config PERF_EVENTS_INTEL_RAPL - tristate "Intel rapl performance events" - depends on PERF_EVENTS && CPU_SUP_INTEL && PCI + tristate "Intel/AMD rapl performance events" + depends on PERF_EVENTS && (CPU_SUP_INTEL || CPU_SUP_AMD) && PCI default y - ---help--- - Include support for Intel rapl performance events for power - monitoring on modern processors. + help + Include support for Intel and AMD rapl performance events for power + monitoring on modern processors. config PERF_EVENTS_INTEL_CSTATE tristate "Intel cstate performance events" depends on PERF_EVENTS && CPU_SUP_INTEL && PCI default y - ---help--- - Include support for Intel cstate performance events for power - monitoring on modern processors. + help + Include support for Intel cstate performance events for power + monitoring on modern processors. config PERF_EVENTS_AMD_POWER depends on PERF_EVENTS && CPU_SUP_AMD tristate "AMD Processor Power Reporting Mechanism" - ---help--- + help Provide power reporting mechanism support for AMD processors. Currently, it leverages X86_FEATURE_ACC_POWER (CPUID Fn8000_0007_EDX[12]) interface to calculate the average power consumption on Family 15h processors. +config PERF_EVENTS_AMD_UNCORE + tristate "AMD Uncore performance events" + depends on PERF_EVENTS && CPU_SUP_AMD + default y + help + Include support for AMD uncore performance events for use with + e.g., perf stat -e amd_l3/.../,amd_df/.../. 
+ + To compile this driver as a module, choose M here: the + module will be called 'amd-uncore'. + +config PERF_EVENTS_AMD_BRS + depends on PERF_EVENTS && CPU_SUP_AMD + bool "AMD Zen3 Branch Sampling support" + help + Enable AMD Zen3 branch sampling support (BRS) which samples up to + 16 consecutive taken branches in registers. + endmenu diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 9e07f554333f..86a76efa8bb6 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -1,5 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-y += core.o probe.o +obj-y += core.o probe.o utils.o +obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += rapl.o obj-y += amd/ obj-$(CONFIG_X86_LOCAL_APIC) += msr.o obj-$(CONFIG_CPU_SUP_INTEL) += intel/ +obj-$(CONFIG_CPU_SUP_CENTAUR) += zhaoxin/ +obj-$(CONFIG_CPU_SUP_ZHAOXIN) += zhaoxin/ diff --git a/arch/x86/events/amd/Makefile b/arch/x86/events/amd/Makefile index fe8795a67385..527d947eb76b 100644 --- a/arch/x86/events/amd/Makefile +++ b/arch/x86/events/amd/Makefile @@ -1,8 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_CPU_SUP_AMD) += core.o uncore.o +obj-$(CONFIG_CPU_SUP_AMD) += core.o lbr.o +obj-$(CONFIG_PERF_EVENTS_AMD_BRS) += brs.o obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += power.o obj-$(CONFIG_X86_LOCAL_APIC) += ibs.o +obj-$(CONFIG_PERF_EVENTS_AMD_UNCORE) += amd-uncore.o +amd-uncore-objs := uncore.o ifdef CONFIG_AMD_IOMMU obj-$(CONFIG_CPU_SUP_AMD) += iommu.o endif - diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c new file mode 100644 index 000000000000..f1bff153d945 --- /dev/null +++ b/arch/x86/events/amd/brs.c @@ -0,0 +1,434 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Implement support for AMD Fam19h Branch Sampling feature + * Based on specifications published in AMD PPR Fam19 Model 01 + * + * Copyright 2021 Google LLC + * Contributed by Stephane Eranian <eranian@google.com> + */ +#include <linux/kernel.h> +#include <linux/jump_label.h> +#include <asm/msr.h> +#include 
<asm/cpufeature.h> + +#include "../perf_event.h" + +#define BRS_POISON 0xFFFFFFFFFFFFFFFEULL /* mark limit of valid entries */ + +/* Debug Extension Configuration register layout */ +union amd_debug_extn_cfg { + __u64 val; + struct { + __u64 rsvd0:2, /* reserved */ + brsmen:1, /* branch sample enable */ + rsvd4_3:2,/* reserved - must be 0x3 */ + vb:1, /* valid branches recorded */ + rsvd2:10, /* reserved */ + msroff:4, /* index of next entry to write */ + rsvd3:4, /* reserved */ + pmc:3, /* #PMC holding the sampling event */ + rsvd4:37; /* reserved */ + }; +}; + +static inline unsigned int brs_from(int idx) +{ + return MSR_AMD_SAMP_BR_FROM + 2 * idx; +} + +static inline unsigned int brs_to(int idx) +{ + return MSR_AMD_SAMP_BR_FROM + 2 * idx + 1; +} + +static inline void set_debug_extn_cfg(u64 val) +{ + /* bits[4:3] must always be set to 11b */ + wrmsrl(MSR_AMD_DBG_EXTN_CFG, val | 3ULL << 3); +} + +static inline u64 get_debug_extn_cfg(void) +{ + u64 val; + + rdmsrl(MSR_AMD_DBG_EXTN_CFG, val); + return val; +} + +static bool __init amd_brs_detect(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_BRS)) + return false; + + switch (boot_cpu_data.x86) { + case 0x19: /* AMD Fam19h (Zen3) */ + x86_pmu.lbr_nr = 16; + + /* No hardware filtering supported */ + x86_pmu.lbr_sel_map = NULL; + x86_pmu.lbr_sel_mask = 0; + break; + default: + return false; + } + + return true; +} + +/* + * Current BRS implementation does not support branch type or privilege level + * filtering. Therefore, this function simply enforces these limitations. No need for + * a br_sel_map. Software filtering is not supported because it would not correlate well + * with a sampling period. 
+ */ +static int amd_brs_setup_filter(struct perf_event *event) +{ + u64 type = event->attr.branch_sample_type; + + /* No BRS support */ + if (!x86_pmu.lbr_nr) + return -EOPNOTSUPP; + + /* Can only capture all branches, i.e., no filtering */ + if ((type & ~PERF_SAMPLE_BRANCH_PLM_ALL) != PERF_SAMPLE_BRANCH_ANY) + return -EINVAL; + + return 0; +} + +static inline int amd_is_brs_event(struct perf_event *e) +{ + return (e->hw.config & AMD64_RAW_EVENT_MASK) == AMD_FAM19H_BRS_EVENT; +} + +int amd_brs_hw_config(struct perf_event *event) +{ + int ret = 0; + + /* + * Due to interrupt holding, BRS is not recommended in + * counting mode. + */ + if (!is_sampling_event(event)) + return -EINVAL; + + /* + * Due to the way BRS operates by holding the interrupt until + * lbr_nr entries have been captured, it does not make sense + * to allow sampling on BRS with an event that does not match + * what BRS is capturing, i.e., retired taken branches. + * Otherwise the correlation with the event's period is even + * more loose: + * + * With retired taken branch: + * Effective P = P + 16 + X + * With any other event: + * Effective P = P + Y + X + * + * Where X is the number of taken branches due to interrupt + * skid. Skid is large. + * + * Where Y is the occurrences of the event while BRS is + * capturing the lbr_nr entries. + * + * By using retired taken branches, we limit the impact on the + * Y variable. We know it cannot be more than the depth of + * BRS. + */ + if (!amd_is_brs_event(event)) + return -EINVAL; + + /* + * BRS implementation does not work with frequency mode + * reprogramming of the period. + */ + if (event->attr.freq) + return -EINVAL; + /* + * The kernel subtracts BRS depth from period, so it must + * be big enough. 
+ */ + if (event->attr.sample_period <= x86_pmu.lbr_nr) + return -EINVAL; + + /* + * Check if we can allow PERF_SAMPLE_BRANCH_STACK + */ + ret = amd_brs_setup_filter(event); + + /* only set in case of success */ + if (!ret) + event->hw.flags |= PERF_X86_EVENT_AMD_BRS; + + return ret; +} + +/* tos = top of stack, i.e., last valid entry written */ +static inline int amd_brs_get_tos(union amd_debug_extn_cfg *cfg) +{ + /* + * msroff: index of next entry to write so top-of-stack is one off + * if BRS is full then msroff is set back to 0. + */ + return (cfg->msroff ? cfg->msroff : x86_pmu.lbr_nr) - 1; +} + +/* + * make sure we have a sane BRS offset to begin with + * especially with kexec + */ +void amd_brs_reset(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_BRS)) + return; + + /* + * Reset config + */ + set_debug_extn_cfg(0); + + /* + * Mark first entry as poisoned + */ + wrmsrl(brs_to(0), BRS_POISON); +} + +int __init amd_brs_init(void) +{ + if (!amd_brs_detect()) + return -EOPNOTSUPP; + + pr_cont("%d-deep BRS, ", x86_pmu.lbr_nr); + + return 0; +} + +void amd_brs_enable(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + union amd_debug_extn_cfg cfg; + + /* Activate only on first user */ + if (++cpuc->brs_active > 1) + return; + + cfg.val = 0; /* reset all fields */ + cfg.brsmen = 1; /* enable branch sampling */ + + /* Set enable bit */ + set_debug_extn_cfg(cfg.val); +} + +void amd_brs_enable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + if (cpuc->lbr_users) + amd_brs_enable(); +} + +void amd_brs_disable(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + union amd_debug_extn_cfg cfg; + + /* Check if active (could be disabled via x86_pmu_disable_all()) */ + if (!cpuc->brs_active) + return; + + /* Only disable for last user */ + if (--cpuc->brs_active) + return; + + /* + * Clear the brsmen bit but preserve the others as they contain + * useful state such as vb and msroff + */ + cfg.val = 
get_debug_extn_cfg(); + + /* + * When coming in on interrupt and BRS is full, then hw will have + * already stopped BRS, no need to issue wrmsr again + */ + if (cfg.brsmen) { + cfg.brsmen = 0; + set_debug_extn_cfg(cfg.val); + } +} + +void amd_brs_disable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + if (cpuc->lbr_users) + amd_brs_disable(); +} + +static bool amd_brs_match_plm(struct perf_event *event, u64 to) +{ + int type = event->attr.branch_sample_type; + int plm_k = PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_HV; + int plm_u = PERF_SAMPLE_BRANCH_USER; + + if (!(type & plm_k) && kernel_ip(to)) + return 0; + + if (!(type & plm_u) && !kernel_ip(to)) + return 0; + + return 1; +} + +/* + * Caller must ensure amd_brs_inuse() is true before calling + * return: + */ +void amd_brs_drain(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_event *event = cpuc->events[0]; + struct perf_branch_entry *br = cpuc->lbr_entries; + union amd_debug_extn_cfg cfg; + u32 i, nr = 0, num, tos, start; + u32 shift = 64 - boot_cpu_data.x86_virt_bits; + + /* + * BRS event forced on PMC0, + * so check if there is an event. 
+ * It is possible to have lbr_users > 0 but the event + * not yet scheduled due to long latency PMU irq + */ + if (!event) + goto empty; + + cfg.val = get_debug_extn_cfg(); + + /* Sanity check [0-x86_pmu.lbr_nr] */ + if (WARN_ON_ONCE(cfg.msroff >= x86_pmu.lbr_nr)) + goto empty; + + /* No valid branch */ + if (cfg.vb == 0) + goto empty; + + /* + * msr.off points to next entry to be written + * tos = most recent entry index = msr.off - 1 + * BRS register buffer saturates, so we know we have + * start < tos and that we have to read from start to tos + */ + start = 0; + tos = amd_brs_get_tos(&cfg); + + num = tos - start + 1; + + /* + * BRS is only one pass (saturation) from MSROFF to depth-1 + * MSROFF wraps to zero when buffer is full + */ + for (i = 0; i < num; i++) { + u32 brs_idx = tos - i; + u64 from, to; + + rdmsrl(brs_to(brs_idx), to); + + /* Entry does not belong to us (as marked by kernel) */ + if (to == BRS_POISON) + break; + + /* + * Sign-extend SAMP_BR_TO to 64 bits, bits 61-63 are reserved. + * Necessary to generate proper virtual addresses suitable for + * symbolization + */ + to = (u64)(((s64)to << shift) >> shift); + + if (!amd_brs_match_plm(event, to)) + continue; + + rdmsrl(brs_from(brs_idx), from); + + perf_clear_branch_entry_bitfields(br+nr); + + br[nr].from = from; + br[nr].to = to; + + nr++; + } +empty: + /* Record number of sampled branches */ + cpuc->lbr_stack.nr = nr; +} + +/* + * Poison most recent entry to prevent reuse by next task + * required because BRS entry are not tagged by PID + */ +static void amd_brs_poison_buffer(void) +{ + union amd_debug_extn_cfg cfg; + unsigned int idx; + + /* Get current state */ + cfg.val = get_debug_extn_cfg(); + + /* idx is most recently written entry */ + idx = amd_brs_get_tos(&cfg); + + /* Poison target of entry */ + wrmsrl(brs_to(idx), BRS_POISON); +} + +/* + * On context switch in, we need to make sure no samples from previous user + * are left in the BRS. 
+ * + * On ctxswin, sched_in = true, called after the PMU has started + * On ctxswout, sched_in = false, called before the PMU is stopped + */ +void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + /* no active users */ + if (!cpuc->lbr_users) + return; + + /* + * On context switch in, we need to ensure we do not use entries + * from previous BRS user on that CPU, so we poison the buffer as + * a faster way compared to resetting all entries. + */ + if (sched_in) + amd_brs_poison_buffer(); +} + +/* + * called from ACPI processor_idle.c or acpi_pad.c + * with interrupts disabled + */ +void perf_amd_brs_lopwr_cb(bool lopwr_in) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + union amd_debug_extn_cfg cfg; + + /* + * on mwait in, we may end up in non C0 state. + * we must disable branch sampling to avoid holding the NMI + * for too long. We disable it in hardware but we + * keep the state in cpuc, so we can re-enable. 
+ * + * The hardware will deliver the NMI if needed when brsmen cleared + */ + if (cpuc->brs_active) { + cfg.val = get_debug_extn_cfg(); + cfg.brsmen = !lopwr_in; + set_debug_extn_cfg(cfg.val); + } +} + +DEFINE_STATIC_CALL_NULL(perf_lopwr_cb, perf_amd_brs_lopwr_cb); +EXPORT_STATIC_CALL_TRAMP_GPL(perf_lopwr_cb); + +void __init amd_brs_lopwr_init(void) +{ + static_call_update(perf_lopwr_cb, perf_amd_brs_lopwr_cb); +} diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 39eb276d0277..8b70237c33f7 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only #include <linux/perf_event.h> +#include <linux/jump_label.h> #include <linux/export.h> #include <linux/types.h> #include <linux/init.h> @@ -7,6 +8,7 @@ #include <linux/delay.h> #include <linux/jiffies.h> #include <asm/apicdef.h> +#include <asm/apic.h> #include <asm/nmi.h> #include "../perf_event.h" @@ -18,6 +20,9 @@ static unsigned long perf_nmi_window; #define AMD_MERGE_EVENT ((0xFULL << 32) | 0xFFULL) #define AMD_MERGE_EVENT_ENABLE (AMD_MERGE_EVENT | ARCH_PERFMON_EVENTSEL_ENABLE) +/* PMC Enable and Overflow bits for PerfCntrGlobal* registers */ +static u64 amd_pmu_global_cntr_mask __read_mostly; + static __initconst const u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -325,6 +330,8 @@ static inline bool amd_is_pair_event_code(struct hw_perf_event *hwc) } } +DEFINE_STATIC_CALL_RET0(amd_pmu_branch_hw_config, *x86_pmu.hw_config); + static int amd_core_hw_config(struct perf_event *event) { if (event->attr.exclude_host && event->attr.exclude_guest) @@ -343,6 +350,9 @@ static int amd_core_hw_config(struct perf_event *event) if ((x86_pmu.flags & PMU_FL_PAIR) && amd_is_pair_event_code(&event->hw)) event->hw.flags |= PERF_X86_EVENT_PAIR; + if (has_branch_stack(event)) + return static_call(amd_pmu_branch_hw_config)(event); + return 0; } @@ -366,7 +376,7 @@ static int amd_pmu_hw_config(struct 
perf_event *event) if (event->attr.precise_ip && get_ibs_caps()) return -ENOENT; - if (has_branch_stack(event)) + if (has_branch_stack(event) && !x86_pmu.lbr_nr) return -EOPNOTSUPP; ret = x86_pmu_hw_config(event); @@ -510,20 +520,46 @@ static struct amd_nb *amd_alloc_nb(int cpu) return nb; } +typedef void (amd_pmu_branch_reset_t)(void); +DEFINE_STATIC_CALL_NULL(amd_pmu_branch_reset, amd_pmu_branch_reset_t); + +static void amd_pmu_cpu_reset(int cpu) +{ + if (x86_pmu.lbr_nr) + static_call(amd_pmu_branch_reset)(); + + if (x86_pmu.version < 2) + return; + + /* Clear enable bits i.e. PerfCntrGlobalCtl.PerfCntrEn */ + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0); + + /* Clear overflow bits i.e. PerfCntrGLobalStatus.PerfCntrOvfl */ + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, amd_pmu_global_cntr_mask); +} + static int amd_pmu_cpu_prepare(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + cpuc->lbr_sel = kzalloc_node(sizeof(struct er_account), GFP_KERNEL, + cpu_to_node(cpu)); + if (!cpuc->lbr_sel) + return -ENOMEM; + WARN_ON_ONCE(cpuc->amd_nb); if (!x86_pmu.amd_nb_constraints) return 0; cpuc->amd_nb = amd_alloc_nb(cpu); - if (!cpuc->amd_nb) - return -ENOMEM; + if (cpuc->amd_nb) + return 0; - return 0; + kfree(cpuc->lbr_sel); + cpuc->lbr_sel = NULL; + + return -ENOMEM; } static void amd_pmu_cpu_starting(int cpu) @@ -538,7 +574,7 @@ static void amd_pmu_cpu_starting(int cpu) if (!x86_pmu.amd_nb_constraints) return; - nb_id = amd_get_nb_id(cpu); + nb_id = topology_die_id(cpu); WARN_ON_ONCE(nb_id == BAD_APICID); for_each_online_cpu(i) { @@ -555,17 +591,20 @@ static void amd_pmu_cpu_starting(int cpu) cpuc->amd_nb->nb_id = nb_id; cpuc->amd_nb->refcnt++; + + amd_pmu_cpu_reset(cpu); } static void amd_pmu_cpu_dead(int cpu) { - struct cpu_hw_events *cpuhw; + struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu); + + kfree(cpuhw->lbr_sel); + cpuhw->lbr_sel = NULL; if (!x86_pmu.amd_nb_constraints) return; - cpuhw = &per_cpu(cpu_hw_events, cpu); - if 
(cpuhw->amd_nb) { struct amd_nb *nb = cpuhw->amd_nb; @@ -574,8 +613,52 @@ static void amd_pmu_cpu_dead(int cpu) cpuhw->amd_nb = NULL; } + + amd_pmu_cpu_reset(cpu); +} + +static inline void amd_pmu_set_global_ctl(u64 ctl) +{ + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, ctl); +} + +static inline u64 amd_pmu_get_global_status(void) +{ + u64 status; + + /* PerfCntrGlobalStatus is read-only */ + rdmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, status); + + return status; +} + +static inline void amd_pmu_ack_global_status(u64 status) +{ + /* + * PerfCntrGlobalStatus is read-only but an overflow acknowledgment + * mechanism exists; writing 1 to a bit in PerfCntrGlobalStatusClr + * clears the same bit in PerfCntrGlobalStatus + */ + + wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, status); +} + +static bool amd_pmu_test_overflow_topbit(int idx) +{ + u64 counter; + + rdmsrl(x86_pmu_event_addr(idx), counter); + + return !(counter & BIT_ULL(x86_pmu.cntval_bits - 1)); +} + +static bool amd_pmu_test_overflow_status(int idx) +{ + return amd_pmu_get_global_status() & BIT_ULL(idx); } +DEFINE_STATIC_CALL(amd_pmu_test_overflow, amd_pmu_test_overflow_topbit); + /* * When a PMC counter overflows, an NMI is used to process the event and * reset the counter. NMI latency can result in the counter being updated @@ -588,7 +671,6 @@ static void amd_pmu_cpu_dead(int cpu) static void amd_pmu_wait_on_overflow(int idx) { unsigned int i; - u64 counter; /* * Wait for the counter to be reset if it has overflowed. This loop @@ -596,8 +678,7 @@ static void amd_pmu_wait_on_overflow(int idx) * forever... 
*/ for (i = 0; i < OVERFLOW_WAIT_COUNT; i++) { - rdmsrl(x86_pmu_event_addr(idx), counter); - if (counter & (1ULL << (x86_pmu.cntval_bits - 1))) + if (!static_call(amd_pmu_test_overflow)(idx)) break; /* Might be in IRQ context, so can't sleep */ @@ -605,13 +686,11 @@ static void amd_pmu_wait_on_overflow(int idx) } } -static void amd_pmu_disable_all(void) +static void amd_pmu_check_overflow(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int idx; - x86_pmu_disable_all(); - /* * This shouldn't be called from NMI context, but add a safeguard here * to return, since if we're in NMI context we can't wait for an NMI @@ -623,7 +702,7 @@ static void amd_pmu_disable_all(void) /* * Check each counter for overflow and wait for it to be reset by the * NMI if it has overflowed. This relies on the fact that all active - * counters are always enabled when this function is caled and + * counters are always enabled when this function is called and * ARCH_PERFMON_EVENTSEL_INT is always set. */ for (idx = 0; idx < x86_pmu.num_counters; idx++) { @@ -634,6 +713,53 @@ static void amd_pmu_disable_all(void) } } +static void amd_pmu_enable_event(struct perf_event *event) +{ + x86_pmu_enable_event(event); +} + +static void amd_pmu_enable_all(int added) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int idx; + + amd_brs_enable_all(); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + /* only activate events which are marked as active */ + if (!test_bit(idx, cpuc->active_mask)) + continue; + + amd_pmu_enable_event(cpuc->events[idx]); + } +} + +static void amd_pmu_v2_enable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + /* + * Testing cpu_hw_events.enabled should be skipped in this case unlike + * in x86_pmu_enable_event(). + * + * Since cpu_hw_events.enabled is set only after returning from + * x86_pmu_start(), the PMCs must be programmed and kept ready. + * Counting starts only after x86_pmu_enable_all() is called. 
+ */ + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); +} + +static __always_inline void amd_pmu_core_enable_all(void) +{ + amd_pmu_set_global_ctl(amd_pmu_global_cntr_mask); +} + +static void amd_pmu_v2_enable_all(int added) +{ + amd_pmu_lbr_enable_all(); + amd_pmu_core_enable_all(); +} + static void amd_pmu_disable_event(struct perf_event *event) { x86_pmu_disable_event(event); @@ -651,6 +777,41 @@ static void amd_pmu_disable_event(struct perf_event *event) amd_pmu_wait_on_overflow(event->hw.idx); } +static void amd_pmu_disable_all(void) +{ + amd_brs_disable_all(); + x86_pmu_disable_all(); + amd_pmu_check_overflow(); +} + +static __always_inline void amd_pmu_core_disable_all(void) +{ + amd_pmu_set_global_ctl(0); +} + +static void amd_pmu_v2_disable_all(void) +{ + amd_pmu_core_disable_all(); + amd_pmu_lbr_disable_all(); + amd_pmu_check_overflow(); +} + +DEFINE_STATIC_CALL_NULL(amd_pmu_branch_add, *x86_pmu.add); + +static void amd_pmu_add_event(struct perf_event *event) +{ + if (needs_branch_stack(event)) + static_call(amd_pmu_branch_add)(event); +} + +DEFINE_STATIC_CALL_NULL(amd_pmu_branch_del, *x86_pmu.del); + +static void amd_pmu_del_event(struct perf_event *event) +{ + if (needs_branch_stack(event)) + static_call(amd_pmu_branch_del)(event); +} + /* * Because of NMI latency, if multiple PMC counters are active or other sources * of NMIs are received, the perf NMI handler can handle one or more overflowed @@ -669,13 +830,8 @@ static void amd_pmu_disable_event(struct perf_event *event) * handled a counter. When an un-handled NMI is received, it will be claimed * only if arriving within that window. */ -static int amd_pmu_handle_irq(struct pt_regs *regs) +static inline int amd_pmu_adjust_nmi_window(int handled) { - int handled; - - /* Process any counter overflows */ - handled = x86_pmu_handle_irq(regs); - /* * If a counter was handled, record a timestamp such that un-handled * NMIs will be claimed if arriving within that window. 
@@ -692,6 +848,124 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) return NMI_HANDLED; } +static int amd_pmu_handle_irq(struct pt_regs *regs) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int handled; + int pmu_enabled; + + /* + * Save the PMU state. + * It needs to be restored when leaving the handler. + */ + pmu_enabled = cpuc->enabled; + cpuc->enabled = 0; + + /* stop everything (includes BRS) */ + amd_pmu_disable_all(); + + /* Drain BRS if in use (could be inactive) */ + if (cpuc->lbr_users) + amd_brs_drain(); + + /* Process any counter overflows */ + handled = x86_pmu_handle_irq(regs); + + cpuc->enabled = pmu_enabled; + if (pmu_enabled) + amd_pmu_enable_all(0); + + return amd_pmu_adjust_nmi_window(handled); +} + +static int amd_pmu_v2_handle_irq(struct pt_regs *regs) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_sample_data data; + struct hw_perf_event *hwc; + struct perf_event *event; + int handled = 0, idx; + u64 status, mask; + bool pmu_enabled; + + /* + * Save the PMU state as it needs to be restored when leaving the + * handler + */ + pmu_enabled = cpuc->enabled; + cpuc->enabled = 0; + + /* Stop counting but do not disable LBR */ + amd_pmu_core_disable_all(); + + status = amd_pmu_get_global_status(); + + /* Check if any overflows are pending */ + if (!status) + goto done; + + /* Read branch records before unfreezing */ + if (status & GLOBAL_STATUS_LBRS_FROZEN) { + amd_pmu_lbr_read(); + status &= ~GLOBAL_STATUS_LBRS_FROZEN; + } + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + if (!test_bit(idx, cpuc->active_mask)) + continue; + + event = cpuc->events[idx]; + hwc = &event->hw; + x86_perf_event_update(event); + mask = BIT_ULL(idx); + + if (!(status & mask)) + continue; + + /* Event overflow */ + handled++; + perf_sample_data_init(&data, 0, hwc->last_period); + + if (!x86_perf_event_set_period(event)) + continue; + + if (has_branch_stack(event)) { + data.br_stack = &cpuc->lbr_stack; + 
data.sample_flags |= PERF_SAMPLE_BRANCH_STACK; + } + + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); + + status &= ~mask; + } + + /* + * It should never be the case that some overflows are not handled as + * the corresponding PMCs are expected to be inactive according to the + * active_mask + */ + WARN_ON(status > 0); + + /* Clear overflow and freeze bits */ + amd_pmu_ack_global_status(~status); + + /* + * Unmasking the LVTPC is not required as the Mask (M) bit of the LVT + * PMI entry is not set by the local APIC when a PMC overflow occurs + */ + inc_irq_stat(apic_perf_irqs); + +done: + cpuc->enabled = pmu_enabled; + + /* Resume counting only if PMU is active */ + if (pmu_enabled) + amd_pmu_core_enable_all(); + + return amd_pmu_adjust_nmi_window(handled); +} + static struct event_constraint * amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event) @@ -897,6 +1171,51 @@ static void amd_put_event_constraints_f17h(struct cpu_hw_events *cpuc, --cpuc->n_pair; } +/* + * Because of the way BRS operates with inactive and active phases, and + * the link to one counter, it is not possible to have two events using BRS + * scheduled at the same time. There would be an issue with enforcing the + * period of each one and given that the BRS saturates, it would not be possible + * to guarantee correlated content for all events. Therefore, in situations + * where multiple events want to use BRS, the kernel enforces mutual exclusion. + * Exclusion is enforced by choosing only one counter for events using BRS. + * The event scheduling logic will then automatically multiplex the + * events and ensure that at most one event is actively using BRS. 
+ * + * The BRS counter could be any counter, but there is no constraint on Fam19h, + * therefore all counters are equal and thus we pick the first one: PMC0 + */ +static struct event_constraint amd_fam19h_brs_cntr0_constraint = + EVENT_CONSTRAINT(0, 0x1, AMD64_RAW_EVENT_MASK); + +static struct event_constraint amd_fam19h_brs_pair_cntr0_constraint = + __EVENT_CONSTRAINT(0, 0x1, AMD64_RAW_EVENT_MASK, 1, 0, PERF_X86_EVENT_PAIR); + +static struct event_constraint * +amd_get_event_constraints_f19h(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + bool has_brs = has_amd_brs(hwc); + + /* + * In case BRS is used with an event requiring a counter pair, + * the kernel allows it but only on counter 0 & 1 to enforce + * multiplexing requiring to protect BRS in case of multiple + * BRS users + */ + if (amd_is_pair_event_code(hwc)) { + return has_brs ? &amd_fam19h_brs_pair_cntr0_constraint + : &pair_constraint; + } + + if (has_brs) + return &amd_fam19h_brs_cntr0_constraint; + + return &unconstrained; +} + + static ssize_t amd_event_sysfs_show(char *page, u64 config) { u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) | @@ -905,12 +1224,22 @@ static ssize_t amd_event_sysfs_show(char *page, u64 config) return x86_event_sysfs_show(page, config, event); } +static void amd_pmu_limit_period(struct perf_event *event, s64 *left) +{ + /* + * Decrease period by the depth of the BRS feature to get the last N + * taken branches and approximate the desired period + */ + if (has_branch_stack(event) && *left > x86_pmu.lbr_nr) + *left -= x86_pmu.lbr_nr; +} + static __initconst const struct x86_pmu amd_pmu = { .name = "AMD", .handle_irq = amd_pmu_handle_irq, .disable_all = amd_pmu_disable_all, - .enable_all = x86_pmu_enable_all, - .enable = x86_pmu_enable_event, + .enable_all = amd_pmu_enable_all, + .enable = amd_pmu_enable_event, .disable = amd_pmu_disable_event, .hw_config = amd_pmu_hw_config, .schedule_events = 
x86_schedule_events, @@ -920,6 +1249,8 @@ static __initconst const struct x86_pmu amd_pmu = { .event_map = amd_pmu_event_map, .max_events = ARRAY_SIZE(amd_perfmon_event_map), .num_counters = AMD64_NUM_COUNTERS, + .add = amd_pmu_add_event, + .del = amd_pmu_del_event, .cntval_bits = 48, .cntval_mask = (1ULL << 48) - 1, .apic = 1, @@ -938,8 +1269,68 @@ static __initconst const struct x86_pmu amd_pmu = { .amd_nb_constraints = 1, }; +static ssize_t branches_show(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", x86_pmu.lbr_nr); +} + +static DEVICE_ATTR_RO(branches); + +static struct attribute *amd_pmu_branches_attrs[] = { + &dev_attr_branches.attr, + NULL, +}; + +static umode_t +amd_branches_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return x86_pmu.lbr_nr ? attr->mode : 0; +} + +static struct attribute_group group_caps_amd_branches = { + .name = "caps", + .attrs = amd_pmu_branches_attrs, + .is_visible = amd_branches_is_visible, +}; + +#ifdef CONFIG_PERF_EVENTS_AMD_BRS + +EVENT_ATTR_STR(branch-brs, amd_branch_brs, + "event=" __stringify(AMD_FAM19H_BRS_EVENT)"\n"); + +static struct attribute *amd_brs_events_attrs[] = { + EVENT_PTR(amd_branch_brs), + NULL, +}; + +static umode_t +amd_brs_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return static_cpu_has(X86_FEATURE_BRS) && x86_pmu.lbr_nr ? 
+ attr->mode : 0; +} + +static struct attribute_group group_events_amd_brs = { + .name = "events", + .attrs = amd_brs_events_attrs, + .is_visible = amd_brs_is_visible, +}; + +#endif /* CONFIG_PERF_EVENTS_AMD_BRS */ + +static const struct attribute_group *amd_attr_update[] = { + &group_caps_amd_branches, +#ifdef CONFIG_PERF_EVENTS_AMD_BRS + &group_events_amd_brs, +#endif + NULL, +}; + static int __init amd_core_pmu_init(void) { + union cpuid_0x80000022_ebx ebx; u64 even_ctr_mask = 0ULL; int i; @@ -957,6 +1348,27 @@ static int __init amd_core_pmu_init(void) x86_pmu.eventsel = MSR_F15H_PERF_CTL; x86_pmu.perfctr = MSR_F15H_PERF_CTR; x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE; + + /* Check for Performance Monitoring v2 support */ + if (boot_cpu_has(X86_FEATURE_PERFMON_V2)) { + ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES); + + /* Update PMU version for later usage */ + x86_pmu.version = 2; + + /* Find the number of available Core PMCs */ + x86_pmu.num_counters = ebx.split.num_core_pmc; + + amd_pmu_global_cntr_mask = (1ULL << x86_pmu.num_counters) - 1; + + /* Update PMC handling functions */ + x86_pmu.enable_all = amd_pmu_v2_enable_all; + x86_pmu.disable_all = amd_pmu_v2_disable_all; + x86_pmu.enable = amd_pmu_v2_enable_event; + x86_pmu.handle_irq = amd_pmu_v2_handle_irq; + static_call_update(amd_pmu_test_overflow, amd_pmu_test_overflow_status); + } + /* * AMD Core perfctr has separate MSRs for the NB events, see * the amd/uncore.c driver. 
@@ -989,6 +1401,37 @@ static int __init amd_core_pmu_init(void) x86_pmu.flags |= PMU_FL_PAIR; } + /* LBR and BRS are mutually exclusive features */ + if (!amd_pmu_lbr_init()) { + /* LBR requires flushing on context switch */ + x86_pmu.sched_task = amd_pmu_lbr_sched_task; + static_call_update(amd_pmu_branch_hw_config, amd_pmu_lbr_hw_config); + static_call_update(amd_pmu_branch_reset, amd_pmu_lbr_reset); + static_call_update(amd_pmu_branch_add, amd_pmu_lbr_add); + static_call_update(amd_pmu_branch_del, amd_pmu_lbr_del); + } else if (!amd_brs_init()) { + /* + * BRS requires special event constraints and flushing on ctxsw. + */ + x86_pmu.get_event_constraints = amd_get_event_constraints_f19h; + x86_pmu.sched_task = amd_pmu_brs_sched_task; + x86_pmu.limit_period = amd_pmu_limit_period; + + static_call_update(amd_pmu_branch_hw_config, amd_brs_hw_config); + static_call_update(amd_pmu_branch_reset, amd_brs_reset); + static_call_update(amd_pmu_branch_add, amd_pmu_brs_add); + static_call_update(amd_pmu_branch_del, amd_pmu_brs_del); + + /* + * put_event_constraints callback same as Fam17h, set above + */ + + /* branch sampling must be stopped when entering low power */ + amd_brs_lopwr_init(); + } + + x86_pmu.attr_update = amd_attr_update; + pr_cont("core perfctr, "); return 0; } @@ -1023,6 +1466,24 @@ __init int amd_pmu_init(void) return 0; } +static inline void amd_pmu_reload_virt(void) +{ + if (x86_pmu.version >= 2) { + /* + * Clear global enable bits, reprogram the PERF_CTL + * registers with updated perf_ctr_virt_mask and then + * set global enable bits once again + */ + amd_pmu_v2_disable_all(); + amd_pmu_enable_all(0); + amd_pmu_v2_enable_all(0); + return; + } + + amd_pmu_disable_all(); + amd_pmu_enable_all(0); +} + void amd_pmu_enable_virt(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1030,8 +1491,7 @@ void amd_pmu_enable_virt(void) cpuc->perf_ctr_virt_mask = 0; /* Reload all events */ - amd_pmu_disable_all(); - x86_pmu_enable_all(0); + 
amd_pmu_reload_virt(); } EXPORT_SYMBOL_GPL(amd_pmu_enable_virt); @@ -1048,7 +1508,6 @@ void amd_pmu_disable_virt(void) cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; /* Reload all events */ - amd_pmu_disable_all(); - x86_pmu_enable_all(0); + amd_pmu_reload_virt(); } EXPORT_SYMBOL_GPL(amd_pmu_disable_virt); diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 26c36357c4c9..4cb710efbdd9 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -26,6 +26,7 @@ static u32 ibs_caps; #include <linux/hardirq.h> #include <asm/nmi.h> +#include <asm/amd-ibs.h> #define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT) #define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT @@ -89,24 +90,13 @@ struct perf_ibs { u64 max_period; unsigned long offset_mask[1]; int offset_max; + unsigned int fetch_count_reset_broken : 1; + unsigned int fetch_ignore_if_zero_rip : 1; struct cpu_perf_ibs __percpu *pcpu; - struct attribute **format_attrs; - struct attribute_group format_group; - const struct attribute_group *attr_groups[2]; - u64 (*get_count)(u64 config); }; -struct perf_ibs_data { - u32 size; - union { - u32 data[0]; /* data buffer starts here */ - u32 caps; - }; - u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX]; -}; - static int perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period) { @@ -327,18 +317,28 @@ static int perf_ibs_set_period(struct perf_ibs *perf_ibs, static u64 get_ibs_fetch_count(u64 config) { - return (config & IBS_FETCH_CNT) >> 12; + union ibs_fetch_ctl fetch_ctl = (union ibs_fetch_ctl)config; + + return fetch_ctl.fetch_cnt << 4; } static u64 get_ibs_op_count(u64 config) { + union ibs_op_ctl op_ctl = (union ibs_op_ctl)config; u64 count = 0; - if (config & IBS_OP_VAL) - count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */ - - if (ibs_caps & IBS_CAPS_RDWROPCNT) - count += (config & IBS_OP_CUR_CNT) >> 32; + /* + * If the internal 27-bit counter rolled over, the count is MaxCnt + * and the lower 7 bits of 
CurCnt are randomized. + * Otherwise CurCnt has the full 27-bit current counter value. + */ + if (op_ctl.op_val) { + count = op_ctl.opmaxcnt << 4; + if (ibs_caps & IBS_CAPS_OPCNTEXT) + count += op_ctl.opmaxcnt_ext << 20; + } else if (ibs_caps & IBS_CAPS_RDWROPCNT) { + count = op_ctl.opcurcnt; + } return count; } @@ -363,7 +363,12 @@ perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event, static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs, struct hw_perf_event *hwc, u64 config) { - wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask); + u64 tmp = hwc->config | config; + + if (perf_ibs->fetch_count_reset_broken) + wrmsrl(hwc->config_base, tmp & ~perf_ibs->enable_mask); + + wrmsrl(hwc->config_base, tmp | perf_ibs->enable_mask); } /* @@ -394,7 +399,7 @@ static void perf_ibs_start(struct perf_event *event, int flags) struct hw_perf_event *hwc = &event->hw; struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); - u64 period; + u64 period, config = 0; if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) return; @@ -403,13 +408,19 @@ static void perf_ibs_start(struct perf_event *event, int flags) hwc->state = 0; perf_ibs_set_period(perf_ibs, hwc, &period); + if (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_OPCNTEXT)) { + config |= period & IBS_OP_MAX_CNT_EXT_MASK; + period &= ~IBS_OP_MAX_CNT_EXT_MASK; + } + config |= period >> 4; + /* * Set STARTED before enabling the hardware, such that a subsequent NMI * must observe it. 
*/ set_bit(IBS_STARTED, pcpu->state); clear_bit(IBS_STOPPING, pcpu->state); - perf_ibs_enable_event(perf_ibs, hwc, period >> 4); + perf_ibs_enable_event(perf_ibs, hwc, config); perf_event_update_userpage(event); } @@ -503,16 +514,118 @@ static void perf_ibs_del(struct perf_event *event, int flags) static void perf_ibs_read(struct perf_event *event) { } +/* + * We need to initialize with empty group if all attributes in the + * group are dynamic. + */ +static struct attribute *attrs_empty[] = { + NULL, +}; + +static struct attribute_group empty_format_group = { + .name = "format", + .attrs = attrs_empty, +}; + +static struct attribute_group empty_caps_group = { + .name = "caps", + .attrs = attrs_empty, +}; + +static const struct attribute_group *empty_attr_groups[] = { + &empty_format_group, + &empty_caps_group, + NULL, +}; + PMU_FORMAT_ATTR(rand_en, "config:57"); PMU_FORMAT_ATTR(cnt_ctl, "config:19"); +PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59"); +PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16"); +PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1"); + +static umode_t +zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return ibs_caps & IBS_CAPS_ZEN4 ? 
attr->mode : 0; +} -static struct attribute *ibs_fetch_format_attrs[] = { +static struct attribute *rand_en_attrs[] = { &format_attr_rand_en.attr, NULL, }; -static struct attribute *ibs_op_format_attrs[] = { - NULL, /* &format_attr_cnt_ctl.attr if IBS_CAPS_OPCNT */ +static struct attribute *fetch_l3missonly_attrs[] = { + &fetch_l3missonly.attr.attr, + NULL, +}; + +static struct attribute *zen4_ibs_extensions_attrs[] = { + &zen4_ibs_extensions.attr.attr, + NULL, +}; + +static struct attribute_group group_rand_en = { + .name = "format", + .attrs = rand_en_attrs, +}; + +static struct attribute_group group_fetch_l3missonly = { + .name = "format", + .attrs = fetch_l3missonly_attrs, + .is_visible = zen4_ibs_extensions_is_visible, +}; + +static struct attribute_group group_zen4_ibs_extensions = { + .name = "caps", + .attrs = zen4_ibs_extensions_attrs, + .is_visible = zen4_ibs_extensions_is_visible, +}; + +static const struct attribute_group *fetch_attr_groups[] = { + &group_rand_en, + &empty_caps_group, + NULL, +}; + +static const struct attribute_group *fetch_attr_update[] = { + &group_fetch_l3missonly, + &group_zen4_ibs_extensions, + NULL, +}; + +static umode_t +cnt_ctl_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return ibs_caps & IBS_CAPS_OPCNT ? 
attr->mode : 0; +} + +static struct attribute *cnt_ctl_attrs[] = { + &format_attr_cnt_ctl.attr, + NULL, +}; + +static struct attribute *op_l3missonly_attrs[] = { + &op_l3missonly.attr.attr, + NULL, +}; + +static struct attribute_group group_cnt_ctl = { + .name = "format", + .attrs = cnt_ctl_attrs, + .is_visible = cnt_ctl_is_visible, +}; + +static struct attribute_group group_op_l3missonly = { + .name = "format", + .attrs = op_l3missonly_attrs, + .is_visible = zen4_ibs_extensions_is_visible, +}; + +static const struct attribute_group *op_attr_update[] = { + &group_cnt_ctl, + &group_op_l3missonly, + &group_zen4_ibs_extensions, NULL, }; @@ -536,7 +649,6 @@ static struct perf_ibs perf_ibs_fetch = { .max_period = IBS_FETCH_MAX_CNT << 4, .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK }, .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT, - .format_attrs = ibs_fetch_format_attrs, .get_count = get_ibs_fetch_count, }; @@ -551,6 +663,7 @@ static struct perf_ibs perf_ibs_op = { .start = perf_ibs_start, .stop = perf_ibs_stop, .read = perf_ibs_read, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE, }, .msr = MSR_AMD64_IBSOPCTL, .config_mask = IBS_OP_CONFIG_MASK, @@ -561,11 +674,343 @@ static struct perf_ibs perf_ibs_op = { .max_period = IBS_OP_MAX_CNT << 4, .offset_mask = { MSR_AMD64_IBSOP_REG_MASK }, .offset_max = MSR_AMD64_IBSOP_REG_COUNT, - .format_attrs = ibs_op_format_attrs, .get_count = get_ibs_op_count, }; +static void perf_ibs_get_mem_op(union ibs_op_data3 *op_data3, + struct perf_sample_data *data) +{ + union perf_mem_data_src *data_src = &data->data_src; + + data_src->mem_op = PERF_MEM_OP_NA; + + if (op_data3->ld_op) + data_src->mem_op = PERF_MEM_OP_LOAD; + else if (op_data3->st_op) + data_src->mem_op = PERF_MEM_OP_STORE; +} + +/* + * Processors having CPUID_Fn8000001B_EAX[11] aka IBS_CAPS_ZEN4 has + * more fine granular DataSrc encodings. Others have coarse. 
+ */ +static u8 perf_ibs_data_src(union ibs_op_data2 *op_data2) +{ + if (ibs_caps & IBS_CAPS_ZEN4) + return (op_data2->data_src_hi << 3) | op_data2->data_src_lo; + + return op_data2->data_src_lo; +} + +static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2, + union ibs_op_data3 *op_data3, + struct perf_sample_data *data) +{ + union perf_mem_data_src *data_src = &data->data_src; + u8 ibs_data_src = perf_ibs_data_src(op_data2); + + data_src->mem_lvl = 0; + + /* + * DcMiss, L2Miss, DataSrc, DcMissLat etc. are all invalid for Uncached + * memory accesses. So, check DcUcMemAcc bit early. + */ + if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO) { + data_src->mem_lvl = PERF_MEM_LVL_UNC | PERF_MEM_LVL_HIT; + return; + } + + /* L1 Hit */ + if (op_data3->dc_miss == 0) { + data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; + return; + } + + /* L2 Hit */ + if (op_data3->l2_miss == 0) { + /* Erratum #1293 */ + if (boot_cpu_data.x86 != 0x19 || boot_cpu_data.x86_model > 0xF || + !(op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc)) { + data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT; + return; + } + } + + /* + * OP_DATA2 is valid only for load ops. Skip all checks which + * uses OP_DATA2[DataSrc]. 
+ */ + if (data_src->mem_op != PERF_MEM_OP_LOAD) + goto check_mab; + + /* L3 Hit */ + if (ibs_caps & IBS_CAPS_ZEN4) { + if (ibs_data_src == IBS_DATA_SRC_EXT_LOC_CACHE) { + data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT; + return; + } + } else { + if (ibs_data_src == IBS_DATA_SRC_LOC_CACHE) { + data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_REM_CCE1 | + PERF_MEM_LVL_HIT; + return; + } + } + + /* A peer cache in a near CCX */ + if (ibs_caps & IBS_CAPS_ZEN4 && + ibs_data_src == IBS_DATA_SRC_EXT_NEAR_CCX_CACHE) { + data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT; + return; + } + + /* A peer cache in a far CCX */ + if (ibs_caps & IBS_CAPS_ZEN4) { + if (ibs_data_src == IBS_DATA_SRC_EXT_FAR_CCX_CACHE) { + data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2 | PERF_MEM_LVL_HIT; + return; + } + } else { + if (ibs_data_src == IBS_DATA_SRC_REM_CACHE) { + data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2 | PERF_MEM_LVL_HIT; + return; + } + } + + /* DRAM */ + if (ibs_data_src == IBS_DATA_SRC_EXT_DRAM) { + if (op_data2->rmt_node == 0) + data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT; + else + data_src->mem_lvl = PERF_MEM_LVL_REM_RAM1 | PERF_MEM_LVL_HIT; + return; + } + + /* PMEM */ + if (ibs_caps & IBS_CAPS_ZEN4 && ibs_data_src == IBS_DATA_SRC_EXT_PMEM) { + data_src->mem_lvl_num = PERF_MEM_LVLNUM_PMEM; + if (op_data2->rmt_node) { + data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; + /* IBS doesn't provide Remote socket detail */ + data_src->mem_hops = PERF_MEM_HOPS_1; + } + return; + } + + /* Extension Memory */ + if (ibs_caps & IBS_CAPS_ZEN4 && + ibs_data_src == IBS_DATA_SRC_EXT_EXT_MEM) { + data_src->mem_lvl_num = PERF_MEM_LVLNUM_CXL; + if (op_data2->rmt_node) { + data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; + /* IBS doesn't provide Remote socket detail */ + data_src->mem_hops = PERF_MEM_HOPS_1; + } + return; + } + + /* IO */ + if (ibs_data_src == IBS_DATA_SRC_EXT_IO) { + data_src->mem_lvl = PERF_MEM_LVL_IO; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_IO; + if 
(op_data2->rmt_node) { + data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; + /* IBS doesn't provide Remote socket detail */ + data_src->mem_hops = PERF_MEM_HOPS_1; + } + return; + } + +check_mab: + /* + * MAB (Miss Address Buffer) Hit. MAB keeps track of outstanding + * DC misses. However, such data may come from any level in mem + * hierarchy. IBS provides detail about both MAB as well as actual + * DataSrc simultaneously. Prioritize DataSrc over MAB, i.e. set + * MAB only when IBS fails to provide DataSrc. + */ + if (op_data3->dc_miss_no_mab_alloc) { + data_src->mem_lvl = PERF_MEM_LVL_LFB | PERF_MEM_LVL_HIT; + return; + } + + data_src->mem_lvl = PERF_MEM_LVL_NA; +} + +static bool perf_ibs_cache_hit_st_valid(void) +{ + /* 0: Uninitialized, 1: Valid, -1: Invalid */ + static int cache_hit_st_valid; + + if (unlikely(!cache_hit_st_valid)) { + if (boot_cpu_data.x86 == 0x19 && + (boot_cpu_data.x86_model <= 0xF || + (boot_cpu_data.x86_model >= 0x20 && + boot_cpu_data.x86_model <= 0x5F))) { + cache_hit_st_valid = -1; + } else { + cache_hit_st_valid = 1; + } + } + + return cache_hit_st_valid == 1; +} + +static void perf_ibs_get_mem_snoop(union ibs_op_data2 *op_data2, + struct perf_sample_data *data) +{ + union perf_mem_data_src *data_src = &data->data_src; + u8 ibs_data_src; + + data_src->mem_snoop = PERF_MEM_SNOOP_NA; + + if (!perf_ibs_cache_hit_st_valid() || + data_src->mem_op != PERF_MEM_OP_LOAD || + data_src->mem_lvl & PERF_MEM_LVL_L1 || + data_src->mem_lvl & PERF_MEM_LVL_L2 || + op_data2->cache_hit_st) + return; + + ibs_data_src = perf_ibs_data_src(op_data2); + + if (ibs_caps & IBS_CAPS_ZEN4) { + if (ibs_data_src == IBS_DATA_SRC_EXT_LOC_CACHE || + ibs_data_src == IBS_DATA_SRC_EXT_NEAR_CCX_CACHE || + ibs_data_src == IBS_DATA_SRC_EXT_FAR_CCX_CACHE) + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; + } else if (ibs_data_src == IBS_DATA_SRC_LOC_CACHE) { + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; + } +} + +static void perf_ibs_get_tlb_lvl(union ibs_op_data3 *op_data3, + struct 
perf_sample_data *data) +{ + union perf_mem_data_src *data_src = &data->data_src; + + data_src->mem_dtlb = PERF_MEM_TLB_NA; + + if (!op_data3->dc_lin_addr_valid) + return; + + if (!op_data3->dc_l1tlb_miss) { + data_src->mem_dtlb = PERF_MEM_TLB_L1 | PERF_MEM_TLB_HIT; + return; + } + + if (!op_data3->dc_l2tlb_miss) { + data_src->mem_dtlb = PERF_MEM_TLB_L2 | PERF_MEM_TLB_HIT; + return; + } + + data_src->mem_dtlb = PERF_MEM_TLB_L2 | PERF_MEM_TLB_MISS; +} + +static void perf_ibs_get_mem_lock(union ibs_op_data3 *op_data3, + struct perf_sample_data *data) +{ + union perf_mem_data_src *data_src = &data->data_src; + + data_src->mem_lock = PERF_MEM_LOCK_NA; + + if (op_data3->dc_locked_op) + data_src->mem_lock = PERF_MEM_LOCK_LOCKED; +} + +#define ibs_op_msr_idx(msr) (msr - MSR_AMD64_IBSOPCTL) + +static void perf_ibs_get_data_src(struct perf_ibs_data *ibs_data, + struct perf_sample_data *data, + union ibs_op_data2 *op_data2, + union ibs_op_data3 *op_data3) +{ + perf_ibs_get_mem_lvl(op_data2, op_data3, data); + perf_ibs_get_mem_snoop(op_data2, data); + perf_ibs_get_tlb_lvl(op_data3, data); + perf_ibs_get_mem_lock(op_data3, data); +} + +static __u64 perf_ibs_get_op_data2(struct perf_ibs_data *ibs_data, + union ibs_op_data3 *op_data3) +{ + __u64 val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA2)]; + + /* Erratum #1293 */ + if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model <= 0xF && + (op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc)) { + /* + * OP_DATA2 has only two fields on Zen3: DataSrc and RmtNode. + * DataSrc=0 is 'No valid status' and RmtNode is invalid when + * DataSrc=0. 
+ */ + val = 0; + } + return val; +} + +static void perf_ibs_parse_ld_st_data(__u64 sample_type, + struct perf_ibs_data *ibs_data, + struct perf_sample_data *data) +{ + union ibs_op_data3 op_data3; + union ibs_op_data2 op_data2; + union ibs_op_data op_data; + + data->data_src.val = PERF_MEM_NA; + op_data3.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)]; + + perf_ibs_get_mem_op(&op_data3, data); + if (data->data_src.mem_op != PERF_MEM_OP_LOAD && + data->data_src.mem_op != PERF_MEM_OP_STORE) + return; + + op_data2.val = perf_ibs_get_op_data2(ibs_data, &op_data3); + + if (sample_type & PERF_SAMPLE_DATA_SRC) { + perf_ibs_get_data_src(ibs_data, data, &op_data2, &op_data3); + data->sample_flags |= PERF_SAMPLE_DATA_SRC; + } + + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE && op_data3.dc_miss && + data->data_src.mem_op == PERF_MEM_OP_LOAD) { + op_data.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA)]; + + if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) { + data->weight.var1_dw = op_data3.dc_miss_lat; + data->weight.var2_w = op_data.tag_to_ret_ctr; + } else if (sample_type & PERF_SAMPLE_WEIGHT) { + data->weight.full = op_data3.dc_miss_lat; + } + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; + } + + if (sample_type & PERF_SAMPLE_ADDR && op_data3.dc_lin_addr_valid) { + data->addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCLINAD)]; + data->sample_flags |= PERF_SAMPLE_ADDR; + } + + if (sample_type & PERF_SAMPLE_PHYS_ADDR && op_data3.dc_phy_addr_valid) { + data->phys_addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCPHYSAD)]; + data->sample_flags |= PERF_SAMPLE_PHYS_ADDR; + } +} + +static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type, + int check_rip) +{ + if (sample_type & PERF_SAMPLE_RAW || + (perf_ibs == &perf_ibs_op && + (sample_type & PERF_SAMPLE_DATA_SRC || + sample_type & PERF_SAMPLE_WEIGHT_TYPE || + sample_type & PERF_SAMPLE_ADDR || + sample_type & PERF_SAMPLE_PHYS_ADDR))) + return perf_ibs->offset_max; + else if (check_rip) + 
return 3; + return 1; +} + static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) { struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); @@ -577,7 +1022,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) struct perf_ibs_data ibs_data; int offset, size, check_rip, offset_max, throttle = 0; unsigned int msr; - u64 *buf, *config, period; + u64 *buf, *config, period, new_config = 0; if (!test_bit(IBS_STARTED, pcpu->state)) { fail: @@ -613,12 +1058,9 @@ fail: size = 1; offset = 1; check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK)); - if (event->attr.sample_type & PERF_SAMPLE_RAW) - offset_max = perf_ibs->offset_max; - else if (check_rip) - offset_max = 3; - else - offset_max = 1; + + offset_max = perf_ibs_get_offset_max(perf_ibs, event->attr.sample_type, check_rip); + do { rdmsrl(msr + offset, *buf++); size++; @@ -626,18 +1068,24 @@ fail: perf_ibs->offset_max, offset + 1); } while (offset < offset_max); + /* + * Read IbsBrTarget, IbsOpData4, and IbsExtdCtl separately + * depending on their availability. + * Can't add to offset_max as they are staggered + */ if (event->attr.sample_type & PERF_SAMPLE_RAW) { - /* - * Read IbsBrTarget and IbsOpData4 separately - * depending on their availability. 
- * Can't add to offset_max as they are staggered - */ - if (ibs_caps & IBS_CAPS_BRNTRGT) { - rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++); - size++; + if (perf_ibs == &perf_ibs_op) { + if (ibs_caps & IBS_CAPS_BRNTRGT) { + rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++); + size++; + } + if (ibs_caps & IBS_CAPS_OPDATA4) { + rdmsrl(MSR_AMD64_IBSOPDATA4, *buf++); + size++; + } } - if (ibs_caps & IBS_CAPS_OPDATA4) { - rdmsrl(MSR_AMD64_IBSOPDATA4, *buf++); + if (perf_ibs == &perf_ibs_fetch && (ibs_caps & IBS_CAPS_FETCHCTLEXTD)) { + rdmsrl(MSR_AMD64_ICIBSEXTDCTL, *buf++); size++; } } @@ -647,6 +1095,10 @@ fail: if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) { regs.flags &= ~PERF_EFLAGS_EXACT; } else { + /* Workaround for erratum #1197 */ + if (perf_ibs->fetch_ignore_if_zero_rip && !(ibs_data.regs[1])) + goto out; + set_linear_ip(&regs, ibs_data.regs[1]); regs.flags |= PERF_EFLAGS_EXACT; } @@ -659,6 +1111,20 @@ fail: }, }; data.raw = &raw; + data.sample_flags |= PERF_SAMPLE_RAW; + } + + if (perf_ibs == &perf_ibs_op) + perf_ibs_parse_ld_st_data(event->attr.sample_type, &ibs_data, &data); + + /* + * rip recorded by IbsOpRip will not be consistent with rsp and rbp + * recorded as part of interrupt regs. Thus we need to use rip from + * interrupt regs while unwinding call stack. 
+ */ + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { + data.callchain = perf_callchain(event, iregs); + data.sample_flags |= PERF_SAMPLE_CALLCHAIN; } throttle = perf_event_overflow(event, &data, &regs); @@ -666,13 +1132,17 @@ out: if (throttle) { perf_ibs_stop(event, 0); } else { - period >>= 4; - - if ((ibs_caps & IBS_CAPS_RDWROPCNT) && - (*config & IBS_OP_CNT_CTL)) - period |= *config & IBS_OP_CUR_CNT_RAND; + if (perf_ibs == &perf_ibs_op) { + if (ibs_caps & IBS_CAPS_OPCNTEXT) { + new_config = period & IBS_OP_MAX_CNT_EXT_MASK; + period &= ~IBS_OP_MAX_CNT_EXT_MASK; + } + if ((ibs_caps & IBS_CAPS_RDWROPCNT) && (*config & IBS_OP_CNT_CTL)) + new_config |= *config & IBS_OP_CUR_CNT_RAND; + } + new_config |= period >> 4; - perf_ibs_enable_event(perf_ibs, hwc, period); + perf_ibs_enable_event(perf_ibs, hwc, new_config); } perf_event_update_userpage(event); @@ -709,17 +1179,6 @@ static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) perf_ibs->pcpu = pcpu; - /* register attributes */ - if (perf_ibs->format_attrs[0]) { - memset(&perf_ibs->format_group, 0, sizeof(perf_ibs->format_group)); - perf_ibs->format_group.name = "format"; - perf_ibs->format_group.attrs = perf_ibs->format_attrs; - - memset(&perf_ibs->attr_groups, 0, sizeof(perf_ibs->attr_groups)); - perf_ibs->attr_groups[0] = &perf_ibs->format_group; - perf_ibs->pmu.attr_groups = perf_ibs->attr_groups; - } - ret = perf_pmu_register(&perf_ibs->pmu, name, -1); if (ret) { perf_ibs->pcpu = NULL; @@ -729,25 +1188,84 @@ static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name) return ret; } -static __init void perf_event_ibs_init(void) +static __init int perf_ibs_fetch_init(void) { - struct attribute **attr = ibs_op_format_attrs; + /* + * Some chips fail to reset the fetch count when it is written; instead + * they need a 0-1 transition of IbsFetchEn. 
+ */ + if (boot_cpu_data.x86 >= 0x16 && boot_cpu_data.x86 <= 0x18) + perf_ibs_fetch.fetch_count_reset_broken = 1; + + if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model < 0x10) + perf_ibs_fetch.fetch_ignore_if_zero_rip = 1; + + if (ibs_caps & IBS_CAPS_ZEN4) + perf_ibs_fetch.config_mask |= IBS_FETCH_L3MISSONLY; + + perf_ibs_fetch.pmu.attr_groups = fetch_attr_groups; + perf_ibs_fetch.pmu.attr_update = fetch_attr_update; - perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); + return perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch"); +} - if (ibs_caps & IBS_CAPS_OPCNT) { +static __init int perf_ibs_op_init(void) +{ + if (ibs_caps & IBS_CAPS_OPCNT) perf_ibs_op.config_mask |= IBS_OP_CNT_CTL; - *attr++ = &format_attr_cnt_ctl.attr; + + if (ibs_caps & IBS_CAPS_OPCNTEXT) { + perf_ibs_op.max_period |= IBS_OP_MAX_CNT_EXT_MASK; + perf_ibs_op.config_mask |= IBS_OP_MAX_CNT_EXT_MASK; + perf_ibs_op.cnt_mask |= IBS_OP_MAX_CNT_EXT_MASK; } - perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); - register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); + if (ibs_caps & IBS_CAPS_ZEN4) + perf_ibs_op.config_mask |= IBS_OP_L3MISSONLY; + + perf_ibs_op.pmu.attr_groups = empty_attr_groups; + perf_ibs_op.pmu.attr_update = op_attr_update; + + return perf_ibs_pmu_init(&perf_ibs_op, "ibs_op"); +} + +static __init int perf_event_ibs_init(void) +{ + int ret; + + ret = perf_ibs_fetch_init(); + if (ret) + return ret; + + ret = perf_ibs_op_init(); + if (ret) + goto err_op; + + ret = register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs"); + if (ret) + goto err_nmi; + pr_info("perf: AMD IBS detected (0x%08x)\n", ibs_caps); + return 0; + +err_nmi: + perf_pmu_unregister(&perf_ibs_op.pmu); + free_percpu(perf_ibs_op.pcpu); + perf_ibs_op.pcpu = NULL; +err_op: + perf_pmu_unregister(&perf_ibs_fetch.pmu); + free_percpu(perf_ibs_fetch.pcpu); + perf_ibs_fetch.pcpu = NULL; + + return ret; } #else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */ -static __init void 
perf_event_ibs_init(void) { } +static __init int perf_event_ibs_init(void) +{ + return 0; +} #endif @@ -1017,9 +1535,7 @@ static __init int amd_ibs_init(void) x86_pmu_amd_ibs_starting_cpu, x86_pmu_amd_ibs_dying_cpu); - perf_event_ibs_init(); - - return 0; + return perf_event_ibs_init(); } /* Since we need the pci subsystem to init ibs we can't do this earlier: */ diff --git a/arch/x86/events/amd/iommu.c b/arch/x86/events/amd/iommu.c index fb616203ce42..b15f7b950d2e 100644 --- a/arch/x86/events/amd/iommu.c +++ b/arch/x86/events/amd/iommu.c @@ -14,12 +14,11 @@ #include <linux/init.h> #include <linux/cpumask.h> #include <linux/slab.h> +#include <linux/amd-iommu.h> #include "../perf_event.h" #include "iommu.h" -#define COUNTER_SHIFT 16 - /* iommu pmu conf masks */ #define GET_CSOURCE(x) ((x)->conf & 0xFFULL) #define GET_DEVID(x) (((x)->conf >> 8) & 0xFFFFULL) @@ -81,12 +80,12 @@ static struct attribute_group amd_iommu_events_group = { }; struct amd_iommu_event_desc { - struct kobj_attribute attr; + struct device_attribute attr; const char *event; }; -static ssize_t _iommu_event_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) +static ssize_t _iommu_event_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct amd_iommu_event_desc *event = container_of(attr, struct amd_iommu_event_desc, attr); @@ -162,7 +161,7 @@ static int get_next_avail_iommu_bnk_cntr(struct perf_event *event) raw_spin_lock_irqsave(&piommu->lock, flags); - for (bank = 0, shift = 0; bank < max_banks; bank++) { + for (bank = 0; bank < max_banks; bank++) { for (cntr = 0; cntr < max_cntrs; cntr++) { shift = bank + (bank*3) + cntr; if (piommu->cntr_assign_mask & BIT_ULL(shift)) { @@ -285,22 +284,31 @@ static void perf_iommu_start(struct perf_event *event, int flags) WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); hwc->state = 0; + /* + * To account for power-gating, which prevents write to + * the counter, we need to enable the counter + * before setting up 
counter register. + */ + perf_iommu_enable_event(event); + if (flags & PERF_EF_RELOAD) { - u64 prev_raw_count = local64_read(&hwc->prev_count); + u64 count = 0; struct amd_iommu *iommu = perf_event_2_iommu(event); + /* + * Since the IOMMU PMU only support counting mode, + * the counter always start with value zero. + */ amd_iommu_pc_set_reg(iommu, hwc->iommu_bank, hwc->iommu_cntr, - IOMMU_PC_COUNTER_REG, &prev_raw_count); + IOMMU_PC_COUNTER_REG, &count); } - perf_iommu_enable_event(event); perf_event_update_userpage(event); - } static void perf_iommu_read(struct perf_event *event) { - u64 count, prev, delta; + u64 count; struct hw_perf_event *hwc = &event->hw; struct amd_iommu *iommu = perf_event_2_iommu(event); @@ -311,14 +319,11 @@ static void perf_iommu_read(struct perf_event *event) /* IOMMU pc counter register is only 48 bits */ count &= GENMASK_ULL(47, 0); - prev = local64_read(&hwc->prev_count); - if (local64_cmpxchg(&hwc->prev_count, prev, count) != prev) - return; - - /* Handle 48-bit counter overflow */ - delta = (count << COUNTER_SHIFT) - (prev << COUNTER_SHIFT); - delta >>= COUNTER_SHIFT; - local64_add(delta, &event->count); + /* + * Since the counter always start with value zero, + * simply just accumulate the count for the event. + */ + local64_add(count, &event->count); } static void perf_iommu_stop(struct perf_event *event, int flags) @@ -328,15 +333,16 @@ static void perf_iommu_stop(struct perf_event *event, int flags) if (hwc->state & PERF_HES_UPTODATE) return; + /* + * To account for power-gating, in which reading the counter would + * return zero, we need to read the register before disabling. 
+ */ + perf_iommu_read(event); + hwc->state |= PERF_HES_UPTODATE; + perf_iommu_disable_event(event); WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); hwc->state |= PERF_HES_STOPPED; - - if (hwc->state & PERF_HES_UPTODATE) - return; - - perf_iommu_read(event); - hwc->state |= PERF_HES_UPTODATE; } static int perf_iommu_add(struct perf_event *event, int flags) @@ -379,7 +385,7 @@ static __init int _init_events_attrs(void) while (amd_iommu_v2_event_descs[i].attr.attr.name) i++; - attrs = kcalloc(i + 1, sizeof(struct attribute **), GFP_KERNEL); + attrs = kcalloc(i + 1, sizeof(*attrs), GFP_KERNEL); if (!attrs) return -ENOMEM; diff --git a/arch/x86/events/amd/iommu.h b/arch/x86/events/amd/iommu.h index 0e5c036fd7be..e6310c635c8b 100644 --- a/arch/x86/events/amd/iommu.h +++ b/arch/x86/events/amd/iommu.h @@ -17,27 +17,8 @@ #define IOMMU_PC_DEVID_MATCH_REG 0x20 #define IOMMU_PC_COUNTER_REPORT_REG 0x28 -/* maximun specified bank/counters */ +/* maximum specified bank/counters */ #define PC_MAX_SPEC_BNKS 64 #define PC_MAX_SPEC_CNTRS 16 -struct amd_iommu; - -/* amd_iommu_init.c external support functions */ -extern int amd_iommu_get_num_iommus(void); - -extern bool amd_iommu_pc_supported(void); - -extern u8 amd_iommu_pc_get_max_banks(unsigned int idx); - -extern u8 amd_iommu_pc_get_max_counters(unsigned int idx); - -extern int amd_iommu_pc_set_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, - u8 fxn, u64 *value); - -extern int amd_iommu_pc_get_reg(struct amd_iommu *iommu, u8 bank, u8 cntr, - u8 fxn, u64 *value); - -extern struct amd_iommu *get_amd_iommu(int idx); - #endif /*_PERF_EVENT_AMD_IOMMU_H_*/ diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c new file mode 100644 index 000000000000..38a75216c12c --- /dev/null +++ b/arch/x86/events/amd/lbr.c @@ -0,0 +1,439 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/perf_event.h> +#include <asm/perf_event.h> + +#include "../perf_event.h" + +/* LBR Branch Select valid bits */ +#define LBR_SELECT_MASK 0x1ff + +/* + 
* LBR Branch Select filter bits which when set, ensures that the + * corresponding type of branches are not recorded + */ +#define LBR_SELECT_KERNEL 0 /* Branches ending in CPL = 0 */ +#define LBR_SELECT_USER 1 /* Branches ending in CPL > 0 */ +#define LBR_SELECT_JCC 2 /* Conditional branches */ +#define LBR_SELECT_CALL_NEAR_REL 3 /* Near relative calls */ +#define LBR_SELECT_CALL_NEAR_IND 4 /* Indirect relative calls */ +#define LBR_SELECT_RET_NEAR 5 /* Near returns */ +#define LBR_SELECT_JMP_NEAR_IND 6 /* Near indirect jumps (excl. calls and returns) */ +#define LBR_SELECT_JMP_NEAR_REL 7 /* Near relative jumps (excl. calls) */ +#define LBR_SELECT_FAR_BRANCH 8 /* Far branches */ + +#define LBR_KERNEL BIT(LBR_SELECT_KERNEL) +#define LBR_USER BIT(LBR_SELECT_USER) +#define LBR_JCC BIT(LBR_SELECT_JCC) +#define LBR_REL_CALL BIT(LBR_SELECT_CALL_NEAR_REL) +#define LBR_IND_CALL BIT(LBR_SELECT_CALL_NEAR_IND) +#define LBR_RETURN BIT(LBR_SELECT_RET_NEAR) +#define LBR_REL_JMP BIT(LBR_SELECT_JMP_NEAR_REL) +#define LBR_IND_JMP BIT(LBR_SELECT_JMP_NEAR_IND) +#define LBR_FAR BIT(LBR_SELECT_FAR_BRANCH) +#define LBR_NOT_SUPP -1 /* unsupported filter */ +#define LBR_IGNORE 0 + +#define LBR_ANY \ + (LBR_JCC | LBR_REL_CALL | LBR_IND_CALL | LBR_RETURN | \ + LBR_REL_JMP | LBR_IND_JMP | LBR_FAR) + +struct branch_entry { + union { + struct { + u64 ip:58; + u64 ip_sign_ext:5; + u64 mispredict:1; + } split; + u64 full; + } from; + + union { + struct { + u64 ip:58; + u64 ip_sign_ext:3; + u64 reserved:1; + u64 spec:1; + u64 valid:1; + } split; + u64 full; + } to; +}; + +static __always_inline void amd_pmu_lbr_set_from(unsigned int idx, u64 val) +{ + wrmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2, val); +} + +static __always_inline void amd_pmu_lbr_set_to(unsigned int idx, u64 val) +{ + wrmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2 + 1, val); +} + +static __always_inline u64 amd_pmu_lbr_get_from(unsigned int idx) +{ + u64 val; + + rdmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2, val); + + return val; +} + +static 
__always_inline u64 amd_pmu_lbr_get_to(unsigned int idx) +{ + u64 val; + + rdmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2 + 1, val); + + return val; +} + +static __always_inline u64 sign_ext_branch_ip(u64 ip) +{ + u32 shift = 64 - boot_cpu_data.x86_virt_bits; + + return (u64)(((s64)ip << shift) >> shift); +} + +static void amd_pmu_lbr_filter(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int br_sel = cpuc->br_sel, offset, type, i, j; + bool compress = false; + bool fused_only = false; + u64 from, to; + + /* If sampling all branches, there is nothing to filter */ + if (((br_sel & X86_BR_ALL) == X86_BR_ALL) && + ((br_sel & X86_BR_TYPE_SAVE) != X86_BR_TYPE_SAVE)) + fused_only = true; + + for (i = 0; i < cpuc->lbr_stack.nr; i++) { + from = cpuc->lbr_entries[i].from; + to = cpuc->lbr_entries[i].to; + type = branch_type_fused(from, to, 0, &offset); + + /* + * Adjust the branch from address in case of instruction + * fusion where it points to an instruction preceding the + * actual branch + */ + if (offset) { + cpuc->lbr_entries[i].from += offset; + if (fused_only) + continue; + } + + /* If type does not correspond, then discard */ + if (type == X86_BR_NONE || (br_sel & type) != type) { + cpuc->lbr_entries[i].from = 0; /* mark invalid */ + compress = true; + } + + if ((br_sel & X86_BR_TYPE_SAVE) == X86_BR_TYPE_SAVE) + cpuc->lbr_entries[i].type = common_branch_type(type); + } + + if (!compress) + return; + + /* Remove all invalid entries */ + for (i = 0; i < cpuc->lbr_stack.nr; ) { + if (!cpuc->lbr_entries[i].from) { + j = i; + while (++j < cpuc->lbr_stack.nr) + cpuc->lbr_entries[j - 1] = cpuc->lbr_entries[j]; + cpuc->lbr_stack.nr--; + if (!cpuc->lbr_entries[i].from) + continue; + } + i++; + } +} + +static const int lbr_spec_map[PERF_BR_SPEC_MAX] = { + PERF_BR_SPEC_NA, + PERF_BR_SPEC_WRONG_PATH, + PERF_BR_NON_SPEC_CORRECT_PATH, + PERF_BR_SPEC_CORRECT_PATH, +}; + +void amd_pmu_lbr_read(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + 
struct perf_branch_entry *br = cpuc->lbr_entries; + struct branch_entry entry; + int out = 0, idx, i; + + if (!cpuc->lbr_users) + return; + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + entry.from.full = amd_pmu_lbr_get_from(i); + entry.to.full = amd_pmu_lbr_get_to(i); + + /* + * Check if a branch has been logged; if valid = 0, spec = 0 + * then no branch was recorded + */ + if (!entry.to.split.valid && !entry.to.split.spec) + continue; + + perf_clear_branch_entry_bitfields(br + out); + + br[out].from = sign_ext_branch_ip(entry.from.split.ip); + br[out].to = sign_ext_branch_ip(entry.to.split.ip); + br[out].mispred = entry.from.split.mispredict; + br[out].predicted = !br[out].mispred; + + /* + * Set branch speculation information using the status of + * the valid and spec bits. + * + * When valid = 0, spec = 0, no branch was recorded and the + * entry is discarded as seen above. + * + * When valid = 0, spec = 1, the recorded branch was + * speculative but took the wrong path. + * + * When valid = 1, spec = 0, the recorded branch was + * non-speculative but took the correct path. 
+ * + * When valid = 1, spec = 1, the recorded branch was + * speculative and took the correct path + */ + idx = (entry.to.split.valid << 1) | entry.to.split.spec; + br[out].spec = lbr_spec_map[idx]; + out++; + } + + cpuc->lbr_stack.nr = out; + + /* + * Internal register renaming always ensures that LBR From[0] and + * LBR To[0] always represent the TOS + */ + cpuc->lbr_stack.hw_idx = 0; + + /* Perform further software filtering */ + amd_pmu_lbr_filter(); +} + +static const int lbr_select_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { + [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGNORE, + + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL | LBR_FAR, + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR, + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, + [PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT] = LBR_NOT_SUPP, + [PERF_SAMPLE_BRANCH_IN_TX_SHIFT] = LBR_NOT_SUPP, + [PERF_SAMPLE_BRANCH_NO_TX_SHIFT] = LBR_NOT_SUPP, + [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, + + [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_NOT_SUPP, + [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, + [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL, + + [PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT] = LBR_NOT_SUPP, + [PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT] = LBR_NOT_SUPP, +}; + +static int amd_pmu_lbr_setup_filter(struct perf_event *event) +{ + struct hw_perf_event_extra *reg = &event->hw.branch_reg; + u64 br_type = event->attr.branch_sample_type; + u64 mask = 0, v; + int i; + + /* No LBR support */ + if (!x86_pmu.lbr_nr) + return -EOPNOTSUPP; + + if (br_type & PERF_SAMPLE_BRANCH_USER) + mask |= X86_BR_USER; + + if (br_type & PERF_SAMPLE_BRANCH_KERNEL) + mask |= X86_BR_KERNEL; + + /* Ignore BRANCH_HV here */ + + if (br_type & PERF_SAMPLE_BRANCH_ANY) + mask |= X86_BR_ANY; + + if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL) + mask |= X86_BR_ANY_CALL; + + if (br_type & 
PERF_SAMPLE_BRANCH_ANY_RETURN) + mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET; + + if (br_type & PERF_SAMPLE_BRANCH_IND_CALL) + mask |= X86_BR_IND_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_COND) + mask |= X86_BR_JCC; + + if (br_type & PERF_SAMPLE_BRANCH_IND_JUMP) + mask |= X86_BR_IND_JMP; + + if (br_type & PERF_SAMPLE_BRANCH_CALL) + mask |= X86_BR_CALL | X86_BR_ZERO_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_TYPE_SAVE) + mask |= X86_BR_TYPE_SAVE; + + reg->reg = mask; + mask = 0; + + for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) { + if (!(br_type & BIT_ULL(i))) + continue; + + v = lbr_select_map[i]; + if (v == LBR_NOT_SUPP) + return -EOPNOTSUPP; + + if (v != LBR_IGNORE) + mask |= v; + } + + /* Filter bits operate in suppress mode */ + reg->config = mask ^ LBR_SELECT_MASK; + + return 0; +} + +int amd_pmu_lbr_hw_config(struct perf_event *event) +{ + int ret = 0; + + /* LBR is not recommended in counting mode */ + if (!is_sampling_event(event)) + return -EINVAL; + + ret = amd_pmu_lbr_setup_filter(event); + if (!ret) + event->attach_state |= PERF_ATTACH_SCHED_CB; + + return ret; +} + +void amd_pmu_lbr_reset(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int i; + + if (!x86_pmu.lbr_nr) + return; + + /* Reset all branch records individually */ + for (i = 0; i < x86_pmu.lbr_nr; i++) { + amd_pmu_lbr_set_from(i, 0); + amd_pmu_lbr_set_to(i, 0); + } + + cpuc->last_task_ctx = NULL; + cpuc->last_log_id = 0; + wrmsrl(MSR_AMD64_LBR_SELECT, 0); +} + +void amd_pmu_lbr_add(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event_extra *reg = &event->hw.branch_reg; + + if (!x86_pmu.lbr_nr) + return; + + if (has_branch_stack(event)) { + cpuc->lbr_select = 1; + cpuc->lbr_sel->config = reg->config; + cpuc->br_sel = reg->reg; + } + + perf_sched_cb_inc(event->ctx->pmu); + + if (!cpuc->lbr_users++ && !event->total_time_running) + amd_pmu_lbr_reset(); +} + +void amd_pmu_lbr_del(struct perf_event 
*event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (!x86_pmu.lbr_nr) + return; + + if (has_branch_stack(event)) + cpuc->lbr_select = 0; + + cpuc->lbr_users--; + WARN_ON_ONCE(cpuc->lbr_users < 0); + perf_sched_cb_dec(event->ctx->pmu); +} + +void amd_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + /* + * A context switch can flip the address space and LBR entries are + * not tagged with an identifier. Hence, branches cannot be resolved + * from the old address space and the LBR records should be wiped. + */ + if (cpuc->lbr_users && sched_in) + amd_pmu_lbr_reset(); +} + +void amd_pmu_lbr_enable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 lbr_select, dbg_ctl, dbg_extn_cfg; + + if (!cpuc->lbr_users || !x86_pmu.lbr_nr) + return; + + /* Set hardware branch filter */ + if (cpuc->lbr_select) { + lbr_select = cpuc->lbr_sel->config & LBR_SELECT_MASK; + wrmsrl(MSR_AMD64_LBR_SELECT, lbr_select); + } + + rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl); + rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg); + + wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); + wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg | DBG_EXTN_CFG_LBRV2EN); +} + +void amd_pmu_lbr_disable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 dbg_ctl, dbg_extn_cfg; + + if (!cpuc->lbr_users || !x86_pmu.lbr_nr) + return; + + rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg); + rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl); + + wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg & ~DBG_EXTN_CFG_LBRV2EN); + wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl & ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); +} + +__init int amd_pmu_lbr_init(void) +{ + union cpuid_0x80000022_ebx ebx; + + if (x86_pmu.version < 2 || !boot_cpu_has(X86_FEATURE_AMD_LBR_V2)) + return -EOPNOTSUPP; + + /* Set number of entries */ + ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES); + x86_pmu.lbr_nr = 
ebx.split.lbr_v2_stack_sz; + + pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr); + + return 0; +} diff --git a/arch/x86/events/amd/power.c b/arch/x86/events/amd/power.c index abef51320e3a..37d5b380516e 100644 --- a/arch/x86/events/amd/power.c +++ b/arch/x86/events/amd/power.c @@ -13,10 +13,6 @@ #include <asm/cpu_device_id.h> #include "../perf_event.h" -#define MSR_F15H_CU_PWR_ACCUMULATOR 0xc001007a -#define MSR_F15H_CU_MAX_PWR_ACCUMULATOR 0xc001007b -#define MSR_F15H_PTSC 0xc0010280 - /* Event code: LSB 8 bits, passed in attr->config any other bit is reserved. */ #define AMD_POWER_EVENT_MASK 0xFFULL @@ -217,6 +213,7 @@ static struct pmu pmu_class = { .stop = pmu_event_stop, .read = pmu_event_read, .capabilities = PERF_PMU_CAP_NO_EXCLUDE, + .module = THIS_MODULE, }; static int power_cpu_exit(unsigned int cpu) @@ -259,7 +256,7 @@ static int power_cpu_init(unsigned int cpu) } static const struct x86_cpu_id cpu_match[] = { - { .vendor = X86_VENDOR_AMD, .family = 0x15 }, + X86_MATCH_VENDOR_FAM(AMD, 0x15, NULL), {}, }; diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index 4d867a752f0e..d568afc705d2 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -12,16 +12,15 @@ #include <linux/init.h> #include <linux/cpu.h> #include <linux/cpumask.h> +#include <linux/cpufeature.h> +#include <linux/smp.h> -#include <asm/cpufeature.h> #include <asm/perf_event.h> #include <asm/msr.h> -#include <asm/smp.h> #define NUM_COUNTERS_NB 4 #define NUM_COUNTERS_L2 4 #define NUM_COUNTERS_L3 6 -#define MAX_COUNTERS 6 #define RDPMC_BASE_NB 6 #define RDPMC_BASE_LLC 10 @@ -31,6 +30,7 @@ #undef pr_fmt #define pr_fmt(fmt) "amd_uncore: " fmt +static int pmu_version; static int num_counters_llc; static int num_counters_nb; static bool l3_mask; @@ -46,7 +46,7 @@ struct amd_uncore { u32 msr_base; cpumask_t *active_mask; struct pmu *pmu; - struct perf_event *events[MAX_COUNTERS]; + struct perf_event **events; struct hlist_node node; }; @@ -158,6 +158,16 @@ out: 
hwc->event_base_rdpmc = uncore->rdpmc_base + hwc->idx; hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + /* + * The first four DF counters are accessible via RDPMC index 6 to 9 + * followed by the L3 counters from index 10 to 15. For processors + * with more than four DF counters, the DF RDPMC assignments become + * discontiguous as the additional counters are accessible starting + * from index 16. + */ + if (is_nb_event(event) && hwc->idx >= NUM_COUNTERS_NB) + hwc->event_base_rdpmc += NUM_COUNTERS_L3; + if (flags & PERF_EF_START) amd_uncore_start(event, PERF_EF_RELOAD); @@ -180,14 +190,43 @@ static void amd_uncore_del(struct perf_event *event, int flags) hwc->idx = -1; } +/* + * Return a full thread and slice mask unless user + * has provided them + */ +static u64 l3_thread_slice_mask(u64 config) +{ + if (boot_cpu_data.x86 <= 0x18) + return ((config & AMD64_L3_SLICE_MASK) ? : AMD64_L3_SLICE_MASK) | + ((config & AMD64_L3_THREAD_MASK) ? : AMD64_L3_THREAD_MASK); + + /* + * If the user doesn't specify a threadmask, they're not trying to + * count core 0, so we enable all cores & threads. + * We'll also assume that they want to count slice 0 if they specify + * a threadmask and leave sliceid and enallslices unpopulated. + */ + if (!(config & AMD64_L3_F19H_THREAD_MASK)) + return AMD64_L3_F19H_THREAD_MASK | AMD64_L3_EN_ALL_SLICES | + AMD64_L3_EN_ALL_CORES; + + return config & (AMD64_L3_F19H_THREAD_MASK | AMD64_L3_SLICEID_MASK | + AMD64_L3_EN_ALL_CORES | AMD64_L3_EN_ALL_SLICES | + AMD64_L3_COREID_MASK); +} + static int amd_uncore_event_init(struct perf_event *event) { struct amd_uncore *uncore; struct hw_perf_event *hwc = &event->hw; + u64 event_mask = AMD64_RAW_EVENT_MASK_NB; if (event->attr.type != event->pmu->type) return -ENOENT; + if (pmu_version >= 2 && is_nb_event(event)) + event_mask = AMD64_PERFMON_V2_RAW_EVENT_MASK_NB; + /* * NB and Last level cache counters (MSRs) are shared across all cores * that share the same NB / Last level cache. 
On family 16h and below, @@ -196,25 +235,18 @@ static int amd_uncore_event_init(struct perf_event *event) * out. So we do not support sampling and per-thread events via * CAP_NO_INTERRUPT, and we do not enable counter overflow interrupts: */ - hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB; + hwc->config = event->attr.config & event_mask; hwc->idx = -1; if (event->cpu < 0) return -EINVAL; /* - * SliceMask and ThreadMask need to be set for certain L3 events in - * Family 17h. For other events, the two fields do not affect the count. + * SliceMask and ThreadMask need to be set for certain L3 events. + * For other events, the two fields do not affect the count. */ - if (l3_mask && is_llc_event(event)) { - int thread = 2 * (cpu_data(event->cpu).cpu_core_id % 4); - - if (smp_num_siblings > 1) - thread += cpu_data(event->cpu).apicid & 1; - - hwc->config |= (1ULL << (AMD64_L3_THREAD_SHIFT + thread) & - AMD64_L3_THREAD_MASK) | AMD64_L3_SLICE_MASK; - } + if (l3_mask && is_llc_event(event)) + hwc->config |= l3_thread_slice_mask(event->attr.config); uncore = event_to_amd_uncore(event); if (!uncore) @@ -229,6 +261,19 @@ static int amd_uncore_event_init(struct perf_event *event) return 0; } +static umode_t +amd_f17h_uncore_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return boot_cpu_data.x86 >= 0x17 && boot_cpu_data.x86 < 0x19 ? + attr->mode : 0; +} + +static umode_t +amd_f19h_uncore_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return boot_cpu_data.x86 >= 0x19 ? 
attr->mode : 0; +} + static ssize_t amd_uncore_attr_show_cpumask(struct device *dev, struct device_attribute *attr, char *buf) @@ -256,47 +301,105 @@ static struct attribute_group amd_uncore_attr_group = { .attrs = amd_uncore_attrs, }; -/* - * Similar to PMU_FORMAT_ATTR but allowing for format_attr to be assigned based - * on family - */ -#define AMD_FORMAT_ATTR(_dev, _name, _format) \ -static ssize_t \ -_dev##_show##_name(struct device *dev, \ - struct device_attribute *attr, \ - char *page) \ -{ \ - BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ - return sprintf(page, _format "\n"); \ -} \ -static struct device_attribute format_attr_##_dev##_name = __ATTR_RO(_dev); - -/* Used for each uncore counter type */ -#define AMD_ATTRIBUTE(_name) \ -static struct attribute *amd_uncore_format_attr_##_name[] = { \ - &format_attr_event_##_name.attr, \ - &format_attr_umask.attr, \ - NULL, \ -}; \ -static struct attribute_group amd_uncore_format_group_##_name = { \ - .name = "format", \ - .attrs = amd_uncore_format_attr_##_name, \ -}; \ -static const struct attribute_group *amd_uncore_attr_groups_##_name[] = { \ - &amd_uncore_attr_group, \ - &amd_uncore_format_group_##_name, \ - NULL, \ +#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format) \ +static ssize_t __uncore_##_var##_show(struct device *dev, \ + struct device_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct device_attribute format_attr_##_var = \ + __ATTR(_name, 0444, __uncore_##_var##_show, NULL) + +DEFINE_UNCORE_FORMAT_ATTR(event12, event, "config:0-7,32-35"); +DEFINE_UNCORE_FORMAT_ATTR(event14, event, "config:0-7,32-35,59-60"); /* F17h+ DF */ +DEFINE_UNCORE_FORMAT_ATTR(event14v2, event, "config:0-7,32-37"); /* PerfMonV2 DF */ +DEFINE_UNCORE_FORMAT_ATTR(event8, event, "config:0-7"); /* F17h+ L3 */ +DEFINE_UNCORE_FORMAT_ATTR(umask8, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(umask12, umask, "config:8-15,24-27"); 
/* PerfMonV2 DF */ +DEFINE_UNCORE_FORMAT_ATTR(coreid, coreid, "config:42-44"); /* F19h L3 */ +DEFINE_UNCORE_FORMAT_ATTR(slicemask, slicemask, "config:48-51"); /* F17h L3 */ +DEFINE_UNCORE_FORMAT_ATTR(threadmask8, threadmask, "config:56-63"); /* F17h L3 */ +DEFINE_UNCORE_FORMAT_ATTR(threadmask2, threadmask, "config:56-57"); /* F19h L3 */ +DEFINE_UNCORE_FORMAT_ATTR(enallslices, enallslices, "config:46"); /* F19h L3 */ +DEFINE_UNCORE_FORMAT_ATTR(enallcores, enallcores, "config:47"); /* F19h L3 */ +DEFINE_UNCORE_FORMAT_ATTR(sliceid, sliceid, "config:48-50"); /* F19h L3 */ + +/* Common DF and NB attributes */ +static struct attribute *amd_uncore_df_format_attr[] = { + &format_attr_event12.attr, /* event */ + &format_attr_umask8.attr, /* umask */ + NULL, +}; + +/* Common L2 and L3 attributes */ +static struct attribute *amd_uncore_l3_format_attr[] = { + &format_attr_event12.attr, /* event */ + &format_attr_umask8.attr, /* umask */ + NULL, /* threadmask */ + NULL, +}; + +/* F17h unique L3 attributes */ +static struct attribute *amd_f17h_uncore_l3_format_attr[] = { + &format_attr_slicemask.attr, /* slicemask */ + NULL, +}; + +/* F19h unique L3 attributes */ +static struct attribute *amd_f19h_uncore_l3_format_attr[] = { + &format_attr_coreid.attr, /* coreid */ + &format_attr_enallslices.attr, /* enallslices */ + &format_attr_enallcores.attr, /* enallcores */ + &format_attr_sliceid.attr, /* sliceid */ + NULL, +}; + +static struct attribute_group amd_uncore_df_format_group = { + .name = "format", + .attrs = amd_uncore_df_format_attr, }; -AMD_FORMAT_ATTR(event, , "config:0-7,32-35"); -AMD_FORMAT_ATTR(umask, , "config:8-15"); -AMD_FORMAT_ATTR(event, _df, "config:0-7,32-35,59-60"); -AMD_FORMAT_ATTR(event, _l3, "config:0-7"); -AMD_ATTRIBUTE(df); -AMD_ATTRIBUTE(l3); +static struct attribute_group amd_uncore_l3_format_group = { + .name = "format", + .attrs = amd_uncore_l3_format_attr, +}; + +static struct attribute_group amd_f17h_uncore_l3_format_group = { + .name = "format", + 
.attrs = amd_f17h_uncore_l3_format_attr, + .is_visible = amd_f17h_uncore_is_visible, +}; + +static struct attribute_group amd_f19h_uncore_l3_format_group = { + .name = "format", + .attrs = amd_f19h_uncore_l3_format_attr, + .is_visible = amd_f19h_uncore_is_visible, +}; + +static const struct attribute_group *amd_uncore_df_attr_groups[] = { + &amd_uncore_attr_group, + &amd_uncore_df_format_group, + NULL, +}; + +static const struct attribute_group *amd_uncore_l3_attr_groups[] = { + &amd_uncore_attr_group, + &amd_uncore_l3_format_group, + NULL, +}; + +static const struct attribute_group *amd_uncore_l3_attr_update[] = { + &amd_f17h_uncore_l3_format_group, + &amd_f19h_uncore_l3_format_group, + NULL, +}; static struct pmu amd_nb_pmu = { .task_ctx_nr = perf_invalid_context, + .attr_groups = amd_uncore_df_attr_groups, + .name = "amd_nb", .event_init = amd_uncore_event_init, .add = amd_uncore_add, .del = amd_uncore_del, @@ -304,10 +407,14 @@ static struct pmu amd_nb_pmu = { .stop = amd_uncore_stop, .read = amd_uncore_read, .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT, + .module = THIS_MODULE, }; static struct pmu amd_llc_pmu = { .task_ctx_nr = perf_invalid_context, + .attr_groups = amd_uncore_l3_attr_groups, + .attr_update = amd_uncore_l3_attr_update, + .name = "amd_l2", .event_init = amd_uncore_event_init, .add = amd_uncore_add, .del = amd_uncore_del, @@ -315,6 +422,7 @@ static struct pmu amd_llc_pmu = { .stop = amd_uncore_stop, .read = amd_uncore_read, .capabilities = PERF_PMU_CAP_NO_EXCLUDE | PERF_PMU_CAP_NO_INTERRUPT, + .module = THIS_MODULE, }; static struct amd_uncore *amd_uncore_alloc(unsigned int cpu) @@ -323,11 +431,19 @@ static struct amd_uncore *amd_uncore_alloc(unsigned int cpu) cpu_to_node(cpu)); } +static inline struct perf_event ** +amd_uncore_events_alloc(unsigned int num, unsigned int cpu) +{ + return kzalloc_node(sizeof(struct perf_event *) * num, GFP_KERNEL, + cpu_to_node(cpu)); +} + static int amd_uncore_cpu_up_prepare(unsigned int 
cpu) { - struct amd_uncore *uncore_nb = NULL, *uncore_llc; + struct amd_uncore *uncore_nb = NULL, *uncore_llc = NULL; if (amd_uncore_nb) { + *per_cpu_ptr(amd_uncore_nb, cpu) = NULL; uncore_nb = amd_uncore_alloc(cpu); if (!uncore_nb) goto fail; @@ -337,11 +453,15 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu) uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL; uncore_nb->active_mask = &amd_nb_active_mask; uncore_nb->pmu = &amd_nb_pmu; + uncore_nb->events = amd_uncore_events_alloc(num_counters_nb, cpu); + if (!uncore_nb->events) + goto fail; uncore_nb->id = -1; *per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb; } if (amd_uncore_llc) { + *per_cpu_ptr(amd_uncore_llc, cpu) = NULL; uncore_llc = amd_uncore_alloc(cpu); if (!uncore_llc) goto fail; @@ -351,6 +471,9 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu) uncore_llc->msr_base = MSR_F16H_L2I_PERF_CTL; uncore_llc->active_mask = &amd_llc_active_mask; uncore_llc->pmu = &amd_llc_pmu; + uncore_llc->events = amd_uncore_events_alloc(num_counters_llc, cpu); + if (!uncore_llc->events) + goto fail; uncore_llc->id = -1; *per_cpu_ptr(amd_uncore_llc, cpu) = uncore_llc; } @@ -358,9 +481,16 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu) return 0; fail: - if (amd_uncore_nb) - *per_cpu_ptr(amd_uncore_nb, cpu) = NULL; - kfree(uncore_nb); + if (uncore_nb) { + kfree(uncore_nb->events); + kfree(uncore_nb); + } + + if (uncore_llc) { + kfree(uncore_llc->events); + kfree(uncore_llc); + } + return -ENOMEM; } @@ -407,7 +537,7 @@ static int amd_uncore_cpu_starting(unsigned int cpu) if (amd_uncore_llc) { uncore = *per_cpu_ptr(amd_uncore_llc, cpu); - uncore->id = per_cpu(cpu_llc_id, cpu); + uncore->id = get_llc_id(cpu); uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_llc); *per_cpu_ptr(amd_uncore_llc, cpu) = uncore; @@ -493,8 +623,11 @@ static void uncore_dead(unsigned int cpu, struct amd_uncore * __percpu *uncores) if (cpu == uncore->cpu) cpumask_clear_cpu(cpu, uncore->active_mask); - if (!--uncore->refcnt) + if 
(!--uncore->refcnt) { + kfree(uncore->events); kfree(uncore); + } + *per_cpu_ptr(uncores, cpu) = NULL; } @@ -511,6 +644,9 @@ static int amd_uncore_cpu_dead(unsigned int cpu) static int __init amd_uncore_init(void) { + struct attribute **df_attr = amd_uncore_df_format_attr; + struct attribute **l3_attr = amd_uncore_l3_format_attr; + union cpuid_0x80000022_ebx ebx; int ret = -ENODEV; if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && @@ -520,34 +656,32 @@ static int __init amd_uncore_init(void) if (!boot_cpu_has(X86_FEATURE_TOPOEXT)) return -ENODEV; - if (boot_cpu_data.x86 == 0x17 || boot_cpu_data.x86 == 0x18) { + if (boot_cpu_has(X86_FEATURE_PERFMON_V2)) + pmu_version = 2; + + num_counters_nb = NUM_COUNTERS_NB; + num_counters_llc = NUM_COUNTERS_L2; + if (boot_cpu_data.x86 >= 0x17) { /* - * For F17h or F18h, the Northbridge counters are + * For F17h and above, the Northbridge counters are * repurposed as Data Fabric counters. Also, L3 * counters are supported too. The PMUs are exported * based on family as either L2 or L3 and NB or DF. 
*/ - num_counters_nb = NUM_COUNTERS_NB; num_counters_llc = NUM_COUNTERS_L3; amd_nb_pmu.name = "amd_df"; amd_llc_pmu.name = "amd_l3"; - format_attr_event_df.show = &event_show_df; - format_attr_event_l3.show = &event_show_l3; l3_mask = true; - } else { - num_counters_nb = NUM_COUNTERS_NB; - num_counters_llc = NUM_COUNTERS_L2; - amd_nb_pmu.name = "amd_nb"; - amd_llc_pmu.name = "amd_l2"; - format_attr_event_df = format_attr_event; - format_attr_event_l3 = format_attr_event; - l3_mask = false; } - amd_nb_pmu.attr_groups = amd_uncore_attr_groups_df; - amd_llc_pmu.attr_groups = amd_uncore_attr_groups_l3; - if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) { + if (pmu_version >= 2) { + *df_attr++ = &format_attr_event14v2.attr; + *df_attr++ = &format_attr_umask12.attr; + } else if (boot_cpu_data.x86 >= 0x17) { + *df_attr = &format_attr_event14.attr; + } + amd_uncore_nb = alloc_percpu(struct amd_uncore *); if (!amd_uncore_nb) { ret = -ENOMEM; @@ -557,13 +691,29 @@ static int __init amd_uncore_init(void) if (ret) goto fail_nb; - pr_info("%s NB counters detected\n", - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? - "HYGON" : "AMD"); + if (pmu_version >= 2) { + ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES); + num_counters_nb = ebx.split.num_df_pmc; + } + + pr_info("%d %s %s counters detected\n", num_counters_nb, + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? 
"HYGON" : "", + amd_nb_pmu.name); + ret = 0; } if (boot_cpu_has(X86_FEATURE_PERFCTR_LLC)) { + if (boot_cpu_data.x86 >= 0x19) { + *l3_attr++ = &format_attr_event8.attr; + *l3_attr++ = &format_attr_umask8.attr; + *l3_attr++ = &format_attr_threadmask2.attr; + } else if (boot_cpu_data.x86 >= 0x17) { + *l3_attr++ = &format_attr_event8.attr; + *l3_attr++ = &format_attr_umask8.attr; + *l3_attr++ = &format_attr_threadmask8.attr; + } + amd_uncore_llc = alloc_percpu(struct amd_uncore *); if (!amd_uncore_llc) { ret = -ENOMEM; @@ -573,9 +723,9 @@ static int __init amd_uncore_init(void) if (ret) goto fail_llc; - pr_info("%s LLC counters detected\n", - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? - "HYGON" : "AMD"); + pr_info("%d %s %s counters detected\n", num_counters_llc, + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON ? "HYGON" : "", + amd_llc_pmu.name); ret = 0; } @@ -605,12 +755,34 @@ fail_prep: fail_llc: if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) perf_pmu_unregister(&amd_nb_pmu); - if (amd_uncore_llc) - free_percpu(amd_uncore_llc); + free_percpu(amd_uncore_llc); fail_nb: - if (amd_uncore_nb) - free_percpu(amd_uncore_nb); + free_percpu(amd_uncore_nb); return ret; } -device_initcall(amd_uncore_init); + +static void __exit amd_uncore_exit(void) +{ + cpuhp_remove_state(CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE); + cpuhp_remove_state(CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING); + cpuhp_remove_state(CPUHP_PERF_X86_AMD_UNCORE_PREP); + + if (boot_cpu_has(X86_FEATURE_PERFCTR_LLC)) { + perf_pmu_unregister(&amd_llc_pmu); + free_percpu(amd_uncore_llc); + amd_uncore_llc = NULL; + } + + if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) { + perf_pmu_unregister(&amd_nb_pmu); + free_percpu(amd_uncore_nb); + amd_uncore_nb = NULL; + } +} + +module_init(amd_uncore_init); +module_exit(amd_uncore_exit); + +MODULE_DESCRIPTION("AMD Uncore Driver"); +MODULE_LICENSE("GPL v2"); diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 3bb738f5a472..b30b8bbcd1e2 100644 --- a/arch/x86/events/core.c +++ 
b/arch/x86/events/core.c @@ -28,6 +28,7 @@ #include <linux/bitops.h> #include <linux/device.h> #include <linux/nospec.h> +#include <linux/static_call.h> #include <asm/apic.h> #include <asm/stacktrace.h> @@ -44,13 +45,56 @@ #include "perf_event.h" struct x86_pmu x86_pmu __read_mostly; +static struct pmu pmu; DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, + .pmu = &pmu, }; DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key); DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key); +DEFINE_STATIC_KEY_FALSE(perf_is_hybrid); + +/* + * This here uses DEFINE_STATIC_CALL_NULL() to get a static_call defined + * from just a typename, as opposed to an actual function. + */ +DEFINE_STATIC_CALL_NULL(x86_pmu_handle_irq, *x86_pmu.handle_irq); +DEFINE_STATIC_CALL_NULL(x86_pmu_disable_all, *x86_pmu.disable_all); +DEFINE_STATIC_CALL_NULL(x86_pmu_enable_all, *x86_pmu.enable_all); +DEFINE_STATIC_CALL_NULL(x86_pmu_enable, *x86_pmu.enable); +DEFINE_STATIC_CALL_NULL(x86_pmu_disable, *x86_pmu.disable); + +DEFINE_STATIC_CALL_NULL(x86_pmu_assign, *x86_pmu.assign); + +DEFINE_STATIC_CALL_NULL(x86_pmu_add, *x86_pmu.add); +DEFINE_STATIC_CALL_NULL(x86_pmu_del, *x86_pmu.del); +DEFINE_STATIC_CALL_NULL(x86_pmu_read, *x86_pmu.read); + +DEFINE_STATIC_CALL_NULL(x86_pmu_set_period, *x86_pmu.set_period); +DEFINE_STATIC_CALL_NULL(x86_pmu_update, *x86_pmu.update); +DEFINE_STATIC_CALL_NULL(x86_pmu_limit_period, *x86_pmu.limit_period); + +DEFINE_STATIC_CALL_NULL(x86_pmu_schedule_events, *x86_pmu.schedule_events); +DEFINE_STATIC_CALL_NULL(x86_pmu_get_event_constraints, *x86_pmu.get_event_constraints); +DEFINE_STATIC_CALL_NULL(x86_pmu_put_event_constraints, *x86_pmu.put_event_constraints); + +DEFINE_STATIC_CALL_NULL(x86_pmu_start_scheduling, *x86_pmu.start_scheduling); +DEFINE_STATIC_CALL_NULL(x86_pmu_commit_scheduling, *x86_pmu.commit_scheduling); +DEFINE_STATIC_CALL_NULL(x86_pmu_stop_scheduling, *x86_pmu.stop_scheduling); + +DEFINE_STATIC_CALL_NULL(x86_pmu_sched_task, *x86_pmu.sched_task); 
+DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx); + +DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs); +DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases); + +/* + * This one is magic, it will get called even when PMU init fails (because + * there is no PMU), in which case it should simply return NULL. + */ +DEFINE_STATIC_CALL_RET0(x86_pmu_guest_get_msrs, *x86_pmu.guest_get_msrs); u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] @@ -71,10 +115,9 @@ u64 x86_perf_event_update(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; int shift = 64 - x86_pmu.cntval_bits; u64 prev_raw_count, new_raw_count; - int idx = hwc->idx; u64 delta; - if (idx == INTEL_PMC_IDX_FIXED_BTS) + if (unlikely(!hwc->event_base)) return 0; /* @@ -114,15 +157,16 @@ again: */ static int x86_pmu_extra_regs(u64 config, struct perf_event *event) { + struct extra_reg *extra_regs = hybrid(event->pmu, extra_regs); struct hw_perf_event_extra *reg; struct extra_reg *er; reg = &event->hw.extra_reg; - if (!x86_pmu.extra_regs) + if (!extra_regs) return 0; - for (er = x86_pmu.extra_regs; er->msr; er++) { + for (er = extra_regs; er->msr; er++) { if (er->event != (config & er->config_mask)) continue; if (event->attr.config1 & ~er->valid_mask) @@ -145,16 +189,29 @@ static DEFINE_MUTEX(pmc_reserve_mutex); #ifdef CONFIG_X86_LOCAL_APIC +static inline int get_possible_num_counters(void) +{ + int i, num_counters = x86_pmu.num_counters; + + if (!is_hybrid()) + return num_counters; + + for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) + num_counters = max_t(int, num_counters, x86_pmu.hybrid_pmu[i].num_counters); + + return num_counters; +} + static bool reserve_pmc_hardware(void) { - int i; + int i, num_counters = get_possible_num_counters(); - for (i = 0; i < x86_pmu.num_counters; i++) { + for (i = 0; i < num_counters; i++) { if (!reserve_perfctr_nmi(x86_pmu_event_addr(i))) goto perfctr_fail; } - for (i = 0; i < x86_pmu.num_counters; i++) 
{ + for (i = 0; i < num_counters; i++) { if (!reserve_evntsel_nmi(x86_pmu_config_addr(i))) goto eventsel_fail; } @@ -165,7 +222,7 @@ eventsel_fail: for (i--; i >= 0; i--) release_evntsel_nmi(x86_pmu_config_addr(i)); - i = x86_pmu.num_counters; + i = num_counters; perfctr_fail: for (i--; i >= 0; i--) @@ -176,9 +233,9 @@ perfctr_fail: static void release_pmc_hardware(void) { - int i; + int i, num_counters = get_possible_num_counters(); - for (i = 0; i < x86_pmu.num_counters; i++) { + for (i = 0; i < num_counters; i++) { release_perfctr_nmi(x86_pmu_event_addr(i)); release_evntsel_nmi(x86_pmu_config_addr(i)); } @@ -191,7 +248,7 @@ static void release_pmc_hardware(void) {} #endif -static bool check_hw_exists(void) +bool check_hw_exists(struct pmu *pmu, int num_counters, int num_counters_fixed) { u64 val, val_fail = -1, val_new= ~0; int i, reg, reg_fail = -1, ret = 0; @@ -202,7 +259,7 @@ static bool check_hw_exists(void) * Check to see if the BIOS enabled any of the counters, if so * complain and bail. 
*/ - for (i = 0; i < x86_pmu.num_counters; i++) { + for (i = 0; i < num_counters; i++) { reg = x86_pmu_config_addr(i); ret = rdmsrl_safe(reg, &val); if (ret) @@ -216,13 +273,15 @@ static bool check_hw_exists(void) } } - if (x86_pmu.num_counters_fixed) { + if (num_counters_fixed) { reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; ret = rdmsrl_safe(reg, &val); if (ret) goto msr_fail; - for (i = 0; i < x86_pmu.num_counters_fixed; i++) { - if (val & (0x03 << i*4)) { + for (i = 0; i < num_counters_fixed; i++) { + if (fixed_counter_disabled(i, pmu)) + continue; + if (val & (0x03ULL << i*4)) { bios_fail = 1; val_fail = val; reg_fail = reg; @@ -321,8 +380,7 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event) return -EINVAL; cache_result = array_index_nospec(cache_result, PERF_COUNT_HW_CACHE_RESULT_MAX); - val = hw_cache_event_ids[cache_type][cache_op][cache_result]; - + val = hybrid_var(event->pmu, hw_cache_event_ids)[cache_type][cache_op][cache_result]; if (val == 0) return -ENOENT; @@ -330,7 +388,7 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event) return -EINVAL; hwc->config |= val; - attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result]; + attr->config1 = hybrid_var(event->pmu, hw_cache_extra_regs)[cache_type][cache_op][cache_result]; return x86_pmu_extra_regs(val, event); } @@ -341,10 +399,12 @@ int x86_reserve_hardware(void) if (!atomic_inc_not_zero(&pmc_refcount)) { mutex_lock(&pmc_reserve_mutex); if (atomic_read(&pmc_refcount) == 0) { - if (!reserve_pmc_hardware()) + if (!reserve_pmc_hardware()) { err = -EBUSY; - else + } else { reserve_ds_buffers(); + reserve_lbr_buffers(); + } } if (!err) atomic_inc(&pmc_refcount); @@ -359,6 +419,7 @@ void x86_release_hardware(void) if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) { release_pmc_hardware(); release_ds_buffers(); + release_lbr_buffers(); mutex_unlock(&pmc_reserve_mutex); } } @@ -422,7 +483,7 @@ int x86_setup_perfctr(struct perf_event *event) 
local64_set(&hwc->period_left, hwc->sample_period); } - if (attr->type == PERF_TYPE_RAW) + if (attr->type == event->pmu->type) return x86_pmu_extra_regs(event->attr.config, event); if (attr->type == PERF_TYPE_HW_CACHE) @@ -557,12 +618,13 @@ int x86_pmu_hw_config(struct perf_event *event) if (!event->attr.exclude_kernel) event->hw.config |= ARCH_PERFMON_EVENTSEL_OS; - if (event->attr.type == PERF_TYPE_RAW) + if (event->attr.type == event->pmu->type) event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK; if (event->attr.sample_period && x86_pmu.limit_period) { - if (x86_pmu.limit_period(event, event->attr.sample_period) > - event->attr.sample_period) + s64 left = event->attr.sample_period; + x86_pmu.limit_period(event, &left); + if (left > event->attr.sample_period) return -EINVAL; } @@ -633,6 +695,12 @@ void x86_pmu_disable_all(void) } } +struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr, void *data) +{ + return static_call(x86_pmu_guest_get_msrs)(nr, data); +} +EXPORT_SYMBOL_GPL(perf_guest_get_msrs); + /* * There may be PMI landing after enabled=0. The PMI hitting could be before or * after disable_all. @@ -660,7 +728,7 @@ static void x86_pmu_disable(struct pmu *pmu) cpuc->enabled = 0; barrier(); - x86_pmu.disable_all(); + static_call(x86_pmu_disable_all)(); } void x86_pmu_enable_all(int added) @@ -678,16 +746,33 @@ void x86_pmu_enable_all(int added) } } -static struct pmu pmu; - static inline int is_x86_event(struct perf_event *event) { - return event->pmu == &pmu; + int i; + + if (!is_hybrid()) + return event->pmu == &pmu; + + for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { + if (event->pmu == &x86_pmu.hybrid_pmu[i].pmu) + return true; + } + + return false; } -struct pmu *x86_get_pmu(void) +struct pmu *x86_get_pmu(unsigned int cpu) { - return &pmu; + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + + /* + * All CPUs of the hybrid type have been offline. + * The x86_get_pmu() should not be invoked. 
+ */ + if (WARN_ON_ONCE(!cpuc->pmu)) + return &pmu; + + return cpuc->pmu; } /* * Event scheduler state: @@ -719,7 +804,7 @@ struct perf_sched { }; /* - * Initialize interator that runs through all events and counters. + * Initialize iterator that runs through all events and counters. */ static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints, int num, int wmin, int wmax, int gpmax) @@ -890,6 +975,7 @@ EXPORT_SYMBOL_GPL(perf_assign_events); int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) { + int num_counters = hybrid(cpuc->pmu, num_counters); struct event_constraint *c; struct perf_event *e; int n0, i, wmin, wmax, unsched = 0; @@ -907,8 +993,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (cpuc->txn_flags & PERF_PMU_TXN_ADD) n0 -= cpuc->n_txn; - if (x86_pmu.start_scheduling) - x86_pmu.start_scheduling(cpuc); + static_call_cond(x86_pmu_start_scheduling)(cpuc); for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { c = cpuc->event_constraint[i]; @@ -925,7 +1010,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) * change due to external factors (sibling state, allow_tfa). */ if (!c || (c->flags & PERF_X86_EVENT_DYNAMIC)) { - c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]); + c = static_call(x86_pmu_get_event_constraints)(cpuc, i, cpuc->event_list[i]); cpuc->event_constraint[i] = c; } @@ -966,7 +1051,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) /* slow path */ if (i != n) { - int gpmax = x86_pmu.num_counters; + int gpmax = num_counters; /* * Do not allow scheduling of more than half the available @@ -987,7 +1072,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) * the extra Merge events needed by large increment events. 
*/ if (x86_pmu.flags & PMU_FL_PAIR) { - gpmax = x86_pmu.num_counters - cpuc->n_pair; + gpmax = num_counters - cpuc->n_pair; WARN_ON(gpmax <= 0); } @@ -1006,11 +1091,8 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) * validate an event group (assign == NULL) */ if (!unsched && assign) { - for (i = 0; i < n; i++) { - e = cpuc->event_list[i]; - if (x86_pmu.commit_scheduling) - x86_pmu.commit_scheduling(cpuc, i, assign[i]); - } + for (i = 0; i < n; i++) + static_call_cond(x86_pmu_commit_scheduling)(cpuc, i, assign[i]); } else { for (i = n0; i < n; i++) { e = cpuc->event_list[i]; @@ -1018,29 +1100,69 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) /* * release events that failed scheduling */ - if (x86_pmu.put_event_constraints) - x86_pmu.put_event_constraints(cpuc, e); + static_call_cond(x86_pmu_put_event_constraints)(cpuc, e); cpuc->event_constraint[i] = NULL; } } - if (x86_pmu.stop_scheduling) - x86_pmu.stop_scheduling(cpuc); + static_call_cond(x86_pmu_stop_scheduling)(cpuc); return unsched ? 
-EINVAL : 0; } +static int add_nr_metric_event(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + if (is_metric_event(event)) { + if (cpuc->n_metric == INTEL_TD_METRIC_NUM) + return -EINVAL; + cpuc->n_metric++; + cpuc->n_txn_metric++; + } + + return 0; +} + +static void del_nr_metric_event(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + if (is_metric_event(event)) + cpuc->n_metric--; +} + +static int collect_event(struct cpu_hw_events *cpuc, struct perf_event *event, + int max_count, int n) +{ + union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap); + + if (intel_cap.perf_metrics && add_nr_metric_event(cpuc, event)) + return -EINVAL; + + if (n >= max_count + cpuc->n_metric) + return -EINVAL; + + cpuc->event_list[n] = event; + if (is_counter_pair(&event->hw)) { + cpuc->n_pair++; + cpuc->n_txn_pair++; + } + + return 0; +} + /* * dogrp: true if must collect siblings events (group) * returns total number of events and error code */ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp) { + int num_counters = hybrid(cpuc->pmu, num_counters); + int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed); struct perf_event *event; int n, max_count; - max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed; + max_count = num_counters + num_counters_fixed; /* current number of events already accepted */ n = cpuc->n_events; @@ -1067,28 +1189,22 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, } if (is_x86_event(leader)) { - if (n >= max_count) + if (collect_event(cpuc, leader, max_count, n)) return -EINVAL; - cpuc->event_list[n] = leader; n++; - if (is_counter_pair(&leader->hw)) - cpuc->n_pair++; } + if (!dogrp) return n; for_each_sibling_event(event, leader) { - if (!is_x86_event(event) || - event->state <= PERF_EVENT_STATE_OFF) + if (!is_x86_event(event) || event->state <= PERF_EVENT_STATE_OFF) continue; - if (n >= max_count) + if (collect_event(cpuc, event, 
max_count, n)) return -EINVAL; - cpuc->event_list[n] = event; n++; - if (is_counter_pair(&event->hw)) - cpuc->n_pair++; } return n; } @@ -1097,22 +1213,38 @@ static inline void x86_assign_hw_event(struct perf_event *event, struct cpu_hw_events *cpuc, int i) { struct hw_perf_event *hwc = &event->hw; + int idx; - hwc->idx = cpuc->assign[i]; + idx = hwc->idx = cpuc->assign[i]; hwc->last_cpu = smp_processor_id(); hwc->last_tag = ++cpuc->tags[i]; - if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) { + static_call_cond(x86_pmu_assign)(event, idx); + + switch (hwc->idx) { + case INTEL_PMC_IDX_FIXED_BTS: + case INTEL_PMC_IDX_FIXED_VLBR: hwc->config_base = 0; hwc->event_base = 0; - } else if (hwc->idx >= INTEL_PMC_IDX_FIXED) { + break; + + case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END: + /* All the metric events are mapped onto the fixed counter 3. */ + idx = INTEL_PMC_IDX_FIXED_SLOTS; + fallthrough; + case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS-1: hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; - hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED); - hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30; - } else { + hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + + (idx - INTEL_PMC_IDX_FIXED); + hwc->event_base_rdpmc = (idx - INTEL_PMC_IDX_FIXED) | + INTEL_PMC_FIXED_RDPMC_BASE; + break; + + default: hwc->config_base = x86_pmu_config_addr(hwc->idx); hwc->event_base = x86_pmu_event_addr(hwc->idx); hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx); + break; } } @@ -1208,6 +1340,10 @@ static void x86_pmu_enable(struct pmu *pmu) if (hwc->state & PERF_HES_ARCH) continue; + /* + * if cpuc->enabled = 0, then no wrmsr as + * per x86_pmu_enable_event() + */ x86_pmu_start(event, PERF_EF_RELOAD); } cpuc->n_added = 0; @@ -1217,10 +1353,10 @@ static void x86_pmu_enable(struct pmu *pmu) cpuc->enabled = 1; barrier(); - x86_pmu.enable_all(added); + static_call(x86_pmu_enable_all)(added); } -static DEFINE_PER_CPU(u64 
[X86_PMC_IDX_MAX], pmc_prev_left); +DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); /* * Set the next IRQ period, based on the hwc->period_left value. @@ -1233,7 +1369,7 @@ int x86_perf_event_set_period(struct perf_event *event) s64 period = hwc->sample_period; int ret = 0, idx = hwc->idx; - if (idx == INTEL_PMC_IDX_FIXED_BTS) + if (unlikely(!hwc->event_base)) return 0; /* @@ -1261,10 +1397,9 @@ int x86_perf_event_set_period(struct perf_event *event) if (left > x86_pmu.max_period) left = x86_pmu.max_period; - if (x86_pmu.limit_period) - left = x86_pmu.limit_period(event, left); + static_call_cond(x86_pmu_limit_period)(event, &left); - per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; + this_cpu_write(pmc_prev_left[idx], left); /* * The hw event starts counting from this event offset, @@ -1275,21 +1410,11 @@ int x86_perf_event_set_period(struct perf_event *event) wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); /* - * Clear the Merge event counter's upper 16 bits since + * Sign extend the Merge event counter's upper 16 bits since * we currently declare a 48-bit counter width */ if (is_counter_pair(hwc)) - wrmsrl(x86_pmu_event_addr(idx + 1), 0); - - /* - * Due to erratum on certan cpu we need - * a second write to be sure the register - * is updated properly - */ - if (x86_pmu.perfctr_second_write) { - wrmsrl(hwc->event_base, - (u64)(-left) & x86_pmu.cntval_mask); - } + wrmsrl(x86_pmu_event_addr(idx + 1), 0xffff); perf_event_update_userpage(event); @@ -1338,7 +1463,7 @@ static int x86_pmu_add(struct perf_event *event, int flags) if (cpuc->txn_flags & PERF_PMU_TXN_ADD) goto done_collect; - ret = x86_pmu.schedule_events(cpuc, n, assign); + ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign); if (ret) goto out; /* @@ -1356,13 +1481,11 @@ done_collect: cpuc->n_added += n - n0; cpuc->n_txn += n - n0; - if (x86_pmu.add) { - /* - * This is before x86_pmu_enable() will call x86_pmu_start(), - * so we enable LBRs before an event needs them 
etc.. - */ - x86_pmu.add(event); - } + /* + * This is before x86_pmu_enable() will call x86_pmu_start(), + * so we enable LBRs before an event needs them etc.. + */ + static_call_cond(x86_pmu_add)(event); ret = 0; out: @@ -1382,15 +1505,14 @@ static void x86_pmu_start(struct perf_event *event, int flags) if (flags & PERF_EF_RELOAD) { WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); - x86_perf_event_set_period(event); + static_call(x86_pmu_set_period)(event); } event->hw.state = 0; cpuc->events[idx] = event; __set_bit(idx, cpuc->active_mask); - __set_bit(idx, cpuc->running); - x86_pmu.enable(event); + static_call(x86_pmu_enable)(event); perf_event_update_userpage(event); } @@ -1398,18 +1520,19 @@ void perf_event_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; u64 pebs, debugctl; - struct cpu_hw_events *cpuc; + int cpu = smp_processor_id(); + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + int num_counters = hybrid(cpuc->pmu, num_counters); + int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed); + struct event_constraint *pebs_constraints = hybrid(cpuc->pmu, pebs_constraints); unsigned long flags; - int cpu, idx; + int idx; - if (!x86_pmu.num_counters) + if (!num_counters) return; local_irq_save(flags); - cpu = smp_processor_id(); - cpuc = &per_cpu(cpu_hw_events, cpu); - if (x86_pmu.version >= 2) { rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); @@ -1421,7 +1544,7 @@ void perf_event_print_debug(void) pr_info("CPU#%d: status: %016llx\n", cpu, status); pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); - if (x86_pmu.pebs_constraints) { + if (pebs_constraints) { rdmsrl(MSR_IA32_PEBS_ENABLE, pebs); pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs); } @@ -1432,7 +1555,7 @@ void perf_event_print_debug(void) } pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); - for (idx = 0; idx < x86_pmu.num_counters; 
idx++) { + for (idx = 0; idx < num_counters; idx++) { rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl); rdmsrl(x86_pmu_event_addr(idx), pmc_count); @@ -1445,7 +1568,9 @@ void perf_event_print_debug(void) pr_info("CPU#%d: gen-PMC%d left: %016llx\n", cpu, idx, prev_left); } - for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { + for (idx = 0; idx < num_counters_fixed; idx++) { + if (fixed_counter_disabled(idx, cpuc->pmu)) + continue; rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", @@ -1460,7 +1585,7 @@ void x86_pmu_stop(struct perf_event *event, int flags) struct hw_perf_event *hwc = &event->hw; if (test_bit(hwc->idx, cpuc->active_mask)) { - x86_pmu.disable(event); + static_call(x86_pmu_disable)(event); __clear_bit(hwc->idx, cpuc->active_mask); cpuc->events[hwc->idx] = NULL; WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); @@ -1472,7 +1597,7 @@ void x86_pmu_stop(struct perf_event *event, int flags) * Drain the remaining delta count out of a event * that we are disabling: */ - x86_perf_event_update(event); + static_call(x86_pmu_update)(event); hwc->state |= PERF_HES_UPTODATE; } } @@ -1480,6 +1605,7 @@ void x86_pmu_stop(struct perf_event *event, int flags) static void x86_pmu_del(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap); int i; /* @@ -1493,6 +1619,8 @@ static void x86_pmu_del(struct perf_event *event, int flags) if (cpuc->txn_flags & PERF_PMU_TXN_ADD) goto do_del; + __set_bit(event->hw.idx, cpuc->dirty); + /* * Not a TXN, therefore cleanup properly. */ @@ -1510,8 +1638,7 @@ static void x86_pmu_del(struct perf_event *event, int flags) if (i >= cpuc->n_events - cpuc->n_added) --cpuc->n_added; - if (x86_pmu.put_event_constraints) - x86_pmu.put_event_constraints(cpuc, event); + static_call_cond(x86_pmu_put_event_constraints)(cpuc, event); /* Delete the array entry. 
*/ while (++i < cpuc->n_events) { @@ -1520,17 +1647,18 @@ static void x86_pmu_del(struct perf_event *event, int flags) } cpuc->event_constraint[i-1] = NULL; --cpuc->n_events; + if (intel_cap.perf_metrics) + del_nr_metric_event(cpuc, event); perf_event_update_userpage(event); do_del: - if (x86_pmu.del) { - /* - * This is after x86_pmu_stop(); so we disable LBRs after any - * event can need them etc.. - */ - x86_pmu.del(event); - } + + /* + * This is after x86_pmu_stop(); so we disable LBRs after any + * event can need them etc.. + */ + static_call_cond(x86_pmu_del)(event); } int x86_pmu_handle_irq(struct pt_regs *regs) @@ -1559,7 +1687,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs) event = cpuc->events[idx]; - val = x86_perf_event_update(event); + val = static_call(x86_pmu_update)(event); if (val & (1ULL << (x86_pmu.cntval_bits - 1))) continue; @@ -1567,11 +1695,17 @@ int x86_pmu_handle_irq(struct pt_regs *regs) * event overflow */ handled++; - perf_sample_data_init(&data, 0, event->hw.last_period); - if (!x86_perf_event_set_period(event)) + if (!static_call(x86_pmu_set_period)(event)) continue; + perf_sample_data_init(&data, 0, event->hw.last_period); + + if (has_branch_stack(event)) { + data.br_stack = &cpuc->lbr_stack; + data.sample_flags |= PERF_SAMPLE_BRANCH_STACK; + } + if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); } @@ -1608,7 +1742,7 @@ perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) return NMI_DONE; start_clock = sched_clock(); - ret = x86_pmu.handle_irq(regs); + ret = static_call(x86_pmu_handle_irq)(regs); finish_clock = sched_clock(); perf_sample_event_took(finish_clock - start_clock); @@ -1700,7 +1834,7 @@ ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, cha /* string trumps id */ if (pmu_attr->event_str) - return sprintf(page, "%s", pmu_attr->event_str); + return sprintf(page, "%s\n", pmu_attr->event_str); return x86_pmu.events_sysfs_show(page, config); } @@ -1729,6 +1863,49 @@ ssize_t 
events_ht_sysfs_show(struct device *dev, struct device_attribute *attr, pmu_attr->event_str_noht); } +ssize_t events_hybrid_sysfs_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct perf_pmu_events_hybrid_attr *pmu_attr = + container_of(attr, struct perf_pmu_events_hybrid_attr, attr); + struct x86_hybrid_pmu *pmu; + const char *str, *next_str; + int i; + + if (hweight64(pmu_attr->pmu_type) == 1) + return sprintf(page, "%s", pmu_attr->event_str); + + /* + * Hybrid PMUs may support the same event name, but with different + * event encoding, e.g., the mem-loads event on an Atom PMU has + * different event encoding from a Core PMU. + * + * The event_str includes all event encodings. Each event encoding + * is divided by ";". The order of the event encodings must follow + * the order of the hybrid PMU index. + */ + pmu = container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu); + + str = pmu_attr->event_str; + for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { + if (!(x86_pmu.hybrid_pmu[i].cpu_type & pmu_attr->pmu_type)) + continue; + if (x86_pmu.hybrid_pmu[i].cpu_type & pmu->cpu_type) { + next_str = strchr(str, ';'); + if (next_str) + return snprintf(page, next_str - str + 1, "%s", str); + else + return sprintf(page, "%s", str); + } + str = strchr(str, ';'); + str++; + } + + return 0; +} +EXPORT_SYMBOL_GPL(events_hybrid_sysfs_show); + EVENT_ATTR(cpu-cycles, CPU_CYCLES ); EVENT_ATTR(instructions, INSTRUCTIONS ); EVENT_ATTR(cache-references, CACHE_REFERENCES ); @@ -1821,6 +1998,77 @@ ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event) static struct attribute_group x86_pmu_attr_group; static struct attribute_group x86_pmu_caps_group; +static void x86_pmu_static_call_update(void) +{ + static_call_update(x86_pmu_handle_irq, x86_pmu.handle_irq); + static_call_update(x86_pmu_disable_all, x86_pmu.disable_all); + static_call_update(x86_pmu_enable_all, x86_pmu.enable_all); + static_call_update(x86_pmu_enable, x86_pmu.enable); + 
static_call_update(x86_pmu_disable, x86_pmu.disable); + + static_call_update(x86_pmu_assign, x86_pmu.assign); + + static_call_update(x86_pmu_add, x86_pmu.add); + static_call_update(x86_pmu_del, x86_pmu.del); + static_call_update(x86_pmu_read, x86_pmu.read); + + static_call_update(x86_pmu_set_period, x86_pmu.set_period); + static_call_update(x86_pmu_update, x86_pmu.update); + static_call_update(x86_pmu_limit_period, x86_pmu.limit_period); + + static_call_update(x86_pmu_schedule_events, x86_pmu.schedule_events); + static_call_update(x86_pmu_get_event_constraints, x86_pmu.get_event_constraints); + static_call_update(x86_pmu_put_event_constraints, x86_pmu.put_event_constraints); + + static_call_update(x86_pmu_start_scheduling, x86_pmu.start_scheduling); + static_call_update(x86_pmu_commit_scheduling, x86_pmu.commit_scheduling); + static_call_update(x86_pmu_stop_scheduling, x86_pmu.stop_scheduling); + + static_call_update(x86_pmu_sched_task, x86_pmu.sched_task); + static_call_update(x86_pmu_swap_task_ctx, x86_pmu.swap_task_ctx); + + static_call_update(x86_pmu_drain_pebs, x86_pmu.drain_pebs); + static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases); + + static_call_update(x86_pmu_guest_get_msrs, x86_pmu.guest_get_msrs); +} + +static void _x86_pmu_read(struct perf_event *event) +{ + static_call(x86_pmu_update)(event); +} + +void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed, + u64 intel_ctrl) +{ + pr_info("... version: %d\n", x86_pmu.version); + pr_info("... bit width: %d\n", x86_pmu.cntval_bits); + pr_info("... generic registers: %d\n", num_counters); + pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); + pr_info("... max period: %016Lx\n", x86_pmu.max_period); + pr_info("... fixed-purpose events: %lu\n", + hweight64((((1ULL << num_counters_fixed) - 1) + << INTEL_PMC_IDX_FIXED) & intel_ctrl)); + pr_info("... event mask: %016Lx\n", intel_ctrl); +} + +/* + * The generic code is not hybrid friendly. 
The hybrid_pmu->pmu + * of the first registered PMU is unconditionally assigned to + * each possible cpuctx->ctx.pmu. + * Update the correct hybrid PMU to the cpuctx->ctx.pmu. + */ +void x86_pmu_update_cpu_context(struct pmu *pmu, int cpu) +{ + struct perf_cpu_context *cpuctx; + + if (!pmu->pmu_cpu_context) + return; + + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + cpuctx->ctx.pmu = pmu; +} + static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; @@ -1839,19 +2087,24 @@ static int __init init_hw_perf_events(void) err = amd_pmu_init(); x86_pmu.name = "HYGON"; break; + case X86_VENDOR_ZHAOXIN: + case X86_VENDOR_CENTAUR: + err = zhaoxin_pmu_init(); + break; default: err = -ENOTSUPP; } if (err != 0) { pr_cont("no PMU driver, software events only.\n"); - return 0; + err = 0; + goto out_bad_pmu; } pmu_check_apic(); /* sanity check that the hardware exists or is emulated */ - if (!check_hw_exists()) - return 0; + if (!check_hw_exists(&pmu, x86_pmu.num_counters, x86_pmu.num_counters_fixed)) + goto out_bad_pmu; pr_cont("%s PMU driver.\n", x86_pmu.name); @@ -1877,13 +2130,25 @@ static int __init init_hw_perf_events(void) pmu.attr_update = x86_pmu.attr_update; - pr_info("... version: %d\n", x86_pmu.version); - pr_info("... bit width: %d\n", x86_pmu.cntval_bits); - pr_info("... generic registers: %d\n", x86_pmu.num_counters); - pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); - pr_info("... max period: %016Lx\n", x86_pmu.max_period); - pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); - pr_info("... 
event mask: %016Lx\n", x86_pmu.intel_ctrl); + if (!is_hybrid()) { + x86_pmu_show_pmu_cap(x86_pmu.num_counters, + x86_pmu.num_counters_fixed, + x86_pmu.intel_ctrl); + } + + if (!x86_pmu.read) + x86_pmu.read = _x86_pmu_read; + + if (!x86_pmu.guest_get_msrs) + x86_pmu.guest_get_msrs = (void *)&__static_call_return0; + + if (!x86_pmu.set_period) + x86_pmu.set_period = x86_perf_event_set_period; + + if (!x86_pmu.update) + x86_pmu.update = x86_perf_event_update; + + x86_pmu_static_call_update(); /* * Install callbacks. Core will call them for each online @@ -1905,9 +2170,46 @@ static int __init init_hw_perf_events(void) if (err) goto out1; - err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); - if (err) - goto out2; + if (!is_hybrid()) { + err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); + if (err) + goto out2; + } else { + u8 cpu_type = get_this_hybrid_cpu_type(); + struct x86_hybrid_pmu *hybrid_pmu; + int i, j; + + if (!cpu_type && x86_pmu.get_hybrid_cpu_type) + cpu_type = x86_pmu.get_hybrid_cpu_type(); + + for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { + hybrid_pmu = &x86_pmu.hybrid_pmu[i]; + + hybrid_pmu->pmu = pmu; + hybrid_pmu->pmu.type = -1; + hybrid_pmu->pmu.attr_update = x86_pmu.attr_update; + hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_HETEROGENEOUS_CPUS; + hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_EXTENDED_HW_TYPE; + + err = perf_pmu_register(&hybrid_pmu->pmu, hybrid_pmu->name, + (hybrid_pmu->cpu_type == hybrid_big) ? 
PERF_TYPE_RAW : -1); + if (err) + break; + + if (cpu_type == hybrid_pmu->cpu_type) + x86_pmu_update_cpu_context(&hybrid_pmu->pmu, raw_smp_processor_id()); + } + + if (i < x86_pmu.num_hybrid_pmus) { + for (j = 0; j < i; j++) + perf_pmu_unregister(&x86_pmu.hybrid_pmu[j].pmu); + pr_warn("Failed to register hybrid PMUs\n"); + kfree(x86_pmu.hybrid_pmu); + x86_pmu.hybrid_pmu = NULL; + x86_pmu.num_hybrid_pmus = 0; + goto out2; + } + } return 0; @@ -1917,15 +2219,15 @@ out1: cpuhp_remove_state(CPUHP_AP_PERF_X86_STARTING); out: cpuhp_remove_state(CPUHP_PERF_X86_PREPARE); +out_bad_pmu: + memset(&x86_pmu, 0, sizeof(x86_pmu)); return err; } early_initcall(init_hw_perf_events); -static inline void x86_pmu_read(struct perf_event *event) +static void x86_pmu_read(struct perf_event *event) { - if (x86_pmu.read) - return x86_pmu.read(event); - x86_perf_event_update(event); + static_call(x86_pmu_read)(event); } /* @@ -1949,6 +2251,8 @@ static void x86_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags) perf_pmu_disable(pmu); __this_cpu_write(cpu_hw_events.n_txn, 0); + __this_cpu_write(cpu_hw_events.n_txn_pair, 0); + __this_cpu_write(cpu_hw_events.n_txn_metric, 0); } /* @@ -1974,6 +2278,8 @@ static void x86_pmu_cancel_txn(struct pmu *pmu) */ __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn)); __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn)); + __this_cpu_sub(cpu_hw_events.n_pair, __this_cpu_read(cpu_hw_events.n_txn_pair)); + __this_cpu_sub(cpu_hw_events.n_metric, __this_cpu_read(cpu_hw_events.n_txn_metric)); perf_pmu_enable(pmu); } @@ -2002,7 +2308,7 @@ static int x86_pmu_commit_txn(struct pmu *pmu) if (!x86_pmu_initialized()) return -EAGAIN; - ret = x86_pmu.schedule_events(cpuc, n, assign); + ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign); if (ret) return ret; @@ -2030,16 +2336,27 @@ static void free_fake_cpuc(struct cpu_hw_events *cpuc) kfree(cpuc); } -static struct cpu_hw_events *allocate_fake_cpuc(void) +static 
struct cpu_hw_events *allocate_fake_cpuc(struct pmu *event_pmu) { struct cpu_hw_events *cpuc; - int cpu = raw_smp_processor_id(); + int cpu; cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL); if (!cpuc) return ERR_PTR(-ENOMEM); cpuc->is_fake = 1; + if (is_hybrid()) { + struct x86_hybrid_pmu *h_pmu; + + h_pmu = hybrid_pmu(event_pmu); + if (cpumask_empty(&h_pmu->supported_cpus)) + goto error; + cpu = cpumask_first(&h_pmu->supported_cpus); + } else + cpu = raw_smp_processor_id(); + cpuc->pmu = event_pmu; + if (intel_cpuc_prepare(cpuc, cpu)) goto error; @@ -2058,7 +2375,7 @@ static int validate_event(struct perf_event *event) struct event_constraint *c; int ret = 0; - fake_cpuc = allocate_fake_cpuc(); + fake_cpuc = allocate_fake_cpuc(event->pmu); if (IS_ERR(fake_cpuc)) return PTR_ERR(fake_cpuc); @@ -2092,7 +2409,27 @@ static int validate_group(struct perf_event *event) struct cpu_hw_events *fake_cpuc; int ret = -EINVAL, n; - fake_cpuc = allocate_fake_cpuc(); + /* + * Reject events from different hybrid PMUs. 
+ */ + if (is_hybrid()) { + struct perf_event *sibling; + struct pmu *pmu = NULL; + + if (is_x86_event(leader)) + pmu = leader->pmu; + + for_each_sibling_event(sibling, leader) { + if (!is_x86_event(sibling)) + continue; + if (!pmu) + pmu = sibling->pmu; + else if (pmu != sibling->pmu) + return ret; + } + } + + fake_cpuc = allocate_fake_cpuc(event->pmu); if (IS_ERR(fake_cpuc)) return PTR_ERR(fake_cpuc); /* @@ -2120,56 +2457,70 @@ out: static int x86_pmu_event_init(struct perf_event *event) { - struct pmu *tmp; + struct x86_hybrid_pmu *pmu = NULL; int err; - switch (event->attr.type) { - case PERF_TYPE_RAW: - case PERF_TYPE_HARDWARE: - case PERF_TYPE_HW_CACHE: - break; - - default: + if ((event->attr.type != event->pmu->type) && + (event->attr.type != PERF_TYPE_HARDWARE) && + (event->attr.type != PERF_TYPE_HW_CACHE)) return -ENOENT; + + if (is_hybrid() && (event->cpu != -1)) { + pmu = hybrid_pmu(event->pmu); + if (!cpumask_test_cpu(event->cpu, &pmu->supported_cpus)) + return -ENOENT; } err = __x86_pmu_event_init(event); if (!err) { - /* - * we temporarily connect event to its pmu - * such that validate_group() can classify - * it as an x86 event using is_x86_event() - */ - tmp = event->pmu; - event->pmu = &pmu; - if (event->group_leader != event) err = validate_group(event); else err = validate_event(event); - - event->pmu = tmp; } if (err) { if (event->destroy) event->destroy(event); + event->destroy = NULL; } if (READ_ONCE(x86_pmu.attr_rdpmc) && !(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS)) - event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED; + event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT; return err; } -static void refresh_pce(void *ignored) +void perf_clear_dirty_counters(void) { - load_mm_cr4_irqsoff(this_cpu_read(cpu_tlbstate.loaded_mm)); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int i; + + /* Don't need to clear the assigned counter. 
*/ + for (i = 0; i < cpuc->n_events; i++) + __clear_bit(cpuc->assign[i], cpuc->dirty); + + if (bitmap_empty(cpuc->dirty, X86_PMC_IDX_MAX)) + return; + + for_each_set_bit(i, cpuc->dirty, X86_PMC_IDX_MAX) { + if (i >= INTEL_PMC_IDX_FIXED) { + /* Metrics and fake events don't have corresponding HW counters. */ + if ((i - INTEL_PMC_IDX_FIXED) >= hybrid(cpuc->pmu, num_counters_fixed)) + continue; + + wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + (i - INTEL_PMC_IDX_FIXED), 0); + } else { + wrmsrl(x86_pmu_event_addr(i), 0); + } + } + + bitmap_zero(cpuc->dirty, X86_PMC_IDX_MAX); } static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) { - if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) + if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)) return; /* @@ -2179,38 +2530,35 @@ static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) * userspace with CR4.PCE clear while another task is still * doing on_each_cpu_mask() to propagate CR4.PCE. * - * For now, this can't happen because all callers hold mmap_sem + * For now, this can't happen because all callers hold mmap_lock * for write. If this changes, we'll need a different solution. 
*/ - lockdep_assert_held_write(&mm->mmap_sem); + mmap_assert_write_locked(mm); if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1) - on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1); + on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1); } static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm) { - - if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) + if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)) return; if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed)) - on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1); + on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1); } static int x86_pmu_event_idx(struct perf_event *event) { - int idx = event->hw.idx; + struct hw_perf_event *hwc = &event->hw; - if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) + if (!(hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT)) return 0; - if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) { - idx -= INTEL_PMC_IDX_FIXED; - idx |= 1 << 30; - } - - return idx + 1; + if (is_metric_idx(hwc->idx)) + return INTEL_PMC_FIXED_RDPMC_METRICS + 1; + else + return hwc->event_base_rdpmc + 1; } static ssize_t get_attr_rdpmc(struct device *cdev, @@ -2253,7 +2601,7 @@ static ssize_t set_attr_rdpmc(struct device *cdev, else if (x86_pmu.attr_rdpmc == 2) static_branch_dec(&rdpmc_always_available_key); - on_each_cpu(refresh_pce, NULL, 1); + on_each_cpu(cr4_update_pce, NULL, 1); x86_pmu.attr_rdpmc = val; } @@ -2300,15 +2648,13 @@ static const struct attribute_group *x86_pmu_attr_groups[] = { static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) { - if (x86_pmu.sched_task) - x86_pmu.sched_task(ctx, sched_in); + static_call_cond(x86_pmu_sched_task)(ctx, sched_in); } static void x86_pmu_swap_task_ctx(struct perf_event_context *prev, struct perf_event_context *next) { - if (x86_pmu.swap_task_ctx) - x86_pmu.swap_task_ctx(prev, next); + static_call_cond(x86_pmu_swap_task_ctx)(prev, next); } void 
perf_check_microcode(void) @@ -2323,7 +2669,9 @@ static int x86_pmu_check_period(struct perf_event *event, u64 value) return -EINVAL; if (value && x86_pmu.limit_period) { - if (x86_pmu.limit_period(event, value) > value) + s64 left = value; + x86_pmu.limit_period(event, &left); + if (left > value) return -EINVAL; } @@ -2341,6 +2689,14 @@ static int x86_pmu_aux_output_match(struct perf_event *event) return 0; } +static int x86_pmu_filter_match(struct perf_event *event) +{ + if (x86_pmu.filter_match) + return x86_pmu.filter_match(event); + + return 1; +} + static struct pmu pmu = { .pmu_enable = x86_pmu_enable, .pmu_disable = x86_pmu_disable, @@ -2364,11 +2720,12 @@ static struct pmu pmu = { .event_idx = x86_pmu_event_idx, .sched_task = x86_pmu_sched_task, - .task_ctx_size = sizeof(struct x86_perf_task_context), .swap_task_ctx = x86_pmu_swap_task_ctx, .check_period = x86_pmu_check_period, .aux_output_match = x86_pmu_aux_output_match, + + .filter_match = x86_pmu_filter_match, }; void arch_perf_update_userpage(struct perf_event *event, @@ -2380,7 +2737,7 @@ void arch_perf_update_userpage(struct perf_event *event, userpg->cap_user_time = 0; userpg->cap_user_time_zero = 0; userpg->cap_user_rdpmc = - !!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED); + !!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT); userpg->pmc_width = x86_pmu.cntval_bits; if (!using_native_sched_clock() || !sched_clock_stable()) @@ -2426,7 +2783,7 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re struct unwind_state state; unsigned long addr; - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + if (perf_guest_state()) { /* TODO: We don't support guest os callchain now */ return; } @@ -2449,7 +2806,7 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re static inline int valid_user_frame(const void __user *fp, unsigned long size) { - return (__range_not_ok(fp, size, TASK_SIZE) == 0); + return __access_ok(fp, size); } static 
unsigned long get_segment_base(unsigned int segment) @@ -2490,9 +2847,9 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *ent /* 32-bit process in 64-bit kernel. */ unsigned long ss_base, cs_base; struct stack_frame_ia32 frame; - const void __user *fp; + const struct stack_frame_ia32 __user *fp; - if (!test_thread_flag(TIF_IA32)) + if (user_64bit_mode(regs)) return 0; cs_base = get_segment_base(regs->cs); @@ -2501,18 +2858,12 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *ent fp = compat_ptr(ss_base + regs->bp); pagefault_disable(); while (entry->nr < entry->max_stack) { - unsigned long bytes; - frame.next_frame = 0; - frame.return_address = 0; - if (!valid_user_frame(fp, sizeof(frame))) break; - bytes = __copy_from_user_nmi(&frame.next_frame, fp, 4); - if (bytes != 0) + if (__get_user(frame.next_frame, &fp->next_frame)) break; - bytes = __copy_from_user_nmi(&frame.return_address, fp+4, 4); - if (bytes != 0) + if (__get_user(frame.return_address, &fp->return_address)) break; perf_callchain_store(entry, cs_base + frame.return_address); @@ -2533,9 +2884,9 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { struct stack_frame frame; - const unsigned long __user *fp; + const struct stack_frame __user *fp; - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + if (perf_guest_state()) { /* TODO: We don't support guest os callchain now */ return; } @@ -2546,7 +2897,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM)) return; - fp = (unsigned long __user *)regs->bp; + fp = (void __user *)regs->bp; perf_callchain_store(entry, regs->ip); @@ -2558,19 +2909,12 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs pagefault_disable(); while (entry->nr < entry->max_stack) { - unsigned long bytes; - - frame.next_frame = NULL; - frame.return_address = 0; - if 
(!valid_user_frame(fp, sizeof(frame))) break; - bytes = __copy_from_user_nmi(&frame.next_frame, fp, sizeof(*fp)); - if (bytes != 0) + if (__get_user(frame.next_frame, &fp->next_frame)) break; - bytes = __copy_from_user_nmi(&frame.return_address, fp + 1, sizeof(*fp)); - if (bytes != 0) + if (__get_user(frame.return_address, &fp->return_address)) break; perf_callchain_store(entry, frame.return_address); @@ -2619,18 +2963,19 @@ static unsigned long code_segment_base(struct pt_regs *regs) unsigned long perf_instruction_pointer(struct pt_regs *regs) { - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) - return perf_guest_cbs->get_guest_ip(); + if (perf_guest_state()) + return perf_guest_get_ip(); return regs->ip + code_segment_base(regs); } unsigned long perf_misc_flags(struct pt_regs *regs) { + unsigned int guest_state = perf_guest_state(); int misc = 0; - if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { - if (perf_guest_cbs->is_user_mode()) + if (guest_state) { + if (guest_state & PERF_GUEST_USER) misc |= PERF_RECORD_MISC_GUEST_USER; else misc |= PERF_RECORD_MISC_GUEST_KERNEL; @@ -2649,12 +2994,34 @@ unsigned long perf_misc_flags(struct pt_regs *regs) void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) { + if (!x86_pmu_initialized()) { + memset(cap, 0, sizeof(*cap)); + return; + } + cap->version = x86_pmu.version; + /* + * KVM doesn't support the hybrid PMU yet. + * Return the common value in global x86_pmu, + * which available for all cores. 
+ */ cap->num_counters_gp = x86_pmu.num_counters; cap->num_counters_fixed = x86_pmu.num_counters_fixed; cap->bit_width_gp = x86_pmu.cntval_bits; cap->bit_width_fixed = x86_pmu.cntval_bits; cap->events_mask = (unsigned int)x86_pmu.events_maskl; cap->events_mask_len = x86_pmu.events_mask_len; + cap->pebs_ept = x86_pmu.pebs_ept; } EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability); + +u64 perf_get_hw_event_config(int hw_event) +{ + int max = x86_pmu.max_events; + + if (hw_event < max) + return x86_pmu.event_map(array_index_nospec(hw_event, max)); + + return 0; +} +EXPORT_SYMBOL_GPL(perf_get_hw_event_config); diff --git a/arch/x86/events/intel/Makefile b/arch/x86/events/intel/Makefile index 3468b0c1dc7c..10bde6c5abb2 100644 --- a/arch/x86/events/intel/Makefile +++ b/arch/x86/events/intel/Makefile @@ -2,9 +2,7 @@ obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o obj-$(CONFIG_CPU_SUP_INTEL) += ds.o knc.o obj-$(CONFIG_CPU_SUP_INTEL) += lbr.o p4.o p6.o pt.o -obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += intel-rapl-perf.o -intel-rapl-perf-objs := rapl.o obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += intel-uncore.o -intel-uncore-objs := uncore.o uncore_nhmex.o uncore_snb.o uncore_snbep.o +intel-uncore-objs := uncore.o uncore_nhmex.o uncore_snb.o uncore_snbep.o uncore_discovery.o obj-$(CONFIG_PERF_EVENTS_INTEL_CSTATE) += intel-cstate.o intel-cstate-objs := cstate.o diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 6a3b599ee0fe..974e917e65b2 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -58,7 +58,7 @@ struct bts_buffer { local_t head; unsigned long end; void **data_pages; - struct bts_phys buf[0]; + struct bts_phys buf[]; }; static struct pmu bts_pmu; @@ -209,6 +209,12 @@ static void bts_update(struct bts_ctx *bts) } else { local_set(&buf->data_size, head); } + + /* + * Since BTS is coherent, just add compiler barrier to ensure + * BTS updating is ordered against bts::handle::event. 
+ */ + barrier(); } static int @@ -594,7 +600,7 @@ static __init int bts_init(void) * we cannot use the user mapping since it will not be available * if we're not running the owning process. * - * With PTI we can't use the kernal map either, because its not + * With PTI we can't use the kernel map either, because its not * there when we run userspace. * * For now, disable this driver when using PTI. diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index dff6623804c2..1b92bf05fd65 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -14,6 +14,7 @@ #include <linux/slab.h> #include <linux/export.h> #include <linux/nmi.h> +#include <linux/kvm_host.h> #include <asm/cpufeature.h> #include <asm/hardirq.h> @@ -137,7 +138,7 @@ static struct event_constraint intel_ivb_event_constraints[] __read_mostly = FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ INTEL_UEVENT_CONSTRAINT(0x0148, 0x4), /* L1D_PEND_MISS.PENDING */ - INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMTPY */ + INTEL_UEVENT_CONSTRAINT(0x0279, 0xf), /* IDQ.EMPTY */ INTEL_UEVENT_CONSTRAINT(0x019c, 0xf), /* IDQ_UOPS_NOT_DELIVERED.CORE */ INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_LDM_PENDING */ INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */ @@ -181,6 +182,27 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly = EVENT_CONSTRAINT_END }; +static struct event_constraint intel_v5_gen_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */ + FIXED_EVENT_CONSTRAINT(0x0500, 4), + FIXED_EVENT_CONSTRAINT(0x0600, 5), + FIXED_EVENT_CONSTRAINT(0x0700, 6), + FIXED_EVENT_CONSTRAINT(0x0800, 7), + 
FIXED_EVENT_CONSTRAINT(0x0900, 8), + FIXED_EVENT_CONSTRAINT(0x0a00, 9), + FIXED_EVENT_CONSTRAINT(0x0b00, 10), + FIXED_EVENT_CONSTRAINT(0x0c00, 11), + FIXED_EVENT_CONSTRAINT(0x0d00, 12), + FIXED_EVENT_CONSTRAINT(0x0e00, 13), + FIXED_EVENT_CONSTRAINT(0x0f00, 14), + FIXED_EVENT_CONSTRAINT(0x1000, 15), + EVENT_CONSTRAINT_END +}; + static struct event_constraint intel_slm_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ @@ -243,21 +265,28 @@ static struct extra_reg intel_skl_extra_regs[] __read_mostly = { static struct event_constraint intel_icl_event_constraints[] = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ - INTEL_UEVENT_CONSTRAINT(0x1c0, 0), /* INST_RETIRED.PREC_DIST */ + FIXED_EVENT_CONSTRAINT(0x01c0, 0), /* old INST_RETIRED.PREC_DIST */ + FIXED_EVENT_CONSTRAINT(0x0100, 0), /* INST_RETIRED.PREC_DIST */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */ + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_RETIRING, 0), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BAD_SPEC, 1), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FE_BOUND, 2), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BE_BOUND, 3), INTEL_EVENT_CONSTRAINT_RANGE(0x03, 0x0a, 0xf), INTEL_EVENT_CONSTRAINT_RANGE(0x1f, 0x28, 0xf), INTEL_EVENT_CONSTRAINT(0x32, 0xf), /* SW_PREFETCH_ACCESS.* */ - INTEL_EVENT_CONSTRAINT_RANGE(0x48, 0x54, 0xf), + INTEL_EVENT_CONSTRAINT_RANGE(0x48, 0x56, 0xf), INTEL_EVENT_CONSTRAINT_RANGE(0x60, 0x8b, 0xf), INTEL_UEVENT_CONSTRAINT(0x04a3, 0xff), /* CYCLE_ACTIVITY.STALLS_TOTAL */ - INTEL_UEVENT_CONSTRAINT(0x10a3, 0xff), /* CYCLE_ACTIVITY.STALLS_MEM_ANY */ + INTEL_UEVENT_CONSTRAINT(0x10a3, 0xff), /* CYCLE_ACTIVITY.CYCLES_MEM_ANY */ + INTEL_UEVENT_CONSTRAINT(0x14a3, 0xff), /* CYCLE_ACTIVITY.STALLS_MEM_ANY */ INTEL_EVENT_CONSTRAINT(0xa3, 0xf), /* CYCLE_ACTIVITY.* */ INTEL_EVENT_CONSTRAINT_RANGE(0xa8, 0xb0, 0xf), 
INTEL_EVENT_CONSTRAINT_RANGE(0xb7, 0xbd, 0xf), INTEL_EVENT_CONSTRAINT_RANGE(0xd0, 0xe6, 0xf), + INTEL_EVENT_CONSTRAINT(0xef, 0xf), INTEL_EVENT_CONSTRAINT_RANGE(0xf0, 0xf4, 0xf), EVENT_CONSTRAINT_END }; @@ -270,6 +299,57 @@ static struct extra_reg intel_icl_extra_regs[] __read_mostly = { EVENT_EXTRA_END }; +static struct extra_reg intel_spr_extra_regs[] __read_mostly = { + INTEL_UEVENT_EXTRA_REG(0x012a, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x012b, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff1f, FE), + INTEL_UEVENT_EXTRA_REG(0x40ad, MSR_PEBS_FRONTEND, 0x7, FE), + INTEL_UEVENT_EXTRA_REG(0x04c2, MSR_PEBS_FRONTEND, 0x8, FE), + EVENT_EXTRA_END +}; + +static struct event_constraint intel_spr_event_constraints[] = { + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x0100, 0), /* INST_RETIRED.PREC_DIST */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */ + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_RETIRING, 0), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BAD_SPEC, 1), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FE_BOUND, 2), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BE_BOUND, 3), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_HEAVY_OPS, 4), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_BR_MISPREDICT, 5), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_FETCH_LAT, 6), + METRIC_EVENT_CONSTRAINT(INTEL_TD_METRIC_MEM_BOUND, 7), + + INTEL_EVENT_CONSTRAINT(0x2e, 0xff), + INTEL_EVENT_CONSTRAINT(0x3c, 0xff), + /* + * Generally event codes < 0x90 are restricted to counters 0-3. + * The 0x2E and 0x3C are exception, which has no restriction. 
+ */ + INTEL_EVENT_CONSTRAINT_RANGE(0x01, 0x8f, 0xf), + + INTEL_UEVENT_CONSTRAINT(0x01a3, 0xf), + INTEL_UEVENT_CONSTRAINT(0x02a3, 0xf), + INTEL_UEVENT_CONSTRAINT(0x08a3, 0xf), + INTEL_UEVENT_CONSTRAINT(0x04a4, 0x1), + INTEL_UEVENT_CONSTRAINT(0x08a4, 0x1), + INTEL_UEVENT_CONSTRAINT(0x02cd, 0x1), + INTEL_EVENT_CONSTRAINT(0xce, 0x1), + INTEL_EVENT_CONSTRAINT_RANGE(0xd0, 0xdf, 0xf), + /* + * Generally event codes >= 0x90 are likely to have no restrictions. + * The exception are defined as above. + */ + INTEL_EVENT_CONSTRAINT_RANGE(0x90, 0xfe, 0xff), + + EVENT_CONSTRAINT_END +}; + + EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2"); @@ -309,6 +389,16 @@ EVENT_ATTR_STR_HT(topdown-recovery-bubbles, td_recovery_bubbles, EVENT_ATTR_STR_HT(topdown-recovery-bubbles.scale, td_recovery_bubbles_scale, "4", "2"); +EVENT_ATTR_STR(slots, slots, "event=0x00,umask=0x4"); +EVENT_ATTR_STR(topdown-retiring, td_retiring, "event=0x00,umask=0x80"); +EVENT_ATTR_STR(topdown-bad-spec, td_bad_spec, "event=0x00,umask=0x81"); +EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound, "event=0x00,umask=0x82"); +EVENT_ATTR_STR(topdown-be-bound, td_be_bound, "event=0x00,umask=0x83"); +EVENT_ATTR_STR(topdown-heavy-ops, td_heavy_ops, "event=0x00,umask=0x84"); +EVENT_ATTR_STR(topdown-br-mispredict, td_br_mispredict, "event=0x00,umask=0x85"); +EVENT_ATTR_STR(topdown-fetch-lat, td_fetch_lat, "event=0x00,umask=0x86"); +EVENT_ATTR_STR(topdown-mem-bound, td_mem_bound, "event=0x00,umask=0x87"); + static struct attribute *snb_events_attrs[] = { EVENT_PTR(td_slots_issued), EVENT_PTR(td_slots_retired), @@ -373,6 +463,108 @@ static u64 intel_pmu_event_map(int hw_event) return intel_perfmon_event_map[hw_event]; } +static __initconst const u64 spr_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ 
C(L1D ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, + [ C(RESULT_MISS) ] = 0xe124, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_MISS) ] = 0xe424, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x12a, + [ C(RESULT_MISS) ] = 0x12a, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x12a, + [ C(RESULT_MISS) ] = 0x12a, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, + [ C(RESULT_MISS) ] = 0xe12, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, + [ C(RESULT_MISS) ] = 0xe13, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = 0xe11, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4c4, + [ C(RESULT_MISS) ] = 0x4c5, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x12a, + [ C(RESULT_MISS) ] = 0x12a, + }, + }, +}; + +static __initconst const u64 spr_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x10001, + [ C(RESULT_MISS) ] = 0x3fbfc00001, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x3f3ffc0002, + [ C(RESULT_MISS) ] = 0x3f3fc00002, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x10c000001, + [ C(RESULT_MISS) ] = 0x3fb3000001, + }, + }, +}; + /* * Notes on the events: * - data reads do not include code reads (comparable to 
earlier tables) @@ -1890,10 +2082,40 @@ static __initconst const u64 tnt_hw_cache_extra_regs }, }; +EVENT_ATTR_STR(topdown-fe-bound, td_fe_bound_tnt, "event=0x71,umask=0x0"); +EVENT_ATTR_STR(topdown-retiring, td_retiring_tnt, "event=0xc2,umask=0x0"); +EVENT_ATTR_STR(topdown-bad-spec, td_bad_spec_tnt, "event=0x73,umask=0x6"); +EVENT_ATTR_STR(topdown-be-bound, td_be_bound_tnt, "event=0x74,umask=0x0"); + +static struct attribute *tnt_events_attrs[] = { + EVENT_PTR(td_fe_bound_tnt), + EVENT_PTR(td_retiring_tnt), + EVENT_PTR(td_bad_spec_tnt), + EVENT_PTR(td_be_bound_tnt), + NULL, +}; + static struct extra_reg intel_tnt_extra_regs[] __read_mostly = { /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ - INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffffff9fffull, RSP_0), - INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0xffffff9fffull, RSP_1), + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x800ff0ffffff9fffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0xff0ffffff9fffull, RSP_1), + EVENT_EXTRA_END +}; + +EVENT_ATTR_STR(mem-loads, mem_ld_grt, "event=0xd0,umask=0x5,ldlat=3"); +EVENT_ATTR_STR(mem-stores, mem_st_grt, "event=0xd0,umask=0x6"); + +static struct attribute *grt_mem_attrs[] = { + EVENT_PTR(mem_ld_grt), + EVENT_PTR(mem_st_grt), + NULL +}; + +static struct extra_reg intel_grt_extra_regs[] __read_mostly = { + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x5d0), EVENT_EXTRA_END }; @@ -1945,33 +2167,46 @@ static __initconst const u64 knl_hw_cache_extra_regs * intel_bts events don't coexist with intel PMU's BTS events because of * x86_add_exclusive(x86_lbr_exclusive_lbr); there's no need to keep them * disabled around intel PMU's event batching etc, only inside the PMI handler. + * + * Avoid PEBS_ENABLE MSR access in PMIs. 
+ * The GLOBAL_CTRL has been disabled. All the counters do not count anymore. + * It doesn't matter if the PEBS is enabled or not. + * Usually, the PEBS status are not changed in PMIs. It's unnecessary to + * access PEBS_ENABLE MSR in disable_all()/enable_all(). + * However, there are some cases which may change PEBS status, e.g. PMI + * throttle. The PEBS_ENABLE should be updated where the status changes. */ -static void __intel_pmu_disable_all(void) +static __always_inline void __intel_pmu_disable_all(bool bts) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); - if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) + if (bts && test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) intel_pmu_disable_bts(); - - intel_pmu_pebs_disable_all(); } -static void intel_pmu_disable_all(void) +static __always_inline void intel_pmu_disable_all(void) { - __intel_pmu_disable_all(); + __intel_pmu_disable_all(true); + intel_pmu_pebs_disable_all(); intel_pmu_lbr_disable_all(); } static void __intel_pmu_enable_all(int added, bool pmi) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl); - intel_pmu_pebs_enable_all(); intel_pmu_lbr_enable_all(pmi); + + if (cpuc->fixed_ctrl_val != cpuc->active_fixed_ctrl_val) { + wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, cpuc->fixed_ctrl_val); + cpuc->active_fixed_ctrl_val = cpuc->fixed_ctrl_val; + } + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, - x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask); + intel_ctrl & ~cpuc->intel_ctrl_guest_mask); if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { struct perf_event *event = @@ -1986,9 +2221,51 @@ static void __intel_pmu_enable_all(int added, bool pmi) static void intel_pmu_enable_all(int added) { + intel_pmu_pebs_enable_all(); __intel_pmu_enable_all(added, false); } +static noinline int +__intel_pmu_snapshot_branch_stack(struct perf_branch_entry *entries, + unsigned int cnt, unsigned long flags) +{ 
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + intel_pmu_lbr_read(); + cnt = min_t(unsigned int, cnt, x86_pmu.lbr_nr); + + memcpy(entries, cpuc->lbr_entries, sizeof(struct perf_branch_entry) * cnt); + intel_pmu_enable_all(0); + local_irq_restore(flags); + return cnt; +} + +static int +intel_pmu_snapshot_branch_stack(struct perf_branch_entry *entries, unsigned int cnt) +{ + unsigned long flags; + + /* must not have branches... */ + local_irq_save(flags); + __intel_pmu_disable_all(false); /* we don't care about BTS */ + __intel_pmu_lbr_disable(); + /* ... until here */ + return __intel_pmu_snapshot_branch_stack(entries, cnt, flags); +} + +static int +intel_pmu_snapshot_arch_branch_stack(struct perf_branch_entry *entries, unsigned int cnt) +{ + unsigned long flags; + + /* must not have branches... */ + local_irq_save(flags); + __intel_pmu_disable_all(false); /* we don't care about BTS */ + __intel_pmu_arch_lbr_disable(); + /* ... until here */ + return __intel_pmu_snapshot_branch_stack(entries, cnt, flags); +} + /* * Workaround for: * Intel Errata AAK100 (model 26) @@ -2000,7 +2277,7 @@ static void intel_pmu_enable_all(int added) * magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either * in sequence on the same PMC or on different PMCs. * - * In practise it appears some of these events do in fact count, and + * In practice it appears some of these events do in fact count, and * we need to program all 4 events. 
*/ static void intel_pmu_nhm_workaround(void) @@ -2040,7 +2317,7 @@ static void intel_pmu_nhm_workaround(void) for (i = 0; i < 4; i++) { event = cpuc->events[i]; if (event) - x86_perf_event_update(event); + static_call(x86_pmu_update)(event); } for (i = 0; i < 4; i++) { @@ -2055,7 +2332,7 @@ static void intel_pmu_nhm_workaround(void) event = cpuc->events[i]; if (event) { - x86_perf_event_set_period(event); + static_call(x86_pmu_set_period)(event); __x86_pmu_enable_event(&event->hw, ARCH_PERFMON_EVENTSEL_ENABLE); } else @@ -2103,18 +2380,6 @@ static void intel_tfa_pmu_enable_all(int added) intel_pmu_enable_all(added); } -static void enable_counter_freeze(void) -{ - update_debugctlmsr(get_debugctlmsr() | - DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI); -} - -static void disable_counter_freeze(void) -{ - update_debugctlmsr(get_debugctlmsr() & - ~DEBUGCTLMSR_FREEZE_PERFMON_ON_PMI); -} - static inline u64 intel_pmu_get_status(void) { u64 status; @@ -2129,43 +2394,85 @@ static inline void intel_pmu_ack_status(u64 ack) wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); } -static void intel_pmu_disable_fixed(struct hw_perf_event *hwc) +static inline bool event_is_checkpointed(struct perf_event *event) { - int idx = hwc->idx - INTEL_PMC_IDX_FIXED; - u64 ctrl_val, mask; + return unlikely(event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0; +} - mask = 0xfULL << (idx * 4); +static inline void intel_set_masks(struct perf_event *event, int idx) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - rdmsrl(hwc->config_base, ctrl_val); - ctrl_val &= ~mask; - wrmsrl(hwc->config_base, ctrl_val); + if (event->attr.exclude_host) + __set_bit(idx, (unsigned long *)&cpuc->intel_ctrl_guest_mask); + if (event->attr.exclude_guest) + __set_bit(idx, (unsigned long *)&cpuc->intel_ctrl_host_mask); + if (event_is_checkpointed(event)) + __set_bit(idx, (unsigned long *)&cpuc->intel_cp_status); } -static inline bool event_is_checkpointed(struct perf_event *event) +static inline void intel_clear_masks(struct 
perf_event *event, int idx) { - return (event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0; + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + __clear_bit(idx, (unsigned long *)&cpuc->intel_ctrl_guest_mask); + __clear_bit(idx, (unsigned long *)&cpuc->intel_ctrl_host_mask); + __clear_bit(idx, (unsigned long *)&cpuc->intel_cp_status); +} + +static void intel_pmu_disable_fixed(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + u64 mask; + + if (is_topdown_idx(idx)) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + /* + * When there are other active TopDown events, + * don't disable the fixed counter 3. + */ + if (*(u64 *)cpuc->active_mask & INTEL_PMC_OTHER_TOPDOWN_BITS(idx)) + return; + idx = INTEL_PMC_IDX_FIXED_SLOTS; + } + + intel_clear_masks(event, idx); + + mask = 0xfULL << ((idx - INTEL_PMC_IDX_FIXED) * 4); + cpuc->fixed_ctrl_val &= ~mask; } static void intel_pmu_disable_event(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int idx = hwc->idx; - if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { + switch (idx) { + case 0 ... INTEL_PMC_IDX_FIXED - 1: + intel_clear_masks(event, idx); + x86_pmu_disable_event(event); + break; + case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1: + case INTEL_PMC_IDX_METRIC_BASE ... 
INTEL_PMC_IDX_METRIC_END: + intel_pmu_disable_fixed(event); + break; + case INTEL_PMC_IDX_FIXED_BTS: intel_pmu_disable_bts(); intel_pmu_drain_bts_buffer(); return; + case INTEL_PMC_IDX_FIXED_VLBR: + intel_clear_masks(event, idx); + break; + default: + intel_clear_masks(event, idx); + pr_warn("Failed to disable the event with invalid index %d\n", + idx); + return; } - cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx); - cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx); - cpuc->intel_cp_status &= ~(1ull << hwc->idx); - - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) - intel_pmu_disable_fixed(hwc); - else - x86_pmu_disable_event(event); - /* * Needs to be called after x86_pmu_disable_event, * so we don't trigger the event without PEBS bit set. @@ -2174,6 +2481,12 @@ static void intel_pmu_disable_event(struct perf_event *event) intel_pmu_pebs_disable(event); } +static void intel_pmu_assign_event(struct perf_event *event, int idx) +{ + if (is_pebs_pt(event)) + perf_report_aux_output_id(event, idx); +} + static void intel_pmu_del_event(struct perf_event *event) { if (needs_branch_stack(event)) @@ -2182,19 +2495,244 @@ static void intel_pmu_del_event(struct perf_event *event) intel_pmu_pebs_del(event); } +static int icl_set_topdown_event_period(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + s64 left = local64_read(&hwc->period_left); + + /* + * The values in PERF_METRICS MSR are derived from fixed counter 3. + * Software should start both registers, PERF_METRICS and fixed + * counter 3, from zero. + * Clear PERF_METRICS and Fixed counter 3 in initialization. + * After that, both MSRs will be cleared for each read. + * Don't need to clear them again. 
+ */ + if (left == x86_pmu.max_period) { + wrmsrl(MSR_CORE_PERF_FIXED_CTR3, 0); + wrmsrl(MSR_PERF_METRICS, 0); + hwc->saved_slots = 0; + hwc->saved_metric = 0; + } + + if ((hwc->saved_slots) && is_slots_event(event)) { + wrmsrl(MSR_CORE_PERF_FIXED_CTR3, hwc->saved_slots); + wrmsrl(MSR_PERF_METRICS, hwc->saved_metric); + } + + perf_event_update_userpage(event); + + return 0; +} + +static int adl_set_topdown_event_period(struct perf_event *event) +{ + struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu); + + if (pmu->cpu_type != hybrid_big) + return 0; + + return icl_set_topdown_event_period(event); +} + +DEFINE_STATIC_CALL(intel_pmu_set_topdown_event_period, x86_perf_event_set_period); + +static inline u64 icl_get_metrics_event_value(u64 metric, u64 slots, int idx) +{ + u32 val; + + /* + * The metric is reported as an 8bit integer fraction + * summing up to 0xff. + * slots-in-metric = (Metric / 0xff) * slots + */ + val = (metric >> ((idx - INTEL_PMC_IDX_METRIC_BASE) * 8)) & 0xff; + return mul_u64_u32_div(slots, val, 0xff); +} + +static u64 icl_get_topdown_value(struct perf_event *event, + u64 slots, u64 metrics) +{ + int idx = event->hw.idx; + u64 delta; + + if (is_metric_idx(idx)) + delta = icl_get_metrics_event_value(metrics, slots, idx); + else + delta = slots; + + return delta; +} + +static void __icl_update_topdown_event(struct perf_event *event, + u64 slots, u64 metrics, + u64 last_slots, u64 last_metrics) +{ + u64 delta, last = 0; + + delta = icl_get_topdown_value(event, slots, metrics); + if (last_slots) + last = icl_get_topdown_value(event, last_slots, last_metrics); + + /* + * The 8bit integer fraction of metric may be not accurate, + * especially when the changes is very small. + * For example, if only a few bad_spec happens, the fraction + * may be reduced from 1 to 0. If so, the bad_spec event value + * will be 0 which is definitely less than the last value. + * Avoid update event->count for this case. 
+ */ + if (delta > last) { + delta -= last; + local64_add(delta, &event->count); + } +} + +static void update_saved_topdown_regs(struct perf_event *event, u64 slots, + u64 metrics, int metric_end) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_event *other; + int idx; + + event->hw.saved_slots = slots; + event->hw.saved_metric = metrics; + + for_each_set_bit(idx, cpuc->active_mask, metric_end + 1) { + if (!is_topdown_idx(idx)) + continue; + other = cpuc->events[idx]; + other->hw.saved_slots = slots; + other->hw.saved_metric = metrics; + } +} + +/* + * Update all active Topdown events. + * + * The PERF_METRICS and Fixed counter 3 are read separately. The values may be + * modify by a NMI. PMU has to be disabled before calling this function. + */ + +static u64 intel_update_topdown_event(struct perf_event *event, int metric_end) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_event *other; + u64 slots, metrics; + bool reset = true; + int idx; + + /* read Fixed counter 3 */ + rdpmcl((3 | INTEL_PMC_FIXED_RDPMC_BASE), slots); + if (!slots) + return 0; + + /* read PERF_METRICS */ + rdpmcl(INTEL_PMC_FIXED_RDPMC_METRICS, metrics); + + for_each_set_bit(idx, cpuc->active_mask, metric_end + 1) { + if (!is_topdown_idx(idx)) + continue; + other = cpuc->events[idx]; + __icl_update_topdown_event(other, slots, metrics, + event ? event->hw.saved_slots : 0, + event ? event->hw.saved_metric : 0); + } + + /* + * Check and update this event, which may have been cleared + * in active_mask e.g. x86_pmu_stop() + */ + if (event && !test_bit(event->hw.idx, cpuc->active_mask)) { + __icl_update_topdown_event(event, slots, metrics, + event->hw.saved_slots, + event->hw.saved_metric); + + /* + * In x86_pmu_stop(), the event is cleared in active_mask first, + * then drain the delta, which indicates context switch for + * counting. + * Save metric and slots for context switch. + * Don't need to reset the PERF_METRICS and Fixed counter 3. 
+ * Because the values will be restored in next schedule in. + */ + update_saved_topdown_regs(event, slots, metrics, metric_end); + reset = false; + } + + if (reset) { + /* The fixed counter 3 has to be written before the PERF_METRICS. */ + wrmsrl(MSR_CORE_PERF_FIXED_CTR3, 0); + wrmsrl(MSR_PERF_METRICS, 0); + if (event) + update_saved_topdown_regs(event, 0, 0, metric_end); + } + + return slots; +} + +static u64 icl_update_topdown_event(struct perf_event *event) +{ + return intel_update_topdown_event(event, INTEL_PMC_IDX_METRIC_BASE + + x86_pmu.num_topdown_events - 1); +} + +static u64 adl_update_topdown_event(struct perf_event *event) +{ + struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu); + + if (pmu->cpu_type != hybrid_big) + return 0; + + return icl_update_topdown_event(event); +} + +DEFINE_STATIC_CALL(intel_pmu_update_topdown_event, x86_perf_event_update); + +static void intel_pmu_read_topdown_event(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + /* Only need to call update_topdown_event() once for group read. 
*/ + if ((cpuc->txn_flags & PERF_PMU_TXN_READ) && + !is_slots_event(event)) + return; + + perf_pmu_disable(event->pmu); + static_call(intel_pmu_update_topdown_event)(event); + perf_pmu_enable(event->pmu); +} + static void intel_pmu_read_event(struct perf_event *event) { if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) intel_pmu_auto_reload_read(event); + else if (is_topdown_count(event)) + intel_pmu_read_topdown_event(event); else x86_perf_event_update(event); } static void intel_pmu_enable_fixed(struct perf_event *event) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - int idx = hwc->idx - INTEL_PMC_IDX_FIXED; - u64 ctrl_val, mask, bits = 0; + u64 mask, bits = 0; + int idx = hwc->idx; + + if (is_topdown_idx(idx)) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + /* + * When there are other active TopDown events, + * don't enable the fixed counter 3 again. + */ + if (*(u64 *)cpuc->active_mask & INTEL_PMC_OTHER_TOPDOWN_BITS(idx)) + return; + + idx = INTEL_PMC_IDX_FIXED_SLOTS; + } + + intel_set_masks(event, idx); /* * Enable IRQ generation (0x8), if not PEBS, @@ -2214,6 +2752,7 @@ static void intel_pmu_enable_fixed(struct perf_event *event) if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY) bits |= 0x4; + idx -= INTEL_PMC_IDX_FIXED; bits <<= (idx * 4); mask = 0xfULL << (idx * 4); @@ -2222,42 +2761,39 @@ static void intel_pmu_enable_fixed(struct perf_event *event) mask |= ICL_FIXED_0_ADAPTIVE << (idx * 4); } - rdmsrl(hwc->config_base, ctrl_val); - ctrl_val &= ~mask; - ctrl_val |= bits; - wrmsrl(hwc->config_base, ctrl_val); + cpuc->fixed_ctrl_val &= ~mask; + cpuc->fixed_ctrl_val |= bits; } static void intel_pmu_enable_event(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - - if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { - if (!__this_cpu_read(cpu_hw_events.enabled)) - return; - - 
intel_pmu_enable_bts(hwc->config); - return; - } - - if (event->attr.exclude_host) - cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx); - if (event->attr.exclude_guest) - cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx); - - if (unlikely(event_is_checkpointed(event))) - cpuc->intel_cp_status |= (1ull << hwc->idx); + int idx = hwc->idx; if (unlikely(event->attr.precise_ip)) intel_pmu_pebs_enable(event); - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + switch (idx) { + case 0 ... INTEL_PMC_IDX_FIXED - 1: + intel_set_masks(event, idx); + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); + break; + case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS - 1: + case INTEL_PMC_IDX_METRIC_BASE ... INTEL_PMC_IDX_METRIC_END: intel_pmu_enable_fixed(event); - return; + break; + case INTEL_PMC_IDX_FIXED_BTS: + if (!__this_cpu_read(cpu_hw_events.enabled)) + return; + intel_pmu_enable_bts(hwc->config); + break; + case INTEL_PMC_IDX_FIXED_VLBR: + intel_set_masks(event, idx); + break; + default: + pr_warn("Failed to enable the event with invalid index %d\n", + idx); } - - __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); } static void intel_pmu_add_event(struct perf_event *event) @@ -2274,7 +2810,7 @@ static void intel_pmu_add_event(struct perf_event *event) */ int intel_pmu_save_and_restart(struct perf_event *event) { - x86_perf_event_update(event); + static_call(x86_pmu_update)(event); /* * For a checkpointed counter always reset back to 0. 
This * avoids a situation where the counter overflows, aborts the @@ -2286,28 +2822,50 @@ int intel_pmu_save_and_restart(struct perf_event *event) wrmsrl(event->hw.event_base, 0); local64_set(&event->hw.prev_count, 0); } + return static_call(x86_pmu_set_period)(event); +} + +static int intel_pmu_set_period(struct perf_event *event) +{ + if (unlikely(is_topdown_count(event))) + return static_call(intel_pmu_set_topdown_event_period)(event); + return x86_perf_event_set_period(event); } +static u64 intel_pmu_update(struct perf_event *event) +{ + if (unlikely(is_topdown_count(event))) + return static_call(intel_pmu_update_topdown_event)(event); + + return x86_perf_event_update(event); +} + static void intel_pmu_reset(void) { struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed); + int num_counters = hybrid(cpuc->pmu, num_counters); unsigned long flags; int idx; - if (!x86_pmu.num_counters) + if (!num_counters) return; local_irq_save(flags); pr_info("clearing PMU state on CPU#%d\n", smp_processor_id()); - for (idx = 0; idx < x86_pmu.num_counters; idx++) { + for (idx = 0; idx < num_counters; idx++) { wrmsrl_safe(x86_pmu_config_addr(idx), 0ull); wrmsrl_safe(x86_pmu_event_addr(idx), 0ull); } - for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) + for (idx = 0; idx < num_counters_fixed; idx++) { + if (fixed_counter_disabled(idx, cpuc->pmu)) + continue; wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + } if (ds) ds->bts_index = ds->bts_buffer_base; @@ -2327,12 +2885,54 @@ static void intel_pmu_reset(void) local_irq_restore(flags); } +/* + * We may be running with guest PEBS events created by KVM, and the + * PEBS records are logged into the guest's DS and invisible to host. + * + * In the case of guest PEBS overflow, we only trigger a fake event + * to emulate the PEBS overflow PMI for guest PEBS counters in KVM. 
+ * The guest will then vm-entry and check the guest DS area to read + * the guest PEBS records. + * + * The contents and other behavior of the guest event do not matter. + */ +static void x86_pmu_handle_guest_pebs(struct pt_regs *regs, + struct perf_sample_data *data) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 guest_pebs_idxs = cpuc->pebs_enabled & ~cpuc->intel_ctrl_host_mask; + struct perf_event *event = NULL; + int bit; + + if (!unlikely(perf_guest_state())) + return; + + if (!x86_pmu.pebs_ept || !x86_pmu.pebs_active || + !guest_pebs_idxs) + return; + + for_each_set_bit(bit, (unsigned long *)&guest_pebs_idxs, + INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed) { + event = cpuc->events[bit]; + if (!event->attr.precise_ip) + continue; + + perf_sample_data_init(data, 0, event->hw.last_period); + if (perf_event_overflow(event, data, regs)) + x86_pmu_stop(event, 0); + + /* Inject one fake event is enough. */ + break; + } +} + static int handle_pmi_common(struct pt_regs *regs, u64 status) { struct perf_sample_data data; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int bit; int handled = 0; + u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl); inc_irq_stat(apic_perf_irqs); @@ -2356,7 +2956,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) * processing loop coming after that the function, otherwise * phony regular samples may be generated in the sampling buffer * not marked with the EXACT tag. Another possibility is to have - * one PEBS event and at least one non-PEBS event whic hoverflows + * one PEBS event and at least one non-PEBS event which overflows * while PEBS has armed. In this case, bit 62 of GLOBAL_STATUS will * not be set, yet the overflow status bit for the PEBS counter will * be on Skylake. @@ -2365,33 +2965,48 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) * counters from the GLOBAL_STATUS mask and we always process PEBS * events via drain_pebs(). 
*/ - if (x86_pmu.flags & PMU_FL_PEBS_ALL) - status &= ~cpuc->pebs_enabled; - else - status &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK); + status &= ~(cpuc->pebs_enabled & x86_pmu.pebs_capable); /* * PEBS overflow sets bit 62 in the global status register */ - if (__test_and_clear_bit(62, (unsigned long *)&status)) { + if (__test_and_clear_bit(GLOBAL_STATUS_BUFFER_OVF_BIT, (unsigned long *)&status)) { + u64 pebs_enabled = cpuc->pebs_enabled; + handled++; - x86_pmu.drain_pebs(regs); - status &= x86_pmu.intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI; + x86_pmu_handle_guest_pebs(regs, &data); + x86_pmu.drain_pebs(regs, &data); + status &= intel_ctrl | GLOBAL_STATUS_TRACE_TOPAPMI; + + /* + * PMI throttle may be triggered, which stops the PEBS event. + * Although cpuc->pebs_enabled is updated accordingly, the + * MSR_IA32_PEBS_ENABLE is not updated. Because the + * cpuc->enabled has been forced to 0 in PMI. + * Update the MSR if pebs_enabled is changed. + */ + if (pebs_enabled != cpuc->pebs_enabled) + wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); } /* * Intel PT */ - if (__test_and_clear_bit(55, (unsigned long *)&status)) { + if (__test_and_clear_bit(GLOBAL_STATUS_TRACE_TOPAPMI_BIT, (unsigned long *)&status)) { handled++; - if (unlikely(perf_guest_cbs && perf_guest_cbs->is_in_guest() && - perf_guest_cbs->handle_intel_pt_intr)) - perf_guest_cbs->handle_intel_pt_intr(); - else + if (!perf_guest_handle_intel_pt_intr()) intel_pt_interrupt(); } /* + * Intel Perf metrics + */ + if (__test_and_clear_bit(GLOBAL_STATUS_PERF_METRICS_OVF_BIT, (unsigned long *)&status)) { + handled++; + static_call(intel_pmu_update_topdown_event)(NULL); + } + + /* * Checkpointed counters can lead to 'spurious' PMIs because the * rollback caused by the PMI will have cleared the overflow status * bit. Therefore always force probe these counters. 
@@ -2411,8 +3026,10 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) perf_sample_data_init(&data, 0, event->hw.last_period); - if (has_branch_stack(event)) + if (has_branch_stack(event)) { data.br_stack = &cpuc->lbr_stack; + data.sample_flags |= PERF_SAMPLE_BRANCH_STACK; + } if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); @@ -2421,123 +3038,38 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) return handled; } -static bool disable_counter_freezing = true; -static int __init intel_perf_counter_freezing_setup(char *s) -{ - bool res; - - if (kstrtobool(s, &res)) - return -EINVAL; - - disable_counter_freezing = !res; - return 1; -} -__setup("perf_v4_pmi=", intel_perf_counter_freezing_setup); - -/* - * Simplified handler for Arch Perfmon v4: - * - We rely on counter freezing/unfreezing to enable/disable the PMU. - * This is done automatically on PMU ack. - * - Ack the PMU only after the APIC. - */ - -static int intel_pmu_handle_irq_v4(struct pt_regs *regs) -{ - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int handled = 0; - bool bts = false; - u64 status; - int pmu_enabled = cpuc->enabled; - int loops = 0; - - /* PMU has been disabled because of counter freezing */ - cpuc->enabled = 0; - if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { - bts = true; - intel_bts_disable_local(); - handled = intel_pmu_drain_bts_buffer(); - handled += intel_bts_interrupt(); - } - status = intel_pmu_get_status(); - if (!status) - goto done; -again: - intel_pmu_lbr_read(); - if (++loops > 100) { - static bool warned; - - if (!warned) { - WARN(1, "perfevents: irq loop stuck!\n"); - perf_event_print_debug(); - warned = true; - } - intel_pmu_reset(); - goto done; - } - - - handled += handle_pmi_common(regs, status); -done: - /* Ack the PMI in the APIC */ - apic_write(APIC_LVTPC, APIC_DM_NMI); - - /* - * The counters start counting immediately while ack the status. - * Make it as close as possible to IRET. 
This avoids bogus - * freezing on Skylake CPUs. - */ - if (status) { - intel_pmu_ack_status(status); - } else { - /* - * CPU may issues two PMIs very close to each other. - * When the PMI handler services the first one, the - * GLOBAL_STATUS is already updated to reflect both. - * When it IRETs, the second PMI is immediately - * handled and it sees clear status. At the meantime, - * there may be a third PMI, because the freezing bit - * isn't set since the ack in first PMI handlers. - * Double check if there is more work to be done. - */ - status = intel_pmu_get_status(); - if (status) - goto again; - } - - if (bts) - intel_bts_enable_local(); - cpuc->enabled = pmu_enabled; - return handled; -} - /* * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: */ static int intel_pmu_handle_irq(struct pt_regs *regs) { - struct cpu_hw_events *cpuc; + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + bool late_ack = hybrid_bit(cpuc->pmu, late_ack); + bool mid_ack = hybrid_bit(cpuc->pmu, mid_ack); int loops; u64 status; int handled; int pmu_enabled; - cpuc = this_cpu_ptr(&cpu_hw_events); - /* * Save the PMU state. * It needs to be restored when leaving the handler. */ pmu_enabled = cpuc->enabled; /* - * No known reason to not always do late ACK, - * but just in case do it opt-in. + * In general, the early ACK is only applied for old platforms. + * For the big core starts from Haswell, the late ACK should be + * applied. + * For the small core after Tremont, we have to do the ACK right + * before re-enabling counters, which is in the middle of the + * NMI handler. 
*/ - if (!x86_pmu.late_ack) + if (!late_ack && !mid_ack) apic_write(APIC_LVTPC, APIC_DM_NMI); intel_bts_disable_local(); cpuc->enabled = 0; - __intel_pmu_disable_all(); + __intel_pmu_disable_all(true); handled = intel_pmu_drain_bts_buffer(); handled += intel_bts_interrupt(); status = intel_pmu_get_status(); @@ -2570,6 +3102,8 @@ again: goto again; done: + if (mid_ack) + apic_write(APIC_LVTPC, APIC_DM_NMI); /* Only restore PMU state when it's active. See x86_pmu_disable(). */ cpuc->enabled = pmu_enabled; if (pmu_enabled) @@ -2581,7 +3115,7 @@ done: * have been reset. This avoids spurious NMIs on * Haswell CPUs. */ - if (x86_pmu.late_ack) + if (late_ack) apic_write(APIC_LVTPC, APIC_DM_NMI); return handled; } @@ -2595,8 +3129,26 @@ intel_bts_constraints(struct perf_event *event) return NULL; } -static int intel_alt_er(int idx, u64 config) +/* + * Note: matches a fake event, like Fixed2. + */ +static struct event_constraint * +intel_vlbr_constraints(struct perf_event *event) +{ + struct event_constraint *c = &vlbr_constraint; + + if (unlikely(constraint_match(c, event->hw.config))) { + event->hw.flags |= c->flags; + return c; + } + + return NULL; +} + +static int intel_alt_er(struct cpu_hw_events *cpuc, + int idx, u64 config) { + struct extra_reg *extra_regs = hybrid(cpuc->pmu, extra_regs); int alt_idx = idx; if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1)) @@ -2608,7 +3160,7 @@ static int intel_alt_er(int idx, u64 config) if (idx == EXTRA_REG_RSP_1) alt_idx = EXTRA_REG_RSP_0; - if (config & ~x86_pmu.extra_regs[alt_idx].valid_mask) + if (config & ~extra_regs[alt_idx].valid_mask) return idx; return alt_idx; @@ -2616,15 +3168,16 @@ static int intel_alt_er(int idx, u64 config) static void intel_fixup_er(struct perf_event *event, int idx) { + struct extra_reg *extra_regs = hybrid(event->pmu, extra_regs); event->hw.extra_reg.idx = idx; if (idx == EXTRA_REG_RSP_0) { event->hw.config &= ~INTEL_ARCH_EVENT_MASK; - event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_0].event; + 
event->hw.config |= extra_regs[EXTRA_REG_RSP_0].event; event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0; } else if (idx == EXTRA_REG_RSP_1) { event->hw.config &= ~INTEL_ARCH_EVENT_MASK; - event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_1].event; + event->hw.config |= extra_regs[EXTRA_REG_RSP_1].event; event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; } } @@ -2700,7 +3253,7 @@ again: */ c = NULL; } else { - idx = intel_alt_er(idx, reg->config); + idx = intel_alt_er(cpuc, idx, reg->config); if (idx != reg->idx) { raw_spin_unlock_irqrestore(&era->lock, flags); goto again; @@ -2765,10 +3318,11 @@ struct event_constraint * x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event) { + struct event_constraint *event_constraints = hybrid(cpuc->pmu, event_constraints); struct event_constraint *c; - if (x86_pmu.event_constraints) { - for_each_event_constraint(c, x86_pmu.event_constraints) { + if (event_constraints) { + for_each_event_constraint(c, event_constraints) { if (constraint_match(c, event->hw.config)) { event->hw.flags |= c->flags; return c; @@ -2776,7 +3330,7 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx, } } - return &unconstrained; + return &hybrid_var(cpuc->pmu, unconstrained); } static struct event_constraint * @@ -2785,6 +3339,10 @@ __intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, { struct event_constraint *c; + c = intel_vlbr_constraints(event); + if (c) + return c; + c = intel_bts_constraints(event); if (c) return c; @@ -3257,6 +3815,43 @@ static int core_pmu_hw_config(struct perf_event *event) return intel_pmu_bts_config(event); } +#define INTEL_TD_METRIC_AVAILABLE_MAX (INTEL_TD_METRIC_RETIRING + \ + ((x86_pmu.num_topdown_events - 1) << 8)) + +static bool is_available_metric_event(struct perf_event *event) +{ + return is_metric_event(event) && + event->attr.config <= INTEL_TD_METRIC_AVAILABLE_MAX; +} + +static inline bool is_mem_loads_event(struct perf_event *event) +{ + return 
(event->attr.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0xcd, .umask=0x01); +} + +static inline bool is_mem_loads_aux_event(struct perf_event *event) +{ + return (event->attr.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0x03, .umask=0x82); +} + +static inline bool require_mem_loads_aux_event(struct perf_event *event) +{ + if (!(x86_pmu.flags & PMU_FL_MEM_LOADS_AUX)) + return false; + + if (is_hybrid()) + return hybrid_pmu(event->pmu)->cpu_type == hybrid_big; + + return true; +} + +static inline bool intel_pmu_has_cap(struct perf_event *event, int idx) +{ + union perf_capabilities *intel_cap = &hybrid(event->pmu, intel_cap); + + return test_bit(idx, (unsigned long *)&intel_cap->capabilities); +} + static int intel_pmu_hw_config(struct perf_event *event) { int ret = x86_pmu_hw_config(event); @@ -3269,23 +3864,26 @@ static int intel_pmu_hw_config(struct perf_event *event) return ret; if (event->attr.precise_ip) { + if ((event->attr.config & INTEL_ARCH_EVENT_MASK) == INTEL_FIXED_VLBR_EVENT) + return -EINVAL; + if (!(event->attr.freq || (event->attr.wakeup_events && !event->attr.watermark))) { event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; if (!(event->attr.sample_type & - ~intel_pmu_large_pebs_flags(event))) + ~intel_pmu_large_pebs_flags(event))) { event->hw.flags |= PERF_X86_EVENT_LARGE_PEBS; + event->attach_state |= PERF_ATTACH_SCHED_CB; + } } if (x86_pmu.pebs_aliases) x86_pmu.pebs_aliases(event); - - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY; } if (needs_branch_stack(event)) { ret = intel_pmu_setup_lbr_filter(event); if (ret) return ret; + event->attach_state |= PERF_ATTACH_SCHED_CB; /* * BTS is set up earlier in this path, so don't account twice @@ -3306,9 +3904,87 @@ static int intel_pmu_hw_config(struct perf_event *event) event->hw.flags |= PERF_X86_EVENT_PEBS_VIA_PT; } - if (event->attr.type != PERF_TYPE_RAW) + if ((event->attr.type == PERF_TYPE_HARDWARE) || + 
(event->attr.type == PERF_TYPE_HW_CACHE)) return 0; + /* + * Config Topdown slots and metric events + * + * The slots event on Fixed Counter 3 can support sampling, + * which will be handled normally in x86_perf_event_update(). + * + * Metric events don't support sampling and require being paired + * with a slots event as group leader. When the slots event + * is used in a metrics group, it too cannot support sampling. + */ + if (intel_pmu_has_cap(event, PERF_CAP_METRICS_IDX) && is_topdown_event(event)) { + if (event->attr.config1 || event->attr.config2) + return -EINVAL; + + /* + * The TopDown metrics events and slots event don't + * support any filters. + */ + if (event->attr.config & X86_ALL_EVENT_FLAGS) + return -EINVAL; + + if (is_available_metric_event(event)) { + struct perf_event *leader = event->group_leader; + + /* The metric events don't support sampling. */ + if (is_sampling_event(event)) + return -EINVAL; + + /* The metric events require a slots group leader. */ + if (!is_slots_event(leader)) + return -EINVAL; + + /* + * The leader/SLOTS must not be a sampling event for + * metric use; hardware requires it starts at 0 when used + * in conjunction with MSR_PERF_METRICS. + */ + if (is_sampling_event(leader)) + return -EINVAL; + + event->event_caps |= PERF_EV_CAP_SIBLING; + /* + * Only once we have a METRICs sibling do we + * need TopDown magic. + */ + leader->hw.flags |= PERF_X86_EVENT_TOPDOWN; + event->hw.flags |= PERF_X86_EVENT_TOPDOWN; + } + } + + /* + * The load latency event X86_CONFIG(.event=0xcd, .umask=0x01) on SPR + * doesn't function quite right. As a work-around it needs to always be + * co-scheduled with a auxiliary event X86_CONFIG(.event=0x03, .umask=0x82). + * The actual count of this second event is irrelevant it just needs + * to be active to make the first event function correctly. + * + * In a group, the auxiliary event must be in front of the load latency + * event. The rule is to simplify the implementation of the check. 
+ * That's because perf cannot have a complete group at the moment. + */ + if (require_mem_loads_aux_event(event) && + (event->attr.sample_type & PERF_SAMPLE_DATA_SRC) && + is_mem_loads_event(event)) { + struct perf_event *leader = event->group_leader; + struct perf_event *sibling = NULL; + + if (!is_mem_loads_aux_event(leader)) { + for_each_sibling_event(sibling, leader) { + if (is_mem_loads_aux_event(sibling)) + break; + } + if (list_entry_is_head(sibling, &leader->sibling_list, sibling_list)) + return -ENODATA; + } + } + if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY)) return 0; @@ -3324,59 +4000,99 @@ static int intel_pmu_hw_config(struct perf_event *event) return 0; } -#ifdef CONFIG_RETPOLINE -static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr); -static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr); -#endif - -struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr) -{ -#ifdef CONFIG_RETPOLINE - if (x86_pmu.guest_get_msrs == intel_guest_get_msrs) - return intel_guest_get_msrs(nr); - else if (x86_pmu.guest_get_msrs == core_guest_get_msrs) - return core_guest_get_msrs(nr); -#endif - if (x86_pmu.guest_get_msrs) - return x86_pmu.guest_get_msrs(nr); - *nr = 0; - return NULL; -} -EXPORT_SYMBOL_GPL(perf_guest_get_msrs); - -static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr) +/* + * Currently, the only caller of this function is the atomic_switch_perf_msrs(). + * The host perf conext helps to prepare the values of the real hardware for + * a set of msrs that need to be switched atomically in a vmx transaction. 
+ * + * For example, the pseudocode needed to add a new msr should look like: + * + * arr[(*nr)++] = (struct perf_guest_switch_msr){ + * .msr = the hardware msr address, + * .host = the value the hardware has when it doesn't run a guest, + * .guest = the value the hardware has when it runs a guest, + * }; + * + * These values have nothing to do with the emulated values the guest sees + * when it uses {RD,WR}MSR, which should be handled by the KVM context, + * specifically in the intel_pmu_{get,set}_msr(). + */ +static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; + struct kvm_pmu *kvm_pmu = (struct kvm_pmu *)data; + u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl); + u64 pebs_mask = cpuc->pebs_enabled & x86_pmu.pebs_capable; + int global_ctrl, pebs_enable; - arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL; - arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask; - arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask; - if (x86_pmu.flags & PMU_FL_PEBS_ALL) - arr[0].guest &= ~cpuc->pebs_enabled; - else - arr[0].guest &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK); - *nr = 1; + *nr = 0; + global_ctrl = (*nr)++; + arr[global_ctrl] = (struct perf_guest_switch_msr){ + .msr = MSR_CORE_PERF_GLOBAL_CTRL, + .host = intel_ctrl & ~cpuc->intel_ctrl_guest_mask, + .guest = intel_ctrl & (~cpuc->intel_ctrl_host_mask | ~pebs_mask), + }; - if (x86_pmu.pebs && x86_pmu.pebs_no_isolation) { - /* - * If PMU counter has PEBS enabled it is not enough to - * disable counter on a guest entry since PEBS memory - * write can overshoot guest entry and corrupt guest - * memory. Disabling PEBS solves the problem. - * - * Don't do this if the CPU already enforces it. 
- */ - arr[1].msr = MSR_IA32_PEBS_ENABLE; - arr[1].host = cpuc->pebs_enabled; - arr[1].guest = 0; - *nr = 2; + if (!x86_pmu.pebs) + return arr; + + /* + * If PMU counter has PEBS enabled it is not enough to + * disable counter on a guest entry since PEBS memory + * write can overshoot guest entry and corrupt guest + * memory. Disabling PEBS solves the problem. + * + * Don't do this if the CPU already enforces it. + */ + if (x86_pmu.pebs_no_isolation) { + arr[(*nr)++] = (struct perf_guest_switch_msr){ + .msr = MSR_IA32_PEBS_ENABLE, + .host = cpuc->pebs_enabled, + .guest = 0, + }; + return arr; + } + + if (!kvm_pmu || !x86_pmu.pebs_ept) + return arr; + + arr[(*nr)++] = (struct perf_guest_switch_msr){ + .msr = MSR_IA32_DS_AREA, + .host = (unsigned long)cpuc->ds, + .guest = kvm_pmu->ds_area, + }; + + if (x86_pmu.intel_cap.pebs_baseline) { + arr[(*nr)++] = (struct perf_guest_switch_msr){ + .msr = MSR_PEBS_DATA_CFG, + .host = cpuc->pebs_data_cfg, + .guest = kvm_pmu->pebs_data_cfg, + }; + } + + pebs_enable = (*nr)++; + arr[pebs_enable] = (struct perf_guest_switch_msr){ + .msr = MSR_IA32_PEBS_ENABLE, + .host = cpuc->pebs_enabled & ~cpuc->intel_ctrl_guest_mask, + .guest = pebs_mask & ~cpuc->intel_ctrl_host_mask, + }; + + if (arr[pebs_enable].host) { + /* Disable guest PEBS if host PEBS is enabled. */ + arr[pebs_enable].guest = 0; + } else { + /* Disable guest PEBS thoroughly for cross-mapped PEBS counters. 
*/ + arr[pebs_enable].guest &= ~kvm_pmu->host_cross_mapped_mask; + arr[global_ctrl].guest &= ~kvm_pmu->host_cross_mapped_mask; + /* Set hw GLOBAL_CTRL bits for PEBS counter when it runs for guest */ + arr[global_ctrl].guest |= arr[pebs_enable].guest; } return arr; } -static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr) +static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr, void *data) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; @@ -3509,6 +4225,31 @@ icl_get_event_constraints(struct cpu_hw_events *cpuc, int idx, } static struct event_constraint * +spr_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct event_constraint *c; + + c = icl_get_event_constraints(cpuc, idx, event); + + /* + * The :ppp indicates the Precise Distribution (PDist) facility, which + * is only supported on the GP counter 0. If a :ppp event which is not + * available on the GP counter 0, error out. + * Exception: Instruction PDIR is only available on the fixed counter 0. + */ + if ((event->attr.precise_ip == 3) && + !constraint_match(&fixed0_constraint, event->hw.config)) { + if (c->idxmsk64 & BIT_ULL(0)) + return &counter0_constraint; + + return &emptyconstraint; + } + + return c; +} + +static struct event_constraint * glp_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event) { @@ -3529,6 +4270,8 @@ tnt_get_event_constraints(struct cpu_hw_events *cpuc, int idx, { struct event_constraint *c; + c = intel_get_event_constraints(cpuc, idx, event); + /* * :ppp means to do reduced skid PEBS, * which is available on PMC0 and fixed counter 0. 
@@ -3541,8 +4284,6 @@ tnt_get_event_constraints(struct cpu_hw_events *cpuc, int idx, return &counter0_constraint; } - c = intel_get_event_constraints(cpuc, idx, event); - return c; } @@ -3566,6 +4307,39 @@ tfa_get_event_constraints(struct cpu_hw_events *cpuc, int idx, return c; } +static struct event_constraint * +adl_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu); + + if (pmu->cpu_type == hybrid_big) + return spr_get_event_constraints(cpuc, idx, event); + else if (pmu->cpu_type == hybrid_small) + return tnt_get_event_constraints(cpuc, idx, event); + + WARN_ON(1); + return &emptyconstraint; +} + +static int adl_hw_config(struct perf_event *event) +{ + struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu); + + if (pmu->cpu_type == hybrid_big) + return hsw_hw_config(event); + else if (pmu->cpu_type == hybrid_small) + return intel_pmu_hw_config(event); + + WARN_ON(1); + return -EOPNOTSUPP; +} + +static u8 adl_get_hybrid_cpu_type(void) +{ + return hybrid_big; +} + /* * Broadwell: * @@ -3581,20 +4355,25 @@ tfa_get_event_constraints(struct cpu_hw_events *cpuc, int idx, * Therefore the effective (average) period matches the requested period, * despite coarser hardware granularity. 
*/ -static u64 bdw_limit_period(struct perf_event *event, u64 left) +static void bdw_limit_period(struct perf_event *event, s64 *left) { if ((event->hw.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0xc0, .umask=0x01)) { - if (left < 128) - left = 128; - left &= ~0x3fULL; + if (*left < 128) + *left = 128; + *left &= ~0x3fULL; } - return left; } -static u64 nhm_limit_period(struct perf_event *event, u64 left) +static void nhm_limit_period(struct perf_event *event, s64 *left) { - return max(left, 32ULL); + *left = max(*left, 32LL); +} + +static void spr_limit_period(struct perf_event *event, s64 *left) +{ + if (event->attr.precise_ip == 3) + *left = max(*left, 128LL); } PMU_FORMAT_ATTR(event, "config:0-7" ); @@ -3661,7 +4440,7 @@ int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu) { cpuc->pebs_record_size = x86_pmu.pebs_record_size; - if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) { + if (is_hybrid() || x86_pmu.extra_regs || x86_pmu.lbr_sel_map) { cpuc->shared_regs = allocate_shared_regs(cpu); if (!cpuc->shared_regs) goto err; @@ -3715,12 +4494,62 @@ static void flip_smm_bit(void *data) } } +static bool init_hybrid_pmu(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + u8 cpu_type = get_this_hybrid_cpu_type(); + struct x86_hybrid_pmu *pmu = NULL; + int i; + + if (!cpu_type && x86_pmu.get_hybrid_cpu_type) + cpu_type = x86_pmu.get_hybrid_cpu_type(); + + for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { + if (x86_pmu.hybrid_pmu[i].cpu_type == cpu_type) { + pmu = &x86_pmu.hybrid_pmu[i]; + break; + } + } + if (WARN_ON_ONCE(!pmu || (pmu->pmu.type == -1))) { + cpuc->pmu = NULL; + return false; + } + + /* Only check and dump the PMU information for the first CPU */ + if (!cpumask_empty(&pmu->supported_cpus)) + goto end; + + if (!check_hw_exists(&pmu->pmu, pmu->num_counters, pmu->num_counters_fixed)) + return false; + + pr_info("%s PMU driver: ", pmu->name); + + if (pmu->intel_cap.pebs_output_pt_available) + pr_cont("PEBS-via-PT "); + + 
pr_cont("\n"); + + x86_pmu_show_pmu_cap(pmu->num_counters, pmu->num_counters_fixed, + pmu->intel_ctrl); + +end: + cpumask_set_cpu(cpu, &pmu->supported_cpus); + cpuc->pmu = &pmu->pmu; + + x86_pmu_update_cpu_context(&pmu->pmu, cpu); + + return true; +} + static void intel_pmu_cpu_starting(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); int core_id = topology_core_id(cpu); int i; + if (is_hybrid() && !init_hybrid_pmu(cpu)) + return; + init_debug_store_on_cpu(cpu); /* * Deal with CPUs that don't clear their LBRs on power-up. @@ -3738,8 +4567,24 @@ static void intel_pmu_cpu_starting(int cpu) if (x86_pmu.version > 1) flip_smm_bit(&x86_pmu.attr_freeze_on_smi); - if (x86_pmu.counter_freezing) - enable_counter_freeze(); + /* + * Disable perf metrics if any added CPU doesn't support it. + * + * Turn off the check for a hybrid architecture, because the + * architecture MSR, MSR_IA32_PERF_CAPABILITIES, only indicate + * the architecture features. The perf metrics is a model-specific + * feature for now. The corresponding bit should always be 0 on + * a hybrid platform, e.g., Alder Lake. 
+ */ + if (!is_hybrid() && x86_pmu.intel_cap.perf_metrics) { + union perf_capabilities perf_cap; + + rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap.capabilities); + if (!perf_cap.perf_metrics) { + x86_pmu.intel_cap.perf_metrics = 0; + x86_pmu.intel_ctrl &= ~(1ULL << GLOBAL_CTRL_EN_PERF_METRICS); + } + } if (!cpuc->shared_regs) return; @@ -3800,9 +4645,6 @@ static void free_excl_cntrs(struct cpu_hw_events *cpuc) static void intel_pmu_cpu_dying(int cpu) { fini_debug_store_on_cpu(cpu); - - if (x86_pmu.counter_freezing) - disable_counter_freeze(); } void intel_cpuc_finish(struct cpu_hw_events *cpuc) @@ -3821,7 +4663,12 @@ void intel_cpuc_finish(struct cpu_hw_events *cpuc) static void intel_pmu_cpu_dead(int cpu) { - intel_cpuc_finish(&per_cpu(cpu_hw_events, cpu)); + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + + intel_cpuc_finish(cpuc); + + if (is_hybrid() && cpuc->pmu) + cpumask_clear_cpu(cpu, &hybrid_pmu(cpuc->pmu)->supported_cpus); } static void intel_pmu_sched_task(struct perf_event_context *ctx, @@ -3842,14 +4689,30 @@ static int intel_pmu_check_period(struct perf_event *event, u64 value) return intel_pmu_has_bts_period(event, value) ? 
-EINVAL : 0; } +static void intel_aux_output_init(void) +{ + /* Refer also intel_pmu_aux_output_match() */ + if (x86_pmu.intel_cap.pebs_output_pt_available) + x86_pmu.assign = intel_pmu_assign_event; +} + static int intel_pmu_aux_output_match(struct perf_event *event) { + /* intel_pmu_assign_event() is needed, refer intel_aux_output_init() */ if (!x86_pmu.intel_cap.pebs_output_pt_available) return 0; return is_intel_pt_event(event); } +static int intel_pmu_filter_match(struct perf_event *event) +{ + struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu); + unsigned int cpu = smp_processor_id(); + + return cpumask_test_cpu(cpu, &pmu->supported_cpus); +} + PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); PMU_FORMAT_ATTR(ldlat, "config1:0-15"); @@ -3932,6 +4795,11 @@ static __initconst const struct x86_pmu core_pmu = { .cpu_dead = intel_pmu_cpu_dead, .check_period = intel_pmu_check_period, + + .lbr_reset = intel_pmu_lbr_reset_64, + .lbr_read = intel_pmu_lbr_read_64, + .lbr_save = intel_pmu_lbr_save, + .lbr_restore = intel_pmu_lbr_restore, }; static __initconst const struct x86_pmu intel_pmu = { @@ -3944,6 +4812,8 @@ static __initconst const struct x86_pmu intel_pmu = { .add = intel_pmu_add_event, .del = intel_pmu_del_event, .read = intel_pmu_read_event, + .set_period = intel_pmu_set_period, + .update = intel_pmu_update, .hw_config = intel_pmu_hw_config, .schedule_events = x86_schedule_events, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, @@ -3977,6 +4847,24 @@ static __initconst const struct x86_pmu intel_pmu = { .check_period = intel_pmu_check_period, .aux_output_match = intel_pmu_aux_output_match, + + .lbr_reset = intel_pmu_lbr_reset_64, + .lbr_read = intel_pmu_lbr_read_64, + .lbr_save = intel_pmu_lbr_save, + .lbr_restore = intel_pmu_lbr_restore, + + /* + * SMM has access to all 4 rings and while traditionally SMM code only + * ran in CPL0, 2021-era firmware is starting to make use of CPL3 in SMM. 
+ * + * Since the EVENTSEL.{USR,OS} CPL filtering makes no distinction + * between SMM or not, this results in what should be pure userspace + * counters including SMM data. + * + * This is a clear privilege issue, therefore globally disable + * counting SMM by default. + */ + .attr_freeze_on_smi = 1, }; static __init void intel_clovertown_quirk(void) @@ -4017,9 +4905,13 @@ static const struct x86_cpu_desc isolation_ucodes[] = { INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 3, 0x07000009), INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 4, 0x0f000009), INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_D, 5, 0x0e000002), - INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_X, 2, 0x0b000014), + INTEL_CPU_DESC(INTEL_FAM6_BROADWELL_X, 1, 0x0b000014), INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 3, 0x00000021), INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 4, 0x00000000), + INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 5, 0x00000000), + INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 6, 0x00000000), + INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 7, 0x00000000), + INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_X, 11, 0x00000000), INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE_L, 3, 0x0000007c), INTEL_CPU_DESC(INTEL_FAM6_SKYLAKE, 3, 0x0000007c), INTEL_CPU_DESC(INTEL_FAM6_KABYLAKE, 9, 0x0000004e), @@ -4092,7 +4984,7 @@ static bool check_msr(unsigned long msr, u64 mask) /* * Disable the check for real HW, so we don't - * mess with potentionaly enabled registers: + * mess with potentially enabled registers: */ if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) return true; @@ -4157,7 +5049,7 @@ static __init void intel_arch_events_quirk(void) { int bit; - /* disable event that reported as not presend by cpuid */ + /* disable event that reported as not present by cpuid */ for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) { intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0; pr_warn("CPUID marked event: \'%s\' unavailable\n", @@ -4184,39 +5076,6 @@ static __init void intel_nehalem_quirk(void) } } -static const struct x86_cpu_desc counter_freezing_ucodes[] = { - 
INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 2, 0x0000000e), - INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 9, 0x0000002e), - INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT, 10, 0x00000008), - INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_D, 1, 0x00000028), - INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_PLUS, 1, 0x00000028), - INTEL_CPU_DESC(INTEL_FAM6_ATOM_GOLDMONT_PLUS, 8, 0x00000006), - {} -}; - -static bool intel_counter_freezing_broken(void) -{ - return !x86_cpu_has_min_microcode_rev(counter_freezing_ucodes); -} - -static __init void intel_counter_freezing_quirk(void) -{ - /* Check if it's already disabled */ - if (disable_counter_freezing) - return; - - /* - * If the system starts with the wrong ucode, leave the - * counter-freezing feature permanently disabled. - */ - if (intel_counter_freezing_broken()) { - pr_info("PMU counter freezing disabled due to CPU errata," - "please upgrade microcode\n"); - x86_pmu.counter_freezing = false; - x86_pmu.handle_irq = intel_pmu_handle_irq; - } -} - /* * enable software workaround for errata: * SNB: BJ122 @@ -4299,6 +5158,15 @@ static struct attribute *icl_events_attrs[] = { NULL, }; +static struct attribute *icl_td_events_attrs[] = { + EVENT_PTR(slots), + EVENT_PTR(td_retiring), + EVENT_PTR(td_bad_spec), + EVENT_PTR(td_fe_bound), + EVENT_PTR(td_be_bound), + NULL, +}; + static struct attribute *icl_tsx_events_attrs[] = { EVENT_PTR(tx_start), EVENT_PTR(tx_abort), @@ -4317,6 +5185,42 @@ static struct attribute *icl_tsx_events_attrs[] = { NULL, }; + +EVENT_ATTR_STR(mem-stores, mem_st_spr, "event=0xcd,umask=0x2"); +EVENT_ATTR_STR(mem-loads-aux, mem_ld_aux, "event=0x03,umask=0x82"); + +static struct attribute *spr_events_attrs[] = { + EVENT_PTR(mem_ld_hsw), + EVENT_PTR(mem_st_spr), + EVENT_PTR(mem_ld_aux), + NULL, +}; + +static struct attribute *spr_td_events_attrs[] = { + EVENT_PTR(slots), + EVENT_PTR(td_retiring), + EVENT_PTR(td_bad_spec), + EVENT_PTR(td_fe_bound), + EVENT_PTR(td_be_bound), + EVENT_PTR(td_heavy_ops), + EVENT_PTR(td_br_mispredict), + 
EVENT_PTR(td_fetch_lat), + EVENT_PTR(td_mem_bound), + NULL, +}; + +static struct attribute *spr_tsx_events_attrs[] = { + EVENT_PTR(tx_start), + EVENT_PTR(tx_abort), + EVENT_PTR(tx_commit), + EVENT_PTR(tx_capacity_read), + EVENT_PTR(tx_capacity_write), + EVENT_PTR(tx_conflict), + EVENT_PTR(cycles_t), + EVENT_PTR(cycles_ct), + NULL, +}; + static ssize_t freeze_on_smi_show(struct device *cdev, struct device_attribute *attr, char *buf) @@ -4347,9 +5251,9 @@ static ssize_t freeze_on_smi_store(struct device *cdev, x86_pmu.attr_freeze_on_smi = val; - get_online_cpus(); + cpus_read_lock(); on_each_cpu(flip_smm_bit, &val, 1); - put_online_cpus(); + cpus_read_unlock(); done: mutex_unlock(&freeze_on_smi_mutex); @@ -4365,7 +5269,7 @@ static void update_tfa_sched(void *ignored) * and if so force schedule out for all event types all contexts */ if (test_bit(3, cpuc->active_mask)) - perf_pmu_resched(x86_get_pmu()); + perf_pmu_resched(x86_get_pmu(smp_processor_id())); } static ssize_t show_sysctl_tfa(struct device *cdev, @@ -4392,9 +5296,9 @@ static ssize_t set_sysctl_tfa(struct device *cdev, allow_tsx_force_abort = val; - get_online_cpus(); + cpus_read_lock(); on_each_cpu(update_tfa_sched, NULL, 1); - put_online_cpus(); + cpus_read_unlock(); return count; } @@ -4527,8 +5431,303 @@ static const struct attribute_group *attr_update[] = { NULL, }; +EVENT_ATTR_STR_HYBRID(slots, slots_adl, "event=0x00,umask=0x4", hybrid_big); +EVENT_ATTR_STR_HYBRID(topdown-retiring, td_retiring_adl, "event=0xc2,umask=0x0;event=0x00,umask=0x80", hybrid_big_small); +EVENT_ATTR_STR_HYBRID(topdown-bad-spec, td_bad_spec_adl, "event=0x73,umask=0x0;event=0x00,umask=0x81", hybrid_big_small); +EVENT_ATTR_STR_HYBRID(topdown-fe-bound, td_fe_bound_adl, "event=0x71,umask=0x0;event=0x00,umask=0x82", hybrid_big_small); +EVENT_ATTR_STR_HYBRID(topdown-be-bound, td_be_bound_adl, "event=0x74,umask=0x0;event=0x00,umask=0x83", hybrid_big_small); +EVENT_ATTR_STR_HYBRID(topdown-heavy-ops, td_heavy_ops_adl, 
"event=0x00,umask=0x84", hybrid_big); +EVENT_ATTR_STR_HYBRID(topdown-br-mispredict, td_br_mis_adl, "event=0x00,umask=0x85", hybrid_big); +EVENT_ATTR_STR_HYBRID(topdown-fetch-lat, td_fetch_lat_adl, "event=0x00,umask=0x86", hybrid_big); +EVENT_ATTR_STR_HYBRID(topdown-mem-bound, td_mem_bound_adl, "event=0x00,umask=0x87", hybrid_big); + +static struct attribute *adl_hybrid_events_attrs[] = { + EVENT_PTR(slots_adl), + EVENT_PTR(td_retiring_adl), + EVENT_PTR(td_bad_spec_adl), + EVENT_PTR(td_fe_bound_adl), + EVENT_PTR(td_be_bound_adl), + EVENT_PTR(td_heavy_ops_adl), + EVENT_PTR(td_br_mis_adl), + EVENT_PTR(td_fetch_lat_adl), + EVENT_PTR(td_mem_bound_adl), + NULL, +}; + +/* Must be in IDX order */ +EVENT_ATTR_STR_HYBRID(mem-loads, mem_ld_adl, "event=0xd0,umask=0x5,ldlat=3;event=0xcd,umask=0x1,ldlat=3", hybrid_big_small); +EVENT_ATTR_STR_HYBRID(mem-stores, mem_st_adl, "event=0xd0,umask=0x6;event=0xcd,umask=0x2", hybrid_big_small); +EVENT_ATTR_STR_HYBRID(mem-loads-aux, mem_ld_aux_adl, "event=0x03,umask=0x82", hybrid_big); + +static struct attribute *adl_hybrid_mem_attrs[] = { + EVENT_PTR(mem_ld_adl), + EVENT_PTR(mem_st_adl), + EVENT_PTR(mem_ld_aux_adl), + NULL, +}; + +EVENT_ATTR_STR_HYBRID(tx-start, tx_start_adl, "event=0xc9,umask=0x1", hybrid_big); +EVENT_ATTR_STR_HYBRID(tx-commit, tx_commit_adl, "event=0xc9,umask=0x2", hybrid_big); +EVENT_ATTR_STR_HYBRID(tx-abort, tx_abort_adl, "event=0xc9,umask=0x4", hybrid_big); +EVENT_ATTR_STR_HYBRID(tx-conflict, tx_conflict_adl, "event=0x54,umask=0x1", hybrid_big); +EVENT_ATTR_STR_HYBRID(cycles-t, cycles_t_adl, "event=0x3c,in_tx=1", hybrid_big); +EVENT_ATTR_STR_HYBRID(cycles-ct, cycles_ct_adl, "event=0x3c,in_tx=1,in_tx_cp=1", hybrid_big); +EVENT_ATTR_STR_HYBRID(tx-capacity-read, tx_capacity_read_adl, "event=0x54,umask=0x80", hybrid_big); +EVENT_ATTR_STR_HYBRID(tx-capacity-write, tx_capacity_write_adl, "event=0x54,umask=0x2", hybrid_big); + +static struct attribute *adl_hybrid_tsx_attrs[] = { + EVENT_PTR(tx_start_adl), + 
EVENT_PTR(tx_abort_adl), + EVENT_PTR(tx_commit_adl), + EVENT_PTR(tx_capacity_read_adl), + EVENT_PTR(tx_capacity_write_adl), + EVENT_PTR(tx_conflict_adl), + EVENT_PTR(cycles_t_adl), + EVENT_PTR(cycles_ct_adl), + NULL, +}; + +FORMAT_ATTR_HYBRID(in_tx, hybrid_big); +FORMAT_ATTR_HYBRID(in_tx_cp, hybrid_big); +FORMAT_ATTR_HYBRID(offcore_rsp, hybrid_big_small); +FORMAT_ATTR_HYBRID(ldlat, hybrid_big_small); +FORMAT_ATTR_HYBRID(frontend, hybrid_big); + +static struct attribute *adl_hybrid_extra_attr_rtm[] = { + FORMAT_HYBRID_PTR(in_tx), + FORMAT_HYBRID_PTR(in_tx_cp), + FORMAT_HYBRID_PTR(offcore_rsp), + FORMAT_HYBRID_PTR(ldlat), + FORMAT_HYBRID_PTR(frontend), + NULL, +}; + +static struct attribute *adl_hybrid_extra_attr[] = { + FORMAT_HYBRID_PTR(offcore_rsp), + FORMAT_HYBRID_PTR(ldlat), + FORMAT_HYBRID_PTR(frontend), + NULL, +}; + +static bool is_attr_for_this_pmu(struct kobject *kobj, struct attribute *attr) +{ + struct device *dev = kobj_to_dev(kobj); + struct x86_hybrid_pmu *pmu = + container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu); + struct perf_pmu_events_hybrid_attr *pmu_attr = + container_of(attr, struct perf_pmu_events_hybrid_attr, attr.attr); + + return pmu->cpu_type & pmu_attr->pmu_type; +} + +static umode_t hybrid_events_is_visible(struct kobject *kobj, + struct attribute *attr, int i) +{ + return is_attr_for_this_pmu(kobj, attr) ? attr->mode : 0; +} + +static inline int hybrid_find_supported_cpu(struct x86_hybrid_pmu *pmu) +{ + int cpu = cpumask_first(&pmu->supported_cpus); + + return (cpu >= nr_cpu_ids) ? -1 : cpu; +} + +static umode_t hybrid_tsx_is_visible(struct kobject *kobj, + struct attribute *attr, int i) +{ + struct device *dev = kobj_to_dev(kobj); + struct x86_hybrid_pmu *pmu = + container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu); + int cpu = hybrid_find_supported_cpu(pmu); + + return (cpu >= 0) && is_attr_for_this_pmu(kobj, attr) && cpu_has(&cpu_data(cpu), X86_FEATURE_RTM) ? 
attr->mode : 0; +} + +static umode_t hybrid_format_is_visible(struct kobject *kobj, + struct attribute *attr, int i) +{ + struct device *dev = kobj_to_dev(kobj); + struct x86_hybrid_pmu *pmu = + container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu); + struct perf_pmu_format_hybrid_attr *pmu_attr = + container_of(attr, struct perf_pmu_format_hybrid_attr, attr.attr); + int cpu = hybrid_find_supported_cpu(pmu); + + return (cpu >= 0) && (pmu->cpu_type & pmu_attr->pmu_type) ? attr->mode : 0; +} + +static struct attribute_group hybrid_group_events_td = { + .name = "events", + .is_visible = hybrid_events_is_visible, +}; + +static struct attribute_group hybrid_group_events_mem = { + .name = "events", + .is_visible = hybrid_events_is_visible, +}; + +static struct attribute_group hybrid_group_events_tsx = { + .name = "events", + .is_visible = hybrid_tsx_is_visible, +}; + +static struct attribute_group hybrid_group_format_extra = { + .name = "format", + .is_visible = hybrid_format_is_visible, +}; + +static ssize_t intel_hybrid_get_attr_cpus(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct x86_hybrid_pmu *pmu = + container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu); + + return cpumap_print_to_pagebuf(true, buf, &pmu->supported_cpus); +} + +static DEVICE_ATTR(cpus, S_IRUGO, intel_hybrid_get_attr_cpus, NULL); +static struct attribute *intel_hybrid_cpus_attrs[] = { + &dev_attr_cpus.attr, + NULL, +}; + +static struct attribute_group hybrid_group_cpus = { + .attrs = intel_hybrid_cpus_attrs, +}; + +static const struct attribute_group *hybrid_attr_update[] = { + &hybrid_group_events_td, + &hybrid_group_events_mem, + &hybrid_group_events_tsx, + &group_caps_gen, + &group_caps_lbr, + &hybrid_group_format_extra, + &group_default, + &hybrid_group_cpus, + NULL, +}; + static struct attribute *empty_attrs; +static void intel_pmu_check_num_counters(int *num_counters, + int *num_counters_fixed, + u64 *intel_ctrl, u64 fixed_mask) +{ + if 
(*num_counters > INTEL_PMC_MAX_GENERIC) { + WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", + *num_counters, INTEL_PMC_MAX_GENERIC); + *num_counters = INTEL_PMC_MAX_GENERIC; + } + *intel_ctrl = (1ULL << *num_counters) - 1; + + if (*num_counters_fixed > INTEL_PMC_MAX_FIXED) { + WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", + *num_counters_fixed, INTEL_PMC_MAX_FIXED); + *num_counters_fixed = INTEL_PMC_MAX_FIXED; + } + + *intel_ctrl |= fixed_mask << INTEL_PMC_IDX_FIXED; +} + +static void intel_pmu_check_event_constraints(struct event_constraint *event_constraints, + int num_counters, + int num_counters_fixed, + u64 intel_ctrl) +{ + struct event_constraint *c; + + if (!event_constraints) + return; + + /* + * event on fixed counter2 (REF_CYCLES) only works on this + * counter, so do not extend mask to generic counters + */ + for_each_event_constraint(c, event_constraints) { + /* + * Don't extend the topdown slots and metrics + * events to the generic counters. + */ + if (c->idxmsk64 & INTEL_PMC_MSK_TOPDOWN) { + /* + * Disable topdown slots and metrics events, + * if slots event is not in CPUID. + */ + if (!(INTEL_PMC_MSK_FIXED_SLOTS & intel_ctrl)) + c->idxmsk64 = 0; + c->weight = hweight64(c->idxmsk64); + continue; + } + + if (c->cmask == FIXED_EVENT_FLAGS) { + /* Disabled fixed counters which are not in CPUID */ + c->idxmsk64 &= intel_ctrl; + + /* + * Don't extend the pseudo-encoding to the + * generic counters + */ + if (!use_fixed_pseudo_encoding(c->code)) + c->idxmsk64 |= (1ULL << num_counters) - 1; + } + c->idxmsk64 &= + ~(~0ULL << (INTEL_PMC_IDX_FIXED + num_counters_fixed)); + c->weight = hweight64(c->idxmsk64); + } +} + +static void intel_pmu_check_extra_regs(struct extra_reg *extra_regs) +{ + struct extra_reg *er; + + /* + * Access extra MSR may cause #GP under certain circumstances. + * E.g. KVM doesn't support offcore event + * Check all extra_regs here. 
+ */ + if (!extra_regs) + return; + + for (er = extra_regs; er->msr; er++) { + er->extra_msr_access = check_msr(er->msr, 0x11UL); + /* Disable LBR select mapping */ + if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access) + x86_pmu.lbr_sel_map = NULL; + } +} + +static void intel_pmu_check_hybrid_pmus(u64 fixed_mask) +{ + struct x86_hybrid_pmu *pmu; + int i; + + for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { + pmu = &x86_pmu.hybrid_pmu[i]; + + intel_pmu_check_num_counters(&pmu->num_counters, + &pmu->num_counters_fixed, + &pmu->intel_ctrl, + fixed_mask); + + if (pmu->intel_cap.perf_metrics) { + pmu->intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS; + pmu->intel_ctrl |= INTEL_PMC_MSK_FIXED_SLOTS; + } + + if (pmu->intel_cap.pebs_output_pt_available) + pmu->pmu.capabilities |= PERF_PMU_CAP_AUX_OUTPUT; + + intel_pmu_check_event_constraints(pmu->event_constraints, + pmu->num_counters, + pmu->num_counters_fixed, + pmu->intel_ctrl); + + intel_pmu_check_extra_regs(pmu->extra_regs); + } +} + __init int intel_pmu_init(void) { struct attribute **extra_skl_attr = &empty_attrs; @@ -4539,12 +5738,11 @@ __init int intel_pmu_init(void) union cpuid10_edx edx; union cpuid10_eax eax; union cpuid10_ebx ebx; - struct event_constraint *c; - unsigned int unused; - struct extra_reg *er; + unsigned int fixed_mask; bool pmem = false; int version, i; char *name; + struct x86_hybrid_pmu *pmu; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { switch (boot_cpu_data.x86) { @@ -4562,7 +5760,7 @@ __init int intel_pmu_init(void) * Check whether the Architectural PerfMon supports * Branch Misses Retired hw_event or not. 
*/ - cpuid(10, &eax.full, &ebx.full, &unused, &edx.full); + cpuid(10, &eax.full, &ebx.full, &fixed_mask, &edx.full); if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT) return -ENODEV; @@ -4581,20 +5779,21 @@ __init int intel_pmu_init(void) x86_pmu.events_mask_len = eax.split.mask_length; x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters); + x86_pmu.pebs_capable = PEBS_COUNTER_MASK; /* * Quirk: v2 perfmon does not report fixed-purpose events, so * assume at least 3 events, when not running in a hypervisor: */ - if (version > 1) { + if (version > 1 && version < 5) { int assume = 3 * !boot_cpu_has(X86_FEATURE_HYPERVISOR); x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, assume); - } - if (version >= 4) - x86_pmu.counter_freezing = !disable_counter_freezing; + fixed_mask = (1L << x86_pmu.num_counters_fixed) - 1; + } else if (version >= 5) + x86_pmu.num_counters_fixed = fls(fixed_mask); if (boot_cpu_has(X86_FEATURE_PDCM)) { u64 capabilities; @@ -4603,10 +5802,24 @@ __init int intel_pmu_init(void) x86_pmu.intel_cap.capabilities = capabilities; } + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) { + x86_pmu.lbr_reset = intel_pmu_lbr_reset_32; + x86_pmu.lbr_read = intel_pmu_lbr_read_32; + } + + if (boot_cpu_has(X86_FEATURE_ARCH_LBR)) + intel_pmu_arch_lbr_init(); + intel_ds_init(); x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */ + if (version >= 5) { + x86_pmu.intel_cap.anythread_deprecated = edx.split.anythread_deprecated; + if (x86_pmu.intel_cap.anythread_deprecated) + pr_cont(" AnyThread deprecated, "); + } + /* * Install the hw-cache-events table: */ @@ -4618,7 +5831,7 @@ __init int intel_pmu_init(void) case INTEL_FAM6_CORE2_MEROM: x86_add_quirk(intel_clovertown_quirk); - /* fall through */ + fallthrough; case INTEL_FAM6_CORE2_MEROM_L: case INTEL_FAM6_CORE2_PENRYN: @@ -4709,7 +5922,6 @@ __init int intel_pmu_init(void) case INTEL_FAM6_ATOM_GOLDMONT: case INTEL_FAM6_ATOM_GOLDMONT_D: - 
x86_add_quirk(intel_counter_freezing_quirk); memcpy(hw_cache_event_ids, glm_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs, @@ -4736,7 +5948,6 @@ __init int intel_pmu_init(void) break; case INTEL_FAM6_ATOM_GOLDMONT_PLUS: - x86_add_quirk(intel_counter_freezing_quirk); memcpy(hw_cache_event_ids, glp_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, glp_hw_cache_extra_regs, @@ -4753,6 +5964,7 @@ __init int intel_pmu_init(void) x86_pmu.pebs_aliases = NULL; x86_pmu.pebs_prec_dist = true; x86_pmu.lbr_pt_coexist = true; + x86_pmu.pebs_capable = ~0ULL; x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.flags |= PMU_FL_PEBS_ALL; x86_pmu.get_event_constraints = glp_get_event_constraints; @@ -4766,6 +5978,7 @@ __init int intel_pmu_init(void) case INTEL_FAM6_ATOM_TREMONT_D: case INTEL_FAM6_ATOM_TREMONT: + case INTEL_FAM6_ATOM_TREMONT_L: x86_pmu.late_ack = true; memcpy(hw_cache_event_ids, glp_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -4786,11 +5999,42 @@ __init int intel_pmu_init(void) x86_pmu.lbr_pt_coexist = true; x86_pmu.flags |= PMU_FL_HAS_RSP_1; x86_pmu.get_event_constraints = tnt_get_event_constraints; + td_attr = tnt_events_attrs; extra_attr = slm_format_attr; pr_cont("Tremont events, "); name = "Tremont"; break; + case INTEL_FAM6_ALDERLAKE_N: + x86_pmu.mid_ack = true; + memcpy(hw_cache_event_ids, glp_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, tnt_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1; + + x86_pmu.event_constraints = intel_slm_event_constraints; + x86_pmu.pebs_constraints = intel_grt_pebs_event_constraints; + x86_pmu.extra_regs = intel_grt_extra_regs; + + x86_pmu.pebs_aliases = NULL; + x86_pmu.pebs_prec_dist = true; + x86_pmu.pebs_block = true; + x86_pmu.lbr_pt_coexist = true; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_INSTR_LATENCY; + + 
intel_pmu_pebs_data_source_grt(); + x86_pmu.pebs_latency_data = adl_latency_data_small; + x86_pmu.get_event_constraints = tnt_get_event_constraints; + x86_pmu.limit_period = spr_limit_period; + td_attr = tnt_events_attrs; + mem_attr = grt_mem_attrs; + extra_attr = nhm_format_attr; + pr_cont("Gracemont events, "); + name = "gracemont"; + break; + case INTEL_FAM6_WESTMERE: case INTEL_FAM6_WESTMERE_EP: case INTEL_FAM6_WESTMERE_EX: @@ -4998,7 +6242,7 @@ __init int intel_pmu_init(void) case INTEL_FAM6_SKYLAKE_X: pmem = true; - /* fall through */ + fallthrough; case INTEL_FAM6_SKYLAKE_L: case INTEL_FAM6_SKYLAKE: case INTEL_FAM6_KABYLAKE_L: @@ -5036,7 +6280,13 @@ __init int intel_pmu_init(void) tsx_attr = hsw_tsx_events_attrs; intel_pmu_pebs_data_source_skl(pmem); - if (boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) { + /* + * Processors with CPUID.RTM_ALWAYS_ABORT have TSX deprecated by default. + * TSX force abort hooks are not required on these systems. Only deploy + * workaround when microcode has not enabled X86_FEATURE_RTM_ALWAYS_ABORT. 
+ */ + if (boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT) && + !boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT)) { x86_pmu.flags |= PMU_FL_TFA; x86_pmu.get_event_constraints = tfa_get_event_constraints; x86_pmu.enable_all = intel_tfa_pmu_enable_all; @@ -5049,12 +6299,14 @@ __init int intel_pmu_init(void) case INTEL_FAM6_ICELAKE_X: case INTEL_FAM6_ICELAKE_D: + x86_pmu.pebs_ept = 1; pmem = true; - /* fall through */ + fallthrough; case INTEL_FAM6_ICELAKE_L: case INTEL_FAM6_ICELAKE: case INTEL_FAM6_TIGERLAKE_L: case INTEL_FAM6_TIGERLAKE: + case INTEL_FAM6_ROCKETLAKE: x86_pmu.late_ack = true; memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids)); memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); @@ -5075,14 +6327,175 @@ __init int intel_pmu_init(void) hsw_format_attr : nhm_format_attr; extra_skl_attr = skl_format_attr; mem_attr = icl_events_attrs; + td_attr = icl_td_events_attrs; tsx_attr = icl_tsx_events_attrs; - x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xca, .umask=0x02); + x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04); x86_pmu.lbr_pt_coexist = true; intel_pmu_pebs_data_source_skl(pmem); + x86_pmu.num_topdown_events = 4; + static_call_update(intel_pmu_update_topdown_event, + &icl_update_topdown_event); + static_call_update(intel_pmu_set_topdown_event_period, + &icl_set_topdown_event_period); pr_cont("Icelake events, "); name = "icelake"; break; + case INTEL_FAM6_SAPPHIRERAPIDS_X: + pmem = true; + x86_pmu.late_ack = true; + memcpy(hw_cache_event_ids, spr_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, spr_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + + x86_pmu.event_constraints = intel_spr_event_constraints; + x86_pmu.pebs_constraints = intel_spr_pebs_event_constraints; + x86_pmu.extra_regs = intel_spr_extra_regs; + x86_pmu.limit_period = spr_limit_period; + x86_pmu.pebs_aliases = NULL; + x86_pmu.pebs_prec_dist = true; + x86_pmu.pebs_block = true; + x86_pmu.flags 
|= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; + x86_pmu.flags |= PMU_FL_INSTR_LATENCY; + x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX; + + x86_pmu.hw_config = hsw_hw_config; + x86_pmu.get_event_constraints = spr_get_event_constraints; + extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? + hsw_format_attr : nhm_format_attr; + extra_skl_attr = skl_format_attr; + mem_attr = spr_events_attrs; + td_attr = spr_td_events_attrs; + tsx_attr = spr_tsx_events_attrs; + x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04); + x86_pmu.lbr_pt_coexist = true; + intel_pmu_pebs_data_source_skl(pmem); + x86_pmu.num_topdown_events = 8; + static_call_update(intel_pmu_update_topdown_event, + &icl_update_topdown_event); + static_call_update(intel_pmu_set_topdown_event_period, + &icl_set_topdown_event_period); + pr_cont("Sapphire Rapids events, "); + name = "sapphire_rapids"; + break; + + case INTEL_FAM6_ALDERLAKE: + case INTEL_FAM6_ALDERLAKE_L: + case INTEL_FAM6_RAPTORLAKE: + case INTEL_FAM6_RAPTORLAKE_P: + case INTEL_FAM6_RAPTORLAKE_S: + /* + * Alder Lake has 2 types of CPU, core and atom. + * + * Initialize the common PerfMon capabilities here. 
+ */ + x86_pmu.hybrid_pmu = kcalloc(X86_HYBRID_NUM_PMUS, + sizeof(struct x86_hybrid_pmu), + GFP_KERNEL); + if (!x86_pmu.hybrid_pmu) + return -ENOMEM; + static_branch_enable(&perf_is_hybrid); + x86_pmu.num_hybrid_pmus = X86_HYBRID_NUM_PMUS; + + x86_pmu.pebs_aliases = NULL; + x86_pmu.pebs_prec_dist = true; + x86_pmu.pebs_block = true; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; + x86_pmu.flags |= PMU_FL_INSTR_LATENCY; + x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX; + x86_pmu.lbr_pt_coexist = true; + intel_pmu_pebs_data_source_adl(); + x86_pmu.pebs_latency_data = adl_latency_data_small; + x86_pmu.num_topdown_events = 8; + static_call_update(intel_pmu_update_topdown_event, + &adl_update_topdown_event); + static_call_update(intel_pmu_set_topdown_event_period, + &adl_set_topdown_event_period); + + x86_pmu.filter_match = intel_pmu_filter_match; + x86_pmu.get_event_constraints = adl_get_event_constraints; + x86_pmu.hw_config = adl_hw_config; + x86_pmu.limit_period = spr_limit_period; + x86_pmu.get_hybrid_cpu_type = adl_get_hybrid_cpu_type; + /* + * The rtm_abort_event is used to check whether to enable GPRs + * for the RTM abort event. Atom doesn't have the RTM abort + * event. It is harmless to set it in the common + * x86_pmu.rtm_abort_event. + */ + x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xc9, .umask=0x04); + + td_attr = adl_hybrid_events_attrs; + mem_attr = adl_hybrid_mem_attrs; + tsx_attr = adl_hybrid_tsx_attrs; + extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? 
+ adl_hybrid_extra_attr_rtm : adl_hybrid_extra_attr; + + /* Initialize big core specific PerfMon capabilities.*/ + pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX]; + pmu->name = "cpu_core"; + pmu->cpu_type = hybrid_big; + pmu->late_ack = true; + if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) { + pmu->num_counters = x86_pmu.num_counters + 2; + pmu->num_counters_fixed = x86_pmu.num_counters_fixed + 1; + } else { + pmu->num_counters = x86_pmu.num_counters; + pmu->num_counters_fixed = x86_pmu.num_counters_fixed; + } + + /* + * Quirk: On some Alder Lake machines, when all E-cores are disabled in + * the BIOS, the leaf 0xA will enumerate all counters of P-cores. However, + * the X86_FEATURE_HYBRID_CPU is still set. The code above will + * mistakenly add extra counters for P-cores. Correct the number of + * counters here. + */ + if ((pmu->num_counters > 8) || (pmu->num_counters_fixed > 4)) { + pmu->num_counters = x86_pmu.num_counters; + pmu->num_counters_fixed = x86_pmu.num_counters_fixed; + } + + pmu->max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, pmu->num_counters); + pmu->unconstrained = (struct event_constraint) + __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1, + 0, pmu->num_counters, 0, 0); + pmu->intel_cap.capabilities = x86_pmu.intel_cap.capabilities; + pmu->intel_cap.perf_metrics = 1; + pmu->intel_cap.pebs_output_pt_available = 0; + + memcpy(pmu->hw_cache_event_ids, spr_hw_cache_event_ids, sizeof(pmu->hw_cache_event_ids)); + memcpy(pmu->hw_cache_extra_regs, spr_hw_cache_extra_regs, sizeof(pmu->hw_cache_extra_regs)); + pmu->event_constraints = intel_spr_event_constraints; + pmu->pebs_constraints = intel_spr_pebs_event_constraints; + pmu->extra_regs = intel_spr_extra_regs; + + /* Initialize Atom core specific PerfMon capabilities.*/ + pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX]; + pmu->name = "cpu_atom"; + pmu->cpu_type = hybrid_small; + pmu->mid_ack = true; + pmu->num_counters = x86_pmu.num_counters; + pmu->num_counters_fixed = 
x86_pmu.num_counters_fixed; + pmu->max_pebs_events = x86_pmu.max_pebs_events; + pmu->unconstrained = (struct event_constraint) + __EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1, + 0, pmu->num_counters, 0, 0); + pmu->intel_cap.capabilities = x86_pmu.intel_cap.capabilities; + pmu->intel_cap.perf_metrics = 0; + pmu->intel_cap.pebs_output_pt_available = 1; + + memcpy(pmu->hw_cache_event_ids, glp_hw_cache_event_ids, sizeof(pmu->hw_cache_event_ids)); + memcpy(pmu->hw_cache_extra_regs, tnt_hw_cache_extra_regs, sizeof(pmu->hw_cache_extra_regs)); + pmu->hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1; + pmu->event_constraints = intel_slm_event_constraints; + pmu->pebs_constraints = intel_grt_pebs_event_constraints; + pmu->extra_regs = intel_grt_extra_regs; + pr_cont("Alderlake Hybrid events, "); + name = "alderlake_hybrid"; + break; + default: switch (x86_pmu.version) { case 1: @@ -5090,7 +6503,9 @@ __init int intel_pmu_init(void) pr_cont("generic architected perfmon v1, "); name = "generic_arch_v1"; break; - default: + case 2: + case 3: + case 4: /* * default constraints for v2 and up */ @@ -5098,59 +6513,62 @@ __init int intel_pmu_init(void) pr_cont("generic architected perfmon, "); name = "generic_arch_v2+"; break; + default: + /* + * The default constraints for v5 and up can support up to + * 16 fixed counters. For the fixed counters 4 and later, + * the pseudo-encoding is applied. + * The constraints may be cut according to the CPUID enumeration + * by inserting the EVENT_CONSTRAINT_END. 
+ */ + if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) + x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED; + intel_v5_gen_event_constraints[x86_pmu.num_counters_fixed].weight = -1; + x86_pmu.event_constraints = intel_v5_gen_event_constraints; + pr_cont("generic architected perfmon, "); + name = "generic_arch_v5+"; + break; } } snprintf(pmu_name_str, sizeof(pmu_name_str), "%s", name); + if (!is_hybrid()) { + group_events_td.attrs = td_attr; + group_events_mem.attrs = mem_attr; + group_events_tsx.attrs = tsx_attr; + group_format_extra.attrs = extra_attr; + group_format_extra_skl.attrs = extra_skl_attr; - group_events_td.attrs = td_attr; - group_events_mem.attrs = mem_attr; - group_events_tsx.attrs = tsx_attr; - group_format_extra.attrs = extra_attr; - group_format_extra_skl.attrs = extra_skl_attr; - - x86_pmu.attr_update = attr_update; - - if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) { - WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", - x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC); - x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC; - } - x86_pmu.intel_ctrl = (1ULL << x86_pmu.num_counters) - 1; + x86_pmu.attr_update = attr_update; + } else { + hybrid_group_events_td.attrs = td_attr; + hybrid_group_events_mem.attrs = mem_attr; + hybrid_group_events_tsx.attrs = tsx_attr; + hybrid_group_format_extra.attrs = extra_attr; - if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) { - WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", - x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED); - x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED; + x86_pmu.attr_update = hybrid_attr_update; } - x86_pmu.intel_ctrl |= - ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED; + intel_pmu_check_num_counters(&x86_pmu.num_counters, + &x86_pmu.num_counters_fixed, + &x86_pmu.intel_ctrl, + (u64)fixed_mask); - if (x86_pmu.event_constraints) { - /* - * event on fixed counter2 (REF_CYCLES) only works on this - * counter, so do not extend mask to generic 
counters - */ - for_each_event_constraint(c, x86_pmu.event_constraints) { - if (c->cmask == FIXED_EVENT_FLAGS - && c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) { - c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; - } - c->idxmsk64 &= - ~(~0ULL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed)); - c->weight = hweight64(c->idxmsk64); - } - } + /* AnyThread may be deprecated on arch perfmon v5 or later */ + if (x86_pmu.intel_cap.anythread_deprecated) + x86_pmu.format_attrs = intel_arch_formats_attr; + intel_pmu_check_event_constraints(x86_pmu.event_constraints, + x86_pmu.num_counters, + x86_pmu.num_counters_fixed, + x86_pmu.intel_ctrl); /* * Access LBR MSR may cause #GP under certain circumstances. - * E.g. KVM doesn't support LBR MSR - * Check all LBT MSR here. + * Check all LBR MSR here. * Disable LBR access if any LBR MSRs can not be accessed. */ - if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL)) + if (x86_pmu.lbr_tos && !check_msr(x86_pmu.lbr_tos, 0x3UL)) x86_pmu.lbr_nr = 0; for (i = 0; i < x86_pmu.lbr_nr; i++) { if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) && @@ -5158,23 +6576,25 @@ __init int intel_pmu_init(void) x86_pmu.lbr_nr = 0; } - if (x86_pmu.lbr_nr) + if (x86_pmu.lbr_nr) { + intel_pmu_lbr_init(); + pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr); - /* - * Access extra MSR may cause #GP under certain circumstances. - * E.g. KVM doesn't support offcore event - * Check all extra_regs here. 
- */ - if (x86_pmu.extra_regs) { - for (er = x86_pmu.extra_regs; er->msr; er++) { - er->extra_msr_access = check_msr(er->msr, 0x11UL); - /* Disable LBR select mapping */ - if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access) - x86_pmu.lbr_sel_map = NULL; + /* only support branch_stack snapshot for perfmon >= v2 */ + if (x86_pmu.disable_all == intel_pmu_disable_all) { + if (boot_cpu_has(X86_FEATURE_ARCH_LBR)) { + static_call_update(perf_snapshot_branch_stack, + intel_pmu_snapshot_arch_branch_stack); + } else { + static_call_update(perf_snapshot_branch_stack, + intel_pmu_snapshot_branch_stack); + } } } + intel_pmu_check_extra_regs(x86_pmu.extra_regs); + /* Support full width counters using alternative MSR range */ if (x86_pmu.intel_cap.full_width_write) { x86_pmu.max_period = x86_pmu.cntval_mask >> 1; @@ -5182,12 +6602,13 @@ __init int intel_pmu_init(void) pr_cont("full-width counters, "); } - /* - * For arch perfmon 4 use counter freezing to avoid - * several MSR accesses in the PMI. 
- */ - if (x86_pmu.counter_freezing) - x86_pmu.handle_irq = intel_pmu_handle_irq_v4; + if (!is_hybrid() && x86_pmu.intel_cap.perf_metrics) + x86_pmu.intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS; + + if (is_hybrid()) + intel_pmu_check_hybrid_pmus((u64)fixed_mask); + + intel_aux_output_init(); return 0; } diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 4814c964692c..a2834bc93149 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -40,7 +40,7 @@ * Model specific counters: * MSR_CORE_C1_RES: CORE C1 Residency Counter * perf code: 0x00 - * Available model: SLM,AMT,GLM,CNL,TNT + * Available model: SLM,AMT,GLM,CNL,ICX,TNT,ADL,RPL * Scope: Core (each processor core has a MSR) * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter * perf code: 0x01 @@ -50,47 +50,51 @@ * MSR_CORE_C6_RESIDENCY: CORE C6 Residency Counter * perf code: 0x02 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, - * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL, - * TNT + * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX, + * TGL,TNT,RKL,ADL,RPL,SPR * Scope: Core * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter * perf code: 0x03 * Available model: SNB,IVB,HSW,BDW,SKL,CNL,KBL,CML, - * ICL,TGL + * ICL,TGL,RKL,ADL,RPL * Scope: Core * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter. * perf code: 0x00 * Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL, - * KBL,CML,ICL,TGL,TNT + * KBL,CML,ICL,ICX,TGL,TNT,RKL,ADL, + * RPL,SPR * Scope: Package (physical package) * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. * perf code: 0x01 * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL, - * GLM,CNL,KBL,CML,ICL,TGL,TNT + * GLM,CNL,KBL,CML,ICL,TGL,TNT,RKL, + * ADL,RPL * Scope: Package (physical package) * MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter. 
* perf code: 0x02 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, - * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL, - * TNT + * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX, + * TGL,TNT,RKL,ADL,RPL,SPR * Scope: Package (physical package) * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. * perf code: 0x03 * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,CNL, - * KBL,CML,ICL,TGL + * KBL,CML,ICL,TGL,RKL,ADL,RPL * Scope: Package (physical package) * MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter. * perf code: 0x04 - * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL + * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL, + * ADL,RPL * Scope: Package (physical package) * MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter. * perf code: 0x05 - * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL + * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL, + * ADL,RPL * Scope: Package (physical package) * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter. * perf code: 0x06 * Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL, - * TNT + * TNT,RKL,ADL,RPL * Scope: Package (physical package) * */ @@ -107,14 +111,14 @@ MODULE_LICENSE("GPL"); #define DEFINE_CSTATE_FORMAT_ATTR(_var, _name, _format) \ -static ssize_t __cstate_##_var##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, \ +static ssize_t __cstate_##_var##_show(struct device *dev, \ + struct device_attribute *attr, \ char *page) \ { \ BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ return sprintf(page, _format "\n"); \ } \ -static struct kobj_attribute format_attr_##_var = \ +static struct device_attribute format_attr_##_var = \ __ATTR(_name, 0444, __cstate_##_var##_show, NULL) static ssize_t cstate_get_attr_cpumask(struct device *dev, @@ -563,6 +567,28 @@ static const struct cstate_model icl_cstates __initconst = { BIT(PERF_CSTATE_PKG_C10_RES), }; +static const struct cstate_model icx_cstates __initconst = { + .core_events = BIT(PERF_CSTATE_CORE_C1_RES) | + BIT(PERF_CSTATE_CORE_C6_RES), + + .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) | + 
BIT(PERF_CSTATE_PKG_C6_RES), +}; + +static const struct cstate_model adl_cstates __initconst = { + .core_events = BIT(PERF_CSTATE_CORE_C1_RES) | + BIT(PERF_CSTATE_CORE_C6_RES) | + BIT(PERF_CSTATE_CORE_C7_RES), + + .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) | + BIT(PERF_CSTATE_PKG_C3_RES) | + BIT(PERF_CSTATE_PKG_C6_RES) | + BIT(PERF_CSTATE_PKG_C7_RES) | + BIT(PERF_CSTATE_PKG_C8_RES) | + BIT(PERF_CSTATE_PKG_C9_RES) | + BIT(PERF_CSTATE_PKG_C10_RES), +}; + static const struct cstate_model slm_cstates __initconst = { .core_events = BIT(PERF_CSTATE_CORE_C1_RES) | BIT(PERF_CSTATE_CORE_C6_RES), @@ -594,63 +620,72 @@ static const struct cstate_model glm_cstates __initconst = { }; -#define X86_CSTATES_MODEL(model, states) \ - { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long) &(states) } - static const struct x86_cpu_id intel_cstates_match[] __initconst = { - X86_CSTATES_MODEL(INTEL_FAM6_NEHALEM, nhm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_NEHALEM_EP, nhm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_NEHALEM_EX, nhm_cstates), - - X86_CSTATES_MODEL(INTEL_FAM6_WESTMERE, nhm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_WESTMERE_EP, nhm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_WESTMERE_EX, nhm_cstates), - - X86_CSTATES_MODEL(INTEL_FAM6_SANDYBRIDGE, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_SANDYBRIDGE_X, snb_cstates), - - X86_CSTATES_MODEL(INTEL_FAM6_IVYBRIDGE, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_IVYBRIDGE_X, snb_cstates), - - X86_CSTATES_MODEL(INTEL_FAM6_HASWELL, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_X, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_G, snb_cstates), - - X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_L, hswult_cstates), - - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT, slm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT_D, slm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_AIRMONT, slm_cstates), - - X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_D, snb_cstates), - 
X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_G, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_X, snb_cstates), - - X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_L, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_X, snb_cstates), - - X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_L, hswult_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE, hswult_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_COMETLAKE_L, hswult_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_COMETLAKE, hswult_cstates), - - X86_CSTATES_MODEL(INTEL_FAM6_CANNONLAKE_L, cnl_cstates), - - X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNL, knl_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNM, knl_cstates), - - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT, glm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_D, glm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_PLUS, glm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_TREMONT_D, glm_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ATOM_TREMONT, glm_cstates), - - X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE_L, icl_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE, icl_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_TIGERLAKE_L, icl_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_TIGERLAKE, icl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(NEHALEM, &nhm_cstates), + X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EP, &nhm_cstates), + X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EX, &nhm_cstates), + + X86_MATCH_INTEL_FAM6_MODEL(WESTMERE, &nhm_cstates), + X86_MATCH_INTEL_FAM6_MODEL(WESTMERE_EP, &nhm_cstates), + X86_MATCH_INTEL_FAM6_MODEL(WESTMERE_EX, &nhm_cstates), + + X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE, &snb_cstates), + X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X, &snb_cstates), + + X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE, &snb_cstates), + X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &snb_cstates), + + X86_MATCH_INTEL_FAM6_MODEL(HASWELL, &snb_cstates), + X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &snb_cstates), + X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G, &snb_cstates), + + 
X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L, &hswult_cstates), + + X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT, &slm_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_SILVERMONT_D, &slm_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_AIRMONT, &slm_cstates), + + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, &snb_cstates), + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &snb_cstates), + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G, &snb_cstates), + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &snb_cstates), + + X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L, &snb_cstates), + X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE, &snb_cstates), + X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &snb_cstates), + + X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L, &hswult_cstates), + X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, &hswult_cstates), + X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &hswult_cstates), + X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &hswult_cstates), + + X86_MATCH_INTEL_FAM6_MODEL(CANNONLAKE_L, &cnl_cstates), + + X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &knl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &knl_cstates), + + X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT, &glm_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D, &glm_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS, &glm_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &glm_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, &glm_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, &glm_cstates), + + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &icl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &icx_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &icx_cstates), + X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &icx_cstates), + + X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &icl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &icl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &icl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &adl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &adl_cstates), + 
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &adl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &adl_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index dc43cc124e09..446d2833efa7 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -36,7 +36,9 @@ union intel_x86_pebs_dse { unsigned int ld_dse:4; unsigned int ld_stlb_miss:1; unsigned int ld_locked:1; - unsigned int ld_reserved:26; + unsigned int ld_data_blk:1; + unsigned int ld_addr_blk:1; + unsigned int ld_reserved:24; }; struct { unsigned int st_l1d_hit:1; @@ -45,6 +47,12 @@ union intel_x86_pebs_dse { unsigned int st_locked:1; unsigned int st_reserved2:26; }; + struct { + unsigned int st_lat_dse:4; + unsigned int st_lat_stlb_miss:1; + unsigned int st_lat_locked:1; + unsigned int ld_reserved3:26; + }; }; @@ -86,15 +94,45 @@ void __init intel_pmu_pebs_data_source_nhm(void) pebs_data_source[0x07] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM); } -void __init intel_pmu_pebs_data_source_skl(bool pmem) +static void __init __intel_pmu_pebs_data_source_skl(bool pmem, u64 *data_source) { u64 pmem_or_l4 = pmem ? 
LEVEL(PMEM) : LEVEL(L4); - pebs_data_source[0x08] = OP_LH | pmem_or_l4 | P(SNOOP, HIT); - pebs_data_source[0x09] = OP_LH | pmem_or_l4 | REM | P(SNOOP, HIT); - pebs_data_source[0x0b] = OP_LH | LEVEL(RAM) | REM | P(SNOOP, NONE); - pebs_data_source[0x0c] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOPX, FWD); - pebs_data_source[0x0d] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOP, HITM); + data_source[0x08] = OP_LH | pmem_or_l4 | P(SNOOP, HIT); + data_source[0x09] = OP_LH | pmem_or_l4 | REM | P(SNOOP, HIT); + data_source[0x0b] = OP_LH | LEVEL(RAM) | REM | P(SNOOP, NONE); + data_source[0x0c] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOPX, FWD); + data_source[0x0d] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOP, HITM); +} + +void __init intel_pmu_pebs_data_source_skl(bool pmem) +{ + __intel_pmu_pebs_data_source_skl(pmem, pebs_data_source); +} + +static void __init __intel_pmu_pebs_data_source_grt(u64 *data_source) +{ + data_source[0x05] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT); + data_source[0x06] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM); + data_source[0x08] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOPX, FWD); +} + +void __init intel_pmu_pebs_data_source_grt(void) +{ + __intel_pmu_pebs_data_source_grt(pebs_data_source); +} + +void __init intel_pmu_pebs_data_source_adl(void) +{ + u64 *data_source; + + data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX].pebs_data_source; + memcpy(data_source, pebs_data_source, sizeof(pebs_data_source)); + __intel_pmu_pebs_data_source_skl(false, data_source); + + data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX].pebs_data_source; + memcpy(data_source, pebs_data_source, sizeof(pebs_data_source)); + __intel_pmu_pebs_data_source_grt(data_source); } static u64 precise_store_data(u64 status) @@ -163,7 +201,50 @@ static u64 precise_datala_hsw(struct perf_event *event, u64 status) return dse.val; } -static u64 load_latency_data(u64 status) +static inline void pebs_set_tlb_lock(u64 *val, bool tlb, bool lock) +{ + /* + * TLB access + * 
0 = did not miss 2nd level TLB + * 1 = missed 2nd level TLB + */ + if (tlb) + *val |= P(TLB, MISS) | P(TLB, L2); + else + *val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2); + + /* locked prefix */ + if (lock) + *val |= P(LOCK, LOCKED); +} + +/* Retrieve the latency data for e-core of ADL */ +u64 adl_latency_data_small(struct perf_event *event, u64 status) +{ + union intel_x86_pebs_dse dse; + u64 val; + + WARN_ON_ONCE(hybrid_pmu(event->pmu)->cpu_type == hybrid_big); + + dse.val = status; + + val = hybrid_var(event->pmu, pebs_data_source)[dse.ld_dse]; + + /* + * For the atom core on ADL, + * bit 4: lock, bit 5: TLB access. + */ + pebs_set_tlb_lock(&val, dse.ld_locked, dse.ld_stlb_miss); + + if (dse.ld_data_blk) + val |= P(BLK, DATA); + else + val |= P(BLK, NA); + + return val; +} + +static u64 load_latency_data(struct perf_event *event, u64 status) { union intel_x86_pebs_dse dse; u64 val; @@ -173,7 +254,7 @@ static u64 load_latency_data(u64 status) /* * use the mapping table for bit 0-3 */ - val = pebs_data_source[dse.ld_dse]; + val = hybrid_var(event->pmu, pebs_data_source)[dse.ld_dse]; /* * Nehalem models do not support TLB, Lock infos @@ -182,25 +263,63 @@ static u64 load_latency_data(u64 status) val |= P(TLB, NA) | P(LOCK, NA); return val; } + + pebs_set_tlb_lock(&val, dse.ld_stlb_miss, dse.ld_locked); + /* - * bit 4: TLB access - * 0 = did not miss 2nd level TLB - * 1 = missed 2nd level TLB + * Ice Lake and earlier models do not support block infos. 
*/ - if (dse.ld_stlb_miss) - val |= P(TLB, MISS) | P(TLB, L2); - else - val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2); + if (!x86_pmu.pebs_block) { + val |= P(BLK, NA); + return val; + } + /* + * bit 6: load was blocked since its data could not be forwarded + * from a preceding store + */ + if (dse.ld_data_blk) + val |= P(BLK, DATA); /* - * bit 5: locked prefix + * bit 7: load was blocked due to potential address conflict with + * a preceding store */ - if (dse.ld_locked) - val |= P(LOCK, LOCKED); + if (dse.ld_addr_blk) + val |= P(BLK, ADDR); + + if (!dse.ld_data_blk && !dse.ld_addr_blk) + val |= P(BLK, NA); return val; } +static u64 store_latency_data(struct perf_event *event, u64 status) +{ + union intel_x86_pebs_dse dse; + union perf_mem_data_src src; + u64 val; + + dse.val = status; + + /* + * use the mapping table for bit 0-3 + */ + val = hybrid_var(event->pmu, pebs_data_source)[dse.st_lat_dse]; + + pebs_set_tlb_lock(&val, dse.st_lat_stlb_miss, dse.st_lat_locked); + + val |= P(BLK, NA); + + /* + * the pebs_data_source table is only for loads + * so override the mem_op to say STORE instead + */ + src.val = val; + src.mem_op = P(OP,STORE); + + return src.val; +} + struct pebs_record_core { u64 flags, ip; u64 ax, bx, cx, dx; @@ -642,8 +761,8 @@ int intel_pmu_drain_bts_buffer(void) rcu_read_lock(); perf_prepare_sample(&header, &data, event, &regs); - if (perf_output_begin(&handle, event, header.size * - (top - base - skip))) + if (perf_output_begin(&handle, &data, event, + header.size * (top - base - skip))) goto unlock; for (at = base; at < top; at++) { @@ -670,9 +789,9 @@ unlock: static inline void intel_pmu_drain_pebs_buffer(void) { - struct pt_regs regs; + struct perf_sample_data data; - x86_pmu.drain_pebs(&regs); + x86_pmu.drain_pebs(NULL, &data); } /* @@ -714,6 +833,13 @@ struct event_constraint intel_glm_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; +struct event_constraint intel_grt_pebs_event_constraints[] = { + /* Allow all events as PEBS with no flags 
*/ + INTEL_HYBRID_LAT_CONSTRAINT(0x5d0, 0x3), + INTEL_HYBRID_LAT_CONSTRAINT(0x6d0, 0xf), + EVENT_CONSTRAINT_END +}; + struct event_constraint intel_nehalem_pebs_event_constraints[] = { INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ INTEL_FLAGS_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ @@ -851,12 +977,18 @@ struct event_constraint intel_skl_pebs_event_constraints[] = { }; struct event_constraint intel_icl_pebs_event_constraints[] = { - INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x01c0, 0x100000000ULL), /* old INST_RETIRED.PREC_DIST */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x0100, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */ INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL), /* SLOTS */ INTEL_PLD_CONSTRAINT(0x1cd, 0xff), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x1d0, 0xf), /* MEM_INST_RETIRED.LOAD */ - INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x2d0, 0xf), /* MEM_INST_RETIRED.STORE */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(0xd1, 0xd4, 0xf), /* MEM_LOAD_*_RETIRED.* */ @@ -870,15 +1002,43 @@ struct event_constraint intel_icl_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; +struct event_constraint intel_spr_pebs_event_constraints[] = { + INTEL_FLAGS_UEVENT_CONSTRAINT(0x100, 0x100000000ULL), /* 
INST_RETIRED.PREC_DIST */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x800000000ULL), + + INTEL_FLAGS_EVENT_CONSTRAINT(0xc0, 0xfe), + INTEL_PLD_CONSTRAINT(0x1cd, 0xfe), + INTEL_PSD_CONSTRAINT(0x2cd, 0x1), + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */ + + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(0xd1, 0xd4, 0xf), + + INTEL_FLAGS_EVENT_CONSTRAINT(0xd0, 0xf), + + /* + * Everything else is handled by PMU_FL_PEBS_ALL, because we + * need the full constraints from the main table. 
+ */ + + EVENT_CONSTRAINT_END +}; + struct event_constraint *intel_pebs_constraints(struct perf_event *event) { + struct event_constraint *pebs_constraints = hybrid(event->pmu, pebs_constraints); struct event_constraint *c; if (!event->attr.precise_ip) return NULL; - if (x86_pmu.pebs_constraints) { - for_each_event_constraint(c, x86_pmu.pebs_constraints) { + if (pebs_constraints) { + for_each_event_constraint(c, pebs_constraints) { if (constraint_match(c, event->hw.config)) { event->hw.flags |= c->flags; return c; @@ -920,6 +1080,8 @@ void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in) static inline void pebs_update_threshold(struct cpu_hw_events *cpuc) { struct debug_store *ds = cpuc->ds; + int max_pebs_events = hybrid(cpuc->pmu, max_pebs_events); + int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed); u64 threshold; int reserved; @@ -927,9 +1089,9 @@ static inline void pebs_update_threshold(struct cpu_hw_events *cpuc) return; if (x86_pmu.flags & PMU_FL_PEBS_ALL) - reserved = x86_pmu.max_pebs_events + x86_pmu.num_counters_fixed; + reserved = max_pebs_events + num_counters_fixed; else - reserved = x86_pmu.max_pebs_events; + reserved = max_pebs_events; if (cpuc->n_pebs == cpuc->n_large_pebs) { threshold = ds->pebs_absolute_maximum - @@ -954,14 +1116,16 @@ static void adaptive_pebs_record_size_update(void) if (pebs_data_cfg & PEBS_DATACFG_XMMS) sz += sizeof(struct pebs_xmm); if (pebs_data_cfg & PEBS_DATACFG_LBRS) - sz += x86_pmu.lbr_nr * sizeof(struct pebs_lbr_entry); + sz += x86_pmu.lbr_nr * sizeof(struct lbr_entry); cpuc->pebs_record_size = sz; } #define PERF_PEBS_MEMINFO_TYPE (PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC | \ - PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_WEIGHT | \ - PERF_SAMPLE_TRANSACTION) + PERF_SAMPLE_PHYS_ADDR | \ + PERF_SAMPLE_WEIGHT_TYPE | \ + PERF_SAMPLE_TRANSACTION | \ + PERF_SAMPLE_DATA_PAGE_SIZE) static u64 pebs_update_adaptive_cfg(struct perf_event *event) { @@ -986,7 +1150,7 @@ static u64 
pebs_update_adaptive_cfg(struct perf_event *event) gprs = (sample_type & PERF_SAMPLE_REGS_INTR) && (attr->sample_regs_intr & PEBS_GP_REGS); - tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT) && + tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT_TYPE) && ((attr->config & INTEL_ARCH_EVENT_MASK) == x86_pmu.rtm_abort_event); @@ -1088,6 +1252,9 @@ static void intel_pmu_pebs_via_pt_enable(struct perf_event *event) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; struct debug_store *ds = cpuc->ds; + u64 value = ds->pebs_event_reset[hwc->idx]; + u32 base = MSR_RELOAD_PMC0; + unsigned int idx = hwc->idx; if (!is_pebs_pt(event)) return; @@ -1097,7 +1264,15 @@ static void intel_pmu_pebs_via_pt_enable(struct perf_event *event) cpuc->pebs_enabled |= PEBS_OUTPUT_PT; - wrmsrl(MSR_RELOAD_PMC0 + hwc->idx, ds->pebs_event_reset[hwc->idx]); + if (hwc->idx >= INTEL_PMC_IDX_FIXED) { + base = MSR_RELOAD_FIXED_CTR0; + idx = hwc->idx - INTEL_PMC_IDX_FIXED; + if (x86_pmu.intel_cap.pebs_format < 5) + value = ds->pebs_event_reset[MAX_PEBS_EVENTS_FMT4 + idx]; + else + value = ds->pebs_event_reset[MAX_PEBS_EVENTS + idx]; + } + wrmsrl(base + idx, value); } void intel_pmu_pebs_enable(struct perf_event *event) @@ -1105,6 +1280,7 @@ void intel_pmu_pebs_enable(struct perf_event *event) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; struct debug_store *ds = cpuc->ds; + unsigned int idx = hwc->idx; hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; @@ -1123,19 +1299,22 @@ void intel_pmu_pebs_enable(struct perf_event *event) } } + if (idx >= INTEL_PMC_IDX_FIXED) { + if (x86_pmu.intel_cap.pebs_format < 5) + idx = MAX_PEBS_EVENTS_FMT4 + (idx - INTEL_PMC_IDX_FIXED); + else + idx = MAX_PEBS_EVENTS + (idx - INTEL_PMC_IDX_FIXED); + } + /* * Use auto-reload if possible to save a MSR write in the PMI. * This must be done in pmu::start(), because PERF_EVENT_IOC_PERIOD. 
*/ if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { - unsigned int idx = hwc->idx; - - if (idx >= INTEL_PMC_IDX_FIXED) - idx = MAX_PEBS_EVENTS + (idx - INTEL_PMC_IDX_FIXED); ds->pebs_event_reset[idx] = (u64)(-hwc->sample_period) & x86_pmu.cntval_mask; } else { - ds->pebs_event_reset[hwc->idx] = 0; + ds->pebs_event_reset[idx] = 0; } intel_pmu_pebs_via_pt_enable(event); @@ -1194,7 +1373,7 @@ void intel_pmu_pebs_disable_all(void) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); if (cpuc->pebs_enabled) - wrmsrl(MSR_IA32_PEBS_ENABLE, 0); + __intel_pmu_pebs_disable_all(); } static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) @@ -1261,17 +1440,16 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) old_to = to; #ifdef CONFIG_X86_64 - is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32); + is_64bit = kernel_ip(to) || any_64bit_mode(regs); #endif insn_init(&insn, kaddr, size, is_64bit); - insn_get_length(&insn); + /* - * Make sure there was not a problem decoding the - * instruction and getting the length. This is - * doubly important because we have an infinite - * loop if insn.length=0. + * Make sure there was not a problem decoding the instruction. + * This is doubly important because we have an infinite loop if + * insn.length=0. 
*/ - if (!insn.length) + if (insn_get_length(&insn)) break; to += insn.length; @@ -1329,7 +1507,11 @@ static u64 get_data_src(struct perf_event *event, u64 aux) bool fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC); if (fl & PERF_X86_EVENT_PEBS_LDLAT) - val = load_latency_data(aux); + val = load_latency_data(event, aux); + else if (fl & PERF_X86_EVENT_PEBS_STLAT) + val = store_latency_data(event, aux); + else if (fl & PERF_X86_EVENT_PEBS_LAT_HYBRID) + val = x86_pmu.pebs_latency_data(event, aux); else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC)) val = precise_datala_hsw(event, aux); else if (fst) @@ -1337,6 +1519,10 @@ static u64 get_data_src(struct perf_event *event, u64 aux) return val; } +#define PERF_SAMPLE_ADDR_TYPE (PERF_SAMPLE_ADDR | \ + PERF_SAMPLE_PHYS_ADDR | \ + PERF_SAMPLE_DATA_PAGE_SIZE) + static void setup_pebs_fixed_sample_data(struct perf_event *event, struct pt_regs *iregs, void *__pebs, struct perf_sample_data *data, @@ -1364,14 +1550,18 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, /* * Use latency for weight (only avail with PEBS-LL) */ - if (fll && (sample_type & PERF_SAMPLE_WEIGHT)) - data->weight = pebs->lat; + if (fll && (sample_type & PERF_SAMPLE_WEIGHT_TYPE)) { + data->weight.full = pebs->lat; + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; + } /* * data.data_src encodes the data source */ - if (sample_type & PERF_SAMPLE_DATA_SRC) + if (sample_type & PERF_SAMPLE_DATA_SRC) { data->data_src.val = get_data_src(event, pebs->dse); + data->sample_flags |= PERF_SAMPLE_DATA_SRC; + } /* * We must however always use iregs for the unwinder to stay sane; the @@ -1379,8 +1569,10 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, * previous PMI context or an (I)RET happened between the record and * PMI. 
*/ - if (sample_type & PERF_SAMPLE_CALLCHAIN) + if (sample_type & PERF_SAMPLE_CALLCHAIN) { data->callchain = perf_callchain(event, iregs); + data->sample_flags |= PERF_SAMPLE_CALLCHAIN; + } /* * We use the interrupt regs as a base because the PEBS record does not @@ -1451,18 +1643,23 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, } - if ((sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) && - x86_pmu.intel_cap.pebs_format >= 1) + if ((sample_type & PERF_SAMPLE_ADDR_TYPE) && + x86_pmu.intel_cap.pebs_format >= 1) { data->addr = pebs->dla; + data->sample_flags |= PERF_SAMPLE_ADDR; + } if (x86_pmu.intel_cap.pebs_format >= 2) { /* Only set the TSX weight when no memory weight. */ - if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll) - data->weight = intel_get_tsx_weight(pebs->tsx_tuning); - - if (sample_type & PERF_SAMPLE_TRANSACTION) + if ((sample_type & PERF_SAMPLE_WEIGHT_TYPE) && !fll) { + data->weight.full = intel_get_tsx_weight(pebs->tsx_tuning); + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; + } + if (sample_type & PERF_SAMPLE_TRANSACTION) { data->txn = intel_get_tsx_transaction(pebs->tsx_tuning, pebs->ax); + data->sample_flags |= PERF_SAMPLE_TRANSACTION; + } } /* @@ -1472,11 +1669,15 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, * We can only do this for the default trace clock. 
*/ if (x86_pmu.intel_cap.pebs_format >= 3 && - event->attr.use_clockid == 0) + event->attr.use_clockid == 0) { data->time = native_sched_clock_from_tsc(pebs->tsc); + data->sample_flags |= PERF_SAMPLE_TIME; + } - if (has_branch_stack(event)) + if (has_branch_stack(event)) { data->br_stack = &cpuc->lbr_stack; + data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; + } } static void adaptive_pebs_save_regs(struct pt_regs *regs, @@ -1502,6 +1703,9 @@ static void adaptive_pebs_save_regs(struct pt_regs *regs, #endif } +#define PEBS_LATENCY_MASK 0xffff +#define PEBS_CACHE_LATENCY_OFFSET 32 + /* * With adaptive PEBS the layout depends on what fields are configured. */ @@ -1531,8 +1735,10 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, perf_sample_data_init(data, 0, event->hw.last_period); data->period = event->hw.last_period; - if (event->attr.use_clockid == 0) + if (event->attr.use_clockid == 0) { data->time = native_sched_clock_from_tsc(basic->tsc); + data->sample_flags |= PERF_SAMPLE_TIME; + } /* * We must however always use iregs for the unwinder to stay sane; the @@ -1540,8 +1746,10 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, * previous PMI context or an (I)RET happened between the record and * PMI. 
*/ - if (sample_type & PERF_SAMPLE_CALLCHAIN) + if (sample_type & PERF_SAMPLE_CALLCHAIN) { data->callchain = perf_callchain(event, iregs); + data->sample_flags |= PERF_SAMPLE_CALLCHAIN; + } *regs = *iregs; /* The ip in basic is EventingIP */ @@ -1572,19 +1780,44 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, } if (format_size & PEBS_DATACFG_MEMINFO) { - if (sample_type & PERF_SAMPLE_WEIGHT) - data->weight = meminfo->latency ?: - intel_get_tsx_weight(meminfo->tsx_tuning); + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) { + u64 weight = meminfo->latency; + + if (x86_pmu.flags & PMU_FL_INSTR_LATENCY) { + data->weight.var2_w = weight & PEBS_LATENCY_MASK; + weight >>= PEBS_CACHE_LATENCY_OFFSET; + } - if (sample_type & PERF_SAMPLE_DATA_SRC) + /* + * Although meminfo::latency is defined as a u64, + * only the lower 32 bits include the valid data + * in practice on Ice Lake and earlier platforms. + */ + if (sample_type & PERF_SAMPLE_WEIGHT) { + data->weight.full = weight ?: + intel_get_tsx_weight(meminfo->tsx_tuning); + } else { + data->weight.var1_dw = (u32)(weight & PEBS_LATENCY_MASK) ?: + intel_get_tsx_weight(meminfo->tsx_tuning); + } + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; + } + + if (sample_type & PERF_SAMPLE_DATA_SRC) { data->data_src.val = get_data_src(event, meminfo->aux); + data->sample_flags |= PERF_SAMPLE_DATA_SRC; + } - if (sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) + if (sample_type & PERF_SAMPLE_ADDR_TYPE) { data->addr = meminfo->address; + data->sample_flags |= PERF_SAMPLE_ADDR; + } - if (sample_type & PERF_SAMPLE_TRANSACTION) + if (sample_type & PERF_SAMPLE_TRANSACTION) { data->txn = intel_get_tsx_transaction(meminfo->tsx_tuning, gprs ? 
gprs->ax : 0); + data->sample_flags |= PERF_SAMPLE_TRANSACTION; + } } if (format_size & PEBS_DATACFG_XMMS) { @@ -1595,14 +1828,15 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, } if (format_size & PEBS_DATACFG_LBRS) { - struct pebs_lbr *lbr = next_record; + struct lbr_entry *lbr = next_record; int num_lbr = ((format_size >> PEBS_DATACFG_LBR_SHIFT) & 0xff) + 1; - next_record = next_record + num_lbr*sizeof(struct pebs_lbr_entry); + next_record = next_record + num_lbr * sizeof(struct lbr_entry); if (has_branch_stack(event)) { intel_pmu_store_pebs_lbrs(lbr); data->br_stack = &cpuc->lbr_stack; + data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; } } @@ -1689,7 +1923,7 @@ intel_pmu_save_and_restart_reload(struct perf_event *event, int count) * * [-period, 0] * - * the difference between two consequtive reads is: + * the difference between two consecutive reads is: * * A) value2 - value1; * when no overflows have happened in between, @@ -1721,22 +1955,24 @@ intel_pmu_save_and_restart_reload(struct perf_event *event, int count) return 0; } -static void __intel_pmu_pebs_event(struct perf_event *event, - struct pt_regs *iregs, - void *base, void *top, - int bit, int count, - void (*setup_sample)(struct perf_event *, - struct pt_regs *, - void *, - struct perf_sample_data *, - struct pt_regs *)) +static __always_inline void +__intel_pmu_pebs_event(struct perf_event *event, + struct pt_regs *iregs, + struct perf_sample_data *data, + void *base, void *top, + int bit, int count, + void (*setup_sample)(struct perf_event *, + struct pt_regs *, + void *, + struct perf_sample_data *, + struct pt_regs *)) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - struct perf_sample_data data; struct x86_perf_regs perf_regs; struct pt_regs *regs = &perf_regs.regs; void *at = get_next_pebs_record_by_bit(base, top, bit); + static struct pt_regs dummy_iregs; if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { /* @@ -1749,28 
+1985,37 @@ static void __intel_pmu_pebs_event(struct perf_event *event, } else if (!intel_pmu_save_and_restart(event)) return; + if (!iregs) + iregs = &dummy_iregs; + while (count > 1) { - setup_sample(event, iregs, at, &data, regs); - perf_event_output(event, &data, regs); + setup_sample(event, iregs, at, data, regs); + perf_event_output(event, data, regs); at += cpuc->pebs_record_size; at = get_next_pebs_record_by_bit(at, top, bit); count--; } - setup_sample(event, iregs, at, &data, regs); - - /* - * All but the last records are processed. - * The last one is left to be able to call the overflow handler. - */ - if (perf_event_overflow(event, &data, regs)) { - x86_pmu_stop(event, 0); - return; + setup_sample(event, iregs, at, data, regs); + if (iregs == &dummy_iregs) { + /* + * The PEBS records may be drained in the non-overflow context, + * e.g., large PEBS + context switch. Perf should treat the + * last record the same as other PEBS records, and doesn't + * invoke the generic overflow handler. + */ + perf_event_output(event, data, regs); + } else { + /* + * All but the last records are processed. + * The last one is left to be able to call the overflow handler. 
+ */ + if (perf_event_overflow(event, data, regs)) + x86_pmu_stop(event, 0); } - } -static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) +static void intel_pmu_drain_pebs_core(struct pt_regs *iregs, struct perf_sample_data *data) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct debug_store *ds = cpuc->ds; @@ -1804,7 +2049,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) return; } - __intel_pmu_pebs_event(event, iregs, at, top, 0, n, + __intel_pmu_pebs_event(event, iregs, data, at, top, 0, n, setup_pebs_fixed_sample_data); } @@ -1827,7 +2072,7 @@ static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int } } -static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) +static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_data *data) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct debug_store *ds = cpuc->ds; @@ -1883,7 +2128,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) */ if (!pebs_status && cpuc->pebs_enabled && !(cpuc->pebs_enabled & (cpuc->pebs_enabled-1))) - pebs_status = cpuc->pebs_enabled; + pebs_status = p->status = cpuc->pebs_enabled; bit = find_first_bit((unsigned long *)&pebs_status, x86_pmu.max_pebs_events); @@ -1905,7 +2150,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) * that caused the PEBS record. It's called collision. * If collision happened, the record will be dropped. 
*/ - if (p->status != (1ULL << bit)) { + if (pebs_status != (1ULL << bit)) { for_each_set_bit(i, (unsigned long *)&pebs_status, size) error[i]++; continue; @@ -1929,22 +2174,24 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) if (error[bit]) { perf_log_lost_samples(event, error[bit]); - if (perf_event_account_interrupt(event)) + if (iregs && perf_event_account_interrupt(event)) x86_pmu_stop(event, 0); } if (counts[bit]) { - __intel_pmu_pebs_event(event, iregs, base, + __intel_pmu_pebs_event(event, iregs, data, base, top, bit, counts[bit], setup_pebs_fixed_sample_data); } } } -static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs) +static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs, struct perf_sample_data *data) { short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {}; struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int max_pebs_events = hybrid(cpuc->pmu, max_pebs_events); + int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed); struct debug_store *ds = cpuc->ds; struct perf_event *event; void *base, *at, *top; @@ -1959,9 +2206,9 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs) ds->pebs_index = ds->pebs_buffer_base; - mask = ((1ULL << x86_pmu.max_pebs_events) - 1) | - (((1ULL << x86_pmu.num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED); - size = INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed; + mask = ((1ULL << max_pebs_events) - 1) | + (((1ULL << num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED); + size = INTEL_PMC_IDX_FIXED + num_counters_fixed; if (unlikely(base >= top)) { intel_pmu_pebs_event_update_no_drain(cpuc, size); @@ -1989,7 +2236,7 @@ static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs) if (WARN_ON_ONCE(!event->attr.precise_ip)) continue; - __intel_pmu_pebs_event(event, iregs, base, + __intel_pmu_pebs_event(event, iregs, data, base, top, bit, counts[bit], setup_pebs_adaptive_sample_data); } @@ -2057,6 +2304,7 @@ void __init intel_ds_init(void) break; case 4: + case 5: 
x86_pmu.drain_pebs = intel_pmu_drain_pebs_icl; x86_pmu.pebs_record_size = sizeof(struct pebs_basic); if (x86_pmu.intel_cap.pebs_baseline) { @@ -2064,8 +2312,9 @@ void __init intel_ds_init(void) PERF_SAMPLE_BRANCH_STACK | PERF_SAMPLE_TIME; x86_pmu.flags |= PMU_FL_PEBS_ALL; + x86_pmu.pebs_capable = ~0ULL; pebs_qual = "-baseline"; - x86_get_pmu()->capabilities |= PERF_PMU_CAP_EXTENDED_REGS; + x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_EXTENDED_REGS; } else { /* Only basic record supported */ x86_pmu.large_pebs_flags &= @@ -2078,9 +2327,9 @@ void __init intel_ds_init(void) } pr_cont("PEBS fmt4%c%s, ", pebs_type, pebs_qual); - if (x86_pmu.intel_cap.pebs_output_pt_available) { + if (!is_hybrid() && x86_pmu.intel_cap.pebs_output_pt_available) { pr_cont("PEBS-via-PT, "); - x86_get_pmu()->capabilities |= PERF_PMU_CAP_AUX_OUTPUT; + x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_AUX_OUTPUT; } break; diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 534c76606049..8259d725054d 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -4,29 +4,9 @@ #include <asm/perf_event.h> #include <asm/msr.h> -#include <asm/insn.h> #include "../perf_event.h" -enum { - LBR_FORMAT_32 = 0x00, - LBR_FORMAT_LIP = 0x01, - LBR_FORMAT_EIP = 0x02, - LBR_FORMAT_EIP_FLAGS = 0x03, - LBR_FORMAT_EIP_FLAGS2 = 0x04, - LBR_FORMAT_INFO = 0x05, - LBR_FORMAT_TIME = 0x06, - LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_TIME, -}; - -static const enum { - LBR_EIP_FLAGS = 1, - LBR_TSX = 2, -} lbr_desc[LBR_FORMAT_MAX_KNOWN + 1] = { - [LBR_FORMAT_EIP_FLAGS] = LBR_EIP_FLAGS, - [LBR_FORMAT_EIP_FLAGS2] = LBR_EIP_FLAGS | LBR_TSX, -}; - /* * Intel LBR_SELECT bits * Intel Vol3a, April 2011, Section 16.7 Table 16-10 @@ -85,65 +65,52 @@ static const enum { #define LBR_FROM_SIGNEXT_2MSB (BIT_ULL(60) | BIT_ULL(59)) /* - * x86control flow change classification - * x86control flow changes include branches, interrupts, traps, faults + * Intel LBR_CTL bits + * + 
* Hardware branch filter for Arch LBR */ -enum { - X86_BR_NONE = 0, /* unknown */ - - X86_BR_USER = 1 << 0, /* branch target is user */ - X86_BR_KERNEL = 1 << 1, /* branch target is kernel */ - - X86_BR_CALL = 1 << 2, /* call */ - X86_BR_RET = 1 << 3, /* return */ - X86_BR_SYSCALL = 1 << 4, /* syscall */ - X86_BR_SYSRET = 1 << 5, /* syscall return */ - X86_BR_INT = 1 << 6, /* sw interrupt */ - X86_BR_IRET = 1 << 7, /* return from interrupt */ - X86_BR_JCC = 1 << 8, /* conditional */ - X86_BR_JMP = 1 << 9, /* jump */ - X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */ - X86_BR_IND_CALL = 1 << 11,/* indirect calls */ - X86_BR_ABORT = 1 << 12,/* transaction abort */ - X86_BR_IN_TX = 1 << 13,/* in transaction */ - X86_BR_NO_TX = 1 << 14,/* not in transaction */ - X86_BR_ZERO_CALL = 1 << 15,/* zero length call */ - X86_BR_CALL_STACK = 1 << 16,/* call stack */ - X86_BR_IND_JMP = 1 << 17,/* indirect jump */ - - X86_BR_TYPE_SAVE = 1 << 18,/* indicate to save branch type */ +#define ARCH_LBR_KERNEL_BIT 1 /* capture at ring0 */ +#define ARCH_LBR_USER_BIT 2 /* capture at ring > 0 */ +#define ARCH_LBR_CALL_STACK_BIT 3 /* enable call stack */ +#define ARCH_LBR_JCC_BIT 16 /* capture conditional branches */ +#define ARCH_LBR_REL_JMP_BIT 17 /* capture relative jumps */ +#define ARCH_LBR_IND_JMP_BIT 18 /* capture indirect jumps */ +#define ARCH_LBR_REL_CALL_BIT 19 /* capture relative calls */ +#define ARCH_LBR_IND_CALL_BIT 20 /* capture indirect calls */ +#define ARCH_LBR_RETURN_BIT 21 /* capture near returns */ +#define ARCH_LBR_OTHER_BRANCH_BIT 22 /* capture other branches */ + +#define ARCH_LBR_KERNEL (1ULL << ARCH_LBR_KERNEL_BIT) +#define ARCH_LBR_USER (1ULL << ARCH_LBR_USER_BIT) +#define ARCH_LBR_CALL_STACK (1ULL << ARCH_LBR_CALL_STACK_BIT) +#define ARCH_LBR_JCC (1ULL << ARCH_LBR_JCC_BIT) +#define ARCH_LBR_REL_JMP (1ULL << ARCH_LBR_REL_JMP_BIT) +#define ARCH_LBR_IND_JMP (1ULL << ARCH_LBR_IND_JMP_BIT) +#define ARCH_LBR_REL_CALL (1ULL << ARCH_LBR_REL_CALL_BIT) +#define 
ARCH_LBR_IND_CALL (1ULL << ARCH_LBR_IND_CALL_BIT) +#define ARCH_LBR_RETURN (1ULL << ARCH_LBR_RETURN_BIT) +#define ARCH_LBR_OTHER_BRANCH (1ULL << ARCH_LBR_OTHER_BRANCH_BIT) + +#define ARCH_LBR_ANY \ + (ARCH_LBR_JCC |\ + ARCH_LBR_REL_JMP |\ + ARCH_LBR_IND_JMP |\ + ARCH_LBR_REL_CALL |\ + ARCH_LBR_IND_CALL |\ + ARCH_LBR_RETURN |\ + ARCH_LBR_OTHER_BRANCH) + +#define ARCH_LBR_CTL_MASK 0x7f000e -}; +static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc); -#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) -#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX) - -#define X86_BR_ANY \ - (X86_BR_CALL |\ - X86_BR_RET |\ - X86_BR_SYSCALL |\ - X86_BR_SYSRET |\ - X86_BR_INT |\ - X86_BR_IRET |\ - X86_BR_JCC |\ - X86_BR_JMP |\ - X86_BR_IRQ |\ - X86_BR_ABORT |\ - X86_BR_IND_CALL |\ - X86_BR_IND_JMP |\ - X86_BR_ZERO_CALL) - -#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) - -#define X86_BR_ANY_CALL \ - (X86_BR_CALL |\ - X86_BR_IND_CALL |\ - X86_BR_ZERO_CALL |\ - X86_BR_SYSCALL |\ - X86_BR_IRQ |\ - X86_BR_INT) +static __always_inline bool is_lbr_call_stack_bit_set(u64 config) +{ + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) + return !!(config & ARCH_LBR_CALL_STACK); -static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc); + return !!(config & LBR_CALL_STACK); +} /* * We only support LBR implementations that have FREEZE_LBRS_ON_PMI @@ -168,33 +135,32 @@ static void __intel_pmu_lbr_enable(bool pmi) */ if (cpuc->lbr_sel) lbr_select = cpuc->lbr_sel->config & x86_pmu.lbr_sel_mask; - if (!pmi && cpuc->lbr_sel) + if (!static_cpu_has(X86_FEATURE_ARCH_LBR) && !pmi && cpuc->lbr_sel) wrmsrl(MSR_LBR_SELECT, lbr_select); rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); orig_debugctl = debugctl; - debugctl |= DEBUGCTLMSR_LBR; + + if (!static_cpu_has(X86_FEATURE_ARCH_LBR)) + debugctl |= DEBUGCTLMSR_LBR; /* * LBR callstack does not work well with FREEZE_LBRS_ON_PMI. * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions * may cause superfluous increase/decrease of LBR_TOS. 
*/ - if (!(lbr_select & LBR_CALL_STACK)) + if (is_lbr_call_stack_bit_set(lbr_select)) + debugctl &= ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; + else debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI; + if (orig_debugctl != debugctl) wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); -} -static void __intel_pmu_lbr_disable(void) -{ - u64 debugctl; - - rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); - debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); - wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) + wrmsrl(MSR_ARCH_LBR_CTL, lbr_select | ARCH_LBR_CTL_LBREN); } -static void intel_pmu_lbr_reset_32(void) +void intel_pmu_lbr_reset_32(void) { int i; @@ -202,18 +168,24 @@ static void intel_pmu_lbr_reset_32(void) wrmsrl(x86_pmu.lbr_from + i, 0); } -static void intel_pmu_lbr_reset_64(void) +void intel_pmu_lbr_reset_64(void) { int i; for (i = 0; i < x86_pmu.lbr_nr; i++) { wrmsrl(x86_pmu.lbr_from + i, 0); wrmsrl(x86_pmu.lbr_to + i, 0); - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + i, 0); + if (x86_pmu.lbr_has_info) + wrmsrl(x86_pmu.lbr_info + i, 0); } } +static void intel_pmu_arch_lbr_reset(void) +{ + /* Write to ARCH_LBR_DEPTH MSR, all LBR entries are reset to 0 */ + wrmsrl(MSR_ARCH_LBR_DEPTH, x86_pmu.lbr_nr); +} + void intel_pmu_lbr_reset(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -221,13 +193,12 @@ void intel_pmu_lbr_reset(void) if (!x86_pmu.lbr_nr) return; - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) - intel_pmu_lbr_reset_32(); - else - intel_pmu_lbr_reset_64(); + x86_pmu.lbr_reset(); cpuc->last_task_ctx = NULL; cpuc->last_log_id = 0; + if (!static_cpu_has(X86_FEATURE_ARCH_LBR) && cpuc->lbr_select) + wrmsrl(MSR_LBR_SELECT, 0); } /* @@ -247,9 +218,9 @@ enum { }; /* - * For formats with LBR_TSX flags (e.g. 
LBR_FORMAT_EIP_FLAGS2), bits 61:62 in - * MSR_LAST_BRANCH_FROM_x are the TSX flags when TSX is supported, but when - * TSX is not supported they have no consistent behavior: + * For format LBR_FORMAT_EIP_FLAGS2, bits 61:62 in MSR_LAST_BRANCH_FROM_x + * are the TSX flags when TSX is supported, but when TSX is not supported + * they have no consistent behavior: * * - For wrmsr(), bits 61:62 are considered part of the sign extension. * - For HW updates (branch captures) bits 61:62 are always OFF and are not @@ -257,7 +228,7 @@ enum { * * Therefore, if: * - * 1) LBR has TSX format + * 1) LBR format LBR_FORMAT_EIP_FLAGS2 * 2) CPU has no TSX support enabled * * ... then any value passed to wrmsr() must be sign extended to 63 bits and any @@ -266,11 +237,10 @@ enum { */ static inline bool lbr_from_signext_quirk_needed(void) { - int lbr_format = x86_pmu.intel_cap.lbr_format; bool tsx_support = boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM); - return !tsx_support && (lbr_desc[lbr_format] & LBR_TSX); + return !tsx_support; } static DEFINE_STATIC_KEY_FALSE(lbr_from_quirk_key); @@ -308,119 +278,247 @@ static u64 lbr_from_signext_quirk_rd(u64 val) return val; } -static inline void wrlbr_from(unsigned int idx, u64 val) +static __always_inline void wrlbr_from(unsigned int idx, u64 val) { val = lbr_from_signext_quirk_wr(val); wrmsrl(x86_pmu.lbr_from + idx, val); } -static inline void wrlbr_to(unsigned int idx, u64 val) +static __always_inline void wrlbr_to(unsigned int idx, u64 val) { wrmsrl(x86_pmu.lbr_to + idx, val); } -static inline u64 rdlbr_from(unsigned int idx) +static __always_inline void wrlbr_info(unsigned int idx, u64 val) +{ + wrmsrl(x86_pmu.lbr_info + idx, val); +} + +static __always_inline u64 rdlbr_from(unsigned int idx, struct lbr_entry *lbr) { u64 val; + if (lbr) + return lbr->from; + rdmsrl(x86_pmu.lbr_from + idx, val); return lbr_from_signext_quirk_rd(val); } -static inline u64 rdlbr_to(unsigned int idx) +static __always_inline u64 
rdlbr_to(unsigned int idx, struct lbr_entry *lbr) { u64 val; + if (lbr) + return lbr->to; + rdmsrl(x86_pmu.lbr_to + idx, val); return val; } -static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) +static __always_inline u64 rdlbr_info(unsigned int idx, struct lbr_entry *lbr) { - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int i; - unsigned lbr_idx, mask; - u64 tos; + u64 val; - if (task_ctx->lbr_callstack_users == 0 || - task_ctx->lbr_stack_state == LBR_NONE) { - intel_pmu_lbr_reset(); - return; - } + if (lbr) + return lbr->info; - tos = task_ctx->tos; - /* - * Does not restore the LBR registers, if - * - No one else touched them, and - * - Did not enter C6 - */ - if ((task_ctx == cpuc->last_task_ctx) && - (task_ctx->log_id == cpuc->last_log_id) && - rdlbr_from(tos)) { - task_ctx->lbr_stack_state = LBR_NONE; - return; - } + rdmsrl(x86_pmu.lbr_info + idx, val); + + return val; +} + +static inline void +wrlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info) +{ + wrlbr_from(idx, lbr->from); + wrlbr_to(idx, lbr->to); + if (need_info) + wrlbr_info(idx, lbr->info); +} + +static inline bool +rdlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info) +{ + u64 from = rdlbr_from(idx, NULL); + + /* Don't read invalid entry */ + if (!from) + return false; + + lbr->from = from; + lbr->to = rdlbr_to(idx, NULL); + if (need_info) + lbr->info = rdlbr_info(idx, NULL); + + return true; +} + +void intel_pmu_lbr_restore(void *ctx) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct x86_perf_task_context *task_ctx = ctx; + bool need_info = x86_pmu.lbr_has_info; + u64 tos = task_ctx->tos; + unsigned lbr_idx, mask; + int i; mask = x86_pmu.lbr_nr - 1; for (i = 0; i < task_ctx->valid_lbrs; i++) { lbr_idx = (tos - i) & mask; - wrlbr_from(lbr_idx, task_ctx->lbr_from[i]); - wrlbr_to (lbr_idx, task_ctx->lbr_to[i]); - - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + lbr_idx, 
task_ctx->lbr_info[i]); + wrlbr_all(&task_ctx->lbr[i], lbr_idx, need_info); } for (; i < x86_pmu.lbr_nr; i++) { lbr_idx = (tos - i) & mask; wrlbr_from(lbr_idx, 0); wrlbr_to(lbr_idx, 0); - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - wrmsrl(MSR_LBR_INFO_0 + lbr_idx, 0); + if (need_info) + wrlbr_info(lbr_idx, 0); } wrmsrl(x86_pmu.lbr_tos, tos); - task_ctx->lbr_stack_state = LBR_NONE; + + if (cpuc->lbr_select) + wrmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); } -static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) +static void intel_pmu_arch_lbr_restore(void *ctx) { - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - unsigned lbr_idx, mask; - u64 tos, from; + struct x86_perf_task_context_arch_lbr *task_ctx = ctx; + struct lbr_entry *entries = task_ctx->entries; int i; - if (task_ctx->lbr_callstack_users == 0) { - task_ctx->lbr_stack_state = LBR_NONE; + /* Fast reset the LBRs before restore if the call stack is not full. */ + if (!entries[x86_pmu.lbr_nr - 1].from) + intel_pmu_arch_lbr_reset(); + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + if (!entries[i].from) + break; + wrlbr_all(&entries[i], i, true); + } +} + +/* + * Restore the Architecture LBR state from the xsave area in the perf + * context data for the task via the XRSTORS instruction. 
+ */ +static void intel_pmu_arch_lbr_xrstors(void *ctx) +{ + struct x86_perf_task_context_arch_lbr_xsave *task_ctx = ctx; + + xrstors(&task_ctx->xsave, XFEATURE_MASK_LBR); +} + +static __always_inline bool lbr_is_reset_in_cstate(void *ctx) +{ + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) + return x86_pmu.lbr_deep_c_reset && !rdlbr_from(0, NULL); + + return !rdlbr_from(((struct x86_perf_task_context *)ctx)->tos, NULL); +} + +static void __intel_pmu_lbr_restore(void *ctx) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (task_context_opt(ctx)->lbr_callstack_users == 0 || + task_context_opt(ctx)->lbr_stack_state == LBR_NONE) { + intel_pmu_lbr_reset(); + return; + } + + /* + * Does not restore the LBR registers, if + * - No one else touched them, and + * - Was not cleared in Cstate + */ + if ((ctx == cpuc->last_task_ctx) && + (task_context_opt(ctx)->log_id == cpuc->last_log_id) && + !lbr_is_reset_in_cstate(ctx)) { + task_context_opt(ctx)->lbr_stack_state = LBR_NONE; return; } + x86_pmu.lbr_restore(ctx); + + task_context_opt(ctx)->lbr_stack_state = LBR_NONE; +} + +void intel_pmu_lbr_save(void *ctx) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct x86_perf_task_context *task_ctx = ctx; + bool need_info = x86_pmu.lbr_has_info; + unsigned lbr_idx, mask; + u64 tos; + int i; + mask = x86_pmu.lbr_nr - 1; tos = intel_pmu_lbr_tos(); for (i = 0; i < x86_pmu.lbr_nr; i++) { lbr_idx = (tos - i) & mask; - from = rdlbr_from(lbr_idx); - if (!from) + if (!rdlbr_all(&task_ctx->lbr[i], lbr_idx, need_info)) break; - task_ctx->lbr_from[i] = from; - task_ctx->lbr_to[i] = rdlbr_to(lbr_idx); - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) - rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); } task_ctx->valid_lbrs = i; task_ctx->tos = tos; - task_ctx->lbr_stack_state = LBR_VALID; - cpuc->last_task_ctx = task_ctx; - cpuc->last_log_id = ++task_ctx->log_id; + if (cpuc->lbr_select) + rdmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel); +} + +static 
void intel_pmu_arch_lbr_save(void *ctx) +{ + struct x86_perf_task_context_arch_lbr *task_ctx = ctx; + struct lbr_entry *entries = task_ctx->entries; + int i; + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + if (!rdlbr_all(&entries[i], i, true)) + break; + } + + /* LBR call stack is not full. Reset is required in restore. */ + if (i < x86_pmu.lbr_nr) + entries[x86_pmu.lbr_nr - 1].from = 0; +} + +/* + * Save the Architecture LBR state to the xsave area in the perf + * context data for the task via the XSAVES instruction. + */ +static void intel_pmu_arch_lbr_xsaves(void *ctx) +{ + struct x86_perf_task_context_arch_lbr_xsave *task_ctx = ctx; + + xsaves(&task_ctx->xsave, XFEATURE_MASK_LBR); +} + +static void __intel_pmu_lbr_save(void *ctx) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (task_context_opt(ctx)->lbr_callstack_users == 0) { + task_context_opt(ctx)->lbr_stack_state = LBR_NONE; + return; + } + + x86_pmu.lbr_save(ctx); + + task_context_opt(ctx)->lbr_stack_state = LBR_VALID; + + cpuc->last_task_ctx = ctx; + cpuc->last_log_id = ++task_context_opt(ctx)->log_id; } void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, struct perf_event_context *next) { - struct x86_perf_task_context *prev_ctx_data, *next_ctx_data; + void *prev_ctx_data, *next_ctx_data; swap(prev->task_ctx_data, next->task_ctx_data); @@ -436,14 +534,14 @@ void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, if (!prev_ctx_data || !next_ctx_data) return; - swap(prev_ctx_data->lbr_callstack_users, - next_ctx_data->lbr_callstack_users); + swap(task_context_opt(prev_ctx_data)->lbr_callstack_users, + task_context_opt(next_ctx_data)->lbr_callstack_users); } void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct x86_perf_task_context *task_ctx; + void *task_ctx; if (!cpuc->lbr_users) return; @@ -480,17 +578,17 @@ static inline bool branch_user_callstack(unsigned br_sel) 
void intel_pmu_lbr_add(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct x86_perf_task_context *task_ctx; if (!x86_pmu.lbr_nr) return; + if (event->hw.flags & PERF_X86_EVENT_LBR_SELECT) + cpuc->lbr_select = 1; + cpuc->br_sel = event->hw.branch_reg.reg; - if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) { - task_ctx = event->ctx->task_ctx_data; - task_ctx->lbr_callstack_users++; - } + if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) + task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users++; /* * Request pmu::sched_task() callback, which will fire inside the @@ -518,19 +616,59 @@ void intel_pmu_lbr_add(struct perf_event *event) intel_pmu_lbr_reset(); } +void release_lbr_buffers(void) +{ + struct kmem_cache *kmem_cache; + struct cpu_hw_events *cpuc; + int cpu; + + if (!static_cpu_has(X86_FEATURE_ARCH_LBR)) + return; + + for_each_possible_cpu(cpu) { + cpuc = per_cpu_ptr(&cpu_hw_events, cpu); + kmem_cache = x86_get_pmu(cpu)->task_ctx_cache; + if (kmem_cache && cpuc->lbr_xsave) { + kmem_cache_free(kmem_cache, cpuc->lbr_xsave); + cpuc->lbr_xsave = NULL; + } + } +} + +void reserve_lbr_buffers(void) +{ + struct kmem_cache *kmem_cache; + struct cpu_hw_events *cpuc; + int cpu; + + if (!static_cpu_has(X86_FEATURE_ARCH_LBR)) + return; + + for_each_possible_cpu(cpu) { + cpuc = per_cpu_ptr(&cpu_hw_events, cpu); + kmem_cache = x86_get_pmu(cpu)->task_ctx_cache; + if (!kmem_cache || cpuc->lbr_xsave) + continue; + + cpuc->lbr_xsave = kmem_cache_alloc_node(kmem_cache, + GFP_KERNEL | __GFP_ZERO, + cpu_to_node(cpu)); + } +} + void intel_pmu_lbr_del(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - struct x86_perf_task_context *task_ctx; if (!x86_pmu.lbr_nr) return; if (branch_user_callstack(cpuc->br_sel) && - event->ctx->task_ctx_data) { - task_ctx = event->ctx->task_ctx_data; - task_ctx->lbr_callstack_users--; - } + event->ctx->task_ctx_data) + 
task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users--; + + if (event->hw.flags & PERF_X86_EVENT_LBR_SELECT) + cpuc->lbr_select = 0; if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0) cpuc->lbr_pebs_users--; @@ -540,11 +678,19 @@ void intel_pmu_lbr_del(struct perf_event *event) perf_sched_cb_dec(event->ctx->pmu); } +static inline bool vlbr_exclude_host(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + return test_bit(INTEL_PMC_IDX_FIXED_VLBR, + (unsigned long *)&cpuc->intel_ctrl_guest_mask); +} + void intel_pmu_lbr_enable_all(bool pmi) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - if (cpuc->lbr_users) + if (cpuc->lbr_users && !vlbr_exclude_host()) __intel_pmu_lbr_enable(pmi); } @@ -552,13 +698,18 @@ void intel_pmu_lbr_disable_all(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - if (cpuc->lbr_users) + if (cpuc->lbr_users && !vlbr_exclude_host()) { + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) + return __intel_pmu_arch_lbr_disable(); + __intel_pmu_lbr_disable(); + } } -static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) +void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) { unsigned long mask = x86_pmu.lbr_nr - 1; + struct perf_branch_entry *br = cpuc->lbr_entries; u64 tos = intel_pmu_lbr_tos(); int i; @@ -574,17 +725,14 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr); - cpuc->lbr_entries[i].from = msr_lastbranch.from; - cpuc->lbr_entries[i].to = msr_lastbranch.to; - cpuc->lbr_entries[i].mispred = 0; - cpuc->lbr_entries[i].predicted = 0; - cpuc->lbr_entries[i].in_tx = 0; - cpuc->lbr_entries[i].abort = 0; - cpuc->lbr_entries[i].cycles = 0; - cpuc->lbr_entries[i].type = 0; - cpuc->lbr_entries[i].reserved = 0; + perf_clear_branch_entry_bitfields(br); + + br->from = msr_lastbranch.from; + br->to = msr_lastbranch.to; + br++; } cpuc->lbr_stack.nr = i; + cpuc->lbr_stack.hw_idx = tos; } /* @@ -592,11 
+740,11 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) * is the same as the linear address, allowing us to merge the LIP and EIP * LBR formats. */ -static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) +void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) { bool need_info = false, call_stack = false; unsigned long mask = x86_pmu.lbr_nr - 1; - int lbr_format = x86_pmu.intel_cap.lbr_format; + struct perf_branch_entry *br = cpuc->lbr_entries; u64 tos = intel_pmu_lbr_tos(); int i; int out = 0; @@ -611,12 +759,10 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) for (i = 0; i < num; i++) { unsigned long lbr_idx = (tos - i) & mask; u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0; - int skip = 0; u16 cycles = 0; - int lbr_flags = lbr_desc[lbr_format]; - from = rdlbr_from(lbr_idx); - to = rdlbr_to(lbr_idx); + from = rdlbr_from(lbr_idx, NULL); + to = rdlbr_to(lbr_idx, NULL); /* * Read LBR call stack entries @@ -625,37 +771,39 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) if (call_stack && !from) break; - if (lbr_format == LBR_FORMAT_INFO && need_info) { - u64 info; - - rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info); - mis = !!(info & LBR_INFO_MISPRED); - pred = !mis; - in_tx = !!(info & LBR_INFO_IN_TX); - abort = !!(info & LBR_INFO_ABORT); - cycles = (info & LBR_INFO_CYCLES); - } - - if (lbr_format == LBR_FORMAT_TIME) { - mis = !!(from & LBR_FROM_FLAG_MISPRED); - pred = !mis; - skip = 1; - cycles = ((to >> 48) & LBR_INFO_CYCLES); - - to = (u64)((((s64)to) << 16) >> 16); - } - - if (lbr_flags & LBR_EIP_FLAGS) { - mis = !!(from & LBR_FROM_FLAG_MISPRED); - pred = !mis; - skip = 1; - } - if (lbr_flags & LBR_TSX) { - in_tx = !!(from & LBR_FROM_FLAG_IN_TX); - abort = !!(from & LBR_FROM_FLAG_ABORT); - skip = 3; + if (x86_pmu.lbr_has_info) { + if (need_info) { + u64 info; + + info = rdlbr_info(lbr_idx, NULL); + mis = !!(info & LBR_INFO_MISPRED); + pred = !mis; + cycles = (info & LBR_INFO_CYCLES); + if (x86_pmu.lbr_has_tsx) 
{ + in_tx = !!(info & LBR_INFO_IN_TX); + abort = !!(info & LBR_INFO_ABORT); + } + } + } else { + int skip = 0; + + if (x86_pmu.lbr_from_flags) { + mis = !!(from & LBR_FROM_FLAG_MISPRED); + pred = !mis; + skip = 1; + } + if (x86_pmu.lbr_has_tsx) { + in_tx = !!(from & LBR_FROM_FLAG_IN_TX); + abort = !!(from & LBR_FROM_FLAG_ABORT); + skip = 3; + } + from = (u64)((((s64)from) << skip) >> skip); + + if (x86_pmu.lbr_to_cycles) { + cycles = ((to >> 48) & LBR_INFO_CYCLES); + to = (u64)((((s64)to) << 16) >> 16); + } } - from = (u64)((((s64)from) << skip) >> skip); /* * Some CPUs report duplicated abort records, @@ -668,18 +816,109 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) if (abort && x86_pmu.lbr_double_abort && out > 0) out--; - cpuc->lbr_entries[out].from = from; - cpuc->lbr_entries[out].to = to; - cpuc->lbr_entries[out].mispred = mis; - cpuc->lbr_entries[out].predicted = pred; - cpuc->lbr_entries[out].in_tx = in_tx; - cpuc->lbr_entries[out].abort = abort; - cpuc->lbr_entries[out].cycles = cycles; - cpuc->lbr_entries[out].type = 0; - cpuc->lbr_entries[out].reserved = 0; + perf_clear_branch_entry_bitfields(br+out); + br[out].from = from; + br[out].to = to; + br[out].mispred = mis; + br[out].predicted = pred; + br[out].in_tx = in_tx; + br[out].abort = abort; + br[out].cycles = cycles; out++; } cpuc->lbr_stack.nr = out; + cpuc->lbr_stack.hw_idx = tos; +} + +static DEFINE_STATIC_KEY_FALSE(x86_lbr_mispred); +static DEFINE_STATIC_KEY_FALSE(x86_lbr_cycles); +static DEFINE_STATIC_KEY_FALSE(x86_lbr_type); + +static __always_inline int get_lbr_br_type(u64 info) +{ + int type = 0; + + if (static_branch_likely(&x86_lbr_type)) + type = (info & LBR_INFO_BR_TYPE) >> LBR_INFO_BR_TYPE_OFFSET; + + return type; +} + +static __always_inline bool get_lbr_mispred(u64 info) +{ + bool mispred = 0; + + if (static_branch_likely(&x86_lbr_mispred)) + mispred = !!(info & LBR_INFO_MISPRED); + + return mispred; +} + +static __always_inline u16 get_lbr_cycles(u64 info) +{ + u16 
cycles = info & LBR_INFO_CYCLES; + + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && + (!static_branch_likely(&x86_lbr_cycles) || + !(info & LBR_INFO_CYC_CNT_VALID))) + cycles = 0; + + return cycles; +} + +static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc, + struct lbr_entry *entries) +{ + struct perf_branch_entry *e; + struct lbr_entry *lbr; + u64 from, to, info; + int i; + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + lbr = entries ? &entries[i] : NULL; + e = &cpuc->lbr_entries[i]; + + from = rdlbr_from(i, lbr); + /* + * Read LBR entries until invalid entry (0s) is detected. + */ + if (!from) + break; + + to = rdlbr_to(i, lbr); + info = rdlbr_info(i, lbr); + + perf_clear_branch_entry_bitfields(e); + + e->from = from; + e->to = to; + e->mispred = get_lbr_mispred(info); + e->predicted = !e->mispred; + e->in_tx = !!(info & LBR_INFO_IN_TX); + e->abort = !!(info & LBR_INFO_ABORT); + e->cycles = get_lbr_cycles(info); + e->type = get_lbr_br_type(info); + } + + cpuc->lbr_stack.nr = i; +} + +static void intel_pmu_arch_lbr_read(struct cpu_hw_events *cpuc) +{ + intel_pmu_store_lbr(cpuc, NULL); +} + +static void intel_pmu_arch_lbr_read_xsave(struct cpu_hw_events *cpuc) +{ + struct x86_perf_task_context_arch_lbr_xsave *xsave = cpuc->lbr_xsave; + + if (!xsave) { + intel_pmu_store_lbr(cpuc, NULL); + return; + } + xsaves(&xsave->xsave, XFEATURE_MASK_LBR); + + intel_pmu_store_lbr(cpuc, xsave->lbr.entries); } void intel_pmu_lbr_read(void) @@ -692,13 +931,11 @@ void intel_pmu_lbr_read(void) * This could be smarter and actually check the event, * but this simple approach seems to work for now. 
*/ - if (!cpuc->lbr_users || cpuc->lbr_users == cpuc->lbr_pebs_users) + if (!cpuc->lbr_users || vlbr_exclude_host() || + cpuc->lbr_users == cpuc->lbr_pebs_users) return; - if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) - intel_pmu_lbr_read_32(cpuc); - else - intel_pmu_lbr_read_64(cpuc); + x86_pmu.lbr_read(cpuc); intel_pmu_lbr_filter(cpuc); } @@ -798,6 +1035,19 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) reg = &event->hw.branch_reg; reg->idx = EXTRA_REG_LBR; + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) { + reg->config = mask; + + /* + * The Arch LBR HW can retrieve the common branch types + * from the LBR_INFO. It doesn't require the high overhead + * SW disassemble. + * Enable the branch type by default for the Arch LBR. + */ + reg->reg |= X86_BR_TYPE_SAVE; + return 0; + } + /* * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate * in suppress mode. So LBR_SELECT should be set to @@ -809,7 +1059,7 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) if ((br_type & PERF_SAMPLE_BRANCH_NO_CYCLES) && (br_type & PERF_SAMPLE_BRANCH_NO_FLAGS) && - (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)) + x86_pmu.lbr_has_info) reg->config |= LBR_NO_INFO; return 0; @@ -841,218 +1091,26 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event) return ret; } -/* - * return the type of control flow change at address "from" - * instruction is not necessarily a branch (in case of interrupt). - * - * The branch type returned also includes the priv level of the - * target of the control flow change (X86_BR_USER, X86_BR_KERNEL). - * - * If a branch type is unknown OR the instruction cannot be - * decoded (e.g., text page not present), then X86_BR_NONE is - * returned. - */ -static int branch_type(unsigned long from, unsigned long to, int abort) -{ - struct insn insn; - void *addr; - int bytes_read, bytes_left; - int ret = X86_BR_NONE; - int ext, to_plm, from_plm; - u8 buf[MAX_INSN_SIZE]; - int is64 = 0; - - to_plm = kernel_ip(to) ? 
X86_BR_KERNEL : X86_BR_USER; - from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER; - - /* - * maybe zero if lbr did not fill up after a reset by the time - * we get a PMU interrupt - */ - if (from == 0 || to == 0) - return X86_BR_NONE; - - if (abort) - return X86_BR_ABORT | to_plm; - - if (from_plm == X86_BR_USER) { - /* - * can happen if measuring at the user level only - * and we interrupt in a kernel thread, e.g., idle. - */ - if (!current->mm) - return X86_BR_NONE; - - /* may fail if text not present */ - bytes_left = copy_from_user_nmi(buf, (void __user *)from, - MAX_INSN_SIZE); - bytes_read = MAX_INSN_SIZE - bytes_left; - if (!bytes_read) - return X86_BR_NONE; - - addr = buf; - } else { - /* - * The LBR logs any address in the IP, even if the IP just - * faulted. This means userspace can control the from address. - * Ensure we don't blindy read any address by validating it is - * a known text address. - */ - if (kernel_text_address(from)) { - addr = (void *)from; - /* - * Assume we can get the maximum possible size - * when grabbing kernel data. This is not - * _strictly_ true since we could possibly be - * executing up next to a memory hole, but - * it is very unlikely to be a problem. - */ - bytes_read = MAX_INSN_SIZE; - } else { - return X86_BR_NONE; - } - } - - /* - * decoder needs to know the ABI especially - * on 64-bit systems running 32-bit apps - */ -#ifdef CONFIG_X86_64 - is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32); -#endif - insn_init(&insn, addr, bytes_read, is64); - insn_get_opcode(&insn); - if (!insn.opcode.got) - return X86_BR_ABORT; - - switch (insn.opcode.bytes[0]) { - case 0xf: - switch (insn.opcode.bytes[1]) { - case 0x05: /* syscall */ - case 0x34: /* sysenter */ - ret = X86_BR_SYSCALL; - break; - case 0x07: /* sysret */ - case 0x35: /* sysexit */ - ret = X86_BR_SYSRET; - break; - case 0x80 ... 0x8f: /* conditional */ - ret = X86_BR_JCC; - break; - default: - ret = X86_BR_NONE; - } - break; - case 0x70 ... 
0x7f: /* conditional */ - ret = X86_BR_JCC; - break; - case 0xc2: /* near ret */ - case 0xc3: /* near ret */ - case 0xca: /* far ret */ - case 0xcb: /* far ret */ - ret = X86_BR_RET; - break; - case 0xcf: /* iret */ - ret = X86_BR_IRET; - break; - case 0xcc ... 0xce: /* int */ - ret = X86_BR_INT; - break; - case 0xe8: /* call near rel */ - insn_get_immediate(&insn); - if (insn.immediate1.value == 0) { - /* zero length call */ - ret = X86_BR_ZERO_CALL; - break; - } - /* fall through */ - case 0x9a: /* call far absolute */ - ret = X86_BR_CALL; - break; - case 0xe0 ... 0xe3: /* loop jmp */ - ret = X86_BR_JCC; - break; - case 0xe9 ... 0xeb: /* jmp */ - ret = X86_BR_JMP; - break; - case 0xff: /* call near absolute, call far absolute ind */ - insn_get_modrm(&insn); - ext = (insn.modrm.bytes[0] >> 3) & 0x7; - switch (ext) { - case 2: /* near ind call */ - case 3: /* far ind call */ - ret = X86_BR_IND_CALL; - break; - case 4: - case 5: - ret = X86_BR_IND_JMP; - break; - } - break; - default: - ret = X86_BR_NONE; - } - /* - * interrupts, traps, faults (and thus ring transition) may - * occur on any instructions. Thus, to classify them correctly, - * we need to first look at the from and to priv levels. If they - * are different and to is in the kernel, then it indicates - * a ring transition. If the from instruction is not a ring - * transition instr (syscall, systenter, int), then it means - * it was a irq, trap or fault. - * - * we have no way of detecting kernel to kernel faults. 
- */ - if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL - && ret != X86_BR_SYSCALL && ret != X86_BR_INT) - ret = X86_BR_IRQ; - - /* - * branch priv level determined by target as - * is done by HW when LBR_SELECT is implemented - */ - if (ret != X86_BR_NONE) - ret |= to_plm; - - return ret; -} - -#define X86_BR_TYPE_MAP_MAX 16 - -static int branch_map[X86_BR_TYPE_MAP_MAX] = { - PERF_BR_CALL, /* X86_BR_CALL */ - PERF_BR_RET, /* X86_BR_RET */ - PERF_BR_SYSCALL, /* X86_BR_SYSCALL */ - PERF_BR_SYSRET, /* X86_BR_SYSRET */ - PERF_BR_UNKNOWN, /* X86_BR_INT */ - PERF_BR_UNKNOWN, /* X86_BR_IRET */ - PERF_BR_COND, /* X86_BR_JCC */ - PERF_BR_UNCOND, /* X86_BR_JMP */ - PERF_BR_UNKNOWN, /* X86_BR_IRQ */ - PERF_BR_IND_CALL, /* X86_BR_IND_CALL */ - PERF_BR_UNKNOWN, /* X86_BR_ABORT */ - PERF_BR_UNKNOWN, /* X86_BR_IN_TX */ - PERF_BR_UNKNOWN, /* X86_BR_NO_TX */ - PERF_BR_CALL, /* X86_BR_ZERO_CALL */ - PERF_BR_UNKNOWN, /* X86_BR_CALL_STACK */ - PERF_BR_IND, /* X86_BR_IND_JMP */ +enum { + ARCH_LBR_BR_TYPE_JCC = 0, + ARCH_LBR_BR_TYPE_NEAR_IND_JMP = 1, + ARCH_LBR_BR_TYPE_NEAR_REL_JMP = 2, + ARCH_LBR_BR_TYPE_NEAR_IND_CALL = 3, + ARCH_LBR_BR_TYPE_NEAR_REL_CALL = 4, + ARCH_LBR_BR_TYPE_NEAR_RET = 5, + ARCH_LBR_BR_TYPE_KNOWN_MAX = ARCH_LBR_BR_TYPE_NEAR_RET, + + ARCH_LBR_BR_TYPE_MAP_MAX = 16, }; -static int -common_branch_type(int type) -{ - int i; - - type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */ - - if (type) { - i = __ffs(type); - if (i < X86_BR_TYPE_MAP_MAX) - return branch_map[i]; - } - - return PERF_BR_UNKNOWN; -} +static const int arch_lbr_br_type_map[ARCH_LBR_BR_TYPE_MAP_MAX] = { + [ARCH_LBR_BR_TYPE_JCC] = X86_BR_JCC, + [ARCH_LBR_BR_TYPE_NEAR_IND_JMP] = X86_BR_IND_JMP, + [ARCH_LBR_BR_TYPE_NEAR_REL_JMP] = X86_BR_JMP, + [ARCH_LBR_BR_TYPE_NEAR_IND_CALL] = X86_BR_IND_CALL, + [ARCH_LBR_BR_TYPE_NEAR_REL_CALL] = X86_BR_CALL, + [ARCH_LBR_BR_TYPE_NEAR_RET] = X86_BR_RET, +}; /* * implement actual branch filter based on user demand. 
@@ -1066,7 +1124,7 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) { u64 from, to; int br_sel = cpuc->br_sel; - int i, j, type; + int i, j, type, to_plm; bool compress = false; /* if sampling all branches, then nothing to filter */ @@ -1078,8 +1136,19 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) from = cpuc->lbr_entries[i].from; to = cpuc->lbr_entries[i].to; + type = cpuc->lbr_entries[i].type; - type = branch_type(from, to, cpuc->lbr_entries[i].abort); + /* + * Parse the branch type recorded in LBR_x_INFO MSR. + * Doesn't support OTHER_BRANCH decoding for now. + * OTHER_BRANCH branch type still rely on software decoding. + */ + if (static_cpu_has(X86_FEATURE_ARCH_LBR) && + type <= ARCH_LBR_BR_TYPE_KNOWN_MAX) { + to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER; + type = arch_lbr_br_type_map[type] | to_plm; + } else + type = branch_type(from, to, cpuc->lbr_entries[i].abort); if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) { if (cpuc->lbr_entries[i].in_tx) type |= X86_BR_IN_TX; @@ -1114,25 +1183,18 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) } } -void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr) +void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int i; - cpuc->lbr_stack.nr = x86_pmu.lbr_nr; - for (i = 0; i < x86_pmu.lbr_nr; i++) { - u64 info = lbr->lbr[i].info; - struct perf_branch_entry *e = &cpuc->lbr_entries[i]; + /* Cannot get TOS for large PEBS and Arch LBR */ + if (static_cpu_has(X86_FEATURE_ARCH_LBR) || + (cpuc->n_pebs == cpuc->n_large_pebs)) + cpuc->lbr_stack.hw_idx = -1ULL; + else + cpuc->lbr_stack.hw_idx = intel_pmu_lbr_tos(); - e->from = lbr->lbr[i].from; - e->to = lbr->lbr[i].to; - e->mispred = !!(info & LBR_INFO_MISPRED); - e->predicted = !(info & LBR_INFO_MISPRED); - e->in_tx = !!(info & LBR_INFO_IN_TX); - e->abort = !!(info & LBR_INFO_ABORT); - e->cycles = info & LBR_INFO_CYCLES; - e->reserved = 0; - } + intel_pmu_store_lbr(cpuc, lbr); 
intel_pmu_lbr_filter(cpuc); } @@ -1189,6 +1251,26 @@ static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL, }; +static int arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = ARCH_LBR_ANY, + [PERF_SAMPLE_BRANCH_USER_SHIFT] = ARCH_LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = ARCH_LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN, + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = ARCH_LBR_RETURN | + ARCH_LBR_OTHER_BRANCH, + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = ARCH_LBR_REL_CALL | + ARCH_LBR_IND_CALL | + ARCH_LBR_OTHER_BRANCH, + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = ARCH_LBR_IND_CALL, + [PERF_SAMPLE_BRANCH_COND_SHIFT] = ARCH_LBR_JCC, + [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = ARCH_LBR_REL_CALL | + ARCH_LBR_IND_CALL | + ARCH_LBR_RETURN | + ARCH_LBR_CALL_STACK, + [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = ARCH_LBR_IND_JMP, + [PERF_SAMPLE_BRANCH_CALL_SHIFT] = ARCH_LBR_REL_CALL, +}; + /* core */ void __init intel_pmu_lbr_init_core(void) { @@ -1242,9 +1324,17 @@ void __init intel_pmu_lbr_init_snb(void) */ } +static inline struct kmem_cache * +create_lbr_kmem_cache(size_t size, size_t align) +{ + return kmem_cache_create("x86_lbr", size, align, 0, NULL); +} + /* haswell */ void intel_pmu_lbr_init_hsw(void) { + size_t size = sizeof(struct x86_perf_task_context); + x86_pmu.lbr_nr = 16; x86_pmu.lbr_tos = MSR_LBR_TOS; x86_pmu.lbr_from = MSR_LBR_NHM_FROM; @@ -1253,21 +1343,25 @@ void intel_pmu_lbr_init_hsw(void) x86_pmu.lbr_sel_mask = LBR_SEL_MASK; x86_pmu.lbr_sel_map = hsw_lbr_sel_map; - if (lbr_from_signext_quirk_needed()) - static_branch_enable(&lbr_from_quirk_key); + x86_get_pmu(smp_processor_id())->task_ctx_cache = create_lbr_kmem_cache(size, 0); } /* skylake */ __init void intel_pmu_lbr_init_skl(void) { + size_t size = sizeof(struct x86_perf_task_context); + x86_pmu.lbr_nr = 32; x86_pmu.lbr_tos = MSR_LBR_TOS; x86_pmu.lbr_from = MSR_LBR_NHM_FROM; x86_pmu.lbr_to = MSR_LBR_NHM_TO; + 
x86_pmu.lbr_info = MSR_LBR_INFO_0; x86_pmu.lbr_sel_mask = LBR_SEL_MASK; x86_pmu.lbr_sel_map = hsw_lbr_sel_map; + x86_get_pmu(smp_processor_id())->task_ctx_cache = create_lbr_kmem_cache(size, 0); + /* * SW branch filter usage: * - support syscall, sysret capture. @@ -1334,3 +1428,197 @@ void intel_pmu_lbr_init_knl(void) if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_LIP) x86_pmu.intel_cap.lbr_format = LBR_FORMAT_EIP_FLAGS; } + +void intel_pmu_lbr_init(void) +{ + switch (x86_pmu.intel_cap.lbr_format) { + case LBR_FORMAT_EIP_FLAGS2: + x86_pmu.lbr_has_tsx = 1; + x86_pmu.lbr_from_flags = 1; + if (lbr_from_signext_quirk_needed()) + static_branch_enable(&lbr_from_quirk_key); + break; + + case LBR_FORMAT_EIP_FLAGS: + x86_pmu.lbr_from_flags = 1; + break; + + case LBR_FORMAT_INFO: + x86_pmu.lbr_has_tsx = 1; + fallthrough; + case LBR_FORMAT_INFO2: + x86_pmu.lbr_has_info = 1; + break; + + case LBR_FORMAT_TIME: + x86_pmu.lbr_from_flags = 1; + x86_pmu.lbr_to_cycles = 1; + break; + } + + if (x86_pmu.lbr_has_info) { + /* + * Only used in combination with baseline pebs. + */ + static_branch_enable(&x86_lbr_mispred); + static_branch_enable(&x86_lbr_cycles); + } +} + +/* + * LBR state size is variable based on the max number of registers. + * This calculates the expected state size, which should match + * what the hardware enumerates for the size of XFEATURE_LBR. + */ +static inline unsigned int get_lbr_state_size(void) +{ + return sizeof(struct arch_lbr_state) + + x86_pmu.lbr_nr * sizeof(struct lbr_entry); +} + +static bool is_arch_lbr_xsave_available(void) +{ + if (!boot_cpu_has(X86_FEATURE_XSAVES)) + return false; + + /* + * Check the LBR state with the corresponding software structure. + * Disable LBR XSAVES support if the size doesn't match. 
+ */ + if (xfeature_size(XFEATURE_LBR) == 0) + return false; + + if (WARN_ON(xfeature_size(XFEATURE_LBR) != get_lbr_state_size())) + return false; + + return true; +} + +void __init intel_pmu_arch_lbr_init(void) +{ + struct pmu *pmu = x86_get_pmu(smp_processor_id()); + union cpuid28_eax eax; + union cpuid28_ebx ebx; + union cpuid28_ecx ecx; + unsigned int unused_edx; + bool arch_lbr_xsave; + size_t size; + u64 lbr_nr; + + /* Arch LBR Capabilities */ + cpuid(28, &eax.full, &ebx.full, &ecx.full, &unused_edx); + + lbr_nr = fls(eax.split.lbr_depth_mask) * 8; + if (!lbr_nr) + goto clear_arch_lbr; + + /* Apply the max depth of Arch LBR */ + if (wrmsrl_safe(MSR_ARCH_LBR_DEPTH, lbr_nr)) + goto clear_arch_lbr; + + x86_pmu.lbr_depth_mask = eax.split.lbr_depth_mask; + x86_pmu.lbr_deep_c_reset = eax.split.lbr_deep_c_reset; + x86_pmu.lbr_lip = eax.split.lbr_lip; + x86_pmu.lbr_cpl = ebx.split.lbr_cpl; + x86_pmu.lbr_filter = ebx.split.lbr_filter; + x86_pmu.lbr_call_stack = ebx.split.lbr_call_stack; + x86_pmu.lbr_mispred = ecx.split.lbr_mispred; + x86_pmu.lbr_timed_lbr = ecx.split.lbr_timed_lbr; + x86_pmu.lbr_br_type = ecx.split.lbr_br_type; + x86_pmu.lbr_nr = lbr_nr; + + if (x86_pmu.lbr_mispred) + static_branch_enable(&x86_lbr_mispred); + if (x86_pmu.lbr_timed_lbr) + static_branch_enable(&x86_lbr_cycles); + if (x86_pmu.lbr_br_type) + static_branch_enable(&x86_lbr_type); + + arch_lbr_xsave = is_arch_lbr_xsave_available(); + if (arch_lbr_xsave) { + size = sizeof(struct x86_perf_task_context_arch_lbr_xsave) + + get_lbr_state_size(); + pmu->task_ctx_cache = create_lbr_kmem_cache(size, + XSAVE_ALIGNMENT); + } + + if (!pmu->task_ctx_cache) { + arch_lbr_xsave = false; + + size = sizeof(struct x86_perf_task_context_arch_lbr) + + lbr_nr * sizeof(struct lbr_entry); + pmu->task_ctx_cache = create_lbr_kmem_cache(size, 0); + } + + x86_pmu.lbr_from = MSR_ARCH_LBR_FROM_0; + x86_pmu.lbr_to = MSR_ARCH_LBR_TO_0; + x86_pmu.lbr_info = MSR_ARCH_LBR_INFO_0; + + /* LBR callstack requires both CPL and 
Branch Filtering support */ + if (!x86_pmu.lbr_cpl || + !x86_pmu.lbr_filter || + !x86_pmu.lbr_call_stack) + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_NOT_SUPP; + + if (!x86_pmu.lbr_cpl) { + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_NOT_SUPP; + } else if (!x86_pmu.lbr_filter) { + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_NOT_SUPP; + arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_NOT_SUPP; + } + + x86_pmu.lbr_ctl_mask = ARCH_LBR_CTL_MASK; + x86_pmu.lbr_ctl_map = arch_lbr_ctl_map; + + if (!x86_pmu.lbr_cpl && !x86_pmu.lbr_filter) + x86_pmu.lbr_ctl_map = NULL; + + x86_pmu.lbr_reset = intel_pmu_arch_lbr_reset; + if (arch_lbr_xsave) { + x86_pmu.lbr_save = intel_pmu_arch_lbr_xsaves; + x86_pmu.lbr_restore = intel_pmu_arch_lbr_xrstors; + x86_pmu.lbr_read = intel_pmu_arch_lbr_read_xsave; + pr_cont("XSAVE "); + } else { + x86_pmu.lbr_save = intel_pmu_arch_lbr_save; + x86_pmu.lbr_restore = intel_pmu_arch_lbr_restore; + x86_pmu.lbr_read = intel_pmu_arch_lbr_read; + } + + pr_cont("Architectural LBR, "); + + return; + +clear_arch_lbr: + setup_clear_cpu_cap(X86_FEATURE_ARCH_LBR); +} + +/** + * x86_perf_get_lbr - get the LBR records information + * + * @lbr: the caller's memory to store the LBR records information + * + * Returns: 0 indicates the LBR info has been successfully obtained + */ +int x86_perf_get_lbr(struct x86_pmu_lbr *lbr) +{ + int lbr_fmt = x86_pmu.intel_cap.lbr_format; + + lbr->nr = x86_pmu.lbr_nr; + lbr->from = x86_pmu.lbr_from; + lbr->to = x86_pmu.lbr_to; + lbr->info = (lbr_fmt == LBR_FORMAT_INFO) ? 
x86_pmu.lbr_info : 0; + + return 0; +} +EXPORT_SYMBOL_GPL(x86_perf_get_lbr); + +struct event_constraint vlbr_constraint = + __EVENT_CONSTRAINT(INTEL_FIXED_VLBR_EVENT, (1ULL << INTEL_PMC_IDX_FIXED_VLBR), + FIXED_EVENT_FLAGS, 1, 0, PERF_X86_EVENT_LBR_SELECT); diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c index a4cc66005ce8..03bbcc2fa2ff 100644 --- a/arch/x86/events/intel/p4.c +++ b/arch/x86/events/intel/p4.c @@ -24,7 +24,7 @@ struct p4_event_bind { unsigned int escr_msr[2]; /* ESCR MSR for this event */ unsigned int escr_emask; /* valid ESCR EventMask bits */ unsigned int shared; /* event is shared across threads */ - char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */ + char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on absence */ }; struct p4_pebs_bind { @@ -45,7 +45,7 @@ struct p4_pebs_bind { * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of * event configuration to find out which values are to be * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT - * resgisters + * registers */ static struct p4_pebs_bind p4_pebs_bind_map[] = { P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired, 0x0000001, 0x0000001), @@ -947,7 +947,7 @@ static void p4_pmu_enable_pebs(u64 config) (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert); } -static void p4_pmu_enable_event(struct perf_event *event) +static void __p4_pmu_enable_event(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; int thread = p4_ht_config_thread(hwc->config); @@ -983,6 +983,16 @@ static void p4_pmu_enable_event(struct perf_event *event) (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE); } +static DEFINE_PER_CPU(unsigned long [BITS_TO_LONGS(X86_PMC_IDX_MAX)], p4_running); + +static void p4_pmu_enable_event(struct perf_event *event) +{ + int idx = event->hw.idx; + + __set_bit(idx, per_cpu(p4_running, smp_processor_id())); + __p4_pmu_enable_event(event); +} + static void p4_pmu_enable_all(int added) { struct 
cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -992,10 +1002,33 @@ static void p4_pmu_enable_all(int added) struct perf_event *event = cpuc->events[idx]; if (!test_bit(idx, cpuc->active_mask)) continue; - p4_pmu_enable_event(event); + __p4_pmu_enable_event(event); } } +static int p4_pmu_set_period(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + s64 left = this_cpu_read(pmc_prev_left[hwc->idx]); + int ret; + + ret = x86_perf_event_set_period(event); + + if (hwc->event_base) { + /* + * This handles erratum N15 in intel doc 249199-029, + * the counter may not be updated correctly on write + * so we need a second write operation to do the trick + * (the official workaround didn't work) + * + * the former idea is taken from OProfile code + */ + wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); + } + + return ret; +} + static int p4_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; @@ -1012,7 +1045,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) if (!test_bit(idx, cpuc->active_mask)) { /* catch in-flight IRQs */ - if (__test_and_clear_bit(idx, cpuc->running)) + if (__test_and_clear_bit(idx, per_cpu(p4_running, smp_processor_id()))) handled++; continue; } @@ -1034,7 +1067,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) /* event overflow for sure */ perf_sample_data_init(&data, 0, hwc->last_period); - if (!x86_perf_event_set_period(event)) + if (!static_call(x86_pmu_set_period)(event)) continue; @@ -1306,6 +1339,9 @@ static __initconst const struct x86_pmu p4_pmu = { .enable_all = p4_pmu_enable_all, .enable = p4_pmu_enable_event, .disable = p4_pmu_disable_event, + + .set_period = p4_pmu_set_period, + .eventsel = MSR_P4_BPU_CCCR0, .perfctr = MSR_P4_BPU_PERFCTR0, .event_map = p4_pmu_event_map, @@ -1313,7 +1349,7 @@ static __initconst const struct x86_pmu p4_pmu = { .get_event_constraints = x86_get_event_constraints, /* * IF HT disabled we may need to use all - * ARCH_P4_MAX_CCCR counters 
simulaneously + * ARCH_P4_MAX_CCCR counters simultaneously * though leave it restricted at moment assuming * HT is on */ @@ -1324,15 +1360,6 @@ static __initconst const struct x86_pmu p4_pmu = { .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1, .hw_config = p4_hw_config, .schedule_events = p4_pmu_schedule_events, - /* - * This handles erratum N15 in intel doc 249199-029, - * the counter may not be updated correctly on write - * so we need a second write operation to do the trick - * (the official workaround didn't work) - * - * the former idea is taken from OProfile code - */ - .perfctr_second_write = 1, .format_attrs = intel_p4_formats_attr, }; diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 1db7a51d9792..82ef87e9a897 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -13,6 +13,8 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> +#include <linux/bits.h> +#include <linux/limits.h> #include <linux/slab.h> #include <linux/device.h> @@ -57,12 +59,14 @@ static struct pt_cap_desc { PT_CAP(mtc, 0, CPUID_EBX, BIT(3)), PT_CAP(ptwrite, 0, CPUID_EBX, BIT(4)), PT_CAP(power_event_trace, 0, CPUID_EBX, BIT(5)), + PT_CAP(event_trace, 0, CPUID_EBX, BIT(7)), + PT_CAP(tnt_disable, 0, CPUID_EBX, BIT(8)), PT_CAP(topa_output, 0, CPUID_ECX, BIT(0)), PT_CAP(topa_multiple_entries, 0, CPUID_ECX, BIT(1)), PT_CAP(single_range_output, 0, CPUID_ECX, BIT(2)), PT_CAP(output_subsys, 0, CPUID_ECX, BIT(3)), PT_CAP(payloads_lip, 0, CPUID_ECX, BIT(31)), - PT_CAP(num_address_ranges, 1, CPUID_EAX, 0x3), + PT_CAP(num_address_ranges, 1, CPUID_EAX, 0x7), PT_CAP(mtc_periods, 1, CPUID_EAX, 0xffff0000), PT_CAP(cycle_thresholds, 1, CPUID_EBX, 0xffff), PT_CAP(psb_periods, 1, CPUID_EBX, 0xffff0000), @@ -108,6 +112,8 @@ PMU_FORMAT_ATTR(tsc, "config:10" ); PMU_FORMAT_ATTR(noretcomp, "config:11" ); PMU_FORMAT_ATTR(ptw, "config:12" ); PMU_FORMAT_ATTR(branch, "config:13" ); +PMU_FORMAT_ATTR(event, "config:31" ); +PMU_FORMAT_ATTR(notnt, 
"config:55" ); PMU_FORMAT_ATTR(mtc_period, "config:14-17" ); PMU_FORMAT_ATTR(cyc_thresh, "config:19-22" ); PMU_FORMAT_ATTR(psb_period, "config:24-27" ); @@ -116,6 +122,8 @@ static struct attribute *pt_formats_attr[] = { &format_attr_pt.attr, &format_attr_cyc.attr, &format_attr_pwr_evt.attr, + &format_attr_event.attr, + &format_attr_notnt.attr, &format_attr_fup_on_ptw.attr, &format_attr_mtc.attr, &format_attr_tsc.attr, @@ -226,8 +234,6 @@ static int __init pt_pmu_hw_init(void) pt_pmu.vmx = true; } - attrs = NULL; - for (i = 0; i < PT_CPUID_LEAVES; i++) { cpuid_count(20, i, &pt_pmu.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM], @@ -298,6 +304,8 @@ fail: RTIT_CTL_CYC_PSB | \ RTIT_CTL_MTC | \ RTIT_CTL_PWR_EVT_EN | \ + RTIT_CTL_EVENT_EN | \ + RTIT_CTL_NOTNT | \ RTIT_CTL_FUP_ON_PTW | \ RTIT_CTL_PTW_EN) @@ -352,6 +360,14 @@ static bool pt_event_valid(struct perf_event *event) !intel_pt_validate_hw_cap(PT_CAP_power_event_trace)) return false; + if (config & RTIT_CTL_EVENT_EN && + !intel_pt_validate_hw_cap(PT_CAP_event_trace)) + return false; + + if (config & RTIT_CTL_NOTNT && + !intel_pt_validate_hw_cap(PT_CAP_tnt_disable)) + return false; + if (config & RTIT_CTL_PTW) { if (!intel_pt_validate_hw_cap(PT_CAP_ptwrite)) return false; @@ -364,7 +380,7 @@ static bool pt_event_valid(struct perf_event *event) /* * Setting bit 0 (TraceEn in RTIT_CTL MSR) in the attr.config - * clears the assomption that BranchEn must always be enabled, + * clears the assumption that BranchEn must always be enabled, * as was the case with the first implementation of PT. * If this bit is not set, the legacy behavior is preserved * for compatibility with the older userspace. 
@@ -474,7 +490,7 @@ static u64 pt_config_filters(struct perf_event *event) pt->filters.filter[range].msr_b = filter->msr_b; } - rtit_ctl |= filter->config << pt_address_ranges[range].reg_off; + rtit_ctl |= (u64)filter->config << pt_address_ranges[range].reg_off; } return rtit_ctl; @@ -899,8 +915,9 @@ static void pt_handle_status(struct pt *pt) * means we are already losing data; need to let the decoder * know. */ - if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) || - buf->output_off == pt_buffer_region_size(buf)) { + if (!buf->single && + (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) || + buf->output_off == pt_buffer_region_size(buf))) { perf_aux_output_flag(&pt->handle, PERF_AUX_FLAG_TRUNCATED); advance++; @@ -1349,11 +1366,27 @@ static void pt_addr_filters_fini(struct perf_event *event) event->hw.addr_filters = NULL; } -static inline bool valid_kernel_ip(unsigned long ip) +#ifdef CONFIG_X86_64 +/* Clamp to a canonical address greater-than-or-equal-to the address given */ +static u64 clamp_to_ge_canonical_addr(u64 vaddr, u8 vaddr_bits) { - return virt_addr_valid(ip) && kernel_ip(ip); + return __is_canonical_address(vaddr, vaddr_bits) ? + vaddr : + -BIT_ULL(vaddr_bits - 1); } +/* Clamp to a canonical address less-than-or-equal-to the address given */ +static u64 clamp_to_le_canonical_addr(u64 vaddr, u8 vaddr_bits) +{ + return __is_canonical_address(vaddr, vaddr_bits) ? 
+ vaddr : + BIT_ULL(vaddr_bits - 1) - 1; +} +#else +#define clamp_to_ge_canonical_addr(x, y) (x) +#define clamp_to_le_canonical_addr(x, y) (x) +#endif + static int pt_event_addr_filters_validate(struct list_head *filters) { struct perf_addr_filter *filter; @@ -1368,14 +1401,6 @@ static int pt_event_addr_filters_validate(struct list_head *filters) filter->action == PERF_ADDR_FILTER_ACTION_START) return -EOPNOTSUPP; - if (!filter->path.dentry) { - if (!valid_kernel_ip(filter->offset)) - return -EINVAL; - - if (!valid_kernel_ip(filter->offset + filter->size)) - return -EINVAL; - } - if (++range > intel_pt_validate_hw_cap(PT_CAP_num_address_ranges)) return -EOPNOTSUPP; } @@ -1399,9 +1424,26 @@ static void pt_event_addr_filters_sync(struct perf_event *event) if (filter->path.dentry && !fr[range].start) { msr_a = msr_b = 0; } else { - /* apply the offset */ - msr_a = fr[range].start; - msr_b = msr_a + fr[range].size - 1; + unsigned long n = fr[range].size - 1; + unsigned long a = fr[range].start; + unsigned long b; + + if (a > ULONG_MAX - n) + b = ULONG_MAX; + else + b = a + n; + /* + * Apply the offset. 64-bit addresses written to the + * MSRs must be canonical, but the range can encompass + * non-canonical addresses. Since software cannot + * execute at non-canonical addresses, adjusting to + * canonical addresses does not affect the result of the + * address filter. 
+ */ + msr_a = clamp_to_ge_canonical_addr(a, boot_cpu_data.x86_virt_bits); + msr_b = clamp_to_le_canonical_addr(b, boot_cpu_data.x86_virt_bits); + if (msr_b < msr_a) + msr_a = msr_b = 0; } filters->filter[range].msr_a = msr_a; @@ -1710,7 +1752,7 @@ static __init int pt_init(void) if (!boot_cpu_has(X86_FEATURE_INTEL_PT)) return -ENODEV; - get_online_cpus(); + cpus_read_lock(); for_each_online_cpu(cpu) { u64 ctl; @@ -1718,7 +1760,7 @@ static __init int pt_init(void) if (!ret && (ctl & RTIT_CTL_TRACEEN)) prior_warn++; } - put_online_cpus(); + cpus_read_unlock(); if (prior_warn) { x86_add_exclusive(x86_lbr_exclusive_pt); diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 86467f85c383..6f1ccc57a692 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -4,19 +4,26 @@ #include <asm/cpu_device_id.h> #include <asm/intel-family.h> #include "uncore.h" +#include "uncore_discovery.h" -static struct intel_uncore_type *empty_uncore[] = { NULL, }; +static bool uncore_no_discover; +module_param(uncore_no_discover, bool, 0); +MODULE_PARM_DESC(uncore_no_discover, "Don't enable the Intel uncore PerfMon discovery mechanism " + "(default: enable the discovery mechanism)."); +struct intel_uncore_type *empty_uncore[] = { NULL, }; struct intel_uncore_type **uncore_msr_uncores = empty_uncore; struct intel_uncore_type **uncore_pci_uncores = empty_uncore; struct intel_uncore_type **uncore_mmio_uncores = empty_uncore; static bool pcidrv_registered; struct pci_driver *uncore_pci_driver; +/* The PCI driver for the device which the uncore doesn't own. 
*/ +struct pci_driver *uncore_pci_sub_driver; /* pci bus to socket mapping */ DEFINE_RAW_SPINLOCK(pci2phy_map_lock); struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head); struct pci_extra_dev *uncore_extra_pci_dev; -static int max_dies; +int __uncore_max_dies; /* mask of cpus that collect uncore events */ static cpumask_t uncore_cpu_mask; @@ -29,21 +36,33 @@ struct event_constraint uncore_constraint_empty = MODULE_LICENSE("GPL"); -int uncore_pcibus_to_physid(struct pci_bus *bus) +int uncore_pcibus_to_dieid(struct pci_bus *bus) { struct pci2phy_map *map; - int phys_id = -1; + int die_id = -1; raw_spin_lock(&pci2phy_map_lock); list_for_each_entry(map, &pci2phy_map_head, list) { if (map->segment == pci_domain_nr(bus)) { - phys_id = map->pbus_to_physid[bus->number]; + die_id = map->pbus_to_dieid[bus->number]; break; } } raw_spin_unlock(&pci2phy_map_lock); - return phys_id; + return die_id; +} + +int uncore_die_to_segment(int die) +{ + struct pci_bus *bus = NULL; + + /* Find first pci bus which attributes to specified die. */ + while ((bus = pci_find_next_bus(bus)) && + (die != uncore_pcibus_to_dieid(bus))) + ; + + return bus ? pci_domain_nr(bus) : -EINVAL; } static void uncore_free_pcibus_map(void) @@ -84,7 +103,7 @@ lookup: alloc = NULL; map->segment = segment; for (i = 0; i < 256; i++) - map->pbus_to_physid[i] = -1; + map->pbus_to_dieid[i] = -1; list_add_tail(&map->list, &pci2phy_map_head); end: @@ -92,8 +111,8 @@ end: return map; } -ssize_t uncore_event_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) +ssize_t uncore_event_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct uncore_event_desc *event = container_of(attr, struct uncore_event_desc, attr); @@ -108,7 +127,7 @@ struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu * The unsigned check also catches the '-1' return value for non * existent mappings in the topology map. */ - return dieid < max_dies ? 
pmu->boxes[dieid] : NULL; + return dieid < uncore_max_dies() ? pmu->boxes[dieid] : NULL; } u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) @@ -132,6 +151,9 @@ u64 uncore_mmio_read_counter(struct intel_uncore_box *box, if (!box->io_addr) return 0; + if (!uncore_mmio_is_valid_offset(box, event->hw.event_base)) + return 0; + return readq(box->io_addr + event->hw.event_base); } @@ -327,7 +349,6 @@ static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, uncore_pmu_init_hrtimer(box); box->cpu = -1; - box->pci_phys_id = -1; box->dieid = -1; /* set default hrtimer timeout */ @@ -780,8 +801,6 @@ static void uncore_pmu_enable(struct pmu *pmu) struct intel_uncore_box *box; uncore_pmu = container_of(pmu, struct intel_uncore_pmu, pmu); - if (!uncore_pmu) - return; box = uncore_pmu_to_box(uncore_pmu, smp_processor_id()); if (!box) @@ -797,8 +816,6 @@ static void uncore_pmu_disable(struct pmu *pmu) struct intel_uncore_box *box; uncore_pmu = container_of(pmu, struct intel_uncore_pmu, pmu); - if (!uncore_pmu) - return; box = uncore_pmu_to_box(uncore_pmu, smp_processor_id()); if (!box) @@ -825,6 +842,45 @@ static const struct attribute_group uncore_pmu_attr_group = { .attrs = uncore_pmu_attrs, }; +void uncore_get_alias_name(char *pmu_name, struct intel_uncore_pmu *pmu) +{ + struct intel_uncore_type *type = pmu->type; + + if (type->num_boxes == 1) + sprintf(pmu_name, "uncore_type_%u", type->type_id); + else { + sprintf(pmu_name, "uncore_type_%u_%d", + type->type_id, type->box_ids[pmu->pmu_idx]); + } +} + +static void uncore_get_pmu_name(struct intel_uncore_pmu *pmu) +{ + struct intel_uncore_type *type = pmu->type; + + /* + * No uncore block name in discovery table. + * Use uncore_type_&typeid_&boxid as name. 
+ */ + if (!type->name) { + uncore_get_alias_name(pmu->name, pmu); + return; + } + + if (type->num_boxes == 1) { + if (strlen(type->name) > 0) + sprintf(pmu->name, "uncore_%s", type->name); + else + sprintf(pmu->name, "uncore"); + } else { + /* + * Use the box ID from the discovery table if applicable. + */ + sprintf(pmu->name, "uncore_%s_%d", type->name, + type->box_ids ? type->box_ids[pmu->pmu_idx] : pmu->pmu_idx); + } +} + static int uncore_pmu_register(struct intel_uncore_pmu *pmu) { int ret; @@ -843,21 +899,15 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu) .read = uncore_pmu_event_read, .module = THIS_MODULE, .capabilities = PERF_PMU_CAP_NO_EXCLUDE, + .attr_update = pmu->type->attr_update, }; } else { pmu->pmu = *pmu->type->pmu; pmu->pmu.attr_groups = pmu->type->attr_groups; + pmu->pmu.attr_update = pmu->type->attr_update; } - if (pmu->type->num_boxes == 1) { - if (strlen(pmu->type->name) > 0) - sprintf(pmu->name, "uncore_%s", pmu->type->name); - else - sprintf(pmu->name, "uncore"); - } else { - sprintf(pmu->name, "uncore_%s_%d", pmu->type->name, - pmu->pmu_idx); - } + uncore_get_pmu_name(pmu); ret = perf_pmu_register(&pmu->pmu, pmu->name, -1); if (!ret) @@ -877,7 +927,7 @@ static void uncore_free_boxes(struct intel_uncore_pmu *pmu) { int die; - for (die = 0; die < max_dies; die++) + for (die = 0; die < uncore_max_dies(); die++) kfree(pmu->boxes[die]); kfree(pmu->boxes); } @@ -887,6 +937,9 @@ static void uncore_type_exit(struct intel_uncore_type *type) struct intel_uncore_pmu *pmu = type->pmus; int i; + if (type->cleanup_mapping) + type->cleanup_mapping(type); + if (pmu) { for (i = 0; i < type->num_boxes; i++, pmu++) { uncore_pmu_unregister(pmu); @@ -895,6 +948,10 @@ static void uncore_type_exit(struct intel_uncore_type *type) kfree(type->pmus); type->pmus = NULL; } + if (type->box_ids) { + kfree(type->box_ids); + type->box_ids = NULL; + } kfree(type->events_group); type->events_group = NULL; } @@ -915,7 +972,7 @@ static int __init 
uncore_type_init(struct intel_uncore_type *type, bool setid) if (!pmus) return -ENOMEM; - size = max_dies * sizeof(struct intel_uncore_box *); + size = uncore_max_dies() * sizeof(struct intel_uncore_box *); for (i = 0; i < type->num_boxes; i++) { pmus[i].func_id = setid ? i : -1; @@ -954,6 +1011,9 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid) type->pmu_group = &uncore_pmu_attr_group; + if (type->set_mapping) + type->set_mapping(type); + return 0; err: @@ -978,65 +1038,93 @@ uncore_types_init(struct intel_uncore_type **types, bool setid) } /* - * add a pci uncore device + * Get the die information of a PCI device. + * @pdev: The PCI device. + * @die: The die id which the device maps to. */ -static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) +static int uncore_pci_get_dev_die_info(struct pci_dev *pdev, int *die) { - struct intel_uncore_type *type; - struct intel_uncore_pmu *pmu = NULL; - struct intel_uncore_box *box; - int phys_id, die, ret; - - phys_id = uncore_pcibus_to_physid(pdev->bus); - if (phys_id < 0) - return -ENODEV; - - die = (topology_max_die_per_package() > 1) ? 
phys_id : - topology_phys_to_logical_pkg(phys_id); - if (die < 0) + *die = uncore_pcibus_to_dieid(pdev->bus); + if (*die < 0) return -EINVAL; - if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) { - int idx = UNCORE_PCI_DEV_IDX(id->driver_data); + return 0; +} - uncore_extra_pci_dev[die].dev[idx] = pdev; - pci_set_drvdata(pdev, NULL); - return 0; +static struct intel_uncore_pmu * +uncore_pci_find_dev_pmu_from_types(struct pci_dev *pdev) +{ + struct intel_uncore_type **types = uncore_pci_uncores; + struct intel_uncore_type *type; + u64 box_ctl; + int i, die; + + for (; *types; types++) { + type = *types; + for (die = 0; die < __uncore_max_dies; die++) { + for (i = 0; i < type->num_boxes; i++) { + if (!type->box_ctls[die]) + continue; + box_ctl = type->box_ctls[die] + type->pci_offsets[i]; + if (pdev->devfn == UNCORE_DISCOVERY_PCI_DEVFN(box_ctl) && + pdev->bus->number == UNCORE_DISCOVERY_PCI_BUS(box_ctl) && + pci_domain_nr(pdev->bus) == UNCORE_DISCOVERY_PCI_DOMAIN(box_ctl)) + return &type->pmus[i]; + } + } } - type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)]; + return NULL; +} - /* - * Some platforms, e.g. Knights Landing, use a common PCI device ID - * for multiple instances of an uncore PMU device type. We should check - * PCI slot and func to indicate the uncore box. - */ - if (id->driver_data & ~0xffff) { - struct pci_driver *pci_drv = pdev->driver; - const struct pci_device_id *ids = pci_drv->id_table; - unsigned int devfn; - - while (ids && ids->vendor) { - if ((ids->vendor == pdev->vendor) && - (ids->device == pdev->device)) { - devfn = PCI_DEVFN(UNCORE_PCI_DEV_DEV(ids->driver_data), - UNCORE_PCI_DEV_FUNC(ids->driver_data)); - if (devfn == pdev->devfn) { - pmu = &type->pmus[UNCORE_PCI_DEV_IDX(ids->driver_data)]; - break; - } +/* + * Find the PMU of a PCI device. + * @pdev: The PCI device. + * @ids: The ID table of the available PCI devices with a PMU. + * If NULL, search the whole uncore_pci_uncores. 
+ */ +static struct intel_uncore_pmu * +uncore_pci_find_dev_pmu(struct pci_dev *pdev, const struct pci_device_id *ids) +{ + struct intel_uncore_pmu *pmu = NULL; + struct intel_uncore_type *type; + kernel_ulong_t data; + unsigned int devfn; + + if (!ids) + return uncore_pci_find_dev_pmu_from_types(pdev); + + while (ids && ids->vendor) { + if ((ids->vendor == pdev->vendor) && + (ids->device == pdev->device)) { + data = ids->driver_data; + devfn = PCI_DEVFN(UNCORE_PCI_DEV_DEV(data), + UNCORE_PCI_DEV_FUNC(data)); + if (devfn == pdev->devfn) { + type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(data)]; + pmu = &type->pmus[UNCORE_PCI_DEV_IDX(data)]; + break; } - ids++; } - if (pmu == NULL) - return -ENODEV; - } else { - /* - * for performance monitoring unit with multiple boxes, - * each box has a different function id. - */ - pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)]; + ids++; } + return pmu; +} + +/* + * Register the PMU for a PCI device + * @pdev: The PCI device. + * @type: The corresponding PMU type of the device. + * @pmu: The corresponding PMU of the device. + * @die: The die id which the device maps to. 
+ */ +static int uncore_pci_pmu_register(struct pci_dev *pdev, + struct intel_uncore_type *type, + struct intel_uncore_pmu *pmu, + int die) +{ + struct intel_uncore_box *box; + int ret; if (WARN_ON_ONCE(pmu->boxes[die] != NULL)) return -EINVAL; @@ -1051,12 +1139,10 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id WARN_ON_ONCE(pmu->func_id != pdev->devfn); atomic_inc(&box->refcnt); - box->pci_phys_id = phys_id; box->dieid = die; box->pci_dev = pdev; box->pmu = pmu; uncore_box_init(box); - pci_set_drvdata(pdev, box); pmu->boxes[die] = box; if (atomic_inc_return(&pmu->activeboxes) > 1) @@ -1065,7 +1151,6 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id /* First active box registers the pmu */ ret = uncore_pmu_register(pmu); if (ret) { - pci_set_drvdata(pdev, NULL); pmu->boxes[die] = NULL; uncore_box_exit(box); kfree(box); @@ -1073,18 +1158,82 @@ static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id return ret; } +/* + * add a pci uncore device + */ +static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu = NULL; + int die, ret; + + ret = uncore_pci_get_dev_die_info(pdev, &die); + if (ret) + return ret; + + if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) { + int idx = UNCORE_PCI_DEV_IDX(id->driver_data); + + uncore_extra_pci_dev[die].dev[idx] = pdev; + pci_set_drvdata(pdev, NULL); + return 0; + } + + type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)]; + + /* + * Some platforms, e.g. Knights Landing, use a common PCI device ID + * for multiple instances of an uncore PMU device type. We should check + * PCI slot and func to indicate the uncore box. 
+ */ + if (id->driver_data & ~0xffff) { + struct pci_driver *pci_drv = to_pci_driver(pdev->dev.driver); + + pmu = uncore_pci_find_dev_pmu(pdev, pci_drv->id_table); + if (pmu == NULL) + return -ENODEV; + } else { + /* + * for performance monitoring unit with multiple boxes, + * each box has a different function id. + */ + pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)]; + } + + ret = uncore_pci_pmu_register(pdev, type, pmu, die); + + pci_set_drvdata(pdev, pmu->boxes[die]); + + return ret; +} + +/* + * Unregister the PMU of a PCI device + * @pmu: The corresponding PMU is unregistered. + * @die: The die id which the device maps to. + */ +static void uncore_pci_pmu_unregister(struct intel_uncore_pmu *pmu, int die) +{ + struct intel_uncore_box *box = pmu->boxes[die]; + + pmu->boxes[die] = NULL; + if (atomic_dec_return(&pmu->activeboxes) == 0) + uncore_pmu_unregister(pmu); + uncore_box_exit(box); + kfree(box); +} + static void uncore_pci_remove(struct pci_dev *pdev) { struct intel_uncore_box *box; struct intel_uncore_pmu *pmu; - int i, phys_id, die; + int i, die; - phys_id = uncore_pcibus_to_physid(pdev->bus); + if (uncore_pci_get_dev_die_info(pdev, &die)) + return; box = pci_get_drvdata(pdev); if (!box) { - die = (topology_max_die_per_package() > 1) ? 
phys_id : - topology_phys_to_logical_pkg(phys_id); for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) { if (uncore_extra_pci_dev[die].dev[i] == pdev) { uncore_extra_pci_dev[die].dev[i] = NULL; @@ -1096,15 +1245,133 @@ static void uncore_pci_remove(struct pci_dev *pdev) } pmu = box->pmu; - if (WARN_ON_ONCE(phys_id != box->pci_phys_id)) - return; pci_set_drvdata(pdev, NULL); - pmu->boxes[box->dieid] = NULL; - if (atomic_dec_return(&pmu->activeboxes) == 0) - uncore_pmu_unregister(pmu); - uncore_box_exit(box); - kfree(box); + + uncore_pci_pmu_unregister(pmu, die); +} + +static int uncore_bus_notify(struct notifier_block *nb, + unsigned long action, void *data, + const struct pci_device_id *ids) +{ + struct device *dev = data; + struct pci_dev *pdev = to_pci_dev(dev); + struct intel_uncore_pmu *pmu; + int die; + + /* Unregister the PMU when the device is going to be deleted. */ + if (action != BUS_NOTIFY_DEL_DEVICE) + return NOTIFY_DONE; + + pmu = uncore_pci_find_dev_pmu(pdev, ids); + if (!pmu) + return NOTIFY_DONE; + + if (uncore_pci_get_dev_die_info(pdev, &die)) + return NOTIFY_DONE; + + uncore_pci_pmu_unregister(pmu, die); + + return NOTIFY_OK; +} + +static int uncore_pci_sub_bus_notify(struct notifier_block *nb, + unsigned long action, void *data) +{ + return uncore_bus_notify(nb, action, data, + uncore_pci_sub_driver->id_table); +} + +static struct notifier_block uncore_pci_sub_notifier = { + .notifier_call = uncore_pci_sub_bus_notify, +}; + +static void uncore_pci_sub_driver_init(void) +{ + const struct pci_device_id *ids = uncore_pci_sub_driver->id_table; + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct pci_dev *pci_sub_dev; + bool notify = false; + unsigned int devfn; + int die; + + while (ids && ids->vendor) { + pci_sub_dev = NULL; + type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(ids->driver_data)]; + /* + * Search the available device, and register the + * corresponding PMU. 
+ */ + while ((pci_sub_dev = pci_get_device(PCI_VENDOR_ID_INTEL, + ids->device, pci_sub_dev))) { + devfn = PCI_DEVFN(UNCORE_PCI_DEV_DEV(ids->driver_data), + UNCORE_PCI_DEV_FUNC(ids->driver_data)); + if (devfn != pci_sub_dev->devfn) + continue; + + pmu = &type->pmus[UNCORE_PCI_DEV_IDX(ids->driver_data)]; + if (!pmu) + continue; + + if (uncore_pci_get_dev_die_info(pci_sub_dev, &die)) + continue; + + if (!uncore_pci_pmu_register(pci_sub_dev, type, pmu, + die)) + notify = true; + } + ids++; + } + + if (notify && bus_register_notifier(&pci_bus_type, &uncore_pci_sub_notifier)) + notify = false; + + if (!notify) + uncore_pci_sub_driver = NULL; +} + +static int uncore_pci_bus_notify(struct notifier_block *nb, + unsigned long action, void *data) +{ + return uncore_bus_notify(nb, action, data, NULL); +} + +static struct notifier_block uncore_pci_notifier = { + .notifier_call = uncore_pci_bus_notify, +}; + + +static void uncore_pci_pmus_register(void) +{ + struct intel_uncore_type **types = uncore_pci_uncores; + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct pci_dev *pdev; + u64 box_ctl; + int i, die; + + for (; *types; types++) { + type = *types; + for (die = 0; die < __uncore_max_dies; die++) { + for (i = 0; i < type->num_boxes; i++) { + if (!type->box_ctls[die]) + continue; + box_ctl = type->box_ctls[die] + type->pci_offsets[i]; + pdev = pci_get_domain_bus_and_slot(UNCORE_DISCOVERY_PCI_DOMAIN(box_ctl), + UNCORE_DISCOVERY_PCI_BUS(box_ctl), + UNCORE_DISCOVERY_PCI_DEVFN(box_ctl)); + if (!pdev) + continue; + pmu = &type->pmus[i]; + + uncore_pci_pmu_register(pdev, type, pmu, die); + } + } + } + + bus_register_notifier(&pci_bus_type, &uncore_pci_notifier); } static int __init uncore_pci_init(void) @@ -1112,7 +1379,7 @@ static int __init uncore_pci_init(void) size_t size; int ret; - size = max_dies * sizeof(struct pci_extra_dev); + size = uncore_max_dies() * sizeof(struct pci_extra_dev); uncore_extra_pci_dev = kzalloc(size, GFP_KERNEL); if 
(!uncore_extra_pci_dev) { ret = -ENOMEM; @@ -1123,12 +1390,18 @@ static int __init uncore_pci_init(void) if (ret) goto errtype; - uncore_pci_driver->probe = uncore_pci_probe; - uncore_pci_driver->remove = uncore_pci_remove; + if (uncore_pci_driver) { + uncore_pci_driver->probe = uncore_pci_probe; + uncore_pci_driver->remove = uncore_pci_remove; - ret = pci_register_driver(uncore_pci_driver); - if (ret) - goto errtype; + ret = pci_register_driver(uncore_pci_driver); + if (ret) + goto errtype; + } else + uncore_pci_pmus_register(); + + if (uncore_pci_sub_driver) + uncore_pci_sub_driver_init(); pcidrv_registered = true; return 0; @@ -1147,7 +1420,12 @@ static void uncore_pci_exit(void) { if (pcidrv_registered) { pcidrv_registered = false; - pci_unregister_driver(uncore_pci_driver); + if (uncore_pci_sub_driver) + bus_unregister_notifier(&pci_bus_type, &uncore_pci_sub_notifier); + if (uncore_pci_driver) + pci_unregister_driver(uncore_pci_driver); + else + bus_unregister_notifier(&pci_bus_type, &uncore_pci_notifier); uncore_types_exit(uncore_pci_uncores); kfree(uncore_extra_pci_dev); uncore_free_pcibus_map(); @@ -1392,14 +1670,11 @@ err: return ret; } - -#define X86_UNCORE_MODEL_MATCH(model, init) \ - { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&init } - struct intel_uncore_init_fun { void (*cpu_init)(void); int (*pci_init)(void); void (*mmio_init)(void); + bool use_discovery; }; static const struct intel_uncore_init_fun nhm_uncore_init __initconst = { @@ -1470,45 +1745,97 @@ static const struct intel_uncore_init_fun icl_uncore_init __initconst = { .pci_init = skl_uncore_pci_init, }; +static const struct intel_uncore_init_fun tgl_uncore_init __initconst = { + .cpu_init = tgl_uncore_cpu_init, + .mmio_init = tgl_uncore_mmio_init, +}; + +static const struct intel_uncore_init_fun tgl_l_uncore_init __initconst = { + .cpu_init = tgl_uncore_cpu_init, + .mmio_init = tgl_l_uncore_mmio_init, +}; + +static const struct intel_uncore_init_fun rkl_uncore_init 
__initconst = { + .cpu_init = tgl_uncore_cpu_init, + .pci_init = skl_uncore_pci_init, +}; + +static const struct intel_uncore_init_fun adl_uncore_init __initconst = { + .cpu_init = adl_uncore_cpu_init, + .mmio_init = adl_uncore_mmio_init, +}; + +static const struct intel_uncore_init_fun icx_uncore_init __initconst = { + .cpu_init = icx_uncore_cpu_init, + .pci_init = icx_uncore_pci_init, + .mmio_init = icx_uncore_mmio_init, +}; + static const struct intel_uncore_init_fun snr_uncore_init __initconst = { .cpu_init = snr_uncore_cpu_init, .pci_init = snr_uncore_pci_init, .mmio_init = snr_uncore_mmio_init, }; +static const struct intel_uncore_init_fun spr_uncore_init __initconst = { + .cpu_init = spr_uncore_cpu_init, + .pci_init = spr_uncore_pci_init, + .mmio_init = spr_uncore_mmio_init, + .use_discovery = true, +}; + +static const struct intel_uncore_init_fun generic_uncore_init __initconst = { + .cpu_init = intel_uncore_generic_uncore_cpu_init, + .pci_init = intel_uncore_generic_uncore_pci_init, + .mmio_init = intel_uncore_generic_uncore_mmio_init, +}; + static const struct x86_cpu_id intel_uncore_match[] __initconst = { - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM_EP, nhm_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM, nhm_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_WESTMERE, nhm_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_WESTMERE_EP, nhm_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE, snb_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE, ivb_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL, hsw_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL_L, hsw_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL_G, hsw_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL, bdw_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_G, bdw_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE_X, snbep_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM_EX, nhmex_uncore_init), - 
X86_UNCORE_MODEL_MATCH(INTEL_FAM6_WESTMERE_EX, nhmex_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE_X, ivbep_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_HASWELL_X, hswep_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, bdx_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_BROADWELL_D, bdx_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, knl_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNM, knl_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE, skl_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_L, skl_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, skx_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE_L, skl_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE, skl_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE_L, icl_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE_NNPI, icl_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE, icl_uncore_init), - X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ATOM_TREMONT_D, snr_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EP, &nhm_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(NEHALEM, &nhm_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(WESTMERE, &nhm_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(WESTMERE_EP, &nhm_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE, &snb_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE, &ivb_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(HASWELL, &hsw_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L, &hsw_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G, &hsw_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, &bdw_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G, &bdw_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X, &snbep_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(NEHALEM_EX, &nhmex_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(WESTMERE_EX, &nhmex_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &ivbep_uncore_init), + 
X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &hswep_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &bdx_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &bdx_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &knl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &knl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE, &skl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L, &skl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &skx_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L, &skl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, &skl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &skl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &skl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &icl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_NNPI, &icl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &icl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &icx_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &icx_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L, &tgl_l_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, &tgl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE, &rkl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &adl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &adl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &adl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &adl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &spr_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init), {}, }; - MODULE_DEVICE_TABLE(x86cpu, intel_uncore_match); static int __init intel_uncore_init(void) @@ -1517,16 +1844,26 @@ static int __init intel_uncore_init(void) struct intel_uncore_init_fun *uncore_init; int pret = 0, cret = 0, mret = 0, ret; - id = x86_match_cpu(intel_uncore_match); - if (!id) - 
return -ENODEV; - if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) return -ENODEV; - max_dies = topology_max_packages() * topology_max_die_per_package(); + __uncore_max_dies = + topology_max_packages() * topology_max_die_per_package(); + + id = x86_match_cpu(intel_uncore_match); + if (!id) { + if (!uncore_no_discover && intel_uncore_has_discovery_tables()) + uncore_init = (struct intel_uncore_init_fun *)&generic_uncore_init; + else + return -ENODEV; + } else { + uncore_init = (struct intel_uncore_init_fun *)id->driver_data; + if (uncore_no_discover && uncore_init->use_discovery) + return -ENODEV; + if (uncore_init->use_discovery && !intel_uncore_has_discovery_tables()) + return -ENODEV; + } - uncore_init = (struct intel_uncore_init_fun *)id->driver_data; if (uncore_init->pci_init) { pret = uncore_init->pci_init(); if (!pret) @@ -1543,8 +1880,10 @@ static int __init intel_uncore_init(void) mret = uncore_mmio_init(); } - if (cret && pret && mret) - return -ENODEV; + if (cret && pret && mret) { + ret = -ENODEV; + goto free_discovery; + } /* Install hotplug callbacks to setup the targets for each package */ ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_UNCORE_ONLINE, @@ -1559,6 +1898,8 @@ err: uncore_types_exit(uncore_msr_uncores); uncore_types_exit(uncore_mmio_uncores); uncore_pci_exit(); +free_discovery: + intel_uncore_clear_discovery_tables(); return ret; } module_init(intel_uncore_init); @@ -1569,5 +1910,6 @@ static void __exit intel_uncore_exit(void) uncore_types_exit(uncore_msr_uncores); uncore_types_exit(uncore_mmio_uncores); uncore_pci_exit(); + intel_uncore_clear_discovery_tables(); } module_exit(intel_uncore_exit); diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index bbfdaa720b45..2adeaf4de4df 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -42,6 +42,7 @@ struct intel_uncore_pmu; struct intel_uncore_box; struct uncore_event_desc; struct freerunning_counters; +struct intel_uncore_topology; struct 
intel_uncore_type { const char *name; @@ -50,6 +51,7 @@ struct intel_uncore_type { int perf_ctr_bits; int fixed_ctr_bits; int num_freerunning_types; + int type_id; unsigned perf_ctr; unsigned event_ctl; unsigned event_mask; @@ -57,14 +59,21 @@ struct intel_uncore_type { unsigned fixed_ctr; unsigned fixed_ctl; unsigned box_ctl; + u64 *box_ctls; /* Unit ctrl addr of the first box of each die */ union { unsigned msr_offset; unsigned mmio_offset; }; + unsigned mmio_map_size; unsigned num_shared_regs:8; unsigned single_fixed:1; unsigned pair_ctr_ctl:1; - unsigned *msr_offsets; + union { + unsigned *msr_offsets; + unsigned *pci_offsets; + unsigned *mmio_offsets; + }; + unsigned *box_ids; struct event_constraint unconstrainted; struct event_constraint *constraints; struct intel_uncore_pmu *pmus; @@ -72,7 +81,20 @@ struct intel_uncore_type { struct uncore_event_desc *event_descs; struct freerunning_counters *freerunning; const struct attribute_group *attr_groups[4]; + const struct attribute_group **attr_update; struct pmu *pmu; /* for custom pmu ops */ + /* + * Uncore PMU would store relevant platform topology configuration here + * to identify which platform component each PMON block of that type is + * supposed to monitor. 
+ */ + struct intel_uncore_topology *topology; + /* + * Optional callbacks for managing mapping of Uncore units to PMONs + */ + int (*get_topology)(struct intel_uncore_type *type); + int (*set_mapping)(struct intel_uncore_type *type); + void (*cleanup_mapping)(struct intel_uncore_type *type); }; #define pmu_group attr_groups[0] @@ -111,7 +133,6 @@ struct intel_uncore_extra_reg { }; struct intel_uncore_box { - int pci_phys_id; int dieid; /* Logical die ID */ int n_active; /* number of active events */ int n_events; @@ -130,7 +151,7 @@ struct intel_uncore_box { struct list_head list; struct list_head active_list; void __iomem *io_addr; - struct intel_uncore_extra_reg shared_regs[0]; + struct intel_uncore_extra_reg shared_regs[]; }; /* CFL uncore 8th cbox MSRs */ @@ -144,7 +165,7 @@ struct intel_uncore_box { #define UNCORE_BOX_FLAG_CFL8_CBOX_MSR_OFFS 2 struct uncore_event_desc { - struct kobj_attribute attr; + struct device_attribute attr; const char *config; }; @@ -154,19 +175,38 @@ struct freerunning_counters { unsigned int box_offset; unsigned int num_counters; unsigned int bits; + unsigned *box_offsets; +}; + +struct intel_uncore_topology { + u64 configuration; + int segment; }; struct pci2phy_map { struct list_head list; int segment; - int pbus_to_physid[256]; + int pbus_to_dieid[256]; }; struct pci2phy_map *__find_pci2phy_map(int segment); -int uncore_pcibus_to_physid(struct pci_bus *bus); +int uncore_pcibus_to_dieid(struct pci_bus *bus); +int uncore_die_to_segment(int die); + +ssize_t uncore_event_show(struct device *dev, + struct device_attribute *attr, char *buf); -ssize_t uncore_event_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf); +static inline struct intel_uncore_pmu *dev_to_uncore_pmu(struct device *dev) +{ + return container_of(dev_get_drvdata(dev), struct intel_uncore_pmu, pmu); +} + +#define to_device_attribute(n) container_of(n, struct device_attribute, attr) +#define to_dev_ext_attribute(n) container_of(n, struct 
dev_ext_attribute, attr) +#define attr_to_ext_attr(n) to_dev_ext_attribute(to_device_attribute(n)) + +extern int __uncore_max_dies; +#define uncore_max_dies() (__uncore_max_dies) #define INTEL_UNCORE_EVENT_DESC(_name, _config) \ { \ @@ -175,14 +215,14 @@ ssize_t uncore_event_show(struct kobject *kobj, } #define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format) \ -static ssize_t __uncore_##_var##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, \ +static ssize_t __uncore_##_var##_show(struct device *dev, \ + struct device_attribute *attr, \ char *page) \ { \ BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ return sprintf(page, _format "\n"); \ } \ -static struct kobj_attribute format_attr_##_var = \ +static struct device_attribute format_attr_##_var = \ __ATTR(_name, 0444, __uncore_##_var##_show, NULL) static inline bool uncore_pmc_fixed(int idx) @@ -195,6 +235,18 @@ static inline bool uncore_pmc_freerunning(int idx) return idx == UNCORE_PMC_IDX_FREERUNNING; } +static inline bool uncore_mmio_is_valid_offset(struct intel_uncore_box *box, + unsigned long offset) +{ + if (offset < box->pmu->type->mmio_map_size) + return true; + + pr_warn_once("perf uncore: Invalid offset 0x%lx exceeds mapped area of %s.\n", + offset, box->pmu->type->name); + + return false; +} + static inline unsigned int uncore_mmio_box_ctl(struct intel_uncore_box *box) { @@ -310,7 +362,9 @@ unsigned int uncore_freerunning_counter(struct intel_uncore_box *box, return pmu->type->freerunning[type].counter_base + pmu->type->freerunning[type].counter_offset * idx + - pmu->type->freerunning[type].box_offset * pmu->pmu_idx; + (pmu->type->freerunning[type].box_offsets ? 
+ pmu->type->freerunning[type].box_offsets[pmu->pmu_idx] : + pmu->type->freerunning[type].box_offset * pmu->pmu_idx); } static inline @@ -507,11 +561,14 @@ struct event_constraint * uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event); void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event); u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx); +void uncore_get_alias_name(char *pmu_name, struct intel_uncore_pmu *pmu); +extern struct intel_uncore_type *empty_uncore[]; extern struct intel_uncore_type **uncore_msr_uncores; extern struct intel_uncore_type **uncore_pci_uncores; extern struct intel_uncore_type **uncore_mmio_uncores; extern struct pci_driver *uncore_pci_driver; +extern struct pci_driver *uncore_pci_sub_driver; extern raw_spinlock_t pci2phy_map_lock; extern struct list_head pci2phy_map_head; extern struct pci_extra_dev *uncore_extra_pci_dev; @@ -527,6 +584,11 @@ void snb_uncore_cpu_init(void); void nhm_uncore_cpu_init(void); void skl_uncore_cpu_init(void); void icl_uncore_cpu_init(void); +void tgl_uncore_cpu_init(void); +void adl_uncore_cpu_init(void); +void tgl_uncore_mmio_init(void); +void tgl_l_uncore_mmio_init(void); +void adl_uncore_mmio_init(void); int snb_pci2phy_map_init(int devid); /* uncore_snbep.c */ @@ -545,6 +607,12 @@ void skx_uncore_cpu_init(void); int snr_uncore_pci_init(void); void snr_uncore_cpu_init(void); void snr_uncore_mmio_init(void); +int icx_uncore_pci_init(void); +void icx_uncore_cpu_init(void); +void icx_uncore_mmio_init(void); +int spr_uncore_pci_init(void); +void spr_uncore_cpu_init(void); +void spr_uncore_mmio_init(void); /* uncore_nhmex.c */ void nhmex_uncore_cpu_init(void); diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c new file mode 100644 index 000000000000..5fd72d4b8bbb --- /dev/null +++ b/arch/x86/events/intel/uncore_discovery.c @@ -0,0 +1,630 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Support 
Intel uncore PerfMon discovery mechanism. + * Copyright(c) 2021 Intel Corporation. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include "uncore.h" +#include "uncore_discovery.h" + +static struct rb_root discovery_tables = RB_ROOT; +static int num_discovered_types[UNCORE_ACCESS_MAX]; + +static bool has_generic_discovery_table(void) +{ + struct pci_dev *dev; + int dvsec; + + dev = pci_get_device(PCI_VENDOR_ID_INTEL, UNCORE_DISCOVERY_TABLE_DEVICE, NULL); + if (!dev) + return false; + + /* A discovery table device has the unique capability ID. */ + dvsec = pci_find_next_ext_capability(dev, 0, UNCORE_EXT_CAP_ID_DISCOVERY); + pci_dev_put(dev); + if (dvsec) + return true; + + return false; +} + +static int logical_die_id; + +static int get_device_die_id(struct pci_dev *dev) +{ + int cpu, node = pcibus_to_node(dev->bus); + + /* + * If the NUMA info is not available, assume that the logical die id is + * continuous in the order in which the discovery table devices are + * detected. + */ + if (node < 0) + return logical_die_id++; + + for_each_cpu(cpu, cpumask_of_node(node)) { + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->initialized && cpu_to_node(cpu) == node) + return c->logical_die_id; + } + + /* + * All CPUs of a node may be offlined. For this case, + * the PCI and MMIO type of uncore blocks which are + * enumerated by the device will be unavailable. + */ + return -1; +} + +#define __node_2_type(cur) \ + rb_entry((cur), struct intel_uncore_discovery_type, node) + +static inline int __type_cmp(const void *key, const struct rb_node *b) +{ + struct intel_uncore_discovery_type *type_b = __node_2_type(b); + const u16 *type_id = key; + + if (type_b->type > *type_id) + return -1; + else if (type_b->type < *type_id) + return 1; + + return 0; +} + +static inline struct intel_uncore_discovery_type * +search_uncore_discovery_type(u16 type_id) +{ + struct rb_node *node = rb_find(&type_id, &discovery_tables, __type_cmp); + + return (node) ? 
__node_2_type(node) : NULL; +} + +static inline bool __type_less(struct rb_node *a, const struct rb_node *b) +{ + return (__node_2_type(a)->type < __node_2_type(b)->type); +} + +static struct intel_uncore_discovery_type * +add_uncore_discovery_type(struct uncore_unit_discovery *unit) +{ + struct intel_uncore_discovery_type *type; + + if (unit->access_type >= UNCORE_ACCESS_MAX) { + pr_warn("Unsupported access type %d\n", unit->access_type); + return NULL; + } + + type = kzalloc(sizeof(struct intel_uncore_discovery_type), GFP_KERNEL); + if (!type) + return NULL; + + type->box_ctrl_die = kcalloc(__uncore_max_dies, sizeof(u64), GFP_KERNEL); + if (!type->box_ctrl_die) + goto free_type; + + type->access_type = unit->access_type; + num_discovered_types[type->access_type]++; + type->type = unit->box_type; + + rb_add(&type->node, &discovery_tables, __type_less); + + return type; + +free_type: + kfree(type); + + return NULL; + +} + +static struct intel_uncore_discovery_type * +get_uncore_discovery_type(struct uncore_unit_discovery *unit) +{ + struct intel_uncore_discovery_type *type; + + type = search_uncore_discovery_type(unit->box_type); + if (type) + return type; + + return add_uncore_discovery_type(unit); +} + +static void +uncore_insert_box_info(struct uncore_unit_discovery *unit, + int die, bool parsed) +{ + struct intel_uncore_discovery_type *type; + unsigned int *box_offset, *ids; + int i; + + if (WARN_ON_ONCE(!unit->ctl || !unit->ctl_offset || !unit->ctr_offset)) + return; + + if (parsed) { + type = search_uncore_discovery_type(unit->box_type); + if (WARN_ON_ONCE(!type)) + return; + /* Store the first box of each die */ + if (!type->box_ctrl_die[die]) + type->box_ctrl_die[die] = unit->ctl; + return; + } + + type = get_uncore_discovery_type(unit); + if (!type) + return; + + box_offset = kcalloc(type->num_boxes + 1, sizeof(unsigned int), GFP_KERNEL); + if (!box_offset) + return; + + ids = kcalloc(type->num_boxes + 1, sizeof(unsigned int), GFP_KERNEL); + if (!ids) + 
goto free_box_offset; + + /* Store generic information for the first box */ + if (!type->num_boxes) { + type->box_ctrl = unit->ctl; + type->box_ctrl_die[die] = unit->ctl; + type->num_counters = unit->num_regs; + type->counter_width = unit->bit_width; + type->ctl_offset = unit->ctl_offset; + type->ctr_offset = unit->ctr_offset; + *ids = unit->box_id; + goto end; + } + + for (i = 0; i < type->num_boxes; i++) { + ids[i] = type->ids[i]; + box_offset[i] = type->box_offset[i]; + + if (WARN_ON_ONCE(unit->box_id == ids[i])) + goto free_ids; + } + ids[i] = unit->box_id; + box_offset[i] = unit->ctl - type->box_ctrl; + kfree(type->ids); + kfree(type->box_offset); +end: + type->ids = ids; + type->box_offset = box_offset; + type->num_boxes++; + return; + +free_ids: + kfree(ids); + +free_box_offset: + kfree(box_offset); + +} + +static int parse_discovery_table(struct pci_dev *dev, int die, + u32 bar_offset, bool *parsed) +{ + struct uncore_global_discovery global; + struct uncore_unit_discovery unit; + void __iomem *io_addr; + resource_size_t addr; + unsigned long size; + u32 val; + int i; + + pci_read_config_dword(dev, bar_offset, &val); + + if (val & ~PCI_BASE_ADDRESS_MEM_MASK & ~PCI_BASE_ADDRESS_MEM_TYPE_64) + return -EINVAL; + + addr = (resource_size_t)(val & PCI_BASE_ADDRESS_MEM_MASK); +#ifdef CONFIG_PHYS_ADDR_T_64BIT + if ((val & PCI_BASE_ADDRESS_MEM_TYPE_MASK) == PCI_BASE_ADDRESS_MEM_TYPE_64) { + u32 val2; + + pci_read_config_dword(dev, bar_offset + 4, &val2); + addr |= ((resource_size_t)val2) << 32; + } +#endif + size = UNCORE_DISCOVERY_GLOBAL_MAP_SIZE; + io_addr = ioremap(addr, size); + if (!io_addr) + return -ENOMEM; + + /* Read Global Discovery State */ + memcpy_fromio(&global, io_addr, sizeof(struct uncore_global_discovery)); + if (uncore_discovery_invalid_unit(global)) { + pr_info("Invalid Global Discovery State: 0x%llx 0x%llx 0x%llx\n", + global.table1, global.ctl, global.table3); + iounmap(io_addr); + return -EINVAL; + } + iounmap(io_addr); + + size = (1 + 
global.max_units) * global.stride * 8; + io_addr = ioremap(addr, size); + if (!io_addr) + return -ENOMEM; + + /* Parsing Unit Discovery State */ + for (i = 0; i < global.max_units; i++) { + memcpy_fromio(&unit, io_addr + (i + 1) * (global.stride * 8), + sizeof(struct uncore_unit_discovery)); + + if (uncore_discovery_invalid_unit(unit)) + continue; + + if (unit.access_type >= UNCORE_ACCESS_MAX) + continue; + + uncore_insert_box_info(&unit, die, *parsed); + } + + *parsed = true; + iounmap(io_addr); + return 0; +} + +bool intel_uncore_has_discovery_tables(void) +{ + u32 device, val, entry_id, bar_offset; + int die, dvsec = 0, ret = true; + struct pci_dev *dev = NULL; + bool parsed = false; + + if (has_generic_discovery_table()) + device = UNCORE_DISCOVERY_TABLE_DEVICE; + else + device = PCI_ANY_ID; + + /* + * Start a new search and iterates through the list of + * the discovery table devices. + */ + while ((dev = pci_get_device(PCI_VENDOR_ID_INTEL, device, dev)) != NULL) { + while ((dvsec = pci_find_next_ext_capability(dev, dvsec, UNCORE_EXT_CAP_ID_DISCOVERY))) { + pci_read_config_dword(dev, dvsec + UNCORE_DISCOVERY_DVSEC_OFFSET, &val); + entry_id = val & UNCORE_DISCOVERY_DVSEC_ID_MASK; + if (entry_id != UNCORE_DISCOVERY_DVSEC_ID_PMON) + continue; + + pci_read_config_dword(dev, dvsec + UNCORE_DISCOVERY_DVSEC2_OFFSET, &val); + + if (val & ~UNCORE_DISCOVERY_DVSEC2_BIR_MASK) { + ret = false; + goto err; + } + bar_offset = UNCORE_DISCOVERY_BIR_BASE + + (val & UNCORE_DISCOVERY_DVSEC2_BIR_MASK) * UNCORE_DISCOVERY_BIR_STEP; + + die = get_device_die_id(dev); + if (die < 0) + continue; + + parse_discovery_table(dev, die, bar_offset, &parsed); + } + } + + /* None of the discovery tables are available */ + if (!parsed) + ret = false; +err: + pci_dev_put(dev); + + return ret; +} + +void intel_uncore_clear_discovery_tables(void) +{ + struct intel_uncore_discovery_type *type, *next; + + rbtree_postorder_for_each_entry_safe(type, next, &discovery_tables, node) { + 
kfree(type->box_ctrl_die); + kfree(type); + } +} + +DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); +DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); +DEFINE_UNCORE_FORMAT_ATTR(thresh, thresh, "config:24-31"); + +static struct attribute *generic_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh.attr, + NULL, +}; + +static const struct attribute_group generic_uncore_format_group = { + .name = "format", + .attrs = generic_uncore_formats_attr, +}; + +void intel_generic_uncore_msr_init_box(struct intel_uncore_box *box) +{ + wrmsrl(uncore_msr_box_ctl(box), GENERIC_PMON_BOX_CTL_INT); +} + +void intel_generic_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + wrmsrl(uncore_msr_box_ctl(box), GENERIC_PMON_BOX_CTL_FRZ); +} + +void intel_generic_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + wrmsrl(uncore_msr_box_ctl(box), 0); +} + +static void intel_generic_uncore_msr_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + wrmsrl(hwc->config_base, hwc->config); +} + +static void intel_generic_uncore_msr_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + wrmsrl(hwc->config_base, 0); +} + +static struct intel_uncore_ops generic_uncore_msr_ops = { + .init_box = intel_generic_uncore_msr_init_box, + .disable_box = intel_generic_uncore_msr_disable_box, + .enable_box = intel_generic_uncore_msr_enable_box, + .disable_event = intel_generic_uncore_msr_disable_event, + .enable_event = intel_generic_uncore_msr_enable_event, + .read_counter = uncore_msr_read_counter, +}; + +void intel_generic_uncore_pci_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); 
+ + __set_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags); + pci_write_config_dword(pdev, box_ctl, GENERIC_PMON_BOX_CTL_INT); +} + +void intel_generic_uncore_pci_disable_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + + pci_write_config_dword(pdev, box_ctl, GENERIC_PMON_BOX_CTL_FRZ); +} + +void intel_generic_uncore_pci_enable_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + + pci_write_config_dword(pdev, box_ctl, 0); +} + +static void intel_generic_uncore_pci_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base, hwc->config); +} + +void intel_generic_uncore_pci_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base, 0); +} + +u64 intel_generic_uncore_pci_read_counter(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + u64 count = 0; + + pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count); + pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1); + + return count; +} + +static struct intel_uncore_ops generic_uncore_pci_ops = { + .init_box = intel_generic_uncore_pci_init_box, + .disable_box = intel_generic_uncore_pci_disable_box, + .enable_box = intel_generic_uncore_pci_enable_box, + .disable_event = intel_generic_uncore_pci_disable_event, + .enable_event = intel_generic_uncore_pci_enable_event, + .read_counter = intel_generic_uncore_pci_read_counter, +}; + +#define UNCORE_GENERIC_MMIO_SIZE 0x4000 + +static u64 generic_uncore_mmio_box_ctl(struct intel_uncore_box *box) +{ + struct intel_uncore_type *type = box->pmu->type; + + 
if (!type->box_ctls || !type->box_ctls[box->dieid] || !type->mmio_offsets) + return 0; + + return type->box_ctls[box->dieid] + type->mmio_offsets[box->pmu->pmu_idx]; +} + +void intel_generic_uncore_mmio_init_box(struct intel_uncore_box *box) +{ + u64 box_ctl = generic_uncore_mmio_box_ctl(box); + struct intel_uncore_type *type = box->pmu->type; + resource_size_t addr; + + if (!box_ctl) { + pr_warn("Uncore type %d box %d: Invalid box control address.\n", + type->type_id, type->box_ids[box->pmu->pmu_idx]); + return; + } + + addr = box_ctl; + box->io_addr = ioremap(addr, UNCORE_GENERIC_MMIO_SIZE); + if (!box->io_addr) { + pr_warn("Uncore type %d box %d: ioremap error for 0x%llx.\n", + type->type_id, type->box_ids[box->pmu->pmu_idx], + (unsigned long long)addr); + return; + } + + writel(GENERIC_PMON_BOX_CTL_INT, box->io_addr); +} + +void intel_generic_uncore_mmio_disable_box(struct intel_uncore_box *box) +{ + if (!box->io_addr) + return; + + writel(GENERIC_PMON_BOX_CTL_FRZ, box->io_addr); +} + +void intel_generic_uncore_mmio_enable_box(struct intel_uncore_box *box) +{ + if (!box->io_addr) + return; + + writel(0, box->io_addr); +} + +void intel_generic_uncore_mmio_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (!box->io_addr) + return; + + writel(hwc->config, box->io_addr + hwc->config_base); +} + +void intel_generic_uncore_mmio_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (!box->io_addr) + return; + + writel(0, box->io_addr + hwc->config_base); +} + +static struct intel_uncore_ops generic_uncore_mmio_ops = { + .init_box = intel_generic_uncore_mmio_init_box, + .exit_box = uncore_mmio_exit_box, + .disable_box = intel_generic_uncore_mmio_disable_box, + .enable_box = intel_generic_uncore_mmio_enable_box, + .disable_event = intel_generic_uncore_mmio_disable_event, + .enable_event = intel_generic_uncore_mmio_enable_event, + 
.read_counter = uncore_mmio_read_counter, +}; + +static bool uncore_update_uncore_type(enum uncore_access_type type_id, + struct intel_uncore_type *uncore, + struct intel_uncore_discovery_type *type) +{ + uncore->type_id = type->type; + uncore->num_boxes = type->num_boxes; + uncore->num_counters = type->num_counters; + uncore->perf_ctr_bits = type->counter_width; + uncore->box_ids = type->ids; + + switch (type_id) { + case UNCORE_ACCESS_MSR: + uncore->ops = &generic_uncore_msr_ops; + uncore->perf_ctr = (unsigned int)type->box_ctrl + type->ctr_offset; + uncore->event_ctl = (unsigned int)type->box_ctrl + type->ctl_offset; + uncore->box_ctl = (unsigned int)type->box_ctrl; + uncore->msr_offsets = type->box_offset; + break; + case UNCORE_ACCESS_PCI: + uncore->ops = &generic_uncore_pci_ops; + uncore->perf_ctr = (unsigned int)UNCORE_DISCOVERY_PCI_BOX_CTRL(type->box_ctrl) + type->ctr_offset; + uncore->event_ctl = (unsigned int)UNCORE_DISCOVERY_PCI_BOX_CTRL(type->box_ctrl) + type->ctl_offset; + uncore->box_ctl = (unsigned int)UNCORE_DISCOVERY_PCI_BOX_CTRL(type->box_ctrl); + uncore->box_ctls = type->box_ctrl_die; + uncore->pci_offsets = type->box_offset; + break; + case UNCORE_ACCESS_MMIO: + uncore->ops = &generic_uncore_mmio_ops; + uncore->perf_ctr = (unsigned int)type->ctr_offset; + uncore->event_ctl = (unsigned int)type->ctl_offset; + uncore->box_ctl = (unsigned int)type->box_ctrl; + uncore->box_ctls = type->box_ctrl_die; + uncore->mmio_offsets = type->box_offset; + uncore->mmio_map_size = UNCORE_GENERIC_MMIO_SIZE; + break; + default: + return false; + } + + return true; +} + +struct intel_uncore_type ** +intel_uncore_generic_init_uncores(enum uncore_access_type type_id, int num_extra) +{ + struct intel_uncore_discovery_type *type; + struct intel_uncore_type **uncores; + struct intel_uncore_type *uncore; + struct rb_node *node; + int i = 0; + + uncores = kcalloc(num_discovered_types[type_id] + num_extra + 1, + sizeof(struct intel_uncore_type *), GFP_KERNEL); + if 
(!uncores) + return empty_uncore; + + for (node = rb_first(&discovery_tables); node; node = rb_next(node)) { + type = rb_entry(node, struct intel_uncore_discovery_type, node); + if (type->access_type != type_id) + continue; + + uncore = kzalloc(sizeof(struct intel_uncore_type), GFP_KERNEL); + if (!uncore) + break; + + uncore->event_mask = GENERIC_PMON_RAW_EVENT_MASK; + uncore->format_group = &generic_uncore_format_group; + + if (!uncore_update_uncore_type(type_id, uncore, type)) { + kfree(uncore); + continue; + } + uncores[i++] = uncore; + } + + return uncores; +} + +void intel_uncore_generic_uncore_cpu_init(void) +{ + uncore_msr_uncores = intel_uncore_generic_init_uncores(UNCORE_ACCESS_MSR, 0); +} + +int intel_uncore_generic_uncore_pci_init(void) +{ + uncore_pci_uncores = intel_uncore_generic_init_uncores(UNCORE_ACCESS_PCI, 0); + + return 0; +} + +void intel_uncore_generic_uncore_mmio_init(void) +{ + uncore_mmio_uncores = intel_uncore_generic_init_uncores(UNCORE_ACCESS_MMIO, 0); +} diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h new file mode 100644 index 000000000000..f4439357779a --- /dev/null +++ b/arch/x86/events/intel/uncore_discovery.h @@ -0,0 +1,152 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +/* Generic device ID of a discovery table device */ +#define UNCORE_DISCOVERY_TABLE_DEVICE 0x09a7 +/* Capability ID for a discovery table device */ +#define UNCORE_EXT_CAP_ID_DISCOVERY 0x23 +/* First DVSEC offset */ +#define UNCORE_DISCOVERY_DVSEC_OFFSET 0x8 +/* Mask of the supported discovery entry type */ +#define UNCORE_DISCOVERY_DVSEC_ID_MASK 0xffff +/* PMON discovery entry type ID */ +#define UNCORE_DISCOVERY_DVSEC_ID_PMON 0x1 +/* Second DVSEC offset */ +#define UNCORE_DISCOVERY_DVSEC2_OFFSET 0xc +/* Mask of the discovery table BAR offset */ +#define UNCORE_DISCOVERY_DVSEC2_BIR_MASK 0x7 +/* Discovery table BAR base offset */ +#define UNCORE_DISCOVERY_BIR_BASE 0x10 +/* Discovery table BAR step */ +#define 
UNCORE_DISCOVERY_BIR_STEP 0x4 +/* Global discovery table size */ +#define UNCORE_DISCOVERY_GLOBAL_MAP_SIZE 0x20 + +#define UNCORE_DISCOVERY_PCI_DOMAIN(data) ((data >> 28) & 0x7) +#define UNCORE_DISCOVERY_PCI_BUS(data) ((data >> 20) & 0xff) +#define UNCORE_DISCOVERY_PCI_DEVFN(data) ((data >> 12) & 0xff) +#define UNCORE_DISCOVERY_PCI_BOX_CTRL(data) (data & 0xfff) + + +#define uncore_discovery_invalid_unit(unit) \ + (!unit.table1 || !unit.ctl || \ + unit.table1 == -1ULL || unit.ctl == -1ULL || \ + unit.table3 == -1ULL) + +#define GENERIC_PMON_CTL_EV_SEL_MASK 0x000000ff +#define GENERIC_PMON_CTL_UMASK_MASK 0x0000ff00 +#define GENERIC_PMON_CTL_EDGE_DET (1 << 18) +#define GENERIC_PMON_CTL_INVERT (1 << 23) +#define GENERIC_PMON_CTL_TRESH_MASK 0xff000000 +#define GENERIC_PMON_RAW_EVENT_MASK (GENERIC_PMON_CTL_EV_SEL_MASK | \ + GENERIC_PMON_CTL_UMASK_MASK | \ + GENERIC_PMON_CTL_EDGE_DET | \ + GENERIC_PMON_CTL_INVERT | \ + GENERIC_PMON_CTL_TRESH_MASK) + +#define GENERIC_PMON_BOX_CTL_FRZ (1 << 0) +#define GENERIC_PMON_BOX_CTL_RST_CTRL (1 << 8) +#define GENERIC_PMON_BOX_CTL_RST_CTRS (1 << 9) +#define GENERIC_PMON_BOX_CTL_INT (GENERIC_PMON_BOX_CTL_RST_CTRL | \ + GENERIC_PMON_BOX_CTL_RST_CTRS) + +enum uncore_access_type { + UNCORE_ACCESS_MSR = 0, + UNCORE_ACCESS_MMIO, + UNCORE_ACCESS_PCI, + + UNCORE_ACCESS_MAX, +}; + +struct uncore_global_discovery { + union { + u64 table1; + struct { + u64 type : 8, + stride : 8, + max_units : 10, + __reserved_1 : 36, + access_type : 2; + }; + }; + + u64 ctl; /* Global Control Address */ + + union { + u64 table3; + struct { + u64 status_offset : 8, + num_status : 16, + __reserved_2 : 40; + }; + }; +}; + +struct uncore_unit_discovery { + union { + u64 table1; + struct { + u64 num_regs : 8, + ctl_offset : 8, + bit_width : 8, + ctr_offset : 8, + status_offset : 8, + __reserved_1 : 22, + access_type : 2; + }; + }; + + u64 ctl; /* Unit Control Address */ + + union { + u64 table3; + struct { + u64 box_type : 16, + box_id : 16, + __reserved_2 : 32; + 
}; + }; +}; + +struct intel_uncore_discovery_type { + struct rb_node node; + enum uncore_access_type access_type; + u64 box_ctrl; /* Unit ctrl addr of the first box */ + u64 *box_ctrl_die; /* Unit ctrl addr of the first box of each die */ + u16 type; /* Type ID of the uncore block */ + u8 num_counters; + u8 counter_width; + u8 ctl_offset; /* Counter Control 0 offset */ + u8 ctr_offset; /* Counter 0 offset */ + u16 num_boxes; /* number of boxes for the uncore block */ + unsigned int *ids; /* Box IDs */ + unsigned int *box_offset; /* Box offset */ +}; + +bool intel_uncore_has_discovery_tables(void); +void intel_uncore_clear_discovery_tables(void); +void intel_uncore_generic_uncore_cpu_init(void); +int intel_uncore_generic_uncore_pci_init(void); +void intel_uncore_generic_uncore_mmio_init(void); + +void intel_generic_uncore_msr_init_box(struct intel_uncore_box *box); +void intel_generic_uncore_msr_disable_box(struct intel_uncore_box *box); +void intel_generic_uncore_msr_enable_box(struct intel_uncore_box *box); + +void intel_generic_uncore_mmio_init_box(struct intel_uncore_box *box); +void intel_generic_uncore_mmio_disable_box(struct intel_uncore_box *box); +void intel_generic_uncore_mmio_enable_box(struct intel_uncore_box *box); +void intel_generic_uncore_mmio_disable_event(struct intel_uncore_box *box, + struct perf_event *event); +void intel_generic_uncore_mmio_enable_event(struct intel_uncore_box *box, + struct perf_event *event); + +void intel_generic_uncore_pci_init_box(struct intel_uncore_box *box); +void intel_generic_uncore_pci_disable_box(struct intel_uncore_box *box); +void intel_generic_uncore_pci_enable_box(struct intel_uncore_box *box); +void intel_generic_uncore_pci_disable_event(struct intel_uncore_box *box, + struct perf_event *event); +u64 intel_generic_uncore_pci_read_counter(struct intel_uncore_box *box, + struct perf_event *event); + +struct intel_uncore_type ** +intel_uncore_generic_init_uncores(enum uncore_access_type type_id, int num_extra); 
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index c37cb12d0ef6..1ef4f7861e2e 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Nehalem/SandBridge/Haswell/Broadwell/Skylake uncore support */ #include "uncore.h" +#include "uncore_discovery.h" /* Uncore IMC PCI IDs */ #define PCI_DEVICE_ID_INTEL_SNB_IMC 0x0100 @@ -42,9 +43,79 @@ #define PCI_DEVICE_ID_INTEL_WHL_UQ_IMC 0x3ed0 #define PCI_DEVICE_ID_INTEL_WHL_4_UQ_IMC 0x3e34 #define PCI_DEVICE_ID_INTEL_WHL_UD_IMC 0x3e35 +#define PCI_DEVICE_ID_INTEL_CML_H1_IMC 0x9b44 +#define PCI_DEVICE_ID_INTEL_CML_H2_IMC 0x9b54 +#define PCI_DEVICE_ID_INTEL_CML_H3_IMC 0x9b64 +#define PCI_DEVICE_ID_INTEL_CML_U1_IMC 0x9b51 +#define PCI_DEVICE_ID_INTEL_CML_U2_IMC 0x9b61 +#define PCI_DEVICE_ID_INTEL_CML_U3_IMC 0x9b71 +#define PCI_DEVICE_ID_INTEL_CML_S1_IMC 0x9b33 +#define PCI_DEVICE_ID_INTEL_CML_S2_IMC 0x9b43 +#define PCI_DEVICE_ID_INTEL_CML_S3_IMC 0x9b53 +#define PCI_DEVICE_ID_INTEL_CML_S4_IMC 0x9b63 +#define PCI_DEVICE_ID_INTEL_CML_S5_IMC 0x9b73 #define PCI_DEVICE_ID_INTEL_ICL_U_IMC 0x8a02 #define PCI_DEVICE_ID_INTEL_ICL_U2_IMC 0x8a12 - +#define PCI_DEVICE_ID_INTEL_TGL_U1_IMC 0x9a02 +#define PCI_DEVICE_ID_INTEL_TGL_U2_IMC 0x9a04 +#define PCI_DEVICE_ID_INTEL_TGL_U3_IMC 0x9a12 +#define PCI_DEVICE_ID_INTEL_TGL_U4_IMC 0x9a14 +#define PCI_DEVICE_ID_INTEL_TGL_H_IMC 0x9a36 +#define PCI_DEVICE_ID_INTEL_RKL_1_IMC 0x4c43 +#define PCI_DEVICE_ID_INTEL_RKL_2_IMC 0x4c53 +#define PCI_DEVICE_ID_INTEL_ADL_1_IMC 0x4660 +#define PCI_DEVICE_ID_INTEL_ADL_2_IMC 0x4641 +#define PCI_DEVICE_ID_INTEL_ADL_3_IMC 0x4601 +#define PCI_DEVICE_ID_INTEL_ADL_4_IMC 0x4602 +#define PCI_DEVICE_ID_INTEL_ADL_5_IMC 0x4609 +#define PCI_DEVICE_ID_INTEL_ADL_6_IMC 0x460a +#define PCI_DEVICE_ID_INTEL_ADL_7_IMC 0x4621 +#define PCI_DEVICE_ID_INTEL_ADL_8_IMC 0x4623 +#define PCI_DEVICE_ID_INTEL_ADL_9_IMC 0x4629 +#define PCI_DEVICE_ID_INTEL_ADL_10_IMC 0x4637 
+#define PCI_DEVICE_ID_INTEL_ADL_11_IMC 0x463b +#define PCI_DEVICE_ID_INTEL_ADL_12_IMC 0x4648 +#define PCI_DEVICE_ID_INTEL_ADL_13_IMC 0x4649 +#define PCI_DEVICE_ID_INTEL_ADL_14_IMC 0x4650 +#define PCI_DEVICE_ID_INTEL_ADL_15_IMC 0x4668 +#define PCI_DEVICE_ID_INTEL_ADL_16_IMC 0x4670 +#define PCI_DEVICE_ID_INTEL_ADL_17_IMC 0x4614 +#define PCI_DEVICE_ID_INTEL_ADL_18_IMC 0x4617 +#define PCI_DEVICE_ID_INTEL_ADL_19_IMC 0x4618 +#define PCI_DEVICE_ID_INTEL_ADL_20_IMC 0x461B +#define PCI_DEVICE_ID_INTEL_ADL_21_IMC 0x461C +#define PCI_DEVICE_ID_INTEL_RPL_1_IMC 0xA700 +#define PCI_DEVICE_ID_INTEL_RPL_2_IMC 0xA702 +#define PCI_DEVICE_ID_INTEL_RPL_3_IMC 0xA706 +#define PCI_DEVICE_ID_INTEL_RPL_4_IMC 0xA709 +#define PCI_DEVICE_ID_INTEL_RPL_5_IMC 0xA701 +#define PCI_DEVICE_ID_INTEL_RPL_6_IMC 0xA703 +#define PCI_DEVICE_ID_INTEL_RPL_7_IMC 0xA704 +#define PCI_DEVICE_ID_INTEL_RPL_8_IMC 0xA705 +#define PCI_DEVICE_ID_INTEL_RPL_9_IMC 0xA706 +#define PCI_DEVICE_ID_INTEL_RPL_10_IMC 0xA707 +#define PCI_DEVICE_ID_INTEL_RPL_11_IMC 0xA708 +#define PCI_DEVICE_ID_INTEL_RPL_12_IMC 0xA709 +#define PCI_DEVICE_ID_INTEL_RPL_13_IMC 0xA70a +#define PCI_DEVICE_ID_INTEL_RPL_14_IMC 0xA70b +#define PCI_DEVICE_ID_INTEL_RPL_15_IMC 0xA715 +#define PCI_DEVICE_ID_INTEL_RPL_16_IMC 0xA716 +#define PCI_DEVICE_ID_INTEL_RPL_17_IMC 0xA717 +#define PCI_DEVICE_ID_INTEL_RPL_18_IMC 0xA718 +#define PCI_DEVICE_ID_INTEL_RPL_19_IMC 0xA719 +#define PCI_DEVICE_ID_INTEL_RPL_20_IMC 0xA71A +#define PCI_DEVICE_ID_INTEL_RPL_21_IMC 0xA71B +#define PCI_DEVICE_ID_INTEL_RPL_22_IMC 0xA71C +#define PCI_DEVICE_ID_INTEL_RPL_23_IMC 0xA728 +#define PCI_DEVICE_ID_INTEL_RPL_24_IMC 0xA729 +#define PCI_DEVICE_ID_INTEL_RPL_25_IMC 0xA72A + + +#define IMC_UNCORE_DEV(a) \ +{ \ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_##a##_IMC), \ + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), \ +} /* SNB event control */ #define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff @@ -110,12 +181,38 @@ #define ICL_UNC_CBO_0_PER_CTR0 0x702 #define 
ICL_UNC_CBO_MSR_OFFSET 0x8 +/* ICL ARB register */ +#define ICL_UNC_ARB_PER_CTR 0x3b1 +#define ICL_UNC_ARB_PERFEVTSEL 0x3b3 + +/* ADL uncore global control */ +#define ADL_UNC_PERF_GLOBAL_CTL 0x2ff0 +#define ADL_UNC_FIXED_CTR_CTRL 0x2fde +#define ADL_UNC_FIXED_CTR 0x2fdf + +/* ADL Cbo register */ +#define ADL_UNC_CBO_0_PER_CTR0 0x2002 +#define ADL_UNC_CBO_0_PERFEVTSEL0 0x2000 +#define ADL_UNC_CTL_THRESHOLD 0x3f000000 +#define ADL_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ + SNB_UNC_CTL_UMASK_MASK | \ + SNB_UNC_CTL_EDGE_DET | \ + SNB_UNC_CTL_INVERT | \ + ADL_UNC_CTL_THRESHOLD) + +/* ADL ARB register */ +#define ADL_UNC_ARB_PER_CTR0 0x2FD2 +#define ADL_UNC_ARB_PERFEVTSEL0 0x2FD0 +#define ADL_UNC_ARB_MSR_OFFSET 0x8 + DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(chmask, chmask, "config:8-11"); DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28"); DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(threshold, threshold, "config:24-29"); /* Sandy Bridge uncore support */ static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) @@ -297,15 +394,21 @@ void skl_uncore_cpu_init(void) snb_uncore_arb.ops = &skl_uncore_msr_ops; } +static struct intel_uncore_ops icl_uncore_msr_ops = { + .disable_event = snb_uncore_msr_disable_event, + .enable_event = snb_uncore_msr_enable_event, + .read_counter = uncore_msr_read_counter, +}; + static struct intel_uncore_type icl_uncore_cbox = { .name = "cbox", - .num_counters = 4, + .num_counters = 2, .perf_ctr_bits = 44, .perf_ctr = ICL_UNC_CBO_0_PER_CTR0, .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0, .event_mask = SNB_UNC_RAW_EVENT_MASK, .msr_offset = ICL_UNC_CBO_MSR_OFFSET, - .ops = &skl_uncore_msr_ops, + .ops = &icl_uncore_msr_ops, .format_group = &snb_uncore_format_group, }; 
@@ -334,13 +437,25 @@ static struct intel_uncore_type icl_uncore_clockbox = { .single_fixed = 1, .event_mask = SNB_UNC_CTL_EV_SEL_MASK, .format_group = &icl_uncore_clock_format_group, - .ops = &skl_uncore_msr_ops, + .ops = &icl_uncore_msr_ops, .event_descs = icl_uncore_events, }; +static struct intel_uncore_type icl_uncore_arb = { + .name = "arb", + .num_counters = 1, + .num_boxes = 1, + .perf_ctr_bits = 44, + .perf_ctr = ICL_UNC_ARB_PER_CTR, + .event_ctl = ICL_UNC_ARB_PERFEVTSEL, + .event_mask = SNB_UNC_RAW_EVENT_MASK, + .ops = &icl_uncore_msr_ops, + .format_group = &snb_uncore_format_group, +}; + static struct intel_uncore_type *icl_msr_uncores[] = { &icl_uncore_cbox, - &snb_uncore_arb, + &icl_uncore_arb, &icl_uncore_clockbox, NULL, }; @@ -358,7 +473,129 @@ void icl_uncore_cpu_init(void) { uncore_msr_uncores = icl_msr_uncores; icl_uncore_cbox.num_boxes = icl_get_cbox_num(); +} + +static struct intel_uncore_type *tgl_msr_uncores[] = { + &icl_uncore_cbox, + &snb_uncore_arb, + &icl_uncore_clockbox, + NULL, +}; + +static void rkl_uncore_msr_init_box(struct intel_uncore_box *box) +{ + if (box->pmu->pmu_idx == 0) + wrmsrl(SKL_UNC_PERF_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN); +} + +void tgl_uncore_cpu_init(void) +{ + uncore_msr_uncores = tgl_msr_uncores; + icl_uncore_cbox.num_boxes = icl_get_cbox_num(); + icl_uncore_cbox.ops = &skl_uncore_msr_ops; + icl_uncore_clockbox.ops = &skl_uncore_msr_ops; snb_uncore_arb.ops = &skl_uncore_msr_ops; + skl_uncore_msr_ops.init_box = rkl_uncore_msr_init_box; +} + +static void adl_uncore_msr_init_box(struct intel_uncore_box *box) +{ + if (box->pmu->pmu_idx == 0) + wrmsrl(ADL_UNC_PERF_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN); +} + +static void adl_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + wrmsrl(ADL_UNC_PERF_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN); +} + +static void adl_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + if (box->pmu->pmu_idx == 0) + wrmsrl(ADL_UNC_PERF_GLOBAL_CTL, 0); +} + +static void 
adl_uncore_msr_exit_box(struct intel_uncore_box *box) +{ + if (box->pmu->pmu_idx == 0) + wrmsrl(ADL_UNC_PERF_GLOBAL_CTL, 0); +} + +static struct intel_uncore_ops adl_uncore_msr_ops = { + .init_box = adl_uncore_msr_init_box, + .enable_box = adl_uncore_msr_enable_box, + .disable_box = adl_uncore_msr_disable_box, + .exit_box = adl_uncore_msr_exit_box, + .disable_event = snb_uncore_msr_disable_event, + .enable_event = snb_uncore_msr_enable_event, + .read_counter = uncore_msr_read_counter, +}; + +static struct attribute *adl_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_threshold.attr, + NULL, +}; + +static const struct attribute_group adl_uncore_format_group = { + .name = "format", + .attrs = adl_uncore_formats_attr, +}; + +static struct intel_uncore_type adl_uncore_cbox = { + .name = "cbox", + .num_counters = 2, + .perf_ctr_bits = 44, + .perf_ctr = ADL_UNC_CBO_0_PER_CTR0, + .event_ctl = ADL_UNC_CBO_0_PERFEVTSEL0, + .event_mask = ADL_UNC_RAW_EVENT_MASK, + .msr_offset = ICL_UNC_CBO_MSR_OFFSET, + .ops = &adl_uncore_msr_ops, + .format_group = &adl_uncore_format_group, +}; + +static struct intel_uncore_type adl_uncore_arb = { + .name = "arb", + .num_counters = 2, + .num_boxes = 2, + .perf_ctr_bits = 44, + .perf_ctr = ADL_UNC_ARB_PER_CTR0, + .event_ctl = ADL_UNC_ARB_PERFEVTSEL0, + .event_mask = SNB_UNC_RAW_EVENT_MASK, + .msr_offset = ADL_UNC_ARB_MSR_OFFSET, + .constraints = snb_uncore_arb_constraints, + .ops = &adl_uncore_msr_ops, + .format_group = &snb_uncore_format_group, +}; + +static struct intel_uncore_type adl_uncore_clockbox = { + .name = "clock", + .num_counters = 1, + .num_boxes = 1, + .fixed_ctr_bits = 48, + .fixed_ctr = ADL_UNC_FIXED_CTR, + .fixed_ctl = ADL_UNC_FIXED_CTR_CTRL, + .single_fixed = 1, + .event_mask = SNB_UNC_CTL_EV_SEL_MASK, + .format_group = &icl_uncore_clock_format_group, + .ops = &adl_uncore_msr_ops, + .event_descs = icl_uncore_events, +}; + +static 
struct intel_uncore_type *adl_msr_uncores[] = { + &adl_uncore_cbox, + &adl_uncore_arb, + &adl_uncore_clockbox, + NULL, +}; + +void adl_uncore_cpu_init(void) +{ + adl_uncore_cbox.num_boxes = icl_get_cbox_num(); + uncore_msr_uncores = adl_msr_uncores; } enum { @@ -374,6 +611,18 @@ static struct uncore_event_desc snb_uncore_imc_events[] = { INTEL_UNCORE_EVENT_DESC(data_writes.scale, "6.103515625e-5"), INTEL_UNCORE_EVENT_DESC(data_writes.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(gt_requests, "event=0x03"), + INTEL_UNCORE_EVENT_DESC(gt_requests.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(gt_requests.unit, "MiB"), + + INTEL_UNCORE_EVENT_DESC(ia_requests, "event=0x04"), + INTEL_UNCORE_EVENT_DESC(ia_requests.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(ia_requests.unit, "MiB"), + + INTEL_UNCORE_EVENT_DESC(io_requests, "event=0x05"), + INTEL_UNCORE_EVENT_DESC(io_requests.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(io_requests.unit, "MiB"), + { /* end: all zeroes */ }, }; @@ -389,13 +638,35 @@ static struct uncore_event_desc snb_uncore_imc_events[] = { #define SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE 0x5054 #define SNB_UNCORE_PCI_IMC_CTR_BASE SNB_UNCORE_PCI_IMC_DATA_READS_BASE +/* BW break down- legacy counters */ +#define SNB_UNCORE_PCI_IMC_GT_REQUESTS 0x3 +#define SNB_UNCORE_PCI_IMC_GT_REQUESTS_BASE 0x5040 +#define SNB_UNCORE_PCI_IMC_IA_REQUESTS 0x4 +#define SNB_UNCORE_PCI_IMC_IA_REQUESTS_BASE 0x5044 +#define SNB_UNCORE_PCI_IMC_IO_REQUESTS 0x5 +#define SNB_UNCORE_PCI_IMC_IO_REQUESTS_BASE 0x5048 + enum perf_snb_uncore_imc_freerunning_types { - SNB_PCI_UNCORE_IMC_DATA = 0, + SNB_PCI_UNCORE_IMC_DATA_READS = 0, + SNB_PCI_UNCORE_IMC_DATA_WRITES, + SNB_PCI_UNCORE_IMC_GT_REQUESTS, + SNB_PCI_UNCORE_IMC_IA_REQUESTS, + SNB_PCI_UNCORE_IMC_IO_REQUESTS, + SNB_PCI_UNCORE_IMC_FREERUNNING_TYPE_MAX, }; static struct freerunning_counters snb_uncore_imc_freerunning[] = { - [SNB_PCI_UNCORE_IMC_DATA] = { SNB_UNCORE_PCI_IMC_DATA_READS_BASE, 0x4, 0x0, 2, 32 }, + 
[SNB_PCI_UNCORE_IMC_DATA_READS] = { SNB_UNCORE_PCI_IMC_DATA_READS_BASE, + 0x0, 0x0, 1, 32 }, + [SNB_PCI_UNCORE_IMC_DATA_WRITES] = { SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE, + 0x0, 0x0, 1, 32 }, + [SNB_PCI_UNCORE_IMC_GT_REQUESTS] = { SNB_UNCORE_PCI_IMC_GT_REQUESTS_BASE, + 0x0, 0x0, 1, 32 }, + [SNB_PCI_UNCORE_IMC_IA_REQUESTS] = { SNB_UNCORE_PCI_IMC_IA_REQUESTS_BASE, + 0x0, 0x0, 1, 32 }, + [SNB_PCI_UNCORE_IMC_IO_REQUESTS] = { SNB_UNCORE_PCI_IMC_IO_REQUESTS_BASE, + 0x0, 0x0, 1, 32 }, }; static struct attribute *snb_uncore_imc_formats_attr[] = { @@ -410,6 +681,7 @@ static const struct attribute_group snb_uncore_imc_format_group = { static void snb_uncore_imc_init_box(struct intel_uncore_box *box) { + struct intel_uncore_type *type = box->pmu->type; struct pci_dev *pdev = box->pci_dev; int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET; resource_size_t addr; @@ -425,7 +697,10 @@ static void snb_uncore_imc_init_box(struct intel_uncore_box *box) addr &= ~(PAGE_SIZE - 1); - box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE); + box->io_addr = ioremap(addr, type->mmio_map_size); + if (!box->io_addr) + pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name); + box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL; } @@ -505,6 +780,18 @@ static int snb_uncore_imc_event_init(struct perf_event *event) base = SNB_UNCORE_PCI_IMC_DATA_WRITES_BASE; idx = UNCORE_PMC_IDX_FREERUNNING; break; + case SNB_UNCORE_PCI_IMC_GT_REQUESTS: + base = SNB_UNCORE_PCI_IMC_GT_REQUESTS_BASE; + idx = UNCORE_PMC_IDX_FREERUNNING; + break; + case SNB_UNCORE_PCI_IMC_IA_REQUESTS: + base = SNB_UNCORE_PCI_IMC_IA_REQUESTS_BASE; + idx = UNCORE_PMC_IDX_FREERUNNING; + break; + case SNB_UNCORE_PCI_IMC_IO_REQUESTS: + base = SNB_UNCORE_PCI_IMC_IO_REQUESTS_BASE; + idx = UNCORE_PMC_IDX_FREERUNNING; + break; default: return -EINVAL; } @@ -546,7 +833,7 @@ int snb_pci2phy_map_init(int devid) pci_dev_put(dev); return -ENOMEM; } - map->pbus_to_physid[bus] = 0; + map->pbus_to_dieid[bus] = 0; 
raw_spin_unlock(&pci2phy_map_lock); pci_dev_put(dev); @@ -554,6 +841,22 @@ int snb_pci2phy_map_init(int devid) return 0; } +static u64 snb_uncore_imc_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + /* + * SNB IMC counters are 32-bit and are laid out back to back + * in MMIO space. Therefore we must use a 32-bit accessor function + * using readq() from uncore_mmio_read_counter() causes problems + * because it is reading 64-bit at a time. This is okay for the + * uncore_perf_event_update() function because it drops the upper + * 32-bits but not okay for plain uncore_read_counter() as invoked + * in uncore_pmu_event_start(). + */ + return (u64)readl(box->io_addr + hwc->event_base); +} + static struct pmu snb_uncore_imc_pmu = { .task_ctx_nr = perf_invalid_context, .event_init = snb_uncore_imc_event_init, @@ -573,14 +876,15 @@ static struct intel_uncore_ops snb_uncore_imc_ops = { .disable_event = snb_uncore_imc_disable_event, .enable_event = snb_uncore_imc_enable_event, .hw_config = snb_uncore_imc_hw_config, - .read_counter = uncore_mmio_read_counter, + .read_counter = snb_uncore_imc_read_counter, }; static struct intel_uncore_type snb_uncore_imc = { .name = "imc", - .num_counters = 2, + .num_counters = 5, .num_boxes = 1, .num_freerunning_types = SNB_PCI_UNCORE_IMC_FREERUNNING_TYPE_MAX, + .mmio_map_size = SNB_UNCORE_PCI_IMC_MAP_SIZE, .freerunning = snb_uncore_imc_freerunning, .event_descs = snb_uncore_imc_events, .format_group = &snb_uncore_imc_format_group, @@ -594,190 +898,80 @@ static struct intel_uncore_type *snb_pci_uncores[] = { }; static const struct pci_device_id snb_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SNB_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, + IMC_UNCORE_DEV(SNB), { /* end: all zeroes */ }, }; static const struct pci_device_id ivb_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 
PCI_DEVICE_ID_INTEL_IVB_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IVB_E3_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, + IMC_UNCORE_DEV(IVB), + IMC_UNCORE_DEV(IVB_E3), { /* end: all zeroes */ }, }; static const struct pci_device_id hsw_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_U_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, + IMC_UNCORE_DEV(HSW), + IMC_UNCORE_DEV(HSW_U), { /* end: all zeroes */ }, }; static const struct pci_device_id bdw_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BDW_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, + IMC_UNCORE_DEV(BDW), { /* end: all zeroes */ }, }; static const struct pci_device_id skl_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_Y_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_U_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_HD_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_HQ_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_SD_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_SQ_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 
PCI_DEVICE_ID_INTEL_SKL_E3_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_Y_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_U_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_UQ_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_SD_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_SQ_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_HQ_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_WQ_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_2U_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4U_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4H_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6H_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_2S_D_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4S_D_IMC), - .driver_data = 
UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6S_D_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_8S_D_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4S_W_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6S_W_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_8S_W_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4S_S_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6S_S_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_8S_S_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_AML_YD_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_AML_YQ_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_WHL_UQ_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_WHL_4_UQ_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_WHL_UD_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, + 
IMC_UNCORE_DEV(SKL_Y), + IMC_UNCORE_DEV(SKL_U), + IMC_UNCORE_DEV(SKL_HD), + IMC_UNCORE_DEV(SKL_HQ), + IMC_UNCORE_DEV(SKL_SD), + IMC_UNCORE_DEV(SKL_SQ), + IMC_UNCORE_DEV(SKL_E3), + IMC_UNCORE_DEV(KBL_Y), + IMC_UNCORE_DEV(KBL_U), + IMC_UNCORE_DEV(KBL_UQ), + IMC_UNCORE_DEV(KBL_SD), + IMC_UNCORE_DEV(KBL_SQ), + IMC_UNCORE_DEV(KBL_HQ), + IMC_UNCORE_DEV(KBL_WQ), + IMC_UNCORE_DEV(CFL_2U), + IMC_UNCORE_DEV(CFL_4U), + IMC_UNCORE_DEV(CFL_4H), + IMC_UNCORE_DEV(CFL_6H), + IMC_UNCORE_DEV(CFL_2S_D), + IMC_UNCORE_DEV(CFL_4S_D), + IMC_UNCORE_DEV(CFL_6S_D), + IMC_UNCORE_DEV(CFL_8S_D), + IMC_UNCORE_DEV(CFL_4S_W), + IMC_UNCORE_DEV(CFL_6S_W), + IMC_UNCORE_DEV(CFL_8S_W), + IMC_UNCORE_DEV(CFL_4S_S), + IMC_UNCORE_DEV(CFL_6S_S), + IMC_UNCORE_DEV(CFL_8S_S), + IMC_UNCORE_DEV(AML_YD), + IMC_UNCORE_DEV(AML_YQ), + IMC_UNCORE_DEV(WHL_UQ), + IMC_UNCORE_DEV(WHL_4_UQ), + IMC_UNCORE_DEV(WHL_UD), + IMC_UNCORE_DEV(CML_H1), + IMC_UNCORE_DEV(CML_H2), + IMC_UNCORE_DEV(CML_H3), + IMC_UNCORE_DEV(CML_U1), + IMC_UNCORE_DEV(CML_U2), + IMC_UNCORE_DEV(CML_U3), + IMC_UNCORE_DEV(CML_S1), + IMC_UNCORE_DEV(CML_S2), + IMC_UNCORE_DEV(CML_S3), + IMC_UNCORE_DEV(CML_S4), + IMC_UNCORE_DEV(CML_S5), { /* end: all zeroes */ }, }; static const struct pci_device_id icl_uncore_pci_ids[] = { - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICL_U_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, - { /* IMC */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICL_U2_IMC), - .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), - }, + IMC_UNCORE_DEV(ICL_U), + IMC_UNCORE_DEV(ICL_U2), + IMC_UNCORE_DEV(RKL_1), + IMC_UNCORE_DEV(RKL_2), { /* end: all zeroes */ }, }; @@ -858,8 +1052,21 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = { IMC_DEV(WHL_UQ_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U Mobile Quad Core */ IMC_DEV(WHL_4_UQ_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U Mobile Quad Core */ IMC_DEV(WHL_UD_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U Mobile 
Dual Core */ + IMC_DEV(CML_H1_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_H2_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_H3_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_U1_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_U2_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_U3_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_S1_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_S2_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_S3_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_S4_IMC, &skl_uncore_pci_driver), + IMC_DEV(CML_S5_IMC, &skl_uncore_pci_driver), IMC_DEV(ICL_U_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile */ IMC_DEV(ICL_U2_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile */ + IMC_DEV(RKL_1_IMC, &icl_uncore_pci_driver), + IMC_DEV(RKL_2_IMC, &icl_uncore_pci_driver), { /* end marker */ } }; @@ -1002,3 +1209,333 @@ void nhm_uncore_cpu_init(void) } /* end of Nehalem uncore support */ + +/* Tiger Lake MMIO uncore support */ + +static const struct pci_device_id tgl_uncore_pci_ids[] = { + IMC_UNCORE_DEV(TGL_U1), + IMC_UNCORE_DEV(TGL_U2), + IMC_UNCORE_DEV(TGL_U3), + IMC_UNCORE_DEV(TGL_U4), + IMC_UNCORE_DEV(TGL_H), + IMC_UNCORE_DEV(ADL_1), + IMC_UNCORE_DEV(ADL_2), + IMC_UNCORE_DEV(ADL_3), + IMC_UNCORE_DEV(ADL_4), + IMC_UNCORE_DEV(ADL_5), + IMC_UNCORE_DEV(ADL_6), + IMC_UNCORE_DEV(ADL_7), + IMC_UNCORE_DEV(ADL_8), + IMC_UNCORE_DEV(ADL_9), + IMC_UNCORE_DEV(ADL_10), + IMC_UNCORE_DEV(ADL_11), + IMC_UNCORE_DEV(ADL_12), + IMC_UNCORE_DEV(ADL_13), + IMC_UNCORE_DEV(ADL_14), + IMC_UNCORE_DEV(ADL_15), + IMC_UNCORE_DEV(ADL_16), + IMC_UNCORE_DEV(ADL_17), + IMC_UNCORE_DEV(ADL_18), + IMC_UNCORE_DEV(ADL_19), + IMC_UNCORE_DEV(ADL_20), + IMC_UNCORE_DEV(ADL_21), + IMC_UNCORE_DEV(RPL_1), + IMC_UNCORE_DEV(RPL_2), + IMC_UNCORE_DEV(RPL_3), + IMC_UNCORE_DEV(RPL_4), + IMC_UNCORE_DEV(RPL_5), + IMC_UNCORE_DEV(RPL_6), + IMC_UNCORE_DEV(RPL_7), + IMC_UNCORE_DEV(RPL_8), + IMC_UNCORE_DEV(RPL_9), + IMC_UNCORE_DEV(RPL_10), + IMC_UNCORE_DEV(RPL_11), + IMC_UNCORE_DEV(RPL_12), + IMC_UNCORE_DEV(RPL_13), + 
IMC_UNCORE_DEV(RPL_14), + IMC_UNCORE_DEV(RPL_15), + IMC_UNCORE_DEV(RPL_16), + IMC_UNCORE_DEV(RPL_17), + IMC_UNCORE_DEV(RPL_18), + IMC_UNCORE_DEV(RPL_19), + IMC_UNCORE_DEV(RPL_20), + IMC_UNCORE_DEV(RPL_21), + IMC_UNCORE_DEV(RPL_22), + IMC_UNCORE_DEV(RPL_23), + IMC_UNCORE_DEV(RPL_24), + IMC_UNCORE_DEV(RPL_25), + { /* end: all zeroes */ } +}; + +enum perf_tgl_uncore_imc_freerunning_types { + TGL_MMIO_UNCORE_IMC_DATA_TOTAL, + TGL_MMIO_UNCORE_IMC_DATA_READ, + TGL_MMIO_UNCORE_IMC_DATA_WRITE, + TGL_MMIO_UNCORE_IMC_FREERUNNING_TYPE_MAX +}; + +static struct freerunning_counters tgl_l_uncore_imc_freerunning[] = { + [TGL_MMIO_UNCORE_IMC_DATA_TOTAL] = { 0x5040, 0x0, 0x0, 1, 64 }, + [TGL_MMIO_UNCORE_IMC_DATA_READ] = { 0x5058, 0x0, 0x0, 1, 64 }, + [TGL_MMIO_UNCORE_IMC_DATA_WRITE] = { 0x50A0, 0x0, 0x0, 1, 64 }, +}; + +static struct freerunning_counters tgl_uncore_imc_freerunning[] = { + [TGL_MMIO_UNCORE_IMC_DATA_TOTAL] = { 0xd840, 0x0, 0x0, 1, 64 }, + [TGL_MMIO_UNCORE_IMC_DATA_READ] = { 0xd858, 0x0, 0x0, 1, 64 }, + [TGL_MMIO_UNCORE_IMC_DATA_WRITE] = { 0xd8A0, 0x0, 0x0, 1, 64 }, +}; + +static struct uncore_event_desc tgl_uncore_imc_events[] = { + INTEL_UNCORE_EVENT_DESC(data_total, "event=0xff,umask=0x10"), + INTEL_UNCORE_EVENT_DESC(data_total.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(data_total.unit, "MiB"), + + INTEL_UNCORE_EVENT_DESC(data_read, "event=0xff,umask=0x20"), + INTEL_UNCORE_EVENT_DESC(data_read.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(data_read.unit, "MiB"), + + INTEL_UNCORE_EVENT_DESC(data_write, "event=0xff,umask=0x30"), + INTEL_UNCORE_EVENT_DESC(data_write.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(data_write.unit, "MiB"), + + { /* end: all zeroes */ } +}; + +static struct pci_dev *tgl_uncore_get_mc_dev(void) +{ + const struct pci_device_id *ids = tgl_uncore_pci_ids; + struct pci_dev *mc_dev = NULL; + + while (ids && ids->vendor) { + mc_dev = pci_get_device(PCI_VENDOR_ID_INTEL, ids->device, NULL); + if (mc_dev) + return mc_dev; + 
ids++; + } + + return mc_dev; +} + +#define TGL_UNCORE_MMIO_IMC_MEM_OFFSET 0x10000 +#define TGL_UNCORE_PCI_IMC_MAP_SIZE 0xe000 + +static void __uncore_imc_init_box(struct intel_uncore_box *box, + unsigned int base_offset) +{ + struct pci_dev *pdev = tgl_uncore_get_mc_dev(); + struct intel_uncore_pmu *pmu = box->pmu; + struct intel_uncore_type *type = pmu->type; + resource_size_t addr; + u32 mch_bar; + + if (!pdev) { + pr_warn("perf uncore: Cannot find matched IMC device.\n"); + return; + } + + pci_read_config_dword(pdev, SNB_UNCORE_PCI_IMC_BAR_OFFSET, &mch_bar); + /* MCHBAR is disabled */ + if (!(mch_bar & BIT(0))) { + pr_warn("perf uncore: MCHBAR is disabled. Failed to map IMC free-running counters.\n"); + return; + } + mch_bar &= ~BIT(0); + addr = (resource_size_t)(mch_bar + TGL_UNCORE_MMIO_IMC_MEM_OFFSET * pmu->pmu_idx); + +#ifdef CONFIG_PHYS_ADDR_T_64BIT + pci_read_config_dword(pdev, SNB_UNCORE_PCI_IMC_BAR_OFFSET + 4, &mch_bar); + addr |= ((resource_size_t)mch_bar << 32); +#endif + + addr += base_offset; + box->io_addr = ioremap(addr, type->mmio_map_size); + if (!box->io_addr) + pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name); +} + +static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box) +{ + __uncore_imc_init_box(box, 0); +} + +static struct intel_uncore_ops tgl_uncore_imc_freerunning_ops = { + .init_box = tgl_uncore_imc_freerunning_init_box, + .exit_box = uncore_mmio_exit_box, + .read_counter = uncore_mmio_read_counter, + .hw_config = uncore_freerunning_hw_config, +}; + +static struct attribute *tgl_uncore_imc_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + NULL +}; + +static const struct attribute_group tgl_uncore_imc_format_group = { + .name = "format", + .attrs = tgl_uncore_imc_formats_attr, +}; + +static struct intel_uncore_type tgl_uncore_imc_free_running = { + .name = "imc_free_running", + .num_counters = 3, + .num_boxes = 2, + .num_freerunning_types = 
TGL_MMIO_UNCORE_IMC_FREERUNNING_TYPE_MAX, + .mmio_map_size = TGL_UNCORE_PCI_IMC_MAP_SIZE, + .freerunning = tgl_uncore_imc_freerunning, + .ops = &tgl_uncore_imc_freerunning_ops, + .event_descs = tgl_uncore_imc_events, + .format_group = &tgl_uncore_imc_format_group, +}; + +static struct intel_uncore_type *tgl_mmio_uncores[] = { + &tgl_uncore_imc_free_running, + NULL +}; + +void tgl_l_uncore_mmio_init(void) +{ + tgl_uncore_imc_free_running.freerunning = tgl_l_uncore_imc_freerunning; + uncore_mmio_uncores = tgl_mmio_uncores; +} + +void tgl_uncore_mmio_init(void) +{ + uncore_mmio_uncores = tgl_mmio_uncores; +} + +/* end of Tiger Lake MMIO uncore support */ + +/* Alder Lake MMIO uncore support */ +#define ADL_UNCORE_IMC_BASE 0xd900 +#define ADL_UNCORE_IMC_MAP_SIZE 0x200 +#define ADL_UNCORE_IMC_CTR 0xe8 +#define ADL_UNCORE_IMC_CTRL 0xd0 +#define ADL_UNCORE_IMC_GLOBAL_CTL 0xc0 +#define ADL_UNCORE_IMC_BOX_CTL 0xc4 +#define ADL_UNCORE_IMC_FREERUNNING_BASE 0xd800 +#define ADL_UNCORE_IMC_FREERUNNING_MAP_SIZE 0x100 + +#define ADL_UNCORE_IMC_CTL_FRZ (1 << 0) +#define ADL_UNCORE_IMC_CTL_RST_CTRL (1 << 1) +#define ADL_UNCORE_IMC_CTL_RST_CTRS (1 << 2) +#define ADL_UNCORE_IMC_CTL_INT (ADL_UNCORE_IMC_CTL_RST_CTRL | \ + ADL_UNCORE_IMC_CTL_RST_CTRS) + +static void adl_uncore_imc_init_box(struct intel_uncore_box *box) +{ + __uncore_imc_init_box(box, ADL_UNCORE_IMC_BASE); + + /* The global control in MC1 can control both MCs. 
*/ + if (box->io_addr && (box->pmu->pmu_idx == 1)) + writel(ADL_UNCORE_IMC_CTL_INT, box->io_addr + ADL_UNCORE_IMC_GLOBAL_CTL); +} + +static void adl_uncore_mmio_disable_box(struct intel_uncore_box *box) +{ + if (!box->io_addr) + return; + + writel(ADL_UNCORE_IMC_CTL_FRZ, box->io_addr + uncore_mmio_box_ctl(box)); +} + +static void adl_uncore_mmio_enable_box(struct intel_uncore_box *box) +{ + if (!box->io_addr) + return; + + writel(0, box->io_addr + uncore_mmio_box_ctl(box)); +} + +static struct intel_uncore_ops adl_uncore_mmio_ops = { + .init_box = adl_uncore_imc_init_box, + .exit_box = uncore_mmio_exit_box, + .disable_box = adl_uncore_mmio_disable_box, + .enable_box = adl_uncore_mmio_enable_box, + .disable_event = intel_generic_uncore_mmio_disable_event, + .enable_event = intel_generic_uncore_mmio_enable_event, + .read_counter = uncore_mmio_read_counter, +}; + +#define ADL_UNC_CTL_CHMASK_MASK 0x00000f00 +#define ADL_UNC_IMC_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ + ADL_UNC_CTL_CHMASK_MASK | \ + SNB_UNC_CTL_EDGE_DET) + +static struct attribute *adl_uncore_imc_formats_attr[] = { + &format_attr_event.attr, + &format_attr_chmask.attr, + &format_attr_edge.attr, + NULL, +}; + +static const struct attribute_group adl_uncore_imc_format_group = { + .name = "format", + .attrs = adl_uncore_imc_formats_attr, +}; + +static struct intel_uncore_type adl_uncore_imc = { + .name = "imc", + .num_counters = 5, + .num_boxes = 2, + .perf_ctr_bits = 64, + .perf_ctr = ADL_UNCORE_IMC_CTR, + .event_ctl = ADL_UNCORE_IMC_CTRL, + .event_mask = ADL_UNC_IMC_EVENT_MASK, + .box_ctl = ADL_UNCORE_IMC_BOX_CTL, + .mmio_offset = 0, + .mmio_map_size = ADL_UNCORE_IMC_MAP_SIZE, + .ops = &adl_uncore_mmio_ops, + .format_group = &adl_uncore_imc_format_group, +}; + +enum perf_adl_uncore_imc_freerunning_types { + ADL_MMIO_UNCORE_IMC_DATA_TOTAL, + ADL_MMIO_UNCORE_IMC_DATA_READ, + ADL_MMIO_UNCORE_IMC_DATA_WRITE, + ADL_MMIO_UNCORE_IMC_FREERUNNING_TYPE_MAX +}; + +static struct freerunning_counters 
adl_uncore_imc_freerunning[] = { + [ADL_MMIO_UNCORE_IMC_DATA_TOTAL] = { 0x40, 0x0, 0x0, 1, 64 }, + [ADL_MMIO_UNCORE_IMC_DATA_READ] = { 0x58, 0x0, 0x0, 1, 64 }, + [ADL_MMIO_UNCORE_IMC_DATA_WRITE] = { 0xA0, 0x0, 0x0, 1, 64 }, +}; + +static void adl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box) +{ + __uncore_imc_init_box(box, ADL_UNCORE_IMC_FREERUNNING_BASE); +} + +static struct intel_uncore_ops adl_uncore_imc_freerunning_ops = { + .init_box = adl_uncore_imc_freerunning_init_box, + .exit_box = uncore_mmio_exit_box, + .read_counter = uncore_mmio_read_counter, + .hw_config = uncore_freerunning_hw_config, +}; + +static struct intel_uncore_type adl_uncore_imc_free_running = { + .name = "imc_free_running", + .num_counters = 3, + .num_boxes = 2, + .num_freerunning_types = ADL_MMIO_UNCORE_IMC_FREERUNNING_TYPE_MAX, + .mmio_map_size = ADL_UNCORE_IMC_FREERUNNING_MAP_SIZE, + .freerunning = adl_uncore_imc_freerunning, + .ops = &adl_uncore_imc_freerunning_ops, + .event_descs = tgl_uncore_imc_events, + .format_group = &tgl_uncore_imc_format_group, +}; + +static struct intel_uncore_type *adl_mmio_uncores[] = { + &adl_uncore_imc, + &adl_uncore_imc_free_running, + NULL +}; + +void adl_uncore_mmio_init(void) +{ + uncore_mmio_uncores = adl_mmio_uncores; +} + +/* end of Alder Lake MMIO uncore support */ diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index ad20220af303..ed869443efb2 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* SandyBridge-EP/IvyTown uncore support */ #include "uncore.h" +#include "uncore_discovery.h" /* SNB-EP pci bus to socket mapping */ #define SNBEP_CPUNODEID 0x40 @@ -273,6 +274,30 @@ #define SKX_CPUNODEID 0xc0 #define SKX_GIDNIDMAP 0xd4 +/* + * The CPU_BUS_NUMBER MSR returns the values of the respective CPUBUSNO CSR + * that BIOS programmed. MSR has package scope. 
+ * | Bit | Default | Description + * | [63] | 00h | VALID - When set, indicates the CPU bus + * numbers have been initialized. (RO) + * |[62:48]| --- | Reserved + * |[47:40]| 00h | BUS_NUM_5 - Return the bus number BIOS assigned + * CPUBUSNO(5). (RO) + * |[39:32]| 00h | BUS_NUM_4 - Return the bus number BIOS assigned + * CPUBUSNO(4). (RO) + * |[31:24]| 00h | BUS_NUM_3 - Return the bus number BIOS assigned + * CPUBUSNO(3). (RO) + * |[23:16]| 00h | BUS_NUM_2 - Return the bus number BIOS assigned + * CPUBUSNO(2). (RO) + * |[15:8] | 00h | BUS_NUM_1 - Return the bus number BIOS assigned + * CPUBUSNO(1). (RO) + * | [7:0] | 00h | BUS_NUM_0 - Return the bus number BIOS assigned + * CPUBUSNO(0). (RO) + */ +#define SKX_MSR_CPU_BUS_NUMBER 0x300 +#define SKX_MSR_CPU_BUS_VALID_BIT (1ULL << 63) +#define BUS_NUM_STRIDE 8 + /* SKX CHA */ #define SKX_CHA_MSR_PMON_BOX_FILTER_TID (0x1ffULL << 0) #define SKX_CHA_MSR_PMON_BOX_FILTER_LINK (0xfULL << 9) @@ -324,6 +349,13 @@ #define SKX_M2M_PCI_PMON_CTR0 0x200 #define SKX_M2M_PCI_PMON_BOX_CTL 0x258 +/* Memory Map registers device ID */ +#define SNR_ICX_MESH2IIO_MMAP_DID 0x9a2 +#define SNR_ICX_SAD_CONTROL_CFG 0x3f4 + +/* Getting I/O stack id in SAD_COTROL_CFG notation */ +#define SAD_CONTROL_STACK_ID(data) (((data) >> 4) & 0x7) + /* SNR Ubox */ #define SNR_U_MSR_PMON_CTR0 0x1f98 #define SNR_U_MSR_PMON_CTL0 0x1f91 @@ -369,6 +401,11 @@ #define SNR_M2M_PCI_PMON_BOX_CTL 0x438 #define SNR_M2M_PCI_PMON_UMASK_EXT 0xff +/* SNR PCIE3 */ +#define SNR_PCIE3_PCI_PMON_CTL0 0x508 +#define SNR_PCIE3_PCI_PMON_CTR0 0x4e8 +#define SNR_PCIE3_PCI_PMON_BOX_CTL 0x4e0 + /* SNR IMC */ #define SNR_IMC_MMIO_PMON_FIXED_CTL 0x54 #define SNR_IMC_MMIO_PMON_FIXED_CTR 0x38 @@ -382,6 +419,53 @@ #define SNR_IMC_MMIO_MEM0_OFFSET 0xd8 #define SNR_IMC_MMIO_MEM0_MASK 0x7FF +/* ICX CHA */ +#define ICX_C34_MSR_PMON_CTR0 0xb68 +#define ICX_C34_MSR_PMON_CTL0 0xb61 +#define ICX_C34_MSR_PMON_BOX_CTL 0xb60 +#define ICX_C34_MSR_PMON_BOX_FILTER0 0xb65 + +/* ICX IIO */ +#define 
ICX_IIO_MSR_PMON_CTL0 0xa58 +#define ICX_IIO_MSR_PMON_CTR0 0xa51 +#define ICX_IIO_MSR_PMON_BOX_CTL 0xa50 + +/* ICX IRP */ +#define ICX_IRP0_MSR_PMON_CTL0 0xa4d +#define ICX_IRP0_MSR_PMON_CTR0 0xa4b +#define ICX_IRP0_MSR_PMON_BOX_CTL 0xa4a + +/* ICX M2PCIE */ +#define ICX_M2PCIE_MSR_PMON_CTL0 0xa46 +#define ICX_M2PCIE_MSR_PMON_CTR0 0xa41 +#define ICX_M2PCIE_MSR_PMON_BOX_CTL 0xa40 + +/* ICX UPI */ +#define ICX_UPI_PCI_PMON_CTL0 0x350 +#define ICX_UPI_PCI_PMON_CTR0 0x320 +#define ICX_UPI_PCI_PMON_BOX_CTL 0x318 +#define ICX_UPI_CTL_UMASK_EXT 0xffffff + +/* ICX M3UPI*/ +#define ICX_M3UPI_PCI_PMON_CTL0 0xd8 +#define ICX_M3UPI_PCI_PMON_CTR0 0xa8 +#define ICX_M3UPI_PCI_PMON_BOX_CTL 0xa0 + +/* ICX IMC */ +#define ICX_NUMBER_IMC_CHN 3 +#define ICX_IMC_MEM_STRIDE 0x4 + +/* SPR */ +#define SPR_RAW_EVENT_MASK_EXT 0xffffff + +/* SPR CHA */ +#define SPR_CHA_PMON_CTL_TID_EN (1 << 16) +#define SPR_CHA_PMON_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \ + SPR_CHA_PMON_CTL_TID_EN) +#define SPR_CHA_PMON_BOX_FILTER_TID 0x3ff + +#define SPR_C0_MSR_PMON_BOX_FILTER0 0x200e + DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6"); DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21"); @@ -390,9 +474,11 @@ DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); DEFINE_UNCORE_FORMAT_ATTR(umask_ext, umask, "config:8-15,32-43,45-55"); DEFINE_UNCORE_FORMAT_ATTR(umask_ext2, umask, "config:8-15,32-57"); DEFINE_UNCORE_FORMAT_ATTR(umask_ext3, umask, "config:8-15,32-39"); +DEFINE_UNCORE_FORMAT_ATTR(umask_ext4, umask, "config:8-15,32-55"); DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16"); DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19"); +DEFINE_UNCORE_FORMAT_ATTR(tid_en2, tid_en, "config:16"); DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); DEFINE_UNCORE_FORMAT_ATTR(thresh9, thresh, "config:24-35"); DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31"); @@ -1093,7 +1179,6 @@ enum 
{ SNBEP_PCI_QPI_PORT0_FILTER, SNBEP_PCI_QPI_PORT1_FILTER, BDX_PCI_QPI_PORT2_FILTER, - HSWEP_PCI_PCU_3, }; static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event) @@ -1293,7 +1378,7 @@ static struct pci_driver snbep_uncore_pci_driver = { static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool reverse) { struct pci_dev *ubox_dev = NULL; - int i, bus, nodeid, segment; + int i, bus, nodeid, segment, die_id; struct pci2phy_map *map; int err = 0; u32 config = 0; @@ -1304,36 +1389,79 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool if (!ubox_dev) break; bus = ubox_dev->bus->number; - /* get the Node ID of the local register */ - err = pci_read_config_dword(ubox_dev, nodeid_loc, &config); - if (err) - break; - nodeid = config & NODE_ID_MASK; - /* get the Node ID mapping */ - err = pci_read_config_dword(ubox_dev, idmap_loc, &config); - if (err) - break; + /* + * The nodeid and idmap registers only contain enough + * information to handle 8 nodes. On systems with more + * than 8 nodes, we need to rely on NUMA information, + * filled in from BIOS supplied information, to determine + * the topology. + */ + if (nr_node_ids <= 8) { + /* get the Node ID of the local register */ + err = pci_read_config_dword(ubox_dev, nodeid_loc, &config); + if (err) + break; + nodeid = config & NODE_ID_MASK; + /* get the Node ID mapping */ + err = pci_read_config_dword(ubox_dev, idmap_loc, &config); + if (err) + break; - segment = pci_domain_nr(ubox_dev->bus); - raw_spin_lock(&pci2phy_map_lock); - map = __find_pci2phy_map(segment); - if (!map) { + segment = pci_domain_nr(ubox_dev->bus); + raw_spin_lock(&pci2phy_map_lock); + map = __find_pci2phy_map(segment); + if (!map) { + raw_spin_unlock(&pci2phy_map_lock); + err = -ENOMEM; + break; + } + + /* + * every three bits in the Node ID mapping register maps + * to a particular node. 
+ */ + for (i = 0; i < 8; i++) { + if (nodeid == ((config >> (3 * i)) & 0x7)) { + if (topology_max_die_per_package() > 1) + die_id = i; + else + die_id = topology_phys_to_logical_pkg(i); + if (die_id < 0) + die_id = -ENODEV; + map->pbus_to_dieid[bus] = die_id; + break; + } + } raw_spin_unlock(&pci2phy_map_lock); - err = -ENOMEM; - break; - } + } else { + int node = pcibus_to_node(ubox_dev->bus); + int cpu; + + segment = pci_domain_nr(ubox_dev->bus); + raw_spin_lock(&pci2phy_map_lock); + map = __find_pci2phy_map(segment); + if (!map) { + raw_spin_unlock(&pci2phy_map_lock); + err = -ENOMEM; + break; + } - /* - * every three bits in the Node ID mapping register maps - * to a particular node. - */ - for (i = 0; i < 8; i++) { - if (nodeid == ((config >> (3 * i)) & 0x7)) { - map->pbus_to_physid[bus] = i; + die_id = -1; + for_each_cpu(cpu, cpumask_of_pcibus(ubox_dev->bus)) { + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->initialized && cpu_to_node(cpu) == node) { + map->pbus_to_dieid[bus] = die_id = c->logical_die_id; + break; + } + } + raw_spin_unlock(&pci2phy_map_lock); + + if (WARN_ON_ONCE(die_id == -1)) { + err = -EINVAL; break; } } - raw_spin_unlock(&pci2phy_map_lock); } if (!err) { @@ -1346,17 +1474,17 @@ static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool i = -1; if (reverse) { for (bus = 255; bus >= 0; bus--) { - if (map->pbus_to_physid[bus] >= 0) - i = map->pbus_to_physid[bus]; + if (map->pbus_to_dieid[bus] != -1) + i = map->pbus_to_dieid[bus]; else - map->pbus_to_physid[bus] = i; + map->pbus_to_dieid[bus] = i; } } else { for (bus = 0; bus <= 255; bus++) { - if (map->pbus_to_physid[bus] >= 0) - i = map->pbus_to_physid[bus]; + if (map->pbus_to_dieid[bus] != -1) + i = map->pbus_to_dieid[bus]; else - map->pbus_to_physid[bus] = i; + map->pbus_to_dieid[bus] = i; } } } @@ -2750,22 +2878,33 @@ static struct intel_uncore_type *hswep_msr_uncores[] = { NULL, }; -void hswep_uncore_cpu_init(void) +#define HSWEP_PCU_DID 0x2fc0 +#define 
HSWEP_PCU_CAPID4_OFFET 0x94 +#define hswep_get_chop(_cap) (((_cap) >> 6) & 0x3) + +static bool hswep_has_limit_sbox(unsigned int device) { - int pkg = boot_cpu_data.logical_proc_id; + struct pci_dev *dev = pci_get_device(PCI_VENDOR_ID_INTEL, device, NULL); + u32 capid4; + + if (!dev) + return false; + + pci_read_config_dword(dev, HSWEP_PCU_CAPID4_OFFET, &capid4); + if (!hswep_get_chop(capid4)) + return true; + + return false; +} +void hswep_uncore_cpu_init(void) +{ if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; /* Detect 6-8 core systems with only two SBOXes */ - if (uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3]) { - u32 capid4; - - pci_read_config_dword(uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3], - 0x94, &capid4); - if (((capid4 >> 6) & 0x3) == 0) - hswep_uncore_sbox.num_boxes = 2; - } + if (hswep_has_limit_sbox(HSWEP_PCU_DID)) + hswep_uncore_sbox.num_boxes = 2; uncore_msr_uncores = hswep_msr_uncores; } @@ -3028,11 +3167,6 @@ static const struct pci_device_id hswep_uncore_pci_ids[] = { .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, SNBEP_PCI_QPI_PORT1_FILTER), }, - { /* PCU.3 (for Capability registers) */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fc0), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, - HSWEP_PCI_PCU_3), - }, { /* end: all zeroes */ } }; @@ -3124,27 +3258,18 @@ static struct event_constraint bdx_uncore_pcu_constraints[] = { EVENT_CONSTRAINT_END }; +#define BDX_PCU_DID 0x6fc0 + void bdx_uncore_cpu_init(void) { - int pkg = topology_phys_to_logical_pkg(boot_cpu_data.phys_proc_id); - if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; uncore_msr_uncores = bdx_msr_uncores; - /* BDX-DE doesn't have SBOX */ - if (boot_cpu_data.x86_model == 86) { - uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL; /* Detect systems with no SBOXes */ - } else if (uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3]) { - 
struct pci_dev *pdev; - u32 capid4; - - pdev = uncore_extra_pci_dev[pkg].dev[HSWEP_PCI_PCU_3]; - pci_read_config_dword(pdev, 0x94, &capid4); - if (((capid4 >> 6) & 0x3) == 0) - bdx_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL; - } + if ((boot_cpu_data.x86_model == 86) || hswep_has_limit_sbox(BDX_PCU_DID)) + uncore_msr_uncores[BDX_MSR_UNCORE_SBOX] = NULL; + hswep_uncore_pcu.constraints = bdx_uncore_pcu_constraints; } @@ -3365,11 +3490,6 @@ static const struct pci_device_id bdx_uncore_pci_ids[] = { .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, BDX_PCI_QPI_PORT2_FILTER), }, - { /* PCU.3 (for Capability registers) */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fc0), - .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, - HSWEP_PCI_PCU_3), - }, { /* end: all zeroes */ } }; @@ -3488,6 +3608,9 @@ static int skx_cha_hw_config(struct intel_uncore_box *box, struct perf_event *ev struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; struct extra_reg *er; int idx = 0; + /* Any of the CHA events may be filtered by Thread/Core-ID.*/ + if (event->hw.config & SNBEP_CBO_PMON_CTL_TID_EN) + idx = SKX_CHA_MSR_PMON_BOX_FILTER_TID; for (er = skx_uncore_cha_extra_regs; er->msr; er++) { if (er->event != (event->hw.config & er->config_mask)) @@ -3555,6 +3678,7 @@ static struct event_constraint skx_uncore_iio_constraints[] = { UNCORE_EVENT_CONSTRAINT(0xc0, 0xc), UNCORE_EVENT_CONSTRAINT(0xc5, 0xc), UNCORE_EVENT_CONSTRAINT(0xd4, 0xc), + UNCORE_EVENT_CONSTRAINT(0xd5, 0xc), EVENT_CONSTRAINT_END }; @@ -3575,6 +3699,188 @@ static struct intel_uncore_ops skx_uncore_iio_ops = { .read_counter = uncore_msr_read_counter, }; +static inline u8 skx_iio_stack(struct intel_uncore_pmu *pmu, int die) +{ + return pmu->type->topology[die].configuration >> + (pmu->pmu_idx * BUS_NUM_STRIDE); +} + +static umode_t +pmu_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, + int die, int zero_bus_pmu) +{ + struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(kobj_to_dev(kobj)); + + return 
(!skx_iio_stack(pmu, die) && pmu->pmu_idx != zero_bus_pmu) ? 0 : attr->mode; +} + +static umode_t +skx_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, int die) +{ + /* Root bus 0x00 is valid only for pmu_idx = 0. */ + return pmu_iio_mapping_visible(kobj, attr, die, 0); +} + +static ssize_t skx_iio_mapping_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(dev); + struct dev_ext_attribute *ea = to_dev_ext_attribute(attr); + long die = (long)ea->var; + + return sprintf(buf, "%04x:%02x\n", pmu->type->topology[die].segment, + skx_iio_stack(pmu, die)); +} + +static int skx_msr_cpu_bus_read(int cpu, u64 *topology) +{ + u64 msr_value; + + if (rdmsrl_on_cpu(cpu, SKX_MSR_CPU_BUS_NUMBER, &msr_value) || + !(msr_value & SKX_MSR_CPU_BUS_VALID_BIT)) + return -ENXIO; + + *topology = msr_value; + + return 0; +} + +static int die_to_cpu(int die) +{ + int res = 0, cpu, current_die; + /* + * Using cpus_read_lock() to ensure cpu is not going down between + * looking at cpu_online_mask. 
+ */ + cpus_read_lock(); + for_each_online_cpu(cpu) { + current_die = topology_logical_die_id(cpu); + if (current_die == die) { + res = cpu; + break; + } + } + cpus_read_unlock(); + return res; +} + +static int skx_iio_get_topology(struct intel_uncore_type *type) +{ + int die, ret = -EPERM; + + type->topology = kcalloc(uncore_max_dies(), sizeof(*type->topology), + GFP_KERNEL); + if (!type->topology) + return -ENOMEM; + + for (die = 0; die < uncore_max_dies(); die++) { + ret = skx_msr_cpu_bus_read(die_to_cpu(die), + &type->topology[die].configuration); + if (ret) + break; + + ret = uncore_die_to_segment(die); + if (ret < 0) + break; + + type->topology[die].segment = ret; + } + + if (ret < 0) { + kfree(type->topology); + type->topology = NULL; + } + + return ret; +} + +static struct attribute_group skx_iio_mapping_group = { + .is_visible = skx_iio_mapping_visible, +}; + +static const struct attribute_group *skx_iio_attr_update[] = { + &skx_iio_mapping_group, + NULL, +}; + +static int +pmu_iio_set_mapping(struct intel_uncore_type *type, struct attribute_group *ag) +{ + char buf[64]; + int ret; + long die = -1; + struct attribute **attrs = NULL; + struct dev_ext_attribute *eas = NULL; + + ret = type->get_topology(type); + if (ret < 0) + goto clear_attr_update; + + ret = -ENOMEM; + + /* One more for NULL. 
*/ + attrs = kcalloc((uncore_max_dies() + 1), sizeof(*attrs), GFP_KERNEL); + if (!attrs) + goto clear_topology; + + eas = kcalloc(uncore_max_dies(), sizeof(*eas), GFP_KERNEL); + if (!eas) + goto clear_attrs; + + for (die = 0; die < uncore_max_dies(); die++) { + sprintf(buf, "die%ld", die); + sysfs_attr_init(&eas[die].attr.attr); + eas[die].attr.attr.name = kstrdup(buf, GFP_KERNEL); + if (!eas[die].attr.attr.name) + goto err; + eas[die].attr.attr.mode = 0444; + eas[die].attr.show = skx_iio_mapping_show; + eas[die].attr.store = NULL; + eas[die].var = (void *)die; + attrs[die] = &eas[die].attr.attr; + } + ag->attrs = attrs; + + return 0; +err: + for (; die >= 0; die--) + kfree(eas[die].attr.attr.name); + kfree(eas); +clear_attrs: + kfree(attrs); +clear_topology: + kfree(type->topology); +clear_attr_update: + type->attr_update = NULL; + return ret; +} + +static void +pmu_iio_cleanup_mapping(struct intel_uncore_type *type, struct attribute_group *ag) +{ + struct attribute **attr = ag->attrs; + + if (!attr) + return; + + for (; *attr; attr++) + kfree((*attr)->name); + kfree(attr_to_ext_attr(*ag->attrs)); + kfree(ag->attrs); + ag->attrs = NULL; + kfree(type->topology); +} + +static int skx_iio_set_mapping(struct intel_uncore_type *type) +{ + return pmu_iio_set_mapping(type, &skx_iio_mapping_group); +} + +static void skx_iio_cleanup_mapping(struct intel_uncore_type *type) +{ + pmu_iio_cleanup_mapping(type, &skx_iio_mapping_group); +} + static struct intel_uncore_type skx_uncore_iio = { .name = "iio", .num_counters = 4, @@ -3589,6 +3895,10 @@ static struct intel_uncore_type skx_uncore_iio = { .constraints = skx_uncore_iio_constraints, .ops = &skx_uncore_iio_ops, .format_group = &skx_uncore_iio_format_group, + .attr_update = skx_iio_attr_update, + .get_topology = skx_iio_get_topology, + .set_mapping = skx_iio_set_mapping, + .cleanup_mapping = skx_iio_cleanup_mapping, }; enum perf_uncore_iio_freerunning_type_id { @@ -4129,6 +4439,103 @@ static const struct attribute_group 
snr_uncore_iio_format_group = { .attrs = snr_uncore_iio_formats_attr, }; +static umode_t +snr_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, int die) +{ + /* Root bus 0x00 is valid only for pmu_idx = 1. */ + return pmu_iio_mapping_visible(kobj, attr, die, 1); +} + +static struct attribute_group snr_iio_mapping_group = { + .is_visible = snr_iio_mapping_visible, +}; + +static const struct attribute_group *snr_iio_attr_update[] = { + &snr_iio_mapping_group, + NULL, +}; + +static int sad_cfg_iio_topology(struct intel_uncore_type *type, u8 *sad_pmon_mapping) +{ + u32 sad_cfg; + int die, stack_id, ret = -EPERM; + struct pci_dev *dev = NULL; + + type->topology = kcalloc(uncore_max_dies(), sizeof(*type->topology), + GFP_KERNEL); + if (!type->topology) + return -ENOMEM; + + while ((dev = pci_get_device(PCI_VENDOR_ID_INTEL, SNR_ICX_MESH2IIO_MMAP_DID, dev))) { + ret = pci_read_config_dword(dev, SNR_ICX_SAD_CONTROL_CFG, &sad_cfg); + if (ret) { + ret = pcibios_err_to_errno(ret); + break; + } + + die = uncore_pcibus_to_dieid(dev->bus); + stack_id = SAD_CONTROL_STACK_ID(sad_cfg); + if (die < 0 || stack_id >= type->num_boxes) { + ret = -EPERM; + break; + } + + /* Convert stack id from SAD_CONTROL to PMON notation. 
*/ + stack_id = sad_pmon_mapping[stack_id]; + + ((u8 *)&(type->topology[die].configuration))[stack_id] = dev->bus->number; + type->topology[die].segment = pci_domain_nr(dev->bus); + } + + if (ret) { + kfree(type->topology); + type->topology = NULL; + } + + return ret; +} + +/* + * SNR has a static mapping of stack IDs from SAD_CONTROL_CFG notation to PMON + */ +enum { + SNR_QAT_PMON_ID, + SNR_CBDMA_DMI_PMON_ID, + SNR_NIS_PMON_ID, + SNR_DLB_PMON_ID, + SNR_PCIE_GEN3_PMON_ID +}; + +static u8 snr_sad_pmon_mapping[] = { + SNR_CBDMA_DMI_PMON_ID, + SNR_PCIE_GEN3_PMON_ID, + SNR_DLB_PMON_ID, + SNR_NIS_PMON_ID, + SNR_QAT_PMON_ID +}; + +static int snr_iio_get_topology(struct intel_uncore_type *type) +{ + return sad_cfg_iio_topology(type, snr_sad_pmon_mapping); +} + +static int snr_iio_set_mapping(struct intel_uncore_type *type) +{ + return pmu_iio_set_mapping(type, &snr_iio_mapping_group); +} + +static void snr_iio_cleanup_mapping(struct intel_uncore_type *type) +{ + pmu_iio_cleanup_mapping(type, &snr_iio_mapping_group); +} + +static struct event_constraint snr_uncore_iio_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x83, 0x3), + UNCORE_EVENT_CONSTRAINT(0xc0, 0xc), + UNCORE_EVENT_CONSTRAINT(0xd5, 0xc), + EVENT_CONSTRAINT_END +}; + static struct intel_uncore_type snr_uncore_iio = { .name = "iio", .num_counters = 4, @@ -4140,8 +4547,13 @@ static struct intel_uncore_type snr_uncore_iio = { .event_mask_ext = SNR_IIO_PMON_RAW_EVENT_MASK_EXT, .box_ctl = SNR_IIO_MSR_PMON_BOX_CTL, .msr_offset = SNR_IIO_MSR_OFFSET, + .constraints = snr_uncore_iio_constraints, .ops = &ivbep_uncore_msr_ops, .format_group = &snr_uncore_iio_format_group, + .attr_update = snr_iio_attr_update, + .get_topology = snr_iio_get_topology, + .set_mapping = snr_iio_set_mapping, + .cleanup_mapping = snr_iio_cleanup_mapping, }; static struct intel_uncore_type snr_uncore_irp = { @@ -4323,12 +4735,46 @@ static struct intel_uncore_type snr_uncore_m2m = { .format_group = &snr_m2m_uncore_format_group, }; +static void 
snr_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base, (u32)(hwc->config | SNBEP_PMON_CTL_EN)); + pci_write_config_dword(pdev, hwc->config_base + 4, (u32)(hwc->config >> 32)); +} + +static struct intel_uncore_ops snr_pcie3_uncore_pci_ops = { + .init_box = snr_m2m_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = snbep_uncore_pci_disable_event, + .enable_event = snr_uncore_pci_enable_event, + .read_counter = snbep_uncore_pci_read_counter, +}; + +static struct intel_uncore_type snr_uncore_pcie3 = { + .name = "pcie3", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = SNR_PCIE3_PCI_PMON_CTR0, + .event_ctl = SNR_PCIE3_PCI_PMON_CTL0, + .event_mask = SKX_IIO_PMON_RAW_EVENT_MASK, + .event_mask_ext = SKX_IIO_PMON_RAW_EVENT_MASK_EXT, + .box_ctl = SNR_PCIE3_PCI_PMON_BOX_CTL, + .ops = &snr_pcie3_uncore_pci_ops, + .format_group = &skx_uncore_iio_format_group, +}; + enum { SNR_PCI_UNCORE_M2M, + SNR_PCI_UNCORE_PCIE3, }; static struct intel_uncore_type *snr_pci_uncores[] = { [SNR_PCI_UNCORE_M2M] = &snr_uncore_m2m, + [SNR_PCI_UNCORE_PCIE3] = &snr_uncore_pcie3, NULL, }; @@ -4345,6 +4791,19 @@ static struct pci_driver snr_uncore_pci_driver = { .id_table = snr_uncore_pci_ids, }; +static const struct pci_device_id snr_uncore_pci_sub_ids[] = { + { /* PCIe3 RP */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x334a), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(4, 0, SNR_PCI_UNCORE_PCIE3, 0), + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver snr_uncore_pci_sub_driver = { + .name = "snr_uncore_sub", + .id_table = snr_uncore_pci_sub_ids, +}; + int snr_uncore_pci_init(void) { /* SNR UBOX DID */ @@ -4356,53 +4815,70 @@ int snr_uncore_pci_init(void) uncore_pci_uncores = snr_pci_uncores; uncore_pci_driver = 
&snr_uncore_pci_driver; + uncore_pci_sub_driver = &snr_uncore_pci_sub_driver; return 0; } -static struct pci_dev *snr_uncore_get_mc_dev(int id) +#define SNR_MC_DEVICE_ID 0x3451 + +static struct pci_dev *snr_uncore_get_mc_dev(unsigned int device, int id) { struct pci_dev *mc_dev = NULL; - int phys_id, pkg; + int pkg; while (1) { - mc_dev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3451, mc_dev); + mc_dev = pci_get_device(PCI_VENDOR_ID_INTEL, device, mc_dev); if (!mc_dev) break; - phys_id = uncore_pcibus_to_physid(mc_dev->bus); - if (phys_id < 0) - continue; - pkg = topology_phys_to_logical_pkg(phys_id); - if (pkg < 0) - continue; - else if (pkg == id) + pkg = uncore_pcibus_to_dieid(mc_dev->bus); + if (pkg == id) break; } return mc_dev; } -static void snr_uncore_mmio_init_box(struct intel_uncore_box *box) +static int snr_uncore_mmio_map(struct intel_uncore_box *box, + unsigned int box_ctl, int mem_offset, + unsigned int device) { - struct pci_dev *pdev = snr_uncore_get_mc_dev(box->dieid); - unsigned int box_ctl = uncore_mmio_box_ctl(box); + struct pci_dev *pdev = snr_uncore_get_mc_dev(device, box->dieid); + struct intel_uncore_type *type = box->pmu->type; resource_size_t addr; u32 pci_dword; if (!pdev) - return; + return -ENODEV; pci_read_config_dword(pdev, SNR_IMC_MMIO_BASE_OFFSET, &pci_dword); - addr = (pci_dword & SNR_IMC_MMIO_BASE_MASK) << 23; + addr = ((resource_size_t)pci_dword & SNR_IMC_MMIO_BASE_MASK) << 23; - pci_read_config_dword(pdev, SNR_IMC_MMIO_MEM0_OFFSET, &pci_dword); + pci_read_config_dword(pdev, mem_offset, &pci_dword); addr |= (pci_dword & SNR_IMC_MMIO_MEM0_MASK) << 12; addr += box_ctl; - box->io_addr = ioremap(addr, SNR_IMC_MMIO_SIZE); - if (!box->io_addr) - return; + box->io_addr = ioremap(addr, type->mmio_map_size); + if (!box->io_addr) { + pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name); + return -EINVAL; + } + + return 0; +} + +static void __snr_uncore_mmio_init_box(struct intel_uncore_box *box, + unsigned int box_ctl, int 
mem_offset, + unsigned int device) +{ + if (!snr_uncore_mmio_map(box, box_ctl, mem_offset, device)) + writel(IVBEP_PMON_BOX_CTL_INT, box->io_addr); +} - writel(IVBEP_PMON_BOX_CTL_INT, box->io_addr); +static void snr_uncore_mmio_init_box(struct intel_uncore_box *box) +{ + __snr_uncore_mmio_init_box(box, uncore_mmio_box_ctl(box), + SNR_IMC_MMIO_MEM0_OFFSET, + SNR_MC_DEVICE_ID); } static void snr_uncore_mmio_disable_box(struct intel_uncore_box *box) @@ -4437,6 +4913,9 @@ static void snr_uncore_mmio_enable_event(struct intel_uncore_box *box, if (!box->io_addr) return; + if (!uncore_mmio_is_valid_offset(box, hwc->config_base)) + return; + writel(hwc->config | SNBEP_PMON_CTL_EN, box->io_addr + hwc->config_base); } @@ -4449,6 +4928,9 @@ static void snr_uncore_mmio_disable_event(struct intel_uncore_box *box, if (!box->io_addr) return; + if (!uncore_mmio_is_valid_offset(box, hwc->config_base)) + return; + writel(hwc->config, box->io_addr + hwc->config_base); } @@ -4487,6 +4969,7 @@ static struct intel_uncore_type snr_uncore_imc = { .event_mask = SNBEP_PMON_RAW_EVENT_MASK, .box_ctl = SNR_IMC_MMIO_PMON_BOX_CTL, .mmio_offset = SNR_IMC_MMIO_OFFSET, + .mmio_map_size = SNR_IMC_MMIO_SIZE, .ops = &snr_uncore_mmio_ops, .format_group = &skx_uncore_format_group, }; @@ -4507,10 +4990,10 @@ static struct uncore_event_desc snr_uncore_imc_freerunning_events[] = { INTEL_UNCORE_EVENT_DESC(dclk, "event=0xff,umask=0x10"), INTEL_UNCORE_EVENT_DESC(read, "event=0xff,umask=0x20"), - INTEL_UNCORE_EVENT_DESC(read.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(read.scale, "6.103515625e-5"), INTEL_UNCORE_EVENT_DESC(read.unit, "MiB"), INTEL_UNCORE_EVENT_DESC(write, "event=0xff,umask=0x21"), - INTEL_UNCORE_EVENT_DESC(write.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(write.scale, "6.103515625e-5"), INTEL_UNCORE_EVENT_DESC(write.unit, "MiB"), { /* end: all zeroes */ }, }; @@ -4527,6 +5010,7 @@ static struct intel_uncore_type snr_uncore_imc_free_running = { .num_counters = 3, .num_boxes = 1, 
.num_freerunning_types = SNR_IMC_FREERUNNING_TYPE_MAX, + .mmio_map_size = SNR_IMC_MMIO_SIZE, .freerunning = snr_imc_freerunning, .ops = &snr_uncore_imc_freerunning_ops, .event_descs = snr_uncore_imc_freerunning_events, @@ -4545,3 +5029,1044 @@ void snr_uncore_mmio_init(void) } /* end of SNR uncore support */ + +/* ICX uncore support */ + +static unsigned icx_cha_msr_offsets[] = { + 0x2a0, 0x2ae, 0x2bc, 0x2ca, 0x2d8, 0x2e6, 0x2f4, 0x302, 0x310, + 0x31e, 0x32c, 0x33a, 0x348, 0x356, 0x364, 0x372, 0x380, 0x38e, + 0x3aa, 0x3b8, 0x3c6, 0x3d4, 0x3e2, 0x3f0, 0x3fe, 0x40c, 0x41a, + 0x428, 0x436, 0x444, 0x452, 0x460, 0x46e, 0x47c, 0x0, 0xe, + 0x1c, 0x2a, 0x38, 0x46, +}; + +static int icx_cha_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + bool tie_en = !!(event->hw.config & SNBEP_CBO_PMON_CTL_TID_EN); + + if (tie_en) { + reg1->reg = ICX_C34_MSR_PMON_BOX_FILTER0 + + icx_cha_msr_offsets[box->pmu->pmu_idx]; + reg1->config = event->attr.config1 & SKX_CHA_MSR_PMON_BOX_FILTER_TID; + reg1->idx = 0; + } + + return 0; +} + +static struct intel_uncore_ops icx_uncore_chabox_ops = { + .init_box = ivbep_uncore_msr_init_box, + .disable_box = snbep_uncore_msr_disable_box, + .enable_box = snbep_uncore_msr_enable_box, + .disable_event = snbep_uncore_msr_disable_event, + .enable_event = snr_cha_enable_event, + .read_counter = uncore_msr_read_counter, + .hw_config = icx_cha_hw_config, +}; + +static struct intel_uncore_type icx_uncore_chabox = { + .name = "cha", + .num_counters = 4, + .perf_ctr_bits = 48, + .event_ctl = ICX_C34_MSR_PMON_CTL0, + .perf_ctr = ICX_C34_MSR_PMON_CTR0, + .box_ctl = ICX_C34_MSR_PMON_BOX_CTL, + .msr_offsets = icx_cha_msr_offsets, + .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK, + .event_mask_ext = SNR_CHA_RAW_EVENT_MASK_EXT, + .constraints = skx_uncore_chabox_constraints, + .ops = &icx_uncore_chabox_ops, + .format_group = &snr_uncore_chabox_format_group, +}; + +static unsigned 
icx_msr_offsets[] = { + 0x0, 0x20, 0x40, 0x90, 0xb0, 0xd0, +}; + +static struct event_constraint icx_uncore_iio_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x02, 0x3), + UNCORE_EVENT_CONSTRAINT(0x03, 0x3), + UNCORE_EVENT_CONSTRAINT(0x83, 0x3), + UNCORE_EVENT_CONSTRAINT(0x88, 0xc), + UNCORE_EVENT_CONSTRAINT(0xc0, 0xc), + UNCORE_EVENT_CONSTRAINT(0xc5, 0xc), + UNCORE_EVENT_CONSTRAINT(0xd5, 0xc), + EVENT_CONSTRAINT_END +}; + +static umode_t +icx_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, int die) +{ + /* Root bus 0x00 is valid only for pmu_idx = 5. */ + return pmu_iio_mapping_visible(kobj, attr, die, 5); +} + +static struct attribute_group icx_iio_mapping_group = { + .is_visible = icx_iio_mapping_visible, +}; + +static const struct attribute_group *icx_iio_attr_update[] = { + &icx_iio_mapping_group, + NULL, +}; + +/* + * ICX has a static mapping of stack IDs from SAD_CONTROL_CFG notation to PMON + */ +enum { + ICX_PCIE1_PMON_ID, + ICX_PCIE2_PMON_ID, + ICX_PCIE3_PMON_ID, + ICX_PCIE4_PMON_ID, + ICX_PCIE5_PMON_ID, + ICX_CBDMA_DMI_PMON_ID +}; + +static u8 icx_sad_pmon_mapping[] = { + ICX_CBDMA_DMI_PMON_ID, + ICX_PCIE1_PMON_ID, + ICX_PCIE2_PMON_ID, + ICX_PCIE3_PMON_ID, + ICX_PCIE4_PMON_ID, + ICX_PCIE5_PMON_ID, +}; + +static int icx_iio_get_topology(struct intel_uncore_type *type) +{ + return sad_cfg_iio_topology(type, icx_sad_pmon_mapping); +} + +static int icx_iio_set_mapping(struct intel_uncore_type *type) +{ + return pmu_iio_set_mapping(type, &icx_iio_mapping_group); +} + +static void icx_iio_cleanup_mapping(struct intel_uncore_type *type) +{ + pmu_iio_cleanup_mapping(type, &icx_iio_mapping_group); +} + +static struct intel_uncore_type icx_uncore_iio = { + .name = "iio", + .num_counters = 4, + .num_boxes = 6, + .perf_ctr_bits = 48, + .event_ctl = ICX_IIO_MSR_PMON_CTL0, + .perf_ctr = ICX_IIO_MSR_PMON_CTR0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .event_mask_ext = SNR_IIO_PMON_RAW_EVENT_MASK_EXT, + .box_ctl = ICX_IIO_MSR_PMON_BOX_CTL, + 
.msr_offsets = icx_msr_offsets, + .constraints = icx_uncore_iio_constraints, + .ops = &skx_uncore_iio_ops, + .format_group = &snr_uncore_iio_format_group, + .attr_update = icx_iio_attr_update, + .get_topology = icx_iio_get_topology, + .set_mapping = icx_iio_set_mapping, + .cleanup_mapping = icx_iio_cleanup_mapping, +}; + +static struct intel_uncore_type icx_uncore_irp = { + .name = "irp", + .num_counters = 2, + .num_boxes = 6, + .perf_ctr_bits = 48, + .event_ctl = ICX_IRP0_MSR_PMON_CTL0, + .perf_ctr = ICX_IRP0_MSR_PMON_CTR0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .box_ctl = ICX_IRP0_MSR_PMON_BOX_CTL, + .msr_offsets = icx_msr_offsets, + .ops = &ivbep_uncore_msr_ops, + .format_group = &ivbep_uncore_format_group, +}; + +static struct event_constraint icx_uncore_m2pcie_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x14, 0x3), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type icx_uncore_m2pcie = { + .name = "m2pcie", + .num_counters = 4, + .num_boxes = 6, + .perf_ctr_bits = 48, + .event_ctl = ICX_M2PCIE_MSR_PMON_CTL0, + .perf_ctr = ICX_M2PCIE_MSR_PMON_CTR0, + .box_ctl = ICX_M2PCIE_MSR_PMON_BOX_CTL, + .msr_offsets = icx_msr_offsets, + .constraints = icx_uncore_m2pcie_constraints, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .ops = &ivbep_uncore_msr_ops, + .format_group = &ivbep_uncore_format_group, +}; + +enum perf_uncore_icx_iio_freerunning_type_id { + ICX_IIO_MSR_IOCLK, + ICX_IIO_MSR_BW_IN, + + ICX_IIO_FREERUNNING_TYPE_MAX, +}; + +static unsigned icx_iio_clk_freerunning_box_offsets[] = { + 0x0, 0x20, 0x40, 0x90, 0xb0, 0xd0, +}; + +static unsigned icx_iio_bw_freerunning_box_offsets[] = { + 0x0, 0x10, 0x20, 0x90, 0xa0, 0xb0, +}; + +static struct freerunning_counters icx_iio_freerunning[] = { + [ICX_IIO_MSR_IOCLK] = { 0xa55, 0x1, 0x20, 1, 48, icx_iio_clk_freerunning_box_offsets }, + [ICX_IIO_MSR_BW_IN] = { 0xaa0, 0x1, 0x10, 8, 48, icx_iio_bw_freerunning_box_offsets }, +}; + +static 
struct uncore_event_desc icx_uncore_iio_freerunning_events[] = { + /* Free-Running IIO CLOCKS Counter */ + INTEL_UNCORE_EVENT_DESC(ioclk, "event=0xff,umask=0x10"), + /* Free-Running IIO BANDWIDTH IN Counters */ + INTEL_UNCORE_EVENT_DESC(bw_in_port0, "event=0xff,umask=0x20"), + INTEL_UNCORE_EVENT_DESC(bw_in_port0.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port0.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port1, "event=0xff,umask=0x21"), + INTEL_UNCORE_EVENT_DESC(bw_in_port1.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port1.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port2, "event=0xff,umask=0x22"), + INTEL_UNCORE_EVENT_DESC(bw_in_port2.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port2.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port3, "event=0xff,umask=0x23"), + INTEL_UNCORE_EVENT_DESC(bw_in_port3.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port3.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port4, "event=0xff,umask=0x24"), + INTEL_UNCORE_EVENT_DESC(bw_in_port4.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port4.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port5, "event=0xff,umask=0x25"), + INTEL_UNCORE_EVENT_DESC(bw_in_port5.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port5.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port6, "event=0xff,umask=0x26"), + INTEL_UNCORE_EVENT_DESC(bw_in_port6.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port6.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port7, "event=0xff,umask=0x27"), + INTEL_UNCORE_EVENT_DESC(bw_in_port7.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port7.unit, "MiB"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_type icx_uncore_iio_free_running = { + .name = "iio_free_running", + .num_counters = 9, + .num_boxes = 6, + .num_freerunning_types = ICX_IIO_FREERUNNING_TYPE_MAX, + .freerunning = icx_iio_freerunning, + .ops = &skx_uncore_iio_freerunning_ops, + .event_descs = 
icx_uncore_iio_freerunning_events, + .format_group = &skx_uncore_iio_freerunning_format_group, +}; + +static struct intel_uncore_type *icx_msr_uncores[] = { + &skx_uncore_ubox, + &icx_uncore_chabox, + &icx_uncore_iio, + &icx_uncore_irp, + &icx_uncore_m2pcie, + &skx_uncore_pcu, + &icx_uncore_iio_free_running, + NULL, +}; + +/* + * To determine the number of CHAs, it should read CAPID6(Low) and CAPID7 (High) + * registers which located at Device 30, Function 3 + */ +#define ICX_CAPID6 0x9c +#define ICX_CAPID7 0xa0 + +static u64 icx_count_chabox(void) +{ + struct pci_dev *dev = NULL; + u64 caps = 0; + + dev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x345b, dev); + if (!dev) + goto out; + + pci_read_config_dword(dev, ICX_CAPID6, (u32 *)&caps); + pci_read_config_dword(dev, ICX_CAPID7, (u32 *)&caps + 1); +out: + pci_dev_put(dev); + return hweight64(caps); +} + +void icx_uncore_cpu_init(void) +{ + u64 num_boxes = icx_count_chabox(); + + if (WARN_ON(num_boxes > ARRAY_SIZE(icx_cha_msr_offsets))) + return; + icx_uncore_chabox.num_boxes = num_boxes; + uncore_msr_uncores = icx_msr_uncores; +} + +static struct intel_uncore_type icx_uncore_m2m = { + .name = "m2m", + .num_counters = 4, + .num_boxes = 4, + .perf_ctr_bits = 48, + .perf_ctr = SNR_M2M_PCI_PMON_CTR0, + .event_ctl = SNR_M2M_PCI_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .event_mask_ext = SNR_M2M_PCI_PMON_UMASK_EXT, + .box_ctl = SNR_M2M_PCI_PMON_BOX_CTL, + .ops = &snr_m2m_uncore_pci_ops, + .format_group = &snr_m2m_uncore_format_group, +}; + +static struct attribute *icx_upi_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask_ext4.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static const struct attribute_group icx_upi_uncore_format_group = { + .name = "format", + .attrs = icx_upi_uncore_formats_attr, +}; + +static struct intel_uncore_type icx_uncore_upi = { + .name = "upi", + .num_counters = 4, + .num_boxes = 3, + .perf_ctr_bits = 48, 
+ .perf_ctr = ICX_UPI_PCI_PMON_CTR0, + .event_ctl = ICX_UPI_PCI_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .event_mask_ext = ICX_UPI_CTL_UMASK_EXT, + .box_ctl = ICX_UPI_PCI_PMON_BOX_CTL, + .ops = &skx_upi_uncore_pci_ops, + .format_group = &icx_upi_uncore_format_group, +}; + +static struct event_constraint icx_uncore_m3upi_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x1c, 0x1), + UNCORE_EVENT_CONSTRAINT(0x1d, 0x1), + UNCORE_EVENT_CONSTRAINT(0x1e, 0x1), + UNCORE_EVENT_CONSTRAINT(0x1f, 0x1), + UNCORE_EVENT_CONSTRAINT(0x40, 0x7), + UNCORE_EVENT_CONSTRAINT(0x4e, 0x7), + UNCORE_EVENT_CONSTRAINT(0x4f, 0x7), + UNCORE_EVENT_CONSTRAINT(0x50, 0x7), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type icx_uncore_m3upi = { + .name = "m3upi", + .num_counters = 4, + .num_boxes = 3, + .perf_ctr_bits = 48, + .perf_ctr = ICX_M3UPI_PCI_PMON_CTR0, + .event_ctl = ICX_M3UPI_PCI_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .box_ctl = ICX_M3UPI_PCI_PMON_BOX_CTL, + .constraints = icx_uncore_m3upi_constraints, + .ops = &ivbep_uncore_pci_ops, + .format_group = &skx_uncore_format_group, +}; + +enum { + ICX_PCI_UNCORE_M2M, + ICX_PCI_UNCORE_UPI, + ICX_PCI_UNCORE_M3UPI, +}; + +static struct intel_uncore_type *icx_pci_uncores[] = { + [ICX_PCI_UNCORE_M2M] = &icx_uncore_m2m, + [ICX_PCI_UNCORE_UPI] = &icx_uncore_upi, + [ICX_PCI_UNCORE_M3UPI] = &icx_uncore_m3upi, + NULL, +}; + +static const struct pci_device_id icx_uncore_pci_ids[] = { + { /* M2M 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x344a), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(12, 0, ICX_PCI_UNCORE_M2M, 0), + }, + { /* M2M 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x344a), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(13, 0, ICX_PCI_UNCORE_M2M, 1), + }, + { /* M2M 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x344a), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(14, 0, ICX_PCI_UNCORE_M2M, 2), + }, + { /* M2M 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x344a), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(15, 0, ICX_PCI_UNCORE_M2M, 3), 
+ }, + { /* UPI Link 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3441), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(2, 1, ICX_PCI_UNCORE_UPI, 0), + }, + { /* UPI Link 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3441), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(3, 1, ICX_PCI_UNCORE_UPI, 1), + }, + { /* UPI Link 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3441), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(4, 1, ICX_PCI_UNCORE_UPI, 2), + }, + { /* M3UPI Link 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3446), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(5, 1, ICX_PCI_UNCORE_M3UPI, 0), + }, + { /* M3UPI Link 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3446), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(6, 1, ICX_PCI_UNCORE_M3UPI, 1), + }, + { /* M3UPI Link 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x3446), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(7, 1, ICX_PCI_UNCORE_M3UPI, 2), + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver icx_uncore_pci_driver = { + .name = "icx_uncore", + .id_table = icx_uncore_pci_ids, +}; + +int icx_uncore_pci_init(void) +{ + /* ICX UBOX DID */ + int ret = snbep_pci2phy_map_init(0x3450, SKX_CPUNODEID, + SKX_GIDNIDMAP, true); + + if (ret) + return ret; + + uncore_pci_uncores = icx_pci_uncores; + uncore_pci_driver = &icx_uncore_pci_driver; + return 0; +} + +static void icx_uncore_imc_init_box(struct intel_uncore_box *box) +{ + unsigned int box_ctl = box->pmu->type->box_ctl + + box->pmu->type->mmio_offset * (box->pmu->pmu_idx % ICX_NUMBER_IMC_CHN); + int mem_offset = (box->pmu->pmu_idx / ICX_NUMBER_IMC_CHN) * ICX_IMC_MEM_STRIDE + + SNR_IMC_MMIO_MEM0_OFFSET; + + __snr_uncore_mmio_init_box(box, box_ctl, mem_offset, + SNR_MC_DEVICE_ID); +} + +static struct intel_uncore_ops icx_uncore_mmio_ops = { + .init_box = icx_uncore_imc_init_box, + .exit_box = uncore_mmio_exit_box, + .disable_box = snr_uncore_mmio_disable_box, + .enable_box = snr_uncore_mmio_enable_box, + .disable_event = snr_uncore_mmio_disable_event, + .enable_event = snr_uncore_mmio_enable_event, + 
.read_counter = uncore_mmio_read_counter, +}; + +static struct intel_uncore_type icx_uncore_imc = { + .name = "imc", + .num_counters = 4, + .num_boxes = 12, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .fixed_ctr = SNR_IMC_MMIO_PMON_FIXED_CTR, + .fixed_ctl = SNR_IMC_MMIO_PMON_FIXED_CTL, + .event_descs = snr_uncore_imc_events, + .perf_ctr = SNR_IMC_MMIO_PMON_CTR0, + .event_ctl = SNR_IMC_MMIO_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .box_ctl = SNR_IMC_MMIO_PMON_BOX_CTL, + .mmio_offset = SNR_IMC_MMIO_OFFSET, + .mmio_map_size = SNR_IMC_MMIO_SIZE, + .ops = &icx_uncore_mmio_ops, + .format_group = &skx_uncore_format_group, +}; + +enum perf_uncore_icx_imc_freerunning_type_id { + ICX_IMC_DCLK, + ICX_IMC_DDR, + ICX_IMC_DDRT, + + ICX_IMC_FREERUNNING_TYPE_MAX, +}; + +static struct freerunning_counters icx_imc_freerunning[] = { + [ICX_IMC_DCLK] = { 0x22b0, 0x0, 0, 1, 48 }, + [ICX_IMC_DDR] = { 0x2290, 0x8, 0, 2, 48 }, + [ICX_IMC_DDRT] = { 0x22a0, 0x8, 0, 2, 48 }, +}; + +static struct uncore_event_desc icx_uncore_imc_freerunning_events[] = { + INTEL_UNCORE_EVENT_DESC(dclk, "event=0xff,umask=0x10"), + + INTEL_UNCORE_EVENT_DESC(read, "event=0xff,umask=0x20"), + INTEL_UNCORE_EVENT_DESC(read.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(read.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(write, "event=0xff,umask=0x21"), + INTEL_UNCORE_EVENT_DESC(write.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(write.unit, "MiB"), + + INTEL_UNCORE_EVENT_DESC(ddrt_read, "event=0xff,umask=0x30"), + INTEL_UNCORE_EVENT_DESC(ddrt_read.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(ddrt_read.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(ddrt_write, "event=0xff,umask=0x31"), + INTEL_UNCORE_EVENT_DESC(ddrt_write.scale, "6.103515625e-5"), + INTEL_UNCORE_EVENT_DESC(ddrt_write.unit, "MiB"), + { /* end: all zeroes */ }, +}; + +static void icx_uncore_imc_freerunning_init_box(struct intel_uncore_box *box) +{ + int mem_offset = box->pmu->pmu_idx * ICX_IMC_MEM_STRIDE + + 
SNR_IMC_MMIO_MEM0_OFFSET; + + snr_uncore_mmio_map(box, uncore_mmio_box_ctl(box), + mem_offset, SNR_MC_DEVICE_ID); +} + +static struct intel_uncore_ops icx_uncore_imc_freerunning_ops = { + .init_box = icx_uncore_imc_freerunning_init_box, + .exit_box = uncore_mmio_exit_box, + .read_counter = uncore_mmio_read_counter, + .hw_config = uncore_freerunning_hw_config, +}; + +static struct intel_uncore_type icx_uncore_imc_free_running = { + .name = "imc_free_running", + .num_counters = 5, + .num_boxes = 4, + .num_freerunning_types = ICX_IMC_FREERUNNING_TYPE_MAX, + .mmio_map_size = SNR_IMC_MMIO_SIZE, + .freerunning = icx_imc_freerunning, + .ops = &icx_uncore_imc_freerunning_ops, + .event_descs = icx_uncore_imc_freerunning_events, + .format_group = &skx_uncore_iio_freerunning_format_group, +}; + +static struct intel_uncore_type *icx_mmio_uncores[] = { + &icx_uncore_imc, + &icx_uncore_imc_free_running, + NULL, +}; + +void icx_uncore_mmio_init(void) +{ + uncore_mmio_uncores = icx_mmio_uncores; +} + +/* end of ICX uncore support */ + +/* SPR uncore support */ + +static void spr_uncore_msr_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + if (reg1->idx != EXTRA_REG_NONE) + wrmsrl(reg1->reg, reg1->config); + + wrmsrl(hwc->config_base, hwc->config); +} + +static void spr_uncore_msr_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + if (reg1->idx != EXTRA_REG_NONE) + wrmsrl(reg1->reg, 0); + + wrmsrl(hwc->config_base, 0); +} + +static int spr_cha_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + bool tie_en = !!(event->hw.config & SPR_CHA_PMON_CTL_TID_EN); + struct intel_uncore_type *type = box->pmu->type; + + if (tie_en) { + reg1->reg = 
SPR_C0_MSR_PMON_BOX_FILTER0 + + HSWEP_CBO_MSR_OFFSET * type->box_ids[box->pmu->pmu_idx]; + reg1->config = event->attr.config1 & SPR_CHA_PMON_BOX_FILTER_TID; + reg1->idx = 0; + } + + return 0; +} + +static struct intel_uncore_ops spr_uncore_chabox_ops = { + .init_box = intel_generic_uncore_msr_init_box, + .disable_box = intel_generic_uncore_msr_disable_box, + .enable_box = intel_generic_uncore_msr_enable_box, + .disable_event = spr_uncore_msr_disable_event, + .enable_event = spr_uncore_msr_enable_event, + .read_counter = uncore_msr_read_counter, + .hw_config = spr_cha_hw_config, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + +static struct attribute *spr_uncore_cha_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask_ext4.attr, + &format_attr_tid_en2.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_filter_tid5.attr, + NULL, +}; +static const struct attribute_group spr_uncore_chabox_format_group = { + .name = "format", + .attrs = spr_uncore_cha_formats_attr, +}; + +static ssize_t alias_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(dev); + char pmu_name[UNCORE_PMU_NAME_LEN]; + + uncore_get_alias_name(pmu_name, pmu); + return sysfs_emit(buf, "%s\n", pmu_name); +} + +static DEVICE_ATTR_RO(alias); + +static struct attribute *uncore_alias_attrs[] = { + &dev_attr_alias.attr, + NULL +}; + +ATTRIBUTE_GROUPS(uncore_alias); + +static struct intel_uncore_type spr_uncore_chabox = { + .name = "cha", + .event_mask = SPR_CHA_PMON_EVENT_MASK, + .event_mask_ext = SPR_RAW_EVENT_MASK_EXT, + .num_shared_regs = 1, + .constraints = skx_uncore_chabox_constraints, + .ops = &spr_uncore_chabox_ops, + .format_group = &spr_uncore_chabox_format_group, + .attr_update = uncore_alias_groups, +}; + +static struct intel_uncore_type spr_uncore_iio = { + .name = "iio", + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + 
.event_mask_ext = SNR_IIO_PMON_RAW_EVENT_MASK_EXT, + .format_group = &snr_uncore_iio_format_group, + .attr_update = uncore_alias_groups, + .constraints = icx_uncore_iio_constraints, +}; + +static struct attribute *spr_uncore_raw_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask_ext4.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static const struct attribute_group spr_uncore_raw_format_group = { + .name = "format", + .attrs = spr_uncore_raw_formats_attr, +}; + +#define SPR_UNCORE_COMMON_FORMAT() \ + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, \ + .event_mask_ext = SPR_RAW_EVENT_MASK_EXT, \ + .format_group = &spr_uncore_raw_format_group, \ + .attr_update = uncore_alias_groups + +static struct intel_uncore_type spr_uncore_irp = { + SPR_UNCORE_COMMON_FORMAT(), + .name = "irp", + +}; + +static struct event_constraint spr_uncore_m2pcie_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x14, 0x3), + UNCORE_EVENT_CONSTRAINT(0x2d, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type spr_uncore_m2pcie = { + SPR_UNCORE_COMMON_FORMAT(), + .name = "m2pcie", + .constraints = spr_uncore_m2pcie_constraints, +}; + +static struct intel_uncore_type spr_uncore_pcu = { + .name = "pcu", + .attr_update = uncore_alias_groups, +}; + +static void spr_uncore_mmio_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (!box->io_addr) + return; + + if (uncore_pmc_fixed(hwc->idx)) + writel(SNBEP_PMON_CTL_EN, box->io_addr + hwc->config_base); + else + writel(hwc->config, box->io_addr + hwc->config_base); +} + +static struct intel_uncore_ops spr_uncore_mmio_ops = { + .init_box = intel_generic_uncore_mmio_init_box, + .exit_box = uncore_mmio_exit_box, + .disable_box = intel_generic_uncore_mmio_disable_box, + .enable_box = intel_generic_uncore_mmio_enable_box, + .disable_event = intel_generic_uncore_mmio_disable_event, + .enable_event = 
spr_uncore_mmio_enable_event, + .read_counter = uncore_mmio_read_counter, +}; + +static struct intel_uncore_type spr_uncore_imc = { + SPR_UNCORE_COMMON_FORMAT(), + .name = "imc", + .fixed_ctr_bits = 48, + .fixed_ctr = SNR_IMC_MMIO_PMON_FIXED_CTR, + .fixed_ctl = SNR_IMC_MMIO_PMON_FIXED_CTL, + .ops = &spr_uncore_mmio_ops, +}; + +static void spr_uncore_pci_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base + 4, (u32)(hwc->config >> 32)); + pci_write_config_dword(pdev, hwc->config_base, (u32)hwc->config); +} + +static struct intel_uncore_ops spr_uncore_pci_ops = { + .init_box = intel_generic_uncore_pci_init_box, + .disable_box = intel_generic_uncore_pci_disable_box, + .enable_box = intel_generic_uncore_pci_enable_box, + .disable_event = intel_generic_uncore_pci_disable_event, + .enable_event = spr_uncore_pci_enable_event, + .read_counter = intel_generic_uncore_pci_read_counter, +}; + +#define SPR_UNCORE_PCI_COMMON_FORMAT() \ + SPR_UNCORE_COMMON_FORMAT(), \ + .ops = &spr_uncore_pci_ops + +static struct intel_uncore_type spr_uncore_m2m = { + SPR_UNCORE_PCI_COMMON_FORMAT(), + .name = "m2m", +}; + +static struct intel_uncore_type spr_uncore_upi = { + SPR_UNCORE_PCI_COMMON_FORMAT(), + .name = "upi", +}; + +static struct intel_uncore_type spr_uncore_m3upi = { + SPR_UNCORE_PCI_COMMON_FORMAT(), + .name = "m3upi", + .constraints = icx_uncore_m3upi_constraints, +}; + +static struct intel_uncore_type spr_uncore_mdf = { + SPR_UNCORE_COMMON_FORMAT(), + .name = "mdf", +}; + +#define UNCORE_SPR_NUM_UNCORE_TYPES 12 +#define UNCORE_SPR_IIO 1 +#define UNCORE_SPR_IMC 6 + +static struct intel_uncore_type *spr_uncores[UNCORE_SPR_NUM_UNCORE_TYPES] = { + &spr_uncore_chabox, + &spr_uncore_iio, + &spr_uncore_irp, + &spr_uncore_m2pcie, + &spr_uncore_pcu, + NULL, + &spr_uncore_imc, + &spr_uncore_m2m, + &spr_uncore_upi, + &spr_uncore_m3upi, + 
NULL, + &spr_uncore_mdf, +}; + +enum perf_uncore_spr_iio_freerunning_type_id { + SPR_IIO_MSR_IOCLK, + SPR_IIO_MSR_BW_IN, + SPR_IIO_MSR_BW_OUT, + + SPR_IIO_FREERUNNING_TYPE_MAX, +}; + +static struct freerunning_counters spr_iio_freerunning[] = { + [SPR_IIO_MSR_IOCLK] = { 0x340e, 0x1, 0x10, 1, 48 }, + [SPR_IIO_MSR_BW_IN] = { 0x3800, 0x1, 0x10, 8, 48 }, + [SPR_IIO_MSR_BW_OUT] = { 0x3808, 0x1, 0x10, 8, 48 }, +}; + +static struct uncore_event_desc spr_uncore_iio_freerunning_events[] = { + /* Free-Running IIO CLOCKS Counter */ + INTEL_UNCORE_EVENT_DESC(ioclk, "event=0xff,umask=0x10"), + /* Free-Running IIO BANDWIDTH IN Counters */ + INTEL_UNCORE_EVENT_DESC(bw_in_port0, "event=0xff,umask=0x20"), + INTEL_UNCORE_EVENT_DESC(bw_in_port0.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port0.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port1, "event=0xff,umask=0x21"), + INTEL_UNCORE_EVENT_DESC(bw_in_port1.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port1.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port2, "event=0xff,umask=0x22"), + INTEL_UNCORE_EVENT_DESC(bw_in_port2.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port2.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port3, "event=0xff,umask=0x23"), + INTEL_UNCORE_EVENT_DESC(bw_in_port3.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port3.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port4, "event=0xff,umask=0x24"), + INTEL_UNCORE_EVENT_DESC(bw_in_port4.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port4.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port5, "event=0xff,umask=0x25"), + INTEL_UNCORE_EVENT_DESC(bw_in_port5.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port5.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port6, "event=0xff,umask=0x26"), + INTEL_UNCORE_EVENT_DESC(bw_in_port6.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port6.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_in_port7, "event=0xff,umask=0x27"), + 
INTEL_UNCORE_EVENT_DESC(bw_in_port7.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_in_port7.unit, "MiB"), + /* Free-Running IIO BANDWIDTH OUT Counters */ + INTEL_UNCORE_EVENT_DESC(bw_out_port0, "event=0xff,umask=0x30"), + INTEL_UNCORE_EVENT_DESC(bw_out_port0.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_out_port0.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_out_port1, "event=0xff,umask=0x31"), + INTEL_UNCORE_EVENT_DESC(bw_out_port1.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_out_port1.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_out_port2, "event=0xff,umask=0x32"), + INTEL_UNCORE_EVENT_DESC(bw_out_port2.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_out_port2.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_out_port3, "event=0xff,umask=0x33"), + INTEL_UNCORE_EVENT_DESC(bw_out_port3.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_out_port3.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_out_port4, "event=0xff,umask=0x34"), + INTEL_UNCORE_EVENT_DESC(bw_out_port4.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_out_port4.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_out_port5, "event=0xff,umask=0x35"), + INTEL_UNCORE_EVENT_DESC(bw_out_port5.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_out_port5.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_out_port6, "event=0xff,umask=0x36"), + INTEL_UNCORE_EVENT_DESC(bw_out_port6.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_out_port6.unit, "MiB"), + INTEL_UNCORE_EVENT_DESC(bw_out_port7, "event=0xff,umask=0x37"), + INTEL_UNCORE_EVENT_DESC(bw_out_port7.scale, "3.814697266e-6"), + INTEL_UNCORE_EVENT_DESC(bw_out_port7.unit, "MiB"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_type spr_uncore_iio_free_running = { + .name = "iio_free_running", + .num_counters = 17, + .num_freerunning_types = SPR_IIO_FREERUNNING_TYPE_MAX, + .freerunning = spr_iio_freerunning, + .ops = &skx_uncore_iio_freerunning_ops, + .event_descs = spr_uncore_iio_freerunning_events, + .format_group = 
&skx_uncore_iio_freerunning_format_group, +}; + +enum perf_uncore_spr_imc_freerunning_type_id { + SPR_IMC_DCLK, + SPR_IMC_PQ_CYCLES, + + SPR_IMC_FREERUNNING_TYPE_MAX, +}; + +static struct freerunning_counters spr_imc_freerunning[] = { + [SPR_IMC_DCLK] = { 0x22b0, 0x0, 0, 1, 48 }, + [SPR_IMC_PQ_CYCLES] = { 0x2318, 0x8, 0, 2, 48 }, +}; + +static struct uncore_event_desc spr_uncore_imc_freerunning_events[] = { + INTEL_UNCORE_EVENT_DESC(dclk, "event=0xff,umask=0x10"), + + INTEL_UNCORE_EVENT_DESC(rpq_cycles, "event=0xff,umask=0x20"), + INTEL_UNCORE_EVENT_DESC(wpq_cycles, "event=0xff,umask=0x21"), + { /* end: all zeroes */ }, +}; + +#define SPR_MC_DEVICE_ID 0x3251 + +static void spr_uncore_imc_freerunning_init_box(struct intel_uncore_box *box) +{ + int mem_offset = box->pmu->pmu_idx * ICX_IMC_MEM_STRIDE + SNR_IMC_MMIO_MEM0_OFFSET; + + snr_uncore_mmio_map(box, uncore_mmio_box_ctl(box), + mem_offset, SPR_MC_DEVICE_ID); +} + +static struct intel_uncore_ops spr_uncore_imc_freerunning_ops = { + .init_box = spr_uncore_imc_freerunning_init_box, + .exit_box = uncore_mmio_exit_box, + .read_counter = uncore_mmio_read_counter, + .hw_config = uncore_freerunning_hw_config, +}; + +static struct intel_uncore_type spr_uncore_imc_free_running = { + .name = "imc_free_running", + .num_counters = 3, + .mmio_map_size = SNR_IMC_MMIO_SIZE, + .num_freerunning_types = SPR_IMC_FREERUNNING_TYPE_MAX, + .freerunning = spr_imc_freerunning, + .ops = &spr_uncore_imc_freerunning_ops, + .event_descs = spr_uncore_imc_freerunning_events, + .format_group = &skx_uncore_iio_freerunning_format_group, +}; + +#define UNCORE_SPR_MSR_EXTRA_UNCORES 1 +#define UNCORE_SPR_MMIO_EXTRA_UNCORES 1 + +static struct intel_uncore_type *spr_msr_uncores[UNCORE_SPR_MSR_EXTRA_UNCORES] = { + &spr_uncore_iio_free_running, +}; + +static struct intel_uncore_type *spr_mmio_uncores[UNCORE_SPR_MMIO_EXTRA_UNCORES] = { + &spr_uncore_imc_free_running, +}; + +static void uncore_type_customized_copy(struct intel_uncore_type *to_type, + 
struct intel_uncore_type *from_type) +{ + if (!to_type || !from_type) + return; + + if (from_type->name) + to_type->name = from_type->name; + if (from_type->fixed_ctr_bits) + to_type->fixed_ctr_bits = from_type->fixed_ctr_bits; + if (from_type->event_mask) + to_type->event_mask = from_type->event_mask; + if (from_type->event_mask_ext) + to_type->event_mask_ext = from_type->event_mask_ext; + if (from_type->fixed_ctr) + to_type->fixed_ctr = from_type->fixed_ctr; + if (from_type->fixed_ctl) + to_type->fixed_ctl = from_type->fixed_ctl; + if (from_type->fixed_ctr_bits) + to_type->fixed_ctr_bits = from_type->fixed_ctr_bits; + if (from_type->num_shared_regs) + to_type->num_shared_regs = from_type->num_shared_regs; + if (from_type->constraints) + to_type->constraints = from_type->constraints; + if (from_type->ops) + to_type->ops = from_type->ops; + if (from_type->event_descs) + to_type->event_descs = from_type->event_descs; + if (from_type->format_group) + to_type->format_group = from_type->format_group; + if (from_type->attr_update) + to_type->attr_update = from_type->attr_update; +} + +static struct intel_uncore_type ** +uncore_get_uncores(enum uncore_access_type type_id, int num_extra, + struct intel_uncore_type **extra) +{ + struct intel_uncore_type **types, **start_types; + int i; + + start_types = types = intel_uncore_generic_init_uncores(type_id, num_extra); + + /* Only copy the customized features */ + for (; *types; types++) { + if ((*types)->type_id >= UNCORE_SPR_NUM_UNCORE_TYPES) + continue; + uncore_type_customized_copy(*types, spr_uncores[(*types)->type_id]); + } + + for (i = 0; i < num_extra; i++, types++) + *types = extra[i]; + + return start_types; +} + +static struct intel_uncore_type * +uncore_find_type_by_id(struct intel_uncore_type **types, int type_id) +{ + for (; *types; types++) { + if (type_id == (*types)->type_id) + return *types; + } + + return NULL; +} + +static int uncore_type_max_boxes(struct intel_uncore_type **types, + int type_id) +{ + 
struct intel_uncore_type *type; + int i, max = 0; + + type = uncore_find_type_by_id(types, type_id); + if (!type) + return 0; + + for (i = 0; i < type->num_boxes; i++) { + if (type->box_ids[i] > max) + max = type->box_ids[i]; + } + + return max + 1; +} + +void spr_uncore_cpu_init(void) +{ + uncore_msr_uncores = uncore_get_uncores(UNCORE_ACCESS_MSR, + UNCORE_SPR_MSR_EXTRA_UNCORES, + spr_msr_uncores); + + spr_uncore_iio_free_running.num_boxes = uncore_type_max_boxes(uncore_msr_uncores, UNCORE_SPR_IIO); +} + +int spr_uncore_pci_init(void) +{ + uncore_pci_uncores = uncore_get_uncores(UNCORE_ACCESS_PCI, 0, NULL); + return 0; +} + +void spr_uncore_mmio_init(void) +{ + int ret = snbep_pci2phy_map_init(0x3250, SKX_CPUNODEID, SKX_GIDNIDMAP, true); + + if (ret) + uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO, 0, NULL); + else { + uncore_mmio_uncores = uncore_get_uncores(UNCORE_ACCESS_MMIO, + UNCORE_SPR_MMIO_EXTRA_UNCORES, + spr_mmio_uncores); + + spr_uncore_imc_free_running.num_boxes = uncore_type_max_boxes(uncore_mmio_uncores, UNCORE_SPR_IMC) / 2; + } +} + +/* end of SPR uncore support */ diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index a949f6f55991..ecced3a52668 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -68,6 +68,7 @@ static bool test_intel(int idx, void *data) case INTEL_FAM6_BROADWELL_D: case INTEL_FAM6_BROADWELL_G: case INTEL_FAM6_BROADWELL_X: + case INTEL_FAM6_SAPPHIRERAPIDS_X: case INTEL_FAM6_ATOM_SILVERMONT: case INTEL_FAM6_ATOM_SILVERMONT_D: @@ -78,6 +79,7 @@ static bool test_intel(int idx, void *data) case INTEL_FAM6_ATOM_GOLDMONT_PLUS: case INTEL_FAM6_ATOM_TREMONT_D: case INTEL_FAM6_ATOM_TREMONT: + case INTEL_FAM6_ATOM_TREMONT_L: case INTEL_FAM6_XEON_PHI_KNL: case INTEL_FAM6_XEON_PHI_KNM: @@ -98,6 +100,13 @@ static bool test_intel(int idx, void *data) case INTEL_FAM6_ICELAKE_D: case INTEL_FAM6_TIGERLAKE_L: case INTEL_FAM6_TIGERLAKE: + case INTEL_FAM6_ROCKETLAKE: + case INTEL_FAM6_ALDERLAKE: + case 
INTEL_FAM6_ALDERLAKE_L: + case INTEL_FAM6_ALDERLAKE_N: + case INTEL_FAM6_RAPTORLAKE: + case INTEL_FAM6_RAPTORLAKE_P: + case INTEL_FAM6_RAPTORLAKE_S: if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF) return true; break; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index f1cd1ca1a77b..332d2e6d8ae4 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -14,7 +14,9 @@ #include <linux/perf_event.h> +#include <asm/fpu/xstate.h> #include <asm/intel_ds.h> +#include <asm/cpu.h> /* To enable MSR tracing please use the generic trace points. */ @@ -62,22 +64,49 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode) return ((ecode & c->cmask) - c->code) <= (u64)c->size; } +#define PERF_ARCH(name, val) \ + PERF_X86_EVENT_##name = val, + /* * struct hw_perf_event.flags flags */ -#define PERF_X86_EVENT_PEBS_LDLAT 0x0001 /* ld+ldlat data address sampling */ -#define PERF_X86_EVENT_PEBS_ST 0x0002 /* st data address sampling */ -#define PERF_X86_EVENT_PEBS_ST_HSW 0x0004 /* haswell style datala, store */ -#define PERF_X86_EVENT_PEBS_LD_HSW 0x0008 /* haswell style datala, load */ -#define PERF_X86_EVENT_PEBS_NA_HSW 0x0010 /* haswell style datala, unknown */ -#define PERF_X86_EVENT_EXCL 0x0020 /* HT exclusivity on counter */ -#define PERF_X86_EVENT_DYNAMIC 0x0040 /* dynamic alloc'd constraint */ -#define PERF_X86_EVENT_RDPMC_ALLOWED 0x0080 /* grant rdpmc permission */ -#define PERF_X86_EVENT_EXCL_ACCT 0x0100 /* accounted EXCL event */ -#define PERF_X86_EVENT_AUTO_RELOAD 0x0200 /* use PEBS auto-reload */ -#define PERF_X86_EVENT_LARGE_PEBS 0x0400 /* use large PEBS */ -#define PERF_X86_EVENT_PEBS_VIA_PT 0x0800 /* use PT buffer for PEBS */ -#define PERF_X86_EVENT_PAIR 0x1000 /* Large Increment per Cycle */ +enum { +#include "perf_event_flags.h" +}; + +#undef PERF_ARCH + +#define PERF_ARCH(name, val) \ + static_assert((PERF_X86_EVENT_##name & PERF_EVENT_FLAG_ARCH) == \ + PERF_X86_EVENT_##name); + +#include 
"perf_event_flags.h" + +#undef PERF_ARCH + +static inline bool is_topdown_count(struct perf_event *event) +{ + return event->hw.flags & PERF_X86_EVENT_TOPDOWN; +} + +static inline bool is_metric_event(struct perf_event *event) +{ + u64 config = event->attr.config; + + return ((config & ARCH_PERFMON_EVENTSEL_EVENT) == 0) && + ((config & INTEL_ARCH_EVENT_MASK) >= INTEL_TD_METRIC_RETIRING) && + ((config & INTEL_ARCH_EVENT_MASK) <= INTEL_TD_METRIC_MAX); +} + +static inline bool is_slots_event(struct perf_event *event) +{ + return (event->attr.config & INTEL_ARCH_EVENT_MASK) == INTEL_TD_SLOTS; +} + +static inline bool is_topdown_event(struct perf_event *event) +{ + return is_metric_event(event) || is_slots_event(event); +} struct amd_nb { int nb_id; /* NorthBridge id */ @@ -106,7 +135,8 @@ struct amd_nb { PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \ PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER | \ - PERF_SAMPLE_PERIOD) + PERF_SAMPLE_PERIOD | PERF_SAMPLE_CODE_PAGE_SIZE | \ + PERF_SAMPLE_WEIGHT_TYPE) #define PEBS_GP_REGS \ ((1ULL << PERF_REG_X86_AX) | \ @@ -179,6 +209,18 @@ struct x86_perf_task_context; #define MAX_LBR_ENTRIES 32 enum { + LBR_FORMAT_32 = 0x00, + LBR_FORMAT_LIP = 0x01, + LBR_FORMAT_EIP = 0x02, + LBR_FORMAT_EIP_FLAGS = 0x03, + LBR_FORMAT_EIP_FLAGS2 = 0x04, + LBR_FORMAT_INFO = 0x05, + LBR_FORMAT_TIME = 0x06, + LBR_FORMAT_INFO2 = 0x07, + LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_INFO2, +}; + +enum { X86_PERF_KFREE_SHARED = 0, X86_PERF_KFREE_EXCL = 1, X86_PERF_KFREE_MAX @@ -190,7 +232,7 @@ struct cpu_hw_events { */ struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long dirty[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; int enabled; int n_events; /* the # of events in the below arrays */ @@ -198,6 +240,8 @@ struct cpu_hw_events { they've never been enabled yet */ int n_txn; /* the # 
last events in the below arrays; added in the current transaction */ + int n_txn_pair; + int n_txn_metric; int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ u64 tags[X86_PMC_IDX_MAX]; @@ -226,6 +270,10 @@ struct cpu_hw_events { u64 active_pebs_data_cfg; int pebs_record_size; + /* Intel Fixed counter configuration */ + u64 fixed_ctrl_val; + u64 active_fixed_ctrl_val; + /* * Intel LBR bits */ @@ -233,10 +281,15 @@ struct cpu_hw_events { int lbr_pebs_users; struct perf_branch_stack lbr_stack; struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; - struct er_account *lbr_sel; + union { + struct er_account *lbr_sel; + struct er_account *lbr_ctl; + }; u64 br_sel; - struct x86_perf_task_context *last_task_ctx; + void *last_task_ctx; int last_log_id; + int lbr_select; + void *lbr_xsave; /* * Intel host/guest exclude bits @@ -268,14 +321,24 @@ struct cpu_hw_events { u64 tfa_shadow; /* + * Perf Metrics + */ + /* number of accepted metrics events */ + int n_metric; + + /* * AMD specific bits */ struct amd_nb *amd_nb; + int brs_active; /* BRS is enabled */ + /* Inverted mask of bits to clear in the perf_ctr ctrl registers */ u64 perf_ctr_virt_mask; int n_pair; /* Large increment events */ void *kfree_on_online[X86_PERF_KFREE_MAX]; + + struct pmu *pmu; }; #define __EVENT_CONSTRAINT_RANGE(c, e, n, m, w, o, f) { \ @@ -359,6 +422,19 @@ struct cpu_hw_events { EVENT_CONSTRAINT(c, (1ULL << (32+n)), FIXED_EVENT_FLAGS) /* + * The special metric counters do not actually exist. They are calculated from + * the combination of the FxCtr3 + MSR_PERF_METRICS. + * + * The special metric counters are mapped to a dummy offset for the scheduler. + * The sharing between multiple users of the same metric without multiplexing + * is not allowed, even though the hardware supports that in principle. 
+ */ + +#define METRIC_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, (1ULL << (INTEL_PMC_IDX_METRIC_BASE + n)), \ + INTEL_ARCH_EVENT_MASK) + +/* * Constraint on the Event code + UMask */ #define INTEL_UEVENT_CONSTRAINT(c, n) \ @@ -380,10 +456,18 @@ struct cpu_hw_events { __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT) +#define INTEL_PSD_CONSTRAINT(c, n) \ + __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_STLAT) + #define INTEL_PST_CONSTRAINT(c, n) \ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST) +#define INTEL_HYBRID_LAT_CONSTRAINT(c, n) \ + __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LAT_HYBRID) + /* Event constraint, but match on all event flags too. */ #define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS) @@ -520,8 +604,9 @@ union perf_capabilities { */ u64 full_width_write:1; u64 pebs_baseline:1; - u64 pebs_metrics_available:1; + u64 perf_metrics:1; u64 pebs_output_pt_available:1; + u64 anythread_deprecated:1; }; u64 capabilities; }; @@ -561,6 +646,89 @@ enum { x86_lbr_exclusive_max, }; +#define PERF_PEBS_DATA_SOURCE_MAX 0x10 + +struct x86_hybrid_pmu { + struct pmu pmu; + const char *name; + u8 cpu_type; + cpumask_t supported_cpus; + union perf_capabilities intel_cap; + u64 intel_ctrl; + int max_pebs_events; + int num_counters; + int num_counters_fixed; + struct event_constraint unconstrained; + + u64 hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; + u64 hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; + struct event_constraint *event_constraints; + struct event_constraint *pebs_constraints; + struct extra_reg *extra_regs; + 
+ unsigned int late_ack :1, + mid_ack :1, + enabled_ack :1; + + u64 pebs_data_source[PERF_PEBS_DATA_SOURCE_MAX]; +}; + +static __always_inline struct x86_hybrid_pmu *hybrid_pmu(struct pmu *pmu) +{ + return container_of(pmu, struct x86_hybrid_pmu, pmu); +} + +extern struct static_key_false perf_is_hybrid; +#define is_hybrid() static_branch_unlikely(&perf_is_hybrid) + +#define hybrid(_pmu, _field) \ +(*({ \ + typeof(&x86_pmu._field) __Fp = &x86_pmu._field; \ + \ + if (is_hybrid() && (_pmu)) \ + __Fp = &hybrid_pmu(_pmu)->_field; \ + \ + __Fp; \ +})) + +#define hybrid_var(_pmu, _var) \ +(*({ \ + typeof(&_var) __Fp = &_var; \ + \ + if (is_hybrid() && (_pmu)) \ + __Fp = &hybrid_pmu(_pmu)->_var; \ + \ + __Fp; \ +})) + +#define hybrid_bit(_pmu, _field) \ +({ \ + bool __Fp = x86_pmu._field; \ + \ + if (is_hybrid() && (_pmu)) \ + __Fp = hybrid_pmu(_pmu)->_field; \ + \ + __Fp; \ +}) + +enum hybrid_pmu_type { + hybrid_big = 0x40, + hybrid_small = 0x20, + + hybrid_big_small = hybrid_big | hybrid_small, +}; + +#define X86_HYBRID_PMU_ATOM_IDX 0 +#define X86_HYBRID_PMU_CORE_IDX 1 + +#define X86_HYBRID_NUM_PMUS 2 + /* * struct x86_pmu - generic x86 pmu */ @@ -575,9 +743,12 @@ struct x86_pmu { void (*enable_all)(int added); void (*enable)(struct perf_event *); void (*disable)(struct perf_event *); + void (*assign)(struct perf_event *event, int idx); void (*add)(struct perf_event *); void (*del)(struct perf_event *); void (*read)(struct perf_event *event); + int (*set_period)(struct perf_event *event); + u64 (*update)(struct perf_event *event); int (*hw_config)(struct perf_event *event); int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); unsigned eventsel; @@ -613,12 +784,12 @@ struct x86_pmu { struct event_constraint *event_constraints; struct x86_pmu_quirk *quirks; - int perfctr_second_write; - u64 (*limit_period)(struct perf_event *event, u64 l); + void (*limit_period)(struct perf_event *event, s64 *l); /* PMI handler bits */ unsigned int late_ack :1, - 
counter_freezing :1; + mid_ack :1, + enabled_ack :1; /* * sysfs attrs */ @@ -659,32 +830,70 @@ struct x86_pmu { pebs_broken :1, pebs_prec_dist :1, pebs_no_tlb :1, - pebs_no_isolation :1; + pebs_no_isolation :1, + pebs_block :1, + pebs_ept :1; int pebs_record_size; int pebs_buffer_size; int max_pebs_events; - void (*drain_pebs)(struct pt_regs *regs); + void (*drain_pebs)(struct pt_regs *regs, struct perf_sample_data *data); struct event_constraint *pebs_constraints; void (*pebs_aliases)(struct perf_event *event); + u64 (*pebs_latency_data)(struct perf_event *event, u64 status); unsigned long large_pebs_flags; u64 rtm_abort_event; + u64 pebs_capable; /* * Intel LBR */ - unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ - int lbr_nr; /* hardware stack size */ - u64 lbr_sel_mask; /* LBR_SELECT valid bits */ - const int *lbr_sel_map; /* lbr_select mappings */ + unsigned int lbr_tos, lbr_from, lbr_to, + lbr_info, lbr_nr; /* LBR base regs and size */ + union { + u64 lbr_sel_mask; /* LBR_SELECT valid bits */ + u64 lbr_ctl_mask; /* LBR_CTL valid bits */ + }; + union { + const int *lbr_sel_map; /* lbr_select mappings */ + int *lbr_ctl_map; /* LBR_CTL mappings */ + }; bool lbr_double_abort; /* duplicated lbr aborts */ bool lbr_pt_coexist; /* (LBR|BTS) may coexist with PT */ + unsigned int lbr_has_info:1; + unsigned int lbr_has_tsx:1; + unsigned int lbr_from_flags:1; + unsigned int lbr_to_cycles:1; + + /* + * Intel Architectural LBR CPUID Enumeration + */ + unsigned int lbr_depth_mask:8; + unsigned int lbr_deep_c_reset:1; + unsigned int lbr_lip:1; + unsigned int lbr_cpl:1; + unsigned int lbr_filter:1; + unsigned int lbr_call_stack:1; + unsigned int lbr_mispred:1; + unsigned int lbr_timed_lbr:1; + unsigned int lbr_br_type:1; + + void (*lbr_reset)(void); + void (*lbr_read)(struct cpu_hw_events *cpuc); + void (*lbr_save)(void *ctx); + void (*lbr_restore)(void *ctx); + /* * Intel PT/LBR/BTS are exclusive */ atomic_t lbr_exclusive[x86_lbr_exclusive_max]; /* + * Intel 
perf metrics + */ + int num_topdown_events; + + /* * perf task context (i.e. struct perf_event_context::task_ctx_data) * switch helper to bridge calls from perf/core to perf/x86. * See struct pmu::swap_task_ctx() usage for examples; @@ -707,7 +916,7 @@ struct x86_pmu { /* * Intel host/guest support (KVM) */ - struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr); + struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr, void *data); /* * Check period value for PERF_EVENT_IOC_PERIOD ioctl. @@ -715,19 +924,61 @@ struct x86_pmu { int (*check_period) (struct perf_event *event, u64 period); int (*aux_output_match) (struct perf_event *event); + + int (*filter_match)(struct perf_event *event); + /* + * Hybrid support + * + * Most PMU capabilities are the same among different hybrid PMUs. + * The global x86_pmu saves the architecture capabilities, which + * are available for all PMUs. The hybrid_pmu only includes the + * unique capabilities. + */ + int num_hybrid_pmus; + struct x86_hybrid_pmu *hybrid_pmu; + u8 (*get_hybrid_cpu_type) (void); }; -struct x86_perf_task_context { - u64 lbr_from[MAX_LBR_ENTRIES]; - u64 lbr_to[MAX_LBR_ENTRIES]; - u64 lbr_info[MAX_LBR_ENTRIES]; - int tos; - int valid_lbrs; +struct x86_perf_task_context_opt { int lbr_callstack_users; int lbr_stack_state; int log_id; }; +struct x86_perf_task_context { + u64 lbr_sel; + int tos; + int valid_lbrs; + struct x86_perf_task_context_opt opt; + struct lbr_entry lbr[MAX_LBR_ENTRIES]; +}; + +struct x86_perf_task_context_arch_lbr { + struct x86_perf_task_context_opt opt; + struct lbr_entry entries[]; +}; + +/* + * Add padding to guarantee the 64-byte alignment of the state buffer. + * + * The structure is dynamically allocated. The size of the LBR state may vary + * based on the number of LBR registers. + * + * Do not put anything after the LBR state. 
+ */ +struct x86_perf_task_context_arch_lbr_xsave { + struct x86_perf_task_context_opt opt; + + union { + struct xregs_state xsave; + struct { + struct fxregs_state i387; + struct xstate_header header; + struct arch_lbr_state lbr; + } __attribute__ ((packed, aligned (XSAVE_ALIGNMENT))); + }; +}; + #define x86_add_quirk(func_) \ do { \ static struct x86_pmu_quirk __quirk __initdata = { \ @@ -747,6 +998,8 @@ do { \ #define PMU_FL_PEBS_ALL 0x10 /* all events are valid PEBS events */ #define PMU_FL_TFA 0x20 /* deal with TSX force abort */ #define PMU_FL_PAIR 0x40 /* merge counters for large incr. events */ +#define PMU_FL_INSTR_LATENCY 0x80 /* Support Instruction Latency in PEBS Memory Info Record */ +#define PMU_FL_MEM_LOADS_AUX 0x100 /* Require an auxiliary event for the complete memory info */ #define EVENT_VAR(_id) event_attr_##_id #define EVENT_PTR(_id) &event_attr_##_id.attr.attr @@ -773,9 +1026,36 @@ static struct perf_pmu_events_ht_attr event_attr_##v = { \ .event_str_ht = ht, \ } -struct pmu *x86_get_pmu(void); +#define EVENT_ATTR_STR_HYBRID(_name, v, str, _pmu) \ +static struct perf_pmu_events_hybrid_attr event_attr_##v = { \ + .attr = __ATTR(_name, 0444, events_hybrid_sysfs_show, NULL),\ + .id = 0, \ + .event_str = str, \ + .pmu_type = _pmu, \ +} + +#define FORMAT_HYBRID_PTR(_id) (&format_attr_hybrid_##_id.attr.attr) + +#define FORMAT_ATTR_HYBRID(_name, _pmu) \ +static struct perf_pmu_format_hybrid_attr format_attr_hybrid_##_name = {\ + .attr = __ATTR_RO(_name), \ + .pmu_type = _pmu, \ +} + +struct pmu *x86_get_pmu(unsigned int cpu); extern struct x86_pmu x86_pmu __read_mostly; +DECLARE_STATIC_CALL(x86_pmu_set_period, *x86_pmu.set_period); +DECLARE_STATIC_CALL(x86_pmu_update, *x86_pmu.update); + +static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx) +{ + if (static_cpu_has(X86_FEATURE_ARCH_LBR)) + return &((struct x86_perf_task_context_arch_lbr *)ctx)->opt; + + return &((struct x86_perf_task_context *)ctx)->opt; +} + static 
inline bool x86_pmu_has_lbr_callstack(void) { return x86_pmu.lbr_sel_map && @@ -783,6 +1063,7 @@ static inline bool x86_pmu_has_lbr_callstack(void) } DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events); +DECLARE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); int x86_perf_event_set_period(struct perf_event *event); @@ -824,6 +1105,9 @@ static inline int x86_pmu_rdpmc_index(int index) return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index; } +bool check_hw_exists(struct pmu *pmu, int num_counters, + int num_counters_fixed); + int x86_add_exclusive(unsigned int what); void x86_del_exclusive(unsigned int what); @@ -842,6 +1126,11 @@ int x86_pmu_hw_config(struct perf_event *event); void x86_pmu_disable_all(void); +static inline bool has_amd_brs(struct hw_perf_event *hwc) +{ + return hwc->flags & PERF_X86_EVENT_AMD_BRS; +} + static inline bool is_counter_pair(struct hw_perf_event *hwc) { return hwc->flags & PERF_X86_EVENT_PAIR; @@ -875,9 +1164,10 @@ void x86_pmu_stop(struct perf_event *event, int flags); static inline void x86_pmu_disable_event(struct perf_event *event) { + u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask); struct hw_perf_event *hwc = &event->hw; - wrmsrl(hwc->config_base, hwc->config); + wrmsrl(hwc->config_base, hwc->config & ~disable_mask); if (is_counter_pair(hwc)) wrmsrl(x86_pmu_config_addr(hwc->idx + 1), 0); @@ -887,6 +1177,11 @@ void x86_pmu_enable_event(struct perf_event *event); int x86_pmu_handle_irq(struct pt_regs *regs); +void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed, + u64 intel_ctrl); + +void x86_pmu_update_cpu_context(struct pmu *pmu, int cpu); + extern struct event_constraint emptyconstraint; extern struct event_constraint unconstrained; @@ -920,6 +1215,70 @@ static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip) regs->ip = ip; } +/* + * x86 control flow change classification + * x86 control flow changes include branches, interrupts, traps, faults + */ +enum { + X86_BR_NONE 
= 0, /* unknown */ + + X86_BR_USER = 1 << 0, /* branch target is user */ + X86_BR_KERNEL = 1 << 1, /* branch target is kernel */ + + X86_BR_CALL = 1 << 2, /* call */ + X86_BR_RET = 1 << 3, /* return */ + X86_BR_SYSCALL = 1 << 4, /* syscall */ + X86_BR_SYSRET = 1 << 5, /* syscall return */ + X86_BR_INT = 1 << 6, /* sw interrupt */ + X86_BR_IRET = 1 << 7, /* return from interrupt */ + X86_BR_JCC = 1 << 8, /* conditional */ + X86_BR_JMP = 1 << 9, /* jump */ + X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */ + X86_BR_IND_CALL = 1 << 11,/* indirect calls */ + X86_BR_ABORT = 1 << 12,/* transaction abort */ + X86_BR_IN_TX = 1 << 13,/* in transaction */ + X86_BR_NO_TX = 1 << 14,/* not in transaction */ + X86_BR_ZERO_CALL = 1 << 15,/* zero length call */ + X86_BR_CALL_STACK = 1 << 16,/* call stack */ + X86_BR_IND_JMP = 1 << 17,/* indirect jump */ + + X86_BR_TYPE_SAVE = 1 << 18,/* indicate to save branch type */ + +}; + +#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) +#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX) + +#define X86_BR_ANY \ + (X86_BR_CALL |\ + X86_BR_RET |\ + X86_BR_SYSCALL |\ + X86_BR_SYSRET |\ + X86_BR_INT |\ + X86_BR_IRET |\ + X86_BR_JCC |\ + X86_BR_JMP |\ + X86_BR_IRQ |\ + X86_BR_ABORT |\ + X86_BR_IND_CALL |\ + X86_BR_IND_JMP |\ + X86_BR_ZERO_CALL) + +#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) + +#define X86_BR_ANY_CALL \ + (X86_BR_CALL |\ + X86_BR_IND_CALL |\ + X86_BR_ZERO_CALL |\ + X86_BR_SYSCALL |\ + X86_BR_IRQ |\ + X86_BR_INT) + +int common_branch_type(int type); +int branch_type(unsigned long from, unsigned long to, int abort); +int branch_type_fused(unsigned long from, unsigned long to, int abort, + int *offset); + ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event); ssize_t intel_event_sysfs_show(char *page, u64 config); @@ -927,11 +1286,103 @@ ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page); ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr, char 
*page); +ssize_t events_hybrid_sysfs_show(struct device *dev, + struct device_attribute *attr, + char *page); + +static inline bool fixed_counter_disabled(int i, struct pmu *pmu) +{ + u64 intel_ctrl = hybrid(pmu, intel_ctrl); + + return !(intel_ctrl >> (i + INTEL_PMC_IDX_FIXED)); +} #ifdef CONFIG_CPU_SUP_AMD int amd_pmu_init(void); +int amd_pmu_lbr_init(void); +void amd_pmu_lbr_reset(void); +void amd_pmu_lbr_read(void); +void amd_pmu_lbr_add(struct perf_event *event); +void amd_pmu_lbr_del(struct perf_event *event); +void amd_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); +void amd_pmu_lbr_enable_all(void); +void amd_pmu_lbr_disable_all(void); +int amd_pmu_lbr_hw_config(struct perf_event *event); + +#ifdef CONFIG_PERF_EVENTS_AMD_BRS + +#define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */ + +int amd_brs_init(void); +void amd_brs_disable(void); +void amd_brs_enable(void); +void amd_brs_enable_all(void); +void amd_brs_disable_all(void); +void amd_brs_drain(void); +void amd_brs_lopwr_init(void); +void amd_brs_disable_all(void); +int amd_brs_hw_config(struct perf_event *event); +void amd_brs_reset(void); + +static inline void amd_pmu_brs_add(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + perf_sched_cb_inc(event->ctx->pmu); + cpuc->lbr_users++; + /* + * No need to reset BRS because it is reset + * on brs_enable() and it is saturating + */ +} + +static inline void amd_pmu_brs_del(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + cpuc->lbr_users--; + WARN_ON_ONCE(cpuc->lbr_users < 0); + + perf_sched_cb_dec(event->ctx->pmu); +} + +void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in); +#else +static inline int amd_brs_init(void) +{ + return 0; +} +static inline void amd_brs_disable(void) {} +static inline void amd_brs_enable(void) {} +static inline void amd_brs_drain(void) {} +static inline void amd_brs_lopwr_init(void) {} 
+static inline void amd_brs_disable_all(void) {} +static inline int amd_brs_hw_config(struct perf_event *event) +{ + return 0; +} +static inline void amd_brs_reset(void) {} + +static inline void amd_pmu_brs_add(struct perf_event *event) +{ +} + +static inline void amd_pmu_brs_del(struct perf_event *event) +{ +} + +static inline void amd_pmu_brs_sched_task(struct perf_event_context *ctx, bool sched_in) +{ +} + +static inline void amd_brs_enable_all(void) +{ +} + +#endif + #else /* CONFIG_CPU_SUP_AMD */ static inline int amd_pmu_init(void) @@ -939,6 +1390,22 @@ static inline int amd_pmu_init(void) return 0; } +static inline int amd_brs_init(void) +{ + return -EOPNOTSUPP; +} + +static inline void amd_brs_drain(void) +{ +} + +static inline void amd_brs_enable_all(void) +{ +} + +static inline void amd_brs_disable_all(void) +{ +} #endif /* CONFIG_CPU_SUP_AMD */ static inline int is_pebs_pt(struct perf_event *event) @@ -969,6 +1436,25 @@ static inline bool intel_pmu_has_bts(struct perf_event *event) return intel_pmu_has_bts_period(event, hwc->sample_period); } +static __always_inline void __intel_pmu_pebs_disable_all(void) +{ + wrmsrl(MSR_IA32_PEBS_ENABLE, 0); +} + +static __always_inline void __intel_pmu_arch_lbr_disable(void) +{ + wrmsrl(MSR_ARCH_LBR_CTL, 0); +} + +static __always_inline void __intel_pmu_lbr_disable(void) +{ + u64 debugctl; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); +} + int intel_pmu_save_and_restart(struct perf_event *event); struct event_constraint * @@ -988,7 +1474,12 @@ void release_ds_buffers(void); void reserve_ds_buffers(void); +void release_lbr_buffers(void); + +void reserve_lbr_buffers(void); + extern struct event_constraint bts_constraint; +extern struct event_constraint vlbr_constraint; void intel_pmu_enable_bts(u64 config); @@ -996,6 +1487,8 @@ void intel_pmu_disable_bts(void); int intel_pmu_drain_bts_buffer(void); +u64 
adl_latency_data_small(struct perf_event *event, u64 status); + extern struct event_constraint intel_core2_pebs_event_constraints[]; extern struct event_constraint intel_atom_pebs_event_constraints[]; @@ -1006,6 +1499,8 @@ extern struct event_constraint intel_glm_pebs_event_constraints[]; extern struct event_constraint intel_glp_pebs_event_constraints[]; +extern struct event_constraint intel_grt_pebs_event_constraints[]; + extern struct event_constraint intel_nehalem_pebs_event_constraints[]; extern struct event_constraint intel_westmere_pebs_event_constraints[]; @@ -1022,6 +1517,8 @@ extern struct event_constraint intel_skl_pebs_event_constraints[]; extern struct event_constraint intel_icl_pebs_event_constraints[]; +extern struct event_constraint intel_spr_pebs_event_constraints[]; + struct event_constraint *intel_pebs_constraints(struct perf_event *event); void intel_pmu_pebs_add(struct perf_event *event); @@ -1040,7 +1537,7 @@ void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in); void intel_pmu_auto_reload_read(struct perf_event *event); -void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr); +void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr); void intel_ds_init(void); @@ -1053,6 +1550,10 @@ u64 lbr_from_signext_quirk_wr(u64 val); void intel_pmu_lbr_reset(void); +void intel_pmu_lbr_reset_32(void); + +void intel_pmu_lbr_reset_64(void); + void intel_pmu_lbr_add(struct perf_event *event); void intel_pmu_lbr_del(struct perf_event *event); @@ -1063,6 +1564,14 @@ void intel_pmu_lbr_disable_all(void); void intel_pmu_lbr_read(void); +void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc); + +void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc); + +void intel_pmu_lbr_save(void *ctx); + +void intel_pmu_lbr_restore(void *ctx); + void intel_pmu_lbr_init_core(void); void intel_pmu_lbr_init_nhm(void); @@ -1079,10 +1588,18 @@ void intel_pmu_lbr_init_skl(void); void intel_pmu_lbr_init_knl(void); +void intel_pmu_lbr_init(void); + +void 
intel_pmu_arch_lbr_init(void); + void intel_pmu_pebs_data_source_nhm(void); void intel_pmu_pebs_data_source_skl(bool pmem); +void intel_pmu_pebs_data_source_adl(void); + +void intel_pmu_pebs_data_source_grt(void); + int intel_pmu_setup_lbr_filter(struct perf_event *event); void intel_pt_interrupt(void); @@ -1114,6 +1631,14 @@ static inline void release_ds_buffers(void) { } +static inline void release_lbr_buffers(void) +{ +} + +static inline void reserve_lbr_buffers(void) +{ +} + static inline int intel_pmu_init(void) { return 0; @@ -1133,3 +1658,12 @@ static inline int is_ht_workaround_enabled(void) return 0; } #endif /* CONFIG_CPU_SUP_INTEL */ + +#if ((defined CONFIG_CPU_SUP_CENTAUR) || (defined CONFIG_CPU_SUP_ZHAOXIN)) +int zhaoxin_pmu_init(void); +#else +static inline int zhaoxin_pmu_init(void) +{ + return 0; +} +#endif /*CONFIG_CPU_SUP_CENTAUR or CONFIG_CPU_SUP_ZHAOXIN*/ diff --git a/arch/x86/events/perf_event_flags.h b/arch/x86/events/perf_event_flags.h new file mode 100644 index 000000000000..1dc19b9b4426 --- /dev/null +++ b/arch/x86/events/perf_event_flags.h @@ -0,0 +1,22 @@ + +/* + * struct hw_perf_event.flags flags + */ +PERF_ARCH(PEBS_LDLAT, 0x00001) /* ld+ldlat data address sampling */ +PERF_ARCH(PEBS_ST, 0x00002) /* st data address sampling */ +PERF_ARCH(PEBS_ST_HSW, 0x00004) /* haswell style datala, store */ +PERF_ARCH(PEBS_LD_HSW, 0x00008) /* haswell style datala, load */ +PERF_ARCH(PEBS_NA_HSW, 0x00010) /* haswell style datala, unknown */ +PERF_ARCH(EXCL, 0x00020) /* HT exclusivity on counter */ +PERF_ARCH(DYNAMIC, 0x00040) /* dynamic alloc'd constraint */ + /* 0x00080 */ +PERF_ARCH(EXCL_ACCT, 0x00100) /* accounted EXCL event */ +PERF_ARCH(AUTO_RELOAD, 0x00200) /* use PEBS auto-reload */ +PERF_ARCH(LARGE_PEBS, 0x00400) /* use large PEBS */ +PERF_ARCH(PEBS_VIA_PT, 0x00800) /* use PT buffer for PEBS */ +PERF_ARCH(PAIR, 0x01000) /* Large Increment per Cycle */ +PERF_ARCH(LBR_SELECT, 0x02000) /* Save/Restore MSR_LBR_SELECT */ +PERF_ARCH(TOPDOWN, 0x04000) 
/* Count Topdown slots/metrics events */ +PERF_ARCH(PEBS_STLAT, 0x08000) /* st+stlat data address sampling */ +PERF_ARCH(AMD_BRS, 0x10000) /* AMD Branch Sampling */ +PERF_ARCH(PEBS_LAT_HYBRID, 0x20000) /* ld and st lat for hybrid */ diff --git a/arch/x86/events/probe.c b/arch/x86/events/probe.c index c2ede2f3b277..600bf8d15c0c 100644 --- a/arch/x86/events/probe.c +++ b/arch/x86/events/probe.c @@ -10,6 +10,11 @@ not_visible(struct kobject *kobj, struct attribute *attr, int i) return 0; } +/* + * Accepts msr[] array with non populated entries as long as either + * msr[i].msr is 0 or msr[i].grp is NULL. Note that the default sysfs + * visibility is visible when group->is_visible callback is set. + */ unsigned long perf_msr_probe(struct perf_msr *msr, int cnt, bool zero, void *data) { @@ -23,16 +28,29 @@ perf_msr_probe(struct perf_msr *msr, int cnt, bool zero, void *data) for (bit = 0; bit < cnt; bit++) { if (!msr[bit].no_check) { struct attribute_group *grp = msr[bit].grp; + u64 mask; + + /* skip entry with no group */ + if (!grp) + continue; grp->is_visible = not_visible; + /* skip unpopulated entry */ + if (!msr[bit].msr) + continue; + if (msr[bit].test && !msr[bit].test(bit, data)) continue; /* Virt sucks; you cannot tell if a R/O MSR is present :/ */ if (rdmsrl_safe(msr[bit].msr, &val)) continue; + + mask = msr[bit].mask; + if (!mask) + mask = ~0ULL; /* Disable zero counters if requested. 
*/ - if (!zero && !val) + if (!zero && !(val & mask)) continue; grp->is_visible = NULL; diff --git a/arch/x86/events/probe.h b/arch/x86/events/probe.h index 4c8e0afc5fb5..261b9bda24e3 100644 --- a/arch/x86/events/probe.h +++ b/arch/x86/events/probe.h @@ -4,10 +4,11 @@ #include <linux/sysfs.h> struct perf_msr { - u64 msr; - struct attribute_group *grp; + u64 msr; + struct attribute_group *grp; bool (*test)(int idx, void *data); - bool no_check; + bool no_check; + u64 mask; }; unsigned long diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/rapl.c index 09913121e726..a829492bca4c 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/rapl.c @@ -1,11 +1,14 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Support Intel RAPL energy consumption counters + * Support Intel/AMD RAPL energy consumption counters * Copyright (C) 2013 Google, Inc., Stephane Eranian * * Intel RAPL interface is specified in the IA-32 Manual Vol3b * section 14.7.1 (September 2013) * + * AMD RAPL interface for Fam17h is described in the public PPR: + * https://bugzilla.kernel.org/show_bug.cgi?id=206537 + * * RAPL provides more controls than just reporting energy consumption * however here we only expose the 3 energy consumption free running * counters (pp0, pkg, dram). 
@@ -58,8 +61,8 @@ #include <linux/nospec.h> #include <asm/cpu_device_id.h> #include <asm/intel-family.h> -#include "../perf_event.h" -#include "../probe.h" +#include "perf_event.h" +#include "probe.h" MODULE_LICENSE("GPL"); @@ -90,18 +93,6 @@ static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { * any other bit is reserved */ #define RAPL_EVENT_MASK 0xFFULL - -#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format) \ -static ssize_t __rapl_##_var##_show(struct kobject *kobj, \ - struct kobj_attribute *attr, \ - char *page) \ -{ \ - BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ - return sprintf(page, _format "\n"); \ -} \ -static struct kobj_attribute format_attr_##_var = \ - __ATTR(_name, 0444, __rapl_##_var##_show, NULL) - #define RAPL_CNTR_WIDTH 32 #define RAPL_EVENT_ATTR_STR(_name, v, str) \ @@ -127,9 +118,17 @@ struct rapl_pmus { struct rapl_pmu *pmus[]; }; +enum rapl_unit_quirk { + RAPL_UNIT_QUIRK_NONE, + RAPL_UNIT_QUIRK_INTEL_HSW, + RAPL_UNIT_QUIRK_INTEL_SPR, +}; + struct rapl_model { + struct perf_msr *rapl_msrs; unsigned long events; - bool apply_quirk; + unsigned int msr_power_unit; + enum rapl_unit_quirk unit_quirk; }; /* 1/2^hw_unit Joule */ @@ -138,7 +137,7 @@ static struct rapl_pmus *rapl_pmus; static cpumask_t rapl_cpu_mask; static unsigned int rapl_cntr_mask; static u64 rapl_timer_ms; -static struct perf_msr rapl_msrs[]; +static struct perf_msr *rapl_msrs; static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) { @@ -430,7 +429,7 @@ static struct attribute_group rapl_pmu_events_group = { .attrs = attrs_empty, }; -DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7"); +PMU_FORMAT_ATTR(event, "config:0-7"); static struct attribute *rapl_formats_attr[] = { &format_attr_event.attr, NULL, @@ -513,12 +512,36 @@ static bool test_msr(int idx, void *data) return test_bit(idx, (unsigned long *) data); } -static struct perf_msr rapl_msrs[] = { - [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr }, - 
[PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr }, - [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr }, - [PERF_RAPL_PP1] = { MSR_PP1_ENERGY_STATUS, &rapl_events_gpu_group, test_msr }, - [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr }, +/* Only lower 32bits of the MSR represents the energy counter */ +#define RAPL_MSR_MASK 0xFFFFFFFF + +static struct perf_msr intel_rapl_msrs[] = { + [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_PP1] = { MSR_PP1_ENERGY_STATUS, &rapl_events_gpu_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr, false, RAPL_MSR_MASK }, +}; + +static struct perf_msr intel_rapl_spr_msrs[] = { + [PERF_RAPL_PP0] = { MSR_PP0_ENERGY_STATUS, &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_PKG] = { MSR_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_RAM] = { MSR_DRAM_ENERGY_STATUS, &rapl_events_ram_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_PP1] = { MSR_PP1_ENERGY_STATUS, &rapl_events_gpu_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr, true, RAPL_MSR_MASK }, +}; + +/* + * Force to PERF_RAPL_MAX size due to: + * - perf_msr_probe(PERF_RAPL_MAX) + * - want to use same event codes across both architectures + */ +static struct perf_msr amd_rapl_msrs[] = { + [PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, 0, false, 0 }, + [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_RAM] = { 0, 
&rapl_events_ram_group, 0, false, 0 }, + [PERF_RAPL_PP1] = { 0, &rapl_events_gpu_group, 0, false, 0 }, + [PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, 0, false, 0 }, }; static int rapl_cpu_offline(unsigned int cpu) @@ -575,25 +598,35 @@ static int rapl_cpu_online(unsigned int cpu) return 0; } -static int rapl_check_hw_unit(bool apply_quirk) +static int rapl_check_hw_unit(struct rapl_model *rm) { u64 msr_rapl_power_unit_bits; int i; /* protect rdmsrl() to handle virtualization */ - if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits)) + if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits)) return -1; for (i = 0; i < NR_RAPL_DOMAINS; i++) rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; + switch (rm->unit_quirk) { /* * DRAM domain on HSW server and KNL has fixed energy unit which can be * different than the unit from power unit MSR. See * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2 * of 2. Datasheet, September 2014, Reference Number: 330784-001 " */ - if (apply_quirk) + case RAPL_UNIT_QUIRK_INTEL_HSW: rapl_hw_unit[PERF_RAPL_RAM] = 16; + break; + /* SPR uses a fixed energy unit for Psys domain. 
*/ + case RAPL_UNIT_QUIRK_INTEL_SPR: + rapl_hw_unit[PERF_RAPL_PSYS] = 0; + break; + default: + break; + } + /* * Calculate the timer rate: @@ -639,7 +672,7 @@ static const struct attribute_group *rapl_attr_update[] = { &rapl_events_pkg_group, &rapl_events_ram_group, &rapl_events_gpu_group, - &rapl_events_gpu_group, + &rapl_events_psys_group, NULL, }; @@ -668,21 +701,20 @@ static int __init init_rapl_pmus(void) return 0; } -#define X86_RAPL_MODEL_MATCH(model, init) \ - { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&init } - static struct rapl_model model_snb = { .events = BIT(PERF_RAPL_PP0) | BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_PP1), - .apply_quirk = false, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static struct rapl_model model_snbep = { .events = BIT(PERF_RAPL_PP0) | BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM), - .apply_quirk = false, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static struct rapl_model model_hsw = { @@ -690,20 +722,25 @@ static struct rapl_model model_hsw = { BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM) | BIT(PERF_RAPL_PP1), - .apply_quirk = false, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static struct rapl_model model_hsx = { .events = BIT(PERF_RAPL_PP0) | BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM), - .apply_quirk = true, + .unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static struct rapl_model model_knl = { .events = BIT(PERF_RAPL_PKG) | BIT(PERF_RAPL_RAM), - .apply_quirk = true, + .unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, }; static struct rapl_model model_skl = { @@ -712,40 +749,66 @@ static struct rapl_model model_skl = { BIT(PERF_RAPL_RAM) | BIT(PERF_RAPL_PP1) | BIT(PERF_RAPL_PSYS), - .apply_quirk = false, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_msrs, +}; + +static struct 
rapl_model model_spr = { + .events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM) | + BIT(PERF_RAPL_PSYS), + .unit_quirk = RAPL_UNIT_QUIRK_INTEL_SPR, + .msr_power_unit = MSR_RAPL_POWER_UNIT, + .rapl_msrs = intel_rapl_spr_msrs, +}; + +static struct rapl_model model_amd_hygon = { + .events = BIT(PERF_RAPL_PKG), + .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT, + .rapl_msrs = amd_rapl_msrs, }; static const struct x86_cpu_id rapl_model_match[] __initconst = { - X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE, model_snb), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE_X, model_snbep), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE, model_snb), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE_X, model_snbep), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL, model_hsw), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_X, model_hsx), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_L, model_hsw), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_G, model_hsw), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL, model_hsw), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_G, model_hsw), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, model_hsx), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_D, model_hsx), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, model_knl), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNM, model_knl), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_L, model_skl), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE, model_skl), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, model_hsx), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_L, model_skl), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE, model_skl), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_CANNONLAKE_L, model_skl), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT, model_hsw), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_D, model_hsw), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_PLUS, model_hsw), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE_L, model_skl), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE, model_skl), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_COMETLAKE_L, model_skl), 
- X86_RAPL_MODEL_MATCH(INTEL_FAM6_COMETLAKE, model_skl), + X86_MATCH_FEATURE(X86_FEATURE_RAPL, &model_amd_hygon), + X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE, &model_snb), + X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X, &model_snbep), + X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE, &model_snb), + X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X, &model_snbep), + X86_MATCH_INTEL_FAM6_MODEL(HASWELL, &model_hsw), + X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X, &model_hsx), + X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L, &model_hsw), + X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G, &model_hsw), + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL, &model_hsw), + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G, &model_hsw), + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X, &model_hsx), + X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D, &model_hsx), + X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &model_knl), + X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &model_knl), + X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &model_hsx), + X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(CANNONLAKE_L, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT, &model_hsw), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D, &model_hsw), + X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS, &model_hsw), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &model_hsx), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &model_hsx), + X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &model_spr), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &model_skl), + 
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &model_skl), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &model_skl), {}, }; - MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); static int __init rapl_pmu_init(void) @@ -759,10 +822,13 @@ static int __init rapl_pmu_init(void) return -ENODEV; rm = (struct rapl_model *) id->driver_data; + + rapl_msrs = rm->rapl_msrs; + rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX, false, (void *) &rm->events); - ret = rapl_check_hw_unit(rm->apply_quirk); + ret = rapl_check_hw_unit(rm); if (ret) return ret; diff --git a/arch/x86/events/utils.c b/arch/x86/events/utils.c new file mode 100644 index 000000000000..76b1f8bb0fd5 --- /dev/null +++ b/arch/x86/events/utils.c @@ -0,0 +1,251 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <asm/insn.h> + +#include "perf_event.h" + +static int decode_branch_type(struct insn *insn) +{ + int ext; + + if (insn_get_opcode(insn)) + return X86_BR_ABORT; + + switch (insn->opcode.bytes[0]) { + case 0xf: + switch (insn->opcode.bytes[1]) { + case 0x05: /* syscall */ + case 0x34: /* sysenter */ + return X86_BR_SYSCALL; + case 0x07: /* sysret */ + case 0x35: /* sysexit */ + return X86_BR_SYSRET; + case 0x80 ... 0x8f: /* conditional */ + return X86_BR_JCC; + } + return X86_BR_NONE; + case 0x70 ... 0x7f: /* conditional */ + return X86_BR_JCC; + case 0xc2: /* near ret */ + case 0xc3: /* near ret */ + case 0xca: /* far ret */ + case 0xcb: /* far ret */ + return X86_BR_RET; + case 0xcf: /* iret */ + return X86_BR_IRET; + case 0xcc ... 0xce: /* int */ + return X86_BR_INT; + case 0xe8: /* call near rel */ + if (insn_get_immediate(insn) || insn->immediate1.value == 0) { + /* zero length call */ + return X86_BR_ZERO_CALL; + } + fallthrough; + case 0x9a: /* call far absolute */ + return X86_BR_CALL; + case 0xe0 ... 0xe3: /* loop jmp */ + return X86_BR_JCC; + case 0xe9 ... 
0xeb: /* jmp */ + return X86_BR_JMP; + case 0xff: /* call near absolute, call far absolute ind */ + if (insn_get_modrm(insn)) + return X86_BR_ABORT; + + ext = (insn->modrm.bytes[0] >> 3) & 0x7; + switch (ext) { + case 2: /* near ind call */ + case 3: /* far ind call */ + return X86_BR_IND_CALL; + case 4: + case 5: + return X86_BR_IND_JMP; + } + return X86_BR_NONE; + } + + return X86_BR_NONE; +} + +/* + * return the type of control flow change at address "from" + * instruction is not necessarily a branch (in case of interrupt). + * + * The branch type returned also includes the priv level of the + * target of the control flow change (X86_BR_USER, X86_BR_KERNEL). + * + * If a branch type is unknown OR the instruction cannot be + * decoded (e.g., text page not present), then X86_BR_NONE is + * returned. + * + * While recording branches, some processors can report the "from" + * address to be that of an instruction preceding the actual branch + * when instruction fusion occurs. If fusion is expected, attempt to + * find the type of the first branch instruction within the next + * MAX_INSN_SIZE bytes and if found, provide the offset between the + * reported "from" address and the actual branch instruction address. + */ +static int get_branch_type(unsigned long from, unsigned long to, int abort, + bool fused, int *offset) +{ + struct insn insn; + void *addr; + int bytes_read, bytes_left, insn_offset; + int ret = X86_BR_NONE; + int to_plm, from_plm; + u8 buf[MAX_INSN_SIZE]; + int is64 = 0; + + /* make sure we initialize offset */ + if (offset) + *offset = 0; + + to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER; + from_plm = kernel_ip(from) ? 
X86_BR_KERNEL : X86_BR_USER; + + /* + * maybe zero if lbr did not fill up after a reset by the time + * we get a PMU interrupt + */ + if (from == 0 || to == 0) + return X86_BR_NONE; + + if (abort) + return X86_BR_ABORT | to_plm; + + if (from_plm == X86_BR_USER) { + /* + * can happen if measuring at the user level only + * and we interrupt in a kernel thread, e.g., idle. + */ + if (!current->mm) + return X86_BR_NONE; + + /* may fail if text not present */ + bytes_left = copy_from_user_nmi(buf, (void __user *)from, + MAX_INSN_SIZE); + bytes_read = MAX_INSN_SIZE - bytes_left; + if (!bytes_read) + return X86_BR_NONE; + + addr = buf; + } else { + /* + * The LBR logs any address in the IP, even if the IP just + * faulted. This means userspace can control the from address. + * Ensure we don't blindly read any address by validating it is + * a known text address. + */ + if (kernel_text_address(from)) { + addr = (void *)from; + /* + * Assume we can get the maximum possible size + * when grabbing kernel data. This is not + * _strictly_ true since we could possibly be + * executing up next to a memory hole, but + * it is very unlikely to be a problem. 
+ */ + bytes_read = MAX_INSN_SIZE; + } else { + return X86_BR_NONE; + } + } + + /* + * decoder needs to know the ABI especially + * on 64-bit systems running 32-bit apps + */ +#ifdef CONFIG_X86_64 + is64 = kernel_ip((unsigned long)addr) || any_64bit_mode(current_pt_regs()); +#endif + insn_init(&insn, addr, bytes_read, is64); + ret = decode_branch_type(&insn); + insn_offset = 0; + + /* Check for the possibility of branch fusion */ + while (fused && ret == X86_BR_NONE) { + /* Check for decoding errors */ + if (insn_get_length(&insn) || !insn.length) + break; + + insn_offset += insn.length; + bytes_read -= insn.length; + if (bytes_read < 0) + break; + + insn_init(&insn, addr + insn_offset, bytes_read, is64); + ret = decode_branch_type(&insn); + } + + if (offset) + *offset = insn_offset; + + /* + * interrupts, traps, faults (and thus ring transition) may + * occur on any instructions. Thus, to classify them correctly, + * we need to first look at the from and to priv levels. If they + * are different and to is in the kernel, then it indicates + * a ring transition. If the from instruction is not a ring + * transition instr (syscall, systenter, int), then it means + * it was a irq, trap or fault. + * + * we have no way of detecting kernel to kernel faults. 
+ */ + if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL + && ret != X86_BR_SYSCALL && ret != X86_BR_INT) + ret = X86_BR_IRQ; + + /* + * branch priv level determined by target as + * is done by HW when LBR_SELECT is implemented + */ + if (ret != X86_BR_NONE) + ret |= to_plm; + + return ret; +} + +int branch_type(unsigned long from, unsigned long to, int abort) +{ + return get_branch_type(from, to, abort, false, NULL); +} + +int branch_type_fused(unsigned long from, unsigned long to, int abort, + int *offset) +{ + return get_branch_type(from, to, abort, true, offset); +} + +#define X86_BR_TYPE_MAP_MAX 16 + +static int branch_map[X86_BR_TYPE_MAP_MAX] = { + PERF_BR_CALL, /* X86_BR_CALL */ + PERF_BR_RET, /* X86_BR_RET */ + PERF_BR_SYSCALL, /* X86_BR_SYSCALL */ + PERF_BR_SYSRET, /* X86_BR_SYSRET */ + PERF_BR_UNKNOWN, /* X86_BR_INT */ + PERF_BR_ERET, /* X86_BR_IRET */ + PERF_BR_COND, /* X86_BR_JCC */ + PERF_BR_UNCOND, /* X86_BR_JMP */ + PERF_BR_IRQ, /* X86_BR_IRQ */ + PERF_BR_IND_CALL, /* X86_BR_IND_CALL */ + PERF_BR_UNKNOWN, /* X86_BR_ABORT */ + PERF_BR_UNKNOWN, /* X86_BR_IN_TX */ + PERF_BR_NO_TX, /* X86_BR_NO_TX */ + PERF_BR_CALL, /* X86_BR_ZERO_CALL */ + PERF_BR_UNKNOWN, /* X86_BR_CALL_STACK */ + PERF_BR_IND, /* X86_BR_IND_JMP */ +}; + +int common_branch_type(int type) +{ + int i; + + type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */ + + if (type) { + i = __ffs(type); + if (i < X86_BR_TYPE_MAP_MAX) + return branch_map[i]; + } + + return PERF_BR_UNKNOWN; +} diff --git a/arch/x86/events/zhaoxin/Makefile b/arch/x86/events/zhaoxin/Makefile new file mode 100644 index 000000000000..642c1174d662 --- /dev/null +++ b/arch/x86/events/zhaoxin/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-y += core.o diff --git a/arch/x86/events/zhaoxin/core.c b/arch/x86/events/zhaoxin/core.c new file mode 100644 index 000000000000..949d845c922b --- /dev/null +++ b/arch/x86/events/zhaoxin/core.c @@ -0,0 +1,613 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * 
Zhaoxin PMU; like Intel Architectural PerfMon-v2 + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/stddef.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/export.h> +#include <linux/nmi.h> + +#include <asm/cpufeature.h> +#include <asm/hardirq.h> +#include <asm/apic.h> + +#include "../perf_event.h" + +/* + * Zhaoxin PerfMon, used on zxc and later. + */ +static u64 zx_pmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = { + + [PERF_COUNT_HW_CPU_CYCLES] = 0x0082, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0515, + [PERF_COUNT_HW_CACHE_MISSES] = 0x051a, + [PERF_COUNT_HW_BUS_CYCLES] = 0x0083, +}; + +static struct event_constraint zxc_event_constraints[] __read_mostly = { + + FIXED_EVENT_CONSTRAINT(0x0082, 1), /* unhalted core clock cycles */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint zxd_event_constraints[] __read_mostly = { + + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* retired instructions */ + FIXED_EVENT_CONSTRAINT(0x0082, 1), /* unhalted core clock cycles */ + FIXED_EVENT_CONSTRAINT(0x0083, 2), /* unhalted bus clock cycles */ + EVENT_CONSTRAINT_END +}; + +static __initconst const u64 zxd_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { +[C(L1D)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0042, + [C(RESULT_MISS)] = 0x0538, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x0043, + [C(RESULT_MISS)] = 0x0562, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(L1I)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0300, + [C(RESULT_MISS)] = 0x0301, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x030a, + [C(RESULT_MISS)] = 0x030b, + }, +}, +[C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] 
= -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(DTLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0042, + [C(RESULT_MISS)] = 0x052c, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x0043, + [C(RESULT_MISS)] = 0x0530, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0564, + [C(RESULT_MISS)] = 0x0565, + }, +}, +[C(ITLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x00c0, + [C(RESULT_MISS)] = 0x0534, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(BPU)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0700, + [C(RESULT_MISS)] = 0x0709, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(NODE)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +}; + +static __initconst const u64 zxe_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { +[C(L1D)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0568, + [C(RESULT_MISS)] = 0x054b, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x0669, + [C(RESULT_MISS)] = 0x0562, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(L1I)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0300, + [C(RESULT_MISS)] = 0x0301, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x030a, + [C(RESULT_MISS)] = 0x030b, + }, +}, +[C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0, + [C(RESULT_MISS)] = 0x0, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x0, + 
[C(RESULT_MISS)] = 0x0, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0, + [C(RESULT_MISS)] = 0x0, + }, +}, +[C(DTLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0568, + [C(RESULT_MISS)] = 0x052c, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0x0669, + [C(RESULT_MISS)] = 0x0530, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0564, + [C(RESULT_MISS)] = 0x0565, + }, +}, +[C(ITLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x00c0, + [C(RESULT_MISS)] = 0x0534, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(BPU)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0028, + [C(RESULT_MISS)] = 0x0029, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(NODE)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +}; + +static void zhaoxin_pmu_disable_all(void) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); +} + +static void zhaoxin_pmu_enable_all(int added) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); +} + +static inline u64 zhaoxin_pmu_get_status(void) +{ + u64 status; + + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + + return status; +} + +static inline void zhaoxin_pmu_ack_status(u64 ack) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); +} + +static inline void zxc_pmu_ack_status(u64 ack) +{ + /* + * ZXC needs global control enabled in order to clear status bits. 
+ */ + zhaoxin_pmu_enable_all(0); + zhaoxin_pmu_ack_status(ack); + zhaoxin_pmu_disable_all(); +} + +static void zhaoxin_pmu_disable_fixed(struct hw_perf_event *hwc) +{ + int idx = hwc->idx - INTEL_PMC_IDX_FIXED; + u64 ctrl_val, mask; + + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + wrmsrl(hwc->config_base, ctrl_val); +} + +static void zhaoxin_pmu_disable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + zhaoxin_pmu_disable_fixed(hwc); + return; + } + + x86_pmu_disable_event(event); +} + +static void zhaoxin_pmu_enable_fixed(struct hw_perf_event *hwc) +{ + int idx = hwc->idx - INTEL_PMC_IDX_FIXED; + u64 ctrl_val, bits, mask; + + /* + * Enable IRQ generation (0x8), + * and enable ring-3 counting (0x2) and ring-0 counting (0x1) + * if requested: + */ + bits = 0x8ULL; + if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) + bits |= 0x2; + if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) + bits |= 0x1; + + bits <<= (idx * 4); + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + ctrl_val |= bits; + wrmsrl(hwc->config_base, ctrl_val); +} + +static void zhaoxin_pmu_enable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + zhaoxin_pmu_enable_fixed(hwc); + return; + } + + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); +} + +/* + * This handler is triggered by the local APIC, so the APIC IRQ handling + * rules apply: + */ +static int zhaoxin_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + int handled = 0; + u64 status; + int bit; + + cpuc = this_cpu_ptr(&cpu_hw_events); + apic_write(APIC_LVTPC, APIC_DM_NMI); + zhaoxin_pmu_disable_all(); + status = zhaoxin_pmu_get_status(); + if (!status) + goto done; + +again: + if (x86_pmu.enabled_ack) + 
zxc_pmu_ack_status(status); + else + zhaoxin_pmu_ack_status(status); + + inc_irq_stat(apic_perf_irqs); + + /* + * CondChgd bit 63 doesn't mean any overflow status. Ignore + * and clear the bit. + */ + if (__test_and_clear_bit(63, (unsigned long *)&status)) { + if (!status) + goto done; + } + + for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { + struct perf_event *event = cpuc->events[bit]; + + handled++; + + if (!test_bit(bit, cpuc->active_mask)) + continue; + + x86_perf_event_update(event); + perf_sample_data_init(&data, 0, event->hw.last_period); + + if (!x86_perf_event_set_period(event)) + continue; + + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); + } + + /* + * Repeat if there is more work to be done: + */ + status = zhaoxin_pmu_get_status(); + if (status) + goto again; + +done: + zhaoxin_pmu_enable_all(0); + return handled; +} + +static u64 zhaoxin_pmu_event_map(int hw_event) +{ + return zx_pmon_event_map[hw_event]; +} + +static struct event_constraint * +zhaoxin_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct event_constraint *c; + + if (x86_pmu.event_constraints) { + for_each_event_constraint(c, x86_pmu.event_constraints) { + if ((event->hw.config & c->cmask) == c->code) + return c; + } + } + + return &unconstrained; +} + +PMU_FORMAT_ATTR(event, "config:0-7"); +PMU_FORMAT_ATTR(umask, "config:8-15"); +PMU_FORMAT_ATTR(edge, "config:18"); +PMU_FORMAT_ATTR(inv, "config:23"); +PMU_FORMAT_ATTR(cmask, "config:24-31"); + +static struct attribute *zx_arch_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + NULL, +}; + +static ssize_t zhaoxin_event_sysfs_show(char *page, u64 config) +{ + u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT); + + return x86_event_sysfs_show(page, config, event); +} + +static const struct x86_pmu zhaoxin_pmu __initconst = { + .name = "zhaoxin", + 
.handle_irq = zhaoxin_pmu_handle_irq, + .disable_all = zhaoxin_pmu_disable_all, + .enable_all = zhaoxin_pmu_enable_all, + .enable = zhaoxin_pmu_enable_event, + .disable = zhaoxin_pmu_disable_event, + .hw_config = x86_pmu_hw_config, + .schedule_events = x86_schedule_events, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = zhaoxin_pmu_event_map, + .max_events = ARRAY_SIZE(zx_pmon_event_map), + .apic = 1, + /* + * For zxd/zxe, read/write operation for PMCx MSR is 48 bits. + */ + .max_period = (1ULL << 47) - 1, + .get_event_constraints = zhaoxin_get_event_constraints, + + .format_attrs = zx_arch_formats_attr, + .events_sysfs_show = zhaoxin_event_sysfs_show, +}; + +static const struct { int id; char *name; } zx_arch_events_map[] __initconst = { + { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" }, + { PERF_COUNT_HW_INSTRUCTIONS, "instructions" }, + { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" }, + { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" }, + { PERF_COUNT_HW_CACHE_MISSES, "cache misses" }, + { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" }, + { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" }, +}; + +static __init void zhaoxin_arch_events_quirk(void) +{ + int bit; + + /* disable event that reported as not present by cpuid */ + for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(zx_arch_events_map)) { + zx_pmon_event_map[zx_arch_events_map[bit].id] = 0; + pr_warn("CPUID marked event: \'%s\' unavailable\n", + zx_arch_events_map[bit].name); + } +} + +__init int zhaoxin_pmu_init(void) +{ + union cpuid10_edx edx; + union cpuid10_eax eax; + union cpuid10_ebx ebx; + struct event_constraint *c; + unsigned int unused; + int version; + + pr_info("Welcome to zhaoxin pmu!\n"); + + /* + * Check whether the Architectural PerfMon supports + * hw_event or not. 
+ */ + cpuid(10, &eax.full, &ebx.full, &unused, &edx.full); + + if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT - 1) + return -ENODEV; + + version = eax.split.version_id; + if (version != 2) + return -ENODEV; + + x86_pmu = zhaoxin_pmu; + pr_info("Version check pass!\n"); + + x86_pmu.version = version; + x86_pmu.num_counters = eax.split.num_counters; + x86_pmu.cntval_bits = eax.split.bit_width; + x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1; + x86_pmu.events_maskl = ebx.full; + x86_pmu.events_mask_len = eax.split.mask_length; + + x86_pmu.num_counters_fixed = edx.split.num_counters_fixed; + x86_add_quirk(zhaoxin_arch_events_quirk); + + switch (boot_cpu_data.x86) { + case 0x06: + if (boot_cpu_data.x86_model == 0x0f || boot_cpu_data.x86_model == 0x19) { + + x86_pmu.max_period = x86_pmu.cntval_mask >> 1; + + /* Clearing status works only if the global control is enable on zxc. */ + x86_pmu.enabled_ack = 1; + + x86_pmu.event_constraints = zxc_event_constraints; + zx_pmon_event_map[PERF_COUNT_HW_INSTRUCTIONS] = 0; + zx_pmon_event_map[PERF_COUNT_HW_CACHE_REFERENCES] = 0; + zx_pmon_event_map[PERF_COUNT_HW_CACHE_MISSES] = 0; + zx_pmon_event_map[PERF_COUNT_HW_BUS_CYCLES] = 0; + + pr_cont("ZXC events, "); + break; + } + return -ENODEV; + + case 0x07: + zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = + X86_CONFIG(.event = 0x01, .umask = 0x01, .inv = 0x01, .cmask = 0x01); + + zx_pmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = + X86_CONFIG(.event = 0x0f, .umask = 0x04, .inv = 0, .cmask = 0); + + switch (boot_cpu_data.x86_model) { + case 0x1b: + memcpy(hw_cache_event_ids, zxd_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + x86_pmu.event_constraints = zxd_event_constraints; + + zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0700; + zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x0709; + + pr_cont("ZXD events, "); + break; + case 0x3b: + memcpy(hw_cache_event_ids, zxe_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + 
x86_pmu.event_constraints = zxd_event_constraints; + + zx_pmon_event_map[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0028; + zx_pmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x0029; + + pr_cont("ZXE events, "); + break; + default: + return -ENODEV; + } + break; + + default: + return -ENODEV; + } + + x86_pmu.intel_ctrl = (1 << (x86_pmu.num_counters)) - 1; + x86_pmu.intel_ctrl |= ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED; + + if (x86_pmu.event_constraints) { + for_each_event_constraint(c, x86_pmu.event_constraints) { + c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; + c->weight += x86_pmu.num_counters; + } + } + + return 0; +} + |