From b40d0156f560932d14e3957579b6508f8d065260 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 17:59:49 +0530 Subject: perf/x86/amd/brs: Move feature-specific functions Move some of the Branch Sampling (BRS) specific functions out of the Core events sources and into the BRS sources in preparation for adding other mechanisms to record branches. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/b60283b57179475d18ee242d117c335c16733693.1660211399.git.sandipan.das@amd.com --- arch/x86/events/amd/brs.c | 69 ++++++++++++++++++++++++++++++++++++++++++- arch/x86/events/amd/core.c | 70 ++------------------------------------------ arch/x86/events/perf_event.h | 7 +++-- 3 files changed, 76 insertions(+), 70 deletions(-) diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c index bee8765a1e9b..f1bff153d945 100644 --- a/arch/x86/events/amd/brs.c +++ b/arch/x86/events/amd/brs.c @@ -81,7 +81,7 @@ static bool __init amd_brs_detect(void) * a br_sel_map. Software filtering is not supported because it would not correlate well * with a sampling period. */ -int amd_brs_setup_filter(struct perf_event *event) +static int amd_brs_setup_filter(struct perf_event *event) { u64 type = event->attr.branch_sample_type; @@ -96,6 +96,73 @@ int amd_brs_setup_filter(struct perf_event *event) return 0; } +static inline int amd_is_brs_event(struct perf_event *e) +{ + return (e->hw.config & AMD64_RAW_EVENT_MASK) == AMD_FAM19H_BRS_EVENT; +} + +int amd_brs_hw_config(struct perf_event *event) +{ + int ret = 0; + + /* + * Due to interrupt holding, BRS is not recommended in + * counting mode. + */ + if (!is_sampling_event(event)) + return -EINVAL; + + /* + * Due to the way BRS operates by holding the interrupt until + * lbr_nr entries have been captured, it does not make sense + * to allow sampling on BRS with an event that does not match + * what BRS is capturing, i.e., retired taken branches. + * Otherwise the correlation with the event's period is even + * more loose: + * + * With retired taken branch: + * Effective P = P + 16 + X + * With any other event: + * Effective P = P + Y + X + * + * Where X is the number of taken branches due to interrupt + * skid. Skid is large. + * + * Where Y is the occurences of the event while BRS is + * capturing the lbr_nr entries. + * + * By using retired taken branches, we limit the impact on the + * Y variable. We know it cannot be more than the depth of + * BRS. + */ + if (!amd_is_brs_event(event)) + return -EINVAL; + + /* + * BRS implementation does not work with frequency mode + * reprogramming of the period. + */ + if (event->attr.freq) + return -EINVAL; + /* + * The kernel subtracts BRS depth from period, so it must + * be big enough. 
+ */ + if (event->attr.sample_period <= x86_pmu.lbr_nr) + return -EINVAL; + + /* + * Check if we can allow PERF_SAMPLE_BRANCH_STACK + */ + ret = amd_brs_setup_filter(event); + + /* only set in case of success */ + if (!ret) + event->hw.flags |= PERF_X86_EVENT_AMD_BRS; + + return ret; +} + /* tos = top of stack, i.e., last valid entry written */ static inline int amd_brs_get_tos(union amd_debug_extn_cfg *cfg) { diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 9ac3718410ce..e32a27899e11 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -330,16 +330,8 @@ static inline bool amd_is_pair_event_code(struct hw_perf_event *hwc) } } -#define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */ -static inline int amd_is_brs_event(struct perf_event *e) -{ - return (e->hw.config & AMD64_RAW_EVENT_MASK) == AMD_FAM19H_BRS_EVENT; -} - static int amd_core_hw_config(struct perf_event *event) { - int ret = 0; - if (event->attr.exclude_host && event->attr.exclude_guest) /* * When HO == GO == 1 the hardware treats that as GO == HO == 0 @@ -356,66 +348,10 @@ static int amd_core_hw_config(struct perf_event *event) if ((x86_pmu.flags & PMU_FL_PAIR) && amd_is_pair_event_code(&event->hw)) event->hw.flags |= PERF_X86_EVENT_PAIR; - /* - * if branch stack is requested - */ - if (has_branch_stack(event)) { - /* - * Due to interrupt holding, BRS is not recommended in - * counting mode. - */ - if (!is_sampling_event(event)) - return -EINVAL; - - /* - * Due to the way BRS operates by holding the interrupt until - * lbr_nr entries have been captured, it does not make sense - * to allow sampling on BRS with an event that does not match - * what BRS is capturing, i.e., retired taken branches. - * Otherwise the correlation with the event's period is even - * more loose: - * - * With retired taken branch: - * Effective P = P + 16 + X - * With any other event: - * Effective P = P + Y + X - * - * Where X is the number of taken branches due to interrupt - * skid. Skid is large. - * - * Where Y is the occurences of the event while BRS is - * capturing the lbr_nr entries. - * - * By using retired taken branches, we limit the impact on the - * Y variable. We know it cannot be more than the depth of - * BRS. - */ - if (!amd_is_brs_event(event)) - return -EINVAL; + if (has_branch_stack(event)) + return amd_brs_hw_config(event); - /* - * BRS implementation does not work with frequency mode - * reprogramming of the period. - */ - if (event->attr.freq) - return -EINVAL; - /* - * The kernel subtracts BRS depth from period, so it must - * be big enough. 
- */ - if (event->attr.sample_period <= x86_pmu.lbr_nr) - return -EINVAL; - - /* - * Check if we can allow PERF_SAMPLE_BRANCH_STACK - */ - ret = amd_brs_setup_filter(event); - - /* only set in case of success */ - if (!ret) - event->hw.flags |= PERF_X86_EVENT_AMD_BRS; - } - return ret; + return 0; } static inline int amd_is_nb_event(struct hw_perf_event *hwc) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index ba3d24a6a4ec..5deb34e42bbd 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1233,6 +1233,9 @@ static inline bool fixed_counter_disabled(int i, struct pmu *pmu) int amd_pmu_init(void); #ifdef CONFIG_PERF_EVENTS_AMD_BRS + +#define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */ + int amd_brs_init(void); void amd_brs_disable(void); void amd_brs_enable(void); @@ -1241,7 +1244,7 @@ void amd_brs_disable_all(void); void amd_brs_drain(void); void amd_brs_lopwr_init(void); void amd_brs_disable_all(void); -int amd_brs_setup_filter(struct perf_event *event); +int amd_brs_hw_config(struct perf_event *event); void amd_brs_reset(void); static inline void amd_pmu_brs_add(struct perf_event *event) @@ -1277,7 +1280,7 @@ static inline void amd_brs_enable(void) {} static inline void amd_brs_drain(void) {} static inline void amd_brs_lopwr_init(void) {} static inline void amd_brs_disable_all(void) {} -static inline int amd_brs_setup_filter(struct perf_event *event) +static inline int amd_brs_hw_config(struct perf_event *event) { return 0; } -- cgit v1.2.3-59-g8ed1b From 9603aa79e851c652f6da873205c92213af36b24f Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 17:59:50 +0530 Subject: perf/x86/amd/core: Refactor branch attributes AMD processors that are capable of recording branches support either Branch Sampling (BRS) or Last Branch Record (LBR). In preparation for adding Last Branch Record Extension Version 2 (LbrExtV2) support, reuse the "branches" capability to advertise information about both BRS and LBR but make the "branch-brs" event exclusive to Family 19h processors that support BRS. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/ba4a4cde6db79b1c65c49834027bbdb8a915546b.1660211399.git.sandipan.das@amd.com --- arch/x86/events/amd/core.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index e32a27899e11..2f524cf84528 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1247,23 +1247,25 @@ static ssize_t branches_show(struct device *cdev, static DEVICE_ATTR_RO(branches); -static struct attribute *amd_pmu_brs_attrs[] = { +static struct attribute *amd_pmu_branches_attrs[] = { &dev_attr_branches.attr, NULL, }; static umode_t -amd_brs_is_visible(struct kobject *kobj, struct attribute *attr, int i) +amd_branches_is_visible(struct kobject *kobj, struct attribute *attr, int i) { return x86_pmu.lbr_nr ? 
attr->mode : 0; } -static struct attribute_group group_caps_amd_brs = { +static struct attribute_group group_caps_amd_branches = { .name = "caps", - .attrs = amd_pmu_brs_attrs, - .is_visible = amd_brs_is_visible, + .attrs = amd_pmu_branches_attrs, + .is_visible = amd_branches_is_visible, }; +#ifdef CONFIG_PERF_EVENTS_AMD_BRS + EVENT_ATTR_STR(branch-brs, amd_branch_brs, "event=" __stringify(AMD_FAM19H_BRS_EVENT)"\n"); @@ -1272,15 +1274,26 @@ static struct attribute *amd_brs_events_attrs[] = { NULL, }; +static umode_t +amd_brs_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return static_cpu_has(X86_FEATURE_BRS) && x86_pmu.lbr_nr ? + attr->mode : 0; +} + static struct attribute_group group_events_amd_brs = { .name = "events", .attrs = amd_brs_events_attrs, .is_visible = amd_brs_is_visible, }; +#endif /* CONFIG_PERF_EVENTS_AMD_BRS */ + static const struct attribute_group *amd_attr_update[] = { - &group_caps_amd_brs, + &group_caps_amd_branches, +#ifdef CONFIG_PERF_EVENTS_AMD_BRS &group_events_amd_brs, +#endif NULL, }; -- cgit v1.2.3-59-g8ed1b From 706460a96fc654e80b6bed1f562b00d2ce9f2f4d Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 17:59:51 +0530 Subject: perf/x86/amd/core: Add generic branch record interfaces AMD processors that are capable of recording branches support either Branch Sampling (BRS) or Last Branch Record (LBR). In preparation for adding Last Branch Record Extension Version 2 (LbrExtV2) support, introduce new static calls which act as gateways to call into the feature-dependent functions based on what is available on the processor. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/b75dbc32663cb395f0d701167e952c6a6b0445a3.1660211399.git.sandipan.das@amd.com --- arch/x86/events/amd/core.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 2f524cf84528..ef3520731a20 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -330,6 +330,8 @@ static inline bool amd_is_pair_event_code(struct hw_perf_event *hwc) } } +DEFINE_STATIC_CALL_RET0(amd_pmu_branch_hw_config, *x86_pmu.hw_config); + static int amd_core_hw_config(struct perf_event *event) { if (event->attr.exclude_host && event->attr.exclude_guest) @@ -349,7 +351,7 @@ static int amd_core_hw_config(struct perf_event *event) event->hw.flags |= PERF_X86_EVENT_PAIR; if (has_branch_stack(event)) - return amd_brs_hw_config(event); + return static_call(amd_pmu_branch_hw_config)(event); return 0; } @@ -518,8 +520,14 @@ static struct amd_nb *amd_alloc_nb(int cpu) return nb; } +typedef void (amd_pmu_branch_reset_t)(void); +DEFINE_STATIC_CALL_NULL(amd_pmu_branch_reset, amd_pmu_branch_reset_t); + static void amd_pmu_cpu_reset(int cpu) { + if (x86_pmu.lbr_nr) + static_call(amd_pmu_branch_reset)(); + if (x86_pmu.version < 2) return; @@ -576,7 +584,6 @@ static void amd_pmu_cpu_starting(int cpu) cpuc->amd_nb->nb_id = nb_id; cpuc->amd_nb->refcnt++; - amd_brs_reset(); amd_pmu_cpu_reset(cpu); } @@ -771,16 +778,20 @@ static void amd_pmu_v2_disable_all(void) amd_pmu_check_overflow(); } +DEFINE_STATIC_CALL_NULL(amd_pmu_branch_add, *x86_pmu.add); + static void amd_pmu_add_event(struct perf_event *event) { if (needs_branch_stack(event)) - amd_pmu_brs_add(event); + static_call(amd_pmu_branch_add)(event); } +DEFINE_STATIC_CALL_NULL(amd_pmu_branch_del, *x86_pmu.del); + static void amd_pmu_del_event(struct perf_event *event) { if 
(needs_branch_stack(event)) - amd_pmu_brs_del(event); + static_call(amd_pmu_branch_del)(event); } /* @@ -1184,13 +1195,6 @@ static ssize_t amd_event_sysfs_show(char *page, u64 config) return x86_event_sysfs_show(page, config, event); } -static void amd_pmu_sched_task(struct perf_event_context *ctx, - bool sched_in) -{ - if (sched_in && x86_pmu.lbr_nr) - amd_pmu_brs_sched_task(ctx, sched_in); -} - static u64 amd_pmu_limit_period(struct perf_event *event, u64 left) { /* @@ -1375,8 +1379,14 @@ static int __init amd_core_pmu_init(void) */ if (boot_cpu_data.x86 >= 0x19 && !amd_brs_init()) { x86_pmu.get_event_constraints = amd_get_event_constraints_f19h; - x86_pmu.sched_task = amd_pmu_sched_task; + x86_pmu.sched_task = amd_pmu_brs_sched_task; x86_pmu.limit_period = amd_pmu_limit_period; + + static_call_update(amd_pmu_branch_hw_config, amd_brs_hw_config); + static_call_update(amd_pmu_branch_reset, amd_brs_reset); + static_call_update(amd_pmu_branch_add, amd_pmu_brs_add); + static_call_update(amd_pmu_branch_del, amd_pmu_brs_del); + /* * put_event_constraints callback same as Fam17h, set above */ -- cgit v1.2.3-59-g8ed1b From 257449c6a50298bd21dcabd644f66a0296b78532 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 17:59:52 +0530 Subject: x86/cpufeatures: Add LbrExtV2 feature bit CPUID leaf 0x80000022 i.e. ExtPerfMonAndDbg advertises some new performance monitoring features for AMD processors. Bit 1 of EAX indicates support for Last Branch Record Extension Version 2 (LbrExtV2) features. If found to be set during PMU initialization, the EBX bits of the same leaf can be used to determine the number of available LBR entries. For better utilization of feature words, LbrExtV2 is added as a scattered feature bit. [peterz: Rename to AMD_LBR_V2] Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Acked-by: Borislav Petkov Link: https://lore.kernel.org/r/172d2b0df39306ed77221c45ee1aa62e8ae0548d.1660211399.git.sandipan.das@amd.com --- arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/kernel/cpu/scattered.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 235dc85c91c3..52bdd9465f19 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -96,7 +96,7 @@ #define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */ #define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ #define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ -/* FREE! 
( 3*32+17) */ +#define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* AMD Last Branch Record Extension Version 2 */ #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */ #define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index fd44b54c90d5..fc01f81f6e2a 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -45,6 +45,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, + { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 }, { 0, 0, 0, 0, 0 } }; -- cgit v1.2.3-59-g8ed1b From 703fb765f48897214e3eb110f35dddec80682f60 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 17:59:53 +0530 Subject: perf/x86/amd/lbr: Detect LbrExtV2 support AMD Last Branch Record Extension Version 2 (LbrExtV2) is driven by Core PMC overflows. It records recently taken branches up to the moment when the PMC overflow occurs. Detect the feature during PMU initialization and set the branch stack depth using CPUID leaf 0x80000022 EBX. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/fc6e45378ada258f1bab79b0de6e05c393a8f1dd.1660211399.git.sandipan.das@amd.com --- arch/x86/events/amd/Makefile | 2 +- arch/x86/events/amd/core.c | 9 +++++---- arch/x86/events/amd/lbr.c | 21 +++++++++++++++++++++ arch/x86/events/perf_event.h | 2 ++ arch/x86/include/asm/perf_event.h | 3 ++- 5 files changed, 31 insertions(+), 6 deletions(-) create mode 100644 arch/x86/events/amd/lbr.c diff --git a/arch/x86/events/amd/Makefile b/arch/x86/events/amd/Makefile index b9f5d4610256..527d947eb76b 100644 --- a/arch/x86/events/amd/Makefile +++ b/arch/x86/events/amd/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -obj-$(CONFIG_CPU_SUP_AMD) += core.o +obj-$(CONFIG_CPU_SUP_AMD) += core.o lbr.o obj-$(CONFIG_PERF_EVENTS_AMD_BRS) += brs.o obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += power.o obj-$(CONFIG_X86_LOCAL_APIC) += ibs.o diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index ef3520731a20..a3aa67b4fd50 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1374,10 +1374,11 @@ static int __init amd_core_pmu_init(void) x86_pmu.flags |= PMU_FL_PAIR; } - /* - * BRS requires special event constraints and flushing on ctxsw. - */ - if (boot_cpu_data.x86 >= 0x19 && !amd_brs_init()) { + /* LBR and BRS are mutually exclusive features */ + if (amd_pmu_lbr_init() && !amd_brs_init()) { + /* + * BRS requires special event constraints and flushing on ctxsw. 
+ */ x86_pmu.get_event_constraints = amd_get_event_constraints_f19h; x86_pmu.sched_task = amd_pmu_brs_sched_task; x86_pmu.limit_period = amd_pmu_limit_period; diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c new file mode 100644 index 000000000000..4e5b5d35f35a --- /dev/null +++ b/arch/x86/events/amd/lbr.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +#include "../perf_event.h" + +__init int amd_pmu_lbr_init(void) +{ + union cpuid_0x80000022_ebx ebx; + + if (x86_pmu.version < 2 || !boot_cpu_has(X86_FEATURE_AMD_LBR_V2)) + return -EOPNOTSUPP; + + /* Set number of entries */ + ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES); + x86_pmu.lbr_nr = ebx.split.lbr_v2_stack_sz; + + pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr); + + return 0; +} diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 5deb34e42bbd..82e8a6d87ade 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1232,6 +1232,8 @@ static inline bool fixed_counter_disabled(int i, struct pmu *pmu) int amd_pmu_init(void); +int amd_pmu_lbr_init(void); + #ifdef CONFIG_PERF_EVENTS_AMD_BRS #define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index f6fc8dd51ef4..9ac46dbe57d4 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -207,7 +207,8 @@ union cpuid_0x80000022_ebx { struct { /* Number of Core Performance Counters */ unsigned int num_core_pmc:4; - unsigned int reserved:6; + /* Number of available LBR Stack Entries */ + unsigned int lbr_v2_stack_sz:6; /* Number of Data Fabric Counters */ unsigned int num_df_pmc:6; } split; -- cgit v1.2.3-59-g8ed1b From ca5b7c0d9621702e107c83216316a6d722878b64 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 17:59:54 +0530 Subject: perf/x86/amd/lbr: Add LbrExtV2 branch record support If AMD Last Branch Record Extension Version 2 (LbrExtV2) is detected, enable it alongside LBR Freeze on PMI when an event requests branch stack i.e. PERF_SAMPLE_BRANCH_STACK. Each branch record is represented by a pair of registers, LBR From and LBR To. The freeze feature prevents any updates to these registers once a PMC overflows. The contents remain unchanged until the freeze bit is cleared by the PMI handler. The branch records are read and copied to sample data before unfreezing. However, only valid entries are copied. There is no additional register to denote which of the register pairs represent the top of the stack (TOS) since internal register renaming always ensures that the first pair (i.e. index 0) is the one representing the most recent branch and so on. The LBR registers are per-thread resources and are cleared explicitly whenever a new task is scheduled in. There are no special implications on the contents of these registers when transitioning to deep C-states. 
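For reference, the branch stack sampling this enables is driven entirely through the generic perf ABI. A minimal user-space sketch follows (an illustration for this series, not part of the patch; it assumes a kernel with this support and samples user-space branches on CPU cycles):

	#include <linux/perf_event.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <string.h>
	#include <stdio.h>

	int main(void)
	{
		struct perf_event_attr attr;
		int fd;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_HARDWARE;
		attr.config = PERF_COUNT_HW_CPU_CYCLES;
		attr.sample_period = 100000;
		attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK;
		attr.branch_sample_type = PERF_SAMPLE_BRANCH_ANY |
					  PERF_SAMPLE_BRANCH_USER;
		attr.exclude_kernel = 1;

		/* Monitor the current task on any CPU */
		fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
		if (fd < 0) {
			perror("perf_event_open");
			return 1;
		}

		/* Samples in the mmap ring buffer now carry branch records */
		close(fd);
		return 0;
	}

The perf tool reaches the same path with "perf record -b" (any branch) or "perf record -j <filter>".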
Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/d3b8500a3627a0d4d0259b005891ee248f248d91.1660211399.git.sandipan.das@amd.com --- arch/x86/events/amd/core.c | 47 ++++++--- arch/x86/events/amd/lbr.c | 203 +++++++++++++++++++++++++++++++++++++++ arch/x86/events/perf_event.h | 8 ++ arch/x86/include/asm/msr-index.h | 5 + 4 files changed, 252 insertions(+), 11 deletions(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index a3aa67b4fd50..d799628016c8 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -620,7 +620,7 @@ static inline u64 amd_pmu_get_global_status(void) /* PerfCntrGlobalStatus is read-only */ rdmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, status); - return status & amd_pmu_global_cntr_mask; + return status; } static inline void amd_pmu_ack_global_status(u64 status) @@ -631,8 +631,6 @@ static inline void amd_pmu_ack_global_status(u64 status) * clears the same bit in PerfCntrGlobalStatus */ - /* Only allow modifications to PerfCntrGlobalStatus.PerfCntrOvfl */ - status &= amd_pmu_global_cntr_mask; wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, status); } @@ -742,11 +740,17 @@ static void amd_pmu_v2_enable_event(struct perf_event *event) __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); } -static void amd_pmu_v2_enable_all(int added) +static __always_inline void amd_pmu_core_enable_all(void) { amd_pmu_set_global_ctl(amd_pmu_global_cntr_mask); } +static void amd_pmu_v2_enable_all(int added) +{ + amd_pmu_lbr_enable_all(); + amd_pmu_core_enable_all(); +} + static void amd_pmu_disable_event(struct perf_event *event) { x86_pmu_disable_event(event); @@ -771,10 +775,15 @@ static void amd_pmu_disable_all(void) amd_pmu_check_overflow(); } -static void amd_pmu_v2_disable_all(void) +static __always_inline void amd_pmu_core_disable_all(void) { - /* Disable all PMCs */ amd_pmu_set_global_ctl(0); +} + +static void amd_pmu_v2_disable_all(void) +{ + amd_pmu_core_disable_all(); + amd_pmu_lbr_disable_all(); amd_pmu_check_overflow(); } @@ -877,8 +886,8 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) pmu_enabled = cpuc->enabled; cpuc->enabled = 0; - /* Stop counting */ - amd_pmu_v2_disable_all(); + /* Stop counting but do not disable LBR */ + amd_pmu_core_disable_all(); status = amd_pmu_get_global_status(); @@ -886,6 +895,12 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) if (!status) goto done; + /* Read branch records before unfreezing */ + if (status & GLOBAL_STATUS_LBRS_FROZEN) { + amd_pmu_lbr_read(); + status &= ~GLOBAL_STATUS_LBRS_FROZEN; + } + for (idx = 0; idx < x86_pmu.num_counters; idx++) { if (!test_bit(idx, cpuc->active_mask)) continue; @@ -905,6 +920,9 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) if (!x86_perf_event_set_period(event)) continue; + if (has_branch_stack(event)) + data.br_stack = &cpuc->lbr_stack; + if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); @@ -918,7 +936,7 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) */ WARN_ON(status > 0); - /* Clear overflow bits */ + /* Clear overflow and freeze bits */ amd_pmu_ack_global_status(~status); /* @@ -932,7 +950,7 @@ done: /* Resume counting only if PMU is active */ if (pmu_enabled) - amd_pmu_v2_enable_all(0); + amd_pmu_core_enable_all(); return amd_pmu_adjust_nmi_window(handled); } @@ -1375,7 +1393,14 @@ static int __init amd_core_pmu_init(void) } /* LBR and BRS are mutually exclusive features */ - if (amd_pmu_lbr_init() && !amd_brs_init()) { + if (!amd_pmu_lbr_init()) { + /* 
LBR requires flushing on context switch */ + x86_pmu.sched_task = amd_pmu_lbr_sched_task; + static_call_update(amd_pmu_branch_hw_config, amd_pmu_lbr_hw_config); + static_call_update(amd_pmu_branch_reset, amd_pmu_lbr_reset); + static_call_update(amd_pmu_branch_add, amd_pmu_lbr_add); + static_call_update(amd_pmu_branch_del, amd_pmu_lbr_del); + } else if (!amd_brs_init()) { /* * BRS requires special event constraints and flushing on ctxsw. */ diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c index 4e5b5d35f35a..1dea66f332ae 100644 --- a/arch/x86/events/amd/lbr.c +++ b/arch/x86/events/amd/lbr.c @@ -4,6 +4,209 @@ #include "../perf_event.h" +struct branch_entry { + union { + struct { + u64 ip:58; + u64 ip_sign_ext:5; + u64 mispredict:1; + } split; + u64 full; + } from; + + union { + struct { + u64 ip:58; + u64 ip_sign_ext:3; + u64 reserved:1; + u64 spec:1; + u64 valid:1; + } split; + u64 full; + } to; +}; + +static __always_inline void amd_pmu_lbr_set_from(unsigned int idx, u64 val) +{ + wrmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2, val); +} + +static __always_inline void amd_pmu_lbr_set_to(unsigned int idx, u64 val) +{ + wrmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2 + 1, val); +} + +static __always_inline u64 amd_pmu_lbr_get_from(unsigned int idx) +{ + u64 val; + + rdmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2, val); + + return val; +} + +static __always_inline u64 amd_pmu_lbr_get_to(unsigned int idx) +{ + u64 val; + + rdmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2 + 1, val); + + return val; +} + +static __always_inline u64 sign_ext_branch_ip(u64 ip) +{ + u32 shift = 64 - boot_cpu_data.x86_virt_bits; + + return (u64)(((s64)ip << shift) >> shift); +} + +void amd_pmu_lbr_read(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct perf_branch_entry *br = cpuc->lbr_entries; + struct branch_entry entry; + int out = 0, i; + + if (!cpuc->lbr_users) + return; + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + entry.from.full = amd_pmu_lbr_get_from(i); + entry.to.full = amd_pmu_lbr_get_to(i); + + /* Check if a branch has been logged */ + if (!entry.to.split.valid) + continue; + + perf_clear_branch_entry_bitfields(br + out); + + br[out].from = sign_ext_branch_ip(entry.from.split.ip); + br[out].to = sign_ext_branch_ip(entry.to.split.ip); + br[out].mispred = entry.from.split.mispredict; + br[out].predicted = !br[out].mispred; + out++; + } + + cpuc->lbr_stack.nr = out; + + /* + * Internal register renaming always ensures that LBR From[0] and + * LBR To[0] always represent the TOS + */ + cpuc->lbr_stack.hw_idx = 0; +} + +static int amd_pmu_lbr_setup_filter(struct perf_event *event) +{ + /* No LBR support */ + if (!x86_pmu.lbr_nr) + return -EOPNOTSUPP; + + return 0; +} + +int amd_pmu_lbr_hw_config(struct perf_event *event) +{ + int ret = 0; + + /* LBR is not recommended in counting mode */ + if (!is_sampling_event(event)) + return -EINVAL; + + ret = amd_pmu_lbr_setup_filter(event); + if (!ret) + event->attach_state |= PERF_ATTACH_SCHED_CB; + + return ret; +} + +void amd_pmu_lbr_reset(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int i; + + if (!x86_pmu.lbr_nr) + return; + + /* Reset all branch records individually */ + for (i = 0; i < x86_pmu.lbr_nr; i++) { + amd_pmu_lbr_set_from(i, 0); + amd_pmu_lbr_set_to(i, 0); + } + + cpuc->last_task_ctx = NULL; + cpuc->last_log_id = 0; +} + +void amd_pmu_lbr_add(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (!x86_pmu.lbr_nr) + return; + + perf_sched_cb_inc(event->ctx->pmu); + + if 
(!cpuc->lbr_users++ && !event->total_time_running) + amd_pmu_lbr_reset(); +} + +void amd_pmu_lbr_del(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if (!x86_pmu.lbr_nr) + return; + + cpuc->lbr_users--; + WARN_ON_ONCE(cpuc->lbr_users < 0); + perf_sched_cb_dec(event->ctx->pmu); +} + +void amd_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + /* + * A context switch can flip the address space and LBR entries are + * not tagged with an identifier. Hence, branches cannot be resolved + * from the old address space and the LBR records should be wiped. + */ + if (cpuc->lbr_users && sched_in) + amd_pmu_lbr_reset(); +} + +void amd_pmu_lbr_enable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 dbg_ctl, dbg_extn_cfg; + + if (!cpuc->lbr_users || !x86_pmu.lbr_nr) + return; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl); + rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg); + + wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); + wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg | DBG_EXTN_CFG_LBRV2EN); +} + +void amd_pmu_lbr_disable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 dbg_ctl, dbg_extn_cfg; + + if (!cpuc->lbr_users || !x86_pmu.lbr_nr) + return; + + rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg); + rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl); + + wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg & ~DBG_EXTN_CFG_LBRV2EN); + wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl & ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); +} + __init int amd_pmu_lbr_init(void) { union cpuid_0x80000022_ebx ebx; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 82e8a6d87ade..e8930414cba5 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1233,6 +1233,14 @@ static inline bool fixed_counter_disabled(int i, struct pmu *pmu) int amd_pmu_init(void); int amd_pmu_lbr_init(void); +void amd_pmu_lbr_reset(void); +void amd_pmu_lbr_read(void); +void amd_pmu_lbr_add(struct perf_event *event); +void amd_pmu_lbr_del(struct perf_event *event); +void amd_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); +void amd_pmu_lbr_enable_all(void); +void amd_pmu_lbr_disable_all(void); +int amd_pmu_lbr_hw_config(struct perf_event *event); #ifdef CONFIG_PERF_EVENTS_AMD_BRS diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 6674bdb096f3..109c404e6137 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -585,6 +585,9 @@ #define MSR_AMD64_PERF_CNTR_GLOBAL_CTL 0xc0000301 #define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR 0xc0000302 +/* AMD Last Branch Record MSRs */ +#define MSR_AMD64_LBR_SELECT 0xc000010e + /* Fam 17h MSRs */ #define MSR_F17H_IRPERF 0xc00000e9 @@ -756,6 +759,8 @@ #define MSR_AMD_DBG_EXTN_CFG 0xc000010f #define MSR_AMD_SAMP_BR_FROM 0xc0010300 +#define DBG_EXTN_CFG_LBRV2EN BIT_ULL(6) + #define MSR_IA32_MPERF 0x000000e7 #define MSR_IA32_APERF 0x000000e8 -- cgit v1.2.3-59-g8ed1b From f4f925dae7419fc7a10af539c073871927ce3a24 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 17:59:55 +0530 Subject: perf/x86/amd/lbr: Add LbrExtV2 hardware branch filter support If AMD Last Branch Record Extension Version 2 (LbrExtV2) is detected, convert the requested branch filter (PERF_SAMPLE_BRANCH_* flags) to the corresponding hardware filter value and stash it in the event data when a branch stack is requested. 
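As a standalone illustration of the suppress-mode conversion described above (the real mapping table and constants are in the lbr.c hunk further down; this compresses them to just the two near-call bits):

	#include <stdio.h>

	/* Stand-ins mirroring the LBR Branch Select bits added below */
	#define LBR_SELECT_MASK		0x1ff
	#define LBR_REL_CALL		(1 << 3)	/* near relative calls */
	#define LBR_IND_CALL		(1 << 4)	/* near indirect calls */

	int main(void)
	{
		/* PERF_SAMPLE_BRANCH_ANY_CALL asks to keep both kinds of near calls */
		unsigned int allow = LBR_REL_CALL | LBR_IND_CALL;

		/*
		 * The hardware bits work in suppress mode, so every type that is
		 * *not* requested must be set: invert within the valid bit mask.
		 */
		unsigned int lbr_select = allow ^ LBR_SELECT_MASK;

		printf("LBR_SELECT = %#05x\n", lbr_select);	/* prints 0x1e7 */
		return 0;
	}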
The hardware filter value is also saved in per-CPU areas for use during event scheduling. Hardware filtering is provided by the LBR Branch Select register. It has bits which when set, suppress recording of the following types of branches: * CPL = 0 (Kernel only) * CPL > 0 (Userspace only) * Conditional Branches * Near Relative Calls * Near Indirect Calls * Near Returns * Near Indirect Jumps (excluding Near Indirect Calls and Near Returns) * Near Relative Jumps (excluding Near Relative Calls) * Far Branches Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/9336af5c9785b8e14c62220fc0e6cfb10ab97de3.1660211399.git.sandipan.das@amd.com --- arch/x86/events/amd/core.c | 21 ++++++++--- arch/x86/events/amd/lbr.c | 94 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 108 insertions(+), 7 deletions(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index d799628016c8..36bede1d7b1e 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -542,16 +542,24 @@ static int amd_pmu_cpu_prepare(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + cpuc->lbr_sel = kzalloc_node(sizeof(struct er_account), GFP_KERNEL, + cpu_to_node(cpu)); + if (!cpuc->lbr_sel) + return -ENOMEM; + WARN_ON_ONCE(cpuc->amd_nb); if (!x86_pmu.amd_nb_constraints) return 0; cpuc->amd_nb = amd_alloc_nb(cpu); - if (!cpuc->amd_nb) - return -ENOMEM; + if (cpuc->amd_nb) + return 0; - return 0; + kfree(cpuc->lbr_sel); + cpuc->lbr_sel = NULL; + + return -ENOMEM; } static void amd_pmu_cpu_starting(int cpu) @@ -589,13 +597,14 @@ static void amd_pmu_cpu_starting(int cpu) static void amd_pmu_cpu_dead(int cpu) { - struct cpu_hw_events *cpuhw; + struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu); + + kfree(cpuhw->lbr_sel); + cpuhw->lbr_sel = NULL; if (!x86_pmu.amd_nb_constraints) return; - cpuhw = &per_cpu(cpu_hw_events, cpu); - if (cpuhw->amd_nb) { struct amd_nb *nb = cpuhw->amd_nb; diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c index 1dea66f332ae..bb79b43b7cd8 100644 --- a/arch/x86/events/amd/lbr.c +++ b/arch/x86/events/amd/lbr.c @@ -4,6 +4,39 @@ #include "../perf_event.h" +/* LBR Branch Select valid bits */ +#define LBR_SELECT_MASK 0x1ff + +/* + * LBR Branch Select filter bits which when set, ensures that the + * corresponding type of branches are not recorded + */ +#define LBR_SELECT_KERNEL 0 /* Branches ending in CPL = 0 */ +#define LBR_SELECT_USER 1 /* Branches ending in CPL > 0 */ +#define LBR_SELECT_JCC 2 /* Conditional branches */ +#define LBR_SELECT_CALL_NEAR_REL 3 /* Near relative calls */ +#define LBR_SELECT_CALL_NEAR_IND 4 /* Indirect relative calls */ +#define LBR_SELECT_RET_NEAR 5 /* Near returns */ +#define LBR_SELECT_JMP_NEAR_IND 6 /* Near indirect jumps (excl. calls and returns) */ +#define LBR_SELECT_JMP_NEAR_REL 7 /* Near relative jumps (excl. 
calls) */ +#define LBR_SELECT_FAR_BRANCH 8 /* Far branches */ + +#define LBR_KERNEL BIT(LBR_SELECT_KERNEL) +#define LBR_USER BIT(LBR_SELECT_USER) +#define LBR_JCC BIT(LBR_SELECT_JCC) +#define LBR_REL_CALL BIT(LBR_SELECT_CALL_NEAR_REL) +#define LBR_IND_CALL BIT(LBR_SELECT_CALL_NEAR_IND) +#define LBR_RETURN BIT(LBR_SELECT_RET_NEAR) +#define LBR_REL_JMP BIT(LBR_SELECT_JMP_NEAR_REL) +#define LBR_IND_JMP BIT(LBR_SELECT_JMP_NEAR_IND) +#define LBR_FAR BIT(LBR_SELECT_FAR_BRANCH) +#define LBR_NOT_SUPP -1 /* unsupported filter */ +#define LBR_IGNORE 0 + +#define LBR_ANY \ + (LBR_JCC | LBR_REL_CALL | LBR_IND_CALL | LBR_RETURN | \ + LBR_REL_JMP | LBR_IND_JMP | LBR_FAR) + struct branch_entry { union { struct { @@ -97,12 +130,56 @@ void amd_pmu_lbr_read(void) cpuc->lbr_stack.hw_idx = 0; } +static const int lbr_select_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { + [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGNORE, + + [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL, + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN, + [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, + [PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT] = LBR_NOT_SUPP, + [PERF_SAMPLE_BRANCH_IN_TX_SHIFT] = LBR_NOT_SUPP, + [PERF_SAMPLE_BRANCH_NO_TX_SHIFT] = LBR_NOT_SUPP, + [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, + + [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_NOT_SUPP, + [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, + [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL, + + [PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT] = LBR_NOT_SUPP, + [PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT] = LBR_NOT_SUPP, + + [PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT] = LBR_NOT_SUPP, +}; + static int amd_pmu_lbr_setup_filter(struct perf_event *event) { + struct hw_perf_event_extra *reg = &event->hw.branch_reg; + u64 br_type = event->attr.branch_sample_type; + u64 mask = 0, v; + int i; + /* No LBR support */ if (!x86_pmu.lbr_nr) return -EOPNOTSUPP; + for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) { + if (!(br_type & BIT_ULL(i))) + continue; + + v = lbr_select_map[i]; + if (v == LBR_NOT_SUPP) + return -EOPNOTSUPP; + + if (v != LBR_IGNORE) + mask |= v; + } + + /* Filter bits operate in suppress mode */ + reg->config = mask ^ LBR_SELECT_MASK; + return 0; } @@ -137,6 +214,7 @@ void amd_pmu_lbr_reset(void) cpuc->last_task_ctx = NULL; cpuc->last_log_id = 0; + wrmsrl(MSR_AMD64_LBR_SELECT, 0); } void amd_pmu_lbr_add(struct perf_event *event) @@ -146,6 +224,11 @@ void amd_pmu_lbr_add(struct perf_event *event) if (!x86_pmu.lbr_nr) return; + if (has_branch_stack(event)) { + cpuc->lbr_select = 1; + cpuc->lbr_sel->config = event->hw.branch_reg.config; + } + perf_sched_cb_inc(event->ctx->pmu); if (!cpuc->lbr_users++ && !event->total_time_running) @@ -159,6 +242,9 @@ void amd_pmu_lbr_del(struct perf_event *event) if (!x86_pmu.lbr_nr) return; + if (has_branch_stack(event)) + cpuc->lbr_select = 0; + cpuc->lbr_users--; WARN_ON_ONCE(cpuc->lbr_users < 0); perf_sched_cb_dec(event->ctx->pmu); @@ -180,11 +266,17 @@ void amd_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) void amd_pmu_lbr_enable_all(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - u64 dbg_ctl, dbg_extn_cfg; + u64 lbr_select, dbg_ctl, dbg_extn_cfg; if (!cpuc->lbr_users || !x86_pmu.lbr_nr) return; + /* Set hardware branch filter */ + if (cpuc->lbr_select) { + lbr_select = cpuc->lbr_sel->config & LBR_SELECT_MASK; + wrmsrl(MSR_AMD64_LBR_SELECT, lbr_select); + } + 
rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl); rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg); -- cgit v1.2.3-59-g8ed1b From 4462fbfe6ec1bfe2196b977010f6ce7b43a32f2c Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 17:59:56 +0530 Subject: perf/x86: Move branch classifier Commit 3e702ff6d1ea ("perf/x86: Add LBR software filter support for Intel CPUs") introduces a software branch filter which complements the hardware branch filter and adds an x86 branch classifier. Move the branch classifier to arch/x86/events/ so that it can be utilized by other vendors for branch record filtering. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/bae5b95470d6bd49f40954bd379f414f5afcb965.1660211399.git.sandipan.das@amd.com --- arch/x86/events/Makefile | 2 +- arch/x86/events/intel/lbr.c | 273 ------------------------------------------- arch/x86/events/perf_event.h | 62 ++++++++++ arch/x86/events/utils.c | 216 ++++++++++++++++++++++++++++++++++ 4 files changed, 279 insertions(+), 274 deletions(-) create mode 100644 arch/x86/events/utils.c diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile index 9933c0e8e97a..86a76efa8bb6 100644 --- a/arch/x86/events/Makefile +++ b/arch/x86/events/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-y += core.o probe.o +obj-y += core.o probe.o utils.o obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += rapl.o obj-y += amd/ obj-$(CONFIG_X86_LOCAL_APIC) += msr.o diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 4f70fb6c2c1e..7dffc0c731a6 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -4,7 +4,6 @@ #include #include -#include #include "../perf_event.h" @@ -65,65 +64,6 @@ #define LBR_FROM_SIGNEXT_2MSB (BIT_ULL(60) | BIT_ULL(59)) -/* - * x86control flow change classification - * x86control flow changes include branches, interrupts, traps, faults - */ -enum { - X86_BR_NONE = 0, /* unknown */ - - X86_BR_USER = 1 << 0, /* branch target is user */ - X86_BR_KERNEL = 1 << 1, /* branch target is kernel */ - - X86_BR_CALL = 1 << 2, /* call */ - X86_BR_RET = 1 << 3, /* return */ - X86_BR_SYSCALL = 1 << 4, /* syscall */ - X86_BR_SYSRET = 1 << 5, /* syscall return */ - X86_BR_INT = 1 << 6, /* sw interrupt */ - X86_BR_IRET = 1 << 7, /* return from interrupt */ - X86_BR_JCC = 1 << 8, /* conditional */ - X86_BR_JMP = 1 << 9, /* jump */ - X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */ - X86_BR_IND_CALL = 1 << 11,/* indirect calls */ - X86_BR_ABORT = 1 << 12,/* transaction abort */ - X86_BR_IN_TX = 1 << 13,/* in transaction */ - X86_BR_NO_TX = 1 << 14,/* not in transaction */ - X86_BR_ZERO_CALL = 1 << 15,/* zero length call */ - X86_BR_CALL_STACK = 1 << 16,/* call stack */ - X86_BR_IND_JMP = 1 << 17,/* indirect jump */ - - X86_BR_TYPE_SAVE = 1 << 18,/* indicate to save branch type */ - -}; - -#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) -#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX) - -#define X86_BR_ANY \ - (X86_BR_CALL |\ - X86_BR_RET |\ - X86_BR_SYSCALL |\ - X86_BR_SYSRET |\ - X86_BR_INT |\ - X86_BR_IRET |\ - X86_BR_JCC |\ - X86_BR_JMP |\ - X86_BR_IRQ |\ - X86_BR_ABORT |\ - X86_BR_IND_CALL |\ - X86_BR_IND_JMP |\ - X86_BR_ZERO_CALL) - -#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) - -#define X86_BR_ANY_CALL \ - (X86_BR_CALL |\ - X86_BR_IND_CALL |\ - X86_BR_ZERO_CALL |\ - X86_BR_SYSCALL |\ - X86_BR_IRQ |\ - X86_BR_INT) - /* * Intel LBR_CTL bits * @@ -1143,219 +1083,6 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event) return ret; } -/* - * 
return the type of control flow change at address "from" - * instruction is not necessarily a branch (in case of interrupt). - * - * The branch type returned also includes the priv level of the - * target of the control flow change (X86_BR_USER, X86_BR_KERNEL). - * - * If a branch type is unknown OR the instruction cannot be - * decoded (e.g., text page not present), then X86_BR_NONE is - * returned. - */ -static int branch_type(unsigned long from, unsigned long to, int abort) -{ - struct insn insn; - void *addr; - int bytes_read, bytes_left; - int ret = X86_BR_NONE; - int ext, to_plm, from_plm; - u8 buf[MAX_INSN_SIZE]; - int is64 = 0; - - to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER; - from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER; - - /* - * maybe zero if lbr did not fill up after a reset by the time - * we get a PMU interrupt - */ - if (from == 0 || to == 0) - return X86_BR_NONE; - - if (abort) - return X86_BR_ABORT | to_plm; - - if (from_plm == X86_BR_USER) { - /* - * can happen if measuring at the user level only - * and we interrupt in a kernel thread, e.g., idle. - */ - if (!current->mm) - return X86_BR_NONE; - - /* may fail if text not present */ - bytes_left = copy_from_user_nmi(buf, (void __user *)from, - MAX_INSN_SIZE); - bytes_read = MAX_INSN_SIZE - bytes_left; - if (!bytes_read) - return X86_BR_NONE; - - addr = buf; - } else { - /* - * The LBR logs any address in the IP, even if the IP just - * faulted. This means userspace can control the from address. - * Ensure we don't blindly read any address by validating it is - * a known text address. - */ - if (kernel_text_address(from)) { - addr = (void *)from; - /* - * Assume we can get the maximum possible size - * when grabbing kernel data. This is not - * _strictly_ true since we could possibly be - * executing up next to a memory hole, but - * it is very unlikely to be a problem. - */ - bytes_read = MAX_INSN_SIZE; - } else { - return X86_BR_NONE; - } - } - - /* - * decoder needs to know the ABI especially - * on 64-bit systems running 32-bit apps - */ -#ifdef CONFIG_X86_64 - is64 = kernel_ip((unsigned long)addr) || any_64bit_mode(current_pt_regs()); -#endif - insn_init(&insn, addr, bytes_read, is64); - if (insn_get_opcode(&insn)) - return X86_BR_ABORT; - - switch (insn.opcode.bytes[0]) { - case 0xf: - switch (insn.opcode.bytes[1]) { - case 0x05: /* syscall */ - case 0x34: /* sysenter */ - ret = X86_BR_SYSCALL; - break; - case 0x07: /* sysret */ - case 0x35: /* sysexit */ - ret = X86_BR_SYSRET; - break; - case 0x80 ... 0x8f: /* conditional */ - ret = X86_BR_JCC; - break; - default: - ret = X86_BR_NONE; - } - break; - case 0x70 ... 0x7f: /* conditional */ - ret = X86_BR_JCC; - break; - case 0xc2: /* near ret */ - case 0xc3: /* near ret */ - case 0xca: /* far ret */ - case 0xcb: /* far ret */ - ret = X86_BR_RET; - break; - case 0xcf: /* iret */ - ret = X86_BR_IRET; - break; - case 0xcc ... 0xce: /* int */ - ret = X86_BR_INT; - break; - case 0xe8: /* call near rel */ - if (insn_get_immediate(&insn) || insn.immediate1.value == 0) { - /* zero length call */ - ret = X86_BR_ZERO_CALL; - break; - } - fallthrough; - case 0x9a: /* call far absolute */ - ret = X86_BR_CALL; - break; - case 0xe0 ... 0xe3: /* loop jmp */ - ret = X86_BR_JCC; - break; - case 0xe9 ... 
0xeb: /* jmp */ - ret = X86_BR_JMP; - break; - case 0xff: /* call near absolute, call far absolute ind */ - if (insn_get_modrm(&insn)) - return X86_BR_ABORT; - - ext = (insn.modrm.bytes[0] >> 3) & 0x7; - switch (ext) { - case 2: /* near ind call */ - case 3: /* far ind call */ - ret = X86_BR_IND_CALL; - break; - case 4: - case 5: - ret = X86_BR_IND_JMP; - break; - } - break; - default: - ret = X86_BR_NONE; - } - /* - * interrupts, traps, faults (and thus ring transition) may - * occur on any instructions. Thus, to classify them correctly, - * we need to first look at the from and to priv levels. If they - * are different and to is in the kernel, then it indicates - * a ring transition. If the from instruction is not a ring - * transition instr (syscall, systenter, int), then it means - * it was a irq, trap or fault. - * - * we have no way of detecting kernel to kernel faults. - */ - if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL - && ret != X86_BR_SYSCALL && ret != X86_BR_INT) - ret = X86_BR_IRQ; - - /* - * branch priv level determined by target as - * is done by HW when LBR_SELECT is implemented - */ - if (ret != X86_BR_NONE) - ret |= to_plm; - - return ret; -} - -#define X86_BR_TYPE_MAP_MAX 16 - -static int branch_map[X86_BR_TYPE_MAP_MAX] = { - PERF_BR_CALL, /* X86_BR_CALL */ - PERF_BR_RET, /* X86_BR_RET */ - PERF_BR_SYSCALL, /* X86_BR_SYSCALL */ - PERF_BR_SYSRET, /* X86_BR_SYSRET */ - PERF_BR_UNKNOWN, /* X86_BR_INT */ - PERF_BR_ERET, /* X86_BR_IRET */ - PERF_BR_COND, /* X86_BR_JCC */ - PERF_BR_UNCOND, /* X86_BR_JMP */ - PERF_BR_IRQ, /* X86_BR_IRQ */ - PERF_BR_IND_CALL, /* X86_BR_IND_CALL */ - PERF_BR_UNKNOWN, /* X86_BR_ABORT */ - PERF_BR_UNKNOWN, /* X86_BR_IN_TX */ - PERF_BR_UNKNOWN, /* X86_BR_NO_TX */ - PERF_BR_CALL, /* X86_BR_ZERO_CALL */ - PERF_BR_UNKNOWN, /* X86_BR_CALL_STACK */ - PERF_BR_IND, /* X86_BR_IND_JMP */ -}; - -static int -common_branch_type(int type) -{ - int i; - - type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */ - - if (type) { - i = __ffs(type); - if (i < X86_BR_TYPE_MAP_MAX) - return branch_map[i]; - } - - return PERF_BR_UNKNOWN; -} - enum { ARCH_LBR_BR_TYPE_JCC = 0, ARCH_LBR_BR_TYPE_NEAR_IND_JMP = 1, diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index e8930414cba5..2de9cd66347e 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1210,6 +1210,68 @@ static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip) regs->ip = ip; } +/* + * x86control flow change classification + * x86control flow changes include branches, interrupts, traps, faults + */ +enum { + X86_BR_NONE = 0, /* unknown */ + + X86_BR_USER = 1 << 0, /* branch target is user */ + X86_BR_KERNEL = 1 << 1, /* branch target is kernel */ + + X86_BR_CALL = 1 << 2, /* call */ + X86_BR_RET = 1 << 3, /* return */ + X86_BR_SYSCALL = 1 << 4, /* syscall */ + X86_BR_SYSRET = 1 << 5, /* syscall return */ + X86_BR_INT = 1 << 6, /* sw interrupt */ + X86_BR_IRET = 1 << 7, /* return from interrupt */ + X86_BR_JCC = 1 << 8, /* conditional */ + X86_BR_JMP = 1 << 9, /* jump */ + X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */ + X86_BR_IND_CALL = 1 << 11,/* indirect calls */ + X86_BR_ABORT = 1 << 12,/* transaction abort */ + X86_BR_IN_TX = 1 << 13,/* in transaction */ + X86_BR_NO_TX = 1 << 14,/* not in transaction */ + X86_BR_ZERO_CALL = 1 << 15,/* zero length call */ + X86_BR_CALL_STACK = 1 << 16,/* call stack */ + X86_BR_IND_JMP = 1 << 17,/* indirect jump */ + + X86_BR_TYPE_SAVE = 1 << 18,/* indicate to save branch type */ + +}; + +#define 
X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) +#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX) + +#define X86_BR_ANY \ + (X86_BR_CALL |\ + X86_BR_RET |\ + X86_BR_SYSCALL |\ + X86_BR_SYSRET |\ + X86_BR_INT |\ + X86_BR_IRET |\ + X86_BR_JCC |\ + X86_BR_JMP |\ + X86_BR_IRQ |\ + X86_BR_ABORT |\ + X86_BR_IND_CALL |\ + X86_BR_IND_JMP |\ + X86_BR_ZERO_CALL) + +#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) + +#define X86_BR_ANY_CALL \ + (X86_BR_CALL |\ + X86_BR_IND_CALL |\ + X86_BR_ZERO_CALL |\ + X86_BR_SYSCALL |\ + X86_BR_IRQ |\ + X86_BR_INT) + +int common_branch_type(int type); +int branch_type(unsigned long from, unsigned long to, int abort); + ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event); ssize_t intel_event_sysfs_show(char *page, u64 config); diff --git a/arch/x86/events/utils.c b/arch/x86/events/utils.c new file mode 100644 index 000000000000..a32368945462 --- /dev/null +++ b/arch/x86/events/utils.c @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include "perf_event.h" + +/* + * return the type of control flow change at address "from" + * instruction is not necessarily a branch (in case of interrupt). + * + * The branch type returned also includes the priv level of the + * target of the control flow change (X86_BR_USER, X86_BR_KERNEL). + * + * If a branch type is unknown OR the instruction cannot be + * decoded (e.g., text page not present), then X86_BR_NONE is + * returned. + */ +int branch_type(unsigned long from, unsigned long to, int abort) +{ + struct insn insn; + void *addr; + int bytes_read, bytes_left; + int ret = X86_BR_NONE; + int ext, to_plm, from_plm; + u8 buf[MAX_INSN_SIZE]; + int is64 = 0; + + to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER; + from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER; + + /* + * maybe zero if lbr did not fill up after a reset by the time + * we get a PMU interrupt + */ + if (from == 0 || to == 0) + return X86_BR_NONE; + + if (abort) + return X86_BR_ABORT | to_plm; + + if (from_plm == X86_BR_USER) { + /* + * can happen if measuring at the user level only + * and we interrupt in a kernel thread, e.g., idle. + */ + if (!current->mm) + return X86_BR_NONE; + + /* may fail if text not present */ + bytes_left = copy_from_user_nmi(buf, (void __user *)from, + MAX_INSN_SIZE); + bytes_read = MAX_INSN_SIZE - bytes_left; + if (!bytes_read) + return X86_BR_NONE; + + addr = buf; + } else { + /* + * The LBR logs any address in the IP, even if the IP just + * faulted. This means userspace can control the from address. + * Ensure we don't blindly read any address by validating it is + * a known text address. + */ + if (kernel_text_address(from)) { + addr = (void *)from; + /* + * Assume we can get the maximum possible size + * when grabbing kernel data. This is not + * _strictly_ true since we could possibly be + * executing up next to a memory hole, but + * it is very unlikely to be a problem. 
+ */ + bytes_read = MAX_INSN_SIZE; + } else { + return X86_BR_NONE; + } + } + + /* + * decoder needs to know the ABI especially + * on 64-bit systems running 32-bit apps + */ +#ifdef CONFIG_X86_64 + is64 = kernel_ip((unsigned long)addr) || any_64bit_mode(current_pt_regs()); +#endif + insn_init(&insn, addr, bytes_read, is64); + if (insn_get_opcode(&insn)) + return X86_BR_ABORT; + + switch (insn.opcode.bytes[0]) { + case 0xf: + switch (insn.opcode.bytes[1]) { + case 0x05: /* syscall */ + case 0x34: /* sysenter */ + ret = X86_BR_SYSCALL; + break; + case 0x07: /* sysret */ + case 0x35: /* sysexit */ + ret = X86_BR_SYSRET; + break; + case 0x80 ... 0x8f: /* conditional */ + ret = X86_BR_JCC; + break; + default: + ret = X86_BR_NONE; + } + break; + case 0x70 ... 0x7f: /* conditional */ + ret = X86_BR_JCC; + break; + case 0xc2: /* near ret */ + case 0xc3: /* near ret */ + case 0xca: /* far ret */ + case 0xcb: /* far ret */ + ret = X86_BR_RET; + break; + case 0xcf: /* iret */ + ret = X86_BR_IRET; + break; + case 0xcc ... 0xce: /* int */ + ret = X86_BR_INT; + break; + case 0xe8: /* call near rel */ + if (insn_get_immediate(&insn) || insn.immediate1.value == 0) { + /* zero length call */ + ret = X86_BR_ZERO_CALL; + break; + } + fallthrough; + case 0x9a: /* call far absolute */ + ret = X86_BR_CALL; + break; + case 0xe0 ... 0xe3: /* loop jmp */ + ret = X86_BR_JCC; + break; + case 0xe9 ... 0xeb: /* jmp */ + ret = X86_BR_JMP; + break; + case 0xff: /* call near absolute, call far absolute ind */ + if (insn_get_modrm(&insn)) + return X86_BR_ABORT; + + ext = (insn.modrm.bytes[0] >> 3) & 0x7; + switch (ext) { + case 2: /* near ind call */ + case 3: /* far ind call */ + ret = X86_BR_IND_CALL; + break; + case 4: + case 5: + ret = X86_BR_IND_JMP; + break; + } + break; + default: + ret = X86_BR_NONE; + } + /* + * interrupts, traps, faults (and thus ring transition) may + * occur on any instructions. Thus, to classify them correctly, + * we need to first look at the from and to priv levels. If they + * are different and to is in the kernel, then it indicates + * a ring transition. If the from instruction is not a ring + * transition instr (syscall, systenter, int), then it means + * it was a irq, trap or fault. + * + * we have no way of detecting kernel to kernel faults. 
+ */ + if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL + && ret != X86_BR_SYSCALL && ret != X86_BR_INT) + ret = X86_BR_IRQ; + + /* + * branch priv level determined by target as + * is done by HW when LBR_SELECT is implemented + */ + if (ret != X86_BR_NONE) + ret |= to_plm; + + return ret; +} + +#define X86_BR_TYPE_MAP_MAX 16 + +static int branch_map[X86_BR_TYPE_MAP_MAX] = { + PERF_BR_CALL, /* X86_BR_CALL */ + PERF_BR_RET, /* X86_BR_RET */ + PERF_BR_SYSCALL, /* X86_BR_SYSCALL */ + PERF_BR_SYSRET, /* X86_BR_SYSRET */ + PERF_BR_UNKNOWN, /* X86_BR_INT */ + PERF_BR_ERET, /* X86_BR_IRET */ + PERF_BR_COND, /* X86_BR_JCC */ + PERF_BR_UNCOND, /* X86_BR_JMP */ + PERF_BR_IRQ, /* X86_BR_IRQ */ + PERF_BR_IND_CALL, /* X86_BR_IND_CALL */ + PERF_BR_UNKNOWN, /* X86_BR_ABORT */ + PERF_BR_UNKNOWN, /* X86_BR_IN_TX */ + PERF_BR_UNKNOWN, /* X86_BR_NO_TX */ + PERF_BR_CALL, /* X86_BR_ZERO_CALL */ + PERF_BR_UNKNOWN, /* X86_BR_CALL_STACK */ + PERF_BR_IND, /* X86_BR_IND_JMP */ +}; + +int common_branch_type(int type) +{ + int i; + + type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */ + + if (type) { + i = __ffs(type); + if (i < X86_BR_TYPE_MAP_MAX) + return branch_map[i]; + } + + return PERF_BR_UNKNOWN; +} -- cgit v1.2.3-59-g8ed1b From f9c732249b110fae9ebf4ce33db4cb3a12c6eae3 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 17:59:57 +0530 Subject: perf/x86/amd/lbr: Add LbrExtV2 software branch filter support With AMD Last Branch Record Extension Version 2 (LbrExtV2), it is necessary to process the branch records further as hardware filtering is not granular enough for identifying certain types of branches. E.g. to record system calls, one should record far branches. The filter captures both far calls and far returns but the irrelevant records are filtered out based on the branch type as seen by the branch classifier. 
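A toy, self-contained sketch of the look-ahead described above (not the kernel helper: the 0x70-0x7f short-Jcc opcode check stands in for the full x86 instruction decoder, and the real classifier steps by decoded instruction length rather than byte by byte):

	#include <stdio.h>

	int main(void)
	{
		/*
		 * A fused cmp+jne pair: cmp eax, 1 (83 f8 01) followed by
		 * jne +5 (75 05).  Hardware may report "from" as the address
		 * of the cmp; the classifier scans ahead for the real branch.
		 */
		unsigned char code[] = { 0x83, 0xf8, 0x01, 0x75, 0x05 };
		unsigned int offset;

		for (offset = 0; offset < sizeof(code); offset++) {
			if (code[offset] >= 0x70 && code[offset] <= 0x7f) {
				printf("branch at from + %u\n", offset);	/* from + 3 */
				return 0;
			}
		}

		printf("no branch within the window\n");
		return 0;
	}

The reported offset lets callers point the sampled "from" address at the actual branch instruction, as the commit message describes.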
Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/e51de057517f77788abd393c832e8dea616d489c.1660211399.git.sandipan.das@amd.com --- arch/x86/events/amd/lbr.c | 92 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 87 insertions(+), 5 deletions(-) diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c index bb79b43b7cd8..1a8d27e0c145 100644 --- a/arch/x86/events/amd/lbr.c +++ b/arch/x86/events/amd/lbr.c @@ -94,6 +94,50 @@ static __always_inline u64 sign_ext_branch_ip(u64 ip) return (u64)(((s64)ip << shift) >> shift); } +static void amd_pmu_lbr_filter(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int br_sel = cpuc->br_sel, type, i, j; + bool compress = false; + u64 from, to; + + /* If sampling all branches, there is nothing to filter */ + if (((br_sel & X86_BR_ALL) == X86_BR_ALL) && + ((br_sel & X86_BR_TYPE_SAVE) != X86_BR_TYPE_SAVE)) + return; + + for (i = 0; i < cpuc->lbr_stack.nr; i++) { + from = cpuc->lbr_entries[i].from; + to = cpuc->lbr_entries[i].to; + type = branch_type(from, to, 0); + + /* If type does not correspond, then discard */ + if (type == X86_BR_NONE || (br_sel & type) != type) { + cpuc->lbr_entries[i].from = 0; /* mark invalid */ + compress = true; + } + + if ((br_sel & X86_BR_TYPE_SAVE) == X86_BR_TYPE_SAVE) + cpuc->lbr_entries[i].type = common_branch_type(type); + } + + if (!compress) + return; + + /* Remove all invalid entries */ + for (i = 0; i < cpuc->lbr_stack.nr; ) { + if (!cpuc->lbr_entries[i].from) { + j = i; + while (++j < cpuc->lbr_stack.nr) + cpuc->lbr_entries[j - 1] = cpuc->lbr_entries[j]; + cpuc->lbr_stack.nr--; + if (!cpuc->lbr_entries[i].from) + continue; + } + i++; + } +} + void amd_pmu_lbr_read(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -128,6 +172,9 @@ void amd_pmu_lbr_read(void) * LBR To[0] always represent the TOS */ cpuc->lbr_stack.hw_idx = 0; + + /* Perform further software filtering */ + amd_pmu_lbr_filter(); } static const int lbr_select_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { @@ -136,8 +183,8 @@ static const int lbr_select_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGNORE, [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY, - [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL, - [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN, + [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL | LBR_FAR, + [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR, [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, [PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT] = LBR_NOT_SUPP, [PERF_SAMPLE_BRANCH_IN_TX_SHIFT] = LBR_NOT_SUPP, @@ -150,8 +197,6 @@ static const int lbr_select_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { [PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT] = LBR_NOT_SUPP, [PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT] = LBR_NOT_SUPP, - - [PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT] = LBR_NOT_SUPP, }; static int amd_pmu_lbr_setup_filter(struct perf_event *event) @@ -165,6 +210,41 @@ static int amd_pmu_lbr_setup_filter(struct perf_event *event) if (!x86_pmu.lbr_nr) return -EOPNOTSUPP; + if (br_type & PERF_SAMPLE_BRANCH_USER) + mask |= X86_BR_USER; + + if (br_type & PERF_SAMPLE_BRANCH_KERNEL) + mask |= X86_BR_KERNEL; + + /* Ignore BRANCH_HV here */ + + if (br_type & PERF_SAMPLE_BRANCH_ANY) + mask |= X86_BR_ANY; + + if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL) + mask |= X86_BR_ANY_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN) + mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET; + + if (br_type & PERF_SAMPLE_BRANCH_IND_CALL) 
+ mask |= X86_BR_IND_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_COND) + mask |= X86_BR_JCC; + + if (br_type & PERF_SAMPLE_BRANCH_IND_JUMP) + mask |= X86_BR_IND_JMP; + + if (br_type & PERF_SAMPLE_BRANCH_CALL) + mask |= X86_BR_CALL | X86_BR_ZERO_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_TYPE_SAVE) + mask |= X86_BR_TYPE_SAVE; + + reg->reg = mask; + mask = 0; + for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) { if (!(br_type & BIT_ULL(i))) continue; @@ -220,13 +300,15 @@ void amd_pmu_lbr_reset(void) void amd_pmu_lbr_add(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event_extra *reg = &event->hw.branch_reg; if (!x86_pmu.lbr_nr) return; if (has_branch_stack(event)) { cpuc->lbr_select = 1; - cpuc->lbr_sel->config = event->hw.branch_reg.config; + cpuc->lbr_sel->config = reg->config; + cpuc->br_sel = reg->reg; } perf_sched_cb_inc(event->ctx->pmu); -- cgit v1.2.3-59-g8ed1b From df3e9612f758fb5f9c251cbe262e3c68ffe67b2c Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 17:59:58 +0530 Subject: perf/x86: Make branch classifier fusion-aware With branch fusion and other optimizations, branch sampling hardware in some processors can report a branch from address that points to an instruction preceding the actual branch by several bytes. In such cases, the classifier cannot determine the branch type which leads to failures such as with the recently added test from commit b55878c90ab9 ("perf test: Add test for branch stack sampling"). Branch information is also easier to consume and annotate if branch from addresses always point to branch instructions. Add a new variant of the branch classifier that can account for instruction fusion. If fusion is expected and the current branch from address does not point to a branch instruction, it attempts to find the first branch within the next (MAX_INSN_SIZE - 1) bytes and if found, additionally provides the offset between the reported branch from address and the address of the expected branch instruction. Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/b6bb0abaa8a54c0b6d716344700ee11a1793d709.1660211399.git.sandipan.das@amd.com --- arch/x86/events/perf_event.h | 2 + arch/x86/events/utils.c | 169 +++++++++++++++++++++++++------------------ 2 files changed, 102 insertions(+), 69 deletions(-) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 2de9cd66347e..93263b98cd6e 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1271,6 +1271,8 @@ enum { int common_branch_type(int type); int branch_type(unsigned long from, unsigned long to, int abort); +int branch_type_fused(unsigned long from, unsigned long to, int abort, + int *offset); ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event); ssize_t intel_event_sysfs_show(char *page, u64 config); diff --git a/arch/x86/events/utils.c b/arch/x86/events/utils.c index a32368945462..e013243f360c 100644 --- a/arch/x86/events/utils.c +++ b/arch/x86/events/utils.c @@ -3,6 +3,68 @@ #include "perf_event.h" +static int decode_branch_type(struct insn *insn) +{ + int ext; + + if (insn_get_opcode(insn)) + return X86_BR_ABORT; + + switch (insn->opcode.bytes[0]) { + case 0xf: + switch (insn->opcode.bytes[1]) { + case 0x05: /* syscall */ + case 0x34: /* sysenter */ + return X86_BR_SYSCALL; + case 0x07: /* sysret */ + case 0x35: /* sysexit */ + return X86_BR_SYSRET; + case 0x80 ... 
0x8f: /* conditional */ + return X86_BR_JCC; + } + return X86_BR_NONE; + case 0x70 ... 0x7f: /* conditional */ + return X86_BR_JCC; + case 0xc2: /* near ret */ + case 0xc3: /* near ret */ + case 0xca: /* far ret */ + case 0xcb: /* far ret */ + return X86_BR_RET; + case 0xcf: /* iret */ + return X86_BR_IRET; + case 0xcc ... 0xce: /* int */ + return X86_BR_INT; + case 0xe8: /* call near rel */ + if (insn_get_immediate(insn) || insn->immediate1.value == 0) { + /* zero length call */ + return X86_BR_ZERO_CALL; + } + fallthrough; + case 0x9a: /* call far absolute */ + return X86_BR_CALL; + case 0xe0 ... 0xe3: /* loop jmp */ + return X86_BR_JCC; + case 0xe9 ... 0xeb: /* jmp */ + return X86_BR_JMP; + case 0xff: /* call near absolute, call far absolute ind */ + if (insn_get_modrm(insn)) + return X86_BR_ABORT; + + ext = (insn->modrm.bytes[0] >> 3) & 0x7; + switch (ext) { + case 2: /* near ind call */ + case 3: /* far ind call */ + return X86_BR_IND_CALL; + case 4: + case 5: + return X86_BR_IND_JMP; + } + return X86_BR_NONE; + } + + return X86_BR_NONE; +} + /* * return the type of control flow change at address "from" * instruction is not necessarily a branch (in case of interrupt). @@ -13,14 +75,22 @@ * If a branch type is unknown OR the instruction cannot be * decoded (e.g., text page not present), then X86_BR_NONE is * returned. + * + * While recording branches, some processors can report the "from" + * address to be that of an instruction preceding the actual branch + * when instruction fusion occurs. If fusion is expected, attempt to + * find the type of the first branch instruction within the next + * MAX_INSN_SIZE bytes and if found, provide the offset between the + * reported "from" address and the actual branch instruction address. */ -int branch_type(unsigned long from, unsigned long to, int abort) +static int get_branch_type(unsigned long from, unsigned long to, int abort, + bool fused, int *offset) { struct insn insn; void *addr; - int bytes_read, bytes_left; + int bytes_read, bytes_left, insn_offset; int ret = X86_BR_NONE; - int ext, to_plm, from_plm; + int to_plm, from_plm; u8 buf[MAX_INSN_SIZE]; int is64 = 0; @@ -83,77 +153,27 @@ int branch_type(unsigned long from, unsigned long to, int abort) is64 = kernel_ip((unsigned long)addr) || any_64bit_mode(current_pt_regs()); #endif insn_init(&insn, addr, bytes_read, is64); - if (insn_get_opcode(&insn)) - return X86_BR_ABORT; + ret = decode_branch_type(&insn); + insn_offset = 0; - switch (insn.opcode.bytes[0]) { - case 0xf: - switch (insn.opcode.bytes[1]) { - case 0x05: /* syscall */ - case 0x34: /* sysenter */ - ret = X86_BR_SYSCALL; - break; - case 0x07: /* sysret */ - case 0x35: /* sysexit */ - ret = X86_BR_SYSRET; - break; - case 0x80 ... 0x8f: /* conditional */ - ret = X86_BR_JCC; - break; - default: - ret = X86_BR_NONE; - } - break; - case 0x70 ... 0x7f: /* conditional */ - ret = X86_BR_JCC; - break; - case 0xc2: /* near ret */ - case 0xc3: /* near ret */ - case 0xca: /* far ret */ - case 0xcb: /* far ret */ - ret = X86_BR_RET; - break; - case 0xcf: /* iret */ - ret = X86_BR_IRET; - break; - case 0xcc ... 
0xce: /* int */ - ret = X86_BR_INT; - break; - case 0xe8: /* call near rel */ - if (insn_get_immediate(&insn) || insn.immediate1.value == 0) { - /* zero length call */ - ret = X86_BR_ZERO_CALL; + /* Check for the possibility of branch fusion */ + while (fused && ret == X86_BR_NONE) { + /* Check for decoding errors */ + if (insn_get_length(&insn) || !insn.length) break; - } - fallthrough; - case 0x9a: /* call far absolute */ - ret = X86_BR_CALL; - break; - case 0xe0 ... 0xe3: /* loop jmp */ - ret = X86_BR_JCC; - break; - case 0xe9 ... 0xeb: /* jmp */ - ret = X86_BR_JMP; - break; - case 0xff: /* call near absolute, call far absolute ind */ - if (insn_get_modrm(&insn)) - return X86_BR_ABORT; - ext = (insn.modrm.bytes[0] >> 3) & 0x7; - switch (ext) { - case 2: /* near ind call */ - case 3: /* far ind call */ - ret = X86_BR_IND_CALL; + insn_offset += insn.length; + bytes_read -= insn.length; + if (bytes_read < 0) break; - case 4: - case 5: - ret = X86_BR_IND_JMP; - break; - } - break; - default: - ret = X86_BR_NONE; + + insn_init(&insn, addr + insn_offset, bytes_read, is64); + ret = decode_branch_type(&insn); } + + if (offset) + *offset = insn_offset; + /* * interrupts, traps, faults (and thus ring transition) may * occur on any instructions. Thus, to classify them correctly, @@ -179,6 +199,17 @@ int branch_type(unsigned long from, unsigned long to, int abort) return ret; } +int branch_type(unsigned long from, unsigned long to, int abort) +{ + return get_branch_type(from, to, abort, false, NULL); +} + +int branch_type_fused(unsigned long from, unsigned long to, int abort, + int *offset) +{ + return get_branch_type(from, to, abort, true, offset); +} + #define X86_BR_TYPE_MAP_MAX 16 static int branch_map[X86_BR_TYPE_MAP_MAX] = { -- cgit v1.2.3-59-g8ed1b From 245268c19f701c7222dedcb6a383bc73d63925d4 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 17:59:59 +0530 Subject: perf/x86/amd/lbr: Use fusion-aware branch classifier AMD Last Branch Record Extension Version 2 (LbrExtV2) can report a branch from address that points to an instruction preceding the actual branch by several bytes due to branch fusion and further optimizations in Zen4 processors. In such cases, software should move forward sequentially in the instruction stream from the reported address and the address of the first branch encountered should be used instead. Hence, use the fusion-aware branch classifier to determine the correct branch type and get the offset for adjusting the branch from address. 
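For clarity, the per-entry effect of this change can be sketched as follows. This is a condensed illustration only; the actual loop and filtering live in amd_pmu_lbr_filter() in the diff below:

	int offset, type;

	type = branch_type_fused(from, to, 0, &offset);

	/* "from" points before the fused branch; move it forward */
	if (offset)
		from += offset;
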
Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/c324d2d0a9c3976da30b9563d09e50bfee0f264d.1660211399.git.sandipan.das@amd.com --- arch/x86/events/amd/lbr.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c index 1a8d27e0c145..eb84f196b2ca 100644 --- a/arch/x86/events/amd/lbr.c +++ b/arch/x86/events/amd/lbr.c @@ -97,7 +97,7 @@ static __always_inline u64 sign_ext_branch_ip(u64 ip) static void amd_pmu_lbr_filter(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - int br_sel = cpuc->br_sel, type, i, j; + int br_sel = cpuc->br_sel, offset, type, i, j; bool compress = false; u64 from, to; @@ -109,7 +109,15 @@ static void amd_pmu_lbr_filter(void) for (i = 0; i < cpuc->lbr_stack.nr; i++) { from = cpuc->lbr_entries[i].from; to = cpuc->lbr_entries[i].to; - type = branch_type(from, to, 0); + type = branch_type_fused(from, to, 0, &offset); + + /* + * Adjust the branch from address in case of instruction + * fusion where it points to an instruction preceding the + * actual branch + */ + if (offset) + cpuc->lbr_entries[i].from += offset; /* If type does not correspond, then discard */ if (type == X86_BR_NONE || (br_sel & type) != type) { -- cgit v1.2.3-59-g8ed1b From 93315e46b000fc80fff5d53c3f444417fb3df6de Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 18:00:00 +0530 Subject: perf/core: Add speculation info to branch entries Add a new "spec" bitfield to branch entries for providing speculation information. This will be populated using hints provided by branch sampling features on supported hardware. The following cases are covered: * No branch speculation information is available * Branch is speculative but taken on the wrong path * Branch is non-speculative but taken on the correct path * Branch is speculative and taken on the correct path Suggested-by: Stephane Eranian Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/834088c302faf21c7b665031dd111f424e509a64.1660211399.git.sandipan.das@amd.com --- include/linux/perf_event.h | 1 + include/uapi/linux/perf_event.h | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ee8b9ecdc03b..ae30c61957d2 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1078,6 +1078,7 @@ static inline void perf_clear_branch_entry_bitfields(struct perf_branch_entry *b br->abort = 0; br->cycles = 0; br->type = 0; + br->spec = PERF_BR_SPEC_NA; br->reserved = 0; } diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 03b370062741..30a4723aefd4 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -256,6 +256,17 @@ enum { PERF_BR_MAX, }; +/* + * Common branch speculation outcome classification + */ +enum { + PERF_BR_SPEC_NA = 0, /* Not available */ + PERF_BR_SPEC_WRONG_PATH = 1, /* Speculative but on wrong path */ + PERF_BR_NON_SPEC_CORRECT_PATH = 2, /* Non-speculative but on correct path */ + PERF_BR_SPEC_CORRECT_PATH = 3, /* Speculative and on correct path */ + PERF_BR_SPEC_MAX, +}; + #define PERF_SAMPLE_BRANCH_PLM_ALL \ (PERF_SAMPLE_BRANCH_USER|\ PERF_SAMPLE_BRANCH_KERNEL|\ @@ -1363,6 +1374,7 @@ union perf_mem_data_src { * abort: aborting a hardware transaction * cycles: cycles from last branch (or 0 if not supported) * type: branch type + * spec: branch speculation info (or 0 if not 
supported) */ struct perf_branch_entry { __u64 from; @@ -1373,7 +1385,8 @@ struct perf_branch_entry { abort:1, /* transaction abort */ cycles:16, /* cycle count to last branch */ type:4, /* branch type */ - reserved:40; + spec:2, /* branch speculation info */ + reserved:38; }; union perf_sample_weight { -- cgit v1.2.3-59-g8ed1b From 0bc3be5b4bfd5b75086c26d63584a6f7aaea87d5 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 11 Aug 2022 18:00:01 +0530 Subject: perf/x86/amd/lbr: Add LbrExtV2 branch speculation info support Provide branch speculation information captured via AMD Last Branch Record Extension Version 2 (LbrExtV2) by setting the speculation info in branch records. The info is based on the "valid" and "spec" bits in the Branch To registers. Suggested-by: Stephane Eranian Signed-off-by: Sandipan Das Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/ddc02f6320464cad0e3ff5bdb2314531568a91bc.1660211399.git.sandipan.das@amd.com --- arch/x86/events/amd/lbr.c | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c index eb84f196b2ca..2e1c1573efe7 100644 --- a/arch/x86/events/amd/lbr.c +++ b/arch/x86/events/amd/lbr.c @@ -146,12 +146,19 @@ static void amd_pmu_lbr_filter(void) } } +static const int lbr_spec_map[PERF_BR_SPEC_MAX] = { + PERF_BR_SPEC_NA, + PERF_BR_SPEC_WRONG_PATH, + PERF_BR_NON_SPEC_CORRECT_PATH, + PERF_BR_SPEC_CORRECT_PATH, +}; + void amd_pmu_lbr_read(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct perf_branch_entry *br = cpuc->lbr_entries; struct branch_entry entry; - int out = 0, i; + int out = 0, idx, i; if (!cpuc->lbr_users) return; @@ -160,8 +167,11 @@ void amd_pmu_lbr_read(void) entry.from.full = amd_pmu_lbr_get_from(i); entry.to.full = amd_pmu_lbr_get_to(i); - /* Check if a branch has been logged */ - if (!entry.to.split.valid) + /* + * Check if a branch has been logged; if valid = 0, spec = 0 + * then no branch was recorded + */ + if (!entry.to.split.valid && !entry.to.split.spec) continue; perf_clear_branch_entry_bitfields(br + out); @@ -170,6 +180,25 @@ void amd_pmu_lbr_read(void) br[out].to = sign_ext_branch_ip(entry.to.split.ip); br[out].mispred = entry.from.split.mispredict; br[out].predicted = !br[out].mispred; + + /* + * Set branch speculation information using the status of + * the valid and spec bits. + * + * When valid = 0, spec = 0, no branch was recorded and the + * entry is discarded as seen above. + * + * When valid = 0, spec = 1, the recorded branch was + * speculative but took the wrong path. + * + * When valid = 1, spec = 0, the recorded branch was + * non-speculative but took the correct path. + * + * When valid = 1, spec = 1, the recorded branch was + * speculative and took the correct path + */ + idx = (entry.to.split.valid << 1) | entry.to.split.spec; + br[out].spec = lbr_spec_map[idx]; out++; } -- cgit v1.2.3-59-g8ed1b From a724ec82966d57e4b5d36341d3e3dc1a3c011564 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 24 Aug 2022 10:18:15 +0530 Subject: perf: Add system error and not in transaction branch types This expands generic branch type classification by adding two more entries there in i.e system error and not in transaction. This also updates the x86 implementation to process X86_BR_NO_TX records as appropriate. This changes branch types reported to user space on x86 platform but it should not be a problem. The possible scenarios and impacts are enumerated here. 
-------------------------------------------------------------------------- | kernel | perf tool | Impact | -------------------------------------------------------------------------- | old | old | Works as before | -------------------------------------------------------------------------- | old | new | PERF_BR_UNKNOWN is processed | -------------------------------------------------------------------------- | new | old | PERF_BR_NO_TX is blocked via old PERF_BR_MAX | -------------------------------------------------------------------------- | new | new | PERF_BR_NO_TX is recognized | -------------------------------------------------------------------------- When PERF_BR_NO_TX is blocked via old PERF_BR_MAX (new kernel with old perf tool) the user space might throw up an warning complaining about an unrecognized branch types being reported, but it's expected. PERF_BR_SERROR & PERF_BR_NO_TX branch types will be used for BRBE implementation on arm64 platform. PERF_BR_NO_TX complements 'abort' and 'in_tx' elements in perf_branch_entry which represent other transaction states for a given branch record. Because this completes the transaction state classification. Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: James Clark Link: https://lkml.kernel.org/r/20220824044822.70230-2-anshuman.khandual@arm.com --- arch/x86/events/utils.c | 2 +- include/uapi/linux/perf_event.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/utils.c b/arch/x86/events/utils.c index e013243f360c..5f5617afde79 100644 --- a/arch/x86/events/utils.c +++ b/arch/x86/events/utils.c @@ -225,7 +225,7 @@ static int branch_map[X86_BR_TYPE_MAP_MAX] = { PERF_BR_IND_CALL, /* X86_BR_IND_CALL */ PERF_BR_UNKNOWN, /* X86_BR_ABORT */ PERF_BR_UNKNOWN, /* X86_BR_IN_TX */ - PERF_BR_UNKNOWN, /* X86_BR_NO_TX */ + PERF_BR_NO_TX, /* X86_BR_NO_TX */ PERF_BR_CALL, /* X86_BR_ZERO_CALL */ PERF_BR_UNKNOWN, /* X86_BR_CALL_STACK */ PERF_BR_IND, /* X86_BR_IND_JMP */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 30a4723aefd4..a79cc0eb4de7 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -253,6 +253,8 @@ enum { PERF_BR_COND_RET = 10, /* conditional function return */ PERF_BR_ERET = 11, /* exception return */ PERF_BR_IRQ = 12, /* irq */ + PERF_BR_SERROR = 13, /* system error */ + PERF_BR_NO_TX = 14, /* not in transaction */ PERF_BR_MAX, }; -- cgit v1.2.3-59-g8ed1b From b190bc4ac9e6d9763b61654c5a0c085ff77d7a09 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 24 Aug 2022 10:18:16 +0530 Subject: perf: Extend branch type classification branch_entry.type now has ran out of space to accommodate more branch types classification. This will prevent perf branch stack implementation on arm64 (via BRBE) to capture all available branch types. Extending this bit field i.e branch_entry.type [4 bits] is not an option as it will break user space ABI both for little and big endian perf tools. Extend branch classification with a new field branch_entry.new_type via a new branch type PERF_BR_EXTEND_ABI in branch_entry.type. Perf tools which could decode PERF_BR_EXTEND_ABI, will then parse branch_entry.new_type as well. branch_entry.new_type is a 4 bit field which can hold upto 16 branch types. The first three branch types will hold various generic page faults followed by five architecture specific branch types, which can be overridden by the platform for specific use cases. 
These architecture specific branch types gets overridden on arm64 platform for BRBE implementation. New generic branch types - PERF_BR_NEW_FAULT_ALGN - PERF_BR_NEW_FAULT_DATA - PERF_BR_NEW_FAULT_INST New arch specific branch types - PERF_BR_NEW_ARCH_1 - PERF_BR_NEW_ARCH_2 - PERF_BR_NEW_ARCH_3 - PERF_BR_NEW_ARCH_4 - PERF_BR_NEW_ARCH_5 Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: James Clark Link: https://lkml.kernel.org/r/20220824044822.70230-3-anshuman.khandual@arm.com --- include/uapi/linux/perf_event.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index a79cc0eb4de7..fed60e6b10e5 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -255,6 +255,7 @@ enum { PERF_BR_IRQ = 12, /* irq */ PERF_BR_SERROR = 13, /* system error */ PERF_BR_NO_TX = 14, /* not in transaction */ + PERF_BR_EXTEND_ABI = 15, /* extend ABI */ PERF_BR_MAX, }; @@ -269,6 +270,18 @@ enum { PERF_BR_SPEC_MAX, }; +enum { + PERF_BR_NEW_FAULT_ALGN = 0, /* Alignment fault */ + PERF_BR_NEW_FAULT_DATA = 1, /* Data fault */ + PERF_BR_NEW_FAULT_INST = 2, /* Inst fault */ + PERF_BR_NEW_ARCH_1 = 3, /* Architecture specific */ + PERF_BR_NEW_ARCH_2 = 4, /* Architecture specific */ + PERF_BR_NEW_ARCH_3 = 5, /* Architecture specific */ + PERF_BR_NEW_ARCH_4 = 6, /* Architecture specific */ + PERF_BR_NEW_ARCH_5 = 7, /* Architecture specific */ + PERF_BR_NEW_MAX, +}; + #define PERF_SAMPLE_BRANCH_PLM_ALL \ (PERF_SAMPLE_BRANCH_USER|\ PERF_SAMPLE_BRANCH_KERNEL|\ @@ -1388,7 +1401,8 @@ struct perf_branch_entry { cycles:16, /* cycle count to last branch */ type:4, /* branch type */ spec:2, /* branch speculation info */ - reserved:38; + new_type:4, /* additional branch type */ + reserved:34; }; union perf_sample_weight { -- cgit v1.2.3-59-g8ed1b From 5402d25aa5710d240040f73fb13d7d5c303ef071 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 24 Aug 2022 10:18:17 +0530 Subject: perf: Capture branch privilege information Platforms like arm64 could capture privilege level information for all the branch records. Hence this adds a new element in the struct branch_entry to record the privilege level information, which could be requested through a new event.attr.branch_sample_type based flag PERF_SAMPLE_BRANCH_PRIV_SAVE. This flag helps user choose whether privilege information is captured. 
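From a tool's point of view, the new information would be requested and decoded roughly as sketched below. This is illustrative userspace code: only PERF_SAMPLE_BRANCH_PRIV_SAVE and the PERF_BR_PRIV_* values come from this patch, while the attribute setup and the helper name are assumptions.

	#include <linux/perf_event.h>

	/* Illustrative helper: map the new "priv" bits to a label. */
	static const char *branch_priv_str(unsigned int priv)
	{
		switch (priv) {
		case PERF_BR_PRIV_USER:   return "user";
		case PERF_BR_PRIV_KERNEL: return "kernel";
		case PERF_BR_PRIV_HV:     return "hv";
		default:                  return "unknown"; /* PERF_BR_PRIV_UNKNOWN */
		}
	}

	/* Request privilege info alongside the branch stack. */
	static void request_branch_priv(struct perf_event_attr *attr)
	{
		attr->sample_type        |= PERF_SAMPLE_BRANCH_STACK;
		attr->branch_sample_type |= PERF_SAMPLE_BRANCH_ANY |
					    PERF_SAMPLE_BRANCH_PRIV_SAVE;
	}
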
Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: James Clark Link: https://lkml.kernel.org/r/20220824044822.70230-4-anshuman.khandual@arm.com --- include/uapi/linux/perf_event.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index fed60e6b10e5..1a258d45a3fa 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -204,6 +204,8 @@ enum perf_branch_sample_type_shift { PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT = 17, /* save low level index of raw branch records */ + PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT = 18, /* save privilege mode */ + PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */ }; @@ -233,6 +235,8 @@ enum perf_branch_sample_type { PERF_SAMPLE_BRANCH_HW_INDEX = 1U << PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT, + PERF_SAMPLE_BRANCH_PRIV_SAVE = 1U << PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT, + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, }; @@ -282,6 +286,13 @@ enum { PERF_BR_NEW_MAX, }; +enum { + PERF_BR_PRIV_UNKNOWN = 0, + PERF_BR_PRIV_USER = 1, + PERF_BR_PRIV_KERNEL = 2, + PERF_BR_PRIV_HV = 3, +}; + #define PERF_SAMPLE_BRANCH_PLM_ALL \ (PERF_SAMPLE_BRANCH_USER|\ PERF_SAMPLE_BRANCH_KERNEL|\ @@ -1402,7 +1413,8 @@ struct perf_branch_entry { type:4, /* branch type */ spec:2, /* branch speculation info */ new_type:4, /* additional branch type */ - reserved:34; + priv:3, /* privilege level */ + reserved:31; }; union perf_sample_weight { -- cgit v1.2.3-59-g8ed1b From f4054e522531038354bea5c924f286fdd8ae77b5 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 24 Aug 2022 10:18:18 +0530 Subject: perf: Add PERF_BR_NEW_ARCH_[N] map for BRBE on arm64 platform BRBE captured branch types will overflow perf_branch_entry.type and generic branch types in perf_branch_entry.new_type. So override each available arch specific branch type in the following manner to comprehensively process all reported branch types in BRBE. PERF_BR_ARM64_FIQ PERF_BR_NEW_ARCH_1 PERF_BR_ARM64_DEBUG_HALT PERF_BR_NEW_ARCH_2 PERF_BR_ARM64_DEBUG_EXIT PERF_BR_NEW_ARCH_3 PERF_BR_ARM64_DEBUG_INST PERF_BR_NEW_ARCH_4 PERF_BR_ARM64_DEBUG_DATA PERF_BR_NEW_ARCH_5 Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: James Clark Link: https://lkml.kernel.org/r/20220824044822.70230-5-anshuman.khandual@arm.com --- include/uapi/linux/perf_event.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 1a258d45a3fa..dca16582885f 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -293,6 +293,12 @@ enum { PERF_BR_PRIV_HV = 3, }; +#define PERF_BR_ARM64_FIQ PERF_BR_NEW_ARCH_1 +#define PERF_BR_ARM64_DEBUG_HALT PERF_BR_NEW_ARCH_2 +#define PERF_BR_ARM64_DEBUG_EXIT PERF_BR_NEW_ARCH_3 +#define PERF_BR_ARM64_DEBUG_INST PERF_BR_NEW_ARCH_4 +#define PERF_BR_ARM64_DEBUG_DATA PERF_BR_NEW_ARCH_5 + #define PERF_SAMPLE_BRANCH_PLM_ALL \ (PERF_SAMPLE_BRANCH_USER|\ PERF_SAMPLE_BRANCH_KERNEL|\ -- cgit v1.2.3-59-g8ed1b From 724c299c6a0e412b5679d7ebb9b3f4e00bd2aa78 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:06 +0200 Subject: perf/hw_breakpoint: Add KUnit test for constraints accounting Add KUnit test for hw_breakpoint constraints accounting, with various interesting mixes of breakpoint targets (some care was taken to catch interesting corner cases via bug-injection). 
The test cannot be built as a module because it requires access to hw_breakpoint_slots(), which is not inlinable or exported on all architectures. Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-2-elver@google.com --- kernel/events/Makefile | 1 + kernel/events/hw_breakpoint_test.c | 323 +++++++++++++++++++++++++++++++++++++ lib/Kconfig.debug | 10 ++ 3 files changed, 334 insertions(+) create mode 100644 kernel/events/hw_breakpoint_test.c diff --git a/kernel/events/Makefile b/kernel/events/Makefile index 8591c180b52b..91a62f566743 100644 --- a/kernel/events/Makefile +++ b/kernel/events/Makefile @@ -2,4 +2,5 @@ obj-y := core.o ring_buffer.o callchain.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o +obj-$(CONFIG_HW_BREAKPOINT_KUNIT_TEST) += hw_breakpoint_test.o obj-$(CONFIG_UPROBES) += uprobes.o diff --git a/kernel/events/hw_breakpoint_test.c b/kernel/events/hw_breakpoint_test.c new file mode 100644 index 000000000000..433c5c45e2a5 --- /dev/null +++ b/kernel/events/hw_breakpoint_test.c @@ -0,0 +1,323 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnit test for hw_breakpoint constraints accounting logic. + * + * Copyright (C) 2022, Google LLC. + */ + +#include +#include +#include +#include +#include +#include + +#define TEST_REQUIRES_BP_SLOTS(test, slots) \ + do { \ + if ((slots) > get_test_bp_slots()) { \ + kunit_skip((test), "Requires breakpoint slots: %d > %d", slots, \ + get_test_bp_slots()); \ + } \ + } while (0) + +#define TEST_EXPECT_NOSPC(expr) KUNIT_EXPECT_EQ(test, -ENOSPC, PTR_ERR(expr)) + +#define MAX_TEST_BREAKPOINTS 512 + +static char break_vars[MAX_TEST_BREAKPOINTS]; +static struct perf_event *test_bps[MAX_TEST_BREAKPOINTS]; +static struct task_struct *__other_task; + +static struct perf_event *register_test_bp(int cpu, struct task_struct *tsk, int idx) +{ + struct perf_event_attr attr = {}; + + if (WARN_ON(idx < 0 || idx >= MAX_TEST_BREAKPOINTS)) + return NULL; + + hw_breakpoint_init(&attr); + attr.bp_addr = (unsigned long)&break_vars[idx]; + attr.bp_len = HW_BREAKPOINT_LEN_1; + attr.bp_type = HW_BREAKPOINT_RW; + return perf_event_create_kernel_counter(&attr, cpu, tsk, NULL, NULL); +} + +static void unregister_test_bp(struct perf_event **bp) +{ + if (WARN_ON(IS_ERR(*bp))) + return; + if (WARN_ON(!*bp)) + return; + unregister_hw_breakpoint(*bp); + *bp = NULL; +} + +static int get_test_bp_slots(void) +{ + static int slots; + + if (!slots) + slots = hw_breakpoint_slots(TYPE_DATA); + + return slots; +} + +static void fill_one_bp_slot(struct kunit *test, int *id, int cpu, struct task_struct *tsk) +{ + struct perf_event *bp = register_test_bp(cpu, tsk, *id); + + KUNIT_ASSERT_NOT_NULL(test, bp); + KUNIT_ASSERT_FALSE(test, IS_ERR(bp)); + KUNIT_ASSERT_NULL(test, test_bps[*id]); + test_bps[(*id)++] = bp; +} + +/* + * Fills up the given @cpu/@tsk with breakpoints, only leaving @skip slots free. + * + * Returns true if this can be called again, continuing at @id. 
+ */ +static bool fill_bp_slots(struct kunit *test, int *id, int cpu, struct task_struct *tsk, int skip) +{ + for (int i = 0; i < get_test_bp_slots() - skip; ++i) + fill_one_bp_slot(test, id, cpu, tsk); + + return *id + get_test_bp_slots() <= MAX_TEST_BREAKPOINTS; +} + +static int dummy_kthread(void *arg) +{ + return 0; +} + +static struct task_struct *get_other_task(struct kunit *test) +{ + struct task_struct *tsk; + + if (__other_task) + return __other_task; + + tsk = kthread_create(dummy_kthread, NULL, "hw_breakpoint_dummy_task"); + KUNIT_ASSERT_FALSE(test, IS_ERR(tsk)); + __other_task = tsk; + return __other_task; +} + +static int get_test_cpu(int num) +{ + int cpu; + + WARN_ON(num < 0); + + for_each_online_cpu(cpu) { + if (num-- <= 0) + break; + } + + return cpu; +} + +/* ===== Test cases ===== */ + +static void test_one_cpu(struct kunit *test) +{ + int idx = 0; + + fill_bp_slots(test, &idx, get_test_cpu(0), NULL, 0); + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); +} + +static void test_many_cpus(struct kunit *test) +{ + int idx = 0; + int cpu; + + /* Test that CPUs are independent. */ + for_each_online_cpu(cpu) { + bool do_continue = fill_bp_slots(test, &idx, cpu, NULL, 0); + + TEST_EXPECT_NOSPC(register_test_bp(cpu, NULL, idx)); + if (!do_continue) + break; + } +} + +static void test_one_task_on_all_cpus(struct kunit *test) +{ + int idx = 0; + + fill_bp_slots(test, &idx, -1, current, 0); + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + /* Remove one and adding back CPU-target should work. */ + unregister_test_bp(&test_bps[0]); + fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL); +} + +static void test_two_tasks_on_all_cpus(struct kunit *test) +{ + int idx = 0; + + /* Test that tasks are independent. */ + fill_bp_slots(test, &idx, -1, current, 0); + fill_bp_slots(test, &idx, -1, get_other_task(test), 0); + + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + /* Remove one from first task and adding back CPU-target should not work. */ + unregister_test_bp(&test_bps[0]); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); +} + +static void test_one_task_on_one_cpu(struct kunit *test) +{ + int idx = 0; + + fill_bp_slots(test, &idx, get_test_cpu(0), current, 0); + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + /* + * Remove one and adding back CPU-target should work; this case is + * special vs. above because the task's constraints are CPU-dependent. 
+ */ + unregister_test_bp(&test_bps[0]); + fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL); +} + +static void test_one_task_mixed(struct kunit *test) +{ + int idx = 0; + + TEST_REQUIRES_BP_SLOTS(test, 3); + + fill_one_bp_slot(test, &idx, get_test_cpu(0), current); + fill_bp_slots(test, &idx, -1, current, 1); + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + + /* Transition from CPU-dependent pinned count to CPU-independent. */ + unregister_test_bp(&test_bps[0]); + unregister_test_bp(&test_bps[1]); + fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL); + fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); +} + +static void test_two_tasks_on_one_cpu(struct kunit *test) +{ + int idx = 0; + + fill_bp_slots(test, &idx, get_test_cpu(0), current, 0); + fill_bp_slots(test, &idx, get_test_cpu(0), get_other_task(test), 0); + + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + /* Can still create breakpoints on some other CPU. */ + fill_bp_slots(test, &idx, get_test_cpu(1), NULL, 0); +} + +static void test_two_tasks_on_one_all_cpus(struct kunit *test) +{ + int idx = 0; + + fill_bp_slots(test, &idx, get_test_cpu(0), current, 0); + fill_bp_slots(test, &idx, -1, get_other_task(test), 0); + + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + /* Cannot create breakpoints on some other CPU either. */ + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx)); +} + +static void test_task_on_all_and_one_cpu(struct kunit *test) +{ + int tsk_on_cpu_idx, cpu_idx; + int idx = 0; + + TEST_REQUIRES_BP_SLOTS(test, 3); + + fill_bp_slots(test, &idx, -1, current, 2); + /* Transitioning from only all CPU breakpoints to mixed. */ + tsk_on_cpu_idx = idx; + fill_one_bp_slot(test, &idx, get_test_cpu(0), current); + fill_one_bp_slot(test, &idx, -1, current); + + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + + /* We should still be able to use up another CPU's slots. */ + cpu_idx = idx; + fill_one_bp_slot(test, &idx, get_test_cpu(1), NULL); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx)); + + /* Transitioning back to task target on all CPUs. */ + unregister_test_bp(&test_bps[tsk_on_cpu_idx]); + /* Still have a CPU target breakpoint in get_test_cpu(1). */ + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + /* Remove it and try again. 
*/ + unregister_test_bp(&test_bps[cpu_idx]); + fill_one_bp_slot(test, &idx, -1, current); + + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx)); +} + +static struct kunit_case hw_breakpoint_test_cases[] = { + KUNIT_CASE(test_one_cpu), + KUNIT_CASE(test_many_cpus), + KUNIT_CASE(test_one_task_on_all_cpus), + KUNIT_CASE(test_two_tasks_on_all_cpus), + KUNIT_CASE(test_one_task_on_one_cpu), + KUNIT_CASE(test_one_task_mixed), + KUNIT_CASE(test_two_tasks_on_one_cpu), + KUNIT_CASE(test_two_tasks_on_one_all_cpus), + KUNIT_CASE(test_task_on_all_and_one_cpu), + {}, +}; + +static int test_init(struct kunit *test) +{ + /* Most test cases want 2 distinct CPUs. */ + return num_online_cpus() < 2 ? -EINVAL : 0; +} + +static void test_exit(struct kunit *test) +{ + for (int i = 0; i < MAX_TEST_BREAKPOINTS; ++i) { + if (test_bps[i]) + unregister_test_bp(&test_bps[i]); + } + + if (__other_task) { + kthread_stop(__other_task); + __other_task = NULL; + } +} + +static struct kunit_suite hw_breakpoint_test_suite = { + .name = "hw_breakpoint", + .test_cases = hw_breakpoint_test_cases, + .init = test_init, + .exit = test_exit, +}; + +kunit_test_suites(&hw_breakpoint_test_suite); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Marco Elver "); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 072e4b289c13..863b51751566 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2530,6 +2530,16 @@ config STACKINIT_KUNIT_TEST CONFIG_GCC_PLUGIN_STRUCTLEAK, CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF, or CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL. +config HW_BREAKPOINT_KUNIT_TEST + bool "Test hw_breakpoint constraints accounting" if !KUNIT_ALL_TESTS + depends on HAVE_HW_BREAKPOINT + depends on KUNIT=y + default KUNIT_ALL_TESTS + help + Tests for hw_breakpoint constraints accounting. + + If unsure, say N. + config TEST_UDELAY tristate "udelay test driver" help -- cgit v1.2.3-59-g8ed1b From c5b81449f915a28bb9c7725e53aebab3ba39b4a2 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:07 +0200 Subject: perf/hw_breakpoint: Provide hw_breakpoint_is_used() and use in test Provide hw_breakpoint_is_used() to check if breakpoints are in use on the system. Use it in the KUnit test to verify the global state before and after a test case. 
Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-3-elver@google.com --- include/linux/hw_breakpoint.h | 3 +++ kernel/events/hw_breakpoint.c | 29 +++++++++++++++++++++++++++++ kernel/events/hw_breakpoint_test.c | 12 +++++++++++- 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index 78dd7035d1e5..a3fb846705eb 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -74,6 +74,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, extern int register_perf_hw_breakpoint(struct perf_event *bp); extern void unregister_hw_breakpoint(struct perf_event *bp); extern void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events); +extern bool hw_breakpoint_is_used(void); extern int dbg_reserve_bp_slot(struct perf_event *bp); extern int dbg_release_bp_slot(struct perf_event *bp); @@ -121,6 +122,8 @@ register_perf_hw_breakpoint(struct perf_event *bp) { return -ENOSYS; } static inline void unregister_hw_breakpoint(struct perf_event *bp) { } static inline void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) { } +static inline bool hw_breakpoint_is_used(void) { return false; } + static inline int reserve_bp_slot(struct perf_event *bp) {return -ENOSYS; } static inline void release_bp_slot(struct perf_event *bp) { } diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index f32320ac02fd..fd5cd1f9e7fc 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -604,6 +604,35 @@ void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) } EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint); +/** + * hw_breakpoint_is_used - check if breakpoints are currently used + * + * Returns: true if breakpoints are used, false otherwise. + */ +bool hw_breakpoint_is_used(void) +{ + int cpu; + + if (!constraints_initialized) + return false; + + for_each_possible_cpu(cpu) { + for (int type = 0; type < TYPE_MAX; ++type) { + struct bp_cpuinfo *info = get_bp_info(cpu, type); + + if (info->cpu_pinned) + return true; + + for (int slot = 0; slot < nr_slots[type]; ++slot) { + if (info->tsk_pinned[slot]) + return true; + } + } + } + + return false; +} + static struct notifier_block hw_breakpoint_exceptions_nb = { .notifier_call = hw_breakpoint_exceptions_notify, /* we need to be notified first */ diff --git a/kernel/events/hw_breakpoint_test.c b/kernel/events/hw_breakpoint_test.c index 433c5c45e2a5..5ced822df788 100644 --- a/kernel/events/hw_breakpoint_test.c +++ b/kernel/events/hw_breakpoint_test.c @@ -294,7 +294,14 @@ static struct kunit_case hw_breakpoint_test_cases[] = { static int test_init(struct kunit *test) { /* Most test cases want 2 distinct CPUs. */ - return num_online_cpus() < 2 ? -EINVAL : 0; + if (num_online_cpus() < 2) + return -EINVAL; + + /* Want the system to not use breakpoints elsewhere. */ + if (hw_breakpoint_is_used()) + return -EBUSY; + + return 0; } static void test_exit(struct kunit *test) @@ -308,6 +315,9 @@ static void test_exit(struct kunit *test) kthread_stop(__other_task); __other_task = NULL; } + + /* Verify that internal state agrees that no breakpoints are in use. 
*/ + KUNIT_EXPECT_FALSE(test, hw_breakpoint_is_used()); } static struct kunit_suite hw_breakpoint_test_suite = { -- cgit v1.2.3-59-g8ed1b From 089cdcb0cd1c25343fa56d3eabbe878df31a7c0e Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:08 +0200 Subject: perf/hw_breakpoint: Clean up headers Clean up headers: - Remove unused - Remove unused - Remove unused - Remove unused - Add for EXPORT_SYMBOL_GPL(). - Add for mutex. - Sort alphabetically. - Move to top to test it compiles on its own. Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-4-elver@google.com --- kernel/events/hw_breakpoint.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index fd5cd1f9e7fc..6076c6346291 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -17,23 +17,22 @@ * This file contains the arch-independent routines. */ +#include + +#include +#include +#include +#include #include -#include -#include -#include #include #include -#include +#include +#include +#include #include #include -#include #include -#include -#include -#include -#include -#include /* * Constraints data */ -- cgit v1.2.3-59-g8ed1b From 0370dc314df35579b751d1b77c9169f071444962 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:09 +0200 Subject: perf/hw_breakpoint: Optimize list of per-task breakpoints On a machine with 256 CPUs, running the recently added perf breakpoint benchmark results in: | $> perf bench -r 30 breakpoint thread -b 4 -p 64 -t 64 | # Running 'breakpoint/thread' benchmark: | # Created/joined 30 threads with 4 breakpoints and 64 parallelism | Total time: 236.418 [sec] | | 123134.794271 usecs/op | 7880626.833333 usecs/op/cpu The benchmark tests inherited breakpoint perf events across many threads. Looking at a perf profile, we can see that the majority of the time is spent in various hw_breakpoint.c functions, which execute within the 'nr_bp_mutex' critical sections which then results in contention on that mutex as well: 37.27% [kernel] [k] osq_lock 34.92% [kernel] [k] mutex_spin_on_owner 12.15% [kernel] [k] toggle_bp_slot 11.90% [kernel] [k] __reserve_bp_slot The culprit here is task_bp_pinned(), which has a runtime complexity of O(#tasks) due to storing all task breakpoints in the same list and iterating through that list looking for a matching task. Clearly, this does not scale to thousands of tasks. Instead, make use of the "rhashtable" variant "rhltable" which stores multiple items with the same key in a list. This results in average runtime complexity of O(1) for task_bp_pinned(). With the optimization, the benchmark shows: | $> perf bench -r 30 breakpoint thread -b 4 -p 64 -t 64 | # Running 'breakpoint/thread' benchmark: | # Created/joined 30 threads with 4 breakpoints and 64 parallelism | Total time: 0.208 [sec] | | 108.422396 usecs/op | 6939.033333 usecs/op/cpu On this particular setup that's a speedup of ~1135x. While one option would be to make task_struct a breakpoint list node, this would only further bloat task_struct for infrequently used data. Furthermore, after all optimizations in this series, there's no evidence it would result in better performance: later optimizations make the time spent looking up entries in the hash table negligible (we'll reach the theoretical ideal performance i.e. no constraints). 
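The rhltable pattern being switched to can be sketched as below. The struct and function names here are illustrative; the real keying and iteration are in the diff that follows.

	#include <linux/rhashtable.h>
	#include <linux/sched.h>

	struct task_bp {
		struct task_struct *target;	/* key: many entries may share it */
		struct rhlist_head node;
	};

	/* bp_ht must have been set up with rhltable_init(&bp_ht, &bp_ht_params). */
	static struct rhltable bp_ht;
	static const struct rhashtable_params bp_ht_params = {
		.head_offset	= offsetof(struct task_bp, node),
		.key_offset	= offsetof(struct task_bp, target),
		.key_len	= sizeof_field(struct task_bp, target),
		.automatic_shrinking = true,
	};

	/* O(#entries for @tsk) instead of O(#all tracked breakpoints). */
	static int count_bps_for_task(struct task_struct *tsk)
	{
		struct rhlist_head *head, *pos;
		struct task_bp *iter;
		int count = 0;

		rcu_read_lock();
		head = rhltable_lookup(&bp_ht, &tsk, bp_ht_params);
		if (head) {
			rhl_for_each_entry_rcu(iter, pos, head, node)
				count++;
		}
		rcu_read_unlock();

		return count;
	}
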
Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-5-elver@google.com --- include/linux/perf_event.h | 3 ++- kernel/events/hw_breakpoint.c | 56 +++++++++++++++++++++++++++---------------- 2 files changed, 37 insertions(+), 22 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ae30c61957d2..1999408a9cbb 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -36,6 +36,7 @@ struct perf_guest_info_callbacks { }; #ifdef CONFIG_HAVE_HW_BREAKPOINT +#include #include #endif @@ -178,7 +179,7 @@ struct hw_perf_event { * creation and event initalization. */ struct arch_hw_breakpoint info; - struct list_head bp_list; + struct rhlist_head bp_list; }; #endif struct { /* amd_iommu */ diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 6076c6346291..6d09edc80d19 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -26,10 +26,10 @@ #include #include #include -#include #include #include #include +#include #include #include @@ -54,7 +54,13 @@ static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type) } /* Keep track of the breakpoints attached to tasks */ -static LIST_HEAD(bp_task_head); +static struct rhltable task_bps_ht; +static const struct rhashtable_params task_bps_ht_params = { + .head_offset = offsetof(struct hw_perf_event, bp_list), + .key_offset = offsetof(struct hw_perf_event, target), + .key_len = sizeof_field(struct hw_perf_event, target), + .automatic_shrinking = true, +}; static int constraints_initialized; @@ -103,17 +109,23 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) */ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) { - struct task_struct *tsk = bp->hw.target; + struct rhlist_head *head, *pos; struct perf_event *iter; int count = 0; - list_for_each_entry(iter, &bp_task_head, hw.bp_list) { - if (iter->hw.target == tsk && - find_slot_idx(iter->attr.bp_type) == type && + rcu_read_lock(); + head = rhltable_lookup(&task_bps_ht, &bp->hw.target, task_bps_ht_params); + if (!head) + goto out; + + rhl_for_each_entry_rcu(iter, pos, head, hw.bp_list) { + if (find_slot_idx(iter->attr.bp_type) == type && (iter->cpu < 0 || cpu == iter->cpu)) count += hw_breakpoint_weight(iter); } +out: + rcu_read_unlock(); return count; } @@ -186,7 +198,7 @@ static void toggle_bp_task_slot(struct perf_event *bp, int cpu, /* * Add/remove the given breakpoint in our constraint table */ -static void +static int toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, int weight) { @@ -199,7 +211,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, /* Pinned counter cpu profiling */ if (!bp->hw.target) { get_bp_info(bp->cpu, type)->cpu_pinned += weight; - return; + return 0; } /* Pinned counter task profiling */ @@ -207,9 +219,9 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, toggle_bp_task_slot(bp, cpu, type, weight); if (enable) - list_add_tail(&bp->hw.bp_list, &bp_task_head); + return rhltable_insert(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params); else - list_del(&bp->hw.bp_list); + return rhltable_remove(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params); } __weak int arch_reserve_bp_slot(struct perf_event *bp) @@ -307,9 +319,7 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) if (ret) return ret; - toggle_bp_slot(bp, true, type, 
weight); - - return 0; + return toggle_bp_slot(bp, true, type, weight); } int reserve_bp_slot(struct perf_event *bp) @@ -334,7 +344,7 @@ static void __release_bp_slot(struct perf_event *bp, u64 bp_type) type = find_slot_idx(bp_type); weight = hw_breakpoint_weight(bp); - toggle_bp_slot(bp, false, type, weight); + WARN_ON(toggle_bp_slot(bp, false, type, weight)); } void release_bp_slot(struct perf_event *bp) @@ -707,7 +717,7 @@ static struct pmu perf_breakpoint = { int __init init_hw_breakpoint(void) { int cpu, err_cpu; - int i; + int i, ret; for (i = 0; i < TYPE_MAX; i++) nr_slots[i] = hw_breakpoint_slots(i); @@ -718,18 +728,24 @@ int __init init_hw_breakpoint(void) info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int), GFP_KERNEL); - if (!info->tsk_pinned) - goto err_alloc; + if (!info->tsk_pinned) { + ret = -ENOMEM; + goto err; + } } } + ret = rhltable_init(&task_bps_ht, &task_bps_ht_params); + if (ret) + goto err; + constraints_initialized = 1; perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT); return register_die_notifier(&hw_breakpoint_exceptions_nb); - err_alloc: +err: for_each_possible_cpu(err_cpu) { for (i = 0; i < TYPE_MAX; i++) kfree(get_bp_info(err_cpu, i)->tsk_pinned); @@ -737,7 +753,5 @@ int __init init_hw_breakpoint(void) break; } - return -ENOMEM; + return ret; } - - -- cgit v1.2.3-59-g8ed1b From db5f6f853194c5e02d8551425b5e86b7e0b81806 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:10 +0200 Subject: perf/hw_breakpoint: Mark data __ro_after_init Mark read-only data after initialization as __ro_after_init. While we are here, turn 'constraints_initialized' into a bool. Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-6-elver@google.com --- kernel/events/hw_breakpoint.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 6d09edc80d19..7df46b276452 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -46,7 +46,7 @@ struct bp_cpuinfo { }; static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]); -static int nr_slots[TYPE_MAX]; +static int nr_slots[TYPE_MAX] __ro_after_init; static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type) { @@ -62,7 +62,7 @@ static const struct rhashtable_params task_bps_ht_params = { .automatic_shrinking = true, }; -static int constraints_initialized; +static bool constraints_initialized __ro_after_init; /* Gather the number of total pinned and un-pinned bp in a cpuset */ struct bp_busy_slots { @@ -739,7 +739,7 @@ int __init init_hw_breakpoint(void) if (ret) goto err; - constraints_initialized = 1; + constraints_initialized = true; perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT); -- cgit v1.2.3-59-g8ed1b From be3f152568cc7f5f573d21d5f86a2c4f3cc047ab Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:11 +0200 Subject: perf/hw_breakpoint: Optimize constant number of breakpoint slots Optimize internal hw_breakpoint state if the architecture's number of breakpoint slots is constant. This avoids several kmalloc() calls and potentially unnecessary failures if the allocations fail, as well as subtly improves code generation and cache locality. The protocol is that if an architecture defines hw_breakpoint_slots via the preprocessor, it must be constant and the same for all types. 
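The protocol boils down to the following shape, condensed from the diff below rather than standalone code:

	/* Architecture with a fixed, type-independent number of slots: */
	#define hw_breakpoint_slots(type) (HBP_NUM)

	/* Generic code can then size per-CPU state at compile time: */
	#ifdef hw_breakpoint_slots
		unsigned int tsk_pinned[hw_breakpoint_slots(0)];
	#else
		unsigned int *tsk_pinned;	/* kcalloc()'d in init_breakpoint_slots() */
	#endif
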
Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-7-elver@google.com --- arch/sh/include/asm/hw_breakpoint.h | 5 +- arch/x86/include/asm/hw_breakpoint.h | 5 +- kernel/events/hw_breakpoint.c | 94 +++++++++++++++++++++++------------- 3 files changed, 63 insertions(+), 41 deletions(-) diff --git a/arch/sh/include/asm/hw_breakpoint.h b/arch/sh/include/asm/hw_breakpoint.h index 199d17b765f2..361a0f57bdeb 100644 --- a/arch/sh/include/asm/hw_breakpoint.h +++ b/arch/sh/include/asm/hw_breakpoint.h @@ -48,10 +48,7 @@ struct pmu; /* Maximum number of UBC channels */ #define HBP_NUM 2 -static inline int hw_breakpoint_slots(int type) -{ - return HBP_NUM; -} +#define hw_breakpoint_slots(type) (HBP_NUM) /* arch/sh/kernel/hw_breakpoint.c */ extern int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw); diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h index a1f0e90d0818..0bc931cd0698 100644 --- a/arch/x86/include/asm/hw_breakpoint.h +++ b/arch/x86/include/asm/hw_breakpoint.h @@ -44,10 +44,7 @@ struct arch_hw_breakpoint { /* Total number of available HW breakpoint registers */ #define HBP_NUM 4 -static inline int hw_breakpoint_slots(int type) -{ - return HBP_NUM; -} +#define hw_breakpoint_slots(type) (HBP_NUM) struct perf_event_attr; struct perf_event; diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 7df46b276452..9fb66d358d81 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -40,13 +40,16 @@ struct bp_cpuinfo { /* Number of pinned cpu breakpoints in a cpu */ unsigned int cpu_pinned; /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */ +#ifdef hw_breakpoint_slots + unsigned int tsk_pinned[hw_breakpoint_slots(0)]; +#else unsigned int *tsk_pinned; +#endif /* Number of non-pinned cpu/task breakpoints in a cpu */ unsigned int flexible; /* XXX: placeholder, see fetch_this_slot() */ }; static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]); -static int nr_slots[TYPE_MAX] __ro_after_init; static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type) { @@ -73,6 +76,54 @@ struct bp_busy_slots { /* Serialize accesses to the above constraints */ static DEFINE_MUTEX(nr_bp_mutex); +#ifdef hw_breakpoint_slots +/* + * Number of breakpoint slots is constant, and the same for all types. + */ +static_assert(hw_breakpoint_slots(TYPE_INST) == hw_breakpoint_slots(TYPE_DATA)); +static inline int hw_breakpoint_slots_cached(int type) { return hw_breakpoint_slots(type); } +static inline int init_breakpoint_slots(void) { return 0; } +#else +/* + * Dynamic number of breakpoint slots. 
+ */ +static int __nr_bp_slots[TYPE_MAX] __ro_after_init; + +static inline int hw_breakpoint_slots_cached(int type) +{ + return __nr_bp_slots[type]; +} + +static __init int init_breakpoint_slots(void) +{ + int i, cpu, err_cpu; + + for (i = 0; i < TYPE_MAX; i++) + __nr_bp_slots[i] = hw_breakpoint_slots(i); + + for_each_possible_cpu(cpu) { + for (i = 0; i < TYPE_MAX; i++) { + struct bp_cpuinfo *info = get_bp_info(cpu, i); + + info->tsk_pinned = kcalloc(__nr_bp_slots[i], sizeof(int), GFP_KERNEL); + if (!info->tsk_pinned) + goto err; + } + } + + return 0; +err: + for_each_possible_cpu(err_cpu) { + for (i = 0; i < TYPE_MAX; i++) + kfree(get_bp_info(err_cpu, i)->tsk_pinned); + if (err_cpu == cpu) + break; + } + + return -ENOMEM; +} +#endif + __weak int hw_breakpoint_weight(struct perf_event *bp) { return 1; @@ -95,7 +146,7 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; int i; - for (i = nr_slots[type] - 1; i >= 0; i--) { + for (i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) { if (tsk_pinned[i] > 0) return i + 1; } @@ -312,7 +363,7 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) fetch_this_slot(&slots, weight); /* Flexible counters need to keep at least one slot */ - if (slots.pinned + (!!slots.flexible) > nr_slots[type]) + if (slots.pinned + (!!slots.flexible) > hw_breakpoint_slots_cached(type)) return -ENOSPC; ret = arch_reserve_bp_slot(bp); @@ -632,7 +683,7 @@ bool hw_breakpoint_is_used(void) if (info->cpu_pinned) return true; - for (int slot = 0; slot < nr_slots[type]; ++slot) { + for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) { if (info->tsk_pinned[slot]) return true; } @@ -716,42 +767,19 @@ static struct pmu perf_breakpoint = { int __init init_hw_breakpoint(void) { - int cpu, err_cpu; - int i, ret; - - for (i = 0; i < TYPE_MAX; i++) - nr_slots[i] = hw_breakpoint_slots(i); - - for_each_possible_cpu(cpu) { - for (i = 0; i < TYPE_MAX; i++) { - struct bp_cpuinfo *info = get_bp_info(cpu, i); - - info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int), - GFP_KERNEL); - if (!info->tsk_pinned) { - ret = -ENOMEM; - goto err; - } - } - } + int ret; ret = rhltable_init(&task_bps_ht, &task_bps_ht_params); if (ret) - goto err; + return ret; + + ret = init_breakpoint_slots(); + if (ret) + return ret; constraints_initialized = true; perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT); return register_die_notifier(&hw_breakpoint_exceptions_nb); - -err: - for_each_possible_cpu(err_cpu) { - for (i = 0; i < TYPE_MAX; i++) - kfree(get_bp_info(err_cpu, i)->tsk_pinned); - if (err_cpu == cpu) - break; - } - - return ret; } -- cgit v1.2.3-59-g8ed1b From 9caf87be118f4639537404eeb67dd444a3716e9a Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:12 +0200 Subject: perf/hw_breakpoint: Make hw_breakpoint_weight() inlinable Due to being a __weak function, hw_breakpoint_weight() will cause the compiler to always emit a call to it. This generates unnecessarily bad code (register spills etc.) for no good reason; in fact it appears in profiles of `perf bench -r 100 breakpoint thread -b 4 -p 128 -t 512`: ... 0.70% [kernel] [k] hw_breakpoint_weight ... While a small percentage, no architecture defines its own hw_breakpoint_weight() nor are there users outside hw_breakpoint.c, which makes the fact it is currently __weak a poor choice. 
Change hw_breakpoint_weight()'s definition to follow a similar protocol to hw_breakpoint_slots(), such that if defines hw_breakpoint_weight(), we'll use it instead. The result is that it is inlined and no longer shows up in profiles. Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-8-elver@google.com --- include/linux/hw_breakpoint.h | 1 - kernel/events/hw_breakpoint.c | 4 +++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index a3fb846705eb..f319bd26b030 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -80,7 +80,6 @@ extern int dbg_reserve_bp_slot(struct perf_event *bp); extern int dbg_release_bp_slot(struct perf_event *bp); extern int reserve_bp_slot(struct perf_event *bp); extern void release_bp_slot(struct perf_event *bp); -int hw_breakpoint_weight(struct perf_event *bp); int arch_reserve_bp_slot(struct perf_event *bp); void arch_release_bp_slot(struct perf_event *bp); void arch_unregister_hw_breakpoint(struct perf_event *bp); diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 9fb66d358d81..9c9bf17666a5 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -124,10 +124,12 @@ err: } #endif -__weak int hw_breakpoint_weight(struct perf_event *bp) +#ifndef hw_breakpoint_weight +static inline int hw_breakpoint_weight(struct perf_event *bp) { return 1; } +#endif static inline enum bp_type_idx find_slot_idx(u64 bp_type) { -- cgit v1.2.3-59-g8ed1b From 24198ad373ad1e30b638aa147142dc21ab5757e7 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:13 +0200 Subject: perf/hw_breakpoint: Remove useless code related to flexible breakpoints Flexible breakpoints have never been implemented, with bp_cpuinfo::flexible always being 0. Unfortunately, they still occupy 4 bytes in each bp_cpuinfo and bp_busy_slots, as well as computing the max flexible count in fetch_bp_busy_slots(). This again causes suboptimal code generation, when we always know that `!!slots.flexible` will be 0. Just get rid of the flexible "placeholder" and remove all real code related to it. Make a note in the comment related to the constraints algorithm but don't remove them from the algorithm, so that if in future flexible breakpoints need supporting, it should be trivial to revive them (along with reverting this change). 
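Since bp_cpuinfo::flexible was always zero, the "+ (!!slots.flexible)" term in the old reservation check never fired, so the check collapses to a pure pinned-slot comparison. Restated in condensed form (this simply mirrors the __reserve_bp_slot() hunk below, with error handling elided):

	/* Reject the new breakpoint if it would exceed the per-type slot count. */
	max_pinned_slots = max_bp_pinned_slots(bp, type) + weight;
	if (max_pinned_slots > hw_breakpoint_slots_cached(type))
		return -ENOSPC;
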
Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-9-elver@google.com --- kernel/events/hw_breakpoint.c | 57 +++++++++++++------------------------------ 1 file changed, 17 insertions(+), 40 deletions(-) diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 9c9bf17666a5..8b40fca1a063 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -45,8 +45,6 @@ struct bp_cpuinfo { #else unsigned int *tsk_pinned; #endif - /* Number of non-pinned cpu/task breakpoints in a cpu */ - unsigned int flexible; /* XXX: placeholder, see fetch_this_slot() */ }; static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]); @@ -67,12 +65,6 @@ static const struct rhashtable_params task_bps_ht_params = { static bool constraints_initialized __ro_after_init; -/* Gather the number of total pinned and un-pinned bp in a cpuset */ -struct bp_busy_slots { - unsigned int pinned; - unsigned int flexible; -}; - /* Serialize accesses to the above constraints */ static DEFINE_MUTEX(nr_bp_mutex); @@ -190,14 +182,14 @@ static const struct cpumask *cpumask_of_bp(struct perf_event *bp) } /* - * Report the number of pinned/un-pinned breakpoints we have in - * a given cpu (cpu > -1) or in all of them (cpu = -1). + * Returns the max pinned breakpoint slots in a given + * CPU (cpu > -1) or across all of them (cpu = -1). */ -static void -fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, - enum bp_type_idx type) +static int +max_bp_pinned_slots(struct perf_event *bp, enum bp_type_idx type) { const struct cpumask *cpumask = cpumask_of_bp(bp); + int pinned_slots = 0; int cpu; for_each_cpu(cpu, cpumask) { @@ -210,24 +202,10 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, else nr += task_bp_pinned(cpu, bp, type); - if (nr > slots->pinned) - slots->pinned = nr; - - nr = info->flexible; - if (nr > slots->flexible) - slots->flexible = nr; + pinned_slots = max(nr, pinned_slots); } -} -/* - * For now, continue to consider flexible as pinned, until we can - * ensure no flexible event can ever be scheduled before a pinned event - * in a same cpu. - */ -static void -fetch_this_slot(struct bp_busy_slots *slots, int weight) -{ - slots->pinned += weight; + return pinned_slots; } /* @@ -298,7 +276,12 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp) } /* - * Constraints to check before allowing this new breakpoint counter: + * Constraints to check before allowing this new breakpoint counter. + * + * Note: Flexible breakpoints are currently unimplemented, but outlined in the + * below algorithm for completeness. The implementation treats flexible as + * pinned due to no guarantee that we currently always schedule flexible events + * before a pinned event in a same CPU. * * == Non-pinned counter == (Considered as pinned for now) * @@ -340,8 +323,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp) */ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) { - struct bp_busy_slots slots = {0}; enum bp_type_idx type; + int max_pinned_slots; int weight; int ret; @@ -357,15 +340,9 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) type = find_slot_idx(bp_type); weight = hw_breakpoint_weight(bp); - fetch_bp_busy_slots(&slots, bp, type); - /* - * Simulate the addition of this breakpoint to the constraints - * and see the result. 
- */ - fetch_this_slot(&slots, weight); - - /* Flexible counters need to keep at least one slot */ - if (slots.pinned + (!!slots.flexible) > hw_breakpoint_slots_cached(type)) + /* Check if this new breakpoint can be satisfied across all CPUs. */ + max_pinned_slots = max_bp_pinned_slots(bp, type) + weight; + if (max_pinned_slots > hw_breakpoint_slots_cached(type)) return -ENOSPC; ret = arch_reserve_bp_slot(bp); -- cgit v1.2.3-59-g8ed1b From f95e5a3d59011eec1257d0e76de1e1f8969d426f Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:14 +0200 Subject: powerpc/hw_breakpoint: Avoid relying on caller synchronization Internal data structures (cpu_bps, task_bps) of powerpc's hw_breakpoint implementation have relied on nr_bp_mutex serializing access to them. Before overhauling synchronization of kernel/events/hw_breakpoint.c, introduce 2 spinlocks to synchronize cpu_bps and task_bps respectively, thus avoiding reliance on callers synchronizing powerpc's hw_breakpoint. Reported-by: Dmitry Vyukov Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-10-elver@google.com --- arch/powerpc/kernel/hw_breakpoint.c | 53 ++++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index 2669f80b3a49..8db1a15d7acb 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -129,7 +130,14 @@ struct breakpoint { bool ptrace_bp; }; +/* + * While kernel/events/hw_breakpoint.c does its own synchronization, we cannot + * rely on it safely synchronizing internals here; however, we can rely on it + * not requesting more breakpoints than available. 
+ */ +static DEFINE_SPINLOCK(cpu_bps_lock); static DEFINE_PER_CPU(struct breakpoint *, cpu_bps[HBP_NUM_MAX]); +static DEFINE_SPINLOCK(task_bps_lock); static LIST_HEAD(task_bps); static struct breakpoint *alloc_breakpoint(struct perf_event *bp) @@ -174,7 +182,9 @@ static int task_bps_add(struct perf_event *bp) if (IS_ERR(tmp)) return PTR_ERR(tmp); + spin_lock(&task_bps_lock); list_add(&tmp->list, &task_bps); + spin_unlock(&task_bps_lock); return 0; } @@ -182,6 +192,7 @@ static void task_bps_remove(struct perf_event *bp) { struct list_head *pos, *q; + spin_lock(&task_bps_lock); list_for_each_safe(pos, q, &task_bps) { struct breakpoint *tmp = list_entry(pos, struct breakpoint, list); @@ -191,6 +202,7 @@ static void task_bps_remove(struct perf_event *bp) break; } } + spin_unlock(&task_bps_lock); } /* @@ -200,12 +212,17 @@ static void task_bps_remove(struct perf_event *bp) static bool all_task_bps_check(struct perf_event *bp) { struct breakpoint *tmp; + bool ret = false; + spin_lock(&task_bps_lock); list_for_each_entry(tmp, &task_bps, list) { - if (!can_co_exist(tmp, bp)) - return true; + if (!can_co_exist(tmp, bp)) { + ret = true; + break; + } } - return false; + spin_unlock(&task_bps_lock); + return ret; } /* @@ -215,13 +232,18 @@ static bool all_task_bps_check(struct perf_event *bp) static bool same_task_bps_check(struct perf_event *bp) { struct breakpoint *tmp; + bool ret = false; + spin_lock(&task_bps_lock); list_for_each_entry(tmp, &task_bps, list) { if (tmp->bp->hw.target == bp->hw.target && - !can_co_exist(tmp, bp)) - return true; + !can_co_exist(tmp, bp)) { + ret = true; + break; + } } - return false; + spin_unlock(&task_bps_lock); + return ret; } static int cpu_bps_add(struct perf_event *bp) @@ -234,6 +256,7 @@ static int cpu_bps_add(struct perf_event *bp) if (IS_ERR(tmp)) return PTR_ERR(tmp); + spin_lock(&cpu_bps_lock); cpu_bp = per_cpu_ptr(cpu_bps, bp->cpu); for (i = 0; i < nr_wp_slots(); i++) { if (!cpu_bp[i]) { @@ -241,6 +264,7 @@ static int cpu_bps_add(struct perf_event *bp) break; } } + spin_unlock(&cpu_bps_lock); return 0; } @@ -249,6 +273,7 @@ static void cpu_bps_remove(struct perf_event *bp) struct breakpoint **cpu_bp; int i = 0; + spin_lock(&cpu_bps_lock); cpu_bp = per_cpu_ptr(cpu_bps, bp->cpu); for (i = 0; i < nr_wp_slots(); i++) { if (!cpu_bp[i]) @@ -260,19 +285,25 @@ static void cpu_bps_remove(struct perf_event *bp) break; } } + spin_unlock(&cpu_bps_lock); } static bool cpu_bps_check(int cpu, struct perf_event *bp) { struct breakpoint **cpu_bp; + bool ret = false; int i; + spin_lock(&cpu_bps_lock); cpu_bp = per_cpu_ptr(cpu_bps, cpu); for (i = 0; i < nr_wp_slots(); i++) { - if (cpu_bp[i] && !can_co_exist(cpu_bp[i], bp)) - return true; + if (cpu_bp[i] && !can_co_exist(cpu_bp[i], bp)) { + ret = true; + break; + } } - return false; + spin_unlock(&cpu_bps_lock); + return ret; } static bool all_cpu_bps_check(struct perf_event *bp) @@ -286,10 +317,6 @@ static bool all_cpu_bps_check(struct perf_event *bp) return false; } -/* - * We don't use any locks to serialize accesses to cpu_bps or task_bps - * because are already inside nr_bp_mutex. - */ int arch_reserve_bp_slot(struct perf_event *bp) { int ret; -- cgit v1.2.3-59-g8ed1b From 01fe8a3f818e1074a9a95d624be4549ee7ea2b2b Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:15 +0200 Subject: locking/percpu-rwsem: Add percpu_is_write_locked() and percpu_is_read_locked() Implement simple accessors to probe percpu-rwsem's locked state: percpu_is_write_locked(), percpu_is_read_locked(). 
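As a usage sketch (illustrative only, not part of this patch; the hw_breakpoint conversion later in this series uses the accessors in the same spirit), a caller that must not block can probe whether a semaphore is currently held in either mode:

	#include <linux/percpu-rwsem.h>

	/* Hypothetical helper: true if 'sem' is currently read- or write-held. */
	static bool sem_is_held(struct percpu_rw_semaphore *sem)
	{
		return percpu_is_read_locked(sem) || percpu_is_write_locked(sem);
	}

Both accessors only report a snapshot; the result can be stale by the time the caller acts on it, so they are suited to debug-style checks rather than synchronization.
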
Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-11-elver@google.com --- include/linux/percpu-rwsem.h | 6 ++++++ kernel/locking/percpu-rwsem.c | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index 5fda40f97fe9..36b942b67b7d 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -121,9 +121,15 @@ static inline void percpu_up_read(struct percpu_rw_semaphore *sem) preempt_enable(); } +extern bool percpu_is_read_locked(struct percpu_rw_semaphore *); extern void percpu_down_write(struct percpu_rw_semaphore *); extern void percpu_up_write(struct percpu_rw_semaphore *); +static inline bool percpu_is_write_locked(struct percpu_rw_semaphore *sem) +{ + return atomic_read(&sem->block); +} + extern int __percpu_init_rwsem(struct percpu_rw_semaphore *, const char *, struct lock_class_key *); diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 5fe4c5495ba3..185bd1c906b0 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -192,6 +192,12 @@ EXPORT_SYMBOL_GPL(__percpu_down_read); __sum; \ }) +bool percpu_is_read_locked(struct percpu_rw_semaphore *sem) +{ + return per_cpu_sum(*sem->read_count) != 0 && !atomic_read(&sem->block); +} +EXPORT_SYMBOL_GPL(percpu_is_read_locked); + /* * Return true if the modular sum of the sem->read_count per-CPU variable is * zero. If this sum is zero, then it is stable due to the fact that if any -- cgit v1.2.3-59-g8ed1b From 0912037fec1136d4e4796a3481f4a4ee09a2c325 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:16 +0200 Subject: perf/hw_breakpoint: Reduce contention with large number of tasks While optimizing task_bp_pinned()'s runtime complexity to O(1) on average helps reduce time spent in the critical section, we still suffer due to serializing everything via 'nr_bp_mutex'. Indeed, a profile shows that now contention is the biggest issue: 95.93% [kernel] [k] osq_lock 0.70% [kernel] [k] mutex_spin_on_owner 0.22% [kernel] [k] smp_cfm_core_cond 0.18% [kernel] [k] task_bp_pinned 0.18% [kernel] [k] rhashtable_jhash2 0.15% [kernel] [k] queued_spin_lock_slowpath when running the breakpoint benchmark with (system with 256 CPUs): | $> perf bench -r 30 breakpoint thread -b 4 -p 64 -t 64 | # Running 'breakpoint/thread' benchmark: | # Created/joined 30 threads with 4 breakpoints and 64 parallelism | Total time: 0.207 [sec] | | 108.267188 usecs/op | 6929.100000 usecs/op/cpu The main concern for synchronizing the breakpoint constraints data is that a consistent snapshot of the per-CPU and per-task data is observed. The access pattern is as follows: 1. If the target is a task: the task's pinned breakpoints are counted, checked for space, and then appended to; only bp_cpuinfo::cpu_pinned is used to check for conflicts with CPU-only breakpoints; bp_cpuinfo::tsk_pinned are incremented/decremented, but otherwise unused. 2. If the target is a CPU: bp_cpuinfo::cpu_pinned are counted, along with bp_cpuinfo::tsk_pinned; after a successful check, cpu_pinned is incremented. No per-task breakpoints are checked. Since rhltable safely synchronizes insertions/deletions, we can allow concurrency as follows: 1. 
If the target is a task: independent tasks may update and check the constraints concurrently, but same-task target calls need to be serialized; since bp_cpuinfo::tsk_pinned is only updated, but not checked, these modifications can happen concurrently by switching tsk_pinned to atomic_t. 2. If the target is a CPU: access to the per-CPU constraints needs to be serialized with other CPU-target and task-target callers (to stabilize the bp_cpuinfo::tsk_pinned snapshot). We can allow the above concurrency by introducing a per-CPU constraints data reader-writer lock (bp_cpuinfo_sem), and per-task mutexes (reuses task_struct::perf_event_mutex): 1. If the target is a task: acquires perf_event_mutex, and acquires bp_cpuinfo_sem as a reader. The choice of percpu-rwsem minimizes contention in the presence of many read-lock but few write-lock acquisitions: we assume many orders of magnitude more task target breakpoints creations/destructions than CPU target breakpoints. 2. If the target is a CPU: acquires bp_cpuinfo_sem as a writer. With these changes, contention with thousands of tasks is reduced to the point where waiting on locking no longer dominates the profile: | $> perf bench -r 30 breakpoint thread -b 4 -p 64 -t 64 | # Running 'breakpoint/thread' benchmark: | # Created/joined 30 threads with 4 breakpoints and 64 parallelism | Total time: 0.077 [sec] | | 40.201563 usecs/op | 2572.900000 usecs/op/cpu 21.54% [kernel] [k] task_bp_pinned 20.18% [kernel] [k] rhashtable_jhash2 6.81% [kernel] [k] toggle_bp_slot 5.47% [kernel] [k] queued_spin_lock_slowpath 3.75% [kernel] [k] smp_cfm_core_cond 3.48% [kernel] [k] bcmp On this particular setup that's a speedup of 2.7x. We're also getting closer to the theoretical ideal performance through optimizations in hw_breakpoint.c -- constraints accounting disabled: | perf bench -r 30 breakpoint thread -b 4 -p 64 -t 64 | # Running 'breakpoint/thread' benchmark: | # Created/joined 30 threads with 4 breakpoints and 64 parallelism | Total time: 0.067 [sec] | | 35.286458 usecs/op | 2258.333333 usecs/op/cpu Which means the current implementation is ~12% slower than the theoretical ideal. For reference, performance without any breakpoints: | $> bench -r 30 breakpoint thread -b 0 -p 64 -t 64 | # Running 'breakpoint/thread' benchmark: | # Created/joined 30 threads with 0 breakpoints and 64 parallelism | Total time: 0.060 [sec] | | 31.365625 usecs/op | 2007.400000 usecs/op/cpu On a system with 256 CPUs, the theoretical ideal is only ~12% slower than no breakpoints at all; the current implementation is ~28% slower. 
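Condensed, the resulting lock selection reads as follows (a simplified sketch of the bp_constraints_lock() added below; the unlock path mirrors it in reverse):

	static struct mutex *bp_constraints_lock(struct perf_event *bp)
	{
		struct mutex *tsk_mtx = bp->hw.target ? &bp->hw.target->perf_event_mutex : NULL;

		if (tsk_mtx) {
			/* Task target: serialize same-task callers only... */
			mutex_lock(tsk_mtx);
			/* ...while permitting concurrent tsk_pinned updates. */
			percpu_down_read(&bp_cpuinfo_sem);
		} else {
			/* CPU target: stabilize all tsk_pinned snapshots. */
			percpu_down_write(&bp_cpuinfo_sem);
		}
		return tsk_mtx;
	}
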
Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-12-elver@google.com --- kernel/events/hw_breakpoint.c | 161 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 133 insertions(+), 28 deletions(-) diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 8b40fca1a063..229c6f4fae75 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -19,6 +19,7 @@ #include +#include #include #include #include @@ -28,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -41,9 +43,9 @@ struct bp_cpuinfo { unsigned int cpu_pinned; /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */ #ifdef hw_breakpoint_slots - unsigned int tsk_pinned[hw_breakpoint_slots(0)]; + atomic_t tsk_pinned[hw_breakpoint_slots(0)]; #else - unsigned int *tsk_pinned; + atomic_t *tsk_pinned; #endif }; @@ -65,8 +67,79 @@ static const struct rhashtable_params task_bps_ht_params = { static bool constraints_initialized __ro_after_init; -/* Serialize accesses to the above constraints */ -static DEFINE_MUTEX(nr_bp_mutex); +/* + * Synchronizes accesses to the per-CPU constraints; the locking rules are: + * + * 1. Atomic updates to bp_cpuinfo::tsk_pinned only require a held read-lock + * (due to bp_slots_histogram::count being atomic, no update are lost). + * + * 2. Holding a write-lock is required for computations that require a + * stable snapshot of all bp_cpuinfo::tsk_pinned. + * + * 3. In all other cases, non-atomic accesses require the appropriately held + * lock (read-lock for read-only accesses; write-lock for reads/writes). + */ +DEFINE_STATIC_PERCPU_RWSEM(bp_cpuinfo_sem); + +/* + * Return mutex to serialize accesses to per-task lists in task_bps_ht. Since + * rhltable synchronizes concurrent insertions/deletions, independent tasks may + * insert/delete concurrently; therefore, a mutex per task is sufficient. + * + * Uses task_struct::perf_event_mutex, to avoid extending task_struct with a + * hw_breakpoint-only mutex, which may be infrequently used. The caveat here is + * that hw_breakpoint may contend with per-task perf event list management. The + * assumption is that perf usecases involving hw_breakpoints are very unlikely + * to result in unnecessary contention. + */ +static inline struct mutex *get_task_bps_mutex(struct perf_event *bp) +{ + struct task_struct *tsk = bp->hw.target; + + return tsk ? &tsk->perf_event_mutex : NULL; +} + +static struct mutex *bp_constraints_lock(struct perf_event *bp) +{ + struct mutex *tsk_mtx = get_task_bps_mutex(bp); + + if (tsk_mtx) { + mutex_lock(tsk_mtx); + percpu_down_read(&bp_cpuinfo_sem); + } else { + percpu_down_write(&bp_cpuinfo_sem); + } + + return tsk_mtx; +} + +static void bp_constraints_unlock(struct mutex *tsk_mtx) +{ + if (tsk_mtx) { + percpu_up_read(&bp_cpuinfo_sem); + mutex_unlock(tsk_mtx); + } else { + percpu_up_write(&bp_cpuinfo_sem); + } +} + +static bool bp_constraints_is_locked(struct perf_event *bp) +{ + struct mutex *tsk_mtx = get_task_bps_mutex(bp); + + return percpu_is_write_locked(&bp_cpuinfo_sem) || + (tsk_mtx ? 
mutex_is_locked(tsk_mtx) : + percpu_is_read_locked(&bp_cpuinfo_sem)); +} + +static inline void assert_bp_constraints_lock_held(struct perf_event *bp) +{ + struct mutex *tsk_mtx = get_task_bps_mutex(bp); + + if (tsk_mtx) + lockdep_assert_held(tsk_mtx); + lockdep_assert_held(&bp_cpuinfo_sem); +} #ifdef hw_breakpoint_slots /* @@ -97,7 +170,7 @@ static __init int init_breakpoint_slots(void) for (i = 0; i < TYPE_MAX; i++) { struct bp_cpuinfo *info = get_bp_info(cpu, i); - info->tsk_pinned = kcalloc(__nr_bp_slots[i], sizeof(int), GFP_KERNEL); + info->tsk_pinned = kcalloc(__nr_bp_slots[i], sizeof(atomic_t), GFP_KERNEL); if (!info->tsk_pinned) goto err; } @@ -137,11 +210,19 @@ static inline enum bp_type_idx find_slot_idx(u64 bp_type) */ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) { - unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; + atomic_t *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; int i; + /* + * At this point we want to have acquired the bp_cpuinfo_sem as a + * writer to ensure that there are no concurrent writers in + * toggle_bp_task_slot() to tsk_pinned, and we get a stable snapshot. + */ + lockdep_assert_held_write(&bp_cpuinfo_sem); + for (i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) { - if (tsk_pinned[i] > 0) + ASSERT_EXCLUSIVE_WRITER(tsk_pinned[i]); /* Catch unexpected writers. */ + if (atomic_read(&tsk_pinned[i]) > 0) return i + 1; } @@ -158,6 +239,11 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) struct perf_event *iter; int count = 0; + /* + * We need a stable snapshot of the per-task breakpoint list. + */ + assert_bp_constraints_lock_held(bp); + rcu_read_lock(); head = rhltable_lookup(&task_bps_ht, &bp->hw.target, task_bps_ht_params); if (!head) @@ -214,16 +300,25 @@ max_bp_pinned_slots(struct perf_event *bp, enum bp_type_idx type) static void toggle_bp_task_slot(struct perf_event *bp, int cpu, enum bp_type_idx type, int weight) { - unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; + atomic_t *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; int old_idx, new_idx; + /* + * If bp->hw.target, tsk_pinned is only modified, but not used + * otherwise. We can permit concurrent updates as long as there are no + * other uses: having acquired bp_cpuinfo_sem as a reader allows + * concurrent updates here. Uses of tsk_pinned will require acquiring + * bp_cpuinfo_sem as a writer to stabilize tsk_pinned's value. + */ + lockdep_assert_held_read(&bp_cpuinfo_sem); + old_idx = task_bp_pinned(cpu, bp, type) - 1; new_idx = old_idx + weight; if (old_idx >= 0) - tsk_pinned[old_idx]--; + atomic_dec(&tsk_pinned[old_idx]); if (new_idx >= 0) - tsk_pinned[new_idx]++; + atomic_inc(&tsk_pinned[new_idx]); } /* @@ -241,6 +336,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, /* Pinned counter cpu profiling */ if (!bp->hw.target) { + lockdep_assert_held_write(&bp_cpuinfo_sem); get_bp_info(bp->cpu, type)->cpu_pinned += weight; return 0; } @@ -249,6 +345,11 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, for_each_cpu(cpu, cpumask) toggle_bp_task_slot(bp, cpu, type, weight); + /* + * Readers want a stable snapshot of the per-task breakpoint list. 
+ */ + assert_bp_constraints_lock_held(bp); + if (enable) return rhltable_insert(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params); else @@ -354,14 +455,10 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) int reserve_bp_slot(struct perf_event *bp) { - int ret; - - mutex_lock(&nr_bp_mutex); - - ret = __reserve_bp_slot(bp, bp->attr.bp_type); - - mutex_unlock(&nr_bp_mutex); + struct mutex *mtx = bp_constraints_lock(bp); + int ret = __reserve_bp_slot(bp, bp->attr.bp_type); + bp_constraints_unlock(mtx); return ret; } @@ -379,12 +476,11 @@ static void __release_bp_slot(struct perf_event *bp, u64 bp_type) void release_bp_slot(struct perf_event *bp) { - mutex_lock(&nr_bp_mutex); + struct mutex *mtx = bp_constraints_lock(bp); arch_unregister_hw_breakpoint(bp); __release_bp_slot(bp, bp->attr.bp_type); - - mutex_unlock(&nr_bp_mutex); + bp_constraints_unlock(mtx); } static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) @@ -411,11 +507,10 @@ static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) { - int ret; + struct mutex *mtx = bp_constraints_lock(bp); + int ret = __modify_bp_slot(bp, old_type, new_type); - mutex_lock(&nr_bp_mutex); - ret = __modify_bp_slot(bp, old_type, new_type); - mutex_unlock(&nr_bp_mutex); + bp_constraints_unlock(mtx); return ret; } @@ -426,18 +521,28 @@ static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) */ int dbg_reserve_bp_slot(struct perf_event *bp) { - if (mutex_is_locked(&nr_bp_mutex)) + int ret; + + if (bp_constraints_is_locked(bp)) return -1; - return __reserve_bp_slot(bp, bp->attr.bp_type); + /* Locks aren't held; disable lockdep assert checking. */ + lockdep_off(); + ret = __reserve_bp_slot(bp, bp->attr.bp_type); + lockdep_on(); + + return ret; } int dbg_release_bp_slot(struct perf_event *bp) { - if (mutex_is_locked(&nr_bp_mutex)) + if (bp_constraints_is_locked(bp)) return -1; + /* Locks aren't held; disable lockdep assert checking. */ + lockdep_off(); __release_bp_slot(bp, bp->attr.bp_type); + lockdep_on(); return 0; } @@ -663,7 +768,7 @@ bool hw_breakpoint_is_used(void) return true; for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) { - if (info->tsk_pinned[slot]) + if (atomic_read(&info->tsk_pinned[slot])) return true; } } -- cgit v1.2.3-59-g8ed1b From 16db2839a5a59c242df77308cf57342ce0c3768e Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:17 +0200 Subject: perf/hw_breakpoint: Introduce bp_slots_histogram Factor out the existing `atomic_t count[N]` into its own struct called 'bp_slots_histogram', to generalize and make its intent clearer in preparation of reusing elsewhere. The basic idea of bucketing "total uses of N slots" resembles a histogram, so calling it such seems most intuitive. No functional change. 
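To make the bucketing concrete (an informal example, not part of the patch): if three tasks pin one breakpoint each and a fourth task pins two, then count[0] == 3 and count[1] == 1, and the maximum number of slots used by any single task is recovered by scanning from the top index down, which is what bp_slots_histogram_max() below does:

	/* Sketch of the scan; the real helper also asserts a stable snapshot. */
	for (i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) {
		if (atomic_read(&hist->count[i]) > 0)
			return i + 1;	/* some task occupies i + 1 slots */
	}
	return 0;
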
Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-13-elver@google.com --- kernel/events/hw_breakpoint.c | 96 ++++++++++++++++++++++++++++--------------- 1 file changed, 63 insertions(+), 33 deletions(-) diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 229c6f4fae75..03ebecf048c0 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -36,19 +36,27 @@ #include /* - * Constraints data + * Datastructure to track the total uses of N slots across tasks or CPUs; + * bp_slots_histogram::count[N] is the number of assigned N+1 breakpoint slots. */ -struct bp_cpuinfo { - /* Number of pinned cpu breakpoints in a cpu */ - unsigned int cpu_pinned; - /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */ +struct bp_slots_histogram { #ifdef hw_breakpoint_slots - atomic_t tsk_pinned[hw_breakpoint_slots(0)]; + atomic_t count[hw_breakpoint_slots(0)]; #else - atomic_t *tsk_pinned; + atomic_t *count; #endif }; +/* + * Per-CPU constraints data. + */ +struct bp_cpuinfo { + /* Number of pinned CPU breakpoints in a CPU. */ + unsigned int cpu_pinned; + /* Histogram of pinned task breakpoints in a CPU. */ + struct bp_slots_histogram tsk_pinned; +}; + static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]); static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type) @@ -159,6 +167,18 @@ static inline int hw_breakpoint_slots_cached(int type) return __nr_bp_slots[type]; } +static __init bool +bp_slots_histogram_alloc(struct bp_slots_histogram *hist, enum bp_type_idx type) +{ + hist->count = kcalloc(hw_breakpoint_slots_cached(type), sizeof(*hist->count), GFP_KERNEL); + return hist->count; +} + +static __init void bp_slots_histogram_free(struct bp_slots_histogram *hist) +{ + kfree(hist->count); +} + static __init int init_breakpoint_slots(void) { int i, cpu, err_cpu; @@ -170,8 +190,7 @@ static __init int init_breakpoint_slots(void) for (i = 0; i < TYPE_MAX; i++) { struct bp_cpuinfo *info = get_bp_info(cpu, i); - info->tsk_pinned = kcalloc(__nr_bp_slots[i], sizeof(atomic_t), GFP_KERNEL); - if (!info->tsk_pinned) + if (!bp_slots_histogram_alloc(&info->tsk_pinned, i)) goto err; } } @@ -180,7 +199,7 @@ static __init int init_breakpoint_slots(void) err: for_each_possible_cpu(err_cpu) { for (i = 0; i < TYPE_MAX; i++) - kfree(get_bp_info(err_cpu, i)->tsk_pinned); + bp_slots_histogram_free(&get_bp_info(err_cpu, i)->tsk_pinned); if (err_cpu == cpu) break; } @@ -189,6 +208,34 @@ err: } #endif +static inline void +bp_slots_histogram_add(struct bp_slots_histogram *hist, int old, int val) +{ + const int old_idx = old - 1; + const int new_idx = old_idx + val; + + if (old_idx >= 0) + WARN_ON(atomic_dec_return_relaxed(&hist->count[old_idx]) < 0); + if (new_idx >= 0) + WARN_ON(atomic_inc_return_relaxed(&hist->count[new_idx]) < 0); +} + +static int +bp_slots_histogram_max(struct bp_slots_histogram *hist, enum bp_type_idx type) +{ + for (int i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) { + const int count = atomic_read(&hist->count[i]); + + /* Catch unexpected writers; we want a stable snapshot. 
*/ + ASSERT_EXCLUSIVE_WRITER(hist->count[i]); + if (count > 0) + return i + 1; + WARN(count < 0, "inconsistent breakpoint slots histogram"); + } + + return 0; +} + #ifndef hw_breakpoint_weight static inline int hw_breakpoint_weight(struct perf_event *bp) { @@ -205,13 +252,11 @@ static inline enum bp_type_idx find_slot_idx(u64 bp_type) } /* - * Report the maximum number of pinned breakpoints a task - * have in this cpu + * Return the maximum number of pinned breakpoints a task has in this CPU. */ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) { - atomic_t *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; - int i; + struct bp_slots_histogram *tsk_pinned = &get_bp_info(cpu, type)->tsk_pinned; /* * At this point we want to have acquired the bp_cpuinfo_sem as a @@ -219,14 +264,7 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) * toggle_bp_task_slot() to tsk_pinned, and we get a stable snapshot. */ lockdep_assert_held_write(&bp_cpuinfo_sem); - - for (i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) { - ASSERT_EXCLUSIVE_WRITER(tsk_pinned[i]); /* Catch unexpected writers. */ - if (atomic_read(&tsk_pinned[i]) > 0) - return i + 1; - } - - return 0; + return bp_slots_histogram_max(tsk_pinned, type); } /* @@ -300,8 +338,7 @@ max_bp_pinned_slots(struct perf_event *bp, enum bp_type_idx type) static void toggle_bp_task_slot(struct perf_event *bp, int cpu, enum bp_type_idx type, int weight) { - atomic_t *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; - int old_idx, new_idx; + struct bp_slots_histogram *tsk_pinned = &get_bp_info(cpu, type)->tsk_pinned; /* * If bp->hw.target, tsk_pinned is only modified, but not used @@ -311,14 +348,7 @@ static void toggle_bp_task_slot(struct perf_event *bp, int cpu, * bp_cpuinfo_sem as a writer to stabilize tsk_pinned's value. */ lockdep_assert_held_read(&bp_cpuinfo_sem); - - old_idx = task_bp_pinned(cpu, bp, type) - 1; - new_idx = old_idx + weight; - - if (old_idx >= 0) - atomic_dec(&tsk_pinned[old_idx]); - if (new_idx >= 0) - atomic_inc(&tsk_pinned[new_idx]); + bp_slots_histogram_add(tsk_pinned, task_bp_pinned(cpu, bp, type), weight); } /* @@ -768,7 +798,7 @@ bool hw_breakpoint_is_used(void) return true; for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) { - if (atomic_read(&info->tsk_pinned[slot])) + if (atomic_read(&info->tsk_pinned.count[slot])) return true; } } -- cgit v1.2.3-59-g8ed1b From 9b1933b864a10e7f66b06d10c39217142baed28b Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:18 +0200 Subject: perf/hw_breakpoint: Optimize max_bp_pinned_slots() for CPU-independent task targets Running the perf benchmark with (note: more aggressive parameters vs. preceding changes, but same 256 CPUs host): | $> perf bench -r 100 breakpoint thread -b 4 -p 128 -t 512 | # Running 'breakpoint/thread' benchmark: | # Created/joined 100 threads with 4 breakpoints and 128 parallelism | Total time: 1.989 [sec] | | 38.854160 usecs/op | 4973.332500 usecs/op/cpu 20.43% [kernel] [k] queued_spin_lock_slowpath 18.75% [kernel] [k] osq_lock 16.98% [kernel] [k] rhashtable_jhash2 8.34% [kernel] [k] task_bp_pinned 4.23% [kernel] [k] smp_cfm_core_cond 3.65% [kernel] [k] bcmp 2.83% [kernel] [k] toggle_bp_slot 1.87% [kernel] [k] find_next_bit 1.49% [kernel] [k] __reserve_bp_slot We can see that a majority of the time is now spent hashing task pointers to index into task_bps_ht in task_bp_pinned(). 
Obtaining the max_bp_pinned_slots() for CPU-independent task targets currently is O(#cpus), and calls task_bp_pinned() for each CPU, even if the result of task_bp_pinned() is CPU-independent. The loop in max_bp_pinned_slots() wants to compute the maximum slots across all CPUs. If task_bp_pinned() is CPU-independent, we can do so by obtaining the max slots across all CPUs and adding task_bp_pinned(). To do so in O(1), use a bp_slots_histogram for CPU-pinned slots. After this optimization: | $> perf bench -r 100 breakpoint thread -b 4 -p 128 -t 512 | # Running 'breakpoint/thread' benchmark: | # Created/joined 100 threads with 4 breakpoints and 128 parallelism | Total time: 1.930 [sec] | | 37.697832 usecs/op | 4825.322500 usecs/op/cpu 19.13% [kernel] [k] queued_spin_lock_slowpath 18.21% [kernel] [k] rhashtable_jhash2 15.46% [kernel] [k] osq_lock 6.27% [kernel] [k] toggle_bp_slot 5.91% [kernel] [k] task_bp_pinned 5.05% [kernel] [k] smp_cfm_core_cond 1.78% [kernel] [k] update_sg_lb_stats 1.36% [kernel] [k] llist_reverse_order 1.34% [kernel] [k] find_next_bit 1.19% [kernel] [k] bcmp Suggesting that time spent in task_bp_pinned() has been reduced. However, we're still hashing too much, which will be addressed in the subsequent change. Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-14-elver@google.com --- kernel/events/hw_breakpoint.c | 57 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 53 insertions(+), 4 deletions(-) diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 03ebecf048c0..a489f31fe147 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -64,6 +64,9 @@ static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type) return per_cpu_ptr(bp_cpuinfo + type, cpu); } +/* Number of pinned CPU breakpoints globally. */ +static struct bp_slots_histogram cpu_pinned[TYPE_MAX]; + /* Keep track of the breakpoints attached to tasks */ static struct rhltable task_bps_ht; static const struct rhashtable_params task_bps_ht_params = { @@ -194,6 +197,10 @@ static __init int init_breakpoint_slots(void) goto err; } } + for (i = 0; i < TYPE_MAX; i++) { + if (!bp_slots_histogram_alloc(&cpu_pinned[i], i)) + goto err; + } return 0; err: @@ -203,6 +210,8 @@ err: if (err_cpu == cpu) break; } + for (i = 0; i < TYPE_MAX; i++) + bp_slots_histogram_free(&cpu_pinned[i]); return -ENOMEM; } @@ -270,6 +279,9 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) /* * Count the number of breakpoints of the same type and same task. * The given event must be not on the list. + * + * If @cpu is -1, but the result of task_bp_pinned() is not CPU-independent, + * returns a negative value. 
*/ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) { @@ -288,9 +300,18 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) goto out; rhl_for_each_entry_rcu(iter, pos, head, hw.bp_list) { - if (find_slot_idx(iter->attr.bp_type) == type && - (iter->cpu < 0 || cpu == iter->cpu)) - count += hw_breakpoint_weight(iter); + if (find_slot_idx(iter->attr.bp_type) != type) + continue; + + if (iter->cpu >= 0) { + if (cpu == -1) { + count = -1; + goto out; + } else if (cpu != iter->cpu) + continue; + } + + count += hw_breakpoint_weight(iter); } out: @@ -316,6 +337,19 @@ max_bp_pinned_slots(struct perf_event *bp, enum bp_type_idx type) int pinned_slots = 0; int cpu; + if (bp->hw.target && bp->cpu < 0) { + int max_pinned = task_bp_pinned(-1, bp, type); + + if (max_pinned >= 0) { + /* + * Fast path: task_bp_pinned() is CPU-independent and + * returns the same value for any CPU. + */ + max_pinned += bp_slots_histogram_max(&cpu_pinned[type], type); + return max_pinned; + } + } + for_each_cpu(cpu, cpumask) { struct bp_cpuinfo *info = get_bp_info(cpu, type); int nr; @@ -366,8 +400,11 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, /* Pinned counter cpu profiling */ if (!bp->hw.target) { + struct bp_cpuinfo *info = get_bp_info(bp->cpu, type); + lockdep_assert_held_write(&bp_cpuinfo_sem); - get_bp_info(bp->cpu, type)->cpu_pinned += weight; + bp_slots_histogram_add(&cpu_pinned[type], info->cpu_pinned, weight); + info->cpu_pinned += weight; return 0; } @@ -804,6 +841,18 @@ bool hw_breakpoint_is_used(void) } } + for (int type = 0; type < TYPE_MAX; ++type) { + for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) { + /* + * Warn, because if there are CPU pinned counters, + * should never get here; bp_cpuinfo::cpu_pinned should + * be consistent with the global cpu_pinned histogram. + */ + if (WARN_ON(atomic_read(&cpu_pinned[type].count[slot]))) + return true; + } + } + return false; } -- cgit v1.2.3-59-g8ed1b From ecdfb8896f2ad733097e6309d64f94db4cd1020c Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 29 Aug 2022 14:47:19 +0200 Subject: perf/hw_breakpoint: Optimize toggle_bp_slot() for CPU-independent task targets We can still see that a majority of the time is spent hashing task pointers: ... 16.98% [kernel] [k] rhashtable_jhash2 ... Doing the bookkeeping in toggle_bp_slots() is currently O(#cpus), calling task_bp_pinned() for each CPU, even if task_bp_pinned() is CPU-independent. The reason for this is to update the per-CPU 'tsk_pinned' histogram. To optimize the CPU-independent case to O(1), keep a separate CPU-independent 'tsk_pinned_all' histogram. The major source of complexity are transitions between "all CPU-independent task breakpoints" and "mixed CPU-independent and CPU-dependent task breakpoints". The code comments list all cases that require handling. After this optimization: | $> perf bench -r 100 breakpoint thread -b 4 -p 128 -t 512 | # Running 'breakpoint/thread' benchmark: | # Created/joined 100 threads with 4 breakpoints and 128 parallelism | Total time: 1.758 [sec] | | 34.336621 usecs/op | 4395.087500 usecs/op/cpu 38.08% [kernel] [k] queued_spin_lock_slowpath 10.81% [kernel] [k] smp_cfm_core_cond 3.01% [kernel] [k] update_sg_lb_stats 2.58% [kernel] [k] osq_lock 2.57% [kernel] [k] llist_reverse_order 1.45% [kernel] [k] find_next_bit 1.21% [kernel] [k] flush_tlb_func_common 1.01% [kernel] [k] arch_install_hw_breakpoint Showing that the time spent hashing keys has become insignificant. 
With the given benchmark parameters, that's an improvement of 12% compared with the old O(#cpus) version. And finally, using the less aggressive parameters from the preceding changes, we now observe: | $> perf bench -r 30 breakpoint thread -b 4 -p 64 -t 64 | # Running 'breakpoint/thread' benchmark: | # Created/joined 30 threads with 4 breakpoints and 64 parallelism | Total time: 0.067 [sec] | | 35.292187 usecs/op | 2258.700000 usecs/op/cpu Which is an improvement of 12% compared to without the histogram optimizations (baseline is 40 usecs/op). This is now on par with the theoretical ideal (constraints disabled), and only 12% slower than no breakpoints at all. Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Acked-by: Ian Rogers Link: https://lore.kernel.org/r/20220829124719.675715-15-elver@google.com --- kernel/events/hw_breakpoint.c | 155 +++++++++++++++++++++++++++++++++--------- 1 file changed, 124 insertions(+), 31 deletions(-) diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index a489f31fe147..7ef0e98d31e2 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -66,6 +66,8 @@ static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type) /* Number of pinned CPU breakpoints globally. */ static struct bp_slots_histogram cpu_pinned[TYPE_MAX]; +/* Number of pinned CPU-independent task breakpoints. */ +static struct bp_slots_histogram tsk_pinned_all[TYPE_MAX]; /* Keep track of the breakpoints attached to tasks */ static struct rhltable task_bps_ht; @@ -200,6 +202,8 @@ static __init int init_breakpoint_slots(void) for (i = 0; i < TYPE_MAX; i++) { if (!bp_slots_histogram_alloc(&cpu_pinned[i], i)) goto err; + if (!bp_slots_histogram_alloc(&tsk_pinned_all[i], i)) + goto err; } return 0; @@ -210,8 +214,10 @@ err: if (err_cpu == cpu) break; } - for (i = 0; i < TYPE_MAX; i++) + for (i = 0; i < TYPE_MAX; i++) { bp_slots_histogram_free(&cpu_pinned[i]); + bp_slots_histogram_free(&tsk_pinned_all[i]); + } return -ENOMEM; } @@ -245,6 +251,26 @@ bp_slots_histogram_max(struct bp_slots_histogram *hist, enum bp_type_idx type) return 0; } +static int +bp_slots_histogram_max_merge(struct bp_slots_histogram *hist1, struct bp_slots_histogram *hist2, + enum bp_type_idx type) +{ + for (int i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) { + const int count1 = atomic_read(&hist1->count[i]); + const int count2 = atomic_read(&hist2->count[i]); + + /* Catch unexpected writers; we want a stable snapshot. */ + ASSERT_EXCLUSIVE_WRITER(hist1->count[i]); + ASSERT_EXCLUSIVE_WRITER(hist2->count[i]); + if (count1 + count2 > 0) + return i + 1; + WARN(count1 < 0, "inconsistent breakpoint slots histogram"); + WARN(count2 < 0, "inconsistent breakpoint slots histogram"); + } + + return 0; +} + #ifndef hw_breakpoint_weight static inline int hw_breakpoint_weight(struct perf_event *bp) { @@ -273,7 +299,7 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) * toggle_bp_task_slot() to tsk_pinned, and we get a stable snapshot. 
*/ lockdep_assert_held_write(&bp_cpuinfo_sem); - return bp_slots_histogram_max(tsk_pinned, type); + return bp_slots_histogram_max_merge(tsk_pinned, &tsk_pinned_all[type], type); } /* @@ -366,40 +392,22 @@ max_bp_pinned_slots(struct perf_event *bp, enum bp_type_idx type) return pinned_slots; } -/* - * Add a pinned breakpoint for the given task in our constraint table - */ -static void toggle_bp_task_slot(struct perf_event *bp, int cpu, - enum bp_type_idx type, int weight) -{ - struct bp_slots_histogram *tsk_pinned = &get_bp_info(cpu, type)->tsk_pinned; - - /* - * If bp->hw.target, tsk_pinned is only modified, but not used - * otherwise. We can permit concurrent updates as long as there are no - * other uses: having acquired bp_cpuinfo_sem as a reader allows - * concurrent updates here. Uses of tsk_pinned will require acquiring - * bp_cpuinfo_sem as a writer to stabilize tsk_pinned's value. - */ - lockdep_assert_held_read(&bp_cpuinfo_sem); - bp_slots_histogram_add(tsk_pinned, task_bp_pinned(cpu, bp, type), weight); -} - /* * Add/remove the given breakpoint in our constraint table */ static int -toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, - int weight) +toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, int weight) { - const struct cpumask *cpumask = cpumask_of_bp(bp); - int cpu; + int cpu, next_tsk_pinned; if (!enable) weight = -weight; - /* Pinned counter cpu profiling */ if (!bp->hw.target) { + /* + * Update the pinned CPU slots, in per-CPU bp_cpuinfo and in the + * global histogram. + */ struct bp_cpuinfo *info = get_bp_info(bp->cpu, type); lockdep_assert_held_write(&bp_cpuinfo_sem); @@ -408,9 +416,91 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, return 0; } - /* Pinned counter task profiling */ - for_each_cpu(cpu, cpumask) - toggle_bp_task_slot(bp, cpu, type, weight); + /* + * If bp->hw.target, tsk_pinned is only modified, but not used + * otherwise. We can permit concurrent updates as long as there are no + * other uses: having acquired bp_cpuinfo_sem as a reader allows + * concurrent updates here. Uses of tsk_pinned will require acquiring + * bp_cpuinfo_sem as a writer to stabilize tsk_pinned's value. + */ + lockdep_assert_held_read(&bp_cpuinfo_sem); + + /* + * Update the pinned task slots, in per-CPU bp_cpuinfo and in the global + * histogram. We need to take care of 4 cases: + * + * 1. This breakpoint targets all CPUs (cpu < 0), and there may only + * exist other task breakpoints targeting all CPUs. In this case we + * can simply update the global slots histogram. + * + * 2. This breakpoint targets a specific CPU (cpu >= 0), but there may + * only exist other task breakpoints targeting all CPUs. + * + * a. On enable: remove the existing breakpoints from the global + * slots histogram and use the per-CPU histogram. + * + * b. On disable: re-insert the existing breakpoints into the global + * slots histogram and remove from per-CPU histogram. + * + * 3. Some other existing task breakpoints target specific CPUs. Only + * update the per-CPU slots histogram. + */ + + if (!enable) { + /* + * Remove before updating histograms so we can determine if this + * was the last task breakpoint for a specific CPU. + */ + int ret = rhltable_remove(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params); + + if (ret) + return ret; + } + /* + * Note: If !enable, next_tsk_pinned will not count the to-be-removed breakpoint. 
+ */ + next_tsk_pinned = task_bp_pinned(-1, bp, type); + + if (next_tsk_pinned >= 0) { + if (bp->cpu < 0) { /* Case 1: fast path */ + if (!enable) + next_tsk_pinned += hw_breakpoint_weight(bp); + bp_slots_histogram_add(&tsk_pinned_all[type], next_tsk_pinned, weight); + } else if (enable) { /* Case 2.a: slow path */ + /* Add existing to per-CPU histograms. */ + for_each_possible_cpu(cpu) { + bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned, + 0, next_tsk_pinned); + } + /* Add this first CPU-pinned task breakpoint. */ + bp_slots_histogram_add(&get_bp_info(bp->cpu, type)->tsk_pinned, + next_tsk_pinned, weight); + /* Rebalance global task pinned histogram. */ + bp_slots_histogram_add(&tsk_pinned_all[type], next_tsk_pinned, + -next_tsk_pinned); + } else { /* Case 2.b: slow path */ + /* Remove this last CPU-pinned task breakpoint. */ + bp_slots_histogram_add(&get_bp_info(bp->cpu, type)->tsk_pinned, + next_tsk_pinned + hw_breakpoint_weight(bp), weight); + /* Remove all from per-CPU histograms. */ + for_each_possible_cpu(cpu) { + bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned, + next_tsk_pinned, -next_tsk_pinned); + } + /* Rebalance global task pinned histogram. */ + bp_slots_histogram_add(&tsk_pinned_all[type], 0, next_tsk_pinned); + } + } else { /* Case 3: slow path */ + const struct cpumask *cpumask = cpumask_of_bp(bp); + + for_each_cpu(cpu, cpumask) { + next_tsk_pinned = task_bp_pinned(cpu, bp, type); + if (!enable) + next_tsk_pinned += hw_breakpoint_weight(bp); + bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned, + next_tsk_pinned, weight); + } + } /* * Readers want a stable snapshot of the per-task breakpoint list. @@ -419,8 +509,8 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, if (enable) return rhltable_insert(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params); - else - return rhltable_remove(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params); + + return 0; } __weak int arch_reserve_bp_slot(struct perf_event *bp) @@ -850,6 +940,9 @@ bool hw_breakpoint_is_used(void) */ if (WARN_ON(atomic_read(&cpu_pinned[type].count[slot]))) return true; + + if (atomic_read(&tsk_pinned_all[type].count[slot])) + return true; } } -- cgit v1.2.3-59-g8ed1b From 3aac580d5cc3001ca1627725b3b61edb529f341d Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:54 -0700 Subject: perf: Add sample_flags to indicate the PMU-filled sample data On some platforms, some data e.g., timestamps, can be retrieved from the PMU driver. Usually, the data from the PMU driver is more accurate. The current perf kernel should output the PMU-filled sample data if it's available. To check the availability of the PMU-filled sample data, the current perf kernel initializes the related fields in the perf_sample_data_init(). When outputting a sample, the perf checks whether the field is updated by the PMU driver. If yes, the updated value will be output. If not, the perf uses an SW way to calculate the value or just outputs the initialized value if an SW way is unavailable either. With more and more data being provided by the PMU driver, more fields has to be initialized in the perf_sample_data_init(). That will increase the number of cache lines touched in perf_sample_data_init() and be harmful to the performance. Add new "sample_flags" to indicate the PMU-filled sample data. The PMU driver should set the corresponding PERF_SAMPLE_ flag when the field is updated. The initialization of the corresponding field is not required anymore. 
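A minimal sketch of the intended driver-side pattern (hypothetical helper name; the concrete conversions follow in the next patches of this series):

	#include <linux/perf_event.h>

	/* Hypothetical: a PMU driver records its own timestamp and flags it. */
	static void pmu_set_sample_time(struct perf_sample_data *data, u64 hw_ts)
	{
		data->time = hw_ts;
		data->sample_flags |= PERF_SAMPLE_TIME;
	}

perf_prepare_sample() then masks the advertised bits out of the requested sample type (filtered_sample_type = sample_type & ~data->sample_flags) and only generates the remaining fields in software.
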
The following patches will make use of it and remove the corresponding fields from the perf_sample_data_init(), which will further minimize the number of cache lines touched. Only clear the sample flags that have already been done by the PMU driver in the perf_prepare_sample() for the PERF_RECORD_SAMPLE. For the other PERF_RECORD_ event type, the sample data is not available. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-2-kan.liang@linux.intel.com --- include/linux/perf_event.h | 2 ++ kernel/events/core.c | 17 +++++++++++------ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1999408a9cbb..0978165a2d87 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1008,6 +1008,7 @@ struct perf_sample_data { * Fields set by perf_sample_data_init(), group so as to * minimize the cachelines touched. */ + u64 sample_flags; u64 addr; struct perf_raw_record *raw; struct perf_branch_stack *br_stack; @@ -1057,6 +1058,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr, u64 period) { /* remaining struct members initialized in perf_prepare_sample() */ + data->sample_flags = 0; data->addr = addr; data->raw = NULL; data->br_stack = NULL; diff --git a/kernel/events/core.c b/kernel/events/core.c index 2621fd24ad26..c9b9cb79231a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6794,11 +6794,10 @@ out_put: static void __perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, - struct perf_event *event) + struct perf_event *event, + u64 sample_type) { - u64 sample_type = event->attr.sample_type; - - data->type = sample_type; + data->type = event->attr.sample_type; header->size += event->id_header_size; if (sample_type & PERF_SAMPLE_TID) { @@ -6827,7 +6826,7 @@ void perf_event_header__init_id(struct perf_event_header *header, struct perf_event *event) { if (event->attr.sample_id_all) - __perf_event_header__init_id(header, data, event); + __perf_event_header__init_id(header, data, event, event->attr.sample_type); } static void __perf_event__output_id_sample(struct perf_output_handle *handle, @@ -7303,6 +7302,7 @@ void perf_prepare_sample(struct perf_event_header *header, struct pt_regs *regs) { u64 sample_type = event->attr.sample_type; + u64 filtered_sample_type; header->type = PERF_RECORD_SAMPLE; header->size = sizeof(*header) + event->header_size; @@ -7310,7 +7310,12 @@ void perf_prepare_sample(struct perf_event_header *header, header->misc = 0; header->misc |= perf_misc_flags(regs); - __perf_event_header__init_id(header, data, event); + /* + * Clear the sample flags that have already been done by the + * PMU driver. + */ + filtered_sample_type = sample_type & ~data->sample_flags; + __perf_event_header__init_id(header, data, event, filtered_sample_type); if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE)) data->ip = perf_instruction_pointer(regs); -- cgit v1.2.3-59-g8ed1b From 47a3aeb39e8dc099ae431cd8b46bdf218f5511b2 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:55 -0700 Subject: perf/x86/intel/pebs: Fix PEBS timestamps overwritten The PEBS TSC-based timestamps do not appear correctly in the final perf.data output file from perf record. The data->time field setup by PEBS in the setup_pebs_fixed_sample_data() is later overwritten by perf_events generic code in perf_prepare_sample(). 
There is an ordering problem. Set the sample flags when the data->time is updated by PEBS. The data->time field will not be overwritten anymore. Reported-by: Andreas Kogler Reported-by: Stephane Eranian Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-3-kan.liang@linux.intel.com --- arch/x86/events/intel/ds.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index ba60427caa6d..cdd857bd2dd6 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1635,8 +1635,10 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, * We can only do this for the default trace clock. */ if (x86_pmu.intel_cap.pebs_format >= 3 && - event->attr.use_clockid == 0) + event->attr.use_clockid == 0) { data->time = native_sched_clock_from_tsc(pebs->tsc); + data->sample_flags |= PERF_SAMPLE_TIME; + } if (has_branch_stack(event)) data->br_stack = &cpuc->lbr_stack; @@ -1697,8 +1699,10 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, perf_sample_data_init(data, 0, event->hw.last_period); data->period = event->hw.last_period; - if (event->attr.use_clockid == 0) + if (event->attr.use_clockid == 0) { data->time = native_sched_clock_from_tsc(basic->tsc); + data->sample_flags |= PERF_SAMPLE_TIME; + } /* * We must however always use iregs for the unwinder to stay sane; the -- cgit v1.2.3-59-g8ed1b From a9a931e2666878343782c82d7d55cc173ddeb3e9 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:56 -0700 Subject: perf: Use sample_flags for branch stack Use the new sample_flags to indicate whether the branch stack is filled by the PMU driver. Remove the br_stack from the perf_sample_data_init() to minimize the number of cache lines touched. 
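The other side of the contract is visible in the perf_prepare_sample()/perf_output_sample() hunks below: since br_stack is no longer initialized to NULL, generic code must test the flag rather than the pointer before dereferencing it, e.g. (restated fragment):

	if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK)
		nr = data->br_stack->nr;	/* valid only when the PMU set the flag */
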
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-4-kan.liang@linux.intel.com --- arch/powerpc/perf/core-book3s.c | 1 + arch/x86/events/amd/core.c | 4 +++- arch/x86/events/core.c | 4 +++- arch/x86/events/intel/core.c | 4 +++- arch/x86/events/intel/ds.c | 5 ++++- include/linux/perf_event.h | 4 ++-- kernel/events/core.c | 4 ++-- 7 files changed, 18 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 13919eb96931..1ad1efdb33f9 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2297,6 +2297,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val, cpuhw = this_cpu_ptr(&cpu_hw_events); power_pmu_bhrb_read(event, cpuhw); data.br_stack = &cpuhw->bhrb_stack; + data.sample_flags |= PERF_SAMPLE_BRANCH_STACK; } if (event->attr.sample_type & PERF_SAMPLE_DATA_SRC && diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 36bede1d7b1e..bd99d2ae14c3 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -929,8 +929,10 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) if (!x86_perf_event_set_period(event)) continue; - if (has_branch_stack(event)) + if (has_branch_stack(event)) { data.br_stack = &cpuc->lbr_stack; + data.sample_flags |= PERF_SAMPLE_BRANCH_STACK; + } if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index f969410d0c90..bb34a28fa71b 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1714,8 +1714,10 @@ int x86_pmu_handle_irq(struct pt_regs *regs) perf_sample_data_init(&data, 0, event->hw.last_period); - if (has_branch_stack(event)) + if (has_branch_stack(event)) { data.br_stack = &cpuc->lbr_stack; + data.sample_flags |= PERF_SAMPLE_BRANCH_STACK; + } if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 2db93498ff71..ba101c28dcc9 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2995,8 +2995,10 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) perf_sample_data_init(&data, 0, event->hw.last_period); - if (has_branch_stack(event)) + if (has_branch_stack(event)) { data.br_stack = &cpuc->lbr_stack; + data.sample_flags |= PERF_SAMPLE_BRANCH_STACK; + } if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index cdd857bd2dd6..0489f750baa0 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1640,8 +1640,10 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, data->sample_flags |= PERF_SAMPLE_TIME; } - if (has_branch_stack(event)) + if (has_branch_stack(event)) { data->br_stack = &cpuc->lbr_stack; + data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; + } } static void adaptive_pebs_save_regs(struct pt_regs *regs, @@ -1791,6 +1793,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, if (has_branch_stack(event)) { intel_pmu_store_pebs_lbrs(lbr); data->br_stack = &cpuc->lbr_stack; + data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; } } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0978165a2d87..1e12e79454e0 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1011,7 +1011,6 @@ struct perf_sample_data { u64 sample_flags; u64 addr; struct 
perf_raw_record *raw; - struct perf_branch_stack *br_stack; u64 period; union perf_sample_weight weight; u64 txn; @@ -1021,6 +1020,8 @@ struct perf_sample_data { * The other fields, optionally {set,used} by * perf_{prepare,output}_sample(). */ + struct perf_branch_stack *br_stack; + u64 type; u64 ip; struct { @@ -1061,7 +1062,6 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->sample_flags = 0; data->addr = addr; data->raw = NULL; - data->br_stack = NULL; data->period = period; data->weight.full = 0; data->data_src.val = PERF_MEM_NA; diff --git a/kernel/events/core.c b/kernel/events/core.c index c9b9cb79231a..104c0c9f4e6f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7052,7 +7052,7 @@ void perf_output_sample(struct perf_output_handle *handle, } if (sample_type & PERF_SAMPLE_BRANCH_STACK) { - if (data->br_stack) { + if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) { size_t size; size = data->br_stack->nr @@ -7358,7 +7358,7 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_BRANCH_STACK) { int size = sizeof(u64); /* nr */ - if (data->br_stack) { + if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) { if (perf_sample_save_hw_index(event)) size += sizeof(u64); -- cgit v1.2.3-59-g8ed1b From 2abe681da0a192ab850a5271d838a7817b469fca Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:57 -0700 Subject: perf: Use sample_flags for weight Use the new sample_flags to indicate whether the weight field is filled by the PMU driver. Remove the weight field from the perf_sample_data_init() to minimize the number of cache lines touched. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-5-kan.liang@linux.intel.com --- arch/powerpc/perf/core-book3s.c | 5 +++-- arch/x86/events/intel/ds.c | 10 +++++++--- include/linux/perf_event.h | 3 +-- kernel/events/core.c | 3 +++ 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 1ad1efdb33f9..a5c95a2006ea 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2305,9 +2305,10 @@ static void record_and_restart(struct perf_event *event, unsigned long val, ppmu->get_mem_data_src(&data.data_src, ppmu->flags, regs); if (event->attr.sample_type & PERF_SAMPLE_WEIGHT_TYPE && - ppmu->get_mem_weight) + ppmu->get_mem_weight) { ppmu->get_mem_weight(&data.weight.full, event->attr.sample_type); - + data.sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; + } if (perf_event_overflow(event, &data, regs)) power_pmu_stop(event, 0); } else if (period) { diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 0489f750baa0..4c51118e4add 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1527,8 +1527,10 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, /* * Use latency for weight (only avail with PEBS-LL) */ - if (fll && (sample_type & PERF_SAMPLE_WEIGHT_TYPE)) + if (fll && (sample_type & PERF_SAMPLE_WEIGHT_TYPE)) { data->weight.full = pebs->lat; + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; + } /* * data.data_src encodes the data source @@ -1620,9 +1622,10 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, if (x86_pmu.intel_cap.pebs_format >= 2) { /* Only set the TSX weight when no memory weight. 
*/ - if ((sample_type & PERF_SAMPLE_WEIGHT_TYPE) && !fll) + if ((sample_type & PERF_SAMPLE_WEIGHT_TYPE) && !fll) { data->weight.full = intel_get_tsx_weight(pebs->tsx_tuning); - + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; + } if (sample_type & PERF_SAMPLE_TRANSACTION) data->txn = intel_get_tsx_transaction(pebs->tsx_tuning, pebs->ax); @@ -1764,6 +1767,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, data->weight.var1_dw = (u32)(weight & PEBS_LATENCY_MASK) ?: intel_get_tsx_weight(meminfo->tsx_tuning); } + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; } if (sample_type & PERF_SAMPLE_DATA_SRC) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1e12e79454e0..06a587b5faa9 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1012,7 +1012,6 @@ struct perf_sample_data { u64 addr; struct perf_raw_record *raw; u64 period; - union perf_sample_weight weight; u64 txn; union perf_mem_data_src data_src; @@ -1021,6 +1020,7 @@ struct perf_sample_data { * perf_{prepare,output}_sample(). */ struct perf_branch_stack *br_stack; + union perf_sample_weight weight; u64 type; u64 ip; @@ -1063,7 +1063,6 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->addr = addr; data->raw = NULL; data->period = period; - data->weight.full = 0; data->data_src.val = PERF_MEM_NA; data->txn = 0; } diff --git a/kernel/events/core.c b/kernel/events/core.c index 104c0c9f4e6f..f0af45db02b3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7408,6 +7408,9 @@ void perf_prepare_sample(struct perf_event_header *header, header->size += size; } + if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) + data->weight.full = 0; + if (sample_type & PERF_SAMPLE_REGS_INTR) { /* regs dump ABI info */ int size = sizeof(u64); -- cgit v1.2.3-59-g8ed1b From e16fd7f2cb1a65555cfe76f983eaefb1eab7471f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:58 -0700 Subject: perf: Use sample_flags for data_src Use the new sample_flags to indicate whether the data_src field is filled by the PMU driver. Remove the data_src field from the perf_sample_data_init() to minimize the number of cache lines touched. 
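For context, the core-side counterpart of these driver changes is a lazy default in perf_prepare_sample(); a simplified sketch, where filtered_sample_type stands for the sample bits the event requested but the PMU driver did not provide:

	/* Sketch: default a field only if it was requested and not PMU-filled. */
	u64 filtered_sample_type = event->attr.sample_type & ~data->sample_flags;

	if (filtered_sample_type & PERF_SAMPLE_DATA_SRC)
		data->data_src.val = PERF_MEM_NA;	/* "not available" default */

so a field that neither the driver nor the event cares about is never touched at all.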
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-6-kan.liang@linux.intel.com --- arch/powerpc/perf/core-book3s.c | 4 +++- arch/x86/events/intel/ds.c | 8 ++++++-- include/linux/perf_event.h | 3 +-- kernel/events/core.c | 3 +++ 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index a5c95a2006ea..6ec7069e6482 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2301,8 +2301,10 @@ static void record_and_restart(struct perf_event *event, unsigned long val, } if (event->attr.sample_type & PERF_SAMPLE_DATA_SRC && - ppmu->get_mem_data_src) + ppmu->get_mem_data_src) { ppmu->get_mem_data_src(&data.data_src, ppmu->flags, regs); + data.sample_flags |= PERF_SAMPLE_DATA_SRC; + } if (event->attr.sample_type & PERF_SAMPLE_WEIGHT_TYPE && ppmu->get_mem_weight) { diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 4c51118e4add..bde73d492889 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1535,8 +1535,10 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, /* * data.data_src encodes the data source */ - if (sample_type & PERF_SAMPLE_DATA_SRC) + if (sample_type & PERF_SAMPLE_DATA_SRC) { data->data_src.val = get_data_src(event, pebs->dse); + data->sample_flags |= PERF_SAMPLE_DATA_SRC; + } /* * We must however always use iregs for the unwinder to stay sane; the @@ -1770,8 +1772,10 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; } - if (sample_type & PERF_SAMPLE_DATA_SRC) + if (sample_type & PERF_SAMPLE_DATA_SRC) { data->data_src.val = get_data_src(event, meminfo->aux); + data->sample_flags |= PERF_SAMPLE_DATA_SRC; + } if (sample_type & PERF_SAMPLE_ADDR_TYPE) data->addr = meminfo->address; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 06a587b5faa9..6849f10dfc7e 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1013,7 +1013,6 @@ struct perf_sample_data { struct perf_raw_record *raw; u64 period; u64 txn; - union perf_mem_data_src data_src; /* * The other fields, optionally {set,used} by @@ -1021,6 +1020,7 @@ struct perf_sample_data { */ struct perf_branch_stack *br_stack; union perf_sample_weight weight; + union perf_mem_data_src data_src; u64 type; u64 ip; @@ -1063,7 +1063,6 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->addr = addr; data->raw = NULL; data->period = period; - data->data_src.val = PERF_MEM_NA; data->txn = 0; } diff --git a/kernel/events/core.c b/kernel/events/core.c index f0af45db02b3..163e2f478e61 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7411,6 +7411,9 @@ void perf_prepare_sample(struct perf_event_header *header, if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) data->weight.full = 0; + if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) + data->data_src.val = PERF_MEM_NA; + if (sample_type & PERF_SAMPLE_REGS_INTR) { /* regs dump ABI info */ int size = sizeof(u64); -- cgit v1.2.3-59-g8ed1b From ee9db0e14b0575aa827579dc2471a29ec5fc6877 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:59 -0700 Subject: perf: Use sample_flags for txn Use the new sample_flags to indicate whether the txn field is filled by the PMU driver. Remove the txn field from the perf_sample_data_init() to minimize the number of cache lines touched. 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-7-kan.liang@linux.intel.com --- arch/x86/events/intel/ds.c | 8 ++++++-- include/linux/perf_event.h | 3 +-- kernel/events/core.c | 3 +++ 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index bde73d492889..a5275c235c2a 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1628,9 +1628,11 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, data->weight.full = intel_get_tsx_weight(pebs->tsx_tuning); data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; } - if (sample_type & PERF_SAMPLE_TRANSACTION) + if (sample_type & PERF_SAMPLE_TRANSACTION) { data->txn = intel_get_tsx_transaction(pebs->tsx_tuning, pebs->ax); + data->sample_flags |= PERF_SAMPLE_TRANSACTION; + } } /* @@ -1780,9 +1782,11 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, if (sample_type & PERF_SAMPLE_ADDR_TYPE) data->addr = meminfo->address; - if (sample_type & PERF_SAMPLE_TRANSACTION) + if (sample_type & PERF_SAMPLE_TRANSACTION) { data->txn = intel_get_tsx_transaction(meminfo->tsx_tuning, gprs ? gprs->ax : 0); + data->sample_flags |= PERF_SAMPLE_TRANSACTION; + } } if (format_size & PEBS_DATACFG_XMMS) { diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 6849f10dfc7e..581880ddb9ef 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1012,7 +1012,6 @@ struct perf_sample_data { u64 addr; struct perf_raw_record *raw; u64 period; - u64 txn; /* * The other fields, optionally {set,used} by @@ -1021,6 +1020,7 @@ struct perf_sample_data { struct perf_branch_stack *br_stack; union perf_sample_weight weight; union perf_mem_data_src data_src; + u64 txn; u64 type; u64 ip; @@ -1063,7 +1063,6 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->addr = addr; data->raw = NULL; data->period = period; - data->txn = 0; } /* diff --git a/kernel/events/core.c b/kernel/events/core.c index 163e2f478e61..15d27b14c827 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7414,6 +7414,9 @@ void perf_prepare_sample(struct perf_event_header *header, if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) data->data_src.val = PERF_MEM_NA; + if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) + data->txn = 0; + if (sample_type & PERF_SAMPLE_REGS_INTR) { /* regs dump ABI info */ int size = sizeof(u64); -- cgit v1.2.3-59-g8ed1b From 03b02db93be407103c385814033633364674a6f6 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 6 Sep 2022 14:14:14 +0530 Subject: perf: Consolidate branch sample filter helpers Besides the branch type filtering requests, 'event.attr.branch_sample_type' also contains various flags indicating which additional information should be captured, along with the base branch record. These flags help configure the underlying hardware, and capture the branch records appropriately when required e.g after PMU interrupt. But first, this moves an existing helper perf_sample_save_hw_index() into the header before adding some more helpers for other branch sample filter flags. 
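As a purely hypothetical usage sketch (no driver in this patch uses the new helpers yet, and the MY_BR_* bits below do not exist), an arch PMU driver could consult them while building its hardware branch-filter configuration:

	/* Illustrative only: map branch_sample_type flags to hardware bits. */
	static u64 my_pmu_branch_config(struct perf_event *event)
	{
		u64 cfg = 0;

		if (!branch_sample_no_flags(event))
			cfg |= MY_BR_RECORD_FLAGS;
		if (!branch_sample_no_cycles(event))
			cfg |= MY_BR_RECORD_CYCLES;
		if (branch_sample_type(event))
			cfg |= MY_BR_RECORD_TYPE;
		if (branch_sample_priv(event))
			cfg |= MY_BR_RECORD_PRIV;

		return cfg;
	}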
Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220906084414.396220-1-anshuman.khandual@arm.com --- include/linux/perf_event.h | 26 ++++++++++++++++++++++++++ kernel/events/core.c | 9 ++------- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 581880ddb9ef..a627528e5f3a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1685,4 +1685,30 @@ static inline void perf_lopwr_cb(bool mode) } #endif +#ifdef CONFIG_PERF_EVENTS +static inline bool branch_sample_no_flags(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_FLAGS; +} + +static inline bool branch_sample_no_cycles(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_CYCLES; +} + +static inline bool branch_sample_type(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_TYPE_SAVE; +} + +static inline bool branch_sample_hw_index(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; +} + +static inline bool branch_sample_priv(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_PRIV_SAVE; +} +#endif /* CONFIG_PERF_EVENTS */ #endif /* _LINUX_PERF_EVENT_H */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 15d27b14c827..00389d5f9241 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6966,11 +6966,6 @@ static void perf_output_read(struct perf_output_handle *handle, perf_output_read_one(handle, event, enabled, running); } -static inline bool perf_sample_save_hw_index(struct perf_event *event) -{ - return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; -} - void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, @@ -7059,7 +7054,7 @@ void perf_output_sample(struct perf_output_handle *handle, * sizeof(struct perf_branch_entry); perf_output_put(handle, data->br_stack->nr); - if (perf_sample_save_hw_index(event)) + if (branch_sample_hw_index(event)) perf_output_put(handle, data->br_stack->hw_idx); perf_output_copy(handle, data->br_stack->entries, size); } else { @@ -7359,7 +7354,7 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_BRANCH_STACK) { int size = sizeof(u64); /* nr */ if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) { - if (perf_sample_save_hw_index(event)) + if (branch_sample_hw_index(event)) size += sizeof(u64); size += data->br_stack->nr -- cgit v1.2.3-59-g8ed1b From 7517f08b9a5eef0fa683b976c97d6178d00e6a3d Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 7 Sep 2022 14:49:21 +0530 Subject: perf/core: Expand PERF_EVENT_FLAG_ARCH Two hardware event flags on the x86 platform have overshot PERF_EVENT_FLAG_ARCH (0x0000ffff). These flags are PERF_X86_EVENT_PEBS_LAT_HYBRID (0x20000) and PERF_X86_EVENT_AMD_BRS (0x10000). Let's expand the PERF_EVENT_FLAG_ARCH mask to accommodate those flags, and also create room for two more in the future.
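Spelling out the arithmetic: the old mask 0x0000ffff covers bits 0-15, while PERF_X86_EVENT_AMD_BRS (0x10000) sits in bit 16 and PERF_X86_EVENT_PEBS_LAT_HYBRID (0x20000) in bit 17. Widening the mask to 0x000fffff covers bits 0-19, which takes in both existing flags and leaves bits 18 and 19 (0x40000 and 0x80000) free for future flags.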
Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: James Clark Link: https://lkml.kernel.org/r/20220907091924.439193-2-anshuman.khandual@arm.com --- include/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index a627528e5f3a..3e3c07512b75 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -138,7 +138,7 @@ struct hw_perf_event_extra { * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific * usage. */ -#define PERF_EVENT_FLAG_ARCH 0x0000ffff +#define PERF_EVENT_FLAG_ARCH 0x000fffff #define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000 /** -- cgit v1.2.3-59-g8ed1b From f67dd218fafd9de9a13d095e775b621db76a058f Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 7 Sep 2022 14:49:22 +0530 Subject: perf/core: Assert PERF_EVENT_FLAG_ARCH does not overlap with generic flags This just ensures that PERF_EVENT_FLAG_ARCH does not overlap with generic hardware event flags. Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: James Clark Link: https://lkml.kernel.org/r/20220907091924.439193-3-anshuman.khandual@arm.com --- include/linux/perf_event.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 3e3c07512b75..f88cb31eaf75 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -141,6 +141,8 @@ struct hw_perf_event_extra { #define PERF_EVENT_FLAG_ARCH 0x000fffff #define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000 +static_assert((PERF_EVENT_FLAG_USER_READ_CNT & PERF_EVENT_FLAG_ARCH) == 0); + /** * struct hw_perf_event - performance event hardware details: */ -- cgit v1.2.3-59-g8ed1b From 91207f62616f9f51b52436364e6d064f002e9112 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 7 Sep 2022 14:49:23 +0530 Subject: arm64/perf: Assert all platform event flags are within PERF_EVENT_FLAG_ARCH Ensure all platform specific event flags are within PERF_EVENT_FLAG_ARCH. Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: James Clark Link: https://lkml.kernel.org/r/20220907091924.439193-4-anshuman.khandual@arm.com --- drivers/perf/arm_spe_pmu.c | 4 +++- include/linux/perf/arm_pmu.h | 9 +++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index b65a7d9640e1..db8a0a841062 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -44,7 +44,9 @@ * This allows us to perform the check, i.e, perfmon_capable(), * in the context of the event owner, once, during the event_init(). 
*/ -#define SPE_PMU_HW_FLAGS_CX BIT(0) +#define SPE_PMU_HW_FLAGS_CX 0x00001 + +static_assert((PERF_EVENT_FLAG_ARCH & SPE_PMU_HW_FLAGS_CX) == SPE_PMU_HW_FLAGS_CX); static void set_spe_event_has_cx(struct perf_event *event) { diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 0407a38b470a..0356cb6a215d 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -24,10 +24,11 @@ /* * ARM PMU hw_event flags */ -/* Event uses a 64bit counter */ -#define ARMPMU_EVT_64BIT 1 -/* Event uses a 47bit counter */ -#define ARMPMU_EVT_47BIT 2 +#define ARMPMU_EVT_64BIT 0x00001 /* Event uses a 64bit counter */ +#define ARMPMU_EVT_47BIT 0x00002 /* Event uses a 47bit counter */ + +static_assert((PERF_EVENT_FLAG_ARCH & ARMPMU_EVT_64BIT) == ARMPMU_EVT_64BIT); +static_assert((PERF_EVENT_FLAG_ARCH & ARMPMU_EVT_47BIT) == ARMPMU_EVT_47BIT); #define HW_OP_UNSUPPORTED 0xFFFF #define C(_x) PERF_COUNT_HW_CACHE_##_x -- cgit v1.2.3-59-g8ed1b From 88081cfb699ce2568e5309c145eb9f9e9497b53f Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 7 Sep 2022 14:49:24 +0530 Subject: x86/perf: Assert all platform event flags are within PERF_EVENT_FLAG_ARCH Ensure all platform specific event flags are within PERF_EVENT_FLAG_ARCH. Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: James Clark Link: https://lkml.kernel.org/r/20220907091924.439193-5-anshuman.khandual@arm.com --- arch/x86/events/perf_event.h | 34 ++++++++++++++++------------------ arch/x86/events/perf_event_flags.h | 22 ++++++++++++++++++++++ 2 files changed, 38 insertions(+), 18 deletions(-) create mode 100644 arch/x86/events/perf_event_flags.h diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 93263b98cd6e..4a3dde24a1ae 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -64,27 +64,25 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode) return ((ecode & c->cmask) - c->code) <= (u64)c->size; } +#define PERF_ARCH(name, val) \ + PERF_X86_EVENT_##name = val, + /* * struct hw_perf_event.flags flags */ -#define PERF_X86_EVENT_PEBS_LDLAT 0x00001 /* ld+ldlat data address sampling */ -#define PERF_X86_EVENT_PEBS_ST 0x00002 /* st data address sampling */ -#define PERF_X86_EVENT_PEBS_ST_HSW 0x00004 /* haswell style datala, store */ -#define PERF_X86_EVENT_PEBS_LD_HSW 0x00008 /* haswell style datala, load */ -#define PERF_X86_EVENT_PEBS_NA_HSW 0x00010 /* haswell style datala, unknown */ -#define PERF_X86_EVENT_EXCL 0x00020 /* HT exclusivity on counter */ -#define PERF_X86_EVENT_DYNAMIC 0x00040 /* dynamic alloc'd constraint */ - -#define PERF_X86_EVENT_EXCL_ACCT 0x00100 /* accounted EXCL event */ -#define PERF_X86_EVENT_AUTO_RELOAD 0x00200 /* use PEBS auto-reload */ -#define PERF_X86_EVENT_LARGE_PEBS 0x00400 /* use large PEBS */ -#define PERF_X86_EVENT_PEBS_VIA_PT 0x00800 /* use PT buffer for PEBS */ -#define PERF_X86_EVENT_PAIR 0x01000 /* Large Increment per Cycle */ -#define PERF_X86_EVENT_LBR_SELECT 0x02000 /* Save/Restore MSR_LBR_SELECT */ -#define PERF_X86_EVENT_TOPDOWN 0x04000 /* Count Topdown slots/metrics events */ -#define PERF_X86_EVENT_PEBS_STLAT 0x08000 /* st+stlat data address sampling */ -#define PERF_X86_EVENT_AMD_BRS 0x10000 /* AMD Branch Sampling */ -#define PERF_X86_EVENT_PEBS_LAT_HYBRID 0x20000 /* ld and st lat for hybrid */ +enum { +#include "perf_event_flags.h" +}; + +#undef PERF_ARCH + +#define PERF_ARCH(name, val) \ + static_assert((PERF_X86_EVENT_##name & PERF_EVENT_FLAG_ARCH) == \ + 
PERF_X86_EVENT_##name); + +#include "perf_event_flags.h" + +#undef PERF_ARCH static inline bool is_topdown_count(struct perf_event *event) { diff --git a/arch/x86/events/perf_event_flags.h b/arch/x86/events/perf_event_flags.h new file mode 100644 index 000000000000..1dc19b9b4426 --- /dev/null +++ b/arch/x86/events/perf_event_flags.h @@ -0,0 +1,22 @@ + +/* + * struct hw_perf_event.flags flags + */ +PERF_ARCH(PEBS_LDLAT, 0x00001) /* ld+ldlat data address sampling */ +PERF_ARCH(PEBS_ST, 0x00002) /* st data address sampling */ +PERF_ARCH(PEBS_ST_HSW, 0x00004) /* haswell style datala, store */ +PERF_ARCH(PEBS_LD_HSW, 0x00008) /* haswell style datala, load */ +PERF_ARCH(PEBS_NA_HSW, 0x00010) /* haswell style datala, unknown */ +PERF_ARCH(EXCL, 0x00020) /* HT exclusivity on counter */ +PERF_ARCH(DYNAMIC, 0x00040) /* dynamic alloc'd constraint */ + /* 0x00080 */ +PERF_ARCH(EXCL_ACCT, 0x00100) /* accounted EXCL event */ +PERF_ARCH(AUTO_RELOAD, 0x00200) /* use PEBS auto-reload */ +PERF_ARCH(LARGE_PEBS, 0x00400) /* use large PEBS */ +PERF_ARCH(PEBS_VIA_PT, 0x00800) /* use PT buffer for PEBS */ +PERF_ARCH(PAIR, 0x01000) /* Large Increment per Cycle */ +PERF_ARCH(LBR_SELECT, 0x02000) /* Save/Restore MSR_LBR_SELECT */ +PERF_ARCH(TOPDOWN, 0x04000) /* Count Topdown slots/metrics events */ +PERF_ARCH(PEBS_STLAT, 0x08000) /* st+stlat data address sampling */ +PERF_ARCH(AMD_BRS, 0x10000) /* AMD Branch Sampling */ +PERF_ARCH(PEBS_LAT_HYBRID, 0x20000) /* ld and st lat for hybrid */ -- cgit v1.2.3-59-g8ed1b From f3c0eba287049237b23d1300376768293eb89e69 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 2 Sep 2022 18:48:55 +0200 Subject: perf: Add a few assertions While auditing 6b959ba22d34 ("perf/core: Fix reentry problem in perf_output_read_group()") a few spots were found that wanted assertions. Notable for_each_sibling_event() relies on exclusion from modification. This would normally be holding either ctx->lock or ctx->mutex, however due to how things are constructed disabling IRQs is a valid and sufficient substitute for ctx->lock. Another possible site to add assertions would be the various pmu::{add,del,read,..}() methods, but that's not trivially expressable in C -- the best option is wrappers, but those are easy enough to forget. Signed-off-by: Peter Zijlstra (Intel) --- include/linux/perf_event.h | 17 +++++++++++++++++ kernel/events/core.c | 2 ++ 2 files changed, 19 insertions(+) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f88cb31eaf75..368bdc4f563f 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -61,6 +61,7 @@ struct perf_guest_info_callbacks { #include #include #include +#include #include struct perf_callchain_entry { @@ -634,7 +635,23 @@ struct pmu_event_list { struct list_head list; }; +/* + * event->sibling_list is modified whole holding both ctx->lock and ctx->mutex + * as such iteration must hold either lock. However, since ctx->lock is an IRQ + * safe lock, and is only held by the CPU doing the modification, having IRQs + * disabled is sufficient since it will hold-off the IPIs. 
+ */ +#ifdef CONFIG_PROVE_LOCKING +#define lockdep_assert_event_ctx(event) \ + WARN_ON_ONCE(__lockdep_enabled && \ + (this_cpu_read(hardirqs_enabled) || \ + lockdep_is_held(&(event)->ctx->mutex) != LOCK_STATE_HELD)) +#else +#define lockdep_assert_event_ctx(event) +#endif + #define for_each_sibling_event(sibling, event) \ + lockdep_assert_event_ctx(event); \ if ((event)->group_leader == (event)) \ list_for_each_entry((sibling), &(event)->sibling_list, sibling_list) diff --git a/kernel/events/core.c b/kernel/events/core.c index 00389d5f9241..3e90e454b995 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1468,6 +1468,8 @@ static void __update_context_time(struct perf_event_context *ctx, bool adv) { u64 now = perf_clock(); + lockdep_assert_held(&ctx->lock); + if (adv) ctx->time += now - ctx->timestamp; ctx->timestamp = now; -- cgit v1.2.3-59-g8ed1b From 73759c346341d39dfde39701476c0376dea0a98b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 May 2022 21:27:22 +0200 Subject: perf/x86: Add two more x86_pmu methods In order to clean up x86_perf_event_{set_period,update)() start by adding them as x86_pmu methods. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220829101321.440196408@infradead.org --- arch/x86/events/core.c | 22 +++++++++++++++++----- arch/x86/events/perf_event.h | 5 +++++ 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index bb34a28fa71b..bb559b79ff23 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -72,6 +72,9 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_add, *x86_pmu.add); DEFINE_STATIC_CALL_NULL(x86_pmu_del, *x86_pmu.del); DEFINE_STATIC_CALL_NULL(x86_pmu_read, *x86_pmu.read); +DEFINE_STATIC_CALL_NULL(x86_pmu_set_period, *x86_pmu.set_period); +DEFINE_STATIC_CALL_NULL(x86_pmu_update, *x86_pmu.update); + DEFINE_STATIC_CALL_NULL(x86_pmu_schedule_events, *x86_pmu.schedule_events); DEFINE_STATIC_CALL_NULL(x86_pmu_get_event_constraints, *x86_pmu.get_event_constraints); DEFINE_STATIC_CALL_NULL(x86_pmu_put_event_constraints, *x86_pmu.put_event_constraints); @@ -1518,7 +1521,7 @@ static void x86_pmu_start(struct perf_event *event, int flags) if (flags & PERF_EF_RELOAD) { WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); - x86_perf_event_set_period(event); + static_call(x86_pmu_set_period)(event); } event->hw.state = 0; @@ -1610,7 +1613,7 @@ void x86_pmu_stop(struct perf_event *event, int flags) * Drain the remaining delta count out of a event * that we are disabling: */ - x86_perf_event_update(event); + static_call(x86_pmu_update)(event); hwc->state |= PERF_HES_UPTODATE; } } @@ -1700,7 +1703,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs) event = cpuc->events[idx]; - val = x86_perf_event_update(event); + val = static_call(x86_pmu_update)(event); if (val & (1ULL << (x86_pmu.cntval_bits - 1))) continue; @@ -1709,7 +1712,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs) */ handled++; - if (!x86_perf_event_set_period(event)) + if (!static_call(x86_pmu_set_period)(event)) continue; perf_sample_data_init(&data, 0, event->hw.last_period); @@ -2025,6 +2028,9 @@ static void x86_pmu_static_call_update(void) static_call_update(x86_pmu_del, x86_pmu.del); static_call_update(x86_pmu_read, x86_pmu.read); + static_call_update(x86_pmu_set_period, x86_pmu.set_period); + static_call_update(x86_pmu_update, x86_pmu.update); + static_call_update(x86_pmu_schedule_events, x86_pmu.schedule_events); static_call_update(x86_pmu_get_event_constraints, x86_pmu.get_event_constraints); 
static_call_update(x86_pmu_put_event_constraints, x86_pmu.put_event_constraints); @@ -2044,7 +2050,7 @@ static void x86_pmu_static_call_update(void) static void _x86_pmu_read(struct perf_event *event) { - x86_perf_event_update(event); + static_call(x86_pmu_update)(event); } void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed, @@ -2151,6 +2157,12 @@ static int __init init_hw_perf_events(void) if (!x86_pmu.guest_get_msrs) x86_pmu.guest_get_msrs = (void *)&__static_call_return0; + if (!x86_pmu.set_period) + x86_pmu.set_period = x86_perf_event_set_period; + + if (!x86_pmu.update) + x86_pmu.update = x86_perf_event_update; + x86_pmu_static_call_update(); /* diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 4a3dde24a1ae..7ae1a6c5368c 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -743,6 +743,8 @@ struct x86_pmu { void (*add)(struct perf_event *); void (*del)(struct perf_event *); void (*read)(struct perf_event *event); + int (*set_period)(struct perf_event *event); + u64 (*update)(struct perf_event *event); int (*hw_config)(struct perf_event *event); int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); unsigned eventsel; @@ -1042,6 +1044,9 @@ static struct perf_pmu_format_hybrid_attr format_attr_hybrid_##_name = {\ struct pmu *x86_get_pmu(unsigned int cpu); extern struct x86_pmu x86_pmu __read_mostly; +DECLARE_STATIC_CALL(x86_pmu_set_period, *x86_pmu.set_period); +DECLARE_STATIC_CALL(x86_pmu_update, *x86_pmu.update); + static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx) { if (static_cpu_has(X86_FEATURE_ARCH_LBR)) -- cgit v1.2.3-59-g8ed1b From e577bb17a1eaa35b86ee873a786e603be768d668 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 May 2022 21:28:06 +0200 Subject: perf/x86/intel: Move the topdown stuff into the intel driver Use the new x86_pmu::{set_period,update}() methods to push the topdown stuff into the Intel driver, where it belongs. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220829101321.505933457@infradead.org --- arch/x86/events/core.c | 7 ------- arch/x86/events/intel/core.c | 26 +++++++++++++++++++++++--- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index bb559b79ff23..b074e71bab21 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -119,9 +119,6 @@ u64 x86_perf_event_update(struct perf_event *event) if (unlikely(!hwc->event_base)) return 0; - if (unlikely(is_topdown_count(event)) && x86_pmu.update_topdown_event) - return x86_pmu.update_topdown_event(event); - /* * Careful: an NMI might modify the previous event value. 
* @@ -1373,10 +1370,6 @@ int x86_perf_event_set_period(struct perf_event *event) if (unlikely(!hwc->event_base)) return 0; - if (unlikely(is_topdown_count(event)) && - x86_pmu.set_topdown_event_period) - return x86_pmu.set_topdown_event_period(event); - /* * If we are way outside a reasonable range then just skip forward: */ diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index ba101c28dcc9..feed732fdf57 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2302,7 +2302,7 @@ static void intel_pmu_nhm_workaround(void) for (i = 0; i < 4; i++) { event = cpuc->events[i]; if (event) - x86_perf_event_update(event); + static_call(x86_pmu_update)(event); } for (i = 0; i < 4; i++) { @@ -2317,7 +2317,7 @@ static void intel_pmu_nhm_workaround(void) event = cpuc->events[i]; if (event) { - x86_perf_event_set_period(event); + static_call(x86_pmu_set_period)(event); __x86_pmu_enable_event(&event->hw, ARCH_PERFMON_EVENTSEL_ENABLE); } else @@ -2794,7 +2794,7 @@ static void intel_pmu_add_event(struct perf_event *event) */ int intel_pmu_save_and_restart(struct perf_event *event) { - x86_perf_event_update(event); + static_call(x86_pmu_update)(event); /* * For a checkpointed counter always reset back to 0. This * avoids a situation where the counter overflows, aborts the @@ -2806,9 +2806,27 @@ int intel_pmu_save_and_restart(struct perf_event *event) wrmsrl(event->hw.event_base, 0); local64_set(&event->hw.prev_count, 0); } + return static_call(x86_pmu_set_period)(event); +} + +static int intel_pmu_set_period(struct perf_event *event) +{ + if (unlikely(is_topdown_count(event)) && + x86_pmu.set_topdown_event_period) + return x86_pmu.set_topdown_event_period(event); + return x86_perf_event_set_period(event); } +static u64 intel_pmu_update(struct perf_event *event) +{ + if (unlikely(is_topdown_count(event)) && + x86_pmu.update_topdown_event) + return x86_pmu.update_topdown_event(event); + + return x86_perf_event_update(event); +} + static void intel_pmu_reset(void) { struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds); @@ -4786,6 +4804,8 @@ static __initconst const struct x86_pmu intel_pmu = { .add = intel_pmu_add_event, .del = intel_pmu_del_event, .read = intel_pmu_read_event, + .set_period = intel_pmu_set_period, + .update = intel_pmu_update, .hw_config = intel_pmu_hw_config, .schedule_events = x86_schedule_events, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, -- cgit v1.2.3-59-g8ed1b From 28f0f3c44b5c35be657a4f922dcdfb48285f4373 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 May 2022 21:28:25 +0200 Subject: perf/x86: Change x86_pmu::limit_period signature In preparation for making it a static_call, change the signature. 
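In sketch form, the shape of the change (condensed from the hunks below): the callback now clamps the caller's value through a pointer instead of returning a new one.

	/* Old shape: return a possibly-clamped copy of the period. */
	u64 (*limit_period)(struct perf_event *event, u64 left);

	/* New shape: adjust the caller's value in place. */
	void (*limit_period)(struct perf_event *event, s64 *left);

	/* Caller side, simplified: */
	s64 left = event->attr.sample_period;
	x86_pmu.limit_period(event, &left);
	if (left > event->attr.sample_period)
		return -EINVAL;

A void return is also what allows a later patch in this series to invoke the hook through static_call_cond(), which only works for void-returning functions.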
Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220829101321.573713839@infradead.org --- arch/x86/events/amd/core.c | 8 +++----- arch/x86/events/core.c | 13 ++++++++----- arch/x86/events/intel/core.c | 19 ++++++++----------- arch/x86/events/perf_event.h | 2 +- 4 files changed, 20 insertions(+), 22 deletions(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index bd99d2ae14c3..8b70237c33f7 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -1224,16 +1224,14 @@ static ssize_t amd_event_sysfs_show(char *page, u64 config) return x86_event_sysfs_show(page, config, event); } -static u64 amd_pmu_limit_period(struct perf_event *event, u64 left) +static void amd_pmu_limit_period(struct perf_event *event, s64 *left) { /* * Decrease period by the depth of the BRS feature to get the last N * taken branches and approximate the desired period */ - if (has_branch_stack(event) && left > x86_pmu.lbr_nr) - left -= x86_pmu.lbr_nr; - - return left; + if (has_branch_stack(event) && *left > x86_pmu.lbr_nr) + *left -= x86_pmu.lbr_nr; } static __initconst const struct x86_pmu amd_pmu = { diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index b074e71bab21..1e90bc7ca36f 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -621,8 +621,9 @@ int x86_pmu_hw_config(struct perf_event *event) event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK; if (event->attr.sample_period && x86_pmu.limit_period) { - if (x86_pmu.limit_period(event, event->attr.sample_period) > - event->attr.sample_period) + s64 left = event->attr.sample_period; + x86_pmu.limit_period(event, &left); + if (left > event->attr.sample_period) return -EINVAL; } @@ -1396,9 +1397,9 @@ int x86_perf_event_set_period(struct perf_event *event) left = x86_pmu.max_period; if (x86_pmu.limit_period) - left = x86_pmu.limit_period(event, left); + x86_pmu.limit_period(event, &left); - per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; + this_cpu_write(pmc_prev_left[idx], left); /* * The hw event starts counting from this event offset, @@ -2677,7 +2678,9 @@ static int x86_pmu_check_period(struct perf_event *event, u64 value) return -EINVAL; if (value && x86_pmu.limit_period) { - if (x86_pmu.limit_period(event, value) > value) + s64 left = value; + x86_pmu.limit_period(event, &left); + if (left > value) return -EINVAL; } diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index feed732fdf57..92cc390590d1 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4344,28 +4344,25 @@ static u8 adl_get_hybrid_cpu_type(void) * Therefore the effective (average) period matches the requested period, * despite coarser hardware granularity. 
*/ -static u64 bdw_limit_period(struct perf_event *event, u64 left) +static void bdw_limit_period(struct perf_event *event, s64 *left) { if ((event->hw.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0xc0, .umask=0x01)) { - if (left < 128) - left = 128; - left &= ~0x3fULL; + if (*left < 128) + *left = 128; + *left &= ~0x3fULL; } - return left; } -static u64 nhm_limit_period(struct perf_event *event, u64 left) +static void nhm_limit_period(struct perf_event *event, s64 *left) { - return max(left, 32ULL); + *left = max(*left, 32LL); } -static u64 spr_limit_period(struct perf_event *event, u64 left) +static void spr_limit_period(struct perf_event *event, s64 *left) { if (event->attr.precise_ip == 3) - return max(left, 128ULL); - - return left; + *left = max(*left, 128LL); } PMU_FORMAT_ATTR(event, "config:0-7" ); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 7ae1a6c5368c..e82d2d212534 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -781,7 +781,7 @@ struct x86_pmu { struct event_constraint *event_constraints; struct x86_pmu_quirk *quirks; int perfctr_second_write; - u64 (*limit_period)(struct perf_event *event, u64 l); + void (*limit_period)(struct perf_event *event, s64 *l); /* PMI handler bits */ unsigned int late_ack :1, -- cgit v1.2.3-59-g8ed1b From 08b3068fab207e3c7d79799d434e1d648524cac6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 10 May 2022 21:28:18 +0200 Subject: perf/x86: Add a x86_pmu::limit_period static_call Avoid a branch and indirect call. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220829101321.640658334@infradead.org --- arch/x86/events/core.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 1e90bc7ca36f..05830bb77d40 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -72,8 +72,9 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_add, *x86_pmu.add); DEFINE_STATIC_CALL_NULL(x86_pmu_del, *x86_pmu.del); DEFINE_STATIC_CALL_NULL(x86_pmu_read, *x86_pmu.read); -DEFINE_STATIC_CALL_NULL(x86_pmu_set_period, *x86_pmu.set_period); -DEFINE_STATIC_CALL_NULL(x86_pmu_update, *x86_pmu.update); +DEFINE_STATIC_CALL_NULL(x86_pmu_set_period, *x86_pmu.set_period); +DEFINE_STATIC_CALL_NULL(x86_pmu_update, *x86_pmu.update); +DEFINE_STATIC_CALL_NULL(x86_pmu_limit_period, *x86_pmu.limit_period); DEFINE_STATIC_CALL_NULL(x86_pmu_schedule_events, *x86_pmu.schedule_events); DEFINE_STATIC_CALL_NULL(x86_pmu_get_event_constraints, *x86_pmu.get_event_constraints); @@ -1396,8 +1397,7 @@ int x86_perf_event_set_period(struct perf_event *event) if (left > x86_pmu.max_period) left = x86_pmu.max_period; - if (x86_pmu.limit_period) - x86_pmu.limit_period(event, &left); + static_call_cond(x86_pmu_limit_period)(event, &left); this_cpu_write(pmc_prev_left[idx], left); @@ -2024,6 +2024,7 @@ static void x86_pmu_static_call_update(void) static_call_update(x86_pmu_set_period, x86_pmu.set_period); static_call_update(x86_pmu_update, x86_pmu.update); + static_call_update(x86_pmu_limit_period, x86_pmu.limit_period); static_call_update(x86_pmu_schedule_events, x86_pmu.schedule_events); static_call_update(x86_pmu_get_event_constraints, x86_pmu.get_event_constraints); -- cgit v1.2.3-59-g8ed1b From 2368516731901c391826f7cf23516173193652fa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 May 2022 16:41:25 +0200 Subject: perf/x86/intel: Remove x86_pmu::set_topdown_event_period Now that it is all internal to the intel driver, remove 
x86_pmu::set_topdown_event_period. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220829101321.706354189@infradead.org --- arch/x86/events/intel/core.c | 16 ++++++++++------ arch/x86/events/perf_event.h | 1 - 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 92cc390590d1..75400ed0eac3 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2521,6 +2521,8 @@ static int adl_set_topdown_event_period(struct perf_event *event) return icl_set_topdown_event_period(event); } +DEFINE_STATIC_CALL(intel_pmu_set_topdown_event_period, x86_perf_event_set_period); + static inline u64 icl_get_metrics_event_value(u64 metric, u64 slots, int idx) { u32 val; @@ -2811,9 +2813,8 @@ int intel_pmu_save_and_restart(struct perf_event *event) static int intel_pmu_set_period(struct perf_event *event) { - if (unlikely(is_topdown_count(event)) && - x86_pmu.set_topdown_event_period) - return x86_pmu.set_topdown_event_period(event); + if (unlikely(is_topdown_count(event))) + return static_call(intel_pmu_set_topdown_event_period)(event); return x86_perf_event_set_period(event); } @@ -6292,7 +6293,8 @@ __init int intel_pmu_init(void) intel_pmu_pebs_data_source_skl(pmem); x86_pmu.num_topdown_events = 4; x86_pmu.update_topdown_event = icl_update_topdown_event; - x86_pmu.set_topdown_event_period = icl_set_topdown_event_period; + static_call_update(intel_pmu_set_topdown_event_period, + &icl_set_topdown_event_period); pr_cont("Icelake events, "); name = "icelake"; break; @@ -6330,7 +6332,8 @@ __init int intel_pmu_init(void) intel_pmu_pebs_data_source_skl(pmem); x86_pmu.num_topdown_events = 8; x86_pmu.update_topdown_event = icl_update_topdown_event; - x86_pmu.set_topdown_event_period = icl_set_topdown_event_period; + static_call_update(intel_pmu_set_topdown_event_period, + &icl_set_topdown_event_period); pr_cont("Sapphire Rapids events, "); name = "sapphire_rapids"; break; @@ -6367,7 +6370,8 @@ __init int intel_pmu_init(void) x86_pmu.pebs_latency_data = adl_latency_data_small; x86_pmu.num_topdown_events = 8; x86_pmu.update_topdown_event = adl_update_topdown_event; - x86_pmu.set_topdown_event_period = adl_set_topdown_event_period; + static_call_update(intel_pmu_set_topdown_event_period, + &adl_set_topdown_event_period); x86_pmu.filter_match = intel_pmu_filter_match; x86_pmu.get_event_constraints = adl_get_event_constraints; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index e82d2d212534..be27eadc40a3 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -890,7 +890,6 @@ struct x86_pmu { */ int num_topdown_events; u64 (*update_topdown_event)(struct perf_event *event); - int (*set_topdown_event_period)(struct perf_event *event); /* * perf task context (i.e. struct perf_event_context::task_ctx_data) -- cgit v1.2.3-59-g8ed1b From 1acab2e01c9c5df00f2fddf3473014dea89dcb5f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 May 2022 17:02:05 +0200 Subject: perf/x86/intel: Remove x86_pmu::update_topdown_event Now that it is all internal to the intel driver, remove x86_pmu::update_topdown_event. Assumes that is_topdown_count(event) can only be true when the hardware has topdown stuff and the function is set. 
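The resulting pattern, condensed from the hunks in this and the previous patch (all identifiers below appear there): a static_call with a safe default target, retargeted only on CPU models that actually have the topdown machinery.

	/* Default target: the generic counter update. */
	DEFINE_STATIC_CALL(intel_pmu_update_topdown_event, x86_perf_event_update);

	/* Model-specific init, e.g. Icelake: */
	static_call_update(intel_pmu_update_topdown_event, &icl_update_topdown_event);

	/* Call site: */
	if (unlikely(is_topdown_count(event)))
		return static_call(intel_pmu_update_topdown_event)(event);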
Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220829101321.771635301@infradead.org --- arch/x86/events/intel/core.c | 22 ++++++++++++---------- arch/x86/events/perf_event.h | 1 - 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 75400ed0eac3..1e429e86a7f4 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2673,6 +2673,7 @@ static u64 adl_update_topdown_event(struct perf_event *event) return icl_update_topdown_event(event); } +DEFINE_STATIC_CALL(intel_pmu_update_topdown_event, x86_perf_event_update); static void intel_pmu_read_topdown_event(struct perf_event *event) { @@ -2684,7 +2685,7 @@ static void intel_pmu_read_topdown_event(struct perf_event *event) return; perf_pmu_disable(event->pmu); - x86_pmu.update_topdown_event(event); + static_call(intel_pmu_update_topdown_event)(event); perf_pmu_enable(event->pmu); } @@ -2692,7 +2693,7 @@ static void intel_pmu_read_event(struct perf_event *event) { if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) intel_pmu_auto_reload_read(event); - else if (is_topdown_count(event) && x86_pmu.update_topdown_event) + else if (is_topdown_count(event)) intel_pmu_read_topdown_event(event); else x86_perf_event_update(event); @@ -2821,9 +2822,8 @@ static int intel_pmu_set_period(struct perf_event *event) static u64 intel_pmu_update(struct perf_event *event) { - if (unlikely(is_topdown_count(event)) && - x86_pmu.update_topdown_event) - return x86_pmu.update_topdown_event(event); + if (unlikely(is_topdown_count(event))) + return static_call(intel_pmu_update_topdown_event)(event); return x86_perf_event_update(event); } @@ -2990,8 +2990,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) */ if (__test_and_clear_bit(GLOBAL_STATUS_PERF_METRICS_OVF_BIT, (unsigned long *)&status)) { handled++; - if (x86_pmu.update_topdown_event) - x86_pmu.update_topdown_event(NULL); + static_call(intel_pmu_update_topdown_event)(NULL); } /* @@ -6292,7 +6291,8 @@ __init int intel_pmu_init(void) x86_pmu.lbr_pt_coexist = true; intel_pmu_pebs_data_source_skl(pmem); x86_pmu.num_topdown_events = 4; - x86_pmu.update_topdown_event = icl_update_topdown_event; + static_call_update(intel_pmu_update_topdown_event, + &icl_update_topdown_event); static_call_update(intel_pmu_set_topdown_event_period, &icl_set_topdown_event_period); pr_cont("Icelake events, "); @@ -6331,7 +6331,8 @@ __init int intel_pmu_init(void) x86_pmu.lbr_pt_coexist = true; intel_pmu_pebs_data_source_skl(pmem); x86_pmu.num_topdown_events = 8; - x86_pmu.update_topdown_event = icl_update_topdown_event; + static_call_update(intel_pmu_update_topdown_event, + &icl_update_topdown_event); static_call_update(intel_pmu_set_topdown_event_period, &icl_set_topdown_event_period); pr_cont("Sapphire Rapids events, "); @@ -6369,7 +6370,8 @@ __init int intel_pmu_init(void) intel_pmu_pebs_data_source_adl(); x86_pmu.pebs_latency_data = adl_latency_data_small; x86_pmu.num_topdown_events = 8; - x86_pmu.update_topdown_event = adl_update_topdown_event; + static_call_update(intel_pmu_update_topdown_event, + &adl_update_topdown_event); static_call_update(intel_pmu_set_topdown_event_period, &adl_set_topdown_event_period); diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index be27eadc40a3..386ebfa416da 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -889,7 +889,6 @@ struct x86_pmu { * Intel perf metrics */ int num_topdown_events; - u64 
(*update_topdown_event)(struct perf_event *event); /* * perf task context (i.e. struct perf_event_context::task_ctx_data) -- cgit v1.2.3-59-g8ed1b From dbf4e792beadafc684ef455453c613ff182c7723 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 20 May 2022 15:38:43 +0200 Subject: perf/x86/p4: Remove perfctr_second_write quirk Now that we have a x86_pmu::set_period() method, use it to remove the perfctr_second_write quirk from the generic code. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220829101321.839502514@infradead.org --- arch/x86/events/core.c | 12 +----------- arch/x86/events/intel/p4.c | 37 +++++++++++++++++++++++++++---------- arch/x86/events/perf_event.h | 2 +- 3 files changed, 29 insertions(+), 22 deletions(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 05830bb77d40..b30b8bbcd1e2 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1356,7 +1356,7 @@ static void x86_pmu_enable(struct pmu *pmu) static_call(x86_pmu_enable_all)(added); } -static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); +DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); /* * Set the next IRQ period, based on the hwc->period_left value. @@ -1416,16 +1416,6 @@ int x86_perf_event_set_period(struct perf_event *event) if (is_counter_pair(hwc)) wrmsrl(x86_pmu_event_addr(idx + 1), 0xffff); - /* - * Due to erratum on certan cpu we need - * a second write to be sure the register - * is updated properly - */ - if (x86_pmu.perfctr_second_write) { - wrmsrl(hwc->event_base, - (u64)(-left) & x86_pmu.cntval_mask); - } - perf_event_update_userpage(event); return ret; diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c index 7951a5dc73b6..03bbcc2fa2ff 100644 --- a/arch/x86/events/intel/p4.c +++ b/arch/x86/events/intel/p4.c @@ -1006,6 +1006,29 @@ static void p4_pmu_enable_all(int added) } } +static int p4_pmu_set_period(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + s64 left = this_cpu_read(pmc_prev_left[hwc->idx]); + int ret; + + ret = x86_perf_event_set_period(event); + + if (hwc->event_base) { + /* + * This handles erratum N15 in intel doc 249199-029, + * the counter may not be updated correctly on write + * so we need a second write operation to do the trick + * (the official workaround didn't work) + * + * the former idea is taken from OProfile code + */ + wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); + } + + return ret; +} + static int p4_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; @@ -1044,7 +1067,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) /* event overflow for sure */ perf_sample_data_init(&data, 0, hwc->last_period); - if (!x86_perf_event_set_period(event)) + if (!static_call(x86_pmu_set_period)(event)) continue; @@ -1316,6 +1339,9 @@ static __initconst const struct x86_pmu p4_pmu = { .enable_all = p4_pmu_enable_all, .enable = p4_pmu_enable_event, .disable = p4_pmu_disable_event, + + .set_period = p4_pmu_set_period, + .eventsel = MSR_P4_BPU_CCCR0, .perfctr = MSR_P4_BPU_PERFCTR0, .event_map = p4_pmu_event_map, @@ -1334,15 +1360,6 @@ static __initconst const struct x86_pmu p4_pmu = { .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1, .hw_config = p4_hw_config, .schedule_events = p4_pmu_schedule_events, - /* - * This handles erratum N15 in intel doc 249199-029, - * the counter may not be updated correctly on write - * so we need a second write operation to do the trick - * (the official workaround didn't work) - * - * the former idea is taken 
from OProfile code - */ - .perfctr_second_write = 1, .format_attrs = intel_p4_formats_attr, }; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 386ebfa416da..20c2ee2128fe 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -780,7 +780,6 @@ struct x86_pmu { struct event_constraint *event_constraints; struct x86_pmu_quirk *quirks; - int perfctr_second_write; void (*limit_period)(struct perf_event *event, s64 *l); /* PMI handler bits */ @@ -1060,6 +1059,7 @@ static inline bool x86_pmu_has_lbr_callstack(void) } DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events); +DECLARE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); int x86_perf_event_set_period(struct perf_event *event); -- cgit v1.2.3-59-g8ed1b From fae9ebde9696385fa2e993e752cf68d9781f3ea0 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 4 Aug 2022 07:07:29 -0700 Subject: perf/x86/intel: Optimize FIXED_CTR_CTRL access All the fixed counters share a fixed control register. Currently, perf reads and re-writes the fixed control register for each fixed counter disable/enable, which is unnecessary. When changing the fixed control register, the entire PMU must be disabled via the global control register, and the change cannot take effect until the entire PMU is re-enabled. It is therefore enough to update the fixed control register once, right before the entire PMU is re-enabled. The read of the fixed control register is not necessary either: the value can be cached in the per-CPU cpu_hw_events. Test results: counting all the fixed counters with perf bench sched pipe, as below, on an SPR machine. $perf stat -e cycles,instructions,ref-cycles,slots --no-inherit -- taskset -c 1 perf bench sched pipe The total elapsed time reduces from 5.36s (without the patch) to 4.99s (with the patch), which is a ~6.9% improvement.
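As a quick sanity check of the quoted numbers: (5.36 s - 4.99 s) / 5.36 s ≈ 0.069, which matches the ~6.9% reduction in elapsed time.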
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220804140729.2951259-1-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 22 +++++++++++++--------- arch/x86/events/perf_event.h | 4 ++++ 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 1e429e86a7f4..7f4e7e6b45f0 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2190,6 +2190,12 @@ static void __intel_pmu_enable_all(int added, bool pmi) u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl); intel_pmu_lbr_enable_all(pmi); + + if (cpuc->fixed_ctrl_val != cpuc->active_fixed_ctrl_val) { + wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, cpuc->fixed_ctrl_val); + cpuc->active_fixed_ctrl_val = cpuc->fixed_ctrl_val; + } + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, intel_ctrl & ~cpuc->intel_ctrl_guest_mask); @@ -2407,9 +2413,10 @@ static inline void intel_clear_masks(struct perf_event *event, int idx) static void intel_pmu_disable_fixed(struct perf_event *event) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - u64 ctrl_val, mask; int idx = hwc->idx; + u64 mask; if (is_topdown_idx(idx)) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -2426,9 +2433,7 @@ static void intel_pmu_disable_fixed(struct perf_event *event) intel_clear_masks(event, idx); mask = 0xfULL << ((idx - INTEL_PMC_IDX_FIXED) * 4); - rdmsrl(hwc->config_base, ctrl_val); - ctrl_val &= ~mask; - wrmsrl(hwc->config_base, ctrl_val); + cpuc->fixed_ctrl_val &= ~mask; } static void intel_pmu_disable_event(struct perf_event *event) @@ -2701,8 +2706,9 @@ static void intel_pmu_read_event(struct perf_event *event) static void intel_pmu_enable_fixed(struct perf_event *event) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - u64 ctrl_val, mask, bits = 0; + u64 mask, bits = 0; int idx = hwc->idx; if (is_topdown_idx(idx)) { @@ -2746,10 +2752,8 @@ static void intel_pmu_enable_fixed(struct perf_event *event) mask |= ICL_FIXED_0_ADAPTIVE << (idx * 4); } - rdmsrl(hwc->config_base, ctrl_val); - ctrl_val &= ~mask; - ctrl_val |= bits; - wrmsrl(hwc->config_base, ctrl_val); + cpuc->fixed_ctrl_val &= ~mask; + cpuc->fixed_ctrl_val |= bits; } static void intel_pmu_enable_event(struct perf_event *event) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 20c2ee2128fe..371967054871 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -270,6 +270,10 @@ struct cpu_hw_events { u64 active_pebs_data_cfg; int pebs_record_size; + /* Intel Fixed counter configuration */ + u64 fixed_ctrl_val; + u64 active_fixed_ctrl_val; + /* * Intel LBR bits */ -- cgit v1.2.3-59-g8ed1b From 3749d33e510c3dc695b3a5886b706310890d7ebd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 8 Sep 2022 14:41:02 -0700 Subject: perf: Use sample_flags for callchain So that it can call perf_callchain() only if needed. Historically it used __PERF_SAMPLE_CALLCHAIN_EARLY but we can do that with sample_flags in the struct perf_sample_data. 
Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220908214104.3851807-1-namhyung@kernel.org --- arch/x86/events/amd/ibs.c | 4 +++- arch/x86/events/intel/ds.c | 8 ++++++-- kernel/events/core.c | 2 +- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index c251bc44c088..dab094166693 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -798,8 +798,10 @@ fail: * recorded as part of interrupt regs. Thus we need to use rip from * interrupt regs while unwinding call stack. */ - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { data.callchain = perf_callchain(event, iregs); + data.sample_flags |= PERF_SAMPLE_CALLCHAIN; + } throttle = perf_event_overflow(event, &data, ®s); out: diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index a5275c235c2a..4ba6ab6d0d92 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1546,8 +1546,10 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, * previous PMI context or an (I)RET happened between the record and * PMI. */ - if (sample_type & PERF_SAMPLE_CALLCHAIN) + if (sample_type & PERF_SAMPLE_CALLCHAIN) { data->callchain = perf_callchain(event, iregs); + data->sample_flags |= PERF_SAMPLE_CALLCHAIN; + } /* * We use the interrupt regs as a base because the PEBS record does not @@ -1719,8 +1721,10 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, * previous PMI context or an (I)RET happened between the record and * PMI. */ - if (sample_type & PERF_SAMPLE_CALLCHAIN) + if (sample_type & PERF_SAMPLE_CALLCHAIN) { data->callchain = perf_callchain(event, iregs); + data->sample_flags |= PERF_SAMPLE_CALLCHAIN; + } *regs = *iregs; /* The ip in basic is EventingIP */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 3e90e454b995..c98ecf3e09ba 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7320,7 +7320,7 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_CALLCHAIN) { int size = 1; - if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN) data->callchain = perf_callchain(event, regs); size += data->callchain->nr; -- cgit v1.2.3-59-g8ed1b From 16817ad7e8b31728b44ff9f17d8d894ed8a450d0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 8 Sep 2022 14:41:03 -0700 Subject: perf/bpf: Always use perf callchains if exist If the perf_event has PERF_SAMPLE_CALLCHAIN, BPF can use it for stack trace. The problematic cases like PEBS and IBS already handled in the PMU driver and they filled the callchain info in the sample data. For others, we can call perf_callchain() before the BPF handler. 
Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220908214104.3851807-2-namhyung@kernel.org --- kernel/bpf/stackmap.c | 4 ++-- kernel/events/core.c | 12 ++++++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 1adbe67cdb95..aecea7451b61 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -338,7 +338,7 @@ BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx, int ret; /* perf_sample_data doesn't have callchain, use bpf_get_stackid */ - if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) return bpf_get_stackid((unsigned long)(ctx->regs), (unsigned long) map, flags, 0, 0); @@ -506,7 +506,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, int err = -EINVAL; __u64 nr_kernel; - if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) return __bpf_get_stack(regs, NULL, NULL, buf, size, flags); if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | diff --git a/kernel/events/core.c b/kernel/events/core.c index c98ecf3e09ba..7da5515082f9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10000,8 +10000,16 @@ static void bpf_overflow_handler(struct perf_event *event, goto out; rcu_read_lock(); prog = READ_ONCE(event->prog); - if (prog) + if (prog) { + if (prog->call_get_stack && + (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) && + !(data->sample_flags & PERF_SAMPLE_CALLCHAIN)) { + data->callchain = perf_callchain(event, regs); + data->sample_flags |= PERF_SAMPLE_CALLCHAIN; + } + ret = bpf_prog_run(prog, &ctx); + } rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); @@ -10027,7 +10035,7 @@ static int perf_event_set_bpf_handler(struct perf_event *event, if (event->attr.precise_ip && prog->call_get_stack && - (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) || + (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) || event->attr.exclude_callchain_kernel || event->attr.exclude_callchain_user)) { /* -- cgit v1.2.3-59-g8ed1b From b4e12b2d70fd9eccdb3cef8015dc1788ca38e3fd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 8 Sep 2022 14:41:04 -0700 Subject: perf: Kill __PERF_SAMPLE_CALLCHAIN_EARLY There's no in-tree user anymore. Let's get rid of it. Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220908214104.3851807-3-namhyung@kernel.org --- arch/x86/events/amd/ibs.c | 10 ---------- arch/x86/events/intel/core.c | 3 --- include/uapi/linux/perf_event.h | 2 -- 3 files changed, 15 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index dab094166693..ce5720bfb350 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -300,16 +300,6 @@ static int perf_ibs_init(struct perf_event *event) hwc->config_base = perf_ibs->msr; hwc->config = config; - /* - * rip recorded by IbsOpRip will not be consistent with rsp and rbp - * recorded as part of interrupt regs. Thus we need to use rip from - * interrupt regs while unwinding call stack. Setting _EARLY flag - * makes sure we unwind call-stack before perf sample rip is set to - * IbsOpRip. 
- */ - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY; - return 0; } diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 7f4e7e6b45f0..b16c91ac9219 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3868,9 +3868,6 @@ static int intel_pmu_hw_config(struct perf_event *event) } if (x86_pmu.pebs_aliases) x86_pmu.pebs_aliases(event); - - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY; } if (needs_branch_stack(event)) { diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index dca16582885f..e639c74cf5fb 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -164,8 +164,6 @@ enum perf_event_sample_format { PERF_SAMPLE_WEIGHT_STRUCT = 1U << 24, PERF_SAMPLE_MAX = 1U << 25, /* non-ABI */ - - __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */ }; #define PERF_SAMPLE_WEIGHT_TYPE (PERF_SAMPLE_WEIGHT | PERF_SAMPLE_WEIGHT_STRUCT) -- cgit v1.2.3-59-g8ed1b From dca6344d7a77dd0501a73745f4a9fb1ee2bc9d7c Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Sun, 18 Sep 2022 00:41:08 +0100 Subject: perf/core: Convert snprintf() to scnprintf() Coccinelle reports a warning: WARNING: use scnprintf or sprintf This LWN article explains the rationale for this change: https: //lwn.net/Articles/69419/ Ie. snprintf() returns what *would* be the resulting length, while scnprintf() returns the actual length. Adding to that, there has also been some slow migration from snprintf to scnprintf, here's the shift in usage in the past 3.5 years, in all fs/ files: v5.0 v6.0-rc6 -------------------------------------- snprintf() uses: 63 213 scnprintf() uses: 374 186 No intended change in behavior. [ mingo: Improved the changelog & reviewed the usage sites. ] Signed-off-by: Jules Irenge Signed-off-by: Ingo Molnar --- kernel/events/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 7da5515082f9..c07e9a3ea94c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10952,7 +10952,7 @@ static ssize_t nr_addr_filters_show(struct device *dev, { struct pmu *pmu = dev_get_drvdata(dev); - return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); + return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); } DEVICE_ATTR_RO(nr_addr_filters); @@ -10963,7 +10963,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page) { struct pmu *pmu = dev_get_drvdata(dev); - return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); + return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->type); } static DEVICE_ATTR_RO(type); @@ -10974,7 +10974,7 @@ perf_event_mux_interval_ms_show(struct device *dev, { struct pmu *pmu = dev_get_drvdata(dev); - return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms); + return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->hrtimer_interval_ms); } static DEFINE_MUTEX(mux_interval_mutex); -- cgit v1.2.3-59-g8ed1b From 7b084630153152239d84990ac4540c2dd360186f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 21 Sep 2022 15:00:31 -0700 Subject: perf: Use sample_flags for addr Use the new sample_flags to indicate whether the addr field is filled by the PMU driver. As most PMU drivers pass 0, it can set the flag only if it has a non-zero value. And use 0 in perf_sample_output() if it's not filled already. 
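The convention being established here, distilled into a small sketch (the helper name is invented; the field and flag are the real ones): an address of zero from a PMU driver means "not provided", so the flag is only raised for a real address and the common zero case stays free.

static inline void example_init_addr(struct perf_sample_data *data, u64 addr)
{
        /* Most call sites pass 0; only a real address marks the field valid. */
        if (addr) {
                data->addr = addr;
                data->sample_flags |= PERF_SAMPLE_ADDR;
        }
}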
Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220921220032.2858517-1-namhyung@kernel.org --- arch/x86/events/intel/ds.c | 8 ++++++-- include/linux/perf_event.h | 8 ++++++-- kernel/events/core.c | 5 +++++ 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 4ba6ab6d0d92..d2e9ff16f6ed 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1621,8 +1621,10 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, if ((sample_type & PERF_SAMPLE_ADDR_TYPE) && - x86_pmu.intel_cap.pebs_format >= 1) + x86_pmu.intel_cap.pebs_format >= 1) { data->addr = pebs->dla; + data->sample_flags |= PERF_SAMPLE_ADDR; + } if (x86_pmu.intel_cap.pebs_format >= 2) { /* Only set the TSX weight when no memory weight. */ @@ -1783,8 +1785,10 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, data->sample_flags |= PERF_SAMPLE_DATA_SRC; } - if (sample_type & PERF_SAMPLE_ADDR_TYPE) + if (sample_type & PERF_SAMPLE_ADDR_TYPE) { data->addr = meminfo->address; + data->sample_flags |= PERF_SAMPLE_ADDR; + } if (sample_type & PERF_SAMPLE_TRANSACTION) { data->txn = intel_get_tsx_transaction(meminfo->tsx_tuning, diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 368bdc4f563f..f4a13579b0e8 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1028,7 +1028,6 @@ struct perf_sample_data { * minimize the cachelines touched. */ u64 sample_flags; - u64 addr; struct perf_raw_record *raw; u64 period; @@ -1040,6 +1039,7 @@ struct perf_sample_data { union perf_sample_weight weight; union perf_mem_data_src data_src; u64 txn; + u64 addr; u64 type; u64 ip; @@ -1079,9 +1079,13 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, { /* remaining struct members initialized in perf_prepare_sample() */ data->sample_flags = 0; - data->addr = addr; data->raw = NULL; data->period = period; + + if (addr) { + data->addr = addr; + data->sample_flags |= PERF_SAMPLE_ADDR; + } } /* diff --git a/kernel/events/core.c b/kernel/events/core.c index c07e9a3ea94c..a91f74db9fe9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7414,6 +7414,11 @@ void perf_prepare_sample(struct perf_event_header *header, if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) data->txn = 0; + if (sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_DATA_PAGE_SIZE)) { + if (filtered_sample_type & PERF_SAMPLE_ADDR) + data->addr = 0; + } + if (sample_type & PERF_SAMPLE_REGS_INTR) { /* regs dump ABI info */ int size = sizeof(u64); -- cgit v1.2.3-59-g8ed1b From 838d9bb62d132ec3baf1b5aba2e95ef9a7a9a3cd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 21 Sep 2022 15:00:32 -0700 Subject: perf: Use sample_flags for raw_data Use the new sample_flags to indicate whether the raw data field is filled by the PMU driver. Although it could check with the NULL, follow the same rule with other fields. Remove the raw field from the perf_sample_data_init() to minimize the number of cache lines touched. 
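A minimal sketch of the producer-side pattern the series settles on (the surrounding handler is invented; perf_raw_record and the flag are real): drivers that hand over raw data both point ->raw at their record and raise PERF_SAMPLE_RAW, so the core no longer has to rely on a NULL check that would require clearing the field on every init.

static void example_push_raw(struct perf_event *event,
                             struct perf_sample_data *data,
                             void *buf, u32 len, struct pt_regs *regs)
{
        struct perf_raw_record raw = {
                .frag = {
                        .size = len,
                        .data = buf,
                },
        };

        if (event->attr.sample_type & PERF_SAMPLE_RAW) {
                data->raw = &raw;
                data->sample_flags |= PERF_SAMPLE_RAW;
        }

        perf_event_overflow(event, data, regs);
}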
Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220921220032.2858517-2-namhyung@kernel.org --- arch/s390/kernel/perf_cpum_cf.c | 1 + arch/s390/kernel/perf_pai_crypto.c | 1 + arch/x86/events/amd/ibs.c | 1 + include/linux/perf_event.h | 5 ++--- kernel/events/core.c | 3 ++- 5 files changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index f7dd3c849e68..f043a7ff220b 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -664,6 +664,7 @@ static int cfdiag_push_sample(struct perf_event *event, raw.frag.data = cpuhw->stop; raw.size = raw.frag.size; data.raw = &raw; + data.sample_flags |= PERF_SAMPLE_RAW; } overflow = perf_event_overflow(event, &data, ®s); diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c index b38b4ae01589..6826e2a69a21 100644 --- a/arch/s390/kernel/perf_pai_crypto.c +++ b/arch/s390/kernel/perf_pai_crypto.c @@ -366,6 +366,7 @@ static int paicrypt_push_sample(void) raw.frag.data = cpump->save; raw.size = raw.frag.size; data.raw = &raw; + data.sample_flags |= PERF_SAMPLE_RAW; } overflow = perf_event_overflow(event, &data, ®s); diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index ce5720bfb350..c29a006954c7 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -781,6 +781,7 @@ fail: }, }; data.raw = &raw; + data.sample_flags |= PERF_SAMPLE_RAW; } /* diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f4a13579b0e8..e9b151cde491 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1028,7 +1028,6 @@ struct perf_sample_data { * minimize the cachelines touched. */ u64 sample_flags; - struct perf_raw_record *raw; u64 period; /* @@ -1040,6 +1039,7 @@ struct perf_sample_data { union perf_mem_data_src data_src; u64 txn; u64 addr; + struct perf_raw_record *raw; u64 type; u64 ip; @@ -1078,8 +1078,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr, u64 period) { /* remaining struct members initialized in perf_prepare_sample() */ - data->sample_flags = 0; - data->raw = NULL; + data->sample_flags = PERF_SAMPLE_PERIOD; data->period = period; if (addr) { diff --git a/kernel/events/core.c b/kernel/events/core.c index a91f74db9fe9..04e19a857d4b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7332,7 +7332,7 @@ void perf_prepare_sample(struct perf_event_header *header, struct perf_raw_record *raw = data->raw; int size; - if (raw) { + if (raw && (data->sample_flags & PERF_SAMPLE_RAW)) { struct perf_raw_frag *frag = &raw->frag; u32 sum = 0; @@ -7348,6 +7348,7 @@ void perf_prepare_sample(struct perf_event_header *header, frag->pad = raw->size - sum; } else { size = sizeof(u64); + data->raw = NULL; } header->size += size; -- cgit v1.2.3-59-g8ed1b From 4674ffe2fcad45a9b164401cc0794115702326cf Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 27 Sep 2022 19:20:25 +0200 Subject: perf, hw_breakpoint: Fix use-after-free if perf_event_open() fails Local testing revealed that we can trigger a use-after-free during rhashtable lookup as follows: | BUG: KASAN: use-after-free in memcmp lib/string.c:757 | Read of size 8 at addr ffff888107544dc0 by task perf-rhltable-n/1293 | | CPU: 0 PID: 1293 Comm: perf-rhltable-n Not tainted 6.0.0-rc3-00014-g85260862789c #46 | Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-debian-1.16.0-4 04/01/2014 | Call Trace: | | memcmp lib/string.c:757 
| rhashtable_compare include/linux/rhashtable.h:577 [inline] | __rhashtable_lookup include/linux/rhashtable.h:602 [inline] | rhltable_lookup include/linux/rhashtable.h:688 [inline] | task_bp_pinned kernel/events/hw_breakpoint.c:324 | toggle_bp_slot kernel/events/hw_breakpoint.c:462 | __release_bp_slot kernel/events/hw_breakpoint.c:631 [inline] | release_bp_slot kernel/events/hw_breakpoint.c:639 | register_perf_hw_breakpoint kernel/events/hw_breakpoint.c:742 | hw_breakpoint_event_init kernel/events/hw_breakpoint.c:976 | perf_try_init_event kernel/events/core.c:11261 | perf_init_event kernel/events/core.c:11325 [inline] | perf_event_alloc kernel/events/core.c:11619 | __do_sys_perf_event_open kernel/events/core.c:12157 | do_syscall_x64 arch/x86/entry/common.c:50 [inline] | do_syscall_64 arch/x86/entry/common.c:80 | entry_SYSCALL_64_after_hwframe | | | Allocated by task 1292: | perf_event_alloc kernel/events/core.c:11505 | __do_sys_perf_event_open kernel/events/core.c:12157 | do_syscall_x64 arch/x86/entry/common.c:50 [inline] | do_syscall_64 arch/x86/entry/common.c:80 | entry_SYSCALL_64_after_hwframe | | Freed by task 1292: | perf_event_alloc kernel/events/core.c:11716 | __do_sys_perf_event_open kernel/events/core.c:12157 | do_syscall_x64 arch/x86/entry/common.c:50 [inline] | do_syscall_64 arch/x86/entry/common.c:80 | entry_SYSCALL_64_after_hwframe | | The buggy address belongs to the object at ffff888107544c00 | which belongs to the cache perf_event of size 1352 | The buggy address is located 448 bytes inside of | 1352-byte region [ffff888107544c00, ffff888107545148) This happens because the first perf_event_open() managed to reserve a HW breakpoint slot, however, later fails for other reasons and returns. The second perf_event_open() runs concurrently, and during rhltable_lookup() looks up an entry which is being freed: since rhltable_lookup() may run concurrently (under the RCU read lock) with rhltable_remove(), we may end up with a stale entry, for which memory may also have already been freed when being accessed. To fix, only free the failed perf_event after an RCU grace period. This allows subsystems that store references to an event to always access it concurrently under the RCU read lock, even if initialization will fail. Given failure is unlikely and a slow-path, turning the immediate free into a call_rcu()-wrapped free does not affect performance elsewhere. Fixes: 0370dc314df3 ("perf/hw_breakpoint: Optimize list of per-task breakpoints") Reported-by: syzkaller Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220927172025.1636995-1-elver@google.com --- kernel/events/core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 04e19a857d4b..e1ffdb861b53 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11734,11 +11734,9 @@ err_pmu: event->destroy(event); module_put(pmu->module); err_ns: - if (event->ns) - put_pid_ns(event->ns); if (event->hw.target) put_task_struct(event->hw.target); - kmem_cache_free(perf_event_cache, event); + call_rcu(&event->rcu_head, free_event_rcu); return ERR_PTR(err); } -- cgit v1.2.3-59-g8ed1b From cce6a2d7e0e494c453ad73e1e78bd50684f20cca Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 27 Sep 2022 22:32:59 +0200 Subject: bpf: Check flags for branch stack in bpf_read_branch_records helper Recent commit [1] changed branch stack data indication from br_stack pointer to sample_flags in perf_sample_data struct. 
We need to check sample_flags for PERF_SAMPLE_BRANCH_STACK bit for valid branch stack data. [1] a9a931e26668 ("perf: Use sample_flags for branch stack") Fixes: a9a931e26668 ("perf: Use sample_flags for branch stack") Signed-off-by: Jiri Olsa Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kan Liang Link: https://lore.kernel.org/r/20220927203259.590950-1-jolsa@kernel.org --- kernel/trace/bpf_trace.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 68e5cdd24cef..1fcd1234607e 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1507,6 +1507,9 @@ BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx, if (unlikely(flags & ~BPF_F_GET_BRANCH_RECORDS_SIZE)) return -EINVAL; + if (unlikely(!(ctx->data->sample_flags & PERF_SAMPLE_BRANCH_STACK))) + return -ENOENT; + if (unlikely(!br_stack)) return -ENOENT; -- cgit v1.2.3-59-g8ed1b From 50b0c97bf00e4815aee09cace28b940ebb060e69 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 28 Sep 2022 08:33:28 -0700 Subject: perf/x86: Add new Raptor Lake S support From PMU's perspective, the new Raptor Lake S is the same as the other of hybrid {ALDER,RAPTOP}LAKE. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220928153331.3757388-1-kan.liang@linux.intel.com --- arch/x86/events/intel/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index b2d8def4d03c..3939debb27e7 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -6344,6 +6344,7 @@ __init int intel_pmu_init(void) case INTEL_FAM6_ALDERLAKE_N: case INTEL_FAM6_RAPTORLAKE: case INTEL_FAM6_RAPTORLAKE_P: + case INTEL_FAM6_RAPTORLAKE_S: /* * Alder Lake has 2 types of CPU, core and atom. * -- cgit v1.2.3-59-g8ed1b From 193c888b7ffe4da97346950c0e98dd77cc629f24 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 28 Sep 2022 08:33:29 -0700 Subject: perf/x86/msr: Add new Raptor Lake S support The same as the other hybrid {ALDER,RAPTOP}LAKE, the new Raptor Lake S also support PPERF and SMI_COUNT MSRs. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220928153331.3757388-2-kan.liang@linux.intel.com --- arch/x86/events/msr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index ac542f98c070..ecced3a52668 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -106,6 +106,7 @@ static bool test_intel(int idx, void *data) case INTEL_FAM6_ALDERLAKE_N: case INTEL_FAM6_RAPTORLAKE: case INTEL_FAM6_RAPTORLAKE_P: + case INTEL_FAM6_RAPTORLAKE_S: if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF) return true; break; -- cgit v1.2.3-59-g8ed1b From d12940d2ead51c6978e7d38b2abf12b833270b2a Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 28 Sep 2022 08:33:30 -0700 Subject: perf/x86/cstate: Add new Raptor Lake S support From the perspective of Intel cstate residency counters, the new Raptor Lake S is the same as the other hybrid {ALDER,RAPTOP}LAKE. 
Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220928153331.3757388-3-kan.liang@linux.intel.com --- arch/x86/events/intel/cstate.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 8ec23f47fee9..a2834bc93149 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -685,6 +685,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &adl_cstates), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_cstates), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_cstates), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &adl_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); -- cgit v1.2.3-59-g8ed1b From e04a1607c9c3c0e2dc48715aeb570f2581f514bc Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 28 Sep 2022 08:33:31 -0700 Subject: perf/x86/uncore: Add new Raptor Lake S support From the perspective of the uncore PMU, the new Raptor Lake S is the same as the other hybrid {ALDER,RAPTOP}LAKE. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220928153331.3757388-4-kan.liang@linux.intel.com --- arch/x86/events/intel/uncore.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index db6c31bca809..6f1ccc57a692 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1831,6 +1831,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &adl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_uncore_init), + X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &adl_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &spr_uncore_init), X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init), {}, -- cgit v1.2.3-59-g8ed1b From ee3e88dfec23153d0675b5d00522297b9adf657c Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 28 Sep 2022 15:27:51 +0530 Subject: perf/mem: Introduce PERF_MEM_LVLNUM_{EXTN_MEM|IO} PERF_MEM_LVLNUM_EXTN_MEM which can be used to indicate accesses to extension memory like CXL etc. PERF_MEM_LVL_IO can be used for IO accesses but it can not distinguish between local and remote IO. Introduce new field PERF_MEM_LVLNUM_IO which can be clubbed with PERF_MEM_REMOTE_REMOTE to indicate Remote IO accesses. 
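A small encoding sketch of what the two new values let a PMU driver express (the function is illustrative; the union fields and constants are the UAPI ones): remote IO becomes representable by combining the IO level number with the existing remote qualifier, and CXL-style memory gets its own level number.

#include <linux/perf_event.h>

/* Encode "load that was satisfied by remote IO" into a data_src value. */
static __u64 encode_remote_io_load(void)
{
        union perf_mem_data_src ds = { .val = 0 };

        ds.mem_op      = PERF_MEM_OP_LOAD;
        ds.mem_lvl_num = PERF_MEM_LVLNUM_IO;        /* new: I/O as a level number */
        ds.mem_remote  = PERF_MEM_REMOTE_REMOTE;    /* existing remote qualifier */

        return ds.val;
}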
Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220928095805.596-2-ravi.bangoria@amd.com --- include/uapi/linux/perf_event.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index e639c74cf5fb..4ae3c249f675 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -1336,7 +1336,9 @@ union perf_mem_data_src { #define PERF_MEM_LVLNUM_L2 0x02 /* L2 */ #define PERF_MEM_LVLNUM_L3 0x03 /* L3 */ #define PERF_MEM_LVLNUM_L4 0x04 /* L4 */ -/* 5-0xa available */ +/* 5-0x8 available */ +#define PERF_MEM_LVLNUM_EXTN_MEM 0x09 /* Extension memory */ +#define PERF_MEM_LVLNUM_IO 0x0a /* I/O */ #define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */ #define PERF_MEM_LVLNUM_LFB 0x0c /* LFB */ #define PERF_MEM_LVLNUM_RAM 0x0d /* RAM */ -- cgit v1.2.3-59-g8ed1b From 610c238041fbc682936d34132362a54a802600fe Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 28 Sep 2022 15:27:52 +0530 Subject: perf/x86/amd: Add IBS OP_DATA2 DataSrc bit definitions IBS_OP_DATA2 DataSrc provides detail about location of the data being accessed from by load ops. Define macros for legacy and extended DataSrc values. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220928095805.596-3-ravi.bangoria@amd.com --- arch/x86/include/asm/amd-ibs.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/x86/include/asm/amd-ibs.h b/arch/x86/include/asm/amd-ibs.h index f3eb098d63d4..cb2a5e113daa 100644 --- a/arch/x86/include/asm/amd-ibs.h +++ b/arch/x86/include/asm/amd-ibs.h @@ -6,6 +6,22 @@ #include +/* IBS_OP_DATA2 DataSrc */ +#define IBS_DATA_SRC_LOC_CACHE 2 +#define IBS_DATA_SRC_DRAM 3 +#define IBS_DATA_SRC_REM_CACHE 4 +#define IBS_DATA_SRC_IO 7 + +/* IBS_OP_DATA2 DataSrc Extension */ +#define IBS_DATA_SRC_EXT_LOC_CACHE 1 +#define IBS_DATA_SRC_EXT_NEAR_CCX_CACHE 2 +#define IBS_DATA_SRC_EXT_DRAM 3 +#define IBS_DATA_SRC_EXT_FAR_CCX_CACHE 5 +#define IBS_DATA_SRC_EXT_PMEM 6 +#define IBS_DATA_SRC_EXT_IO 7 +#define IBS_DATA_SRC_EXT_EXT_MEM 8 +#define IBS_DATA_SRC_EXT_PEER_AGENT_MEM 12 + /* * IBS Hardware MSRs */ -- cgit v1.2.3-59-g8ed1b From 7c10dd0a88b1cc6ae4637fffb494c5e080027eb6 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 28 Sep 2022 15:27:53 +0530 Subject: perf/x86/amd: Support PERF_SAMPLE_DATA_SRC struct perf_mem_data_src is used to pass arch specific memory access details into generic form. These details gets consumed by tools like perf mem and c2c. IBS tagged load/store sample provides most of the information needed for these tools. Add a logic to convert IBS specific raw data into perf_mem_data_src. 
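From the tooling side, roughly what this enables (an assumed usage sketch; the ibs_op PMU name and sysfs path reflect how the driver is normally exposed, and the event programming is simplified): open an IBS op sampling event with PERF_SAMPLE_DATA_SRC and the kernel now fills perf_mem_data_src for tagged loads and stores.

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Open an IBS op sampling event that reports generic memory-access info. */
static int open_ibs_op_data_src(void)
{
        struct perf_event_attr attr;
        unsigned int type;
        FILE *f = fopen("/sys/bus/event_source/devices/ibs_op/type", "r");

        if (!f)
                return -1;
        if (fscanf(f, "%u", &type) != 1) {
                fclose(f);
                return -1;
        }
        fclose(f);

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = type;                       /* dynamic PMU type for ibs_op */
        attr.sample_period = 100000;
        attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_DATA_SRC;

        return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}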
Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220928095805.596-4-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 318 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 312 insertions(+), 6 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index c29a006954c7..e20caa5cf02f 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -678,6 +678,312 @@ static struct perf_ibs perf_ibs_op = { .get_count = get_ibs_op_count, }; +static void perf_ibs_get_mem_op(union ibs_op_data3 *op_data3, + struct perf_sample_data *data) +{ + union perf_mem_data_src *data_src = &data->data_src; + + data_src->mem_op = PERF_MEM_OP_NA; + + if (op_data3->ld_op) + data_src->mem_op = PERF_MEM_OP_LOAD; + else if (op_data3->st_op) + data_src->mem_op = PERF_MEM_OP_STORE; +} + +/* + * Processors having CPUID_Fn8000001B_EAX[11] aka IBS_CAPS_ZEN4 has + * more fine granular DataSrc encodings. Others have coarse. + */ +static u8 perf_ibs_data_src(union ibs_op_data2 *op_data2) +{ + if (ibs_caps & IBS_CAPS_ZEN4) + return (op_data2->data_src_hi << 3) | op_data2->data_src_lo; + + return op_data2->data_src_lo; +} + +static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2, + union ibs_op_data3 *op_data3, + struct perf_sample_data *data) +{ + union perf_mem_data_src *data_src = &data->data_src; + u8 ibs_data_src = perf_ibs_data_src(op_data2); + + data_src->mem_lvl = 0; + + /* + * DcMiss, L2Miss, DataSrc, DcMissLat etc. are all invalid for Uncached + * memory accesses. So, check DcUcMemAcc bit early. + */ + if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO) { + data_src->mem_lvl = PERF_MEM_LVL_UNC | PERF_MEM_LVL_HIT; + return; + } + + /* L1 Hit */ + if (op_data3->dc_miss == 0) { + data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; + return; + } + + /* L2 Hit */ + if (op_data3->l2_miss == 0) { + /* Erratum #1293 */ + if (boot_cpu_data.x86 != 0x19 || boot_cpu_data.x86_model > 0xF || + !(op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc)) { + data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT; + return; + } + } + + /* + * OP_DATA2 is valid only for load ops. Skip all checks which + * uses OP_DATA2[DataSrc]. 
+ */ + if (data_src->mem_op != PERF_MEM_OP_LOAD) + goto check_mab; + + /* L3 Hit */ + if (ibs_caps & IBS_CAPS_ZEN4) { + if (ibs_data_src == IBS_DATA_SRC_EXT_LOC_CACHE) { + data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT; + return; + } + } else { + if (ibs_data_src == IBS_DATA_SRC_LOC_CACHE) { + data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_REM_CCE1 | + PERF_MEM_LVL_HIT; + return; + } + } + + /* A peer cache in a near CCX */ + if (ibs_caps & IBS_CAPS_ZEN4 && + ibs_data_src == IBS_DATA_SRC_EXT_NEAR_CCX_CACHE) { + data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT; + return; + } + + /* A peer cache in a far CCX */ + if (ibs_caps & IBS_CAPS_ZEN4) { + if (ibs_data_src == IBS_DATA_SRC_EXT_FAR_CCX_CACHE) { + data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2 | PERF_MEM_LVL_HIT; + return; + } + } else { + if (ibs_data_src == IBS_DATA_SRC_REM_CACHE) { + data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2 | PERF_MEM_LVL_HIT; + return; + } + } + + /* DRAM */ + if (ibs_data_src == IBS_DATA_SRC_EXT_DRAM) { + if (op_data2->rmt_node == 0) + data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT; + else + data_src->mem_lvl = PERF_MEM_LVL_REM_RAM1 | PERF_MEM_LVL_HIT; + return; + } + + /* PMEM */ + if (ibs_caps & IBS_CAPS_ZEN4 && ibs_data_src == IBS_DATA_SRC_EXT_PMEM) { + data_src->mem_lvl_num = PERF_MEM_LVLNUM_PMEM; + if (op_data2->rmt_node) { + data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; + /* IBS doesn't provide Remote socket detail */ + data_src->mem_hops = PERF_MEM_HOPS_1; + } + return; + } + + /* Extension Memory */ + if (ibs_caps & IBS_CAPS_ZEN4 && + ibs_data_src == IBS_DATA_SRC_EXT_EXT_MEM) { + data_src->mem_lvl_num = PERF_MEM_LVLNUM_EXTN_MEM; + if (op_data2->rmt_node) { + data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; + /* IBS doesn't provide Remote socket detail */ + data_src->mem_hops = PERF_MEM_HOPS_1; + } + return; + } + + /* IO */ + if (ibs_data_src == IBS_DATA_SRC_EXT_IO) { + data_src->mem_lvl = PERF_MEM_LVL_IO; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_IO; + if (op_data2->rmt_node) { + data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; + /* IBS doesn't provide Remote socket detail */ + data_src->mem_hops = PERF_MEM_HOPS_1; + } + return; + } + +check_mab: + /* + * MAB (Miss Address Buffer) Hit. MAB keeps track of outstanding + * DC misses. However, such data may come from any level in mem + * hierarchy. IBS provides detail about both MAB as well as actual + * DataSrc simultaneously. Prioritize DataSrc over MAB, i.e. set + * MAB only when IBS fails to provide DataSrc. 
+ */ + if (op_data3->dc_miss_no_mab_alloc) { + data_src->mem_lvl = PERF_MEM_LVL_LFB | PERF_MEM_LVL_HIT; + return; + } + + data_src->mem_lvl = PERF_MEM_LVL_NA; +} + +static bool perf_ibs_cache_hit_st_valid(void) +{ + /* 0: Uninitialized, 1: Valid, -1: Invalid */ + static int cache_hit_st_valid; + + if (unlikely(!cache_hit_st_valid)) { + if (boot_cpu_data.x86 == 0x19 && + (boot_cpu_data.x86_model <= 0xF || + (boot_cpu_data.x86_model >= 0x20 && + boot_cpu_data.x86_model <= 0x5F))) { + cache_hit_st_valid = -1; + } else { + cache_hit_st_valid = 1; + } + } + + return cache_hit_st_valid == 1; +} + +static void perf_ibs_get_mem_snoop(union ibs_op_data2 *op_data2, + struct perf_sample_data *data) +{ + union perf_mem_data_src *data_src = &data->data_src; + u8 ibs_data_src; + + data_src->mem_snoop = PERF_MEM_SNOOP_NA; + + if (!perf_ibs_cache_hit_st_valid() || + data_src->mem_op != PERF_MEM_OP_LOAD || + data_src->mem_lvl & PERF_MEM_LVL_L1 || + data_src->mem_lvl & PERF_MEM_LVL_L2 || + op_data2->cache_hit_st) + return; + + ibs_data_src = perf_ibs_data_src(op_data2); + + if (ibs_caps & IBS_CAPS_ZEN4) { + if (ibs_data_src == IBS_DATA_SRC_EXT_LOC_CACHE || + ibs_data_src == IBS_DATA_SRC_EXT_NEAR_CCX_CACHE || + ibs_data_src == IBS_DATA_SRC_EXT_FAR_CCX_CACHE) + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; + } else if (ibs_data_src == IBS_DATA_SRC_LOC_CACHE) { + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; + } +} + +static void perf_ibs_get_tlb_lvl(union ibs_op_data3 *op_data3, + struct perf_sample_data *data) +{ + union perf_mem_data_src *data_src = &data->data_src; + + data_src->mem_dtlb = PERF_MEM_TLB_NA; + + if (!op_data3->dc_lin_addr_valid) + return; + + if (!op_data3->dc_l1tlb_miss) { + data_src->mem_dtlb = PERF_MEM_TLB_L1 | PERF_MEM_TLB_HIT; + return; + } + + if (!op_data3->dc_l2tlb_miss) { + data_src->mem_dtlb = PERF_MEM_TLB_L2 | PERF_MEM_TLB_HIT; + return; + } + + data_src->mem_dtlb = PERF_MEM_TLB_L2 | PERF_MEM_TLB_MISS; +} + +static void perf_ibs_get_mem_lock(union ibs_op_data3 *op_data3, + struct perf_sample_data *data) +{ + union perf_mem_data_src *data_src = &data->data_src; + + data_src->mem_lock = PERF_MEM_LOCK_NA; + + if (op_data3->dc_locked_op) + data_src->mem_lock = PERF_MEM_LOCK_LOCKED; +} + +#define ibs_op_msr_idx(msr) (msr - MSR_AMD64_IBSOPCTL) + +static void perf_ibs_get_data_src(struct perf_ibs_data *ibs_data, + struct perf_sample_data *data, + union ibs_op_data2 *op_data2, + union ibs_op_data3 *op_data3) +{ + perf_ibs_get_mem_lvl(op_data2, op_data3, data); + perf_ibs_get_mem_snoop(op_data2, data); + perf_ibs_get_tlb_lvl(op_data3, data); + perf_ibs_get_mem_lock(op_data3, data); +} + +static __u64 perf_ibs_get_op_data2(struct perf_ibs_data *ibs_data, + union ibs_op_data3 *op_data3) +{ + __u64 val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA2)]; + + /* Erratum #1293 */ + if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model <= 0xF && + (op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc)) { + /* + * OP_DATA2 has only two fields on Zen3: DataSrc and RmtNode. + * DataSrc=0 is 'No valid status' and RmtNode is invalid when + * DataSrc=0. 
+ */ + val = 0; + } + return val; +} + +static void perf_ibs_parse_ld_st_data(__u64 sample_type, + struct perf_ibs_data *ibs_data, + struct perf_sample_data *data) +{ + union ibs_op_data3 op_data3; + union ibs_op_data2 op_data2; + + data->data_src.val = PERF_MEM_NA; + op_data3.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)]; + + perf_ibs_get_mem_op(&op_data3, data); + if (data->data_src.mem_op != PERF_MEM_OP_LOAD && + data->data_src.mem_op != PERF_MEM_OP_STORE) + return; + + op_data2.val = perf_ibs_get_op_data2(ibs_data, &op_data3); + + if (sample_type & PERF_SAMPLE_DATA_SRC) { + perf_ibs_get_data_src(ibs_data, data, &op_data2, &op_data3); + data->sample_flags |= PERF_SAMPLE_DATA_SRC; + } +} + +static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type, + int check_rip) +{ + if (sample_type & PERF_SAMPLE_RAW || + (perf_ibs == &perf_ibs_op && + sample_type & PERF_SAMPLE_DATA_SRC)) + return perf_ibs->offset_max; + else if (check_rip) + return 3; + return 1; +} + static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) { struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); @@ -725,12 +1031,9 @@ fail: size = 1; offset = 1; check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK)); - if (event->attr.sample_type & PERF_SAMPLE_RAW) - offset_max = perf_ibs->offset_max; - else if (check_rip) - offset_max = 3; - else - offset_max = 1; + + offset_max = perf_ibs_get_offset_max(perf_ibs, event->attr.sample_type, check_rip); + do { rdmsrl(msr + offset, *buf++); size++; @@ -784,6 +1087,9 @@ fail: data.sample_flags |= PERF_SAMPLE_RAW; } + if (perf_ibs == &perf_ibs_op) + perf_ibs_parse_ld_st_data(event->attr.sample_type, &ibs_data, &data); + /* * rip recorded by IbsOpRip will not be consistent with rsp and rbp * recorded as part of interrupt regs. Thus we need to use rip from -- cgit v1.2.3-59-g8ed1b From 6b2ae4952ef8ac23b467bc10776404092b581143 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 28 Sep 2022 15:27:54 +0530 Subject: perf/x86/amd: Support PERF_SAMPLE_{WEIGHT|WEIGHT_STRUCT} IbsDcMissLat indicates the number of clock cycles from when a miss is detected in the data cache to when the data was delivered to the core. Similarly, IbsTagToRetCtr provides number of cycles from when the op was tagged to when the op was retired. Consider these fields for sample->weight. 
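On the consumer side, the two latencies land in the standard weight union; a short decoding sketch (the function is illustrative, the union layout is the UAPI one, and the field meanings are the ones this patch assigns for IBS loads):

#include <linux/perf_event.h>
#include <stdio.h>

/* Decode a PERF_SAMPLE_WEIGHT_STRUCT value from an IBS load sample. */
static void print_ibs_weights(__u64 weight)
{
        union perf_sample_weight w = { .full = weight };

        /* var1_dw: data-cache miss latency, var2_w: tag-to-retire cycles */
        printf("dc_miss_lat=%u tag_to_ret=%u\n", w.var1_dw, w.var2_w);
}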
Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220928095805.596-5-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index e20caa5cf02f..d883694e0fd4 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -955,6 +955,7 @@ static void perf_ibs_parse_ld_st_data(__u64 sample_type, { union ibs_op_data3 op_data3; union ibs_op_data2 op_data2; + union ibs_op_data op_data; data->data_src.val = PERF_MEM_NA; op_data3.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)]; @@ -970,6 +971,19 @@ static void perf_ibs_parse_ld_st_data(__u64 sample_type, perf_ibs_get_data_src(ibs_data, data, &op_data2, &op_data3); data->sample_flags |= PERF_SAMPLE_DATA_SRC; } + + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE && op_data3.dc_miss && + data->data_src.mem_op == PERF_MEM_OP_LOAD) { + op_data.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA)]; + + if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) { + data->weight.var1_dw = op_data3.dc_miss_lat; + data->weight.var2_w = op_data.tag_to_ret_ctr; + } else if (sample_type & PERF_SAMPLE_WEIGHT) { + data->weight.full = op_data3.dc_miss_lat; + } + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; + } } static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type, @@ -977,7 +991,8 @@ static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type, { if (sample_type & PERF_SAMPLE_RAW || (perf_ibs == &perf_ibs_op && - sample_type & PERF_SAMPLE_DATA_SRC)) + (sample_type & PERF_SAMPLE_DATA_SRC || + sample_type & PERF_SAMPLE_WEIGHT_TYPE))) return perf_ibs->offset_max; else if (check_rip) return 3; -- cgit v1.2.3-59-g8ed1b From cb2bb85f7ed8740ab5fc06bbec386faa39ba44ef Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 28 Sep 2022 15:27:55 +0530 Subject: perf/x86/amd: Support PERF_SAMPLE_ADDR IBS_DC_LINADDR provides the linear data address for the tagged load/ store operation. Populate perf sample address using it. 
Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220928095805.596-6-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index d883694e0fd4..0ad49105c154 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -984,6 +984,11 @@ static void perf_ibs_parse_ld_st_data(__u64 sample_type, } data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; } + + if (sample_type & PERF_SAMPLE_ADDR && op_data3.dc_lin_addr_valid) { + data->addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCLINAD)]; + data->sample_flags |= PERF_SAMPLE_ADDR; + } } static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type, @@ -992,7 +997,8 @@ static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type, if (sample_type & PERF_SAMPLE_RAW || (perf_ibs == &perf_ibs_op && (sample_type & PERF_SAMPLE_DATA_SRC || - sample_type & PERF_SAMPLE_WEIGHT_TYPE))) + sample_type & PERF_SAMPLE_WEIGHT_TYPE || + sample_type & PERF_SAMPLE_ADDR))) return perf_ibs->offset_max; else if (check_rip) return 3; -- cgit v1.2.3-59-g8ed1b From 5b26af6d2b7854639ddf893366bbca7e74fa7c54 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 28 Sep 2022 15:27:56 +0530 Subject: perf/x86/amd: Support PERF_SAMPLE_PHY_ADDR IBS_DC_PHYSADDR provides the physical data address for the tagged load/ store operation. Populate perf sample physical address using it. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220928095805.596-7-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 8 +++++++- kernel/events/core.c | 3 ++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 0ad49105c154..3271735f0070 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -989,6 +989,11 @@ static void perf_ibs_parse_ld_st_data(__u64 sample_type, data->addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCLINAD)]; data->sample_flags |= PERF_SAMPLE_ADDR; } + + if (sample_type & PERF_SAMPLE_PHYS_ADDR && op_data3.dc_phy_addr_valid) { + data->phys_addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCPHYSAD)]; + data->sample_flags |= PERF_SAMPLE_PHYS_ADDR; + } } static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type, @@ -998,7 +1003,8 @@ static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type, (perf_ibs == &perf_ibs_op && (sample_type & PERF_SAMPLE_DATA_SRC || sample_type & PERF_SAMPLE_WEIGHT_TYPE || - sample_type & PERF_SAMPLE_ADDR))) + sample_type & PERF_SAMPLE_ADDR || + sample_type & PERF_SAMPLE_PHYS_ADDR))) return perf_ibs->offset_max; else if (check_rip) return 3; diff --git a/kernel/events/core.c b/kernel/events/core.c index e1ffdb861b53..49bc3b5e6c8a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7435,7 +7435,8 @@ void perf_prepare_sample(struct perf_event_header *header, header->size += size; } - if (sample_type & PERF_SAMPLE_PHYS_ADDR) + if (sample_type & PERF_SAMPLE_PHYS_ADDR && + filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) data->phys_addr = perf_virt_to_phys(data->addr); #ifdef CONFIG_CGROUP_PERF -- cgit v1.2.3-59-g8ed1b From cfef80bad4cf79cdc964a53c98254dfa462be83f Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 28 Sep 2022 15:27:57 +0530 Subject: perf/uapi: Define PERF_MEM_SNOOPX_PEER in kernel header file PERF_MEM_SNOOPX_PEER is defined only in tools 
uapi header. Although it's used only by perf tool, not defining it in kernel header can create problems in future. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220928095805.596-8-ravi.bangoria@amd.com --- include/uapi/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 4ae3c249f675..85be78e0e7f6 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -1356,7 +1356,7 @@ union perf_mem_data_src { #define PERF_MEM_SNOOP_SHIFT 19 #define PERF_MEM_SNOOPX_FWD 0x01 /* forward */ -/* 1 free */ +#define PERF_MEM_SNOOPX_PEER 0x02 /* xfer from peer */ #define PERF_MEM_SNOOPX_SHIFT 38 /* locked instruction */ -- cgit v1.2.3-59-g8ed1b From 117ceeb1f4f87331e45a77e71f18303d15ec882e Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 28 Sep 2022 11:40:42 -0700 Subject: perf/x86/utils: Fix uninitialized var in get_branch_type() offset is passed as a pointer and on certain call path is not set by the function. If the caller does not re-initialize offset between calls, value could be inherited between calls. Prevent this by initializing offset on each call. This impacts the code in amd_pmu_lbr_filter() which does: for(i=0; ...) { ret = get_branch_type_fused(..., &offset); if (offset) lbr_entries[i].from += offset; } Fixes: df3e9612f758 ("perf/x86: Make branch classifier fusion-aware") Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Sandipan Das Link: https://lore.kernel.org/r/20220928184043.408364-2-eranian@google.com --- arch/x86/events/utils.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/events/utils.c b/arch/x86/events/utils.c index 5f5617afde79..76b1f8bb0fd5 100644 --- a/arch/x86/events/utils.c +++ b/arch/x86/events/utils.c @@ -94,6 +94,10 @@ static int get_branch_type(unsigned long from, unsigned long to, int abort, u8 buf[MAX_INSN_SIZE]; int is64 = 0; + /* make sure we initialize offset */ + if (offset) + *offset = 0; + to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER; from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER; -- cgit v1.2.3-59-g8ed1b From 3f9a1b3591003b122a6ea2d69f89a0fd96ec58b9 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 28 Sep 2022 11:40:43 -0700 Subject: perf/x86/amd/lbr: Adjust LBR regardless of filtering In case of fused compare and taken branch instructions, the AMD LBR points to the compare instruction instead of the branch. Users of LBR usually expects the from address to point to a branch instruction. The kernel has code to adjust the from address via get_branch_type_fused(). However this correction is only applied when a branch filter is applied. That means that if no filter is present, the quality of the data is lower. Fix the problem by applying the adjustment regardless of the filter setting, bringing the AMD LBR to the same level as other LBR implementations. 
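The shape of the fix, as an illustrative sketch rather than the exact hunk below (the wrapper function, the filtering flag and the "mark for removal" convention are invented; the classifier helper and branch-entry fields are real): the from-address correction for fused compare plus branch pairs is applied to every entry, and only the type-based filtering stays conditional.

static void adjust_and_filter(struct perf_branch_entry *e, int br_sel,
                              bool filtering)
{
        int offset = 0;
        int type = get_branch_type_fused(e->from, e->to, 0, &offset);

        /* Fused cmp+branch: always repoint 'from' at the branch itself. */
        if (offset)
                e->from += offset;

        if (!filtering)
                return;         /* previously this also skipped the offset fix */

        if (type == X86_BR_NONE || (br_sel & type) != type)
                e->from = 0;    /* hypothetical: caller drops zeroed entries */
}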
Fixes: 245268c19f70 ("perf/x86/amd/lbr: Use fusion-aware branch classifier") Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Sandipan Das Link: https://lore.kernel.org/r/20220928184043.408364-3-eranian@google.com --- arch/x86/events/amd/lbr.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c index 2e1c1573efe7..38a75216c12c 100644 --- a/arch/x86/events/amd/lbr.c +++ b/arch/x86/events/amd/lbr.c @@ -99,12 +99,13 @@ static void amd_pmu_lbr_filter(void) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); int br_sel = cpuc->br_sel, offset, type, i, j; bool compress = false; + bool fused_only = false; u64 from, to; /* If sampling all branches, there is nothing to filter */ if (((br_sel & X86_BR_ALL) == X86_BR_ALL) && ((br_sel & X86_BR_TYPE_SAVE) != X86_BR_TYPE_SAVE)) - return; + fused_only = true; for (i = 0; i < cpuc->lbr_stack.nr; i++) { from = cpuc->lbr_entries[i].from; @@ -116,8 +117,11 @@ static void amd_pmu_lbr_filter(void) * fusion where it points to an instruction preceding the * actual branch */ - if (offset) + if (offset) { cpuc->lbr_entries[i].from += offset; + if (fused_only) + continue; + } /* If type does not correspond, then discard */ if (type == X86_BR_NONE || (br_sel & type) != type) { -- cgit v1.2.3-59-g8ed1b From 0ce38047e82a02017839b6cae837f13a1383a3a0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 4 Oct 2022 10:46:58 +0200 Subject: perf: Fix lockdep_assert_event_ctx() I'm a flamin' moron; because even after Mark told me it should be '&&' I still got it wrong in the final commit. Fixes: f3c0eba28704 ("perf: Add a few assertions") Reported-by: Borislav Petkov Reported-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Tested-by: Borislav Petkov Link: https://lkml.kernel.org/r/YvvIWmDBWdIUCMZj@FVFF77S0Q05N --- include/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index e9b151cde491..853f64b6c8c2 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -644,7 +644,7 @@ struct pmu_event_list { #ifdef CONFIG_PROVE_LOCKING #define lockdep_assert_event_ctx(event) \ WARN_ON_ONCE(__lockdep_enabled && \ - (this_cpu_read(hardirqs_enabled) || \ + (this_cpu_read(hardirqs_enabled) && \ lockdep_is_held(&(event)->ctx->mutex) != LOCK_STATE_HELD)) #else #define lockdep_assert_event_ctx(event) -- cgit v1.2.3-59-g8ed1b From 7be51cc1c68dfa180ef84e71bcb4204237bb5620 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 4 Oct 2022 11:03:47 +0200 Subject: perf: Fix pmu_filter_match() Mark reported that the new for_each_sibling_event() assertion triggers in pmu_filter_match() -- which isn't always called with IRQs disabled or ctx->mutex held. 
Fixes: f3c0eba28704 ("perf: Add a few assertions") Reported-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/YvvJq2f/7eFVcnNy@FVFF77S0Q05N --- kernel/events/core.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 49bc3b5e6c8a..b981b879bcd8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2226,16 +2226,22 @@ static inline int __pmu_filter_match(struct perf_event *event) static inline int pmu_filter_match(struct perf_event *event) { struct perf_event *sibling; + unsigned long flags; + int ret = 1; if (!__pmu_filter_match(event)) return 0; + local_irq_save(flags); for_each_sibling_event(sibling, event) { - if (!__pmu_filter_match(sibling)) - return 0; + if (!__pmu_filter_match(sibling)) { + ret = 0; + break; + } } + local_irq_restore(flags); - return 1; + return ret; } static inline int -- cgit v1.2.3-59-g8ed1b From 82aad7ff7ac25c8cf09d491ae23b9823f1901486 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 4 Oct 2022 12:20:39 +0200 Subject: perf/hw_breakpoint: Annotate tsk->perf_event_mutex vs ctx->mutex Perf fuzzer gifted a lockdep splat: perf_event_init_context() mutex_lock(parent_ctx->mutex); (B) inherit_task_group() inherit_group() inherit_event() perf_event_alloc() perf_try_init_event() := hw_breakpoint_event_init() register_perf_hw_breakpoint() mutex_lock(child->perf_event_mutex); (A) Which is against the normal (documented) order. Now, this is a false positive in that child is not published yet, but also inherited events never end up on ->perf_event_list. Annotate this one away. Fixes: 0912037fec11 ("perf/hw_breakpoint: Reduce contention with large number of tasks") Signed-off-by: Peter Zijlstra (Intel) --- kernel/events/hw_breakpoint.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 7ef0e98d31e2..c3797701339c 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -117,7 +117,17 @@ static struct mutex *bp_constraints_lock(struct perf_event *bp) struct mutex *tsk_mtx = get_task_bps_mutex(bp); if (tsk_mtx) { - mutex_lock(tsk_mtx); + /* + * Fully analogous to the perf_try_init_event() nesting + * argument in the comment near perf_event_ctx_lock_nested(); + * this child->perf_event_mutex cannot ever deadlock against + * the parent->perf_event_mutex usage from + * perf_event_task_{en,dis}able(). + * + * Specifically, inherited events will never occur on + * ->perf_event_list. + */ + mutex_lock_nested(tsk_mtx, SINGLE_DEPTH_NESTING); percpu_down_read(&bp_cpuinfo_sem); } else { percpu_down_write(&bp_cpuinfo_sem); -- cgit v1.2.3-59-g8ed1b
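For reference, the same annotation pattern in isolation (a minimal sketch, assuming a parent/child pair that provably can never be locked in the conflicting order; the structure and function names are invented, while mutex_lock_nested() and SINGLE_DEPTH_NESTING are the real lockdep interfaces):

#include <linux/lockdep.h>
#include <linux/mutex.h>

struct child_obj {
        struct mutex lock;
};

static void lock_unpublished_child(struct child_obj *child)
{
        /*
         * The child is not yet published, so nobody can hold its lock while
         * waiting for the parent's; annotate the acquisition as a separate
         * nesting level to silence the false-positive ordering report.
         */
        mutex_lock_nested(&child->lock, SINGLE_DEPTH_NESTING);
}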