aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--arch/powerpc/kernel/hw_breakpoint.c53
-rw-r--r--arch/powerpc/perf/core-book3s.c10
-rw-r--r--arch/s390/kernel/perf_cpum_cf.c1
-rw-r--r--arch/s390/kernel/perf_pai_crypto.c1
-rw-r--r--arch/sh/include/asm/hw_breakpoint.h5
-rw-r--r--arch/x86/events/Makefile2
-rw-r--r--arch/x86/events/amd/Makefile2
-rw-r--r--arch/x86/events/amd/brs.c69
-rw-r--r--arch/x86/events/amd/core.c210
-rw-r--r--arch/x86/events/amd/ibs.c360
-rw-r--r--arch/x86/events/amd/lbr.c439
-rw-r--r--arch/x86/events/core.c61
-rw-r--r--arch/x86/events/intel/core.c101
-rw-r--r--arch/x86/events/intel/cstate.c1
-rw-r--r--arch/x86/events/intel/ds.c55
-rw-r--r--arch/x86/events/intel/lbr.c273
-rw-r--r--arch/x86/events/intel/p4.c37
-rw-r--r--arch/x86/events/intel/uncore.c1
-rw-r--r--arch/x86/events/msr.c1
-rw-r--r--arch/x86/events/perf_event.h130
-rw-r--r--arch/x86/events/perf_event_flags.h22
-rw-r--r--arch/x86/events/utils.c251
-rw-r--r--arch/x86/include/asm/amd-ibs.h16
-rw-r--r--arch/x86/include/asm/cpufeatures.h2
-rw-r--r--arch/x86/include/asm/hw_breakpoint.h5
-rw-r--r--arch/x86/include/asm/msr-index.h5
-rw-r--r--arch/x86/include/asm/perf_event.h3
-rw-r--r--arch/x86/kernel/cpu/scattered.c1
-rw-r--r--drivers/perf/arm_spe_pmu.c4
-rw-r--r--include/linux/hw_breakpoint.h4
-rw-r--r--include/linux/percpu-rwsem.h6
-rw-r--r--include/linux/perf/arm_pmu.h9
-rw-r--r--include/linux/perf_event.h77
-rw-r--r--include/uapi/linux/perf_event.h57
-rw-r--r--kernel/bpf/stackmap.c4
-rw-r--r--kernel/events/Makefile1
-rw-r--r--kernel/events/core.c88
-rw-r--r--kernel/events/hw_breakpoint.c648
-rw-r--r--kernel/events/hw_breakpoint_test.c333
-rw-r--r--kernel/locking/percpu-rwsem.c6
-rw-r--r--kernel/trace/bpf_trace.c3
-rw-r--r--lib/Kconfig.debug10
42 files changed, 2613 insertions, 754 deletions
diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c
index 2669f80b3a49..8db1a15d7acb 100644
--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -15,6 +15,7 @@
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/smp.h>
+#include <linux/spinlock.h>
#include <linux/debugfs.h>
#include <linux/init.h>
@@ -129,7 +130,14 @@ struct breakpoint {
bool ptrace_bp;
};
+/*
+ * While kernel/events/hw_breakpoint.c does its own synchronization, we cannot
+ * rely on it safely synchronizing internals here; however, we can rely on it
+ * not requesting more breakpoints than available.
+ */
+static DEFINE_SPINLOCK(cpu_bps_lock);
static DEFINE_PER_CPU(struct breakpoint *, cpu_bps[HBP_NUM_MAX]);
+static DEFINE_SPINLOCK(task_bps_lock);
static LIST_HEAD(task_bps);
static struct breakpoint *alloc_breakpoint(struct perf_event *bp)
@@ -174,7 +182,9 @@ static int task_bps_add(struct perf_event *bp)
if (IS_ERR(tmp))
return PTR_ERR(tmp);
+ spin_lock(&task_bps_lock);
list_add(&tmp->list, &task_bps);
+ spin_unlock(&task_bps_lock);
return 0;
}
@@ -182,6 +192,7 @@ static void task_bps_remove(struct perf_event *bp)
{
struct list_head *pos, *q;
+ spin_lock(&task_bps_lock);
list_for_each_safe(pos, q, &task_bps) {
struct breakpoint *tmp = list_entry(pos, struct breakpoint, list);
@@ -191,6 +202,7 @@ static void task_bps_remove(struct perf_event *bp)
break;
}
}
+ spin_unlock(&task_bps_lock);
}
/*
@@ -200,12 +212,17 @@ static void task_bps_remove(struct perf_event *bp)
static bool all_task_bps_check(struct perf_event *bp)
{
struct breakpoint *tmp;
+ bool ret = false;
+ spin_lock(&task_bps_lock);
list_for_each_entry(tmp, &task_bps, list) {
- if (!can_co_exist(tmp, bp))
- return true;
+ if (!can_co_exist(tmp, bp)) {
+ ret = true;
+ break;
+ }
}
- return false;
+ spin_unlock(&task_bps_lock);
+ return ret;
}
/*
@@ -215,13 +232,18 @@ static bool all_task_bps_check(struct perf_event *bp)
static bool same_task_bps_check(struct perf_event *bp)
{
struct breakpoint *tmp;
+ bool ret = false;
+ spin_lock(&task_bps_lock);
list_for_each_entry(tmp, &task_bps, list) {
if (tmp->bp->hw.target == bp->hw.target &&
- !can_co_exist(tmp, bp))
- return true;
+ !can_co_exist(tmp, bp)) {
+ ret = true;
+ break;
+ }
}
- return false;
+ spin_unlock(&task_bps_lock);
+ return ret;
}
static int cpu_bps_add(struct perf_event *bp)
@@ -234,6 +256,7 @@ static int cpu_bps_add(struct perf_event *bp)
if (IS_ERR(tmp))
return PTR_ERR(tmp);
+ spin_lock(&cpu_bps_lock);
cpu_bp = per_cpu_ptr(cpu_bps, bp->cpu);
for (i = 0; i < nr_wp_slots(); i++) {
if (!cpu_bp[i]) {
@@ -241,6 +264,7 @@ static int cpu_bps_add(struct perf_event *bp)
break;
}
}
+ spin_unlock(&cpu_bps_lock);
return 0;
}
@@ -249,6 +273,7 @@ static void cpu_bps_remove(struct perf_event *bp)
struct breakpoint **cpu_bp;
int i = 0;
+ spin_lock(&cpu_bps_lock);
cpu_bp = per_cpu_ptr(cpu_bps, bp->cpu);
for (i = 0; i < nr_wp_slots(); i++) {
if (!cpu_bp[i])
@@ -260,19 +285,25 @@ static void cpu_bps_remove(struct perf_event *bp)
break;
}
}
+ spin_unlock(&cpu_bps_lock);
}
static bool cpu_bps_check(int cpu, struct perf_event *bp)
{
struct breakpoint **cpu_bp;
+ bool ret = false;
int i;
+ spin_lock(&cpu_bps_lock);
cpu_bp = per_cpu_ptr(cpu_bps, cpu);
for (i = 0; i < nr_wp_slots(); i++) {
- if (cpu_bp[i] && !can_co_exist(cpu_bp[i], bp))
- return true;
+ if (cpu_bp[i] && !can_co_exist(cpu_bp[i], bp)) {
+ ret = true;
+ break;
+ }
}
- return false;
+ spin_unlock(&cpu_bps_lock);
+ return ret;
}
static bool all_cpu_bps_check(struct perf_event *bp)
@@ -286,10 +317,6 @@ static bool all_cpu_bps_check(struct perf_event *bp)
return false;
}
-/*
- * We don't use any locks to serialize accesses to cpu_bps or task_bps
- * because are already inside nr_bp_mutex.
- */
int arch_reserve_bp_slot(struct perf_event *bp)
{
int ret;
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 03e31ae97741..942aa830e110 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2314,16 +2314,20 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
cpuhw = this_cpu_ptr(&cpu_hw_events);
power_pmu_bhrb_read(event, cpuhw);
data.br_stack = &cpuhw->bhrb_stack;
+ data.sample_flags |= PERF_SAMPLE_BRANCH_STACK;
}
if (event->attr.sample_type & PERF_SAMPLE_DATA_SRC &&
- ppmu->get_mem_data_src)
+ ppmu->get_mem_data_src) {
ppmu->get_mem_data_src(&data.data_src, ppmu->flags, regs);
+ data.sample_flags |= PERF_SAMPLE_DATA_SRC;
+ }
if (event->attr.sample_type & PERF_SAMPLE_WEIGHT_TYPE &&
- ppmu->get_mem_weight)
+ ppmu->get_mem_weight) {
ppmu->get_mem_weight(&data.weight.full, event->attr.sample_type);
-
+ data.sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
+ }
if (perf_event_overflow(event, &data, regs))
power_pmu_stop(event, 0);
} else if (period) {
diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c
index f7dd3c849e68..f043a7ff220b 100644
--- a/arch/s390/kernel/perf_cpum_cf.c
+++ b/arch/s390/kernel/perf_cpum_cf.c
@@ -664,6 +664,7 @@ static int cfdiag_push_sample(struct perf_event *event,
raw.frag.data = cpuhw->stop;
raw.size = raw.frag.size;
data.raw = &raw;
+ data.sample_flags |= PERF_SAMPLE_RAW;
}
overflow = perf_event_overflow(event, &data, &regs);
diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c
index b38b4ae01589..6826e2a69a21 100644
--- a/arch/s390/kernel/perf_pai_crypto.c
+++ b/arch/s390/kernel/perf_pai_crypto.c
@@ -366,6 +366,7 @@ static int paicrypt_push_sample(void)
raw.frag.data = cpump->save;
raw.size = raw.frag.size;
data.raw = &raw;
+ data.sample_flags |= PERF_SAMPLE_RAW;
}
overflow = perf_event_overflow(event, &data, &regs);
diff --git a/arch/sh/include/asm/hw_breakpoint.h b/arch/sh/include/asm/hw_breakpoint.h
index 199d17b765f2..361a0f57bdeb 100644
--- a/arch/sh/include/asm/hw_breakpoint.h
+++ b/arch/sh/include/asm/hw_breakpoint.h
@@ -48,10 +48,7 @@ struct pmu;
/* Maximum number of UBC channels */
#define HBP_NUM 2
-static inline int hw_breakpoint_slots(int type)
-{
- return HBP_NUM;
-}
+#define hw_breakpoint_slots(type) (HBP_NUM)
/* arch/sh/kernel/hw_breakpoint.c */
extern int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw);
diff --git a/arch/x86/events/Makefile b/arch/x86/events/Makefile
index 9933c0e8e97a..86a76efa8bb6 100644
--- a/arch/x86/events/Makefile
+++ b/arch/x86/events/Makefile
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: GPL-2.0-only
-obj-y += core.o probe.o
+obj-y += core.o probe.o utils.o
obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += rapl.o
obj-y += amd/
obj-$(CONFIG_X86_LOCAL_APIC) += msr.o
diff --git a/arch/x86/events/amd/Makefile b/arch/x86/events/amd/Makefile
index b9f5d4610256..527d947eb76b 100644
--- a/arch/x86/events/amd/Makefile
+++ b/arch/x86/events/amd/Makefile
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_CPU_SUP_AMD) += core.o
+obj-$(CONFIG_CPU_SUP_AMD) += core.o lbr.o
obj-$(CONFIG_PERF_EVENTS_AMD_BRS) += brs.o
obj-$(CONFIG_PERF_EVENTS_AMD_POWER) += power.o
obj-$(CONFIG_X86_LOCAL_APIC) += ibs.o
diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c
index bee8765a1e9b..f1bff153d945 100644
--- a/arch/x86/events/amd/brs.c
+++ b/arch/x86/events/amd/brs.c
@@ -81,7 +81,7 @@ static bool __init amd_brs_detect(void)
* a br_sel_map. Software filtering is not supported because it would not correlate well
* with a sampling period.
*/
-int amd_brs_setup_filter(struct perf_event *event)
+static int amd_brs_setup_filter(struct perf_event *event)
{
u64 type = event->attr.branch_sample_type;
@@ -96,6 +96,73 @@ int amd_brs_setup_filter(struct perf_event *event)
return 0;
}
+static inline int amd_is_brs_event(struct perf_event *e)
+{
+ return (e->hw.config & AMD64_RAW_EVENT_MASK) == AMD_FAM19H_BRS_EVENT;
+}
+
+int amd_brs_hw_config(struct perf_event *event)
+{
+ int ret = 0;
+
+ /*
+ * Due to interrupt holding, BRS is not recommended in
+ * counting mode.
+ */
+ if (!is_sampling_event(event))
+ return -EINVAL;
+
+ /*
+ * Due to the way BRS operates by holding the interrupt until
+ * lbr_nr entries have been captured, it does not make sense
+ * to allow sampling on BRS with an event that does not match
+ * what BRS is capturing, i.e., retired taken branches.
+ * Otherwise the correlation with the event's period is even
+ * more loose:
+ *
+ * With retired taken branch:
+ * Effective P = P + 16 + X
+ * With any other event:
+ * Effective P = P + Y + X
+ *
+ * Where X is the number of taken branches due to interrupt
+ * skid. Skid is large.
+ *
+ * Where Y is the occurences of the event while BRS is
+ * capturing the lbr_nr entries.
+ *
+ * By using retired taken branches, we limit the impact on the
+ * Y variable. We know it cannot be more than the depth of
+ * BRS.
+ */
+ if (!amd_is_brs_event(event))
+ return -EINVAL;
+
+ /*
+ * BRS implementation does not work with frequency mode
+ * reprogramming of the period.
+ */
+ if (event->attr.freq)
+ return -EINVAL;
+ /*
+ * The kernel subtracts BRS depth from period, so it must
+ * be big enough.
+ */
+ if (event->attr.sample_period <= x86_pmu.lbr_nr)
+ return -EINVAL;
+
+ /*
+ * Check if we can allow PERF_SAMPLE_BRANCH_STACK
+ */
+ ret = amd_brs_setup_filter(event);
+
+ /* only set in case of success */
+ if (!ret)
+ event->hw.flags |= PERF_X86_EVENT_AMD_BRS;
+
+ return ret;
+}
+
/* tos = top of stack, i.e., last valid entry written */
static inline int amd_brs_get_tos(union amd_debug_extn_cfg *cfg)
{
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index 9ac3718410ce..8b70237c33f7 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -330,16 +330,10 @@ static inline bool amd_is_pair_event_code(struct hw_perf_event *hwc)
}
}
-#define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */
-static inline int amd_is_brs_event(struct perf_event *e)
-{
- return (e->hw.config & AMD64_RAW_EVENT_MASK) == AMD_FAM19H_BRS_EVENT;
-}
+DEFINE_STATIC_CALL_RET0(amd_pmu_branch_hw_config, *x86_pmu.hw_config);
static int amd_core_hw_config(struct perf_event *event)
{
- int ret = 0;
-
if (event->attr.exclude_host && event->attr.exclude_guest)
/*
* When HO == GO == 1 the hardware treats that as GO == HO == 0
@@ -356,66 +350,10 @@ static int amd_core_hw_config(struct perf_event *event)
if ((x86_pmu.flags & PMU_FL_PAIR) && amd_is_pair_event_code(&event->hw))
event->hw.flags |= PERF_X86_EVENT_PAIR;
- /*
- * if branch stack is requested
- */
- if (has_branch_stack(event)) {
- /*
- * Due to interrupt holding, BRS is not recommended in
- * counting mode.
- */
- if (!is_sampling_event(event))
- return -EINVAL;
+ if (has_branch_stack(event))
+ return static_call(amd_pmu_branch_hw_config)(event);
- /*
- * Due to the way BRS operates by holding the interrupt until
- * lbr_nr entries have been captured, it does not make sense
- * to allow sampling on BRS with an event that does not match
- * what BRS is capturing, i.e., retired taken branches.
- * Otherwise the correlation with the event's period is even
- * more loose:
- *
- * With retired taken branch:
- * Effective P = P + 16 + X
- * With any other event:
- * Effective P = P + Y + X
- *
- * Where X is the number of taken branches due to interrupt
- * skid. Skid is large.
- *
- * Where Y is the occurences of the event while BRS is
- * capturing the lbr_nr entries.
- *
- * By using retired taken branches, we limit the impact on the
- * Y variable. We know it cannot be more than the depth of
- * BRS.
- */
- if (!amd_is_brs_event(event))
- return -EINVAL;
-
- /*
- * BRS implementation does not work with frequency mode
- * reprogramming of the period.
- */
- if (event->attr.freq)
- return -EINVAL;
- /*
- * The kernel subtracts BRS depth from period, so it must
- * be big enough.
- */
- if (event->attr.sample_period <= x86_pmu.lbr_nr)
- return -EINVAL;
-
- /*
- * Check if we can allow PERF_SAMPLE_BRANCH_STACK
- */
- ret = amd_brs_setup_filter(event);
-
- /* only set in case of success */
- if (!ret)
- event->hw.flags |= PERF_X86_EVENT_AMD_BRS;
- }
- return ret;
+ return 0;
}
static inline int amd_is_nb_event(struct hw_perf_event *hwc)
@@ -582,8 +520,14 @@ static struct amd_nb *amd_alloc_nb(int cpu)
return nb;
}
+typedef void (amd_pmu_branch_reset_t)(void);
+DEFINE_STATIC_CALL_NULL(amd_pmu_branch_reset, amd_pmu_branch_reset_t);
+
static void amd_pmu_cpu_reset(int cpu)
{
+ if (x86_pmu.lbr_nr)
+ static_call(amd_pmu_branch_reset)();
+
if (x86_pmu.version < 2)
return;
@@ -598,16 +542,24 @@ static int amd_pmu_cpu_prepare(int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+ cpuc->lbr_sel = kzalloc_node(sizeof(struct er_account), GFP_KERNEL,
+ cpu_to_node(cpu));
+ if (!cpuc->lbr_sel)
+ return -ENOMEM;
+
WARN_ON_ONCE(cpuc->amd_nb);
if (!x86_pmu.amd_nb_constraints)
return 0;
cpuc->amd_nb = amd_alloc_nb(cpu);
- if (!cpuc->amd_nb)
- return -ENOMEM;
+ if (cpuc->amd_nb)
+ return 0;
- return 0;
+ kfree(cpuc->lbr_sel);
+ cpuc->lbr_sel = NULL;
+
+ return -ENOMEM;
}
static void amd_pmu_cpu_starting(int cpu)
@@ -640,19 +592,19 @@ static void amd_pmu_cpu_starting(int cpu)
cpuc->amd_nb->nb_id = nb_id;
cpuc->amd_nb->refcnt++;
- amd_brs_reset();
amd_pmu_cpu_reset(cpu);
}
static void amd_pmu_cpu_dead(int cpu)
{
- struct cpu_hw_events *cpuhw;
+ struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu);
+
+ kfree(cpuhw->lbr_sel);
+ cpuhw->lbr_sel = NULL;
if (!x86_pmu.amd_nb_constraints)
return;
- cpuhw = &per_cpu(cpu_hw_events, cpu);
-
if (cpuhw->amd_nb) {
struct amd_nb *nb = cpuhw->amd_nb;
@@ -677,7 +629,7 @@ static inline u64 amd_pmu_get_global_status(void)
/* PerfCntrGlobalStatus is read-only */
rdmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, status);
- return status & amd_pmu_global_cntr_mask;
+ return status;
}
static inline void amd_pmu_ack_global_status(u64 status)
@@ -688,8 +640,6 @@ static inline void amd_pmu_ack_global_status(u64 status)
* clears the same bit in PerfCntrGlobalStatus
*/
- /* Only allow modifications to PerfCntrGlobalStatus.PerfCntrOvfl */
- status &= amd_pmu_global_cntr_mask;
wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, status);
}
@@ -799,11 +749,17 @@ static void amd_pmu_v2_enable_event(struct perf_event *event)
__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
}
-static void amd_pmu_v2_enable_all(int added)
+static __always_inline void amd_pmu_core_enable_all(void)
{
amd_pmu_set_global_ctl(amd_pmu_global_cntr_mask);
}
+static void amd_pmu_v2_enable_all(int added)
+{
+ amd_pmu_lbr_enable_all();
+ amd_pmu_core_enable_all();
+}
+
static void amd_pmu_disable_event(struct perf_event *event)
{
x86_pmu_disable_event(event);
@@ -828,23 +784,32 @@ static void amd_pmu_disable_all(void)
amd_pmu_check_overflow();
}
-static void amd_pmu_v2_disable_all(void)
+static __always_inline void amd_pmu_core_disable_all(void)
{
- /* Disable all PMCs */
amd_pmu_set_global_ctl(0);
+}
+
+static void amd_pmu_v2_disable_all(void)
+{
+ amd_pmu_core_disable_all();
+ amd_pmu_lbr_disable_all();
amd_pmu_check_overflow();
}
+DEFINE_STATIC_CALL_NULL(amd_pmu_branch_add, *x86_pmu.add);
+
static void amd_pmu_add_event(struct perf_event *event)
{
if (needs_branch_stack(event))
- amd_pmu_brs_add(event);
+ static_call(amd_pmu_branch_add)(event);
}
+DEFINE_STATIC_CALL_NULL(amd_pmu_branch_del, *x86_pmu.del);
+
static void amd_pmu_del_event(struct perf_event *event)
{
if (needs_branch_stack(event))
- amd_pmu_brs_del(event);
+ static_call(amd_pmu_branch_del)(event);
}
/*
@@ -930,8 +895,8 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
pmu_enabled = cpuc->enabled;
cpuc->enabled = 0;
- /* Stop counting */
- amd_pmu_v2_disable_all();
+ /* Stop counting but do not disable LBR */
+ amd_pmu_core_disable_all();
status = amd_pmu_get_global_status();
@@ -939,6 +904,12 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
if (!status)
goto done;
+ /* Read branch records before unfreezing */
+ if (status & GLOBAL_STATUS_LBRS_FROZEN) {
+ amd_pmu_lbr_read();
+ status &= ~GLOBAL_STATUS_LBRS_FROZEN;
+ }
+
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
if (!test_bit(idx, cpuc->active_mask))
continue;
@@ -958,6 +929,11 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
if (!x86_perf_event_set_period(event))
continue;
+ if (has_branch_stack(event)) {
+ data.br_stack = &cpuc->lbr_stack;
+ data.sample_flags |= PERF_SAMPLE_BRANCH_STACK;
+ }
+
if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);
@@ -971,7 +947,7 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
*/
WARN_ON(status > 0);
- /* Clear overflow bits */
+ /* Clear overflow and freeze bits */
amd_pmu_ack_global_status(~status);
/*
@@ -985,7 +961,7 @@ done:
/* Resume counting only if PMU is active */
if (pmu_enabled)
- amd_pmu_v2_enable_all(0);
+ amd_pmu_core_enable_all();
return amd_pmu_adjust_nmi_window(handled);
}
@@ -1248,23 +1224,14 @@ static ssize_t amd_event_sysfs_show(char *page, u64 config)
return x86_event_sysfs_show(page, config, event);
}
-static void amd_pmu_sched_task(struct perf_event_context *ctx,
- bool sched_in)
-{
- if (sched_in && x86_pmu.lbr_nr)
- amd_pmu_brs_sched_task(ctx, sched_in);
-}
-
-static u64 amd_pmu_limit_period(struct perf_event *event, u64 left)
+static void amd_pmu_limit_period(struct perf_event *event, s64 *left)
{
/*
* Decrease period by the depth of the BRS feature to get the last N
* taken branches and approximate the desired period
*/
- if (has_branch_stack(event) && left > x86_pmu.lbr_nr)
- left -= x86_pmu.lbr_nr;
-
- return left;
+ if (has_branch_stack(event) && *left > x86_pmu.lbr_nr)
+ *left -= x86_pmu.lbr_nr;
}
static __initconst const struct x86_pmu amd_pmu = {
@@ -1311,23 +1278,25 @@ static ssize_t branches_show(struct device *cdev,
static DEVICE_ATTR_RO(branches);
-static struct attribute *amd_pmu_brs_attrs[] = {
+static struct attribute *amd_pmu_branches_attrs[] = {
&dev_attr_branches.attr,
NULL,
};
static umode_t
-amd_brs_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+amd_branches_is_visible(struct kobject *kobj, struct attribute *attr, int i)
{
return x86_pmu.lbr_nr ? attr->mode : 0;
}
-static struct attribute_group group_caps_amd_brs = {
+static struct attribute_group group_caps_amd_branches = {
.name = "caps",
- .attrs = amd_pmu_brs_attrs,
- .is_visible = amd_brs_is_visible,
+ .attrs = amd_pmu_branches_attrs,
+ .is_visible = amd_branches_is_visible,
};
+#ifdef CONFIG_PERF_EVENTS_AMD_BRS
+
EVENT_ATTR_STR(branch-brs, amd_branch_brs,
"event=" __stringify(AMD_FAM19H_BRS_EVENT)"\n");
@@ -1336,15 +1305,26 @@ static struct attribute *amd_brs_events_attrs[] = {
NULL,
};
+static umode_t
+amd_brs_is_visible(struct kobject *kobj, struct attribute *attr, int i)
+{
+ return static_cpu_has(X86_FEATURE_BRS) && x86_pmu.lbr_nr ?
+ attr->mode : 0;
+}
+
static struct attribute_group group_events_amd_brs = {
.name = "events",
.attrs = amd_brs_events_attrs,
.is_visible = amd_brs_is_visible,
};
+#endif /* CONFIG_PERF_EVENTS_AMD_BRS */
+
static const struct attribute_group *amd_attr_update[] = {
- &group_caps_amd_brs,
+ &group_caps_amd_branches,
+#ifdef CONFIG_PERF_EVENTS_AMD_BRS
&group_events_amd_brs,
+#endif
NULL,
};
@@ -1421,13 +1401,27 @@ static int __init amd_core_pmu_init(void)
x86_pmu.flags |= PMU_FL_PAIR;
}
- /*
- * BRS requires special event constraints and flushing on ctxsw.
- */
- if (boot_cpu_data.x86 >= 0x19 && !amd_brs_init()) {
+ /* LBR and BRS are mutually exclusive features */
+ if (!amd_pmu_lbr_init()) {
+ /* LBR requires flushing on context switch */
+ x86_pmu.sched_task = amd_pmu_lbr_sched_task;
+ static_call_update(amd_pmu_branch_hw_config, amd_pmu_lbr_hw_config);
+ static_call_update(amd_pmu_branch_reset, amd_pmu_lbr_reset);
+ static_call_update(amd_pmu_branch_add, amd_pmu_lbr_add);
+ static_call_update(amd_pmu_branch_del, amd_pmu_lbr_del);
+ } else if (!amd_brs_init()) {
+ /*
+ * BRS requires special event constraints and flushing on ctxsw.
+ */
x86_pmu.get_event_constraints = amd_get_event_constraints_f19h;
- x86_pmu.sched_task = amd_pmu_sched_task;
+ x86_pmu.sched_task = amd_pmu_brs_sched_task;
x86_pmu.limit_period = amd_pmu_limit_period;
+
+ static_call_update(amd_pmu_branch_hw_config, amd_brs_hw_config);
+ static_call_update(amd_pmu_branch_reset, amd_brs_reset);
+ static_call_update(amd_pmu_branch_add, amd_pmu_brs_add);
+ static_call_update(amd_pmu_branch_del, amd_pmu_brs_del);
+
/*
* put_event_constraints callback same as Fam17h, set above
*/
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index c251bc44c088..3271735f0070 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -300,16 +300,6 @@ static int perf_ibs_init(struct perf_event *event)
hwc->config_base = perf_ibs->msr;
hwc->config = config;
- /*
- * rip recorded by IbsOpRip will not be consistent with rsp and rbp
- * recorded as part of interrupt regs. Thus we need to use rip from
- * interrupt regs while unwinding call stack. Setting _EARLY flag
- * makes sure we unwind call-stack before perf sample rip is set to
- * IbsOpRip.
- */
- if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
- event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY;
-
return 0;
}
@@ -688,6 +678,339 @@ static struct perf_ibs perf_ibs_op = {
.get_count = get_ibs_op_count,
};
+static void perf_ibs_get_mem_op(union ibs_op_data3 *op_data3,
+ struct perf_sample_data *data)
+{
+ union perf_mem_data_src *data_src = &data->data_src;
+
+ data_src->mem_op = PERF_MEM_OP_NA;
+
+ if (op_data3->ld_op)
+ data_src->mem_op = PERF_MEM_OP_LOAD;
+ else if (op_data3->st_op)
+ data_src->mem_op = PERF_MEM_OP_STORE;
+}
+
+/*
+ * Processors having CPUID_Fn8000001B_EAX[11] aka IBS_CAPS_ZEN4 has
+ * more fine granular DataSrc encodings. Others have coarse.
+ */
+static u8 perf_ibs_data_src(union ibs_op_data2 *op_data2)
+{
+ if (ibs_caps & IBS_CAPS_ZEN4)
+ return (op_data2->data_src_hi << 3) | op_data2->data_src_lo;
+
+ return op_data2->data_src_lo;
+}
+
+static void perf_ibs_get_mem_lvl(union ibs_op_data2 *op_data2,
+ union ibs_op_data3 *op_data3,
+ struct perf_sample_data *data)
+{
+ union perf_mem_data_src *data_src = &data->data_src;
+ u8 ibs_data_src = perf_ibs_data_src(op_data2);
+
+ data_src->mem_lvl = 0;
+
+ /*
+ * DcMiss, L2Miss, DataSrc, DcMissLat etc. are all invalid for Uncached
+ * memory accesses. So, check DcUcMemAcc bit early.
+ */
+ if (op_data3->dc_uc_mem_acc && ibs_data_src != IBS_DATA_SRC_EXT_IO) {
+ data_src->mem_lvl = PERF_MEM_LVL_UNC | PERF_MEM_LVL_HIT;
+ return;
+ }
+
+ /* L1 Hit */
+ if (op_data3->dc_miss == 0) {
+ data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
+ return;
+ }
+
+ /* L2 Hit */
+ if (op_data3->l2_miss == 0) {
+ /* Erratum #1293 */
+ if (boot_cpu_data.x86 != 0x19 || boot_cpu_data.x86_model > 0xF ||
+ !(op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc)) {
+ data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
+ return;
+ }
+ }
+
+ /*
+ * OP_DATA2 is valid only for load ops. Skip all checks which
+ * uses OP_DATA2[DataSrc].
+ */
+ if (data_src->mem_op != PERF_MEM_OP_LOAD)
+ goto check_mab;
+
+ /* L3 Hit */
+ if (ibs_caps & IBS_CAPS_ZEN4) {
+ if (ibs_data_src == IBS_DATA_SRC_EXT_LOC_CACHE) {
+ data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
+ return;
+ }
+ } else {
+ if (ibs_data_src == IBS_DATA_SRC_LOC_CACHE) {
+ data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_REM_CCE1 |
+ PERF_MEM_LVL_HIT;
+ return;
+ }
+ }
+
+ /* A peer cache in a near CCX */
+ if (ibs_caps & IBS_CAPS_ZEN4 &&
+ ibs_data_src == IBS_DATA_SRC_EXT_NEAR_CCX_CACHE) {
+ data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1 | PERF_MEM_LVL_HIT;
+ return;
+ }
+
+ /* A peer cache in a far CCX */
+ if (ibs_caps & IBS_CAPS_ZEN4) {
+ if (ibs_data_src == IBS_DATA_SRC_EXT_FAR_CCX_CACHE) {
+ data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2 | PERF_MEM_LVL_HIT;
+ return;
+ }
+ } else {
+ if (ibs_data_src == IBS_DATA_SRC_REM_CACHE) {
+ data_src->mem_lvl = PERF_MEM_LVL_REM_CCE2 | PERF_MEM_LVL_HIT;
+ return;
+ }
+ }
+
+ /* DRAM */
+ if (ibs_data_src == IBS_DATA_SRC_EXT_DRAM) {
+ if (op_data2->rmt_node == 0)
+ data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
+ else
+ data_src->mem_lvl = PERF_MEM_LVL_REM_RAM1 | PERF_MEM_LVL_HIT;
+ return;
+ }
+
+ /* PMEM */
+ if (ibs_caps & IBS_CAPS_ZEN4 && ibs_data_src == IBS_DATA_SRC_EXT_PMEM) {
+ data_src->mem_lvl_num = PERF_MEM_LVLNUM_PMEM;
+ if (op_data2->rmt_node) {
+ data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
+ /* IBS doesn't provide Remote socket detail */
+ data_src->mem_hops = PERF_MEM_HOPS_1;
+ }
+ return;
+ }
+
+ /* Extension Memory */
+ if (ibs_caps & IBS_CAPS_ZEN4 &&
+ ibs_data_src == IBS_DATA_SRC_EXT_EXT_MEM) {
+ data_src->mem_lvl_num = PERF_MEM_LVLNUM_EXTN_MEM;
+ if (op_data2->rmt_node) {
+ data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
+ /* IBS doesn't provide Remote socket detail */
+ data_src->mem_hops = PERF_MEM_HOPS_1;
+ }
+ return;
+ }
+
+ /* IO */
+ if (ibs_data_src == IBS_DATA_SRC_EXT_IO) {
+ data_src->mem_lvl = PERF_MEM_LVL_IO;
+ data_src->mem_lvl_num = PERF_MEM_LVLNUM_IO;
+ if (op_data2->rmt_node) {
+ data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
+ /* IBS doesn't provide Remote socket detail */
+ data_src->mem_hops = PERF_MEM_HOPS_1;
+ }
+ return;
+ }
+
+check_mab:
+ /*
+ * MAB (Miss Address Buffer) Hit. MAB keeps track of outstanding
+ * DC misses. However, such data may come from any level in mem
+ * hierarchy. IBS provides detail about both MAB as well as actual
+ * DataSrc simultaneously. Prioritize DataSrc over MAB, i.e. set
+ * MAB only when IBS fails to provide DataSrc.
+ */
+ if (op_data3->dc_miss_no_mab_alloc) {
+ data_src->mem_lvl = PERF_MEM_LVL_LFB | PERF_MEM_LVL_HIT;
+ return;
+ }
+
+ data_src->mem_lvl = PERF_MEM_LVL_NA;
+}
+
+static bool perf_ibs_cache_hit_st_valid(void)
+{
+ /* 0: Uninitialized, 1: Valid, -1: Invalid */
+ static int cache_hit_st_valid;
+
+ if (unlikely(!cache_hit_st_valid)) {
+ if (boot_cpu_data.x86 == 0x19 &&
+ (boot_cpu_data.x86_model <= 0xF ||
+ (boot_cpu_data.x86_model >= 0x20 &&
+ boot_cpu_data.x86_model <= 0x5F))) {
+ cache_hit_st_valid = -1;
+ } else {
+ cache_hit_st_valid = 1;
+ }
+ }
+
+ return cache_hit_st_valid == 1;
+}
+
+static void perf_ibs_get_mem_snoop(union ibs_op_data2 *op_data2,
+ struct perf_sample_data *data)
+{
+ union perf_mem_data_src *data_src = &data->data_src;
+ u8 ibs_data_src;
+
+ data_src->mem_snoop = PERF_MEM_SNOOP_NA;
+
+ if (!perf_ibs_cache_hit_st_valid() ||
+ data_src->mem_op != PERF_MEM_OP_LOAD ||
+ data_src->mem_lvl & PERF_MEM_LVL_L1 ||
+ data_src->mem_lvl & PERF_MEM_LVL_L2 ||
+ op_data2->cache_hit_st)
+ return;
+
+ ibs_data_src = perf_ibs_data_src(op_data2);
+
+ if (ibs_caps & IBS_CAPS_ZEN4) {
+ if (ibs_data_src == IBS_DATA_SRC_EXT_LOC_CACHE ||
+ ibs_data_src == IBS_DATA_SRC_EXT_NEAR_CCX_CACHE ||
+ ibs_data_src == IBS_DATA_SRC_EXT_FAR_CCX_CACHE)
+ data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
+ } else if (ibs_data_src == IBS_DATA_SRC_LOC_CACHE) {
+ data_src->mem_snoop = PERF_MEM_SNOOP_HITM;
+ }
+}
+
+static void perf_ibs_get_tlb_lvl(union ibs_op_data3 *op_data3,
+ struct perf_sample_data *data)
+{
+ union perf_mem_data_src *data_src = &data->data_src;
+
+ data_src->mem_dtlb = PERF_MEM_TLB_NA;
+
+ if (!op_data3->dc_lin_addr_valid)
+ return;
+
+ if (!op_data3->dc_l1tlb_miss) {
+ data_src->mem_dtlb = PERF_MEM_TLB_L1 | PERF_MEM_TLB_HIT;
+ return;
+ }
+
+ if (!op_data3->dc_l2tlb_miss) {
+ data_src->mem_dtlb = PERF_MEM_TLB_L2 | PERF_MEM_TLB_HIT;
+ return;
+ }
+
+ data_src->mem_dtlb = PERF_MEM_TLB_L2 | PERF_MEM_TLB_MISS;
+}
+
+static void perf_ibs_get_mem_lock(union ibs_op_data3 *op_data3,
+ struct perf_sample_data *data)
+{
+ union perf_mem_data_src *data_src = &data->data_src;
+
+ data_src->mem_lock = PERF_MEM_LOCK_NA;
+
+ if (op_data3->dc_locked_op)
+ data_src->mem_lock = PERF_MEM_LOCK_LOCKED;
+}
+
+#define ibs_op_msr_idx(msr) (msr - MSR_AMD64_IBSOPCTL)
+
+static void perf_ibs_get_data_src(struct perf_ibs_data *ibs_data,
+ struct perf_sample_data *data,
+ union ibs_op_data2 *op_data2,
+ union ibs_op_data3 *op_data3)
+{
+ perf_ibs_get_mem_lvl(op_data2, op_data3, data);
+ perf_ibs_get_mem_snoop(op_data2, data);
+ perf_ibs_get_tlb_lvl(op_data3, data);
+ perf_ibs_get_mem_lock(op_data3, data);
+}
+
+static __u64 perf_ibs_get_op_data2(struct perf_ibs_data *ibs_data,
+ union ibs_op_data3 *op_data3)
+{
+ __u64 val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA2)];
+
+ /* Erratum #1293 */
+ if (boot_cpu_data.x86 == 0x19 && boot_cpu_data.x86_model <= 0xF &&
+ (op_data3->sw_pf || op_data3->dc_miss_no_mab_alloc)) {
+ /*
+ * OP_DATA2 has only two fields on Zen3: DataSrc and RmtNode.
+ * DataSrc=0 is 'No valid status' and RmtNode is invalid when
+ * DataSrc=0.
+ */
+ val = 0;
+ }
+ return val;
+}
+
+static void perf_ibs_parse_ld_st_data(__u64 sample_type,
+ struct perf_ibs_data *ibs_data,
+ struct perf_sample_data *data)
+{
+ union ibs_op_data3 op_data3;
+ union ibs_op_data2 op_data2;
+ union ibs_op_data op_data;
+
+ data->data_src.val = PERF_MEM_NA;
+ op_data3.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)];
+
+ perf_ibs_get_mem_op(&op_data3, data);
+ if (data->data_src.mem_op != PERF_MEM_OP_LOAD &&
+ data->data_src.mem_op != PERF_MEM_OP_STORE)
+ return;
+
+ op_data2.val = perf_ibs_get_op_data2(ibs_data, &op_data3);
+
+ if (sample_type & PERF_SAMPLE_DATA_SRC) {
+ perf_ibs_get_data_src(ibs_data, data, &op_data2, &op_data3);
+ data->sample_flags |= PERF_SAMPLE_DATA_SRC;
+ }
+
+ if (sample_type & PERF_SAMPLE_WEIGHT_TYPE && op_data3.dc_miss &&
+ data->data_src.mem_op == PERF_MEM_OP_LOAD) {
+ op_data.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA)];
+
+ if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) {
+ data->weight.var1_dw = op_data3.dc_miss_lat;
+ data->weight.var2_w = op_data.tag_to_ret_ctr;
+ } else if (sample_type & PERF_SAMPLE_WEIGHT) {
+ data->weight.full = op_data3.dc_miss_lat;
+ }
+ data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
+ }
+
+ if (sample_type & PERF_SAMPLE_ADDR && op_data3.dc_lin_addr_valid) {
+ data->addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCLINAD)];
+ data->sample_flags |= PERF_SAMPLE_ADDR;
+ }
+
+ if (sample_type & PERF_SAMPLE_PHYS_ADDR && op_data3.dc_phy_addr_valid) {
+ data->phys_addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCPHYSAD)];
+ data->sample_flags |= PERF_SAMPLE_PHYS_ADDR;
+ }
+}
+
+static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type,
+ int check_rip)
+{
+ if (sample_type & PERF_SAMPLE_RAW ||
+ (perf_ibs == &perf_ibs_op &&
+ (sample_type & PERF_SAMPLE_DATA_SRC ||
+ sample_type & PERF_SAMPLE_WEIGHT_TYPE ||
+ sample_type & PERF_SAMPLE_ADDR ||
+ sample_type & PERF_SAMPLE_PHYS_ADDR)))
+ return perf_ibs->offset_max;
+ else if (check_rip)
+ return 3;
+ return 1;
+}
+
static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
{
struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
@@ -735,12 +1058,9 @@ fail:
size = 1;
offset = 1;
check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK));
- if (event->attr.sample_type & PERF_SAMPLE_RAW)
- offset_max = perf_ibs->offset_max;
- else if (check_rip)
- offset_max = 3;
- else
- offset_max = 1;
+
+ offset_max = perf_ibs_get_offset_max(perf_ibs, event->attr.sample_type, check_rip);
+
do {
rdmsrl(msr + offset, *buf++);
size++;
@@ -791,15 +1111,21 @@ fail:
},
};
data.raw = &raw;
+ data.sample_flags |= PERF_SAMPLE_RAW;
}
+ if (perf_ibs == &perf_ibs_op)
+ perf_ibs_parse_ld_st_data(event->attr.sample_type, &ibs_data, &data);
+
/*
* rip recorded by IbsOpRip will not be consistent with rsp and rbp
* recorded as part of interrupt regs. Thus we need to use rip from
* interrupt regs while unwinding call stack.
*/
- if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+ if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
data.callchain = perf_callchain(event, iregs);
+ data.sample_flags |= PERF_SAMPLE_CALLCHAIN;
+ }
throttle = perf_event_overflow(event, &data, &regs);
out:
diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c
new file mode 100644
index 000000000000..38a75216c12c
--- /dev/null
+++ b/arch/x86/events/amd/lbr.c
@@ -0,0 +1,439 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/perf_event.h>
+#include <asm/perf_event.h>
+
+#include "../perf_event.h"
+
+/* LBR Branch Select valid bits */
+#define LBR_SELECT_MASK 0x1ff
+
+/*
+ * LBR Branch Select filter bits which when set, ensures that the
+ * corresponding type of branches are not recorded
+ */
+#define LBR_SELECT_KERNEL 0 /* Branches ending in CPL = 0 */
+#define LBR_SELECT_USER 1 /* Branches ending in CPL > 0 */
+#define LBR_SELECT_JCC 2 /* Conditional branches */
+#define LBR_SELECT_CALL_NEAR_REL 3 /* Near relative calls */
+#define LBR_SELECT_CALL_NEAR_IND 4 /* Indirect relative calls */
+#define LBR_SELECT_RET_NEAR 5 /* Near returns */
+#define LBR_SELECT_JMP_NEAR_IND 6 /* Near indirect jumps (excl. calls and returns) */
+#define LBR_SELECT_JMP_NEAR_REL 7 /* Near relative jumps (excl. calls) */
+#define LBR_SELECT_FAR_BRANCH 8 /* Far branches */
+
+#define LBR_KERNEL BIT(LBR_SELECT_KERNEL)
+#define LBR_USER BIT(LBR_SELECT_USER)
+#define LBR_JCC BIT(LBR_SELECT_JCC)
+#define LBR_REL_CALL BIT(LBR_SELECT_CALL_NEAR_REL)
+#define LBR_IND_CALL BIT(LBR_SELECT_CALL_NEAR_IND)
+#define LBR_RETURN BIT(LBR_SELECT_RET_NEAR)
+#define LBR_REL_JMP BIT(LBR_SELECT_JMP_NEAR_REL)
+#define LBR_IND_JMP BIT(LBR_SELECT_JMP_NEAR_IND)
+#define LBR_FAR BIT(LBR_SELECT_FAR_BRANCH)
+#define LBR_NOT_SUPP -1 /* unsupported filter */
+#define LBR_IGNORE 0
+
+#define LBR_ANY \
+ (LBR_JCC | LBR_REL_CALL | LBR_IND_CALL | LBR_RETURN | \
+ LBR_REL_JMP | LBR_IND_JMP | LBR_FAR)
+
+struct branch_entry {
+ union {
+ struct {
+ u64 ip:58;
+ u64 ip_sign_ext:5;
+ u64 mispredict:1;
+ } split;
+ u64 full;
+ } from;
+
+ union {
+ struct {
+ u64 ip:58;
+ u64 ip_sign_ext:3;
+ u64 reserved:1;
+ u64 spec:1;
+ u64 valid:1;
+ } split;
+ u64 full;
+ } to;
+};
+
+static __always_inline void amd_pmu_lbr_set_from(unsigned int idx, u64 val)
+{
+ wrmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2, val);
+}
+
+static __always_inline void amd_pmu_lbr_set_to(unsigned int idx, u64 val)
+{
+ wrmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2 + 1, val);
+}
+
+static __always_inline u64 amd_pmu_lbr_get_from(unsigned int idx)
+{
+ u64 val;
+
+ rdmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2, val);
+
+ return val;
+}
+
+static __always_inline u64 amd_pmu_lbr_get_to(unsigned int idx)
+{
+ u64 val;
+
+ rdmsrl(MSR_AMD_SAMP_BR_FROM + idx * 2 + 1, val);
+
+ return val;
+}
+
+static __always_inline u64 sign_ext_branch_ip(u64 ip)
+{
+ u32 shift = 64 - boot_cpu_data.x86_virt_bits;
+
+ return (u64)(((s64)ip << shift) >> shift);
+}
+
+static void amd_pmu_lbr_filter(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int br_sel = cpuc->br_sel, offset, type, i, j;
+ bool compress = false;
+ bool fused_only = false;
+ u64 from, to;
+
+ /* If sampling all branches, there is nothing to filter */
+ if (((br_sel & X86_BR_ALL) == X86_BR_ALL) &&
+ ((br_sel & X86_BR_TYPE_SAVE) != X86_BR_TYPE_SAVE))
+ fused_only = true;
+
+ for (i = 0; i < cpuc->lbr_stack.nr; i++) {
+ from = cpuc->lbr_entries[i].from;
+ to = cpuc->lbr_entries[i].to;
+ type = branch_type_fused(from, to, 0, &offset);
+
+ /*
+ * Adjust the branch from address in case of instruction
+ * fusion where it points to an instruction preceding the
+ * actual branch
+ */
+ if (offset) {
+ cpuc->lbr_entries[i].from += offset;
+ if (fused_only)
+ continue;
+ }
+
+ /* If type does not correspond, then discard */
+ if (type == X86_BR_NONE || (br_sel & type) != type) {
+ cpuc->lbr_entries[i].from = 0; /* mark invalid */
+ compress = true;
+ }
+
+ if ((br_sel & X86_BR_TYPE_SAVE) == X86_BR_TYPE_SAVE)
+ cpuc->lbr_entries[i].type = common_branch_type(type);
+ }
+
+ if (!compress)
+ return;
+
+ /* Remove all invalid entries */
+ for (i = 0; i < cpuc->lbr_stack.nr; ) {
+ if (!cpuc->lbr_entries[i].from) {
+ j = i;
+ while (++j < cpuc->lbr_stack.nr)
+ cpuc->lbr_entries[j - 1] = cpuc->lbr_entries[j];
+ cpuc->lbr_stack.nr--;
+ if (!cpuc->lbr_entries[i].from)
+ continue;
+ }
+ i++;
+ }
+}
+
+static const int lbr_spec_map[PERF_BR_SPEC_MAX] = {
+ PERF_BR_SPEC_NA,
+ PERF_BR_SPEC_WRONG_PATH,
+ PERF_BR_NON_SPEC_CORRECT_PATH,
+ PERF_BR_SPEC_CORRECT_PATH,
+};
+
+void amd_pmu_lbr_read(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct perf_branch_entry *br = cpuc->lbr_entries;
+ struct branch_entry entry;
+ int out = 0, idx, i;
+
+ if (!cpuc->lbr_users)
+ return;
+
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ entry.from.full = amd_pmu_lbr_get_from(i);
+ entry.to.full = amd_pmu_lbr_get_to(i);
+
+ /*
+ * Check if a branch has been logged; if valid = 0, spec = 0
+ * then no branch was recorded
+ */
+ if (!entry.to.split.valid && !entry.to.split.spec)
+ continue;
+
+ perf_clear_branch_entry_bitfields(br + out);
+
+ br[out].from = sign_ext_branch_ip(entry.from.split.ip);
+ br[out].to = sign_ext_branch_ip(entry.to.split.ip);
+ br[out].mispred = entry.from.split.mispredict;
+ br[out].predicted = !br[out].mispred;
+
+ /*
+ * Set branch speculation information using the status of
+ * the valid and spec bits.
+ *
+ * When valid = 0, spec = 0, no branch was recorded and the
+ * entry is discarded as seen above.
+ *
+ * When valid = 0, spec = 1, the recorded branch was
+ * speculative but took the wrong path.
+ *
+ * When valid = 1, spec = 0, the recorded branch was
+ * non-speculative but took the correct path.
+ *
+ * When valid = 1, spec = 1, the recorded branch was
+ * speculative and took the correct path
+ */
+ idx = (entry.to.split.valid << 1) | entry.to.split.spec;
+ br[out].spec = lbr_spec_map[idx];
+ out++;
+ }
+
+ cpuc->lbr_stack.nr = out;
+
+ /*
+ * Internal register renaming always ensures that LBR From[0] and
+ * LBR To[0] always represent the TOS
+ */
+ cpuc->lbr_stack.hw_idx = 0;
+
+ /* Perform further software filtering */
+ amd_pmu_lbr_filter();
+}
+
+static const int lbr_select_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
+ [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER,
+ [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL,
+ [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGNORE,
+
+ [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY,
+ [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL,
+ [PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT] = LBR_NOT_SUPP,
+ [PERF_SAMPLE_BRANCH_IN_TX_SHIFT] = LBR_NOT_SUPP,
+ [PERF_SAMPLE_BRANCH_NO_TX_SHIFT] = LBR_NOT_SUPP,
+ [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC,
+
+ [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_NOT_SUPP,
+ [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP,
+ [PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_REL_CALL,
+
+ [PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT] = LBR_NOT_SUPP,
+ [PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT] = LBR_NOT_SUPP,
+};
+
+static int amd_pmu_lbr_setup_filter(struct perf_event *event)
+{
+ struct hw_perf_event_extra *reg = &event->hw.branch_reg;
+ u64 br_type = event->attr.branch_sample_type;
+ u64 mask = 0, v;
+ int i;
+
+ /* No LBR support */
+ if (!x86_pmu.lbr_nr)
+ return -EOPNOTSUPP;
+
+ if (br_type & PERF_SAMPLE_BRANCH_USER)
+ mask |= X86_BR_USER;
+
+ if (br_type & PERF_SAMPLE_BRANCH_KERNEL)
+ mask |= X86_BR_KERNEL;
+
+ /* Ignore BRANCH_HV here */
+
+ if (br_type & PERF_SAMPLE_BRANCH_ANY)
+ mask |= X86_BR_ANY;
+
+ if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL)
+ mask |= X86_BR_ANY_CALL;
+
+ if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN)
+ mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET;
+
+ if (br_type & PERF_SAMPLE_BRANCH_IND_CALL)
+ mask |= X86_BR_IND_CALL;
+
+ if (br_type & PERF_SAMPLE_BRANCH_COND)
+ mask |= X86_BR_JCC;
+
+ if (br_type & PERF_SAMPLE_BRANCH_IND_JUMP)
+ mask |= X86_BR_IND_JMP;
+
+ if (br_type & PERF_SAMPLE_BRANCH_CALL)
+ mask |= X86_BR_CALL | X86_BR_ZERO_CALL;
+
+ if (br_type & PERF_SAMPLE_BRANCH_TYPE_SAVE)
+ mask |= X86_BR_TYPE_SAVE;
+
+ reg->reg = mask;
+ mask = 0;
+
+ for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) {
+ if (!(br_type & BIT_ULL(i)))
+ continue;
+
+ v = lbr_select_map[i];
+ if (v == LBR_NOT_SUPP)
+ return -EOPNOTSUPP;
+
+ if (v != LBR_IGNORE)
+ mask |= v;
+ }
+
+ /* Filter bits operate in suppress mode */
+ reg->config = mask ^ LBR_SELECT_MASK;
+
+ return 0;
+}
+
+int amd_pmu_lbr_hw_config(struct perf_event *event)
+{
+ int ret = 0;
+
+ /* LBR is not recommended in counting mode */
+ if (!is_sampling_event(event))
+ return -EINVAL;
+
+ ret = amd_pmu_lbr_setup_filter(event);
+ if (!ret)
+ event->attach_state |= PERF_ATTACH_SCHED_CB;
+
+ return ret;
+}
+
+void amd_pmu_lbr_reset(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ int i;
+
+ if (!x86_pmu.lbr_nr)
+ return;
+
+ /* Reset all branch records individually */
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ amd_pmu_lbr_set_from(i, 0);
+ amd_pmu_lbr_set_to(i, 0);
+ }
+
+ cpuc->last_task_ctx = NULL;
+ cpuc->last_log_id = 0;
+ wrmsrl(MSR_AMD64_LBR_SELECT, 0);
+}
+
+void amd_pmu_lbr_add(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event_extra *reg = &event->hw.branch_reg;
+
+ if (!x86_pmu.lbr_nr)
+ return;
+
+ if (has_branch_stack(event)) {
+ cpuc->lbr_select = 1;
+ cpuc->lbr_sel->config = reg->config;
+ cpuc->br_sel = reg->reg;
+ }
+
+ perf_sched_cb_inc(event->ctx->pmu);
+
+ if (!cpuc->lbr_users++ && !event->total_time_running)
+ amd_pmu_lbr_reset();
+}
+
+void amd_pmu_lbr_del(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ if (!x86_pmu.lbr_nr)
+ return;
+
+ if (has_branch_stack(event))
+ cpuc->lbr_select = 0;
+
+ cpuc->lbr_users--;
+ WARN_ON_ONCE(cpuc->lbr_users < 0);
+ perf_sched_cb_dec(event->ctx->pmu);
+}
+
+void amd_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+ /*
+ * A context switch can flip the address space and LBR entries are
+ * not tagged with an identifier. Hence, branches cannot be resolved
+ * from the old address space and the LBR records should be wiped.
+ */
+ if (cpuc->lbr_users && sched_in)
+ amd_pmu_lbr_reset();
+}
+
+void amd_pmu_lbr_enable_all(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ u64 lbr_select, dbg_ctl, dbg_extn_cfg;
+
+ if (!cpuc->lbr_users || !x86_pmu.lbr_nr)
+ return;
+
+ /* Set hardware branch filter */
+ if (cpuc->lbr_select) {
+ lbr_select = cpuc->lbr_sel->config & LBR_SELECT_MASK;
+ wrmsrl(MSR_AMD64_LBR_SELECT, lbr_select);
+ }
+
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl);
+ rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg);
+
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
+ wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg | DBG_EXTN_CFG_LBRV2EN);
+}
+
+void amd_pmu_lbr_disable_all(void)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ u64 dbg_ctl, dbg_extn_cfg;
+
+ if (!cpuc->lbr_users || !x86_pmu.lbr_nr)
+ return;
+
+ rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg);
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl);
+
+ wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg & ~DBG_EXTN_CFG_LBRV2EN);
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl & ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
+}
+
+__init int amd_pmu_lbr_init(void)
+{
+ union cpuid_0x80000022_ebx ebx;
+
+ if (x86_pmu.version < 2 || !boot_cpu_has(X86_FEATURE_AMD_LBR_V2))
+ return -EOPNOTSUPP;
+
+ /* Set number of entries */
+ ebx.full = cpuid_ebx(EXT_PERFMON_DEBUG_FEATURES);
+ x86_pmu.lbr_nr = ebx.split.lbr_v2_stack_sz;
+
+ pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr);
+
+ return 0;
+}
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index f969410d0c90..b30b8bbcd1e2 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -72,6 +72,10 @@ DEFINE_STATIC_CALL_NULL(x86_pmu_add, *x86_pmu.add);
DEFINE_STATIC_CALL_NULL(x86_pmu_del, *x86_pmu.del);
DEFINE_STATIC_CALL_NULL(x86_pmu_read, *x86_pmu.read);
+DEFINE_STATIC_CALL_NULL(x86_pmu_set_period, *x86_pmu.set_period);
+DEFINE_STATIC_CALL_NULL(x86_pmu_update, *x86_pmu.update);
+DEFINE_STATIC_CALL_NULL(x86_pmu_limit_period, *x86_pmu.limit_period);
+
DEFINE_STATIC_CALL_NULL(x86_pmu_schedule_events, *x86_pmu.schedule_events);
DEFINE_STATIC_CALL_NULL(x86_pmu_get_event_constraints, *x86_pmu.get_event_constraints);
DEFINE_STATIC_CALL_NULL(x86_pmu_put_event_constraints, *x86_pmu.put_event_constraints);
@@ -116,9 +120,6 @@ u64 x86_perf_event_update(struct perf_event *event)
if (unlikely(!hwc->event_base))
return 0;
- if (unlikely(is_topdown_count(event)) && x86_pmu.update_topdown_event)
- return x86_pmu.update_topdown_event(event);
-
/*
* Careful: an NMI might modify the previous event value.
*
@@ -621,8 +622,9 @@ int x86_pmu_hw_config(struct perf_event *event)
event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
if (event->attr.sample_period && x86_pmu.limit_period) {
- if (x86_pmu.limit_period(event, event->attr.sample_period) >
- event->attr.sample_period)
+ s64 left = event->attr.sample_period;
+ x86_pmu.limit_period(event, &left);
+ if (left > event->attr.sample_period)
return -EINVAL;
}
@@ -1354,7 +1356,7 @@ static void x86_pmu_enable(struct pmu *pmu)
static_call(x86_pmu_enable_all)(added);
}
-static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
+DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
/*
* Set the next IRQ period, based on the hwc->period_left value.
@@ -1370,10 +1372,6 @@ int x86_perf_event_set_period(struct perf_event *event)
if (unlikely(!hwc->event_base))
return 0;
- if (unlikely(is_topdown_count(event)) &&
- x86_pmu.set_topdown_event_period)
- return x86_pmu.set_topdown_event_period(event);
-
/*
* If we are way outside a reasonable range then just skip forward:
*/
@@ -1399,10 +1397,9 @@ int x86_perf_event_set_period(struct perf_event *event)
if (left > x86_pmu.max_period)
left = x86_pmu.max_period;
- if (x86_pmu.limit_period)
- left = x86_pmu.limit_period(event, left);
+ static_call_cond(x86_pmu_limit_period)(event, &left);
- per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
+ this_cpu_write(pmc_prev_left[idx], left);
/*
* The hw event starts counting from this event offset,
@@ -1419,16 +1416,6 @@ int x86_perf_event_set_period(struct perf_event *event)
if (is_counter_pair(hwc))
wrmsrl(x86_pmu_event_addr(idx + 1), 0xffff);
- /*
- * Due to erratum on certan cpu we need
- * a second write to be sure the register
- * is updated properly
- */
- if (x86_pmu.perfctr_second_write) {
- wrmsrl(hwc->event_base,
- (u64)(-left) & x86_pmu.cntval_mask);
- }
-
perf_event_update_userpage(event);
return ret;
@@ -1518,7 +1505,7 @@ static void x86_pmu_start(struct perf_event *event, int flags)
if (flags & PERF_EF_RELOAD) {
WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
- x86_perf_event_set_period(event);
+ static_call(x86_pmu_set_period)(event);
}
event->hw.state = 0;
@@ -1610,7 +1597,7 @@ void x86_pmu_stop(struct perf_event *event, int flags)
* Drain the remaining delta count out of a event
* that we are disabling:
*/
- x86_perf_event_update(event);
+ static_call(x86_pmu_update)(event);
hwc->state |= PERF_HES_UPTODATE;
}
}
@@ -1700,7 +1687,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
event = cpuc->events[idx];
- val = x86_perf_event_update(event);
+ val = static_call(x86_pmu_update)(event);
if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
continue;
@@ -1709,13 +1696,15 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
*/
handled++;
- if (!x86_perf_event_set_period(event))
+ if (!static_call(x86_pmu_set_period)(event))
continue;
perf_sample_data_init(&data, 0, event->hw.last_period);
- if (has_branch_stack(event))
+ if (has_branch_stack(event)) {
data.br_stack = &cpuc->lbr_stack;
+ data.sample_flags |= PERF_SAMPLE_BRANCH_STACK;
+ }
if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);
@@ -2023,6 +2012,10 @@ static void x86_pmu_static_call_update(void)
static_call_update(x86_pmu_del, x86_pmu.del);
static_call_update(x86_pmu_read, x86_pmu.read);
+ static_call_update(x86_pmu_set_period, x86_pmu.set_period);
+ static_call_update(x86_pmu_update, x86_pmu.update);
+ static_call_update(x86_pmu_limit_period, x86_pmu.limit_period);
+
static_call_update(x86_pmu_schedule_events, x86_pmu.schedule_events);
static_call_update(x86_pmu_get_event_constraints, x86_pmu.get_event_constraints);
static_call_update(x86_pmu_put_event_constraints, x86_pmu.put_event_constraints);
@@ -2042,7 +2035,7 @@ static void x86_pmu_static_call_update(void)
static void _x86_pmu_read(struct perf_event *event)
{
- x86_perf_event_update(event);
+ static_call(x86_pmu_update)(event);
}
void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed,
@@ -2149,6 +2142,12 @@ static int __init init_hw_perf_events(void)
if (!x86_pmu.guest_get_msrs)
x86_pmu.guest_get_msrs = (void *)&__static_call_return0;
+ if (!x86_pmu.set_period)
+ x86_pmu.set_period = x86_perf_event_set_period;
+
+ if (!x86_pmu.update)
+ x86_pmu.update = x86_perf_event_update;
+
x86_pmu_static_call_update();
/*
@@ -2670,7 +2669,9 @@ static int x86_pmu_check_period(struct perf_event *event, u64 value)
return -EINVAL;
if (value && x86_pmu.limit_period) {
- if (x86_pmu.limit_period(event, value) > value)
+ s64 left = value;
+ x86_pmu.limit_period(event, &left);
+ if (left > value)
return -EINVAL;
}
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index c20d8cd47c48..a646a5f9a235 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2199,6 +2199,12 @@ static void __intel_pmu_enable_all(int added, bool pmi)
u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl);
intel_pmu_lbr_enable_all(pmi);
+
+ if (cpuc->fixed_ctrl_val != cpuc->active_fixed_ctrl_val) {
+ wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, cpuc->fixed_ctrl_val);
+ cpuc->active_fixed_ctrl_val = cpuc->fixed_ctrl_val;
+ }
+
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL,
intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
@@ -2311,7 +2317,7 @@ static void intel_pmu_nhm_workaround(void)
for (i = 0; i < 4; i++) {
event = cpuc->events[i];
if (event)
- x86_perf_event_update(event);
+ static_call(x86_pmu_update)(event);
}
for (i = 0; i < 4; i++) {
@@ -2326,7 +2332,7 @@ static void intel_pmu_nhm_workaround(void)
event = cpuc->events[i];
if (event) {
- x86_perf_event_set_period(event);
+ static_call(x86_pmu_set_period)(event);
__x86_pmu_enable_event(&event->hw,
ARCH_PERFMON_EVENTSEL_ENABLE);
} else
@@ -2416,9 +2422,10 @@ static inline void intel_clear_masks(struct perf_event *event, int idx)
static void intel_pmu_disable_fixed(struct perf_event *event)
{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
- u64 ctrl_val, mask;
int idx = hwc->idx;
+ u64 mask;
if (is_topdown_idx(idx)) {
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -2435,9 +2442,7 @@ static void intel_pmu_disable_fixed(struct perf_event *event)
intel_clear_masks(event, idx);
mask = 0xfULL << ((idx - INTEL_PMC_IDX_FIXED) * 4);
- rdmsrl(hwc->config_base, ctrl_val);
- ctrl_val &= ~mask;
- wrmsrl(hwc->config_base, ctrl_val);
+ cpuc->fixed_ctrl_val &= ~mask;
}
static void intel_pmu_disable_event(struct perf_event *event)
@@ -2530,6 +2535,8 @@ static int adl_set_topdown_event_period(struct perf_event *event)
return icl_set_topdown_event_period(event);
}
+DEFINE_STATIC_CALL(intel_pmu_set_topdown_event_period, x86_perf_event_set_period);
+
static inline u64 icl_get_metrics_event_value(u64 metric, u64 slots, int idx)
{
u32 val;
@@ -2680,6 +2687,7 @@ static u64 adl_update_topdown_event(struct perf_event *event)
return icl_update_topdown_event(event);
}
+DEFINE_STATIC_CALL(intel_pmu_update_topdown_event, x86_perf_event_update);
static void intel_pmu_read_topdown_event(struct perf_event *event)
{
@@ -2691,7 +2699,7 @@ static void intel_pmu_read_topdown_event(struct perf_event *event)
return;
perf_pmu_disable(event->pmu);
- x86_pmu.update_topdown_event(event);
+ static_call(intel_pmu_update_topdown_event)(event);
perf_pmu_enable(event->pmu);
}
@@ -2699,7 +2707,7 @@ static void intel_pmu_read_event(struct perf_event *event)
{
if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
intel_pmu_auto_reload_read(event);
- else if (is_topdown_count(event) && x86_pmu.update_topdown_event)
+ else if (is_topdown_count(event))
intel_pmu_read_topdown_event(event);
else
x86_perf_event_update(event);
@@ -2707,8 +2715,9 @@ static void intel_pmu_read_event(struct perf_event *event)
static void intel_pmu_enable_fixed(struct perf_event *event)
{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
- u64 ctrl_val, mask, bits = 0;
+ u64 mask, bits = 0;
int idx = hwc->idx;
if (is_topdown_idx(idx)) {
@@ -2752,10 +2761,8 @@ static void intel_pmu_enable_fixed(struct perf_event *event)
mask |= ICL_FIXED_0_ADAPTIVE << (idx * 4);
}
- rdmsrl(hwc->config_base, ctrl_val);
- ctrl_val &= ~mask;
- ctrl_val |= bits;
- wrmsrl(hwc->config_base, ctrl_val);
+ cpuc->fixed_ctrl_val &= ~mask;
+ cpuc->fixed_ctrl_val |= bits;
}
static void intel_pmu_enable_event(struct perf_event *event)
@@ -2803,7 +2810,7 @@ static void intel_pmu_add_event(struct perf_event *event)
*/
int intel_pmu_save_and_restart(struct perf_event *event)
{
- x86_perf_event_update(event);
+ static_call(x86_pmu_update)(event);
/*
* For a checkpointed counter always reset back to 0. This
* avoids a situation where the counter overflows, aborts the
@@ -2815,9 +2822,25 @@ int intel_pmu_save_and_restart(struct perf_event *event)
wrmsrl(event->hw.event_base, 0);
local64_set(&event->hw.prev_count, 0);
}
+ return static_call(x86_pmu_set_period)(event);
+}
+
+static int intel_pmu_set_period(struct perf_event *event)
+{
+ if (unlikely(is_topdown_count(event)))
+ return static_call(intel_pmu_set_topdown_event_period)(event);
+
return x86_perf_event_set_period(event);
}
+static u64 intel_pmu_update(struct perf_event *event)
+{
+ if (unlikely(is_topdown_count(event)))
+ return static_call(intel_pmu_update_topdown_event)(event);
+
+ return x86_perf_event_update(event);
+}
+
static void intel_pmu_reset(void)
{
struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
@@ -2980,8 +3003,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
*/
if (__test_and_clear_bit(GLOBAL_STATUS_PERF_METRICS_OVF_BIT, (unsigned long *)&status)) {
handled++;
- if (x86_pmu.update_topdown_event)
- x86_pmu.update_topdown_event(NULL);
+ static_call(intel_pmu_update_topdown_event)(NULL);
}
/*
@@ -3004,8 +3026,10 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
perf_sample_data_init(&data, 0, event->hw.last_period);
- if (has_branch_stack(event))
+ if (has_branch_stack(event)) {
data.br_stack = &cpuc->lbr_stack;
+ data.sample_flags |= PERF_SAMPLE_BRANCH_STACK;
+ }
if (perf_event_overflow(event, &data, regs))
x86_pmu_stop(event, 0);
@@ -3853,9 +3877,6 @@ static int intel_pmu_hw_config(struct perf_event *event)
}
if (x86_pmu.pebs_aliases)
x86_pmu.pebs_aliases(event);
-
- if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
- event->attr.sample_type |= __PERF_SAMPLE_CALLCHAIN_EARLY;
}
if (needs_branch_stack(event)) {
@@ -4334,28 +4355,25 @@ static u8 adl_get_hybrid_cpu_type(void)
* Therefore the effective (average) period matches the requested period,
* despite coarser hardware granularity.
*/
-static u64 bdw_limit_period(struct perf_event *event, u64 left)
+static void bdw_limit_period(struct perf_event *event, s64 *left)
{
if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
X86_CONFIG(.event=0xc0, .umask=0x01)) {
- if (left < 128)
- left = 128;
- left &= ~0x3fULL;
+ if (*left < 128)
+ *left = 128;
+ *left &= ~0x3fULL;
}
- return left;
}
-static u64 nhm_limit_period(struct perf_event *event, u64 left)
+static void nhm_limit_period(struct perf_event *event, s64 *left)
{
- return max(left, 32ULL);
+ *left = max(*left, 32LL);
}
-static u64 spr_limit_period(struct perf_event *event, u64 left)
+static void spr_limit_period(struct perf_event *event, s64 *left)
{
if (event->attr.precise_ip == 3)
- return max(left, 128ULL);
-
- return left;
+ *left = max(*left, 128LL);
}
PMU_FORMAT_ATTR(event, "config:0-7" );
@@ -4794,6 +4812,8 @@ static __initconst const struct x86_pmu intel_pmu = {
.add = intel_pmu_add_event,
.del = intel_pmu_del_event,
.read = intel_pmu_read_event,
+ .set_period = intel_pmu_set_period,
+ .update = intel_pmu_update,
.hw_config = intel_pmu_hw_config,
.schedule_events = x86_schedule_events,
.eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
@@ -6312,8 +6332,10 @@ __init int intel_pmu_init(void)
x86_pmu.lbr_pt_coexist = true;
intel_pmu_pebs_data_source_skl(pmem);
x86_pmu.num_topdown_events = 4;
- x86_pmu.update_topdown_event = icl_update_topdown_event;
- x86_pmu.set_topdown_event_period = icl_set_topdown_event_period;
+ static_call_update(intel_pmu_update_topdown_event,
+ &icl_update_topdown_event);
+ static_call_update(intel_pmu_set_topdown_event_period,
+ &icl_set_topdown_event_period);
pr_cont("Icelake events, ");
name = "icelake";
break;
@@ -6348,8 +6370,10 @@ __init int intel_pmu_init(void)
x86_pmu.lbr_pt_coexist = true;
intel_pmu_pebs_data_source_skl(pmem);
x86_pmu.num_topdown_events = 8;
- x86_pmu.update_topdown_event = icl_update_topdown_event;
- x86_pmu.set_topdown_event_period = icl_set_topdown_event_period;
+ static_call_update(intel_pmu_update_topdown_event,
+ &icl_update_topdown_event);
+ static_call_update(intel_pmu_set_topdown_event_period,
+ &icl_set_topdown_event_period);
pr_cont("Sapphire Rapids events, ");
name = "sapphire_rapids";
break;
@@ -6358,6 +6382,7 @@ __init int intel_pmu_init(void)
case INTEL_FAM6_ALDERLAKE_L:
case INTEL_FAM6_RAPTORLAKE:
case INTEL_FAM6_RAPTORLAKE_P:
+ case INTEL_FAM6_RAPTORLAKE_S:
/*
* Alder Lake has 2 types of CPU, core and atom.
*
@@ -6382,8 +6407,10 @@ __init int intel_pmu_init(void)
intel_pmu_pebs_data_source_adl();
x86_pmu.pebs_latency_data = adl_latency_data_small;
x86_pmu.num_topdown_events = 8;
- x86_pmu.update_topdown_event = adl_update_topdown_event;
- x86_pmu.set_topdown_event_period = adl_set_topdown_event_period;
+ static_call_update(intel_pmu_update_topdown_event,
+ &adl_update_topdown_event);
+ static_call_update(intel_pmu_set_topdown_event_period,
+ &adl_set_topdown_event_period);
x86_pmu.filter_match = intel_pmu_filter_match;
x86_pmu.get_event_constraints = adl_get_event_constraints;
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index 8ec23f47fee9..a2834bc93149 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -685,6 +685,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &adl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_cstates),
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_cstates),
+ X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &adl_cstates),
{ },
};
MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match);
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index ac973c6f82ad..7839507b3844 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -1540,14 +1540,18 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
/*
* Use latency for weight (only avail with PEBS-LL)
*/
- if (fll && (sample_type & PERF_SAMPLE_WEIGHT_TYPE))
+ if (fll && (sample_type & PERF_SAMPLE_WEIGHT_TYPE)) {
data->weight.full = pebs->lat;
+ data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
+ }
/*
* data.data_src encodes the data source
*/
- if (sample_type & PERF_SAMPLE_DATA_SRC)
+ if (sample_type & PERF_SAMPLE_DATA_SRC) {
data->data_src.val = get_data_src(event, pebs->dse);
+ data->sample_flags |= PERF_SAMPLE_DATA_SRC;
+ }
/*
* We must however always use iregs for the unwinder to stay sane; the
@@ -1555,8 +1559,10 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
* previous PMI context or an (I)RET happened between the record and
* PMI.
*/
- if (sample_type & PERF_SAMPLE_CALLCHAIN)
+ if (sample_type & PERF_SAMPLE_CALLCHAIN) {
data->callchain = perf_callchain(event, iregs);
+ data->sample_flags |= PERF_SAMPLE_CALLCHAIN;
+ }
/*
* We use the interrupt regs as a base because the PEBS record does not
@@ -1628,17 +1634,22 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
if ((sample_type & PERF_SAMPLE_ADDR_TYPE) &&
- x86_pmu.intel_cap.pebs_format >= 1)
+ x86_pmu.intel_cap.pebs_format >= 1) {
data->addr = pebs->dla;
+ data->sample_flags |= PERF_SAMPLE_ADDR;
+ }
if (x86_pmu.intel_cap.pebs_format >= 2) {
/* Only set the TSX weight when no memory weight. */
- if ((sample_type & PERF_SAMPLE_WEIGHT_TYPE) && !fll)
+ if ((sample_type & PERF_SAMPLE_WEIGHT_TYPE) && !fll) {
data->weight.full = intel_get_tsx_weight(pebs->tsx_tuning);
-
- if (sample_type & PERF_SAMPLE_TRANSACTION)
+ data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
+ }
+ if (sample_type & PERF_SAMPLE_TRANSACTION) {
data->txn = intel_get_tsx_transaction(pebs->tsx_tuning,
pebs->ax);
+ data->sample_flags |= PERF_SAMPLE_TRANSACTION;
+ }
}
/*
@@ -1648,11 +1659,15 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event,
* We can only do this for the default trace clock.
*/
if (x86_pmu.intel_cap.pebs_format >= 3 &&
- event->attr.use_clockid == 0)
+ event->attr.use_clockid == 0) {
data->time = native_sched_clock_from_tsc(pebs->tsc);
+ data->sample_flags |= PERF_SAMPLE_TIME;
+ }
- if (has_branch_stack(event))
+ if (has_branch_stack(event)) {
data->br_stack = &cpuc->lbr_stack;
+ data->sample_flags |= PERF_SAMPLE_BRANCH_STACK;
+ }
}
static void adaptive_pebs_save_regs(struct pt_regs *regs,
@@ -1710,8 +1725,10 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
perf_sample_data_init(data, 0, event->hw.last_period);
data->period = event->hw.last_period;
- if (event->attr.use_clockid == 0)
+ if (event->attr.use_clockid == 0) {
data->time = native_sched_clock_from_tsc(basic->tsc);
+ data->sample_flags |= PERF_SAMPLE_TIME;
+ }
/*
* We must however always use iregs for the unwinder to stay sane; the
@@ -1719,8 +1736,10 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
* previous PMI context or an (I)RET happened between the record and
* PMI.
*/
- if (sample_type & PERF_SAMPLE_CALLCHAIN)
+ if (sample_type & PERF_SAMPLE_CALLCHAIN) {
data->callchain = perf_callchain(event, iregs);
+ data->sample_flags |= PERF_SAMPLE_CALLCHAIN;
+ }
*regs = *iregs;
/* The ip in basic is EventingIP */
@@ -1771,17 +1790,24 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
data->weight.var1_dw = (u32)(weight & PEBS_LATENCY_MASK) ?:
intel_get_tsx_weight(meminfo->tsx_tuning);
}
+ data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
}
- if (sample_type & PERF_SAMPLE_DATA_SRC)
+ if (sample_type & PERF_SAMPLE_DATA_SRC) {
data->data_src.val = get_data_src(event, meminfo->aux);
+ data->sample_flags |= PERF_SAMPLE_DATA_SRC;
+ }
- if (sample_type & PERF_SAMPLE_ADDR_TYPE)
+ if (sample_type & PERF_SAMPLE_ADDR_TYPE) {
data->addr = meminfo->address;
+ data->sample_flags |= PERF_SAMPLE_ADDR;
+ }
- if (sample_type & PERF_SAMPLE_TRANSACTION)
+ if (sample_type & PERF_SAMPLE_TRANSACTION) {
data->txn = intel_get_tsx_transaction(meminfo->tsx_tuning,
gprs ? gprs->ax : 0);
+ data->sample_flags |= PERF_SAMPLE_TRANSACTION;
+ }
}
if (format_size & PEBS_DATACFG_XMMS) {
@@ -1800,6 +1826,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
if (has_branch_stack(event)) {
intel_pmu_store_pebs_lbrs(lbr);
data->br_stack = &cpuc->lbr_stack;
+ data->sample_flags |= PERF_SAMPLE_BRANCH_STACK;
}
}
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 47fca6a7a8bc..4fce1a4226e3 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -4,7 +4,6 @@
#include <asm/perf_event.h>
#include <asm/msr.h>
-#include <asm/insn.h>
#include "../perf_event.h"
@@ -66,65 +65,6 @@
#define LBR_FROM_SIGNEXT_2MSB (BIT_ULL(60) | BIT_ULL(59))
/*
- * x86control flow change classification
- * x86control flow changes include branches, interrupts, traps, faults
- */
-enum {
- X86_BR_NONE = 0, /* unknown */
-
- X86_BR_USER = 1 << 0, /* branch target is user */
- X86_BR_KERNEL = 1 << 1, /* branch target is kernel */
-
- X86_BR_CALL = 1 << 2, /* call */
- X86_BR_RET = 1 << 3, /* return */
- X86_BR_SYSCALL = 1 << 4, /* syscall */
- X86_BR_SYSRET = 1 << 5, /* syscall return */
- X86_BR_INT = 1 << 6, /* sw interrupt */
- X86_BR_IRET = 1 << 7, /* return from interrupt */
- X86_BR_JCC = 1 << 8, /* conditional */
- X86_BR_JMP = 1 << 9, /* jump */
- X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */
- X86_BR_IND_CALL = 1 << 11,/* indirect calls */
- X86_BR_ABORT = 1 << 12,/* transaction abort */
- X86_BR_IN_TX = 1 << 13,/* in transaction */
- X86_BR_NO_TX = 1 << 14,/* not in transaction */
- X86_BR_ZERO_CALL = 1 << 15,/* zero length call */
- X86_BR_CALL_STACK = 1 << 16,/* call stack */
- X86_BR_IND_JMP = 1 << 17,/* indirect jump */
-
- X86_BR_TYPE_SAVE = 1 << 18,/* indicate to save branch type */
-
-};
-
-#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
-#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX)
-
-#define X86_BR_ANY \
- (X86_BR_CALL |\
- X86_BR_RET |\
- X86_BR_SYSCALL |\
- X86_BR_SYSRET |\
- X86_BR_INT |\
- X86_BR_IRET |\
- X86_BR_JCC |\
- X86_BR_JMP |\
- X86_BR_IRQ |\
- X86_BR_ABORT |\
- X86_BR_IND_CALL |\
- X86_BR_IND_JMP |\
- X86_BR_ZERO_CALL)
-
-#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
-
-#define X86_BR_ANY_CALL \
- (X86_BR_CALL |\
- X86_BR_IND_CALL |\
- X86_BR_ZERO_CALL |\
- X86_BR_SYSCALL |\
- X86_BR_IRQ |\
- X86_BR_INT)
-
-/*
* Intel LBR_CTL bits
*
* Hardware branch filter for Arch LBR
@@ -1151,219 +1091,6 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event)
return ret;
}
-/*
- * return the type of control flow change at address "from"
- * instruction is not necessarily a branch (in case of interrupt).
- *
- * The branch type returned also includes the priv level of the
- * target of the control flow change (X86_BR_USER, X86_BR_KERNEL).
- *
- * If a branch type is unknown OR the instruction cannot be
- * decoded (e.g., text page not present), then X86_BR_NONE is
- * returned.
- */
-static int branch_type(unsigned long from, unsigned long to, int abort)
-{
- struct insn insn;
- void *addr;
- int bytes_read, bytes_left;
- int ret = X86_BR_NONE;
- int ext, to_plm, from_plm;
- u8 buf[MAX_INSN_SIZE];
- int is64 = 0;
-
- to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER;
- from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER;
-
- /*
- * maybe zero if lbr did not fill up after a reset by the time
- * we get a PMU interrupt
- */
- if (from == 0 || to == 0)
- return X86_BR_NONE;
-
- if (abort)
- return X86_BR_ABORT | to_plm;
-
- if (from_plm == X86_BR_USER) {
- /*
- * can happen if measuring at the user level only
- * and we interrupt in a kernel thread, e.g., idle.
- */
- if (!current->mm)
- return X86_BR_NONE;
-
- /* may fail if text not present */
- bytes_left = copy_from_user_nmi(buf, (void __user *)from,
- MAX_INSN_SIZE);
- bytes_read = MAX_INSN_SIZE - bytes_left;
- if (!bytes_read)
- return X86_BR_NONE;
-
- addr = buf;
- } else {
- /*
- * The LBR logs any address in the IP, even if the IP just
- * faulted. This means userspace can control the from address.
- * Ensure we don't blindly read any address by validating it is
- * a known text address.
- */
- if (kernel_text_address(from)) {
- addr = (void *)from;
- /*
- * Assume we can get the maximum possible size
- * when grabbing kernel data. This is not
- * _strictly_ true since we could possibly be
- * executing up next to a memory hole, but
- * it is very unlikely to be a problem.
- */
- bytes_read = MAX_INSN_SIZE;
- } else {
- return X86_BR_NONE;
- }
- }
-
- /*
- * decoder needs to know the ABI especially
- * on 64-bit systems running 32-bit apps
- */
-#ifdef CONFIG_X86_64
- is64 = kernel_ip((unsigned long)addr) || any_64bit_mode(current_pt_regs());
-#endif
- insn_init(&insn, addr, bytes_read, is64);
- if (insn_get_opcode(&insn))
- return X86_BR_ABORT;
-
- switch (insn.opcode.bytes[0]) {
- case 0xf:
- switch (insn.opcode.bytes[1]) {
- case 0x05: /* syscall */
- case 0x34: /* sysenter */
- ret = X86_BR_SYSCALL;
- break;
- case 0x07: /* sysret */
- case 0x35: /* sysexit */
- ret = X86_BR_SYSRET;
- break;
- case 0x80 ... 0x8f: /* conditional */
- ret = X86_BR_JCC;
- break;
- default:
- ret = X86_BR_NONE;
- }
- break;
- case 0x70 ... 0x7f: /* conditional */
- ret = X86_BR_JCC;
- break;
- case 0xc2: /* near ret */
- case 0xc3: /* near ret */
- case 0xca: /* far ret */
- case 0xcb: /* far ret */
- ret = X86_BR_RET;
- break;
- case 0xcf: /* iret */
- ret = X86_BR_IRET;
- break;
- case 0xcc ... 0xce: /* int */
- ret = X86_BR_INT;
- break;
- case 0xe8: /* call near rel */
- if (insn_get_immediate(&insn) || insn.immediate1.value == 0) {
- /* zero length call */
- ret = X86_BR_ZERO_CALL;
- break;
- }
- fallthrough;
- case 0x9a: /* call far absolute */
- ret = X86_BR_CALL;
- break;
- case 0xe0 ... 0xe3: /* loop jmp */
- ret = X86_BR_JCC;
- break;
- case 0xe9 ... 0xeb: /* jmp */
- ret = X86_BR_JMP;
- break;
- case 0xff: /* call near absolute, call far absolute ind */
- if (insn_get_modrm(&insn))
- return X86_BR_ABORT;
-
- ext = (insn.modrm.bytes[0] >> 3) & 0x7;
- switch (ext) {
- case 2: /* near ind call */
- case 3: /* far ind call */
- ret = X86_BR_IND_CALL;
- break;
- case 4:
- case 5:
- ret = X86_BR_IND_JMP;
- break;
- }
- break;
- default:
- ret = X86_BR_NONE;
- }
- /*
- * interrupts, traps, faults (and thus ring transition) may
- * occur on any instructions. Thus, to classify them correctly,
- * we need to first look at the from and to priv levels. If they
- * are different and to is in the kernel, then it indicates
- * a ring transition. If the from instruction is not a ring
- * transition instr (syscall, systenter, int), then it means
- * it was a irq, trap or fault.
- *
- * we have no way of detecting kernel to kernel faults.
- */
- if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL
- && ret != X86_BR_SYSCALL && ret != X86_BR_INT)
- ret = X86_BR_IRQ;
-
- /*
- * branch priv level determined by target as
- * is done by HW when LBR_SELECT is implemented
- */
- if (ret != X86_BR_NONE)
- ret |= to_plm;
-
- return ret;
-}
-
-#define X86_BR_TYPE_MAP_MAX 16
-
-static int branch_map[X86_BR_TYPE_MAP_MAX] = {
- PERF_BR_CALL, /* X86_BR_CALL */
- PERF_BR_RET, /* X86_BR_RET */
- PERF_BR_SYSCALL, /* X86_BR_SYSCALL */
- PERF_BR_SYSRET, /* X86_BR_SYSRET */
- PERF_BR_UNKNOWN, /* X86_BR_INT */
- PERF_BR_ERET, /* X86_BR_IRET */
- PERF_BR_COND, /* X86_BR_JCC */
- PERF_BR_UNCOND, /* X86_BR_JMP */
- PERF_BR_IRQ, /* X86_BR_IRQ */
- PERF_BR_IND_CALL, /* X86_BR_IND_CALL */
- PERF_BR_UNKNOWN, /* X86_BR_ABORT */
- PERF_BR_UNKNOWN, /* X86_BR_IN_TX */
- PERF_BR_UNKNOWN, /* X86_BR_NO_TX */
- PERF_BR_CALL, /* X86_BR_ZERO_CALL */
- PERF_BR_UNKNOWN, /* X86_BR_CALL_STACK */
- PERF_BR_IND, /* X86_BR_IND_JMP */
-};
-
-static int
-common_branch_type(int type)
-{
- int i;
-
- type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */
-
- if (type) {
- i = __ffs(type);
- if (i < X86_BR_TYPE_MAP_MAX)
- return branch_map[i];
- }
-
- return PERF_BR_UNKNOWN;
-}
-
enum {
ARCH_LBR_BR_TYPE_JCC = 0,
ARCH_LBR_BR_TYPE_NEAR_IND_JMP = 1,
diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c
index 7951a5dc73b6..03bbcc2fa2ff 100644
--- a/arch/x86/events/intel/p4.c
+++ b/arch/x86/events/intel/p4.c
@@ -1006,6 +1006,29 @@ static void p4_pmu_enable_all(int added)
}
}
+static int p4_pmu_set_period(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ s64 left = this_cpu_read(pmc_prev_left[hwc->idx]);
+ int ret;
+
+ ret = x86_perf_event_set_period(event);
+
+ if (hwc->event_base) {
+ /*
+ * This handles erratum N15 in intel doc 249199-029,
+ * the counter may not be updated correctly on write
+ * so we need a second write operation to do the trick
+ * (the official workaround didn't work)
+ *
+ * the former idea is taken from OProfile code
+ */
+ wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
+ }
+
+ return ret;
+}
+
static int p4_pmu_handle_irq(struct pt_regs *regs)
{
struct perf_sample_data data;
@@ -1044,7 +1067,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
/* event overflow for sure */
perf_sample_data_init(&data, 0, hwc->last_period);
- if (!x86_perf_event_set_period(event))
+ if (!static_call(x86_pmu_set_period)(event))
continue;
@@ -1316,6 +1339,9 @@ static __initconst const struct x86_pmu p4_pmu = {
.enable_all = p4_pmu_enable_all,
.enable = p4_pmu_enable_event,
.disable = p4_pmu_disable_event,
+
+ .set_period = p4_pmu_set_period,
+
.eventsel = MSR_P4_BPU_CCCR0,
.perfctr = MSR_P4_BPU_PERFCTR0,
.event_map = p4_pmu_event_map,
@@ -1334,15 +1360,6 @@ static __initconst const struct x86_pmu p4_pmu = {
.max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1,
.hw_config = p4_hw_config,
.schedule_events = p4_pmu_schedule_events,
- /*
- * This handles erratum N15 in intel doc 249199-029,
- * the counter may not be updated correctly on write
- * so we need a second write operation to do the trick
- * (the official workaround didn't work)
- *
- * the former idea is taken from OProfile code
- */
- .perfctr_second_write = 1,
.format_attrs = intel_p4_formats_attr,
};
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index db6c31bca809..6f1ccc57a692 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1831,6 +1831,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &adl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE, &adl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P, &adl_uncore_init),
+ X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &adl_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &spr_uncore_init),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, &snr_uncore_init),
{},
diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
index ac542f98c070..ecced3a52668 100644
--- a/arch/x86/events/msr.c
+++ b/arch/x86/events/msr.c
@@ -106,6 +106,7 @@ static bool test_intel(int idx, void *data)
case INTEL_FAM6_ALDERLAKE_N:
case INTEL_FAM6_RAPTORLAKE:
case INTEL_FAM6_RAPTORLAKE_P:
+ case INTEL_FAM6_RAPTORLAKE_S:
if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF)
return true;
break;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 266143abcbd8..332d2e6d8ae4 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -64,27 +64,25 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode)
return ((ecode & c->cmask) - c->code) <= (u64)c->size;
}
+#define PERF_ARCH(name, val) \
+ PERF_X86_EVENT_##name = val,
+
/*
* struct hw_perf_event.flags flags
*/
-#define PERF_X86_EVENT_PEBS_LDLAT 0x00001 /* ld+ldlat data address sampling */
-#define PERF_X86_EVENT_PEBS_ST 0x00002 /* st data address sampling */
-#define PERF_X86_EVENT_PEBS_ST_HSW 0x00004 /* haswell style datala, store */
-#define PERF_X86_EVENT_PEBS_LD_HSW 0x00008 /* haswell style datala, load */
-#define PERF_X86_EVENT_PEBS_NA_HSW 0x00010 /* haswell style datala, unknown */
-#define PERF_X86_EVENT_EXCL 0x00020 /* HT exclusivity on counter */
-#define PERF_X86_EVENT_DYNAMIC 0x00040 /* dynamic alloc'd constraint */
-
-#define PERF_X86_EVENT_EXCL_ACCT 0x00100 /* accounted EXCL event */
-#define PERF_X86_EVENT_AUTO_RELOAD 0x00200 /* use PEBS auto-reload */
-#define PERF_X86_EVENT_LARGE_PEBS 0x00400 /* use large PEBS */
-#define PERF_X86_EVENT_PEBS_VIA_PT 0x00800 /* use PT buffer for PEBS */
-#define PERF_X86_EVENT_PAIR 0x01000 /* Large Increment per Cycle */
-#define PERF_X86_EVENT_LBR_SELECT 0x02000 /* Save/Restore MSR_LBR_SELECT */
-#define PERF_X86_EVENT_TOPDOWN 0x04000 /* Count Topdown slots/metrics events */
-#define PERF_X86_EVENT_PEBS_STLAT 0x08000 /* st+stlat data address sampling */
-#define PERF_X86_EVENT_AMD_BRS 0x10000 /* AMD Branch Sampling */
-#define PERF_X86_EVENT_PEBS_LAT_HYBRID 0x20000 /* ld and st lat for hybrid */
+enum {
+#include "perf_event_flags.h"
+};
+
+#undef PERF_ARCH
+
+#define PERF_ARCH(name, val) \
+ static_assert((PERF_X86_EVENT_##name & PERF_EVENT_FLAG_ARCH) == \
+ PERF_X86_EVENT_##name);
+
+#include "perf_event_flags.h"
+
+#undef PERF_ARCH
static inline bool is_topdown_count(struct perf_event *event)
{
@@ -272,6 +270,10 @@ struct cpu_hw_events {
u64 active_pebs_data_cfg;
int pebs_record_size;
+ /* Intel Fixed counter configuration */
+ u64 fixed_ctrl_val;
+ u64 active_fixed_ctrl_val;
+
/*
* Intel LBR bits
*/
@@ -745,6 +747,8 @@ struct x86_pmu {
void (*add)(struct perf_event *);
void (*del)(struct perf_event *);
void (*read)(struct perf_event *event);
+ int (*set_period)(struct perf_event *event);
+ u64 (*update)(struct perf_event *event);
int (*hw_config)(struct perf_event *event);
int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
unsigned eventsel;
@@ -780,8 +784,7 @@ struct x86_pmu {
struct event_constraint *event_constraints;
struct x86_pmu_quirk *quirks;
- int perfctr_second_write;
- u64 (*limit_period)(struct perf_event *event, u64 l);
+ void (*limit_period)(struct perf_event *event, s64 *l);
/* PMI handler bits */
unsigned int late_ack :1,
@@ -889,8 +892,6 @@ struct x86_pmu {
* Intel perf metrics
*/
int num_topdown_events;
- u64 (*update_topdown_event)(struct perf_event *event);
- int (*set_topdown_event_period)(struct perf_event *event);
/*
* perf task context (i.e. struct perf_event_context::task_ctx_data)
@@ -1044,6 +1045,9 @@ static struct perf_pmu_format_hybrid_attr format_attr_hybrid_##_name = {\
struct pmu *x86_get_pmu(unsigned int cpu);
extern struct x86_pmu x86_pmu __read_mostly;
+DECLARE_STATIC_CALL(x86_pmu_set_period, *x86_pmu.set_period);
+DECLARE_STATIC_CALL(x86_pmu_update, *x86_pmu.update);
+
static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx)
{
if (static_cpu_has(X86_FEATURE_ARCH_LBR))
@@ -1059,6 +1063,7 @@ static inline bool x86_pmu_has_lbr_callstack(void)
}
DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
+DECLARE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
int x86_perf_event_set_period(struct perf_event *event);
@@ -1210,6 +1215,70 @@ static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip)
regs->ip = ip;
}
+/*
+ * x86control flow change classification
+ * x86control flow changes include branches, interrupts, traps, faults
+ */
+enum {
+ X86_BR_NONE = 0, /* unknown */
+
+ X86_BR_USER = 1 << 0, /* branch target is user */
+ X86_BR_KERNEL = 1 << 1, /* branch target is kernel */
+
+ X86_BR_CALL = 1 << 2, /* call */
+ X86_BR_RET = 1 << 3, /* return */
+ X86_BR_SYSCALL = 1 << 4, /* syscall */
+ X86_BR_SYSRET = 1 << 5, /* syscall return */
+ X86_BR_INT = 1 << 6, /* sw interrupt */
+ X86_BR_IRET = 1 << 7, /* return from interrupt */
+ X86_BR_JCC = 1 << 8, /* conditional */
+ X86_BR_JMP = 1 << 9, /* jump */
+ X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */
+ X86_BR_IND_CALL = 1 << 11,/* indirect calls */
+ X86_BR_ABORT = 1 << 12,/* transaction abort */
+ X86_BR_IN_TX = 1 << 13,/* in transaction */
+ X86_BR_NO_TX = 1 << 14,/* not in transaction */
+ X86_BR_ZERO_CALL = 1 << 15,/* zero length call */
+ X86_BR_CALL_STACK = 1 << 16,/* call stack */
+ X86_BR_IND_JMP = 1 << 17,/* indirect jump */
+
+ X86_BR_TYPE_SAVE = 1 << 18,/* indicate to save branch type */
+
+};
+
+#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
+#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX)
+
+#define X86_BR_ANY \
+ (X86_BR_CALL |\
+ X86_BR_RET |\
+ X86_BR_SYSCALL |\
+ X86_BR_SYSRET |\
+ X86_BR_INT |\
+ X86_BR_IRET |\
+ X86_BR_JCC |\
+ X86_BR_JMP |\
+ X86_BR_IRQ |\
+ X86_BR_ABORT |\
+ X86_BR_IND_CALL |\
+ X86_BR_IND_JMP |\
+ X86_BR_ZERO_CALL)
+
+#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
+
+#define X86_BR_ANY_CALL \
+ (X86_BR_CALL |\
+ X86_BR_IND_CALL |\
+ X86_BR_ZERO_CALL |\
+ X86_BR_SYSCALL |\
+ X86_BR_IRQ |\
+ X86_BR_INT)
+
+int common_branch_type(int type);
+int branch_type(unsigned long from, unsigned long to, int abort);
+int branch_type_fused(unsigned long from, unsigned long to, int abort,
+ int *offset);
+
ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event);
ssize_t intel_event_sysfs_show(char *page, u64 config);
@@ -1232,7 +1301,20 @@ static inline bool fixed_counter_disabled(int i, struct pmu *pmu)
int amd_pmu_init(void);
+int amd_pmu_lbr_init(void);
+void amd_pmu_lbr_reset(void);
+void amd_pmu_lbr_read(void);
+void amd_pmu_lbr_add(struct perf_event *event);
+void amd_pmu_lbr_del(struct perf_event *event);
+void amd_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
+void amd_pmu_lbr_enable_all(void);
+void amd_pmu_lbr_disable_all(void);
+int amd_pmu_lbr_hw_config(struct perf_event *event);
+
#ifdef CONFIG_PERF_EVENTS_AMD_BRS
+
+#define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */
+
int amd_brs_init(void);
void amd_brs_disable(void);
void amd_brs_enable(void);
@@ -1241,7 +1323,7 @@ void amd_brs_disable_all(void);
void amd_brs_drain(void);
void amd_brs_lopwr_init(void);
void amd_brs_disable_all(void);
-int amd_brs_setup_filter(struct perf_event *event);
+int amd_brs_hw_config(struct perf_event *event);
void amd_brs_reset(void);
static inline void amd_pmu_brs_add(struct perf_event *event)
@@ -1277,7 +1359,7 @@ static inline void amd_brs_enable(void) {}
static inline void amd_brs_drain(void) {}
static inline void amd_brs_lopwr_init(void) {}
static inline void amd_brs_disable_all(void) {}
-static inline int amd_brs_setup_filter(struct perf_event *event)
+static inline int amd_brs_hw_config(struct perf_event *event)
{
return 0;
}
diff --git a/arch/x86/events/perf_event_flags.h b/arch/x86/events/perf_event_flags.h
new file mode 100644
index 000000000000..1dc19b9b4426
--- /dev/null
+++ b/arch/x86/events/perf_event_flags.h
@@ -0,0 +1,22 @@
+
+/*
+ * struct hw_perf_event.flags flags
+ */
+PERF_ARCH(PEBS_LDLAT, 0x00001) /* ld+ldlat data address sampling */
+PERF_ARCH(PEBS_ST, 0x00002) /* st data address sampling */
+PERF_ARCH(PEBS_ST_HSW, 0x00004) /* haswell style datala, store */
+PERF_ARCH(PEBS_LD_HSW, 0x00008) /* haswell style datala, load */
+PERF_ARCH(PEBS_NA_HSW, 0x00010) /* haswell style datala, unknown */
+PERF_ARCH(EXCL, 0x00020) /* HT exclusivity on counter */
+PERF_ARCH(DYNAMIC, 0x00040) /* dynamic alloc'd constraint */
+ /* 0x00080 */
+PERF_ARCH(EXCL_ACCT, 0x00100) /* accounted EXCL event */
+PERF_ARCH(AUTO_RELOAD, 0x00200) /* use PEBS auto-reload */
+PERF_ARCH(LARGE_PEBS, 0x00400) /* use large PEBS */
+PERF_ARCH(PEBS_VIA_PT, 0x00800) /* use PT buffer for PEBS */
+PERF_ARCH(PAIR, 0x01000) /* Large Increment per Cycle */
+PERF_ARCH(LBR_SELECT, 0x02000) /* Save/Restore MSR_LBR_SELECT */
+PERF_ARCH(TOPDOWN, 0x04000) /* Count Topdown slots/metrics events */
+PERF_ARCH(PEBS_STLAT, 0x08000) /* st+stlat data address sampling */
+PERF_ARCH(AMD_BRS, 0x10000) /* AMD Branch Sampling */
+PERF_ARCH(PEBS_LAT_HYBRID, 0x20000) /* ld and st lat for hybrid */
diff --git a/arch/x86/events/utils.c b/arch/x86/events/utils.c
new file mode 100644
index 000000000000..76b1f8bb0fd5
--- /dev/null
+++ b/arch/x86/events/utils.c
@@ -0,0 +1,251 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <asm/insn.h>
+
+#include "perf_event.h"
+
+static int decode_branch_type(struct insn *insn)
+{
+ int ext;
+
+ if (insn_get_opcode(insn))
+ return X86_BR_ABORT;
+
+ switch (insn->opcode.bytes[0]) {
+ case 0xf:
+ switch (insn->opcode.bytes[1]) {
+ case 0x05: /* syscall */
+ case 0x34: /* sysenter */
+ return X86_BR_SYSCALL;
+ case 0x07: /* sysret */
+ case 0x35: /* sysexit */
+ return X86_BR_SYSRET;
+ case 0x80 ... 0x8f: /* conditional */
+ return X86_BR_JCC;
+ }
+ return X86_BR_NONE;
+ case 0x70 ... 0x7f: /* conditional */
+ return X86_BR_JCC;
+ case 0xc2: /* near ret */
+ case 0xc3: /* near ret */
+ case 0xca: /* far ret */
+ case 0xcb: /* far ret */
+ return X86_BR_RET;
+ case 0xcf: /* iret */
+ return X86_BR_IRET;
+ case 0xcc ... 0xce: /* int */
+ return X86_BR_INT;
+ case 0xe8: /* call near rel */
+ if (insn_get_immediate(insn) || insn->immediate1.value == 0) {
+ /* zero length call */
+ return X86_BR_ZERO_CALL;
+ }
+ fallthrough;
+ case 0x9a: /* call far absolute */
+ return X86_BR_CALL;
+ case 0xe0 ... 0xe3: /* loop jmp */
+ return X86_BR_JCC;
+ case 0xe9 ... 0xeb: /* jmp */
+ return X86_BR_JMP;
+ case 0xff: /* call near absolute, call far absolute ind */
+ if (insn_get_modrm(insn))
+ return X86_BR_ABORT;
+
+ ext = (insn->modrm.bytes[0] >> 3) & 0x7;
+ switch (ext) {
+ case 2: /* near ind call */
+ case 3: /* far ind call */
+ return X86_BR_IND_CALL;
+ case 4:
+ case 5:
+ return X86_BR_IND_JMP;
+ }
+ return X86_BR_NONE;
+ }
+
+ return X86_BR_NONE;
+}
+
+/*
+ * return the type of control flow change at address "from"
+ * instruction is not necessarily a branch (in case of interrupt).
+ *
+ * The branch type returned also includes the priv level of the
+ * target of the control flow change (X86_BR_USER, X86_BR_KERNEL).
+ *
+ * If a branch type is unknown OR the instruction cannot be
+ * decoded (e.g., text page not present), then X86_BR_NONE is
+ * returned.
+ *
+ * While recording branches, some processors can report the "from"
+ * address to be that of an instruction preceding the actual branch
+ * when instruction fusion occurs. If fusion is expected, attempt to
+ * find the type of the first branch instruction within the next
+ * MAX_INSN_SIZE bytes and if found, provide the offset between the
+ * reported "from" address and the actual branch instruction address.
+ */
+static int get_branch_type(unsigned long from, unsigned long to, int abort,
+ bool fused, int *offset)
+{
+ struct insn insn;
+ void *addr;
+ int bytes_read, bytes_left, insn_offset;
+ int ret = X86_BR_NONE;
+ int to_plm, from_plm;
+ u8 buf[MAX_INSN_SIZE];
+ int is64 = 0;
+
+ /* make sure we initialize offset */
+ if (offset)
+ *offset = 0;
+
+ to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER;
+ from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER;
+
+ /*
+ * maybe zero if lbr did not fill up after a reset by the time
+ * we get a PMU interrupt
+ */
+ if (from == 0 || to == 0)
+ return X86_BR_NONE;
+
+ if (abort)
+ return X86_BR_ABORT | to_plm;
+
+ if (from_plm == X86_BR_USER) {
+ /*
+ * can happen if measuring at the user level only
+ * and we interrupt in a kernel thread, e.g., idle.
+ */
+ if (!current->mm)
+ return X86_BR_NONE;
+
+ /* may fail if text not present */
+ bytes_left = copy_from_user_nmi(buf, (void __user *)from,
+ MAX_INSN_SIZE);
+ bytes_read = MAX_INSN_SIZE - bytes_left;
+ if (!bytes_read)
+ return X86_BR_NONE;
+
+ addr = buf;
+ } else {
+ /*
+ * The LBR logs any address in the IP, even if the IP just
+ * faulted. This means userspace can control the from address.
+ * Ensure we don't blindly read any address by validating it is
+ * a known text address.
+ */
+ if (kernel_text_address(from)) {
+ addr = (void *)from;
+ /*
+ * Assume we can get the maximum possible size
+ * when grabbing kernel data. This is not
+ * _strictly_ true since we could possibly be
+ * executing up next to a memory hole, but
+ * it is very unlikely to be a problem.
+ */
+ bytes_read = MAX_INSN_SIZE;
+ } else {
+ return X86_BR_NONE;
+ }
+ }
+
+ /*
+ * decoder needs to know the ABI especially
+ * on 64-bit systems running 32-bit apps
+ */
+#ifdef CONFIG_X86_64
+ is64 = kernel_ip((unsigned long)addr) || any_64bit_mode(current_pt_regs());
+#endif
+ insn_init(&insn, addr, bytes_read, is64);
+ ret = decode_branch_type(&insn);
+ insn_offset = 0;
+
+ /* Check for the possibility of branch fusion */
+ while (fused && ret == X86_BR_NONE) {
+ /* Check for decoding errors */
+ if (insn_get_length(&insn) || !insn.length)
+ break;
+
+ insn_offset += insn.length;
+ bytes_read -= insn.length;
+ if (bytes_read < 0)
+ break;
+
+ insn_init(&insn, addr + insn_offset, bytes_read, is64);
+ ret = decode_branch_type(&insn);
+ }
+
+ if (offset)
+ *offset = insn_offset;
+
+ /*
+ * interrupts, traps, faults (and thus ring transition) may
+ * occur on any instructions. Thus, to classify them correctly,
+ * we need to first look at the from and to priv levels. If they
+ * are different and to is in the kernel, then it indicates
+ * a ring transition. If the from instruction is not a ring
+ * transition instr (syscall, systenter, int), then it means
+ * it was a irq, trap or fault.
+ *
+ * we have no way of detecting kernel to kernel faults.
+ */
+ if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL
+ && ret != X86_BR_SYSCALL && ret != X86_BR_INT)
+ ret = X86_BR_IRQ;
+
+ /*
+ * branch priv level determined by target as
+ * is done by HW when LBR_SELECT is implemented
+ */
+ if (ret != X86_BR_NONE)
+ ret |= to_plm;
+
+ return ret;
+}
+
+int branch_type(unsigned long from, unsigned long to, int abort)
+{
+ return get_branch_type(from, to, abort, false, NULL);
+}
+
+int branch_type_fused(unsigned long from, unsigned long to, int abort,
+ int *offset)
+{
+ return get_branch_type(from, to, abort, true, offset);
+}
+
+#define X86_BR_TYPE_MAP_MAX 16
+
+static int branch_map[X86_BR_TYPE_MAP_MAX] = {
+ PERF_BR_CALL, /* X86_BR_CALL */
+ PERF_BR_RET, /* X86_BR_RET */
+ PERF_BR_SYSCALL, /* X86_BR_SYSCALL */
+ PERF_BR_SYSRET, /* X86_BR_SYSRET */
+ PERF_BR_UNKNOWN, /* X86_BR_INT */
+ PERF_BR_ERET, /* X86_BR_IRET */
+ PERF_BR_COND, /* X86_BR_JCC */
+ PERF_BR_UNCOND, /* X86_BR_JMP */
+ PERF_BR_IRQ, /* X86_BR_IRQ */
+ PERF_BR_IND_CALL, /* X86_BR_IND_CALL */
+ PERF_BR_UNKNOWN, /* X86_BR_ABORT */
+ PERF_BR_UNKNOWN, /* X86_BR_IN_TX */
+ PERF_BR_NO_TX, /* X86_BR_NO_TX */
+ PERF_BR_CALL, /* X86_BR_ZERO_CALL */
+ PERF_BR_UNKNOWN, /* X86_BR_CALL_STACK */
+ PERF_BR_IND, /* X86_BR_IND_JMP */
+};
+
+int common_branch_type(int type)
+{
+ int i;
+
+ type >>= 2; /* skip X86_BR_USER and X86_BR_KERNEL */
+
+ if (type) {
+ i = __ffs(type);
+ if (i < X86_BR_TYPE_MAP_MAX)
+ return branch_map[i];
+ }
+
+ return PERF_BR_UNKNOWN;
+}
diff --git a/arch/x86/include/asm/amd-ibs.h b/arch/x86/include/asm/amd-ibs.h
index f3eb098d63d4..cb2a5e113daa 100644
--- a/arch/x86/include/asm/amd-ibs.h
+++ b/arch/x86/include/asm/amd-ibs.h
@@ -6,6 +6,22 @@
#include <asm/msr-index.h>
+/* IBS_OP_DATA2 DataSrc */
+#define IBS_DATA_SRC_LOC_CACHE 2
+#define IBS_DATA_SRC_DRAM 3
+#define IBS_DATA_SRC_REM_CACHE 4
+#define IBS_DATA_SRC_IO 7
+
+/* IBS_OP_DATA2 DataSrc Extension */
+#define IBS_DATA_SRC_EXT_LOC_CACHE 1
+#define IBS_DATA_SRC_EXT_NEAR_CCX_CACHE 2
+#define IBS_DATA_SRC_EXT_DRAM 3
+#define IBS_DATA_SRC_EXT_FAR_CCX_CACHE 5
+#define IBS_DATA_SRC_EXT_PMEM 6
+#define IBS_DATA_SRC_EXT_IO 7
+#define IBS_DATA_SRC_EXT_EXT_MEM 8
+#define IBS_DATA_SRC_EXT_PEER_AGENT_MEM 12
+
/*
* IBS Hardware MSRs
*/
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index ef4775c6db01..b71f4f2ecdd5 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -96,7 +96,7 @@
#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */
#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */
#define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */
-/* FREE! ( 3*32+17) */
+#define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* AMD Last Branch Record Extension Version 2 */
#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */
#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */
#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */
diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
index a1f0e90d0818..0bc931cd0698 100644
--- a/arch/x86/include/asm/hw_breakpoint.h
+++ b/arch/x86/include/asm/hw_breakpoint.h
@@ -44,10 +44,7 @@ struct arch_hw_breakpoint {
/* Total number of available HW breakpoint registers */
#define HBP_NUM 4
-static inline int hw_breakpoint_slots(int type)
-{
- return HBP_NUM;
-}
+#define hw_breakpoint_slots(type) (HBP_NUM)
struct perf_event_attr;
struct perf_event;
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 1e086b37a307..10ac52705892 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -590,6 +590,9 @@
#define MSR_AMD64_PERF_CNTR_GLOBAL_CTL 0xc0000301
#define MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR 0xc0000302
+/* AMD Last Branch Record MSRs */
+#define MSR_AMD64_LBR_SELECT 0xc000010e
+
/* Fam 17h MSRs */
#define MSR_F17H_IRPERF 0xc00000e9
@@ -761,6 +764,8 @@
#define MSR_AMD_DBG_EXTN_CFG 0xc000010f
#define MSR_AMD_SAMP_BR_FROM 0xc0010300
+#define DBG_EXTN_CFG_LBRV2EN BIT_ULL(6)
+
#define MSR_IA32_MPERF 0x000000e7
#define MSR_IA32_APERF 0x000000e8
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index f6fc8dd51ef4..9ac46dbe57d4 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -207,7 +207,8 @@ union cpuid_0x80000022_ebx {
struct {
/* Number of Core Performance Counters */
unsigned int num_core_pmc:4;
- unsigned int reserved:6;
+ /* Number of available LBR Stack Entries */
+ unsigned int lbr_v2_stack_sz:6;
/* Number of Data Fabric Counters */
unsigned int num_df_pmc:6;
} split;
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index fd44b54c90d5..fc01f81f6e2a 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -45,6 +45,7 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
{ X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 },
{ X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 },
+ { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 },
{ 0, 0, 0, 0, 0 }
};
diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c
index 6ce05ef4844d..00e3a637f7b6 100644
--- a/drivers/perf/arm_spe_pmu.c
+++ b/drivers/perf/arm_spe_pmu.c
@@ -44,7 +44,9 @@
* This allows us to perform the check, i.e, perfmon_capable(),
* in the context of the event owner, once, during the event_init().
*/
-#define SPE_PMU_HW_FLAGS_CX BIT(0)
+#define SPE_PMU_HW_FLAGS_CX 0x00001
+
+static_assert((PERF_EVENT_FLAG_ARCH & SPE_PMU_HW_FLAGS_CX) == SPE_PMU_HW_FLAGS_CX);
static void set_spe_event_has_cx(struct perf_event *event)
{
diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h
index 78dd7035d1e5..f319bd26b030 100644
--- a/include/linux/hw_breakpoint.h
+++ b/include/linux/hw_breakpoint.h
@@ -74,12 +74,12 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
extern int register_perf_hw_breakpoint(struct perf_event *bp);
extern void unregister_hw_breakpoint(struct perf_event *bp);
extern void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events);
+extern bool hw_breakpoint_is_used(void);
extern int dbg_reserve_bp_slot(struct perf_event *bp);
extern int dbg_release_bp_slot(struct perf_event *bp);
extern int reserve_bp_slot(struct perf_event *bp);
extern void release_bp_slot(struct perf_event *bp);
-int hw_breakpoint_weight(struct perf_event *bp);
int arch_reserve_bp_slot(struct perf_event *bp);
void arch_release_bp_slot(struct perf_event *bp);
void arch_unregister_hw_breakpoint(struct perf_event *bp);
@@ -121,6 +121,8 @@ register_perf_hw_breakpoint(struct perf_event *bp) { return -ENOSYS; }
static inline void unregister_hw_breakpoint(struct perf_event *bp) { }
static inline void
unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) { }
+static inline bool hw_breakpoint_is_used(void) { return false; }
+
static inline int
reserve_bp_slot(struct perf_event *bp) {return -ENOSYS; }
static inline void release_bp_slot(struct perf_event *bp) { }
diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index 5fda40f97fe9..36b942b67b7d 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -121,9 +121,15 @@ static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
preempt_enable();
}
+extern bool percpu_is_read_locked(struct percpu_rw_semaphore *);
extern void percpu_down_write(struct percpu_rw_semaphore *);
extern void percpu_up_write(struct percpu_rw_semaphore *);
+static inline bool percpu_is_write_locked(struct percpu_rw_semaphore *sem)
+{
+ return atomic_read(&sem->block);
+}
+
extern int __percpu_init_rwsem(struct percpu_rw_semaphore *,
const char *, struct lock_class_key *);
diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
index 0407a38b470a..0356cb6a215d 100644
--- a/include/linux/perf/arm_pmu.h
+++ b/include/linux/perf/arm_pmu.h
@@ -24,10 +24,11 @@
/*
* ARM PMU hw_event flags
*/
-/* Event uses a 64bit counter */
-#define ARMPMU_EVT_64BIT 1
-/* Event uses a 47bit counter */
-#define ARMPMU_EVT_47BIT 2
+#define ARMPMU_EVT_64BIT 0x00001 /* Event uses a 64bit counter */
+#define ARMPMU_EVT_47BIT 0x00002 /* Event uses a 47bit counter */
+
+static_assert((PERF_EVENT_FLAG_ARCH & ARMPMU_EVT_64BIT) == ARMPMU_EVT_64BIT);
+static_assert((PERF_EVENT_FLAG_ARCH & ARMPMU_EVT_47BIT) == ARMPMU_EVT_47BIT);
#define HW_OP_UNSUPPORTED 0xFFFF
#define C(_x) PERF_COUNT_HW_CACHE_##_x
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index ee8b9ecdc03b..853f64b6c8c2 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -36,6 +36,7 @@ struct perf_guest_info_callbacks {
};
#ifdef CONFIG_HAVE_HW_BREAKPOINT
+#include <linux/rhashtable-types.h>
#include <asm/hw_breakpoint.h>
#endif
@@ -60,6 +61,7 @@ struct perf_guest_info_callbacks {
#include <linux/refcount.h>
#include <linux/security.h>
#include <linux/static_call.h>
+#include <linux/lockdep.h>
#include <asm/local.h>
struct perf_callchain_entry {
@@ -137,9 +139,11 @@ struct hw_perf_event_extra {
* PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific
* usage.
*/
-#define PERF_EVENT_FLAG_ARCH 0x0000ffff
+#define PERF_EVENT_FLAG_ARCH 0x000fffff
#define PERF_EVENT_FLAG_USER_READ_CNT 0x80000000
+static_assert((PERF_EVENT_FLAG_USER_READ_CNT & PERF_EVENT_FLAG_ARCH) == 0);
+
/**
* struct hw_perf_event - performance event hardware details:
*/
@@ -178,7 +182,7 @@ struct hw_perf_event {
* creation and event initalization.
*/
struct arch_hw_breakpoint info;
- struct list_head bp_list;
+ struct rhlist_head bp_list;
};
#endif
struct { /* amd_iommu */
@@ -631,7 +635,23 @@ struct pmu_event_list {
struct list_head list;
};
+/*
+ * event->sibling_list is modified whole holding both ctx->lock and ctx->mutex
+ * as such iteration must hold either lock. However, since ctx->lock is an IRQ
+ * safe lock, and is only held by the CPU doing the modification, having IRQs
+ * disabled is sufficient since it will hold-off the IPIs.
+ */
+#ifdef CONFIG_PROVE_LOCKING
+#define lockdep_assert_event_ctx(event) \
+ WARN_ON_ONCE(__lockdep_enabled && \
+ (this_cpu_read(hardirqs_enabled) && \
+ lockdep_is_held(&(event)->ctx->mutex) != LOCK_STATE_HELD))
+#else
+#define lockdep_assert_event_ctx(event)
+#endif
+
#define for_each_sibling_event(sibling, event) \
+ lockdep_assert_event_ctx(event); \
if ((event)->group_leader == (event)) \
list_for_each_entry((sibling), &(event)->sibling_list, sibling_list)
@@ -1007,18 +1027,20 @@ struct perf_sample_data {
* Fields set by perf_sample_data_init(), group so as to
* minimize the cachelines touched.
*/
- u64 addr;
- struct perf_raw_record *raw;
- struct perf_branch_stack *br_stack;
+ u64 sample_flags;
u64 period;
- union perf_sample_weight weight;
- u64 txn;
- union perf_mem_data_src data_src;
/*
* The other fields, optionally {set,used} by
* perf_{prepare,output}_sample().
*/
+ struct perf_branch_stack *br_stack;
+ union perf_sample_weight weight;
+ union perf_mem_data_src data_src;
+ u64 txn;
+ u64 addr;
+ struct perf_raw_record *raw;
+
u64 type;
u64 ip;
struct {
@@ -1056,13 +1078,13 @@ static inline void perf_sample_data_init(struct perf_sample_data *data,
u64 addr, u64 period)
{
/* remaining struct members initialized in perf_prepare_sample() */
- data->addr = addr;
- data->raw = NULL;
- data->br_stack = NULL;
+ data->sample_flags = PERF_SAMPLE_PERIOD;
data->period = period;
- data->weight.full = 0;
- data->data_src.val = PERF_MEM_NA;
- data->txn = 0;
+
+ if (addr) {
+ data->addr = addr;
+ data->sample_flags |= PERF_SAMPLE_ADDR;
+ }
}
/*
@@ -1078,6 +1100,7 @@ static inline void perf_clear_branch_entry_bitfields(struct perf_branch_entry *b
br->abort = 0;
br->cycles = 0;
br->type = 0;
+ br->spec = PERF_BR_SPEC_NA;
br->reserved = 0;
}
@@ -1684,4 +1707,30 @@ static inline void perf_lopwr_cb(bool mode)
}
#endif
+#ifdef CONFIG_PERF_EVENTS
+static inline bool branch_sample_no_flags(const struct perf_event *event)
+{
+ return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_FLAGS;
+}
+
+static inline bool branch_sample_no_cycles(const struct perf_event *event)
+{
+ return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_CYCLES;
+}
+
+static inline bool branch_sample_type(const struct perf_event *event)
+{
+ return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_TYPE_SAVE;
+}
+
+static inline bool branch_sample_hw_index(const struct perf_event *event)
+{
+ return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX;
+}
+
+static inline bool branch_sample_priv(const struct perf_event *event)
+{
+ return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_PRIV_SAVE;
+}
+#endif /* CONFIG_PERF_EVENTS */
#endif /* _LINUX_PERF_EVENT_H */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 03b370062741..85be78e0e7f6 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -164,8 +164,6 @@ enum perf_event_sample_format {
PERF_SAMPLE_WEIGHT_STRUCT = 1U << 24,
PERF_SAMPLE_MAX = 1U << 25, /* non-ABI */
-
- __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */
};
#define PERF_SAMPLE_WEIGHT_TYPE (PERF_SAMPLE_WEIGHT | PERF_SAMPLE_WEIGHT_STRUCT)
@@ -204,6 +202,8 @@ enum perf_branch_sample_type_shift {
PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT = 17, /* save low level index of raw branch records */
+ PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT = 18, /* save privilege mode */
+
PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */
};
@@ -233,6 +233,8 @@ enum perf_branch_sample_type {
PERF_SAMPLE_BRANCH_HW_INDEX = 1U << PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT,
+ PERF_SAMPLE_BRANCH_PRIV_SAVE = 1U << PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT,
+
PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT,
};
@@ -253,9 +255,48 @@ enum {
PERF_BR_COND_RET = 10, /* conditional function return */
PERF_BR_ERET = 11, /* exception return */
PERF_BR_IRQ = 12, /* irq */
+ PERF_BR_SERROR = 13, /* system error */
+ PERF_BR_NO_TX = 14, /* not in transaction */
+ PERF_BR_EXTEND_ABI = 15, /* extend ABI */
PERF_BR_MAX,
};
+/*
+ * Common branch speculation outcome classification
+ */
+enum {
+ PERF_BR_SPEC_NA = 0, /* Not available */
+ PERF_BR_SPEC_WRONG_PATH = 1, /* Speculative but on wrong path */
+ PERF_BR_NON_SPEC_CORRECT_PATH = 2, /* Non-speculative but on correct path */
+ PERF_BR_SPEC_CORRECT_PATH = 3, /* Speculative and on correct path */
+ PERF_BR_SPEC_MAX,
+};
+
+enum {
+ PERF_BR_NEW_FAULT_ALGN = 0, /* Alignment fault */
+ PERF_BR_NEW_FAULT_DATA = 1, /* Data fault */
+ PERF_BR_NEW_FAULT_INST = 2, /* Inst fault */
+ PERF_BR_NEW_ARCH_1 = 3, /* Architecture specific */
+ PERF_BR_NEW_ARCH_2 = 4, /* Architecture specific */
+ PERF_BR_NEW_ARCH_3 = 5, /* Architecture specific */
+ PERF_BR_NEW_ARCH_4 = 6, /* Architecture specific */
+ PERF_BR_NEW_ARCH_5 = 7, /* Architecture specific */
+ PERF_BR_NEW_MAX,
+};
+
+enum {
+ PERF_BR_PRIV_UNKNOWN = 0,
+ PERF_BR_PRIV_USER = 1,
+ PERF_BR_PRIV_KERNEL = 2,
+ PERF_BR_PRIV_HV = 3,
+};
+
+#define PERF_BR_ARM64_FIQ PERF_BR_NEW_ARCH_1
+#define PERF_BR_ARM64_DEBUG_HALT PERF_BR_NEW_ARCH_2
+#define PERF_BR_ARM64_DEBUG_EXIT PERF_BR_NEW_ARCH_3
+#define PERF_BR_ARM64_DEBUG_INST PERF_BR_NEW_ARCH_4
+#define PERF_BR_ARM64_DEBUG_DATA PERF_BR_NEW_ARCH_5
+
#define PERF_SAMPLE_BRANCH_PLM_ALL \
(PERF_SAMPLE_BRANCH_USER|\
PERF_SAMPLE_BRANCH_KERNEL|\
@@ -1295,7 +1336,9 @@ union perf_mem_data_src {
#define PERF_MEM_LVLNUM_L2 0x02 /* L2 */
#define PERF_MEM_LVLNUM_L3 0x03 /* L3 */
#define PERF_MEM_LVLNUM_L4 0x04 /* L4 */
-/* 5-0xa available */
+/* 5-0x8 available */
+#define PERF_MEM_LVLNUM_EXTN_MEM 0x09 /* Extension memory */
+#define PERF_MEM_LVLNUM_IO 0x0a /* I/O */
#define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */
#define PERF_MEM_LVLNUM_LFB 0x0c /* LFB */
#define PERF_MEM_LVLNUM_RAM 0x0d /* RAM */
@@ -1313,7 +1356,7 @@ union perf_mem_data_src {
#define PERF_MEM_SNOOP_SHIFT 19
#define PERF_MEM_SNOOPX_FWD 0x01 /* forward */
-/* 1 free */
+#define PERF_MEM_SNOOPX_PEER 0x02 /* xfer from peer */
#define PERF_MEM_SNOOPX_SHIFT 38
/* locked instruction */
@@ -1363,6 +1406,7 @@ union perf_mem_data_src {
* abort: aborting a hardware transaction
* cycles: cycles from last branch (or 0 if not supported)
* type: branch type
+ * spec: branch speculation info (or 0 if not supported)
*/
struct perf_branch_entry {
__u64 from;
@@ -1373,7 +1417,10 @@ struct perf_branch_entry {
abort:1, /* transaction abort */
cycles:16, /* cycle count to last branch */
type:4, /* branch type */
- reserved:40;
+ spec:2, /* branch speculation info */
+ new_type:4, /* additional branch type */
+ priv:3, /* privilege level */
+ reserved:31;
};
union perf_sample_weight {
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 1adbe67cdb95..aecea7451b61 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -338,7 +338,7 @@ BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx,
int ret;
/* perf_sample_data doesn't have callchain, use bpf_get_stackid */
- if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
+ if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN))
return bpf_get_stackid((unsigned long)(ctx->regs),
(unsigned long) map, flags, 0, 0);
@@ -506,7 +506,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx,
int err = -EINVAL;
__u64 nr_kernel;
- if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
+ if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN))
return __bpf_get_stack(regs, NULL, NULL, buf, size, flags);
if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 8591c180b52b..91a62f566743 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -2,4 +2,5 @@
obj-y := core.o ring_buffer.o callchain.o
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
+obj-$(CONFIG_HW_BREAKPOINT_KUNIT_TEST) += hw_breakpoint_test.o
obj-$(CONFIG_UPROBES) += uprobes.o
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ff4bffc502c6..43b0df997d13 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1468,6 +1468,8 @@ static void __update_context_time(struct perf_event_context *ctx, bool adv)
{
u64 now = perf_clock();
+ lockdep_assert_held(&ctx->lock);
+
if (adv)
ctx->time += now - ctx->timestamp;
ctx->timestamp = now;
@@ -2224,16 +2226,22 @@ static inline int __pmu_filter_match(struct perf_event *event)
static inline int pmu_filter_match(struct perf_event *event)
{
struct perf_event *sibling;
+ unsigned long flags;
+ int ret = 1;
if (!__pmu_filter_match(event))
return 0;
+ local_irq_save(flags);
for_each_sibling_event(sibling, event) {
- if (!__pmu_filter_match(sibling))
- return 0;
+ if (!__pmu_filter_match(sibling)) {
+ ret = 0;
+ break;
+ }
}
+ local_irq_restore(flags);
- return 1;
+ return ret;
}
static inline int
@@ -6794,11 +6802,10 @@ out_put:
static void __perf_event_header__init_id(struct perf_event_header *header,
struct perf_sample_data *data,
- struct perf_event *event)
+ struct perf_event *event,
+ u64 sample_type)
{
- u64 sample_type = event->attr.sample_type;
-
- data->type = sample_type;
+ data->type = event->attr.sample_type;
header->size += event->id_header_size;
if (sample_type & PERF_SAMPLE_TID) {
@@ -6827,7 +6834,7 @@ void perf_event_header__init_id(struct perf_event_header *header,
struct perf_event *event)
{
if (event->attr.sample_id_all)
- __perf_event_header__init_id(header, data, event);
+ __perf_event_header__init_id(header, data, event, event->attr.sample_type);
}
static void __perf_event__output_id_sample(struct perf_output_handle *handle,
@@ -6976,11 +6983,6 @@ static void perf_output_read(struct perf_output_handle *handle,
perf_output_read_one(handle, event, enabled, running);
}
-static inline bool perf_sample_save_hw_index(struct perf_event *event)
-{
- return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX;
-}
-
void perf_output_sample(struct perf_output_handle *handle,
struct perf_event_header *header,
struct perf_sample_data *data,
@@ -7062,14 +7064,14 @@ void perf_output_sample(struct perf_output_handle *handle,
}
if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
- if (data->br_stack) {
+ if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) {
size_t size;
size = data->br_stack->nr
* sizeof(struct perf_branch_entry);
perf_output_put(handle, data->br_stack->nr);
- if (perf_sample_save_hw_index(event))
+ if (branch_sample_hw_index(event))
perf_output_put(handle, data->br_stack->hw_idx);
perf_output_copy(handle, data->br_stack->entries, size);
} else {
@@ -7312,6 +7314,7 @@ void perf_prepare_sample(struct perf_event_header *header,
struct pt_regs *regs)
{
u64 sample_type = event->attr.sample_type;
+ u64 filtered_sample_type;
header->type = PERF_RECORD_SAMPLE;
header->size = sizeof(*header) + event->header_size;
@@ -7319,7 +7322,12 @@ void perf_prepare_sample(struct perf_event_header *header,
header->misc = 0;
header->misc |= perf_misc_flags(regs);
- __perf_event_header__init_id(header, data, event);
+ /*
+ * Clear the sample flags that have already been done by the
+ * PMU driver.
+ */
+ filtered_sample_type = sample_type & ~data->sample_flags;
+ __perf_event_header__init_id(header, data, event, filtered_sample_type);
if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE))
data->ip = perf_instruction_pointer(regs);
@@ -7327,7 +7335,7 @@ void perf_prepare_sample(struct perf_event_header *header,
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
int size = 1;
- if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
+ if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN)
data->callchain = perf_callchain(event, regs);
size += data->callchain->nr;
@@ -7339,7 +7347,7 @@ void perf_prepare_sample(struct perf_event_header *header,
struct perf_raw_record *raw = data->raw;
int size;
- if (raw) {
+ if (raw && (data->sample_flags & PERF_SAMPLE_RAW)) {
struct perf_raw_frag *frag = &raw->frag;
u32 sum = 0;
@@ -7355,6 +7363,7 @@ void perf_prepare_sample(struct perf_event_header *header,
frag->pad = raw->size - sum;
} else {
size = sizeof(u64);
+ data->raw = NULL;
}
header->size += size;
@@ -7362,8 +7371,8 @@ void perf_prepare_sample(struct perf_event_header *header,
if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
int size = sizeof(u64); /* nr */
- if (data->br_stack) {
- if (perf_sample_save_hw_index(event))
+ if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) {
+ if (branch_sample_hw_index(event))
size += sizeof(u64);
size += data->br_stack->nr
@@ -7412,6 +7421,20 @@ void perf_prepare_sample(struct perf_event_header *header,
header->size += size;
}
+ if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE)
+ data->weight.full = 0;
+
+ if (filtered_sample_type & PERF_SAMPLE_DATA_SRC)
+ data->data_src.val = PERF_MEM_NA;
+
+ if (filtered_sample_type & PERF_SAMPLE_TRANSACTION)
+ data->txn = 0;
+
+ if (sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_DATA_PAGE_SIZE)) {
+ if (filtered_sample_type & PERF_SAMPLE_ADDR)
+ data->addr = 0;
+ }
+
if (sample_type & PERF_SAMPLE_REGS_INTR) {
/* regs dump ABI info */
int size = sizeof(u64);
@@ -7427,7 +7450,8 @@ void perf_prepare_sample(struct perf_event_header *header,
header->size += size;
}
- if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+ if (sample_type & PERF_SAMPLE_PHYS_ADDR &&
+ filtered_sample_type & PERF_SAMPLE_PHYS_ADDR)
data->phys_addr = perf_virt_to_phys(data->addr);
#ifdef CONFIG_CGROUP_PERF
@@ -9998,8 +10022,16 @@ static void bpf_overflow_handler(struct perf_event *event,
goto out;
rcu_read_lock();
prog = READ_ONCE(event->prog);
- if (prog)
+ if (prog) {
+ if (prog->call_get_stack &&
+ (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
+ !(data->sample_flags & PERF_SAMPLE_CALLCHAIN)) {
+ data->callchain = perf_callchain(event, regs);
+ data->sample_flags |= PERF_SAMPLE_CALLCHAIN;
+ }
+
ret = bpf_prog_run(prog, &ctx);
+ }
rcu_read_unlock();
out:
__this_cpu_dec(bpf_prog_active);
@@ -10025,7 +10057,7 @@ static int perf_event_set_bpf_handler(struct perf_event *event,
if (event->attr.precise_ip &&
prog->call_get_stack &&
- (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) ||
+ (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) ||
event->attr.exclude_callchain_kernel ||
event->attr.exclude_callchain_user)) {
/*
@@ -10942,7 +10974,7 @@ static ssize_t nr_addr_filters_show(struct device *dev,
{
struct pmu *pmu = dev_get_drvdata(dev);
- return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
+ return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
}
DEVICE_ATTR_RO(nr_addr_filters);
@@ -10953,7 +10985,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)
{
struct pmu *pmu = dev_get_drvdata(dev);
- return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
+ return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->type);
}
static DEVICE_ATTR_RO(type);
@@ -10964,7 +10996,7 @@ perf_event_mux_interval_ms_show(struct device *dev,
{
struct pmu *pmu = dev_get_drvdata(dev);
- return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
+ return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->hrtimer_interval_ms);
}
static DEFINE_MUTEX(mux_interval_mutex);
@@ -11718,11 +11750,9 @@ err_pmu:
event->destroy(event);
module_put(pmu->module);
err_ns:
- if (event->ns)
- put_pid_ns(event->ns);
if (event->hw.target)
put_task_struct(event->hw.target);
- kmem_cache_free(perf_event_cache, event);
+ call_rcu(&event->rcu_head, free_event_rcu);
return ERR_PTR(err);
}
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index f32320ac02fd..c3797701339c 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -17,61 +17,276 @@
* This file contains the arch-independent routines.
*/
+#include <linux/hw_breakpoint.h>
+
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <linux/cpu.h>
+#include <linux/export.h>
+#include <linux/init.h>
#include <linux/irqflags.h>
-#include <linux/kallsyms.h>
-#include <linux/notifier.h>
-#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/percpu-rwsem.h>
#include <linux/percpu.h>
+#include <linux/rhashtable.h>
#include <linux/sched.h>
-#include <linux/init.h>
#include <linux/slab.h>
-#include <linux/list.h>
-#include <linux/cpu.h>
-#include <linux/smp.h>
-#include <linux/bug.h>
-#include <linux/hw_breakpoint.h>
/*
- * Constraints data
+ * Datastructure to track the total uses of N slots across tasks or CPUs;
+ * bp_slots_histogram::count[N] is the number of assigned N+1 breakpoint slots.
+ */
+struct bp_slots_histogram {
+#ifdef hw_breakpoint_slots
+ atomic_t count[hw_breakpoint_slots(0)];
+#else
+ atomic_t *count;
+#endif
+};
+
+/*
+ * Per-CPU constraints data.
*/
struct bp_cpuinfo {
- /* Number of pinned cpu breakpoints in a cpu */
- unsigned int cpu_pinned;
- /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */
- unsigned int *tsk_pinned;
- /* Number of non-pinned cpu/task breakpoints in a cpu */
- unsigned int flexible; /* XXX: placeholder, see fetch_this_slot() */
+ /* Number of pinned CPU breakpoints in a CPU. */
+ unsigned int cpu_pinned;
+ /* Histogram of pinned task breakpoints in a CPU. */
+ struct bp_slots_histogram tsk_pinned;
};
static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]);
-static int nr_slots[TYPE_MAX];
static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type)
{
return per_cpu_ptr(bp_cpuinfo + type, cpu);
}
+/* Number of pinned CPU breakpoints globally. */
+static struct bp_slots_histogram cpu_pinned[TYPE_MAX];
+/* Number of pinned CPU-independent task breakpoints. */
+static struct bp_slots_histogram tsk_pinned_all[TYPE_MAX];
+
/* Keep track of the breakpoints attached to tasks */
-static LIST_HEAD(bp_task_head);
+static struct rhltable task_bps_ht;
+static const struct rhashtable_params task_bps_ht_params = {
+ .head_offset = offsetof(struct hw_perf_event, bp_list),
+ .key_offset = offsetof(struct hw_perf_event, target),
+ .key_len = sizeof_field(struct hw_perf_event, target),
+ .automatic_shrinking = true,
+};
-static int constraints_initialized;
+static bool constraints_initialized __ro_after_init;
-/* Gather the number of total pinned and un-pinned bp in a cpuset */
-struct bp_busy_slots {
- unsigned int pinned;
- unsigned int flexible;
-};
+/*
+ * Synchronizes accesses to the per-CPU constraints; the locking rules are:
+ *
+ * 1. Atomic updates to bp_cpuinfo::tsk_pinned only require a held read-lock
+ * (due to bp_slots_histogram::count being atomic, no update are lost).
+ *
+ * 2. Holding a write-lock is required for computations that require a
+ * stable snapshot of all bp_cpuinfo::tsk_pinned.
+ *
+ * 3. In all other cases, non-atomic accesses require the appropriately held
+ * lock (read-lock for read-only accesses; write-lock for reads/writes).
+ */
+DEFINE_STATIC_PERCPU_RWSEM(bp_cpuinfo_sem);
-/* Serialize accesses to the above constraints */
-static DEFINE_MUTEX(nr_bp_mutex);
+/*
+ * Return mutex to serialize accesses to per-task lists in task_bps_ht. Since
+ * rhltable synchronizes concurrent insertions/deletions, independent tasks may
+ * insert/delete concurrently; therefore, a mutex per task is sufficient.
+ *
+ * Uses task_struct::perf_event_mutex, to avoid extending task_struct with a
+ * hw_breakpoint-only mutex, which may be infrequently used. The caveat here is
+ * that hw_breakpoint may contend with per-task perf event list management. The
+ * assumption is that perf usecases involving hw_breakpoints are very unlikely
+ * to result in unnecessary contention.
+ */
+static inline struct mutex *get_task_bps_mutex(struct perf_event *bp)
+{
+ struct task_struct *tsk = bp->hw.target;
-__weak int hw_breakpoint_weight(struct perf_event *bp)
+ return tsk ? &tsk->perf_event_mutex : NULL;
+}
+
+static struct mutex *bp_constraints_lock(struct perf_event *bp)
+{
+ struct mutex *tsk_mtx = get_task_bps_mutex(bp);
+
+ if (tsk_mtx) {
+ /*
+ * Fully analogous to the perf_try_init_event() nesting
+ * argument in the comment near perf_event_ctx_lock_nested();
+ * this child->perf_event_mutex cannot ever deadlock against
+ * the parent->perf_event_mutex usage from
+ * perf_event_task_{en,dis}able().
+ *
+ * Specifically, inherited events will never occur on
+ * ->perf_event_list.
+ */
+ mutex_lock_nested(tsk_mtx, SINGLE_DEPTH_NESTING);
+ percpu_down_read(&bp_cpuinfo_sem);
+ } else {
+ percpu_down_write(&bp_cpuinfo_sem);
+ }
+
+ return tsk_mtx;
+}
+
+static void bp_constraints_unlock(struct mutex *tsk_mtx)
+{
+ if (tsk_mtx) {
+ percpu_up_read(&bp_cpuinfo_sem);
+ mutex_unlock(tsk_mtx);
+ } else {
+ percpu_up_write(&bp_cpuinfo_sem);
+ }
+}
+
+static bool bp_constraints_is_locked(struct perf_event *bp)
+{
+ struct mutex *tsk_mtx = get_task_bps_mutex(bp);
+
+ return percpu_is_write_locked(&bp_cpuinfo_sem) ||
+ (tsk_mtx ? mutex_is_locked(tsk_mtx) :
+ percpu_is_read_locked(&bp_cpuinfo_sem));
+}
+
+static inline void assert_bp_constraints_lock_held(struct perf_event *bp)
+{
+ struct mutex *tsk_mtx = get_task_bps_mutex(bp);
+
+ if (tsk_mtx)
+ lockdep_assert_held(tsk_mtx);
+ lockdep_assert_held(&bp_cpuinfo_sem);
+}
+
+#ifdef hw_breakpoint_slots
+/*
+ * Number of breakpoint slots is constant, and the same for all types.
+ */
+static_assert(hw_breakpoint_slots(TYPE_INST) == hw_breakpoint_slots(TYPE_DATA));
+static inline int hw_breakpoint_slots_cached(int type) { return hw_breakpoint_slots(type); }
+static inline int init_breakpoint_slots(void) { return 0; }
+#else
+/*
+ * Dynamic number of breakpoint slots.
+ */
+static int __nr_bp_slots[TYPE_MAX] __ro_after_init;
+
+static inline int hw_breakpoint_slots_cached(int type)
+{
+ return __nr_bp_slots[type];
+}
+
+static __init bool
+bp_slots_histogram_alloc(struct bp_slots_histogram *hist, enum bp_type_idx type)
+{
+ hist->count = kcalloc(hw_breakpoint_slots_cached(type), sizeof(*hist->count), GFP_KERNEL);
+ return hist->count;
+}
+
+static __init void bp_slots_histogram_free(struct bp_slots_histogram *hist)
+{
+ kfree(hist->count);
+}
+
+static __init int init_breakpoint_slots(void)
+{
+ int i, cpu, err_cpu;
+
+ for (i = 0; i < TYPE_MAX; i++)
+ __nr_bp_slots[i] = hw_breakpoint_slots(i);
+
+ for_each_possible_cpu(cpu) {
+ for (i = 0; i < TYPE_MAX; i++) {
+ struct bp_cpuinfo *info = get_bp_info(cpu, i);
+
+ if (!bp_slots_histogram_alloc(&info->tsk_pinned, i))
+ goto err;
+ }
+ }
+ for (i = 0; i < TYPE_MAX; i++) {
+ if (!bp_slots_histogram_alloc(&cpu_pinned[i], i))
+ goto err;
+ if (!bp_slots_histogram_alloc(&tsk_pinned_all[i], i))
+ goto err;
+ }
+
+ return 0;
+err:
+ for_each_possible_cpu(err_cpu) {
+ for (i = 0; i < TYPE_MAX; i++)
+ bp_slots_histogram_free(&get_bp_info(err_cpu, i)->tsk_pinned);
+ if (err_cpu == cpu)
+ break;
+ }
+ for (i = 0; i < TYPE_MAX; i++) {
+ bp_slots_histogram_free(&cpu_pinned[i]);
+ bp_slots_histogram_free(&tsk_pinned_all[i]);
+ }
+
+ return -ENOMEM;
+}
+#endif
+
+static inline void
+bp_slots_histogram_add(struct bp_slots_histogram *hist, int old, int val)
+{
+ const int old_idx = old - 1;
+ const int new_idx = old_idx + val;
+
+ if (old_idx >= 0)
+ WARN_ON(atomic_dec_return_relaxed(&hist->count[old_idx]) < 0);
+ if (new_idx >= 0)
+ WARN_ON(atomic_inc_return_relaxed(&hist->count[new_idx]) < 0);
+}
+
+static int
+bp_slots_histogram_max(struct bp_slots_histogram *hist, enum bp_type_idx type)
+{
+ for (int i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) {
+ const int count = atomic_read(&hist->count[i]);
+
+ /* Catch unexpected writers; we want a stable snapshot. */
+ ASSERT_EXCLUSIVE_WRITER(hist->count[i]);
+ if (count > 0)
+ return i + 1;
+ WARN(count < 0, "inconsistent breakpoint slots histogram");
+ }
+
+ return 0;
+}
+
+static int
+bp_slots_histogram_max_merge(struct bp_slots_histogram *hist1, struct bp_slots_histogram *hist2,
+ enum bp_type_idx type)
+{
+ for (int i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) {
+ const int count1 = atomic_read(&hist1->count[i]);
+ const int count2 = atomic_read(&hist2->count[i]);
+
+ /* Catch unexpected writers; we want a stable snapshot. */
+ ASSERT_EXCLUSIVE_WRITER(hist1->count[i]);
+ ASSERT_EXCLUSIVE_WRITER(hist2->count[i]);
+ if (count1 + count2 > 0)
+ return i + 1;
+ WARN(count1 < 0, "inconsistent breakpoint slots histogram");
+ WARN(count2 < 0, "inconsistent breakpoint slots histogram");
+ }
+
+ return 0;
+}
+
+#ifndef hw_breakpoint_weight
+static inline int hw_breakpoint_weight(struct perf_event *bp)
{
return 1;
}
+#endif
static inline enum bp_type_idx find_slot_idx(u64 bp_type)
{
@@ -82,39 +297,61 @@ static inline enum bp_type_idx find_slot_idx(u64 bp_type)
}
/*
- * Report the maximum number of pinned breakpoints a task
- * have in this cpu
+ * Return the maximum number of pinned breakpoints a task has in this CPU.
*/
static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
{
- unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
- int i;
+ struct bp_slots_histogram *tsk_pinned = &get_bp_info(cpu, type)->tsk_pinned;
- for (i = nr_slots[type] - 1; i >= 0; i--) {
- if (tsk_pinned[i] > 0)
- return i + 1;
- }
-
- return 0;
+ /*
+ * At this point we want to have acquired the bp_cpuinfo_sem as a
+ * writer to ensure that there are no concurrent writers in
+ * toggle_bp_task_slot() to tsk_pinned, and we get a stable snapshot.
+ */
+ lockdep_assert_held_write(&bp_cpuinfo_sem);
+ return bp_slots_histogram_max_merge(tsk_pinned, &tsk_pinned_all[type], type);
}
/*
* Count the number of breakpoints of the same type and same task.
* The given event must be not on the list.
+ *
+ * If @cpu is -1, but the result of task_bp_pinned() is not CPU-independent,
+ * returns a negative value.
*/
static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
{
- struct task_struct *tsk = bp->hw.target;
+ struct rhlist_head *head, *pos;
struct perf_event *iter;
int count = 0;
- list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
- if (iter->hw.target == tsk &&
- find_slot_idx(iter->attr.bp_type) == type &&
- (iter->cpu < 0 || cpu == iter->cpu))
- count += hw_breakpoint_weight(iter);
+ /*
+ * We need a stable snapshot of the per-task breakpoint list.
+ */
+ assert_bp_constraints_lock_held(bp);
+
+ rcu_read_lock();
+ head = rhltable_lookup(&task_bps_ht, &bp->hw.target, task_bps_ht_params);
+ if (!head)
+ goto out;
+
+ rhl_for_each_entry_rcu(iter, pos, head, hw.bp_list) {
+ if (find_slot_idx(iter->attr.bp_type) != type)
+ continue;
+
+ if (iter->cpu >= 0) {
+ if (cpu == -1) {
+ count = -1;
+ goto out;
+ } else if (cpu != iter->cpu)
+ continue;
+ }
+
+ count += hw_breakpoint_weight(iter);
}
+out:
+ rcu_read_unlock();
return count;
}
@@ -126,16 +363,29 @@ static const struct cpumask *cpumask_of_bp(struct perf_event *bp)
}
/*
- * Report the number of pinned/un-pinned breakpoints we have in
- * a given cpu (cpu > -1) or in all of them (cpu = -1).
+ * Returns the max pinned breakpoint slots in a given
+ * CPU (cpu > -1) or across all of them (cpu = -1).
*/
-static void
-fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
- enum bp_type_idx type)
+static int
+max_bp_pinned_slots(struct perf_event *bp, enum bp_type_idx type)
{
const struct cpumask *cpumask = cpumask_of_bp(bp);
+ int pinned_slots = 0;
int cpu;
+ if (bp->hw.target && bp->cpu < 0) {
+ int max_pinned = task_bp_pinned(-1, bp, type);
+
+ if (max_pinned >= 0) {
+ /*
+ * Fast path: task_bp_pinned() is CPU-independent and
+ * returns the same value for any CPU.
+ */
+ max_pinned += bp_slots_histogram_max(&cpu_pinned[type], type);
+ return max_pinned;
+ }
+ }
+
for_each_cpu(cpu, cpumask) {
struct bp_cpuinfo *info = get_bp_info(cpu, type);
int nr;
@@ -146,71 +396,131 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
else
nr += task_bp_pinned(cpu, bp, type);
- if (nr > slots->pinned)
- slots->pinned = nr;
-
- nr = info->flexible;
- if (nr > slots->flexible)
- slots->flexible = nr;
+ pinned_slots = max(nr, pinned_slots);
}
-}
-/*
- * For now, continue to consider flexible as pinned, until we can
- * ensure no flexible event can ever be scheduled before a pinned event
- * in a same cpu.
- */
-static void
-fetch_this_slot(struct bp_busy_slots *slots, int weight)
-{
- slots->pinned += weight;
-}
-
-/*
- * Add a pinned breakpoint for the given task in our constraint table
- */
-static void toggle_bp_task_slot(struct perf_event *bp, int cpu,
- enum bp_type_idx type, int weight)
-{
- unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
- int old_idx, new_idx;
-
- old_idx = task_bp_pinned(cpu, bp, type) - 1;
- new_idx = old_idx + weight;
-
- if (old_idx >= 0)
- tsk_pinned[old_idx]--;
- if (new_idx >= 0)
- tsk_pinned[new_idx]++;
+ return pinned_slots;
}
/*
* Add/remove the given breakpoint in our constraint table
*/
-static void
-toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
- int weight)
+static int
+toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, int weight)
{
- const struct cpumask *cpumask = cpumask_of_bp(bp);
- int cpu;
+ int cpu, next_tsk_pinned;
if (!enable)
weight = -weight;
- /* Pinned counter cpu profiling */
if (!bp->hw.target) {
- get_bp_info(bp->cpu, type)->cpu_pinned += weight;
- return;
+ /*
+ * Update the pinned CPU slots, in per-CPU bp_cpuinfo and in the
+ * global histogram.
+ */
+ struct bp_cpuinfo *info = get_bp_info(bp->cpu, type);
+
+ lockdep_assert_held_write(&bp_cpuinfo_sem);
+ bp_slots_histogram_add(&cpu_pinned[type], info->cpu_pinned, weight);
+ info->cpu_pinned += weight;
+ return 0;
+ }
+
+ /*
+ * If bp->hw.target, tsk_pinned is only modified, but not used
+ * otherwise. We can permit concurrent updates as long as there are no
+ * other uses: having acquired bp_cpuinfo_sem as a reader allows
+ * concurrent updates here. Uses of tsk_pinned will require acquiring
+ * bp_cpuinfo_sem as a writer to stabilize tsk_pinned's value.
+ */
+ lockdep_assert_held_read(&bp_cpuinfo_sem);
+
+ /*
+ * Update the pinned task slots, in per-CPU bp_cpuinfo and in the global
+ * histogram. We need to take care of 4 cases:
+ *
+ * 1. This breakpoint targets all CPUs (cpu < 0), and there may only
+ * exist other task breakpoints targeting all CPUs. In this case we
+ * can simply update the global slots histogram.
+ *
+ * 2. This breakpoint targets a specific CPU (cpu >= 0), but there may
+ * only exist other task breakpoints targeting all CPUs.
+ *
+ * a. On enable: remove the existing breakpoints from the global
+ * slots histogram and use the per-CPU histogram.
+ *
+ * b. On disable: re-insert the existing breakpoints into the global
+ * slots histogram and remove from per-CPU histogram.
+ *
+ * 3. Some other existing task breakpoints target specific CPUs. Only
+ * update the per-CPU slots histogram.
+ */
+
+ if (!enable) {
+ /*
+ * Remove before updating histograms so we can determine if this
+ * was the last task breakpoint for a specific CPU.
+ */
+ int ret = rhltable_remove(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params);
+
+ if (ret)
+ return ret;
+ }
+ /*
+ * Note: If !enable, next_tsk_pinned will not count the to-be-removed breakpoint.
+ */
+ next_tsk_pinned = task_bp_pinned(-1, bp, type);
+
+ if (next_tsk_pinned >= 0) {
+ if (bp->cpu < 0) { /* Case 1: fast path */
+ if (!enable)
+ next_tsk_pinned += hw_breakpoint_weight(bp);
+ bp_slots_histogram_add(&tsk_pinned_all[type], next_tsk_pinned, weight);
+ } else if (enable) { /* Case 2.a: slow path */
+ /* Add existing to per-CPU histograms. */
+ for_each_possible_cpu(cpu) {
+ bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned,
+ 0, next_tsk_pinned);
+ }
+ /* Add this first CPU-pinned task breakpoint. */
+ bp_slots_histogram_add(&get_bp_info(bp->cpu, type)->tsk_pinned,
+ next_tsk_pinned, weight);
+ /* Rebalance global task pinned histogram. */
+ bp_slots_histogram_add(&tsk_pinned_all[type], next_tsk_pinned,
+ -next_tsk_pinned);
+ } else { /* Case 2.b: slow path */
+ /* Remove this last CPU-pinned task breakpoint. */
+ bp_slots_histogram_add(&get_bp_info(bp->cpu, type)->tsk_pinned,
+ next_tsk_pinned + hw_breakpoint_weight(bp), weight);
+ /* Remove all from per-CPU histograms. */
+ for_each_possible_cpu(cpu) {
+ bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned,
+ next_tsk_pinned, -next_tsk_pinned);
+ }
+ /* Rebalance global task pinned histogram. */
+ bp_slots_histogram_add(&tsk_pinned_all[type], 0, next_tsk_pinned);
+ }
+ } else { /* Case 3: slow path */
+ const struct cpumask *cpumask = cpumask_of_bp(bp);
+
+ for_each_cpu(cpu, cpumask) {
+ next_tsk_pinned = task_bp_pinned(cpu, bp, type);
+ if (!enable)
+ next_tsk_pinned += hw_breakpoint_weight(bp);
+ bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned,
+ next_tsk_pinned, weight);
+ }
}
- /* Pinned counter task profiling */
- for_each_cpu(cpu, cpumask)
- toggle_bp_task_slot(bp, cpu, type, weight);
+ /*
+ * Readers want a stable snapshot of the per-task breakpoint list.
+ */
+ assert_bp_constraints_lock_held(bp);
if (enable)
- list_add_tail(&bp->hw.bp_list, &bp_task_head);
- else
- list_del(&bp->hw.bp_list);
+ return rhltable_insert(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params);
+
+ return 0;
}
__weak int arch_reserve_bp_slot(struct perf_event *bp)
@@ -234,7 +544,12 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
}
/*
- * Constraints to check before allowing this new breakpoint counter:
+ * Constraints to check before allowing this new breakpoint counter.
+ *
+ * Note: Flexible breakpoints are currently unimplemented, but outlined in the
+ * below algorithm for completeness. The implementation treats flexible as
+ * pinned due to no guarantee that we currently always schedule flexible events
+ * before a pinned event in a same CPU.
*
* == Non-pinned counter == (Considered as pinned for now)
*
@@ -276,8 +591,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
*/
static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type)
{
- struct bp_busy_slots slots = {0};
enum bp_type_idx type;
+ int max_pinned_slots;
int weight;
int ret;
@@ -293,36 +608,24 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type)
type = find_slot_idx(bp_type);
weight = hw_breakpoint_weight(bp);
- fetch_bp_busy_slots(&slots, bp, type);
- /*
- * Simulate the addition of this breakpoint to the constraints
- * and see the result.
- */
- fetch_this_slot(&slots, weight);
-
- /* Flexible counters need to keep at least one slot */
- if (slots.pinned + (!!slots.flexible) > nr_slots[type])
+ /* Check if this new breakpoint can be satisfied across all CPUs. */
+ max_pinned_slots = max_bp_pinned_slots(bp, type) + weight;
+ if (max_pinned_slots > hw_breakpoint_slots_cached(type))
return -ENOSPC;
ret = arch_reserve_bp_slot(bp);
if (ret)
return ret;
- toggle_bp_slot(bp, true, type, weight);
-
- return 0;
+ return toggle_bp_slot(bp, true, type, weight);
}
int reserve_bp_slot(struct perf_event *bp)
{
- int ret;
-
- mutex_lock(&nr_bp_mutex);
-
- ret = __reserve_bp_slot(bp, bp->attr.bp_type);
-
- mutex_unlock(&nr_bp_mutex);
+ struct mutex *mtx = bp_constraints_lock(bp);
+ int ret = __reserve_bp_slot(bp, bp->attr.bp_type);
+ bp_constraints_unlock(mtx);
return ret;
}
@@ -335,17 +638,16 @@ static void __release_bp_slot(struct perf_event *bp, u64 bp_type)
type = find_slot_idx(bp_type);
weight = hw_breakpoint_weight(bp);
- toggle_bp_slot(bp, false, type, weight);
+ WARN_ON(toggle_bp_slot(bp, false, type, weight));
}
void release_bp_slot(struct perf_event *bp)
{
- mutex_lock(&nr_bp_mutex);
+ struct mutex *mtx = bp_constraints_lock(bp);
arch_unregister_hw_breakpoint(bp);
__release_bp_slot(bp, bp->attr.bp_type);
-
- mutex_unlock(&nr_bp_mutex);
+ bp_constraints_unlock(mtx);
}
static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
@@ -372,11 +674,10 @@ static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
{
- int ret;
+ struct mutex *mtx = bp_constraints_lock(bp);
+ int ret = __modify_bp_slot(bp, old_type, new_type);
- mutex_lock(&nr_bp_mutex);
- ret = __modify_bp_slot(bp, old_type, new_type);
- mutex_unlock(&nr_bp_mutex);
+ bp_constraints_unlock(mtx);
return ret;
}
@@ -387,18 +688,28 @@ static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type)
*/
int dbg_reserve_bp_slot(struct perf_event *bp)
{
- if (mutex_is_locked(&nr_bp_mutex))
+ int ret;
+
+ if (bp_constraints_is_locked(bp))
return -1;
- return __reserve_bp_slot(bp, bp->attr.bp_type);
+ /* Locks aren't held; disable lockdep assert checking. */
+ lockdep_off();
+ ret = __reserve_bp_slot(bp, bp->attr.bp_type);
+ lockdep_on();
+
+ return ret;
}
int dbg_release_bp_slot(struct perf_event *bp)
{
- if (mutex_is_locked(&nr_bp_mutex))
+ if (bp_constraints_is_locked(bp))
return -1;
+ /* Locks aren't held; disable lockdep assert checking. */
+ lockdep_off();
__release_bp_slot(bp, bp->attr.bp_type);
+ lockdep_on();
return 0;
}
@@ -604,6 +915,50 @@ void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events)
}
EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
+/**
+ * hw_breakpoint_is_used - check if breakpoints are currently used
+ *
+ * Returns: true if breakpoints are used, false otherwise.
+ */
+bool hw_breakpoint_is_used(void)
+{
+ int cpu;
+
+ if (!constraints_initialized)
+ return false;
+
+ for_each_possible_cpu(cpu) {
+ for (int type = 0; type < TYPE_MAX; ++type) {
+ struct bp_cpuinfo *info = get_bp_info(cpu, type);
+
+ if (info->cpu_pinned)
+ return true;
+
+ for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) {
+ if (atomic_read(&info->tsk_pinned.count[slot]))
+ return true;
+ }
+ }
+ }
+
+ for (int type = 0; type < TYPE_MAX; ++type) {
+ for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) {
+ /*
+ * Warn, because if there are CPU pinned counters,
+ * should never get here; bp_cpuinfo::cpu_pinned should
+ * be consistent with the global cpu_pinned histogram.
+ */
+ if (WARN_ON(atomic_read(&cpu_pinned[type].count[slot])))
+ return true;
+
+ if (atomic_read(&tsk_pinned_all[type].count[slot]))
+ return true;
+ }
+ }
+
+ return false;
+}
+
static struct notifier_block hw_breakpoint_exceptions_nb = {
.notifier_call = hw_breakpoint_exceptions_notify,
/* we need to be notified first */
@@ -678,38 +1033,19 @@ static struct pmu perf_breakpoint = {
int __init init_hw_breakpoint(void)
{
- int cpu, err_cpu;
- int i;
-
- for (i = 0; i < TYPE_MAX; i++)
- nr_slots[i] = hw_breakpoint_slots(i);
+ int ret;
- for_each_possible_cpu(cpu) {
- for (i = 0; i < TYPE_MAX; i++) {
- struct bp_cpuinfo *info = get_bp_info(cpu, i);
+ ret = rhltable_init(&task_bps_ht, &task_bps_ht_params);
+ if (ret)
+ return ret;
- info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int),
- GFP_KERNEL);
- if (!info->tsk_pinned)
- goto err_alloc;
- }
- }
+ ret = init_breakpoint_slots();
+ if (ret)
+ return ret;
- constraints_initialized = 1;
+ constraints_initialized = true;
perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT);
return register_die_notifier(&hw_breakpoint_exceptions_nb);
-
- err_alloc:
- for_each_possible_cpu(err_cpu) {
- for (i = 0; i < TYPE_MAX; i++)
- kfree(get_bp_info(err_cpu, i)->tsk_pinned);
- if (err_cpu == cpu)
- break;
- }
-
- return -ENOMEM;
}
-
-
diff --git a/kernel/events/hw_breakpoint_test.c b/kernel/events/hw_breakpoint_test.c
new file mode 100644
index 000000000000..5ced822df788
--- /dev/null
+++ b/kernel/events/hw_breakpoint_test.c
@@ -0,0 +1,333 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KUnit test for hw_breakpoint constraints accounting logic.
+ *
+ * Copyright (C) 2022, Google LLC.
+ */
+
+#include <kunit/test.h>
+#include <linux/cpumask.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/kthread.h>
+#include <linux/perf_event.h>
+#include <asm/hw_breakpoint.h>
+
+#define TEST_REQUIRES_BP_SLOTS(test, slots) \
+ do { \
+ if ((slots) > get_test_bp_slots()) { \
+ kunit_skip((test), "Requires breakpoint slots: %d > %d", slots, \
+ get_test_bp_slots()); \
+ } \
+ } while (0)
+
+#define TEST_EXPECT_NOSPC(expr) KUNIT_EXPECT_EQ(test, -ENOSPC, PTR_ERR(expr))
+
+#define MAX_TEST_BREAKPOINTS 512
+
+static char break_vars[MAX_TEST_BREAKPOINTS];
+static struct perf_event *test_bps[MAX_TEST_BREAKPOINTS];
+static struct task_struct *__other_task;
+
+static struct perf_event *register_test_bp(int cpu, struct task_struct *tsk, int idx)
+{
+ struct perf_event_attr attr = {};
+
+ if (WARN_ON(idx < 0 || idx >= MAX_TEST_BREAKPOINTS))
+ return NULL;
+
+ hw_breakpoint_init(&attr);
+ attr.bp_addr = (unsigned long)&break_vars[idx];
+ attr.bp_len = HW_BREAKPOINT_LEN_1;
+ attr.bp_type = HW_BREAKPOINT_RW;
+ return perf_event_create_kernel_counter(&attr, cpu, tsk, NULL, NULL);
+}
+
+static void unregister_test_bp(struct perf_event **bp)
+{
+ if (WARN_ON(IS_ERR(*bp)))
+ return;
+ if (WARN_ON(!*bp))
+ return;
+ unregister_hw_breakpoint(*bp);
+ *bp = NULL;
+}
+
+static int get_test_bp_slots(void)
+{
+ static int slots;
+
+ if (!slots)
+ slots = hw_breakpoint_slots(TYPE_DATA);
+
+ return slots;
+}
+
+static void fill_one_bp_slot(struct kunit *test, int *id, int cpu, struct task_struct *tsk)
+{
+ struct perf_event *bp = register_test_bp(cpu, tsk, *id);
+
+ KUNIT_ASSERT_NOT_NULL(test, bp);
+ KUNIT_ASSERT_FALSE(test, IS_ERR(bp));
+ KUNIT_ASSERT_NULL(test, test_bps[*id]);
+ test_bps[(*id)++] = bp;
+}
+
+/*
+ * Fills up the given @cpu/@tsk with breakpoints, only leaving @skip slots free.
+ *
+ * Returns true if this can be called again, continuing at @id.
+ */
+static bool fill_bp_slots(struct kunit *test, int *id, int cpu, struct task_struct *tsk, int skip)
+{
+ for (int i = 0; i < get_test_bp_slots() - skip; ++i)
+ fill_one_bp_slot(test, id, cpu, tsk);
+
+ return *id + get_test_bp_slots() <= MAX_TEST_BREAKPOINTS;
+}
+
+static int dummy_kthread(void *arg)
+{
+ return 0;
+}
+
+static struct task_struct *get_other_task(struct kunit *test)
+{
+ struct task_struct *tsk;
+
+ if (__other_task)
+ return __other_task;
+
+ tsk = kthread_create(dummy_kthread, NULL, "hw_breakpoint_dummy_task");
+ KUNIT_ASSERT_FALSE(test, IS_ERR(tsk));
+ __other_task = tsk;
+ return __other_task;
+}
+
+static int get_test_cpu(int num)
+{
+ int cpu;
+
+ WARN_ON(num < 0);
+
+ for_each_online_cpu(cpu) {
+ if (num-- <= 0)
+ break;
+ }
+
+ return cpu;
+}
+
+/* ===== Test cases ===== */
+
+static void test_one_cpu(struct kunit *test)
+{
+ int idx = 0;
+
+ fill_bp_slots(test, &idx, get_test_cpu(0), NULL, 0);
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+}
+
+static void test_many_cpus(struct kunit *test)
+{
+ int idx = 0;
+ int cpu;
+
+ /* Test that CPUs are independent. */
+ for_each_online_cpu(cpu) {
+ bool do_continue = fill_bp_slots(test, &idx, cpu, NULL, 0);
+
+ TEST_EXPECT_NOSPC(register_test_bp(cpu, NULL, idx));
+ if (!do_continue)
+ break;
+ }
+}
+
+static void test_one_task_on_all_cpus(struct kunit *test)
+{
+ int idx = 0;
+
+ fill_bp_slots(test, &idx, -1, current, 0);
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ /* Remove one and adding back CPU-target should work. */
+ unregister_test_bp(&test_bps[0]);
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL);
+}
+
+static void test_two_tasks_on_all_cpus(struct kunit *test)
+{
+ int idx = 0;
+
+ /* Test that tasks are independent. */
+ fill_bp_slots(test, &idx, -1, current, 0);
+ fill_bp_slots(test, &idx, -1, get_other_task(test), 0);
+
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ /* Remove one from first task and adding back CPU-target should not work. */
+ unregister_test_bp(&test_bps[0]);
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+}
+
+static void test_one_task_on_one_cpu(struct kunit *test)
+{
+ int idx = 0;
+
+ fill_bp_slots(test, &idx, get_test_cpu(0), current, 0);
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ /*
+ * Remove one and adding back CPU-target should work; this case is
+ * special vs. above because the task's constraints are CPU-dependent.
+ */
+ unregister_test_bp(&test_bps[0]);
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL);
+}
+
+static void test_one_task_mixed(struct kunit *test)
+{
+ int idx = 0;
+
+ TEST_REQUIRES_BP_SLOTS(test, 3);
+
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), current);
+ fill_bp_slots(test, &idx, -1, current, 1);
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+
+ /* Transition from CPU-dependent pinned count to CPU-independent. */
+ unregister_test_bp(&test_bps[0]);
+ unregister_test_bp(&test_bps[1]);
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL);
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL);
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+}
+
+static void test_two_tasks_on_one_cpu(struct kunit *test)
+{
+ int idx = 0;
+
+ fill_bp_slots(test, &idx, get_test_cpu(0), current, 0);
+ fill_bp_slots(test, &idx, get_test_cpu(0), get_other_task(test), 0);
+
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ /* Can still create breakpoints on some other CPU. */
+ fill_bp_slots(test, &idx, get_test_cpu(1), NULL, 0);
+}
+
+static void test_two_tasks_on_one_all_cpus(struct kunit *test)
+{
+ int idx = 0;
+
+ fill_bp_slots(test, &idx, get_test_cpu(0), current, 0);
+ fill_bp_slots(test, &idx, -1, get_other_task(test), 0);
+
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ /* Cannot create breakpoints on some other CPU either. */
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx));
+}
+
+static void test_task_on_all_and_one_cpu(struct kunit *test)
+{
+ int tsk_on_cpu_idx, cpu_idx;
+ int idx = 0;
+
+ TEST_REQUIRES_BP_SLOTS(test, 3);
+
+ fill_bp_slots(test, &idx, -1, current, 2);
+ /* Transitioning from only all CPU breakpoints to mixed. */
+ tsk_on_cpu_idx = idx;
+ fill_one_bp_slot(test, &idx, get_test_cpu(0), current);
+ fill_one_bp_slot(test, &idx, -1, current);
+
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+
+ /* We should still be able to use up another CPU's slots. */
+ cpu_idx = idx;
+ fill_one_bp_slot(test, &idx, get_test_cpu(1), NULL);
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx));
+
+ /* Transitioning back to task target on all CPUs. */
+ unregister_test_bp(&test_bps[tsk_on_cpu_idx]);
+ /* Still have a CPU target breakpoint in get_test_cpu(1). */
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ /* Remove it and try again. */
+ unregister_test_bp(&test_bps[cpu_idx]);
+ fill_one_bp_slot(test, &idx, -1, current);
+
+ TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx));
+ TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx));
+}
+
+static struct kunit_case hw_breakpoint_test_cases[] = {
+ KUNIT_CASE(test_one_cpu),
+ KUNIT_CASE(test_many_cpus),
+ KUNIT_CASE(test_one_task_on_all_cpus),
+ KUNIT_CASE(test_two_tasks_on_all_cpus),
+ KUNIT_CASE(test_one_task_on_one_cpu),
+ KUNIT_CASE(test_one_task_mixed),
+ KUNIT_CASE(test_two_tasks_on_one_cpu),
+ KUNIT_CASE(test_two_tasks_on_one_all_cpus),
+ KUNIT_CASE(test_task_on_all_and_one_cpu),
+ {},
+};
+
+static int test_init(struct kunit *test)
+{
+ /* Most test cases want 2 distinct CPUs. */
+ if (num_online_cpus() < 2)
+ return -EINVAL;
+
+ /* Want the system to not use breakpoints elsewhere. */
+ if (hw_breakpoint_is_used())
+ return -EBUSY;
+
+ return 0;
+}
+
+static void test_exit(struct kunit *test)
+{
+ for (int i = 0; i < MAX_TEST_BREAKPOINTS; ++i) {
+ if (test_bps[i])
+ unregister_test_bp(&test_bps[i]);
+ }
+
+ if (__other_task) {
+ kthread_stop(__other_task);
+ __other_task = NULL;
+ }
+
+ /* Verify that internal state agrees that no breakpoints are in use. */
+ KUNIT_EXPECT_FALSE(test, hw_breakpoint_is_used());
+}
+
+static struct kunit_suite hw_breakpoint_test_suite = {
+ .name = "hw_breakpoint",
+ .test_cases = hw_breakpoint_test_cases,
+ .init = test_init,
+ .exit = test_exit,
+};
+
+kunit_test_suites(&hw_breakpoint_test_suite);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Marco Elver <elver@google.com>");
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 5fe4c5495ba3..185bd1c906b0 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -192,6 +192,12 @@ EXPORT_SYMBOL_GPL(__percpu_down_read);
__sum; \
})
+bool percpu_is_read_locked(struct percpu_rw_semaphore *sem)
+{
+ return per_cpu_sum(*sem->read_count) != 0 && !atomic_read(&sem->block);
+}
+EXPORT_SYMBOL_GPL(percpu_is_read_locked);
+
/*
* Return true if the modular sum of the sem->read_count per-CPU variable is
* zero. If this sum is zero, then it is stable due to the fact that if any
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 688552df95ca..49fb9ec8366d 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1706,6 +1706,9 @@ BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx,
if (unlikely(flags & ~BPF_F_GET_BRANCH_RECORDS_SIZE))
return -EINVAL;
+ if (unlikely(!(ctx->data->sample_flags & PERF_SAMPLE_BRANCH_STACK)))
+ return -ENOENT;
+
if (unlikely(!br_stack))
return -ENOENT;
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index a05cd9dc5f8b..405c8634d028 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2556,6 +2556,16 @@ config FORTIFY_KUNIT_TEST
by the str*() and mem*() family of functions. For testing runtime
traps of FORTIFY_SOURCE, see LKDTM's "FORTIFY_*" tests.
+config HW_BREAKPOINT_KUNIT_TEST
+ bool "Test hw_breakpoint constraints accounting" if !KUNIT_ALL_TESTS
+ depends on HAVE_HW_BREAKPOINT
+ depends on KUNIT=y
+ default KUNIT_ALL_TESTS
+ help
+ Tests for hw_breakpoint constraints accounting.
+
+ If unsure, say N.
+
config TEST_UDELAY
tristate "udelay test driver"
help