From 3aac580d5cc3001ca1627725b3b61edb529f341d Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:54 -0700 Subject: perf: Add sample_flags to indicate the PMU-filled sample data On some platforms, some data e.g., timestamps, can be retrieved from the PMU driver. Usually, the data from the PMU driver is more accurate. The current perf kernel should output the PMU-filled sample data if it's available. To check the availability of the PMU-filled sample data, the current perf kernel initializes the related fields in the perf_sample_data_init(). When outputting a sample, the perf checks whether the field is updated by the PMU driver. If yes, the updated value will be output. If not, the perf uses an SW way to calculate the value or just outputs the initialized value if an SW way is unavailable either. With more and more data being provided by the PMU driver, more fields has to be initialized in the perf_sample_data_init(). That will increase the number of cache lines touched in perf_sample_data_init() and be harmful to the performance. Add new "sample_flags" to indicate the PMU-filled sample data. The PMU driver should set the corresponding PERF_SAMPLE_ flag when the field is updated. The initialization of the corresponding field is not required anymore. The following patches will make use of it and remove the corresponding fields from the perf_sample_data_init(), which will further minimize the number of cache lines touched. Only clear the sample flags that have already been done by the PMU driver in the perf_prepare_sample() for the PERF_RECORD_SAMPLE. For the other PERF_RECORD_ event type, the sample data is not available. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-2-kan.liang@linux.intel.com --- kernel/events/core.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel/events/core.c') diff --git a/kernel/events/core.c b/kernel/events/core.c index 2621fd24ad26..c9b9cb79231a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6794,11 +6794,10 @@ out_put: static void __perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, - struct perf_event *event) + struct perf_event *event, + u64 sample_type) { - u64 sample_type = event->attr.sample_type; - - data->type = sample_type; + data->type = event->attr.sample_type; header->size += event->id_header_size; if (sample_type & PERF_SAMPLE_TID) { @@ -6827,7 +6826,7 @@ void perf_event_header__init_id(struct perf_event_header *header, struct perf_event *event) { if (event->attr.sample_id_all) - __perf_event_header__init_id(header, data, event); + __perf_event_header__init_id(header, data, event, event->attr.sample_type); } static void __perf_event__output_id_sample(struct perf_output_handle *handle, @@ -7303,6 +7302,7 @@ void perf_prepare_sample(struct perf_event_header *header, struct pt_regs *regs) { u64 sample_type = event->attr.sample_type; + u64 filtered_sample_type; header->type = PERF_RECORD_SAMPLE; header->size = sizeof(*header) + event->header_size; @@ -7310,7 +7310,12 @@ void perf_prepare_sample(struct perf_event_header *header, header->misc = 0; header->misc |= perf_misc_flags(regs); - __perf_event_header__init_id(header, data, event); + /* + * Clear the sample flags that have already been done by the + * PMU driver. + */ + filtered_sample_type = sample_type & ~data->sample_flags; + __perf_event_header__init_id(header, data, event, filtered_sample_type); if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE)) data->ip = perf_instruction_pointer(regs); -- cgit v1.2.3-59-g8ed1b From a9a931e2666878343782c82d7d55cc173ddeb3e9 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:56 -0700 Subject: perf: Use sample_flags for branch stack Use the new sample_flags to indicate whether the branch stack is filled by the PMU driver. Remove the br_stack from the perf_sample_data_init() to minimize the number of cache lines touched. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-4-kan.liang@linux.intel.com --- arch/powerpc/perf/core-book3s.c | 1 + arch/x86/events/amd/core.c | 4 +++- arch/x86/events/core.c | 4 +++- arch/x86/events/intel/core.c | 4 +++- arch/x86/events/intel/ds.c | 5 ++++- include/linux/perf_event.h | 4 ++-- kernel/events/core.c | 4 ++-- 7 files changed, 18 insertions(+), 8 deletions(-) (limited to 'kernel/events/core.c') diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 13919eb96931..1ad1efdb33f9 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2297,6 +2297,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val, cpuhw = this_cpu_ptr(&cpu_hw_events); power_pmu_bhrb_read(event, cpuhw); data.br_stack = &cpuhw->bhrb_stack; + data.sample_flags |= PERF_SAMPLE_BRANCH_STACK; } if (event->attr.sample_type & PERF_SAMPLE_DATA_SRC && diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 36bede1d7b1e..bd99d2ae14c3 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -929,8 +929,10 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) if (!x86_perf_event_set_period(event)) continue; - if (has_branch_stack(event)) + if (has_branch_stack(event)) { data.br_stack = &cpuc->lbr_stack; + data.sample_flags |= PERF_SAMPLE_BRANCH_STACK; + } if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index f969410d0c90..bb34a28fa71b 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1714,8 +1714,10 @@ int x86_pmu_handle_irq(struct pt_regs *regs) perf_sample_data_init(&data, 0, event->hw.last_period); - if (has_branch_stack(event)) + if (has_branch_stack(event)) { data.br_stack = &cpuc->lbr_stack; + data.sample_flags |= PERF_SAMPLE_BRANCH_STACK; + } if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 2db93498ff71..ba101c28dcc9 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2995,8 +2995,10 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) perf_sample_data_init(&data, 0, event->hw.last_period); - if (has_branch_stack(event)) + if (has_branch_stack(event)) { data.br_stack = &cpuc->lbr_stack; + data.sample_flags |= PERF_SAMPLE_BRANCH_STACK; + } if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index cdd857bd2dd6..0489f750baa0 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1640,8 +1640,10 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, data->sample_flags |= PERF_SAMPLE_TIME; } - if (has_branch_stack(event)) + if (has_branch_stack(event)) { data->br_stack = &cpuc->lbr_stack; + data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; + } } static void adaptive_pebs_save_regs(struct pt_regs *regs, @@ -1791,6 +1793,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, if (has_branch_stack(event)) { intel_pmu_store_pebs_lbrs(lbr); data->br_stack = &cpuc->lbr_stack; + data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; } } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0978165a2d87..1e12e79454e0 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1011,7 +1011,6 @@ struct perf_sample_data { u64 sample_flags; u64 addr; struct perf_raw_record *raw; - struct perf_branch_stack *br_stack; u64 period; union perf_sample_weight weight; u64 txn; @@ -1021,6 +1020,8 @@ struct perf_sample_data { * The other fields, optionally {set,used} by * perf_{prepare,output}_sample(). */ + struct perf_branch_stack *br_stack; + u64 type; u64 ip; struct { @@ -1061,7 +1062,6 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->sample_flags = 0; data->addr = addr; data->raw = NULL; - data->br_stack = NULL; data->period = period; data->weight.full = 0; data->data_src.val = PERF_MEM_NA; diff --git a/kernel/events/core.c b/kernel/events/core.c index c9b9cb79231a..104c0c9f4e6f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7052,7 +7052,7 @@ void perf_output_sample(struct perf_output_handle *handle, } if (sample_type & PERF_SAMPLE_BRANCH_STACK) { - if (data->br_stack) { + if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) { size_t size; size = data->br_stack->nr @@ -7358,7 +7358,7 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_BRANCH_STACK) { int size = sizeof(u64); /* nr */ - if (data->br_stack) { + if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) { if (perf_sample_save_hw_index(event)) size += sizeof(u64); -- cgit v1.2.3-59-g8ed1b From 2abe681da0a192ab850a5271d838a7817b469fca Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:57 -0700 Subject: perf: Use sample_flags for weight Use the new sample_flags to indicate whether the weight field is filled by the PMU driver. Remove the weight field from the perf_sample_data_init() to minimize the number of cache lines touched. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-5-kan.liang@linux.intel.com --- arch/powerpc/perf/core-book3s.c | 5 +++-- arch/x86/events/intel/ds.c | 10 +++++++--- include/linux/perf_event.h | 3 +-- kernel/events/core.c | 3 +++ 4 files changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel/events/core.c') diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 1ad1efdb33f9..a5c95a2006ea 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2305,9 +2305,10 @@ static void record_and_restart(struct perf_event *event, unsigned long val, ppmu->get_mem_data_src(&data.data_src, ppmu->flags, regs); if (event->attr.sample_type & PERF_SAMPLE_WEIGHT_TYPE && - ppmu->get_mem_weight) + ppmu->get_mem_weight) { ppmu->get_mem_weight(&data.weight.full, event->attr.sample_type); - + data.sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; + } if (perf_event_overflow(event, &data, regs)) power_pmu_stop(event, 0); } else if (period) { diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 0489f750baa0..4c51118e4add 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1527,8 +1527,10 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, /* * Use latency for weight (only avail with PEBS-LL) */ - if (fll && (sample_type & PERF_SAMPLE_WEIGHT_TYPE)) + if (fll && (sample_type & PERF_SAMPLE_WEIGHT_TYPE)) { data->weight.full = pebs->lat; + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; + } /* * data.data_src encodes the data source @@ -1620,9 +1622,10 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, if (x86_pmu.intel_cap.pebs_format >= 2) { /* Only set the TSX weight when no memory weight. */ - if ((sample_type & PERF_SAMPLE_WEIGHT_TYPE) && !fll) + if ((sample_type & PERF_SAMPLE_WEIGHT_TYPE) && !fll) { data->weight.full = intel_get_tsx_weight(pebs->tsx_tuning); - + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; + } if (sample_type & PERF_SAMPLE_TRANSACTION) data->txn = intel_get_tsx_transaction(pebs->tsx_tuning, pebs->ax); @@ -1764,6 +1767,7 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, data->weight.var1_dw = (u32)(weight & PEBS_LATENCY_MASK) ?: intel_get_tsx_weight(meminfo->tsx_tuning); } + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; } if (sample_type & PERF_SAMPLE_DATA_SRC) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1e12e79454e0..06a587b5faa9 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1012,7 +1012,6 @@ struct perf_sample_data { u64 addr; struct perf_raw_record *raw; u64 period; - union perf_sample_weight weight; u64 txn; union perf_mem_data_src data_src; @@ -1021,6 +1020,7 @@ struct perf_sample_data { * perf_{prepare,output}_sample(). */ struct perf_branch_stack *br_stack; + union perf_sample_weight weight; u64 type; u64 ip; @@ -1063,7 +1063,6 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->addr = addr; data->raw = NULL; data->period = period; - data->weight.full = 0; data->data_src.val = PERF_MEM_NA; data->txn = 0; } diff --git a/kernel/events/core.c b/kernel/events/core.c index 104c0c9f4e6f..f0af45db02b3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7408,6 +7408,9 @@ void perf_prepare_sample(struct perf_event_header *header, header->size += size; } + if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) + data->weight.full = 0; + if (sample_type & PERF_SAMPLE_REGS_INTR) { /* regs dump ABI info */ int size = sizeof(u64); -- cgit v1.2.3-59-g8ed1b From e16fd7f2cb1a65555cfe76f983eaefb1eab7471f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:58 -0700 Subject: perf: Use sample_flags for data_src Use the new sample_flags to indicate whether the data_src field is filled by the PMU driver. Remove the data_src field from the perf_sample_data_init() to minimize the number of cache lines touched. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-6-kan.liang@linux.intel.com --- arch/powerpc/perf/core-book3s.c | 4 +++- arch/x86/events/intel/ds.c | 8 ++++++-- include/linux/perf_event.h | 3 +-- kernel/events/core.c | 3 +++ 4 files changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel/events/core.c') diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index a5c95a2006ea..6ec7069e6482 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2301,8 +2301,10 @@ static void record_and_restart(struct perf_event *event, unsigned long val, } if (event->attr.sample_type & PERF_SAMPLE_DATA_SRC && - ppmu->get_mem_data_src) + ppmu->get_mem_data_src) { ppmu->get_mem_data_src(&data.data_src, ppmu->flags, regs); + data.sample_flags |= PERF_SAMPLE_DATA_SRC; + } if (event->attr.sample_type & PERF_SAMPLE_WEIGHT_TYPE && ppmu->get_mem_weight) { diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 4c51118e4add..bde73d492889 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1535,8 +1535,10 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, /* * data.data_src encodes the data source */ - if (sample_type & PERF_SAMPLE_DATA_SRC) + if (sample_type & PERF_SAMPLE_DATA_SRC) { data->data_src.val = get_data_src(event, pebs->dse); + data->sample_flags |= PERF_SAMPLE_DATA_SRC; + } /* * We must however always use iregs for the unwinder to stay sane; the @@ -1770,8 +1772,10 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; } - if (sample_type & PERF_SAMPLE_DATA_SRC) + if (sample_type & PERF_SAMPLE_DATA_SRC) { data->data_src.val = get_data_src(event, meminfo->aux); + data->sample_flags |= PERF_SAMPLE_DATA_SRC; + } if (sample_type & PERF_SAMPLE_ADDR_TYPE) data->addr = meminfo->address; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 06a587b5faa9..6849f10dfc7e 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1013,7 +1013,6 @@ struct perf_sample_data { struct perf_raw_record *raw; u64 period; u64 txn; - union perf_mem_data_src data_src; /* * The other fields, optionally {set,used} by @@ -1021,6 +1020,7 @@ struct perf_sample_data { */ struct perf_branch_stack *br_stack; union perf_sample_weight weight; + union perf_mem_data_src data_src; u64 type; u64 ip; @@ -1063,7 +1063,6 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->addr = addr; data->raw = NULL; data->period = period; - data->data_src.val = PERF_MEM_NA; data->txn = 0; } diff --git a/kernel/events/core.c b/kernel/events/core.c index f0af45db02b3..163e2f478e61 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7411,6 +7411,9 @@ void perf_prepare_sample(struct perf_event_header *header, if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) data->weight.full = 0; + if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) + data->data_src.val = PERF_MEM_NA; + if (sample_type & PERF_SAMPLE_REGS_INTR) { /* regs dump ABI info */ int size = sizeof(u64); -- cgit v1.2.3-59-g8ed1b From ee9db0e14b0575aa827579dc2471a29ec5fc6877 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Sep 2022 06:09:59 -0700 Subject: perf: Use sample_flags for txn Use the new sample_flags to indicate whether the txn field is filled by the PMU driver. Remove the txn field from the perf_sample_data_init() to minimize the number of cache lines touched. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220901130959.1285717-7-kan.liang@linux.intel.com --- arch/x86/events/intel/ds.c | 8 ++++++-- include/linux/perf_event.h | 3 +-- kernel/events/core.c | 3 +++ 3 files changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel/events/core.c') diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index bde73d492889..a5275c235c2a 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1628,9 +1628,11 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, data->weight.full = intel_get_tsx_weight(pebs->tsx_tuning); data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; } - if (sample_type & PERF_SAMPLE_TRANSACTION) + if (sample_type & PERF_SAMPLE_TRANSACTION) { data->txn = intel_get_tsx_transaction(pebs->tsx_tuning, pebs->ax); + data->sample_flags |= PERF_SAMPLE_TRANSACTION; + } } /* @@ -1780,9 +1782,11 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, if (sample_type & PERF_SAMPLE_ADDR_TYPE) data->addr = meminfo->address; - if (sample_type & PERF_SAMPLE_TRANSACTION) + if (sample_type & PERF_SAMPLE_TRANSACTION) { data->txn = intel_get_tsx_transaction(meminfo->tsx_tuning, gprs ? gprs->ax : 0); + data->sample_flags |= PERF_SAMPLE_TRANSACTION; + } } if (format_size & PEBS_DATACFG_XMMS) { diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 6849f10dfc7e..581880ddb9ef 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1012,7 +1012,6 @@ struct perf_sample_data { u64 addr; struct perf_raw_record *raw; u64 period; - u64 txn; /* * The other fields, optionally {set,used} by @@ -1021,6 +1020,7 @@ struct perf_sample_data { struct perf_branch_stack *br_stack; union perf_sample_weight weight; union perf_mem_data_src data_src; + u64 txn; u64 type; u64 ip; @@ -1063,7 +1063,6 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->addr = addr; data->raw = NULL; data->period = period; - data->txn = 0; } /* diff --git a/kernel/events/core.c b/kernel/events/core.c index 163e2f478e61..15d27b14c827 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7414,6 +7414,9 @@ void perf_prepare_sample(struct perf_event_header *header, if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) data->data_src.val = PERF_MEM_NA; + if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) + data->txn = 0; + if (sample_type & PERF_SAMPLE_REGS_INTR) { /* regs dump ABI info */ int size = sizeof(u64); -- cgit v1.2.3-59-g8ed1b From 03b02db93be407103c385814033633364674a6f6 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 6 Sep 2022 14:14:14 +0530 Subject: perf: Consolidate branch sample filter helpers Besides the branch type filtering requests, 'event.attr.branch_sample_type' also contains various flags indicating which additional information should be captured, along with the base branch record. These flags help configure the underlying hardware, and capture the branch records appropriately when required e.g after PMU interrupt. But first, this moves an existing helper perf_sample_save_hw_index() into the header before adding some more helpers for other branch sample filter flags. Signed-off-by: Anshuman Khandual Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220906084414.396220-1-anshuman.khandual@arm.com --- include/linux/perf_event.h | 26 ++++++++++++++++++++++++++ kernel/events/core.c | 9 ++------- 2 files changed, 28 insertions(+), 7 deletions(-) (limited to 'kernel/events/core.c') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 581880ddb9ef..a627528e5f3a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1685,4 +1685,30 @@ static inline void perf_lopwr_cb(bool mode) } #endif +#ifdef CONFIG_PERF_EVENTS +static inline bool branch_sample_no_flags(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_FLAGS; +} + +static inline bool branch_sample_no_cycles(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_NO_CYCLES; +} + +static inline bool branch_sample_type(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_TYPE_SAVE; +} + +static inline bool branch_sample_hw_index(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; +} + +static inline bool branch_sample_priv(const struct perf_event *event) +{ + return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_PRIV_SAVE; +} +#endif /* CONFIG_PERF_EVENTS */ #endif /* _LINUX_PERF_EVENT_H */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 15d27b14c827..00389d5f9241 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6966,11 +6966,6 @@ static void perf_output_read(struct perf_output_handle *handle, perf_output_read_one(handle, event, enabled, running); } -static inline bool perf_sample_save_hw_index(struct perf_event *event) -{ - return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; -} - void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, @@ -7059,7 +7054,7 @@ void perf_output_sample(struct perf_output_handle *handle, * sizeof(struct perf_branch_entry); perf_output_put(handle, data->br_stack->nr); - if (perf_sample_save_hw_index(event)) + if (branch_sample_hw_index(event)) perf_output_put(handle, data->br_stack->hw_idx); perf_output_copy(handle, data->br_stack->entries, size); } else { @@ -7359,7 +7354,7 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_BRANCH_STACK) { int size = sizeof(u64); /* nr */ if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) { - if (perf_sample_save_hw_index(event)) + if (branch_sample_hw_index(event)) size += sizeof(u64); size += data->br_stack->nr -- cgit v1.2.3-59-g8ed1b From f3c0eba287049237b23d1300376768293eb89e69 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 2 Sep 2022 18:48:55 +0200 Subject: perf: Add a few assertions While auditing 6b959ba22d34 ("perf/core: Fix reentry problem in perf_output_read_group()") a few spots were found that wanted assertions. Notable for_each_sibling_event() relies on exclusion from modification. This would normally be holding either ctx->lock or ctx->mutex, however due to how things are constructed disabling IRQs is a valid and sufficient substitute for ctx->lock. Another possible site to add assertions would be the various pmu::{add,del,read,..}() methods, but that's not trivially expressable in C -- the best option is wrappers, but those are easy enough to forget. Signed-off-by: Peter Zijlstra (Intel) --- include/linux/perf_event.h | 17 +++++++++++++++++ kernel/events/core.c | 2 ++ 2 files changed, 19 insertions(+) (limited to 'kernel/events/core.c') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f88cb31eaf75..368bdc4f563f 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -61,6 +61,7 @@ struct perf_guest_info_callbacks { #include #include #include +#include #include struct perf_callchain_entry { @@ -634,7 +635,23 @@ struct pmu_event_list { struct list_head list; }; +/* + * event->sibling_list is modified whole holding both ctx->lock and ctx->mutex + * as such iteration must hold either lock. However, since ctx->lock is an IRQ + * safe lock, and is only held by the CPU doing the modification, having IRQs + * disabled is sufficient since it will hold-off the IPIs. + */ +#ifdef CONFIG_PROVE_LOCKING +#define lockdep_assert_event_ctx(event) \ + WARN_ON_ONCE(__lockdep_enabled && \ + (this_cpu_read(hardirqs_enabled) || \ + lockdep_is_held(&(event)->ctx->mutex) != LOCK_STATE_HELD)) +#else +#define lockdep_assert_event_ctx(event) +#endif + #define for_each_sibling_event(sibling, event) \ + lockdep_assert_event_ctx(event); \ if ((event)->group_leader == (event)) \ list_for_each_entry((sibling), &(event)->sibling_list, sibling_list) diff --git a/kernel/events/core.c b/kernel/events/core.c index 00389d5f9241..3e90e454b995 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1468,6 +1468,8 @@ static void __update_context_time(struct perf_event_context *ctx, bool adv) { u64 now = perf_clock(); + lockdep_assert_held(&ctx->lock); + if (adv) ctx->time += now - ctx->timestamp; ctx->timestamp = now; -- cgit v1.2.3-59-g8ed1b From 3749d33e510c3dc695b3a5886b706310890d7ebd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 8 Sep 2022 14:41:02 -0700 Subject: perf: Use sample_flags for callchain So that it can call perf_callchain() only if needed. Historically it used __PERF_SAMPLE_CALLCHAIN_EARLY but we can do that with sample_flags in the struct perf_sample_data. Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220908214104.3851807-1-namhyung@kernel.org --- arch/x86/events/amd/ibs.c | 4 +++- arch/x86/events/intel/ds.c | 8 ++++++-- kernel/events/core.c | 2 +- 3 files changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel/events/core.c') diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index c251bc44c088..dab094166693 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -798,8 +798,10 @@ fail: * recorded as part of interrupt regs. Thus we need to use rip from * interrupt regs while unwinding call stack. */ - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { data.callchain = perf_callchain(event, iregs); + data.sample_flags |= PERF_SAMPLE_CALLCHAIN; + } throttle = perf_event_overflow(event, &data, ®s); out: diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index a5275c235c2a..4ba6ab6d0d92 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1546,8 +1546,10 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, * previous PMI context or an (I)RET happened between the record and * PMI. */ - if (sample_type & PERF_SAMPLE_CALLCHAIN) + if (sample_type & PERF_SAMPLE_CALLCHAIN) { data->callchain = perf_callchain(event, iregs); + data->sample_flags |= PERF_SAMPLE_CALLCHAIN; + } /* * We use the interrupt regs as a base because the PEBS record does not @@ -1719,8 +1721,10 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, * previous PMI context or an (I)RET happened between the record and * PMI. */ - if (sample_type & PERF_SAMPLE_CALLCHAIN) + if (sample_type & PERF_SAMPLE_CALLCHAIN) { data->callchain = perf_callchain(event, iregs); + data->sample_flags |= PERF_SAMPLE_CALLCHAIN; + } *regs = *iregs; /* The ip in basic is EventingIP */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 3e90e454b995..c98ecf3e09ba 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7320,7 +7320,7 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_CALLCHAIN) { int size = 1; - if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN) data->callchain = perf_callchain(event, regs); size += data->callchain->nr; -- cgit v1.2.3-59-g8ed1b From 16817ad7e8b31728b44ff9f17d8d894ed8a450d0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 8 Sep 2022 14:41:03 -0700 Subject: perf/bpf: Always use perf callchains if exist If the perf_event has PERF_SAMPLE_CALLCHAIN, BPF can use it for stack trace. The problematic cases like PEBS and IBS already handled in the PMU driver and they filled the callchain info in the sample data. For others, we can call perf_callchain() before the BPF handler. Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220908214104.3851807-2-namhyung@kernel.org --- kernel/bpf/stackmap.c | 4 ++-- kernel/events/core.c | 12 ++++++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel/events/core.c') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 1adbe67cdb95..aecea7451b61 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -338,7 +338,7 @@ BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx, int ret; /* perf_sample_data doesn't have callchain, use bpf_get_stackid */ - if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) return bpf_get_stackid((unsigned long)(ctx->regs), (unsigned long) map, flags, 0, 0); @@ -506,7 +506,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, int err = -EINVAL; __u64 nr_kernel; - if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) return __bpf_get_stack(regs, NULL, NULL, buf, size, flags); if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | diff --git a/kernel/events/core.c b/kernel/events/core.c index c98ecf3e09ba..7da5515082f9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10000,8 +10000,16 @@ static void bpf_overflow_handler(struct perf_event *event, goto out; rcu_read_lock(); prog = READ_ONCE(event->prog); - if (prog) + if (prog) { + if (prog->call_get_stack && + (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) && + !(data->sample_flags & PERF_SAMPLE_CALLCHAIN)) { + data->callchain = perf_callchain(event, regs); + data->sample_flags |= PERF_SAMPLE_CALLCHAIN; + } + ret = bpf_prog_run(prog, &ctx); + } rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); @@ -10027,7 +10035,7 @@ static int perf_event_set_bpf_handler(struct perf_event *event, if (event->attr.precise_ip && prog->call_get_stack && - (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) || + (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) || event->attr.exclude_callchain_kernel || event->attr.exclude_callchain_user)) { /* -- cgit v1.2.3-59-g8ed1b From dca6344d7a77dd0501a73745f4a9fb1ee2bc9d7c Mon Sep 17 00:00:00 2001 From: Jules Irenge Date: Sun, 18 Sep 2022 00:41:08 +0100 Subject: perf/core: Convert snprintf() to scnprintf() Coccinelle reports a warning: WARNING: use scnprintf or sprintf This LWN article explains the rationale for this change: https: //lwn.net/Articles/69419/ Ie. snprintf() returns what *would* be the resulting length, while scnprintf() returns the actual length. Adding to that, there has also been some slow migration from snprintf to scnprintf, here's the shift in usage in the past 3.5 years, in all fs/ files: v5.0 v6.0-rc6 -------------------------------------- snprintf() uses: 63 213 scnprintf() uses: 374 186 No intended change in behavior. [ mingo: Improved the changelog & reviewed the usage sites. ] Signed-off-by: Jules Irenge Signed-off-by: Ingo Molnar --- kernel/events/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/events/core.c') diff --git a/kernel/events/core.c b/kernel/events/core.c index 7da5515082f9..c07e9a3ea94c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10952,7 +10952,7 @@ static ssize_t nr_addr_filters_show(struct device *dev, { struct pmu *pmu = dev_get_drvdata(dev); - return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); + return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); } DEVICE_ATTR_RO(nr_addr_filters); @@ -10963,7 +10963,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page) { struct pmu *pmu = dev_get_drvdata(dev); - return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); + return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->type); } static DEVICE_ATTR_RO(type); @@ -10974,7 +10974,7 @@ perf_event_mux_interval_ms_show(struct device *dev, { struct pmu *pmu = dev_get_drvdata(dev); - return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms); + return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->hrtimer_interval_ms); } static DEFINE_MUTEX(mux_interval_mutex); -- cgit v1.2.3-59-g8ed1b From 7b084630153152239d84990ac4540c2dd360186f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 21 Sep 2022 15:00:31 -0700 Subject: perf: Use sample_flags for addr Use the new sample_flags to indicate whether the addr field is filled by the PMU driver. As most PMU drivers pass 0, it can set the flag only if it has a non-zero value. And use 0 in perf_sample_output() if it's not filled already. Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220921220032.2858517-1-namhyung@kernel.org --- arch/x86/events/intel/ds.c | 8 ++++++-- include/linux/perf_event.h | 8 ++++++-- kernel/events/core.c | 5 +++++ 3 files changed, 17 insertions(+), 4 deletions(-) (limited to 'kernel/events/core.c') diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 4ba6ab6d0d92..d2e9ff16f6ed 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1621,8 +1621,10 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, if ((sample_type & PERF_SAMPLE_ADDR_TYPE) && - x86_pmu.intel_cap.pebs_format >= 1) + x86_pmu.intel_cap.pebs_format >= 1) { data->addr = pebs->dla; + data->sample_flags |= PERF_SAMPLE_ADDR; + } if (x86_pmu.intel_cap.pebs_format >= 2) { /* Only set the TSX weight when no memory weight. */ @@ -1783,8 +1785,10 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, data->sample_flags |= PERF_SAMPLE_DATA_SRC; } - if (sample_type & PERF_SAMPLE_ADDR_TYPE) + if (sample_type & PERF_SAMPLE_ADDR_TYPE) { data->addr = meminfo->address; + data->sample_flags |= PERF_SAMPLE_ADDR; + } if (sample_type & PERF_SAMPLE_TRANSACTION) { data->txn = intel_get_tsx_transaction(meminfo->tsx_tuning, diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 368bdc4f563f..f4a13579b0e8 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1028,7 +1028,6 @@ struct perf_sample_data { * minimize the cachelines touched. */ u64 sample_flags; - u64 addr; struct perf_raw_record *raw; u64 period; @@ -1040,6 +1039,7 @@ struct perf_sample_data { union perf_sample_weight weight; union perf_mem_data_src data_src; u64 txn; + u64 addr; u64 type; u64 ip; @@ -1079,9 +1079,13 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, { /* remaining struct members initialized in perf_prepare_sample() */ data->sample_flags = 0; - data->addr = addr; data->raw = NULL; data->period = period; + + if (addr) { + data->addr = addr; + data->sample_flags |= PERF_SAMPLE_ADDR; + } } /* diff --git a/kernel/events/core.c b/kernel/events/core.c index c07e9a3ea94c..a91f74db9fe9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7414,6 +7414,11 @@ void perf_prepare_sample(struct perf_event_header *header, if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) data->txn = 0; + if (sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_DATA_PAGE_SIZE)) { + if (filtered_sample_type & PERF_SAMPLE_ADDR) + data->addr = 0; + } + if (sample_type & PERF_SAMPLE_REGS_INTR) { /* regs dump ABI info */ int size = sizeof(u64); -- cgit v1.2.3-59-g8ed1b From 838d9bb62d132ec3baf1b5aba2e95ef9a7a9a3cd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 21 Sep 2022 15:00:32 -0700 Subject: perf: Use sample_flags for raw_data Use the new sample_flags to indicate whether the raw data field is filled by the PMU driver. Although it could check with the NULL, follow the same rule with other fields. Remove the raw field from the perf_sample_data_init() to minimize the number of cache lines touched. Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220921220032.2858517-2-namhyung@kernel.org --- arch/s390/kernel/perf_cpum_cf.c | 1 + arch/s390/kernel/perf_pai_crypto.c | 1 + arch/x86/events/amd/ibs.c | 1 + include/linux/perf_event.h | 5 ++--- kernel/events/core.c | 3 ++- 5 files changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel/events/core.c') diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index f7dd3c849e68..f043a7ff220b 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -664,6 +664,7 @@ static int cfdiag_push_sample(struct perf_event *event, raw.frag.data = cpuhw->stop; raw.size = raw.frag.size; data.raw = &raw; + data.sample_flags |= PERF_SAMPLE_RAW; } overflow = perf_event_overflow(event, &data, ®s); diff --git a/arch/s390/kernel/perf_pai_crypto.c b/arch/s390/kernel/perf_pai_crypto.c index b38b4ae01589..6826e2a69a21 100644 --- a/arch/s390/kernel/perf_pai_crypto.c +++ b/arch/s390/kernel/perf_pai_crypto.c @@ -366,6 +366,7 @@ static int paicrypt_push_sample(void) raw.frag.data = cpump->save; raw.size = raw.frag.size; data.raw = &raw; + data.sample_flags |= PERF_SAMPLE_RAW; } overflow = perf_event_overflow(event, &data, ®s); diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index ce5720bfb350..c29a006954c7 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -781,6 +781,7 @@ fail: }, }; data.raw = &raw; + data.sample_flags |= PERF_SAMPLE_RAW; } /* diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f4a13579b0e8..e9b151cde491 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1028,7 +1028,6 @@ struct perf_sample_data { * minimize the cachelines touched. */ u64 sample_flags; - struct perf_raw_record *raw; u64 period; /* @@ -1040,6 +1039,7 @@ struct perf_sample_data { union perf_mem_data_src data_src; u64 txn; u64 addr; + struct perf_raw_record *raw; u64 type; u64 ip; @@ -1078,8 +1078,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr, u64 period) { /* remaining struct members initialized in perf_prepare_sample() */ - data->sample_flags = 0; - data->raw = NULL; + data->sample_flags = PERF_SAMPLE_PERIOD; data->period = period; if (addr) { diff --git a/kernel/events/core.c b/kernel/events/core.c index a91f74db9fe9..04e19a857d4b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7332,7 +7332,7 @@ void perf_prepare_sample(struct perf_event_header *header, struct perf_raw_record *raw = data->raw; int size; - if (raw) { + if (raw && (data->sample_flags & PERF_SAMPLE_RAW)) { struct perf_raw_frag *frag = &raw->frag; u32 sum = 0; @@ -7348,6 +7348,7 @@ void perf_prepare_sample(struct perf_event_header *header, frag->pad = raw->size - sum; } else { size = sizeof(u64); + data->raw = NULL; } header->size += size; -- cgit v1.2.3-59-g8ed1b From 4674ffe2fcad45a9b164401cc0794115702326cf Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 27 Sep 2022 19:20:25 +0200 Subject: perf, hw_breakpoint: Fix use-after-free if perf_event_open() fails Local testing revealed that we can trigger a use-after-free during rhashtable lookup as follows: | BUG: KASAN: use-after-free in memcmp lib/string.c:757 | Read of size 8 at addr ffff888107544dc0 by task perf-rhltable-n/1293 | | CPU: 0 PID: 1293 Comm: perf-rhltable-n Not tainted 6.0.0-rc3-00014-g85260862789c #46 | Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-debian-1.16.0-4 04/01/2014 | Call Trace: | | memcmp lib/string.c:757 | rhashtable_compare include/linux/rhashtable.h:577 [inline] | __rhashtable_lookup include/linux/rhashtable.h:602 [inline] | rhltable_lookup include/linux/rhashtable.h:688 [inline] | task_bp_pinned kernel/events/hw_breakpoint.c:324 | toggle_bp_slot kernel/events/hw_breakpoint.c:462 | __release_bp_slot kernel/events/hw_breakpoint.c:631 [inline] | release_bp_slot kernel/events/hw_breakpoint.c:639 | register_perf_hw_breakpoint kernel/events/hw_breakpoint.c:742 | hw_breakpoint_event_init kernel/events/hw_breakpoint.c:976 | perf_try_init_event kernel/events/core.c:11261 | perf_init_event kernel/events/core.c:11325 [inline] | perf_event_alloc kernel/events/core.c:11619 | __do_sys_perf_event_open kernel/events/core.c:12157 | do_syscall_x64 arch/x86/entry/common.c:50 [inline] | do_syscall_64 arch/x86/entry/common.c:80 | entry_SYSCALL_64_after_hwframe | | | Allocated by task 1292: | perf_event_alloc kernel/events/core.c:11505 | __do_sys_perf_event_open kernel/events/core.c:12157 | do_syscall_x64 arch/x86/entry/common.c:50 [inline] | do_syscall_64 arch/x86/entry/common.c:80 | entry_SYSCALL_64_after_hwframe | | Freed by task 1292: | perf_event_alloc kernel/events/core.c:11716 | __do_sys_perf_event_open kernel/events/core.c:12157 | do_syscall_x64 arch/x86/entry/common.c:50 [inline] | do_syscall_64 arch/x86/entry/common.c:80 | entry_SYSCALL_64_after_hwframe | | The buggy address belongs to the object at ffff888107544c00 | which belongs to the cache perf_event of size 1352 | The buggy address is located 448 bytes inside of | 1352-byte region [ffff888107544c00, ffff888107545148) This happens because the first perf_event_open() managed to reserve a HW breakpoint slot, however, later fails for other reasons and returns. The second perf_event_open() runs concurrently, and during rhltable_lookup() looks up an entry which is being freed: since rhltable_lookup() may run concurrently (under the RCU read lock) with rhltable_remove(), we may end up with a stale entry, for which memory may also have already been freed when being accessed. To fix, only free the failed perf_event after an RCU grace period. This allows subsystems that store references to an event to always access it concurrently under the RCU read lock, even if initialization will fail. Given failure is unlikely and a slow-path, turning the immediate free into a call_rcu()-wrapped free does not affect performance elsewhere. Fixes: 0370dc314df3 ("perf/hw_breakpoint: Optimize list of per-task breakpoints") Reported-by: syzkaller Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220927172025.1636995-1-elver@google.com --- kernel/events/core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel/events/core.c') diff --git a/kernel/events/core.c b/kernel/events/core.c index 04e19a857d4b..e1ffdb861b53 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11734,11 +11734,9 @@ err_pmu: event->destroy(event); module_put(pmu->module); err_ns: - if (event->ns) - put_pid_ns(event->ns); if (event->hw.target) put_task_struct(event->hw.target); - kmem_cache_free(perf_event_cache, event); + call_rcu(&event->rcu_head, free_event_rcu); return ERR_PTR(err); } -- cgit v1.2.3-59-g8ed1b From 5b26af6d2b7854639ddf893366bbca7e74fa7c54 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 28 Sep 2022 15:27:56 +0530 Subject: perf/x86/amd: Support PERF_SAMPLE_PHY_ADDR IBS_DC_PHYSADDR provides the physical data address for the tagged load/ store operation. Populate perf sample physical address using it. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20220928095805.596-7-ravi.bangoria@amd.com --- arch/x86/events/amd/ibs.c | 8 +++++++- kernel/events/core.c | 3 ++- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel/events/core.c') diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 0ad49105c154..3271735f0070 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -989,6 +989,11 @@ static void perf_ibs_parse_ld_st_data(__u64 sample_type, data->addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCLINAD)]; data->sample_flags |= PERF_SAMPLE_ADDR; } + + if (sample_type & PERF_SAMPLE_PHYS_ADDR && op_data3.dc_phy_addr_valid) { + data->phys_addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCPHYSAD)]; + data->sample_flags |= PERF_SAMPLE_PHYS_ADDR; + } } static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type, @@ -998,7 +1003,8 @@ static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type, (perf_ibs == &perf_ibs_op && (sample_type & PERF_SAMPLE_DATA_SRC || sample_type & PERF_SAMPLE_WEIGHT_TYPE || - sample_type & PERF_SAMPLE_ADDR))) + sample_type & PERF_SAMPLE_ADDR || + sample_type & PERF_SAMPLE_PHYS_ADDR))) return perf_ibs->offset_max; else if (check_rip) return 3; diff --git a/kernel/events/core.c b/kernel/events/core.c index e1ffdb861b53..49bc3b5e6c8a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7435,7 +7435,8 @@ void perf_prepare_sample(struct perf_event_header *header, header->size += size; } - if (sample_type & PERF_SAMPLE_PHYS_ADDR) + if (sample_type & PERF_SAMPLE_PHYS_ADDR && + filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) data->phys_addr = perf_virt_to_phys(data->addr); #ifdef CONFIG_CGROUP_PERF -- cgit v1.2.3-59-g8ed1b From 7be51cc1c68dfa180ef84e71bcb4204237bb5620 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 4 Oct 2022 11:03:47 +0200 Subject: perf: Fix pmu_filter_match() Mark reported that the new for_each_sibling_event() assertion triggers in pmu_filter_match() -- which isn't always called with IRQs disabled or ctx->mutex held. Fixes: f3c0eba28704 ("perf: Add a few assertions") Reported-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/YvvJq2f/7eFVcnNy@FVFF77S0Q05N --- kernel/events/core.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel/events/core.c') diff --git a/kernel/events/core.c b/kernel/events/core.c index 49bc3b5e6c8a..b981b879bcd8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2226,16 +2226,22 @@ static inline int __pmu_filter_match(struct perf_event *event) static inline int pmu_filter_match(struct perf_event *event) { struct perf_event *sibling; + unsigned long flags; + int ret = 1; if (!__pmu_filter_match(event)) return 0; + local_irq_save(flags); for_each_sibling_event(sibling, event) { - if (!__pmu_filter_match(sibling)) - return 0; + if (!__pmu_filter_match(sibling)) { + ret = 0; + break; + } } + local_irq_restore(flags); - return 1; + return ret; } static inline int -- cgit v1.2.3-59-g8ed1b