From 9483409ab5067941860754e78a4a44a60311d276 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 15 Mar 2021 12:34:36 +0900 Subject: perf core: Allocate perf_buffer in the target node memory I found the ring buffer pages are allocated in the node but the ring buffer itself is not. Let's convert it to use kzalloc_node() too. Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210315033436.682438-1-namhyung@kernel.org --- kernel/events/ring_buffer.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index ef91ae75ca56..bd55ccc91373 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -804,7 +804,7 @@ struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) { struct perf_buffer *rb; unsigned long size; - int i; + int i, node; size = sizeof(struct perf_buffer); size += nr_pages * sizeof(void *); @@ -812,7 +812,8 @@ struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) if (order_base_2(size) >= PAGE_SHIFT+MAX_ORDER) goto fail; - rb = kzalloc(size, GFP_KERNEL); + node = (cpu == -1) ? cpu : cpu_to_node(cpu); + rb = kzalloc_node(size, GFP_KERNEL, node); if (!rb) goto fail; @@ -906,11 +907,13 @@ struct perf_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) struct perf_buffer *rb; unsigned long size; void *all_buf; + int node; size = sizeof(struct perf_buffer); size += sizeof(void *); - rb = kzalloc(size, GFP_KERNEL); + node = (cpu == -1) ? cpu : cpu_to_node(cpu); + rb = kzalloc_node(size, GFP_KERNEL, node); if (!rb) goto fail; -- cgit v1.2.3-59-g8ed1b From bdacfaf26da166dd56c62f23f27a4b3e71f2d89e Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 11 Mar 2021 20:54:12 +0900 Subject: perf core: Add a kmem_cache for struct perf_event The kernel can allocate a lot of struct perf_event when profiling. For example, 256 cpu x 8 events x 20 cgroups = 40K instances of the struct would be allocated on a large system. The size of struct perf_event in my setup is 1152 byte. As it's allocated by kmalloc, the actual allocation size would be rounded up to 2K. Then there's 896 byte (~43%) of waste per instance resulting in total ~35MB with 40K instances. We can create a dedicated kmem_cache to avoid such a big unnecessary memory consumption. With this change, I can see below (note this machine has 112 cpus). # grep perf_event /proc/slabinfo perf_event 224 784 1152 7 2 : tunables 24 12 8 : slabdata 112 112 0 The sixth column is pages-per-slab which is 2, and the fifth column is obj-per-slab which is 7. Thus actually it can use 1152 x 7 = 8064 byte in the 8K, and wasted memory is (8192 - 8064) / 7 = ~18 byte per instance. Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210311115413.444407-1-namhyung@kernel.org --- kernel/events/core.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 03db40f6cba9..f526ddb50d5e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -405,6 +405,7 @@ static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); static struct srcu_struct pmus_srcu; static cpumask_var_t perf_online_mask; +static struct kmem_cache *perf_event_cache; /* * perf event paranoia level: @@ -4611,7 +4612,7 @@ static void free_event_rcu(struct rcu_head *head) if (event->ns) put_pid_ns(event->ns); perf_event_free_filter(event); - kfree(event); + kmem_cache_free(perf_event_cache, event); } static void ring_buffer_attach(struct perf_event *event, @@ -11293,7 +11294,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, return ERR_PTR(-EINVAL); } - event = kzalloc(sizeof(*event), GFP_KERNEL); + event = kmem_cache_zalloc(perf_event_cache, GFP_KERNEL); if (!event) return ERR_PTR(-ENOMEM); @@ -11497,7 +11498,7 @@ err_ns: put_pid_ns(event->ns); if (event->hw.target) put_task_struct(event->hw.target); - kfree(event); + kmem_cache_free(perf_event_cache, event); return ERR_PTR(err); } @@ -13130,6 +13131,8 @@ void __init perf_event_init(void) ret = init_hw_breakpoint(); WARN(ret, "hw_breakpoint initialization failed with: %d", ret); + perf_event_cache = KMEM_CACHE(perf_event, SLAB_PANIC); + /* * Build time assertion that we keep the data_head at the intended * location. IOW, validation we got the __reserved[] size right. -- cgit v1.2.3-59-g8ed1b From ff65338e78418e5970a7aabbabb94c46f2bb821d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 11 Mar 2021 20:54:13 +0900 Subject: perf core: Allocate perf_event in the target node memory For cpu events, it'd better allocating them in the corresponding node memory as they would be mostly accessed by the target cpu. Although perf tools sets the cpu affinity before calling perf_event_open, there are places it doesn't (notably perf record) and we should consider other external users too. Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210311115413.444407-2-namhyung@kernel.org --- kernel/events/core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index f526ddb50d5e..6182cb199f72 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11288,13 +11288,16 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, struct perf_event *event; struct hw_perf_event *hwc; long err = -EINVAL; + int node; if ((unsigned)cpu >= nr_cpu_ids) { if (!task || cpu != -1) return ERR_PTR(-EINVAL); } - event = kmem_cache_zalloc(perf_event_cache, GFP_KERNEL); + node = (cpu >= 0) ? cpu_to_node(cpu) : -1; + event = kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO, + node); if (!event) return ERR_PTR(-ENOMEM); -- cgit v1.2.3-59-g8ed1b From 08ef1af4de5fe7de9c6d69f1e22e51b66e385d9b Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Wed, 24 Feb 2021 22:56:28 +0100 Subject: perf/core: Fix unconditional security_locked_down() call Currently, the lockdown state is queried unconditionally, even though its result is used only if the PERF_SAMPLE_REGS_INTR bit is set in attr.sample_type. While that doesn't matter in case of the Lockdown LSM, it causes trouble with the SELinux's lockdown hook implementation. SELinux implements the locked_down hook with a check whether the current task's type has the corresponding "lockdown" class permission ("integrity" or "confidentiality") allowed in the policy. This means that calling the hook when the access control decision would be ignored generates a bogus permission check and audit record. Fix this by checking sample_type first and only calling the hook when its result would be honored. Fixes: b0c8fdc7fdb7 ("lockdown: Lock down perf when in confidentiality mode") Signed-off-by: Ondrej Mosnacek Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Paul Moore Link: https://lkml.kernel.org/r/20210224215628.192519-1-omosnace@redhat.com --- kernel/events/core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 6182cb199f72..f07943183041 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11833,12 +11833,12 @@ SYSCALL_DEFINE5(perf_event_open, return err; } - err = security_locked_down(LOCKDOWN_PERF); - if (err && (attr.sample_type & PERF_SAMPLE_REGS_INTR)) - /* REGS_INTR can leak data, lockdown must prevent this */ - return err; - - err = 0; + /* REGS_INTR can leak data, lockdown must prevent this */ + if (attr.sample_type & PERF_SAMPLE_REGS_INTR) { + err = security_locked_down(LOCKDOWN_PERF); + if (err) + return err; + } /* * In cgroup mode, the pid argument is used to pass the fd -- cgit v1.2.3-59-g8ed1b From d68e6799a5c87f415d3bfa0dea49caee28ab00d1 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 14 Apr 2021 18:49:54 +0300 Subject: perf: Cap allocation order at aux_watermark Currently, we start allocating AUX pages half the size of the total requested AUX buffer size, ignoring the attr.aux_watermark setting. This, in turn, makes intel_pt driver disregard the watermark also, as it uses page order for its SG (ToPA) configuration. Now, this can be fixed in the intel_pt PMU driver, but seeing as it's the only one currently making use of high order allocations, there is no reason not to fix the allocator instead. This way, any other driver wishing to add this support would not have to worry about this. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210414154955.49603-2-alexander.shishkin@linux.intel.com --- kernel/events/ring_buffer.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index bd55ccc91373..52868716ec35 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -674,21 +674,26 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event, if (!has_aux(event)) return -EOPNOTSUPP; - /* - * We need to start with the max_order that fits in nr_pages, - * not the other way around, hence ilog2() and not get_order. - */ - max_order = ilog2(nr_pages); - - /* - * PMU requests more than one contiguous chunks of memory - * for SW double buffering - */ if (!overwrite) { - if (!max_order) - return -EINVAL; + /* + * Watermark defaults to half the buffer, and so does the + * max_order, to aid PMU drivers in double buffering. + */ + if (!watermark) + watermark = nr_pages << (PAGE_SHIFT - 1); - max_order--; + /* + * Use aux_watermark as the basis for chunking to + * help PMU drivers honor the watermark. + */ + max_order = get_order(watermark); + } else { + /* + * We need to start with the max_order that fits in nr_pages, + * not the other way around, hence ilog2() and not get_order. + */ + max_order = ilog2(nr_pages); + watermark = 0; } rb->aux_pages = kcalloc_node(nr_pages, sizeof(void *), GFP_KERNEL, @@ -743,9 +748,6 @@ int rb_alloc_aux(struct perf_buffer *rb, struct perf_event *event, rb->aux_overwrite = overwrite; rb->aux_watermark = watermark; - if (!rb->aux_watermark && !rb->aux_overwrite) - rb->aux_watermark = nr_pages << (PAGE_SHIFT - 1); - out: if (!ret) rb->aux_pgoff = pgoff; -- cgit v1.2.3-59-g8ed1b From ef54c1a476aef7eef26fe13ea10dc090952c00f8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 8 Apr 2021 12:35:56 +0200 Subject: perf: Rework perf_event_exit_event() Make perf_event_exit_event() more robust, such that we can use it from other contexts. Specifically the up and coming remove_on_exec. For this to work we need to address a few issues. Remove_on_exec will not destroy the entire context, so we cannot rely on TASK_TOMBSTONE to disable event_function_call() and we thus have to use perf_remove_from_context(). When using perf_remove_from_context(), there's two races to consider. The first is against close(), where we can have concurrent tear-down of the event. The second is against child_list iteration, which should not find a half baked event. To address this, teach perf_remove_from_context() to special case !ctx->is_active and about DETACH_CHILD. [ elver@google.com: fix racing parent/child exit in sync_child_event(). ] Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210408103605.1676875-2-elver@google.com --- include/linux/perf_event.h | 1 + kernel/events/core.c | 142 +++++++++++++++++++++++++-------------------- 2 files changed, 80 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 3f7f89ea5e51..3d478abf411c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -607,6 +607,7 @@ struct swevent_hlist { #define PERF_ATTACH_TASK_DATA 0x08 #define PERF_ATTACH_ITRACE 0x10 #define PERF_ATTACH_SCHED_CB 0x20 +#define PERF_ATTACH_CHILD 0x40 struct perf_cgroup; struct perf_buffer; diff --git a/kernel/events/core.c b/kernel/events/core.c index f07943183041..318ff7b021b4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2205,6 +2205,26 @@ out: perf_event__header_size(leader); } +static void sync_child_event(struct perf_event *child_event); + +static void perf_child_detach(struct perf_event *event) +{ + struct perf_event *parent_event = event->parent; + + if (!(event->attach_state & PERF_ATTACH_CHILD)) + return; + + event->attach_state &= ~PERF_ATTACH_CHILD; + + if (WARN_ON_ONCE(!parent_event)) + return; + + lockdep_assert_held(&parent_event->child_mutex); + + sync_child_event(event); + list_del_init(&event->child_list); +} + static bool is_orphaned_event(struct perf_event *event) { return event->state == PERF_EVENT_STATE_DEAD; @@ -2312,6 +2332,7 @@ group_sched_out(struct perf_event *group_event, } #define DETACH_GROUP 0x01UL +#define DETACH_CHILD 0x02UL /* * Cross CPU call to remove a performance event @@ -2335,6 +2356,8 @@ __perf_remove_from_context(struct perf_event *event, event_sched_out(event, cpuctx, ctx); if (flags & DETACH_GROUP) perf_group_detach(event); + if (flags & DETACH_CHILD) + perf_child_detach(event); list_del_event(event, ctx); if (!ctx->nr_events && ctx->is_active) { @@ -2363,25 +2386,21 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla lockdep_assert_held(&ctx->mutex); - event_function_call(event, __perf_remove_from_context, (void *)flags); - /* - * The above event_function_call() can NO-OP when it hits - * TASK_TOMBSTONE. In that case we must already have been detached - * from the context (by perf_event_exit_event()) but the grouping - * might still be in-tact. + * Because of perf_event_exit_task(), perf_remove_from_context() ought + * to work in the face of TASK_TOMBSTONE, unlike every other + * event_function_call() user. */ - WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); - if ((flags & DETACH_GROUP) && - (event->attach_state & PERF_ATTACH_GROUP)) { - /* - * Since in that case we cannot possibly be scheduled, simply - * detach now. - */ - raw_spin_lock_irq(&ctx->lock); - perf_group_detach(event); + raw_spin_lock_irq(&ctx->lock); + if (!ctx->is_active) { + __perf_remove_from_context(event, __get_cpu_context(ctx), + ctx, (void *)flags); raw_spin_unlock_irq(&ctx->lock); + return; } + raw_spin_unlock_irq(&ctx->lock); + + event_function_call(event, __perf_remove_from_context, (void *)flags); } /* @@ -12377,14 +12396,17 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) } EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); -static void sync_child_event(struct perf_event *child_event, - struct task_struct *child) +static void sync_child_event(struct perf_event *child_event) { struct perf_event *parent_event = child_event->parent; u64 child_val; - if (child_event->attr.inherit_stat) - perf_event_read_event(child_event, child); + if (child_event->attr.inherit_stat) { + struct task_struct *task = child_event->ctx->task; + + if (task && task != TASK_TOMBSTONE) + perf_event_read_event(child_event, task); + } child_val = perf_event_count(child_event); @@ -12399,60 +12421,53 @@ static void sync_child_event(struct perf_event *child_event, } static void -perf_event_exit_event(struct perf_event *child_event, - struct perf_event_context *child_ctx, - struct task_struct *child) +perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) { - struct perf_event *parent_event = child_event->parent; + struct perf_event *parent_event = event->parent; + unsigned long detach_flags = 0; - /* - * Do not destroy the 'original' grouping; because of the context - * switch optimization the original events could've ended up in a - * random child task. - * - * If we were to destroy the original group, all group related - * operations would cease to function properly after this random - * child dies. - * - * Do destroy all inherited groups, we don't care about those - * and being thorough is better. - */ - raw_spin_lock_irq(&child_ctx->lock); - WARN_ON_ONCE(child_ctx->is_active); + if (parent_event) { + /* + * Do not destroy the 'original' grouping; because of the + * context switch optimization the original events could've + * ended up in a random child task. + * + * If we were to destroy the original group, all group related + * operations would cease to function properly after this + * random child dies. + * + * Do destroy all inherited groups, we don't care about those + * and being thorough is better. + */ + detach_flags = DETACH_GROUP | DETACH_CHILD; + mutex_lock(&parent_event->child_mutex); + } - if (parent_event) - perf_group_detach(child_event); - list_del_event(child_event, child_ctx); - perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT); /* is_event_hup() */ - raw_spin_unlock_irq(&child_ctx->lock); + perf_remove_from_context(event, detach_flags); + + raw_spin_lock_irq(&ctx->lock); + if (event->state > PERF_EVENT_STATE_EXIT) + perf_event_set_state(event, PERF_EVENT_STATE_EXIT); + raw_spin_unlock_irq(&ctx->lock); /* - * Parent events are governed by their filedesc, retain them. + * Child events can be freed. */ - if (!parent_event) { - perf_event_wakeup(child_event); + if (parent_event) { + mutex_unlock(&parent_event->child_mutex); + /* + * Kick perf_poll() for is_event_hup(); + */ + perf_event_wakeup(parent_event); + free_event(event); + put_event(parent_event); return; } - /* - * Child events can be cleaned up. - */ - - sync_child_event(child_event, child); /* - * Remove this event from the parent's list - */ - WARN_ON_ONCE(parent_event->ctx->parent_ctx); - mutex_lock(&parent_event->child_mutex); - list_del_init(&child_event->child_list); - mutex_unlock(&parent_event->child_mutex); - - /* - * Kick perf_poll() for is_event_hup(). + * Parent events are governed by their filedesc, retain them. */ - perf_event_wakeup(parent_event); - free_event(child_event); - put_event(parent_event); + perf_event_wakeup(event); } static void perf_event_exit_task_context(struct task_struct *child, int ctxn) @@ -12509,7 +12524,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) perf_event_task(child, child_ctx, 0); list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) - perf_event_exit_event(child_event, child_ctx, child); + perf_event_exit_event(child_event, child_ctx); mutex_unlock(&child_ctx->mutex); @@ -12769,6 +12784,7 @@ inherit_event(struct perf_event *parent_event, */ raw_spin_lock_irqsave(&child_ctx->lock, flags); add_event_to_ctx(child_event, child_ctx); + child_event->attach_state |= PERF_ATTACH_CHILD; raw_spin_unlock_irqrestore(&child_ctx->lock, flags); /* -- cgit v1.2.3-59-g8ed1b From 47f661eca0700928012e11c57ea0328f5ccfc3b9 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 8 Apr 2021 12:35:57 +0200 Subject: perf: Apply PERF_EVENT_IOC_MODIFY_ATTRIBUTES to children As with other ioctls (such as PERF_EVENT_IOC_{ENABLE,DISABLE}), fix up handling of PERF_EVENT_IOC_MODIFY_ATTRIBUTES to also apply to children. Suggested-by: Dmitry Vyukov Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dmitry Vyukov Link: https://lkml.kernel.org/r/20210408103605.1676875-3-elver@google.com --- kernel/events/core.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 318ff7b021b4..10ed2cd434dc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3200,16 +3200,36 @@ static int perf_event_modify_breakpoint(struct perf_event *bp, static int perf_event_modify_attr(struct perf_event *event, struct perf_event_attr *attr) { + int (*func)(struct perf_event *, struct perf_event_attr *); + struct perf_event *child; + int err; + if (event->attr.type != attr->type) return -EINVAL; switch (event->attr.type) { case PERF_TYPE_BREAKPOINT: - return perf_event_modify_breakpoint(event, attr); + func = perf_event_modify_breakpoint; + break; default: /* Place holder for future additions. */ return -EOPNOTSUPP; } + + WARN_ON_ONCE(event->ctx->parent_ctx); + + mutex_lock(&event->child_mutex); + err = func(event, attr); + if (err) + goto out; + list_for_each_entry(child, &event->child_list, child_list) { + err = func(child, attr); + if (err) + goto out; + } +out: + mutex_unlock(&event->child_mutex); + return err; } static void ctx_sched_out(struct perf_event_context *ctx, -- cgit v1.2.3-59-g8ed1b From 2b26f0aa004995f49f7b6f4100dd0e4c39a9ed5f Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 8 Apr 2021 12:35:58 +0200 Subject: perf: Support only inheriting events if cloned with CLONE_THREAD Adds bit perf_event_attr::inherit_thread, to restricting inheriting events only if the child was cloned with CLONE_THREAD. This option supports the case where an event is supposed to be process-wide only (including subthreads), but should not propagate beyond the current process's shared environment. Suggested-by: Peter Zijlstra Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/lkml/YBvj6eJR%2FDY2TsEB@hirez.programming.kicks-ass.net/ --- include/linux/perf_event.h | 5 +++-- include/uapi/linux/perf_event.h | 3 ++- kernel/events/core.c | 21 ++++++++++++++------- kernel/fork.c | 2 +- 4 files changed, 20 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 3d478abf411c..1660039199b2 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -958,7 +958,7 @@ extern void __perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task); extern void __perf_event_task_sched_out(struct task_struct *prev, struct task_struct *next); -extern int perf_event_init_task(struct task_struct *child); +extern int perf_event_init_task(struct task_struct *child, u64 clone_flags); extern void perf_event_exit_task(struct task_struct *child); extern void perf_event_free_task(struct task_struct *task); extern void perf_event_delayed_put(struct task_struct *task); @@ -1449,7 +1449,8 @@ perf_event_task_sched_in(struct task_struct *prev, static inline void perf_event_task_sched_out(struct task_struct *prev, struct task_struct *next) { } -static inline int perf_event_init_task(struct task_struct *child) { return 0; } +static inline int perf_event_init_task(struct task_struct *child, + u64 clone_flags) { return 0; } static inline void perf_event_exit_task(struct task_struct *child) { } static inline void perf_event_free_task(struct task_struct *task) { } static inline void perf_event_delayed_put(struct task_struct *task) { } diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index ad15e40d7f5d..813efb65fea8 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -389,7 +389,8 @@ struct perf_event_attr { cgroup : 1, /* include cgroup events */ text_poke : 1, /* include text poke events */ build_id : 1, /* use build id in mmap2 events */ - __reserved_1 : 29; + inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */ + __reserved_1 : 28; union { __u32 wakeup_events; /* wakeup every n events */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 10ed2cd434dc..3e3c00fd0b2e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11653,6 +11653,9 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, (attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT)) return -EINVAL; + if (!attr->inherit && attr->inherit_thread) + return -EINVAL; + out: return ret; @@ -12873,12 +12876,13 @@ static int inherit_task_group(struct perf_event *event, struct task_struct *parent, struct perf_event_context *parent_ctx, struct task_struct *child, int ctxn, - int *inherited_all) + u64 clone_flags, int *inherited_all) { int ret; struct perf_event_context *child_ctx; - if (!event->attr.inherit) { + if (!event->attr.inherit || + (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD))) { *inherited_all = 0; return 0; } @@ -12910,7 +12914,8 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, /* * Initialize the perf_event context in task_struct */ -static int perf_event_init_context(struct task_struct *child, int ctxn) +static int perf_event_init_context(struct task_struct *child, int ctxn, + u64 clone_flags) { struct perf_event_context *child_ctx, *parent_ctx; struct perf_event_context *cloned_ctx; @@ -12950,7 +12955,8 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) */ perf_event_groups_for_each(event, &parent_ctx->pinned_groups) { ret = inherit_task_group(event, parent, parent_ctx, - child, ctxn, &inherited_all); + child, ctxn, clone_flags, + &inherited_all); if (ret) goto out_unlock; } @@ -12966,7 +12972,8 @@ static int perf_event_init_context(struct task_struct *child, int ctxn) perf_event_groups_for_each(event, &parent_ctx->flexible_groups) { ret = inherit_task_group(event, parent, parent_ctx, - child, ctxn, &inherited_all); + child, ctxn, clone_flags, + &inherited_all); if (ret) goto out_unlock; } @@ -13008,7 +13015,7 @@ out_unlock: /* * Initialize the perf_event context in task_struct */ -int perf_event_init_task(struct task_struct *child) +int perf_event_init_task(struct task_struct *child, u64 clone_flags) { int ctxn, ret; @@ -13017,7 +13024,7 @@ int perf_event_init_task(struct task_struct *child) INIT_LIST_HEAD(&child->perf_event_list); for_each_task_context_nr(ctxn) { - ret = perf_event_init_context(child, ctxn); + ret = perf_event_init_context(child, ctxn, clone_flags); if (ret) { perf_event_free_task(child); return ret; diff --git a/kernel/fork.c b/kernel/fork.c index 0acc8ed1076b..3728a645771c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2078,7 +2078,7 @@ static __latent_entropy struct task_struct *copy_process( if (retval) goto bad_fork_cleanup_policy; - retval = perf_event_init_task(p); + retval = perf_event_init_task(p, clone_flags); if (retval) goto bad_fork_cleanup_policy; retval = audit_alloc(p); -- cgit v1.2.3-59-g8ed1b From 2e498d0a74e5b88a6689ae1b811f247f91ff188e Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 8 Apr 2021 12:35:59 +0200 Subject: perf: Add support for event removal on exec Adds bit perf_event_attr::remove_on_exec, to support removing an event from a task on exec. This option supports the case where an event is supposed to be process-wide only, and should not propagate beyond exec, to limit monitoring to the original process image only. Suggested-by: Peter Zijlstra Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210408103605.1676875-5-elver@google.com --- include/uapi/linux/perf_event.h | 3 +- kernel/events/core.c | 70 ++++++++++++++++++++++++++++++++++++----- 2 files changed, 64 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 813efb65fea8..8c5b9f5ad63f 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -390,7 +390,8 @@ struct perf_event_attr { text_poke : 1, /* include text poke events */ build_id : 1, /* use build id in mmap2 events */ inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */ - __reserved_1 : 28; + remove_on_exec : 1, /* event is removed from task on exec */ + __reserved_1 : 27; union { __u32 wakeup_events; /* wakeup every n events */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 3e3c00fd0b2e..e4a584bceb7a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4248,6 +4248,57 @@ out: put_ctx(clone_ctx); } +static void perf_remove_from_owner(struct perf_event *event); +static void perf_event_exit_event(struct perf_event *event, + struct perf_event_context *ctx); + +/* + * Removes all events from the current task that have been marked + * remove-on-exec, and feeds their values back to parent events. + */ +static void perf_event_remove_on_exec(int ctxn) +{ + struct perf_event_context *ctx, *clone_ctx = NULL; + struct perf_event *event, *next; + LIST_HEAD(free_list); + unsigned long flags; + bool modified = false; + + ctx = perf_pin_task_context(current, ctxn); + if (!ctx) + return; + + mutex_lock(&ctx->mutex); + + if (WARN_ON_ONCE(ctx->task != current)) + goto unlock; + + list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) { + if (!event->attr.remove_on_exec) + continue; + + if (!is_kernel_event(event)) + perf_remove_from_owner(event); + + modified = true; + + perf_event_exit_event(event, ctx); + } + + raw_spin_lock_irqsave(&ctx->lock, flags); + if (modified) + clone_ctx = unclone_ctx(ctx); + --ctx->pin_count; + raw_spin_unlock_irqrestore(&ctx->lock, flags); + +unlock: + mutex_unlock(&ctx->mutex); + + put_ctx(ctx); + if (clone_ctx) + put_ctx(clone_ctx); +} + struct perf_read_data { struct perf_event *event; bool group; @@ -7560,18 +7611,18 @@ void perf_event_exec(void) struct perf_event_context *ctx; int ctxn; - rcu_read_lock(); for_each_task_context_nr(ctxn) { - ctx = current->perf_event_ctxp[ctxn]; - if (!ctx) - continue; - perf_event_enable_on_exec(ctxn); + perf_event_remove_on_exec(ctxn); - perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, - true); + rcu_read_lock(); + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (ctx) { + perf_iterate_ctx(ctx, perf_event_addr_filters_exec, + NULL, true); + } + rcu_read_unlock(); } - rcu_read_unlock(); } struct remote_output { @@ -11656,6 +11707,9 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (!attr->inherit && attr->inherit_thread) return -EINVAL; + if (attr->remove_on_exec && attr->enable_on_exec) + return -EINVAL; + out: return ret; -- cgit v1.2.3-59-g8ed1b From fb6cc127e0b6e629252cdd0f77d5a1f49db95b92 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 8 Apr 2021 12:36:00 +0200 Subject: signal: Introduce TRAP_PERF si_code and si_perf to siginfo Introduces the TRAP_PERF si_code, and associated siginfo_t field si_perf. These will be used by the perf event subsystem to send signals (if requested) to the task where an event occurred. Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Acked-by: Geert Uytterhoeven # m68k Acked-by: Arnd Bergmann # asm-generic Link: https://lkml.kernel.org/r/20210408103605.1676875-6-elver@google.com --- arch/m68k/kernel/signal.c | 3 +++ arch/x86/kernel/signal_compat.c | 5 ++++- fs/signalfd.c | 4 ++++ include/linux/compat.h | 2 ++ include/linux/signal.h | 1 + include/uapi/asm-generic/siginfo.h | 6 +++++- include/uapi/linux/signalfd.h | 4 +++- kernel/signal.c | 11 +++++++++++ 8 files changed, 33 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c index 349570f16a78..a4b7ee1df211 100644 --- a/arch/m68k/kernel/signal.c +++ b/arch/m68k/kernel/signal.c @@ -622,6 +622,9 @@ static inline void siginfo_build_tests(void) /* _sigfault._addr_pkey */ BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x12); + /* _sigfault._perf */ + BUILD_BUG_ON(offsetof(siginfo_t, si_perf) != 0x10); + /* _sigpoll */ BUILD_BUG_ON(offsetof(siginfo_t, si_band) != 0x0c); BUILD_BUG_ON(offsetof(siginfo_t, si_fd) != 0x10); diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index a5330ff498f0..0e5d0a7e203b 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -29,7 +29,7 @@ static inline void signal_compat_build_tests(void) BUILD_BUG_ON(NSIGFPE != 15); BUILD_BUG_ON(NSIGSEGV != 9); BUILD_BUG_ON(NSIGBUS != 5); - BUILD_BUG_ON(NSIGTRAP != 5); + BUILD_BUG_ON(NSIGTRAP != 6); BUILD_BUG_ON(NSIGCHLD != 6); BUILD_BUG_ON(NSIGSYS != 2); @@ -138,6 +138,9 @@ static inline void signal_compat_build_tests(void) BUILD_BUG_ON(offsetof(siginfo_t, si_pkey) != 0x20); BUILD_BUG_ON(offsetof(compat_siginfo_t, si_pkey) != 0x14); + BUILD_BUG_ON(offsetof(siginfo_t, si_perf) != 0x18); + BUILD_BUG_ON(offsetof(compat_siginfo_t, si_perf) != 0x10); + CHECK_CSI_OFFSET(_sigpoll); CHECK_CSI_SIZE (_sigpoll, 2*sizeof(int)); CHECK_SI_SIZE (_sigpoll, 4*sizeof(int)); diff --git a/fs/signalfd.c b/fs/signalfd.c index 456046e15873..040a1142915f 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -134,6 +134,10 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, #endif new.ssi_addr_lsb = (short) kinfo->si_addr_lsb; break; + case SIL_PERF_EVENT: + new.ssi_addr = (long) kinfo->si_addr; + new.ssi_perf = kinfo->si_perf; + break; case SIL_CHLD: new.ssi_pid = kinfo->si_pid; new.ssi_uid = kinfo->si_uid; diff --git a/include/linux/compat.h b/include/linux/compat.h index 6e65be753603..c8821d966812 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -236,6 +236,8 @@ typedef struct compat_siginfo { char _dummy_pkey[__COMPAT_ADDR_BND_PKEY_PAD]; u32 _pkey; } _addr_pkey; + /* used when si_code=TRAP_PERF */ + compat_u64 _perf; }; } _sigfault; diff --git a/include/linux/signal.h b/include/linux/signal.h index 205526c4003a..1e98548d7cf6 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -43,6 +43,7 @@ enum siginfo_layout { SIL_FAULT_MCEERR, SIL_FAULT_BNDERR, SIL_FAULT_PKUERR, + SIL_PERF_EVENT, SIL_CHLD, SIL_RT, SIL_SYS, diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h index d2597000407a..d0bb9125c853 100644 --- a/include/uapi/asm-generic/siginfo.h +++ b/include/uapi/asm-generic/siginfo.h @@ -91,6 +91,8 @@ union __sifields { char _dummy_pkey[__ADDR_BND_PKEY_PAD]; __u32 _pkey; } _addr_pkey; + /* used when si_code=TRAP_PERF */ + __u64 _perf; }; } _sigfault; @@ -155,6 +157,7 @@ typedef struct siginfo { #define si_lower _sifields._sigfault._addr_bnd._lower #define si_upper _sifields._sigfault._addr_bnd._upper #define si_pkey _sifields._sigfault._addr_pkey._pkey +#define si_perf _sifields._sigfault._perf #define si_band _sifields._sigpoll._band #define si_fd _sifields._sigpoll._fd #define si_call_addr _sifields._sigsys._call_addr @@ -253,7 +256,8 @@ typedef struct siginfo { #define TRAP_BRANCH 3 /* process taken branch trap */ #define TRAP_HWBKPT 4 /* hardware breakpoint/watchpoint */ #define TRAP_UNK 5 /* undiagnosed trap */ -#define NSIGTRAP 5 +#define TRAP_PERF 6 /* perf event with sigtrap=1 */ +#define NSIGTRAP 6 /* * There is an additional set of SIGTRAP si_codes used by ptrace diff --git a/include/uapi/linux/signalfd.h b/include/uapi/linux/signalfd.h index 83429a05b698..7e333042c7e3 100644 --- a/include/uapi/linux/signalfd.h +++ b/include/uapi/linux/signalfd.h @@ -39,6 +39,8 @@ struct signalfd_siginfo { __s32 ssi_syscall; __u64 ssi_call_addr; __u32 ssi_arch; + __u32 __pad3; + __u64 ssi_perf; /* * Pad strcture to 128 bytes. Remember to update the @@ -49,7 +51,7 @@ struct signalfd_siginfo { * comes out of a read(2) and we really don't want to have * a compat on read(2). */ - __u8 __pad[28]; + __u8 __pad[16]; }; diff --git a/kernel/signal.c b/kernel/signal.c index ba4d1ef39a9e..f68351825e5e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1199,6 +1199,7 @@ static inline bool has_si_pid_and_uid(struct kernel_siginfo *info) case SIL_FAULT_MCEERR: case SIL_FAULT_BNDERR: case SIL_FAULT_PKUERR: + case SIL_PERF_EVENT: case SIL_SYS: ret = false; break; @@ -2531,6 +2532,7 @@ static void hide_si_addr_tag_bits(struct ksignal *ksig) case SIL_FAULT_MCEERR: case SIL_FAULT_BNDERR: case SIL_FAULT_PKUERR: + case SIL_PERF_EVENT: ksig->info.si_addr = arch_untagged_si_addr( ksig->info.si_addr, ksig->sig, ksig->info.si_code); break; @@ -3333,6 +3335,10 @@ void copy_siginfo_to_external32(struct compat_siginfo *to, #endif to->si_pkey = from->si_pkey; break; + case SIL_PERF_EVENT: + to->si_addr = ptr_to_compat(from->si_addr); + to->si_perf = from->si_perf; + break; case SIL_CHLD: to->si_pid = from->si_pid; to->si_uid = from->si_uid; @@ -3413,6 +3419,10 @@ static int post_copy_siginfo_from_user32(kernel_siginfo_t *to, #endif to->si_pkey = from->si_pkey; break; + case SIL_PERF_EVENT: + to->si_addr = compat_ptr(from->si_addr); + to->si_perf = from->si_perf; + break; case SIL_CHLD: to->si_pid = from->si_pid; to->si_uid = from->si_uid; @@ -4593,6 +4603,7 @@ static inline void siginfo_buildtime_checks(void) CHECK_OFFSET(si_lower); CHECK_OFFSET(si_upper); CHECK_OFFSET(si_pkey); + CHECK_OFFSET(si_perf); /* sigpoll */ CHECK_OFFSET(si_band); -- cgit v1.2.3-59-g8ed1b From 97ba62b278674293762c3d91f724f1bb922f04e0 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 8 Apr 2021 12:36:01 +0200 Subject: perf: Add support for SIGTRAP on perf events Adds bit perf_event_attr::sigtrap, which can be set to cause events to send SIGTRAP (with si_code TRAP_PERF) to the task where the event occurred. The primary motivation is to support synchronous signals on perf events in the task where an event (such as breakpoints) triggered. To distinguish perf events based on the event type, the type is set in si_errno. For events that are associated with an address, si_addr is copied from perf_sample_data. The new field perf_event_attr::sig_data is copied to si_perf, which allows user space to disambiguate which event (of the same type) triggered the signal. For example, user space could encode the relevant information it cares about in sig_data. We note that the choice of an opaque u64 provides the simplest and most flexible option. Alternatives where a reference to some user space data is passed back suffer from the problem that modification of referenced data (be it the event fd, or the perf_event_attr) can race with the signal being delivered (of course, the same caveat applies if user space decides to store a pointer in sig_data, but the ABI explicitly avoids prescribing such a design). Suggested-by: Peter Zijlstra Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Acked-by: Dmitry Vyukov Link: https://lore.kernel.org/lkml/YBv3rAT566k+6zjg@hirez.programming.kicks-ass.net/ --- include/linux/perf_event.h | 1 + include/uapi/linux/perf_event.h | 10 ++++++++- kernel/events/core.c | 49 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 58 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1660039199b2..7d7280aa4e22 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -735,6 +735,7 @@ struct perf_event { int pending_wakeup; int pending_kill; int pending_disable; + unsigned long pending_addr; /* SIGTRAP */ struct irq_work pending; atomic_t event_limit; diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 8c5b9f5ad63f..31b00e3b69c9 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -311,6 +311,7 @@ enum perf_event_read_format { #define PERF_ATTR_SIZE_VER4 104 /* add: sample_regs_intr */ #define PERF_ATTR_SIZE_VER5 112 /* add: aux_watermark */ #define PERF_ATTR_SIZE_VER6 120 /* add: aux_sample_size */ +#define PERF_ATTR_SIZE_VER7 128 /* add: sig_data */ /* * Hardware event_id to monitor via a performance monitoring event: @@ -391,7 +392,8 @@ struct perf_event_attr { build_id : 1, /* use build id in mmap2 events */ inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */ remove_on_exec : 1, /* event is removed from task on exec */ - __reserved_1 : 27; + sigtrap : 1, /* send synchronous SIGTRAP on event */ + __reserved_1 : 26; union { __u32 wakeup_events; /* wakeup every n events */ @@ -443,6 +445,12 @@ struct perf_event_attr { __u16 __reserved_2; __u32 aux_sample_size; __u32 __reserved_3; + + /* + * User provided data if sigtrap=1, passed back to user via + * siginfo_t::si_perf, e.g. to permit user to identify the event. + */ + __u64 sig_data; }; /* diff --git a/kernel/events/core.c b/kernel/events/core.c index e4a584bceb7a..6f0723c711a9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6392,6 +6392,33 @@ void perf_event_wakeup(struct perf_event *event) } } +static void perf_sigtrap(struct perf_event *event) +{ + struct kernel_siginfo info; + + /* + * We'd expect this to only occur if the irq_work is delayed and either + * ctx->task or current has changed in the meantime. This can be the + * case on architectures that do not implement arch_irq_work_raise(). + */ + if (WARN_ON_ONCE(event->ctx->task != current)) + return; + + /* + * perf_pending_event() can race with the task exiting. + */ + if (current->flags & PF_EXITING) + return; + + clear_siginfo(&info); + info.si_signo = SIGTRAP; + info.si_code = TRAP_PERF; + info.si_errno = event->attr.type; + info.si_perf = event->attr.sig_data; + info.si_addr = (void __user *)event->pending_addr; + force_sig_info(&info); +} + static void perf_pending_event_disable(struct perf_event *event) { int cpu = READ_ONCE(event->pending_disable); @@ -6401,6 +6428,13 @@ static void perf_pending_event_disable(struct perf_event *event) if (cpu == smp_processor_id()) { WRITE_ONCE(event->pending_disable, -1); + + if (event->attr.sigtrap) { + perf_sigtrap(event); + atomic_set_release(&event->event_limit, 1); /* rearm event */ + return; + } + perf_event_disable_local(event); return; } @@ -9103,6 +9137,7 @@ static int __perf_event_overflow(struct perf_event *event, if (events && atomic_dec_and_test(&event->event_limit)) { ret = 1; event->pending_kill = POLL_HUP; + event->pending_addr = data->addr; perf_event_disable_inatomic(event); } @@ -11384,6 +11419,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (!task || cpu != -1) return ERR_PTR(-EINVAL); } + if (attr->sigtrap && !task) { + /* Requires a task: avoid signalling random tasks. */ + return ERR_PTR(-EINVAL); + } node = (cpu >= 0) ? cpu_to_node(cpu) : -1; event = kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO, @@ -11432,6 +11471,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, event->state = PERF_EVENT_STATE_INACTIVE; + if (event->attr.sigtrap) + atomic_set(&event->event_limit, 1); + if (task) { event->attach_state = PERF_ATTACH_TASK; /* @@ -11710,6 +11752,9 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (attr->remove_on_exec && attr->enable_on_exec) return -EINVAL; + if (attr->sigtrap && !attr->remove_on_exec) + return -EINVAL; + out: return ret; @@ -12936,7 +12981,9 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, struct perf_event_context *child_ctx; if (!event->attr.inherit || - (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD))) { + (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) || + /* Do not inherit if sigtrap and signal handlers were cleared. */ + (event->attr.sigtrap && (clone_flags & CLONE_CLEAR_SIGHAND))) { *inherited_all = 0; return 0; } -- cgit v1.2.3-59-g8ed1b From 55bcf6ef314ae8ba81bcd74aa760247b635ed47b Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 12 Apr 2021 07:31:01 -0700 Subject: perf: Extend PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE Current Hardware events and Hardware cache events have special perf types, PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE. The two types don't pass the PMU type in the user interface. For a hybrid system, the perf subsystem doesn't know which PMU the events belong to. The first capable PMU will always be assigned to the events. The events never get a chance to run on the other capable PMUs. Extend the two types to become PMU aware types. The PMU type ID is stored at attr.config[63:32]. Add a new PMU capability, PERF_PMU_CAP_EXTENDED_HW_TYPE, to indicate a PMU which supports the extended PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE. The PMU type is only required when searching a specific PMU. The PMU specific codes will only be interested in the 'real' config value, which is stored in the low 32 bit of the event->attr.config. Update the event->attr.config in the generic code, so the PMU specific codes don't need to calculate it separately. If a user specifies a PMU type, but the PMU doesn't support the extended type, error out. If an event cannot be initialized in a PMU specified by a user, error out immediately. Perf should not try to open it on other PMUs. The new PMU capability is only set for the X86 hybrid PMUs for now. Other architectures, e.g., ARM, may need it as well. The support on ARM may be implemented later separately. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1618237865-33448-22-git-send-email-kan.liang@linux.intel.com --- arch/x86/events/core.c | 1 + include/linux/perf_event.h | 19 ++++++++++--------- include/uapi/linux/perf_event.h | 15 +++++++++++++++ kernel/events/core.c | 19 ++++++++++++++++--- 4 files changed, 42 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 4f6595ee6359..3fe66b7aa721 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2173,6 +2173,7 @@ static int __init init_hw_perf_events(void) hybrid_pmu->pmu.type = -1; hybrid_pmu->pmu.attr_update = x86_pmu.attr_update; hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_HETEROGENEOUS_CPUS; + hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_EXTENDED_HW_TYPE; err = perf_pmu_register(&hybrid_pmu->pmu, hybrid_pmu->name, (hybrid_pmu->cpu_type == hybrid_big) ? PERF_TYPE_RAW : -1); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 61b385154c4a..a763928a0e41 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -260,15 +260,16 @@ struct perf_event; /** * pmu::capabilities flags */ -#define PERF_PMU_CAP_NO_INTERRUPT 0x01 -#define PERF_PMU_CAP_NO_NMI 0x02 -#define PERF_PMU_CAP_AUX_NO_SG 0x04 -#define PERF_PMU_CAP_EXTENDED_REGS 0x08 -#define PERF_PMU_CAP_EXCLUSIVE 0x10 -#define PERF_PMU_CAP_ITRACE 0x20 -#define PERF_PMU_CAP_HETEROGENEOUS_CPUS 0x40 -#define PERF_PMU_CAP_NO_EXCLUDE 0x80 -#define PERF_PMU_CAP_AUX_OUTPUT 0x100 +#define PERF_PMU_CAP_NO_INTERRUPT 0x0001 +#define PERF_PMU_CAP_NO_NMI 0x0002 +#define PERF_PMU_CAP_AUX_NO_SG 0x0004 +#define PERF_PMU_CAP_EXTENDED_REGS 0x0008 +#define PERF_PMU_CAP_EXCLUSIVE 0x0010 +#define PERF_PMU_CAP_ITRACE 0x0020 +#define PERF_PMU_CAP_HETEROGENEOUS_CPUS 0x0040 +#define PERF_PMU_CAP_NO_EXCLUDE 0x0080 +#define PERF_PMU_CAP_AUX_OUTPUT 0x0100 +#define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0200 struct perf_output_handle; diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 0b58970bab6b..e54e639248c8 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -37,6 +37,21 @@ enum perf_type_id { PERF_TYPE_MAX, /* non-ABI */ }; +/* + * attr.config layout for type PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE + * PERF_TYPE_HARDWARE: 0xEEEEEEEE000000AA + * AA: hardware event ID + * EEEEEEEE: PMU type ID + * PERF_TYPE_HW_CACHE: 0xEEEEEEEE00DDCCBB + * BB: hardware cache ID + * CC: hardware cache op ID + * DD: hardware cache op result ID + * EEEEEEEE: PMU type ID + * If the PMU type ID is 0, the PERF_TYPE_RAW will be applied. + */ +#define PERF_PMU_TYPE_SHIFT 32 +#define PERF_HW_EVENT_MASK 0xffffffff + /* * Generalized performance event event_id types, used by the * attr.event_id parameter of the sys_perf_event_open() diff --git a/kernel/events/core.c b/kernel/events/core.c index 6f0723c711a9..928b166d888e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11220,6 +11220,7 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) static struct pmu *perf_init_event(struct perf_event *event) { + bool extended_type = false; int idx, type, ret; struct pmu *pmu; @@ -11238,16 +11239,27 @@ static struct pmu *perf_init_event(struct perf_event *event) * are often aliases for PERF_TYPE_RAW. */ type = event->attr.type; - if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) - type = PERF_TYPE_RAW; + if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) { + type = event->attr.config >> PERF_PMU_TYPE_SHIFT; + if (!type) { + type = PERF_TYPE_RAW; + } else { + extended_type = true; + event->attr.config &= PERF_HW_EVENT_MASK; + } + } again: rcu_read_lock(); pmu = idr_find(&pmu_idr, type); rcu_read_unlock(); if (pmu) { + if (event->attr.type != type && type != PERF_TYPE_RAW && + !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE)) + goto fail; + ret = perf_try_init_event(pmu, event); - if (ret == -ENOENT && event->attr.type != type) { + if (ret == -ENOENT && event->attr.type != type && !extended_type) { type = event->attr.type; goto again; } @@ -11268,6 +11280,7 @@ again: goto unlock; } } +fail: pmu = ERR_PTR(-ENOENT); unlock: srcu_read_unlock(&pmus_srcu, idx); -- cgit v1.2.3-59-g8ed1b From ed8e50800bf4c2d904db9c75408a67085e6cca3d Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 22 Apr 2021 21:18:23 +0200 Subject: signal, perf: Add missing TRAP_PERF case in siginfo_layout() Add the missing TRAP_PERF case in siginfo_layout() for interpreting the layout correctly as SIL_PERF_EVENT instead of just SIL_FAULT. This ensures the si_perf field is copied and not just the si_addr field. This was caught and tested by running the perf_events/sigtrap_threads kselftest as a 32-bit binary with a 64-bit kernel. Fixes: fb6cc127e0b6 ("signal: Introduce TRAP_PERF si_code and si_perf to siginfo") Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210422191823.79012-2-elver@google.com --- kernel/signal.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index f68351825e5e..343d87c95c78 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3206,6 +3206,8 @@ enum siginfo_layout siginfo_layout(unsigned sig, int si_code) else if ((sig == SIGSEGV) && (si_code == SEGV_PKUERR)) layout = SIL_FAULT_PKUERR; #endif + else if ((sig == SIGTRAP) && (si_code == TRAP_PERF)) + layout = SIL_PERF_EVENT; } else if (si_code <= NSIGPOLL) layout = SIL_POLL; -- cgit v1.2.3-59-g8ed1b