aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/events
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/events')
-rw-r--r--kernel/events/callchain.c14
-rw-r--r--kernel/events/core.c309
-rw-r--r--kernel/events/internal.h25
3 files changed, 228 insertions, 120 deletions
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 179ef4640964..e9fdb5203de5 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -104,7 +104,7 @@ fail:
return -ENOMEM;
}
-int get_callchain_buffers(void)
+int get_callchain_buffers(int event_max_stack)
{
int err = 0;
int count;
@@ -121,6 +121,15 @@ int get_callchain_buffers(void)
/* If the allocation failed, give up */
if (!callchain_cpus_entries)
err = -ENOMEM;
+ /*
+ * If requesting per event more than the global cap,
+ * return a different error to help userspace figure
+ * this out.
+ *
+ * And also do it here so that we have &callchain_mutex held.
+ */
+ if (event_max_stack > sysctl_perf_event_max_stack)
+ err = -EOVERFLOW;
goto exit;
}
@@ -174,11 +183,12 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
bool user = !event->attr.exclude_callchain_user;
/* Disallow cross-task user callchains. */
bool crosstask = event->ctx->task && event->ctx->task != current;
+ const u32 max_stack = event->attr.sample_max_stack;
if (!kernel && !user)
return NULL;
- return get_perf_callchain(regs, 0, kernel, user, sysctl_perf_event_max_stack, crosstask, true);
+ return get_perf_callchain(regs, 0, kernel, user, max_stack, crosstask, true);
}
struct perf_callchain_entry *
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 274450efea90..356a6c7cb52a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -335,6 +335,7 @@ static atomic_t perf_sched_count;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
static DEFINE_PER_CPU(int, perf_sched_cb_usages);
+static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
@@ -396,6 +397,13 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
if (ret || !write)
return ret;
+ /*
+ * If throttling is disabled don't allow the write:
+ */
+ if (sysctl_perf_cpu_time_max_percent == 100 ||
+ sysctl_perf_cpu_time_max_percent == 0)
+ return -EINVAL;
+
max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
update_perf_cpu_limits();
@@ -1678,12 +1686,33 @@ static bool is_orphaned_event(struct perf_event *event)
return event->state == PERF_EVENT_STATE_DEAD;
}
-static inline int pmu_filter_match(struct perf_event *event)
+static inline int __pmu_filter_match(struct perf_event *event)
{
struct pmu *pmu = event->pmu;
return pmu->filter_match ? pmu->filter_match(event) : 1;
}
+/*
+ * Check whether we should attempt to schedule an event group based on
+ * PMU-specific filtering. An event group can consist of HW and SW events,
+ * potentially with a SW leader, so we must check all the filters, to
+ * determine whether a group is schedulable:
+ */
+static inline int pmu_filter_match(struct perf_event *event)
+{
+ struct perf_event *child;
+
+ if (!__pmu_filter_match(event))
+ return 0;
+
+ list_for_each_entry(child, &event->sibling_list, group_entry) {
+ if (!__pmu_filter_match(child))
+ return 0;
+ }
+
+ return 1;
+}
+
static inline int
event_filter_match(struct perf_event *event)
{
@@ -3665,6 +3694,39 @@ static void free_event_rcu(struct rcu_head *head)
static void ring_buffer_attach(struct perf_event *event,
struct ring_buffer *rb);
+static void detach_sb_event(struct perf_event *event)
+{
+ struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
+
+ raw_spin_lock(&pel->lock);
+ list_del_rcu(&event->sb_list);
+ raw_spin_unlock(&pel->lock);
+}
+
+static bool is_sb_event(struct perf_event *event)
+{
+ struct perf_event_attr *attr = &event->attr;
+
+ if (event->parent)
+ return false;
+
+ if (event->attach_state & PERF_ATTACH_TASK)
+ return false;
+
+ if (attr->mmap || attr->mmap_data || attr->mmap2 ||
+ attr->comm || attr->comm_exec ||
+ attr->task ||
+ attr->context_switch)
+ return true;
+ return false;
+}
+
+static void unaccount_pmu_sb_event(struct perf_event *event)
+{
+ if (is_sb_event(event))
+ detach_sb_event(event);
+}
+
static void unaccount_event_cpu(struct perf_event *event, int cpu)
{
if (event->parent)
@@ -3728,6 +3790,8 @@ static void unaccount_event(struct perf_event *event)
}
unaccount_event_cpu(event, event->cpu);
+
+ unaccount_pmu_sb_event(event);
}
static void perf_sched_delayed(struct work_struct *work)
@@ -3862,10 +3926,8 @@ static void _free_event(struct perf_event *event)
if (event->ctx)
put_ctx(event->ctx);
- if (event->pmu) {
- exclusive_event_destroy(event);
- module_put(event->pmu->module);
- }
+ exclusive_event_destroy(event);
+ module_put(event->pmu->module);
call_rcu(&event->rcu_head, free_event_rcu);
}
@@ -5555,16 +5617,26 @@ void perf_output_sample(struct perf_output_handle *handle,
}
if (sample_type & PERF_SAMPLE_RAW) {
- if (data->raw) {
- u32 raw_size = data->raw->size;
- u32 real_size = round_up(raw_size + sizeof(u32),
- sizeof(u64)) - sizeof(u32);
- u64 zero = 0;
-
- perf_output_put(handle, real_size);
- __output_copy(handle, data->raw->data, raw_size);
- if (real_size - raw_size)
- __output_copy(handle, &zero, real_size - raw_size);
+ struct perf_raw_record *raw = data->raw;
+
+ if (raw) {
+ struct perf_raw_frag *frag = &raw->frag;
+
+ perf_output_put(handle, raw->size);
+ do {
+ if (frag->copy) {
+ __output_custom(handle, frag->copy,
+ frag->data, frag->size);
+ } else {
+ __output_copy(handle, frag->data,
+ frag->size);
+ }
+ if (perf_raw_frag_last(frag))
+ break;
+ frag = frag->next;
+ } while (1);
+ if (frag->pad)
+ __output_skip(handle, NULL, frag->pad);
} else {
struct {
u32 size;
@@ -5689,14 +5761,28 @@ void perf_prepare_sample(struct perf_event_header *header,
}
if (sample_type & PERF_SAMPLE_RAW) {
- int size = sizeof(u32);
-
- if (data->raw)
- size += data->raw->size;
- else
- size += sizeof(u32);
+ struct perf_raw_record *raw = data->raw;
+ int size;
+
+ if (raw) {
+ struct perf_raw_frag *frag = &raw->frag;
+ u32 sum = 0;
+
+ do {
+ sum += frag->size;
+ if (perf_raw_frag_last(frag))
+ break;
+ frag = frag->next;
+ } while (1);
+
+ size = round_up(sum + sizeof(u32), sizeof(u64));
+ raw->size = size - sizeof(u32);
+ frag->pad = raw->size - sum;
+ } else {
+ size = sizeof(u64);
+ }
- header->size += round_up(size, sizeof(u64));
+ header->size += size;
}
if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
@@ -5856,11 +5942,11 @@ perf_event_read_event(struct perf_event *event,
perf_output_end(&handle);
}
-typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
+typedef void (perf_iterate_f)(struct perf_event *event, void *data);
static void
-perf_event_aux_ctx(struct perf_event_context *ctx,
- perf_event_aux_output_cb output,
+perf_iterate_ctx(struct perf_event_context *ctx,
+ perf_iterate_f output,
void *data, bool all)
{
struct perf_event *event;
@@ -5877,52 +5963,55 @@ perf_event_aux_ctx(struct perf_event_context *ctx,
}
}
-static void
-perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data,
- struct perf_event_context *task_ctx)
+static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
{
- rcu_read_lock();
- preempt_disable();
- perf_event_aux_ctx(task_ctx, output, data, false);
- preempt_enable();
- rcu_read_unlock();
+ struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
+ struct perf_event *event;
+
+ list_for_each_entry_rcu(event, &pel->list, sb_list) {
+ if (event->state < PERF_EVENT_STATE_INACTIVE)
+ continue;
+ if (!event_filter_match(event))
+ continue;
+ output(event, data);
+ }
}
+/*
+ * Iterate all events that need to receive side-band events.
+ *
+ * For new callers; ensure that account_pmu_sb_event() includes
+ * your event, otherwise it might not get delivered.
+ */
static void
-perf_event_aux(perf_event_aux_output_cb output, void *data,
+perf_iterate_sb(perf_iterate_f output, void *data,
struct perf_event_context *task_ctx)
{
- struct perf_cpu_context *cpuctx;
struct perf_event_context *ctx;
- struct pmu *pmu;
int ctxn;
+ rcu_read_lock();
+ preempt_disable();
+
/*
- * If we have task_ctx != NULL we only notify
- * the task context itself. The task_ctx is set
- * only for EXIT events before releasing task
+ * If we have task_ctx != NULL we only notify the task context itself.
+ * The task_ctx is set only for EXIT events before releasing task
* context.
*/
if (task_ctx) {
- perf_event_aux_task_ctx(output, data, task_ctx);
- return;
+ perf_iterate_ctx(task_ctx, output, data, false);
+ goto done;
}
- rcu_read_lock();
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
- if (cpuctx->unique_pmu != pmu)
- goto next;
- perf_event_aux_ctx(&cpuctx->ctx, output, data, false);
- ctxn = pmu->task_ctx_nr;
- if (ctxn < 0)
- goto next;
+ perf_iterate_sb_cpu(output, data);
+
+ for_each_task_context_nr(ctxn) {
ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
if (ctx)
- perf_event_aux_ctx(ctx, output, data, false);
-next:
- put_cpu_ptr(pmu->pmu_cpu_context);
+ perf_iterate_ctx(ctx, output, data, false);
}
+done:
+ preempt_enable();
rcu_read_unlock();
}
@@ -5971,7 +6060,7 @@ void perf_event_exec(void)
perf_event_enable_on_exec(ctxn);
- perf_event_aux_ctx(ctx, perf_event_addr_filters_exec, NULL,
+ perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
true);
}
rcu_read_unlock();
@@ -6015,9 +6104,9 @@ static int __perf_pmu_output_stop(void *info)
};
rcu_read_lock();
- perf_event_aux_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
+ perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
if (cpuctx->task_ctx)
- perf_event_aux_ctx(cpuctx->task_ctx, __perf_event_output_stop,
+ perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
&ro, false);
rcu_read_unlock();
@@ -6146,7 +6235,7 @@ static void perf_event_task(struct task_struct *task,
},
};
- perf_event_aux(perf_event_task_output,
+ perf_iterate_sb(perf_event_task_output,
&task_event,
task_ctx);
}
@@ -6225,7 +6314,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
- perf_event_aux(perf_event_comm_output,
+ perf_iterate_sb(perf_event_comm_output,
comm_event,
NULL);
}
@@ -6456,7 +6545,7 @@ got_name:
mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
- perf_event_aux(perf_event_mmap_output,
+ perf_iterate_sb(perf_event_mmap_output,
mmap_event,
NULL);
@@ -6539,7 +6628,7 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma)
if (!ctx)
continue;
- perf_event_aux_ctx(ctx, __perf_addr_filters_adjust, vma, true);
+ perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
}
rcu_read_unlock();
}
@@ -6726,7 +6815,7 @@ static void perf_event_switch(struct task_struct *task,
},
};
- perf_event_aux(perf_event_switch_output,
+ perf_iterate_sb(perf_event_switch_output,
&switch_event,
NULL);
}
@@ -7333,7 +7422,7 @@ static struct pmu perf_swevent = {
static int perf_tp_filter_match(struct perf_event *event,
struct perf_sample_data *data)
{
- void *record = data->raw->data;
+ void *record = data->raw->frag.data;
/* only top level events have filters set */
if (event->parent)
@@ -7389,8 +7478,10 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
struct perf_event *event;
struct perf_raw_record raw = {
- .size = entry_size,
- .data = record,
+ .frag = {
+ .size = entry_size,
+ .data = record,
+ },
};
perf_sample_data_init(&data, 0, 0);
@@ -8648,6 +8739,28 @@ unlock:
return pmu;
}
+static void attach_sb_event(struct perf_event *event)
+{
+ struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
+
+ raw_spin_lock(&pel->lock);
+ list_add_rcu(&event->sb_list, &pel->list);
+ raw_spin_unlock(&pel->lock);
+}
+
+/*
+ * We keep a list of all !task (and therefore per-cpu) events
+ * that need to receive side-band records.
+ *
+ * This avoids having to scan all the various PMU per-cpu contexts
+ * looking for them.
+ */
+static void account_pmu_sb_event(struct perf_event *event)
+{
+ if (is_sb_event(event))
+ attach_sb_event(event);
+}
+
static void account_event_cpu(struct perf_event *event, int cpu)
{
if (event->parent)
@@ -8728,6 +8841,8 @@ static void account_event(struct perf_event *event)
enabled:
account_event_cpu(event, event->cpu);
+
+ account_pmu_sb_event(event);
}
/*
@@ -8876,7 +8991,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (!event->parent) {
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
- err = get_callchain_buffers();
+ err = get_callchain_buffers(attr->sample_max_stack);
if (err)
goto err_addr_filters;
}
@@ -9198,6 +9313,9 @@ SYSCALL_DEFINE5(perf_event_open,
return -EINVAL;
}
+ if (!attr.sample_max_stack)
+ attr.sample_max_stack = sysctl_perf_event_max_stack;
+
/*
* In cgroup mode, the pid argument is used to pass the fd
* opened to the cgroup directory in cgroupfs. The cpu argument
@@ -9271,7 +9389,7 @@ SYSCALL_DEFINE5(perf_event_open,
if (is_sampling_event(event)) {
if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
- err = -ENOTSUPP;
+ err = -EOPNOTSUPP;
goto err_alloc;
}
}
@@ -10233,10 +10351,13 @@ static void __init perf_event_init_all_cpus(void)
swhash = &per_cpu(swevent_htable, cpu);
mutex_init(&swhash->hlist_mutex);
INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
+
+ INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
+ raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
}
}
-static void perf_event_init_cpu(int cpu)
+int perf_event_init_cpu(unsigned int cpu)
{
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
@@ -10249,6 +10370,7 @@ static void perf_event_init_cpu(int cpu)
rcu_assign_pointer(swhash->swevent_hlist, hlist);
}
mutex_unlock(&swhash->hlist_mutex);
+ return 0;
}
#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
@@ -10280,14 +10402,17 @@ static void perf_event_exit_cpu_context(int cpu)
}
srcu_read_unlock(&pmus_srcu, idx);
}
+#else
+
+static void perf_event_exit_cpu_context(int cpu) { }
-static void perf_event_exit_cpu(int cpu)
+#endif
+
+int perf_event_exit_cpu(unsigned int cpu)
{
perf_event_exit_cpu_context(cpu);
+ return 0;
}
-#else
-static inline void perf_event_exit_cpu(int cpu) { }
-#endif
static int
perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
@@ -10309,46 +10434,6 @@ static struct notifier_block perf_reboot_notifier = {
.priority = INT_MIN,
};
-static int
-perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
-{
- unsigned int cpu = (long)hcpu;
-
- switch (action & ~CPU_TASKS_FROZEN) {
-
- case CPU_UP_PREPARE:
- /*
- * This must be done before the CPU comes alive, because the
- * moment we can run tasks we can encounter (software) events.
- *
- * Specifically, someone can have inherited events on kthreadd
- * or a pre-existing worker thread that gets re-bound.
- */
- perf_event_init_cpu(cpu);
- break;
-
- case CPU_DOWN_PREPARE:
- /*
- * This must be done before the CPU dies because after that an
- * active event might want to IPI the CPU and that'll not work
- * so great for dead CPUs.
- *
- * XXX smp_call_function_single() return -ENXIO without a warn
- * so we could possibly deal with this.
- *
- * This is safe against new events arriving because
- * sys_perf_event_open() serializes against hotplug using
- * get_online_cpus().
- */
- perf_event_exit_cpu(cpu);
- break;
- default:
- break;
- }
-
- return NOTIFY_OK;
-}
-
void __init perf_event_init(void)
{
int ret;
@@ -10361,7 +10446,7 @@ void __init perf_event_init(void)
perf_pmu_register(&perf_cpu_clock, NULL, -1);
perf_pmu_register(&perf_task_clock, NULL, -1);
perf_tp_register();
- perf_cpu_notifier(perf_cpu_notify);
+ perf_event_init_cpu(smp_processor_id());
register_reboot_notifier(&perf_reboot_notifier);
ret = init_hw_breakpoint();
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 05f9f6d626df..486fd78eb8d5 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -123,21 +123,19 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb)
return rb->aux_nr_pages << PAGE_SHIFT;
}
-#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
-static inline unsigned long \
-func_name(struct perf_output_handle *handle, \
- const void *buf, unsigned long len) \
+#define __DEFINE_OUTPUT_COPY_BODY(advance_buf, memcpy_func, ...) \
{ \
unsigned long size, written; \
\
do { \
size = min(handle->size, len); \
- written = memcpy_func(handle->addr, buf, size); \
+ written = memcpy_func(__VA_ARGS__); \
written = size - written; \
\
len -= written; \
handle->addr += written; \
- buf += written; \
+ if (advance_buf) \
+ buf += written; \
handle->size -= written; \
if (!handle->size) { \
struct ring_buffer *rb = handle->rb; \
@@ -152,6 +150,21 @@ func_name(struct perf_output_handle *handle, \
return len; \
}
+#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
+static inline unsigned long \
+func_name(struct perf_output_handle *handle, \
+ const void *buf, unsigned long len) \
+__DEFINE_OUTPUT_COPY_BODY(true, memcpy_func, handle->addr, buf, size)
+
+static inline unsigned long
+__output_custom(struct perf_output_handle *handle, perf_copy_f copy_func,
+ const void *buf, unsigned long len)
+{
+ unsigned long orig_len = len;
+ __DEFINE_OUTPUT_COPY_BODY(false, copy_func, handle->addr, buf,
+ orig_len - len, size)
+}
+
static inline unsigned long
memcpy_common(void *dst, const void *src, unsigned long n)
{