From 30dd568c912602b7dbd609a45d053e01b13422bb Mon Sep 17 00:00:00 2001
From: Markus Metzger
Date: Tue, 21 Jul 2009 15:56:48 +0200
Subject: x86, perf_counter, bts: Add BTS support to perfcounters

Implement a performance counter with:

    attr.type          = PERF_TYPE_HARDWARE
    attr.config        = PERF_COUNT_HW_BRANCH_INSTRUCTIONS
    attr.sample_period = 1

using branch trace store (BTS) on x86 hardware, if available.

The from and to address for each branch can be sampled using:

    PERF_SAMPLE_IP   for the from address
    PERF_SAMPLE_ADDR for the to address

[ v2: address review feedback, fix bugs ]

Signed-off-by: Markus Metzger
Acked-by: Peter Zijlstra
Signed-off-by: Ingo Molnar
---
 kernel/perf_counter.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 546e62d62941..bf8110b35c51 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -88,6 +88,7 @@ void __weak hw_perf_disable(void)		{ barrier(); }
 void __weak hw_perf_enable(void)		{ barrier(); }
 
 void __weak hw_perf_counter_setup(int cpu)	{ barrier(); }
+void __weak hw_perf_counter_setup_online(int cpu)	{ barrier(); }
 
 int __weak
 hw_perf_group_sched_in(struct perf_counter *group_leader,
@@ -2630,7 +2631,7 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p
 	return task_pid_nr_ns(p, counter->ns);
 }
 
-static void perf_counter_output(struct perf_counter *counter, int nmi,
+void perf_counter_output(struct perf_counter *counter, int nmi,
 				struct perf_sample_data *data)
 {
 	int ret;
@@ -4566,6 +4567,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 		perf_counter_init_cpu(cpu);
 		break;
 
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+		hw_perf_counter_setup_online(cpu);
+		break;
+
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
 		perf_counter_exit_cpu(cpu);
@@ -4590,6 +4596,8 @@ void __init perf_counter_init(void)
 {
 	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
 			(void *)(long)smp_processor_id());
+	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
+			(void *)(long)smp_processor_id());
 	register_cpu_notifier(&perf_cpu_nb);
 }
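To make the intended usage concrete, here is a minimal user-space sketch (not part of the patch) of opening the counter the commit message describes. It targets the 2.6.31-era perf_counter ABI; the header path and the syscall number (__NR_perf_counter_open, 298 on x86-64 at the time) are assumptions that vary by architecture and tree:

    /*
     * Hedged sketch: a branch counter with sample_period = 1, so the kernel
     * can back it with BTS where available, sampling the from address
     * (PERF_SAMPLE_IP) and to address (PERF_SAMPLE_ADDR) of every branch.
     */
    #include <linux/perf_counter.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <sys/types.h>
    #include <unistd.h>

    static int open_bts_counter(pid_t pid)
    {
            struct perf_counter_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.size          = sizeof(attr);
            attr.type          = PERF_TYPE_HARDWARE;
            attr.config        = PERF_COUNT_HW_BRANCH_INSTRUCTIONS;
            attr.sample_period = 1;  /* sample every branch => BTS path */
            attr.sample_type   = PERF_SAMPLE_IP | PERF_SAMPLE_ADDR;

            /* args: (attr, pid, cpu, group_fd, flags); nr is arch-specific */
            return syscall(__NR_perf_counter_open, &attr, pid, -1, -1, 0);
    }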
From fa289beca9de9119c7760bd984f3640da21bc94c Mon Sep 17 00:00:00 2001
From: Paul Mackerras
Date: Tue, 25 Aug 2009 15:17:20 +1000
Subject: perf_counter: Start counting time enabled when group leader gets enabled

Currently, if a group is created where the group leader is initially
disabled but a non-leader member is initially enabled, and then the
leader is subsequently enabled some time later, the time_enabled for
the non-leader member will reflect the whole time since it was
created, not just the time since the leader was enabled.

This is incorrect, because all of the members are effectively disabled
while the leader is disabled, since none of the members can go on the
PMU if the leader can't.

Thus we have to update the ->tstamp_enabled for all the enabled group
members when a group leader is enabled, so that the time_enabled
computation only counts the time since the leader was enabled.

Similarly, when disabling a group leader we have to update the
time_enabled and time_running for all of the group members.

Also, in update_counter_times, we have to treat a counter whose group
leader is disabled as being disabled.

Reported-by: Stephane Eranian
Signed-off-by: Paul Mackerras
Acked-by: Peter Zijlstra
Cc:
LKML-Reference: <19091.29664.342227.445006@drongo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar
---
 kernel/perf_counter.c | 43 ++++++++++++++++++++++++++++++-------------
 1 file changed, 30 insertions(+), 13 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f274e1959885..06bf6a4f2608 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -469,7 +469,8 @@ static void update_counter_times(struct perf_counter *counter)
 	struct perf_counter_context *ctx = counter->ctx;
 	u64 run_end;
 
-	if (counter->state < PERF_COUNTER_STATE_INACTIVE)
+	if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
+	    counter->group_leader->state < PERF_COUNTER_STATE_INACTIVE)
 		return;
 
 	counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
@@ -518,7 +519,7 @@ static void __perf_counter_disable(void *info)
 	 */
 	if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
 		update_context_time(ctx);
-		update_counter_times(counter);
+		update_group_times(counter);
 		if (counter == counter->group_leader)
 			group_sched_out(counter, cpuctx, ctx);
 		else
@@ -573,7 +574,7 @@ static void perf_counter_disable(struct perf_counter *counter)
 	 * in, so we can change the state safely.
 	 */
 	if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
-		update_counter_times(counter);
+		update_group_times(counter);
 		counter->state = PERF_COUNTER_STATE_OFF;
 	}
 
@@ -850,6 +851,27 @@ retry:
 	spin_unlock_irq(&ctx->lock);
 }
 
+/*
+ * Put a counter into inactive state and update time fields.
+ * Enabling the leader of a group effectively enables all
+ * the group members that aren't explicitly disabled, so we
+ * have to update their ->tstamp_enabled also.
+ * Note: this works for group members as well as group leaders
+ * since the non-leader members' sibling_lists will be empty.
+ */
+static void __perf_counter_mark_enabled(struct perf_counter *counter,
+					struct perf_counter_context *ctx)
+{
+	struct perf_counter *sub;
+
+	counter->state = PERF_COUNTER_STATE_INACTIVE;
+	counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
+	list_for_each_entry(sub, &counter->sibling_list, list_entry)
+		if (sub->state >= PERF_COUNTER_STATE_INACTIVE)
+			sub->tstamp_enabled =
+				ctx->time - sub->total_time_enabled;
+}
+
 /*
  * Cross CPU call to enable a performance counter
  */
@@ -877,8 +899,7 @@ static void __perf_counter_enable(void *info)
 	if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
 		goto unlock;
 
-	counter->state = PERF_COUNTER_STATE_INACTIVE;
-	counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
+	__perf_counter_mark_enabled(counter, ctx);
 
 	/*
 	 * If the counter is in a group and isn't the group leader,
@@ -971,11 +992,9 @@ static void perf_counter_enable(struct perf_counter *counter)
 	 * Since we have the lock this context can't be scheduled
 	 * in, so we can change the state safely.
 	 */
-	if (counter->state == PERF_COUNTER_STATE_OFF) {
-		counter->state = PERF_COUNTER_STATE_INACTIVE;
-		counter->tstamp_enabled =
-			ctx->time - counter->total_time_enabled;
-	}
+	if (counter->state == PERF_COUNTER_STATE_OFF)
+		__perf_counter_mark_enabled(counter, ctx);
+
 out:
 	spin_unlock_irq(&ctx->lock);
 }
@@ -1479,9 +1498,7 @@ static void perf_counter_enable_on_exec(struct task_struct *task)
 		counter->attr.enable_on_exec = 0;
 		if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
 			continue;
-		counter->state = PERF_COUNTER_STATE_INACTIVE;
-		counter->tstamp_enabled =
-			ctx->time - counter->total_time_enabled;
+		__perf_counter_mark_enabled(counter, ctx);
 		enabled = 1;
 	}
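The arithmetic behind ->tstamp_enabled is worth spelling out: while a counter is enabled, total_time_enabled is derived as ctx->time - tstamp_enabled, so writing tstamp_enabled = ctx->time - total_time_enabled on (re-)enable preserves whatever time was already accumulated. A toy illustration of that invariant (not kernel code):

    struct toy_counter {
            unsigned long long total_time_enabled;  /* accumulated ns    */
            unsigned long long tstamp_enabled;      /* reference instant */
    };

    /* what __perf_counter_mark_enabled() does for each group member:
     * rewind the reference point so prior enabled time is preserved   */
    static void toy_mark_enabled(struct toy_counter *c, unsigned long long now)
    {
            c->tstamp_enabled = now - c->total_time_enabled;
    }

    /* what update_counter_times() computes while the counter is enabled */
    static void toy_update_times(struct toy_counter *c, unsigned long long now)
    {
            c->total_time_enabled = now - c->tstamp_enabled;
    }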
From a4be7c2778d1fd8e0a8a5607bec91fa221ab2c91 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Wed, 19 Aug 2009 11:18:27 +0200
Subject: perf_counter: Allow sharing of output channels

Provide the ability to configure a counter to send its output to
another (already existing) counter's output stream.

Signed-off-by: Peter Zijlstra
Cc: Arnaldo Carvalho de Melo
Cc: Frederic Weisbecker
Cc: Mike Galbraith
Cc: stephane eranian
Cc: Paul Mackerras
LKML-Reference: <20090819092023.980284148@chello.nl>
Signed-off-by: Ingo Molnar
---
 include/linux/perf_counter.h |  5 +++
 kernel/perf_counter.c        | 83 ++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 85 insertions(+), 3 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index b53f7006cc4e..e022b847c90d 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -216,6 +216,7 @@ struct perf_counter_attr {
 #define PERF_COUNTER_IOC_REFRESH	_IO ('$', 2)
 #define PERF_COUNTER_IOC_RESET		_IO ('$', 3)
 #define PERF_COUNTER_IOC_PERIOD	_IOW('$', 4, u64)
+#define PERF_COUNTER_IOC_SET_OUTPUT	_IO ('$', 5)
 
 enum perf_counter_ioc_flags {
 	PERF_IOC_FLAG_GROUP		= 1U << 0,
@@ -415,6 +416,9 @@ enum perf_callchain_context {
 	PERF_CONTEXT_MAX		= (__u64)-4095,
 };
 
+#define PERF_FLAG_FD_NO_GROUP	(1U << 0)
+#define PERF_FLAG_FD_OUTPUT	(1U << 1)
+
 #ifdef __KERNEL__
 /*
  * Kernel-internal data types and definitions:
@@ -536,6 +540,7 @@ struct perf_counter {
 	struct list_head		sibling_list;
 	int				nr_siblings;
 	struct perf_counter		*group_leader;
+	struct perf_counter		*output;
 	const struct pmu		*pmu;
 
 	enum perf_counter_active_state	state;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 06bf6a4f2608..53abcbefa0bf 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1692,6 +1692,11 @@ static void free_counter(struct perf_counter *counter)
 		atomic_dec(&nr_task_counters);
 	}
 
+	if (counter->output) {
+		fput(counter->output->filp);
+		counter->output = NULL;
+	}
+
 	if (counter->destroy)
 		counter->destroy(counter);
 
@@ -1977,6 +1982,8 @@ unlock:
 	return ret;
 }
 
+int perf_counter_set_output(struct perf_counter *counter, int output_fd);
+
 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct perf_counter *counter = file->private_data;
@@ -2000,6 +2007,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case PERF_COUNTER_IOC_PERIOD:
 		return perf_counter_period(counter, (u64 __user *)arg);
 
+	case PERF_COUNTER_IOC_SET_OUTPUT:
+		return perf_counter_set_output(counter, arg);
+
 	default:
 		return -ENOTTY;
 	}
@@ -2270,6 +2280,11 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 
 	WARN_ON_ONCE(counter->ctx->parent_ctx);
 	mutex_lock(&counter->mmap_mutex);
+	if (counter->output) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
 	if (atomic_inc_not_zero(&counter->mmap_count)) {
 		if (nr_pages != counter->data->nr_pages)
 			ret = -EINVAL;
@@ -2655,6 +2670,7 @@ static int perf_output_begin(struct perf_output_handle *handle,
 			     struct perf_counter *counter, unsigned int size,
 			     int nmi, int sample)
 {
+	struct perf_counter *output_counter;
 	struct perf_mmap_data *data;
 	unsigned int offset, head;
 	int have_lost;
@@ -2664,13 +2680,17 @@ static int perf_output_begin(struct perf_output_handle *handle,
 		u64			 lost;
 	} lost_event;
 
+	rcu_read_lock();
 	/*
 	 * For inherited counters we send all the output towards the parent.
 	 */
 	if (counter->parent)
 		counter = counter->parent;
 
-	rcu_read_lock();
+	output_counter = rcu_dereference(counter->output);
+	if (output_counter)
+		counter = output_counter;
+
 	data = rcu_dereference(counter->data);
 	if (!data)
 		goto out;
@@ -4218,6 +4238,57 @@ err_size:
 	goto out;
 }
 
+int perf_counter_set_output(struct perf_counter *counter, int output_fd)
+{
+	struct perf_counter *output_counter = NULL;
+	struct file *output_file = NULL;
+	struct perf_counter *old_output;
+	int fput_needed = 0;
+	int ret = -EINVAL;
+
+	if (!output_fd)
+		goto set;
+
+	output_file = fget_light(output_fd, &fput_needed);
+	if (!output_file)
+		return -EBADF;
+
+	if (output_file->f_op != &perf_fops)
+		goto out;
+
+	output_counter = output_file->private_data;
+
+	/* Don't chain output fds */
+	if (output_counter->output)
+		goto out;
+
+	/* Don't set an output fd when we already have an output channel */
+	if (counter->data)
+		goto out;
+
+	atomic_long_inc(&output_file->f_count);
+
+set:
+	mutex_lock(&counter->mmap_mutex);
+	old_output = counter->output;
+	rcu_assign_pointer(counter->output, output_counter);
+	mutex_unlock(&counter->mmap_mutex);
+
+	if (old_output) {
+		/*
+		 * we need to make sure no existing perf_output_*()
+		 * is still referencing this counter.
+		 */
+		synchronize_rcu();
+		fput(old_output->filp);
+	}
+
+	ret = 0;
+out:
+	fput_light(output_file, fput_needed);
+	return ret;
+}
+
 /**
  * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
  *
@@ -4240,7 +4311,7 @@ SYSCALL_DEFINE5(perf_counter_open,
 	int ret;
 
 	/* for future expandability... */
-	if (flags)
+	if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
 		return -EINVAL;
 
 	ret = perf_copy_attr(attr_uptr, &attr);
@@ -4268,7 +4339,7 @@ SYSCALL_DEFINE5(perf_counter_open,
 	 * Look up the group leader (we will attach this counter to it):
 	 */
 	group_leader = NULL;
-	if (group_fd != -1) {
+	if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
 		ret = -EINVAL;
 		group_file = fget_light(group_fd, &fput_needed);
 		if (!group_file)
 			goto err_put_context;
@@ -4310,6 +4381,12 @@ SYSCALL_DEFINE5(perf_counter_open,
 	if (!counter_file)
 		goto err_free_put_context;
 
+	if (flags & PERF_FLAG_FD_OUTPUT) {
+		ret = perf_counter_set_output(counter, group_fd);
+		if (ret)
+			goto err_free_put_context;
+	}
+
 	counter->filp = counter_file;
 	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
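The patch adds two user-visible ways to share a ring buffer: the new ioctl on an existing counter, and the PERF_FLAG_FD_OUTPUT flag at open time (where group_fd doubles as the output fd). A hedged user-space sketch of both, against the same era's ABI — the syscall number and headers are assumptions, and error handling is trimmed:

    #include <linux/perf_counter.h>
    #include <sys/ioctl.h>
    #include <sys/syscall.h>
    #include <sys/types.h>
    #include <unistd.h>

    /* route this counter's samples into the buffer mmap()ed on leader_fd */
    static int open_redirected(struct perf_counter_attr *attr, pid_t pid,
                               int leader_fd)
    {
            int fd;

            /* path 1: open normally, then redirect via the new ioctl */
            fd = syscall(__NR_perf_counter_open, attr, pid, -1, -1, 0);
            if (fd < 0)
                    return fd;
            if (ioctl(fd, PERF_COUNTER_IOC_SET_OUTPUT, leader_fd) < 0) {
                    close(fd);
                    return -1;
            }
            return fd;

            /*
             * path 2 (equivalent, at open time): pass the output fd in the
             * group_fd slot, with PERF_FLAG_FD_NO_GROUP so it is not also
             * taken as the group leader:
             *
             *   syscall(__NR_perf_counter_open, attr, pid, -1, leader_fd,
             *           PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT);
             */
    }

Note that a counter whose output has been redirected refuses mmap() (the -EINVAL in perf_mmap above), and chaining of output fds is rejected, so the topology stays a simple star around one mmap()ed counter.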
From 0fbdea19e9394a5cb5f2f5081b028c50b558910a Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Wed, 2 Sep 2009 21:46:00 +0200
Subject: perf_counter: Introduce new (non-)paranoia level to allow raw tracepoint access

I want to sample inherited tracepoint workloads as a normal user
and the CAP_SYS_ADMIN check prevents me from doing that right now.

Cc: Peter Zijlstra
Cc: Mike Galbraith
Cc: Paul Mackerras
Cc: Arnaldo Carvalho de Melo
Cc: Frederic Weisbecker
LKML-Reference:
Signed-off-by: Ingo Molnar
---
 kernel/perf_counter.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d988dfb4bbab..0aa609f69103 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -46,12 +46,18 @@ static atomic_t nr_task_counters __read_mostly;
 
 /*
  * perf counter paranoia level:
- *  0 - not paranoid
- *  1 - disallow cpu counters to unpriv
- *  2 - disallow kernel profiling to unpriv
+ * -1 - not paranoid at all
+ *  0 - disallow raw tracepoint access for unpriv
+ *  1 - disallow cpu counters for unpriv
+ *  2 - disallow kernel profiling for unpriv
  */
 int sysctl_perf_counter_paranoid __read_mostly = 1;
 
+static inline bool perf_paranoid_tracepoint_raw(void)
+{
+	return sysctl_perf_counter_paranoid > -1;
+}
+
 static inline bool perf_paranoid_cpu(void)
 {
 	return sysctl_perf_counter_paranoid > 0;
@@ -3971,6 +3977,7 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
 	 * have these.
 	 */
 	if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
+			perf_paranoid_tracepoint_raw() &&
 			!capable(CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
 
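Since the patch makes level 0 (the old "not paranoid") still deny raw tracepoint data, unprivileged raw tracepoint sampling now requires dropping the sysctl to -1. A small sketch for probing the level from user space — the /proc path mirrors this era's sysctl name and is an assumption (later kernels renamed it to perf_event_paranoid):

    #include <stdio.h>

    static int perf_paranoia_level(void)
    {
            /* path assumed from the perf_counter-era sysctl name */
            FILE *f = fopen("/proc/sys/kernel/perf_counter_paranoid", "r");
            int level = 2;  /* treat unreadable as most restrictive */

            if (f) {
                    if (fscanf(f, "%d", &level) != 1)
                            level = 2;
                    fclose(f);
            }
            /* -1: anything goes; 0: no unpriv raw tracepoints;
             *  1: no unpriv cpu counters; 2: no unpriv kernel profiling */
            return level;
    }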
From dc86cabe4b242446ea9aa8492c727e1220817898 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Thu, 3 Sep 2009 18:03:00 +0200
Subject: perf_counter: Fix output-sharing error path

We forget to release the fd in the PERF_FLAG_FD_OUTPUT error path.

Reorganize the error flow here to be clean fall-through logic.

Cc: Peter Zijlstra
Cc: Mike Galbraith
Cc: Paul Mackerras
Cc: Arnaldo Carvalho de Melo
Cc: Frederic Weisbecker
LKML-Reference:
Signed-off-by: Ingo Molnar
---
 kernel/perf_counter.c | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 0aa609f69103..e0d91fdf0c3c 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -4316,15 +4316,15 @@ SYSCALL_DEFINE5(perf_counter_open,
 	struct file *group_file = NULL;
 	int fput_needed = 0;
 	int fput_needed2 = 0;
-	int ret;
+	int err;
 
 	/* for future expandability... */
 	if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
 		return -EINVAL;
 
-	ret = perf_copy_attr(attr_uptr, &attr);
-	if (ret)
-		return ret;
+	err = perf_copy_attr(attr_uptr, &attr);
+	if (err)
+		return err;
 
 	if (!attr.exclude_kernel) {
 		if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
@@ -4348,7 +4348,7 @@ SYSCALL_DEFINE5(perf_counter_open,
 	 */
 	group_leader = NULL;
 	if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
-		ret = -EINVAL;
+		err = -EINVAL;
 		group_file = fget_light(group_fd, &fput_needed);
 		if (!group_file)
 			goto err_put_context;
@@ -4377,22 +4377,22 @@ SYSCALL_DEFINE5(perf_counter_open,
 
 	counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
 				     NULL, GFP_KERNEL);
-	ret = PTR_ERR(counter);
+	err = PTR_ERR(counter);
 	if (IS_ERR(counter))
 		goto err_put_context;
 
-	ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
-	if (ret < 0)
+	err = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
+	if (err < 0)
 		goto err_free_put_context;
 
-	counter_file = fget_light(ret, &fput_needed2);
+	counter_file = fget_light(err, &fput_needed2);
 	if (!counter_file)
 		goto err_free_put_context;
 
 	if (flags & PERF_FLAG_FD_OUTPUT) {
-		ret = perf_counter_set_output(counter, group_fd);
-		if (ret)
-			goto err_free_put_context;
+		err = perf_counter_set_output(counter, group_fd);
+		if (err)
+			goto err_fput_free_put_context;
 	}
 
 	counter->filp = counter_file;
@@ -4408,20 +4408,20 @@ SYSCALL_DEFINE5(perf_counter_open,
 	list_add_tail(&counter->owner_entry, &current->perf_counter_list);
 	mutex_unlock(&current->perf_counter_mutex);
 
+err_fput_free_put_context:
 	fput_light(counter_file, fput_needed2);
 
-out_fput:
-	fput_light(group_file, fput_needed);
-
-	return ret;
-
 err_free_put_context:
-	kfree(counter);
+	if (err < 0)
+		kfree(counter);
 
 err_put_context:
-	put_ctx(ctx);
+	if (err < 0)
+		put_ctx(ctx);
+
+	fput_light(group_file, fput_needed);
 
-	goto out_fput;
+	return err;
 }
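The reorganized exit path is an instance of the kernel's fall-through error idiom: each label undoes one more acquisition, and the `if (err < 0)` guards let the success path run through the very same tail (the final fput_light() and return). A generic toy illustration, not kernel code:

    #include <errno.h>
    #include <stdlib.h>

    static int toy_open(void)
    {
            void *ctx, *counter = NULL;
            int err;

            ctx = malloc(32);               /* step 1: acquire context   */
            if (!ctx)
                    return -ENOMEM;

            counter = malloc(64);           /* step 2: allocate counter  */
            err = counter ? 42 /* pretend this is the new fd */ : -ENOMEM;
            if (err < 0)
                    goto err_free_counter;

            /* success: fall through the same tail as the error paths */
    err_free_counter:
            if (err < 0)                    /* unwind only on failure;   */
                    free(counter);          /* on success the objects    */
            if (err < 0)                    /* stay live, owned by the   */
                    free(ctx);              /* returned fd in the real   */
                                            /* syscall                   */
            return err;                     /* fd on success, -errno on failure */
    }

The design choice mirrors the patch exactly: once every exit flows through one chain, it becomes impossible to forget a release (here, the group_file fput that the old out_fput/goto structure had let the FD_OUTPUT error path bypass).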