diff options
Diffstat (limited to 'kernel/events/core.c')
-rw-r--r-- | kernel/events/core.c | 300 |
1 files changed, 146 insertions, 154 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c index eacc3702654d..7099c77bc53b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -380,7 +380,6 @@ enum event_type_t { /* * perf_sched_events : >0 events exist - * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu */ static void perf_sched_delayed(struct work_struct *work); @@ -389,7 +388,6 @@ static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed); static DEFINE_MUTEX(perf_sched_mutex); static atomic_t perf_sched_count; -static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); static atomic_t nr_mmap_events __read_mostly; @@ -844,9 +842,16 @@ static void perf_cgroup_switch(struct task_struct *task) struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_cgroup *cgrp; - cgrp = perf_cgroup_from_task(task, NULL); + /* + * cpuctx->cgrp is set when the first cgroup event enabled, + * and is cleared when the last cgroup event disabled. + */ + if (READ_ONCE(cpuctx->cgrp) == NULL) + return; WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); + + cgrp = perf_cgroup_from_task(task, NULL); if (READ_ONCE(cpuctx->cgrp) == cgrp) return; @@ -3631,8 +3636,7 @@ void __perf_event_task_sched_out(struct task_struct *task, * to check if we have to switch out PMU state. * cgroup event are system-wide mode only */ - if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) - perf_cgroup_switch(next); + perf_cgroup_switch(next); } static bool perf_less_group_idx(const void *l, const void *r) @@ -4809,19 +4813,17 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu); epc = &cpc->epc; - + raw_spin_lock_irq(&ctx->lock); if (!epc->ctx) { atomic_set(&epc->refcount, 1); epc->embedded = 1; - raw_spin_lock_irq(&ctx->lock); list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); epc->ctx = ctx; - raw_spin_unlock_irq(&ctx->lock); } else { WARN_ON_ONCE(epc->ctx != ctx); atomic_inc(&epc->refcount); } - + raw_spin_unlock_irq(&ctx->lock); return epc; } @@ -4892,33 +4894,30 @@ static void free_epc_rcu(struct rcu_head *head) static void put_pmu_ctx(struct perf_event_pmu_context *epc) { + struct perf_event_context *ctx = epc->ctx; unsigned long flags; - if (!atomic_dec_and_test(&epc->refcount)) + /* + * XXX + * + * lockdep_assert_held(&ctx->mutex); + * + * can't because of the call-site in _free_event()/put_event() + * which isn't always called under ctx->mutex. + */ + if (!atomic_dec_and_raw_lock_irqsave(&epc->refcount, &ctx->lock, flags)) return; - if (epc->ctx) { - struct perf_event_context *ctx = epc->ctx; - - /* - * XXX - * - * lockdep_assert_held(&ctx->mutex); - * - * can't because of the call-site in _free_event()/put_event() - * which isn't always called under ctx->mutex. - */ + WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry)); - WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry)); - raw_spin_lock_irqsave(&ctx->lock, flags); - list_del_init(&epc->pmu_ctx_entry); - epc->ctx = NULL; - raw_spin_unlock_irqrestore(&ctx->lock, flags); - } + list_del_init(&epc->pmu_ctx_entry); + epc->ctx = NULL; WARN_ON_ONCE(!list_empty(&epc->pinned_active)); WARN_ON_ONCE(!list_empty(&epc->flexible_active)); + raw_spin_unlock_irqrestore(&ctx->lock, flags); + if (epc->embedded) return; @@ -4974,15 +4973,6 @@ static void unaccount_pmu_sb_event(struct perf_event *event) detach_sb_event(event); } -static void unaccount_event_cpu(struct perf_event *event, int cpu) -{ - if (event->parent) - return; - - if (is_cgroup_event(event)) - atomic_dec(&per_cpu(perf_cgroup_events, cpu)); -} - #ifdef CONFIG_NO_HZ_FULL static DEFINE_SPINLOCK(nr_freq_lock); #endif @@ -5048,8 +5038,6 @@ static void unaccount_event(struct perf_event *event) schedule_delayed_work(&perf_sched_work, HZ); } - unaccount_event_cpu(event, event->cpu); - unaccount_pmu_sb_event(event); } @@ -7053,13 +7041,20 @@ out_put: ring_buffer_put(rb); } -static void __perf_event_header__init_id(struct perf_event_header *header, - struct perf_sample_data *data, +/* + * A set of common sample data types saved even for non-sample records + * when event->attr.sample_id_all is set. + */ +#define PERF_SAMPLE_ID_ALL (PERF_SAMPLE_TID | PERF_SAMPLE_TIME | \ + PERF_SAMPLE_ID | PERF_SAMPLE_STREAM_ID | \ + PERF_SAMPLE_CPU | PERF_SAMPLE_IDENTIFIER) + +static void __perf_event_header__init_id(struct perf_sample_data *data, struct perf_event *event, u64 sample_type) { data->type = event->attr.sample_type; - header->size += event->id_header_size; + data->sample_flags |= data->type & PERF_SAMPLE_ID_ALL; if (sample_type & PERF_SAMPLE_TID) { /* namespace issues */ @@ -7086,8 +7081,10 @@ void perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event) { - if (event->attr.sample_id_all) - __perf_event_header__init_id(header, data, event, event->attr.sample_type); + if (event->attr.sample_id_all) { + header->size += event->id_header_size; + __perf_event_header__init_id(data, event, event->attr.sample_type); + } } static void __perf_event__output_id_sample(struct perf_output_handle *handle, @@ -7317,7 +7314,7 @@ void perf_output_sample(struct perf_output_handle *handle, } if (sample_type & PERF_SAMPLE_BRANCH_STACK) { - if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) { + if (data->br_stack) { size_t size; size = data->br_stack->nr @@ -7561,83 +7558,68 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) return callchain ?: &__empty_callchain; } -void perf_prepare_sample(struct perf_event_header *header, - struct perf_sample_data *data, +static __always_inline u64 __cond_set(u64 flags, u64 s, u64 d) +{ + return d * !!(flags & s); +} + +void perf_prepare_sample(struct perf_sample_data *data, struct perf_event *event, struct pt_regs *regs) { u64 sample_type = event->attr.sample_type; u64 filtered_sample_type; - header->type = PERF_RECORD_SAMPLE; - header->size = sizeof(*header) + event->header_size; - - header->misc = 0; - header->misc |= perf_misc_flags(regs); - /* - * Clear the sample flags that have already been done by the - * PMU driver. + * Add the sample flags that are dependent to others. And clear the + * sample flags that have already been done by the PMU driver. */ - filtered_sample_type = sample_type & ~data->sample_flags; - __perf_event_header__init_id(header, data, event, filtered_sample_type); - - if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE)) - data->ip = perf_instruction_pointer(regs); + filtered_sample_type = sample_type; + filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_CODE_PAGE_SIZE, + PERF_SAMPLE_IP); + filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_DATA_PAGE_SIZE | + PERF_SAMPLE_PHYS_ADDR, PERF_SAMPLE_ADDR); + filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_STACK_USER, + PERF_SAMPLE_REGS_USER); + filtered_sample_type &= ~data->sample_flags; - if (sample_type & PERF_SAMPLE_CALLCHAIN) { - int size = 1; - - if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN) - data->callchain = perf_callchain(event, regs); - - size += data->callchain->nr; - - header->size += size * sizeof(u64); + if (filtered_sample_type == 0) { + /* Make sure it has the correct data->type for output */ + data->type = event->attr.sample_type; + return; } - if (sample_type & PERF_SAMPLE_RAW) { - struct perf_raw_record *raw = data->raw; - int size; - - if (raw && (data->sample_flags & PERF_SAMPLE_RAW)) { - struct perf_raw_frag *frag = &raw->frag; - u32 sum = 0; + __perf_event_header__init_id(data, event, filtered_sample_type); - do { - sum += frag->size; - if (perf_raw_frag_last(frag)) - break; - frag = frag->next; - } while (1); + if (filtered_sample_type & PERF_SAMPLE_IP) { + data->ip = perf_instruction_pointer(regs); + data->sample_flags |= PERF_SAMPLE_IP; + } - size = round_up(sum + sizeof(u32), sizeof(u64)); - raw->size = size - sizeof(u32); - frag->pad = raw->size - sum; - } else { - size = sizeof(u64); - data->raw = NULL; - } + if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN) + perf_sample_save_callchain(data, event, regs); - header->size += size; + if (filtered_sample_type & PERF_SAMPLE_RAW) { + data->raw = NULL; + data->dyn_size += sizeof(u64); + data->sample_flags |= PERF_SAMPLE_RAW; } - if (sample_type & PERF_SAMPLE_BRANCH_STACK) { - int size = sizeof(u64); /* nr */ - if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) { - if (branch_sample_hw_index(event)) - size += sizeof(u64); - - size += data->br_stack->nr - * sizeof(struct perf_branch_entry); - } - header->size += size; + if (filtered_sample_type & PERF_SAMPLE_BRANCH_STACK) { + data->br_stack = NULL; + data->dyn_size += sizeof(u64); + data->sample_flags |= PERF_SAMPLE_BRANCH_STACK; } - if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) + if (filtered_sample_type & PERF_SAMPLE_REGS_USER) perf_sample_regs_user(&data->regs_user, regs); - if (sample_type & PERF_SAMPLE_REGS_USER) { + /* + * It cannot use the filtered_sample_type here as REGS_USER can be set + * by STACK_USER (using __cond_set() above) and we don't want to update + * the dyn_size if it's not requested by users. + */ + if ((sample_type & ~data->sample_flags) & PERF_SAMPLE_REGS_USER) { /* regs dump ABI info */ int size = sizeof(u64); @@ -7646,10 +7628,11 @@ void perf_prepare_sample(struct perf_event_header *header, size += hweight64(mask) * sizeof(u64); } - header->size += size; + data->dyn_size += size; + data->sample_flags |= PERF_SAMPLE_REGS_USER; } - if (sample_type & PERF_SAMPLE_STACK_USER) { + if (filtered_sample_type & PERF_SAMPLE_STACK_USER) { /* * Either we need PERF_SAMPLE_STACK_USER bit to be always * processed as the last one or have additional check added @@ -7657,9 +7640,10 @@ void perf_prepare_sample(struct perf_event_header *header, * up the rest of the sample size. */ u16 stack_size = event->attr.sample_stack_user; + u16 header_size = perf_sample_data_size(data, event); u16 size = sizeof(u64); - stack_size = perf_sample_ustack_size(stack_size, header->size, + stack_size = perf_sample_ustack_size(stack_size, header_size, data->regs_user.regs); /* @@ -7671,24 +7655,31 @@ void perf_prepare_sample(struct perf_event_header *header, size += sizeof(u64) + stack_size; data->stack_user_size = stack_size; - header->size += size; + data->dyn_size += size; + data->sample_flags |= PERF_SAMPLE_STACK_USER; } - if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) + if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) { data->weight.full = 0; + data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE; + } - if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) + if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) { data->data_src.val = PERF_MEM_NA; + data->sample_flags |= PERF_SAMPLE_DATA_SRC; + } - if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) + if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) { data->txn = 0; + data->sample_flags |= PERF_SAMPLE_TRANSACTION; + } - if (sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_DATA_PAGE_SIZE)) { - if (filtered_sample_type & PERF_SAMPLE_ADDR) - data->addr = 0; + if (filtered_sample_type & PERF_SAMPLE_ADDR) { + data->addr = 0; + data->sample_flags |= PERF_SAMPLE_ADDR; } - if (sample_type & PERF_SAMPLE_REGS_INTR) { + if (filtered_sample_type & PERF_SAMPLE_REGS_INTR) { /* regs dump ABI info */ int size = sizeof(u64); @@ -7700,20 +7691,23 @@ void perf_prepare_sample(struct perf_event_header *header, size += hweight64(mask) * sizeof(u64); } - header->size += size; + data->dyn_size += size; + data->sample_flags |= PERF_SAMPLE_REGS_INTR; } - if (sample_type & PERF_SAMPLE_PHYS_ADDR && - filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) + if (filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) { data->phys_addr = perf_virt_to_phys(data->addr); + data->sample_flags |= PERF_SAMPLE_PHYS_ADDR; + } #ifdef CONFIG_CGROUP_PERF - if (sample_type & PERF_SAMPLE_CGROUP) { + if (filtered_sample_type & PERF_SAMPLE_CGROUP) { struct cgroup *cgrp; /* protected by RCU */ cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup; data->cgroup = cgroup_id(cgrp); + data->sample_flags |= PERF_SAMPLE_CGROUP; } #endif @@ -7722,16 +7716,21 @@ void perf_prepare_sample(struct perf_event_header *header, * require PERF_SAMPLE_ADDR, kernel implicitly retrieve the data->addr, * but the value will not dump to the userspace. */ - if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) + if (filtered_sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) { data->data_page_size = perf_get_page_size(data->addr); + data->sample_flags |= PERF_SAMPLE_DATA_PAGE_SIZE; + } - if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) + if (filtered_sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) { data->code_page_size = perf_get_page_size(data->ip); + data->sample_flags |= PERF_SAMPLE_CODE_PAGE_SIZE; + } - if (sample_type & PERF_SAMPLE_AUX) { + if (filtered_sample_type & PERF_SAMPLE_AUX) { u64 size; + u16 header_size = perf_sample_data_size(data, event); - header->size += sizeof(u64); /* size */ + header_size += sizeof(u64); /* size */ /* * Given the 16bit nature of header::size, an AUX sample can @@ -7739,14 +7738,26 @@ void perf_prepare_sample(struct perf_event_header *header, * Make sure this doesn't happen by using up to U16_MAX bytes * per sample in total (rounded down to 8 byte boundary). */ - size = min_t(size_t, U16_MAX - header->size, + size = min_t(size_t, U16_MAX - header_size, event->attr.aux_sample_size); size = rounddown(size, 8); size = perf_prepare_sample_aux(event, data, size); - WARN_ON_ONCE(size + header->size > U16_MAX); - header->size += size; + WARN_ON_ONCE(size + header_size > U16_MAX); + data->dyn_size += size + sizeof(u64); /* size above */ + data->sample_flags |= PERF_SAMPLE_AUX; } +} + +void perf_prepare_header(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event, + struct pt_regs *regs) +{ + header->type = PERF_RECORD_SAMPLE; + header->size = perf_sample_data_size(data, event); + header->misc = perf_misc_flags(regs); + /* * If you're adding more sample types here, you likely need to do * something about the overflowing header::size, like repurpose the @@ -7774,7 +7785,8 @@ __perf_event_output(struct perf_event *event, /* protect the callchain buffers */ rcu_read_lock(); - perf_prepare_sample(&header, data, event, regs); + perf_prepare_sample(data, event, regs); + perf_prepare_header(&header, data, event, regs); err = output_begin(&handle, data, event, header.size); if (err) @@ -10132,8 +10144,7 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, }; perf_sample_data_init(&data, 0, 0); - data.raw = &raw; - data.sample_flags |= PERF_SAMPLE_RAW; + perf_sample_save_raw_data(&data, &raw); perf_trace_buf_update(record, event_type); @@ -10340,13 +10351,7 @@ static void bpf_overflow_handler(struct perf_event *event, rcu_read_lock(); prog = READ_ONCE(event->prog); if (prog) { - if (prog->call_get_stack && - (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) && - !(data->sample_flags & PERF_SAMPLE_CALLCHAIN)) { - data->callchain = perf_callchain(event, regs); - data->sample_flags |= PERF_SAMPLE_CALLCHAIN; - } - + perf_prepare_sample(data, event, regs); ret = bpf_prog_run(prog, &ctx); } rcu_read_unlock(); @@ -11679,15 +11684,6 @@ static void account_pmu_sb_event(struct perf_event *event) attach_sb_event(event); } -static void account_event_cpu(struct perf_event *event, int cpu) -{ - if (event->parent) - return; - - if (is_cgroup_event(event)) - atomic_inc(&per_cpu(perf_cgroup_events, cpu)); -} - /* Freq events need the tick to stay alive (see perf_event_task_tick). */ static void account_freq_event_nohz(void) { @@ -11775,8 +11771,6 @@ static void account_event(struct perf_event *event) } enabled: - account_event_cpu(event, event->cpu); - account_pmu_sb_event(event); } @@ -12339,12 +12333,12 @@ SYSCALL_DEFINE5(perf_event_open, if (flags & ~PERF_FLAG_ALL) return -EINVAL; - /* Do we allow access to perf_event_open(2) ? */ - err = security_perf_event_open(&attr, PERF_SECURITY_OPEN); + err = perf_copy_attr(attr_uptr, &attr); if (err) return err; - err = perf_copy_attr(attr_uptr, &attr); + /* Do we allow access to perf_event_open(2) ? */ + err = security_perf_event_open(&attr, PERF_SECURITY_OPEN); if (err) return err; @@ -12689,7 +12683,8 @@ SYSCALL_DEFINE5(perf_event_open, return event_fd; err_context: - /* event->pmu_ctx freed by free_event() */ + put_pmu_ctx(event->pmu_ctx); + event->pmu_ctx = NULL; /* _free_event() */ err_locked: mutex_unlock(&ctx->mutex); perf_unpin_context(ctx); @@ -12802,6 +12797,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, err_pmu_ctx: put_pmu_ctx(pmu_ctx); + event->pmu_ctx = NULL; /* _free_event() */ err_unlock: mutex_unlock(&ctx->mutex); perf_unpin_context(ctx); @@ -12822,13 +12818,11 @@ static void __perf_pmu_remove(struct perf_event_context *ctx, perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) { perf_remove_from_context(event, 0); - unaccount_event_cpu(event, cpu); put_pmu_ctx(event->pmu_ctx); list_add(&event->migrate_entry, events); for_each_sibling_event(sibling, event) { perf_remove_from_context(sibling, 0); - unaccount_event_cpu(sibling, cpu); put_pmu_ctx(sibling->pmu_ctx); list_add(&sibling->migrate_entry, events); } @@ -12847,7 +12841,6 @@ static void __perf_pmu_install_event(struct pmu *pmu, if (event->state >= PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_INACTIVE; - account_event_cpu(event, cpu); perf_install_in_context(ctx, event, cpu); } @@ -13231,7 +13224,7 @@ inherit_event(struct perf_event *parent_event, pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event); if (IS_ERR(pmu_ctx)) { free_event(child_event); - return NULL; + return ERR_CAST(pmu_ctx); } child_event->pmu_ctx = pmu_ctx; @@ -13742,8 +13735,7 @@ static int __perf_cgroup_move(void *info) struct task_struct *task = info; preempt_disable(); - if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) - perf_cgroup_switch(task); + perf_cgroup_switch(task); preempt_enable(); return 0; |