aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/kernel/events/core.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/events/core.c')
-rw-r--r--kernel/events/core.c592
1 files changed, 454 insertions, 138 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e453589da97c..bc9b98a9af9a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -28,6 +28,7 @@
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
+#include <linux/hugetlb.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
@@ -49,6 +50,7 @@
#include <linux/sched/mm.h>
#include <linux/proc_ns.h>
#include <linux/mount.h>
+#include <linux/min_heap.h>
#include "internal.h"
@@ -386,6 +388,7 @@ static atomic_t nr_freq_events __read_mostly;
static atomic_t nr_switch_events __read_mostly;
static atomic_t nr_ksymbol_events __read_mostly;
static atomic_t nr_bpf_events __read_mostly;
+static atomic_t nr_cgroup_events __read_mostly;
static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
@@ -891,6 +894,47 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev,
rcu_read_unlock();
}
+static int perf_cgroup_ensure_storage(struct perf_event *event,
+ struct cgroup_subsys_state *css)
+{
+ struct perf_cpu_context *cpuctx;
+ struct perf_event **storage;
+ int cpu, heap_size, ret = 0;
+
+ /*
+ * Allow storage to have sufficent space for an iterator for each
+ * possibly nested cgroup plus an iterator for events with no cgroup.
+ */
+ for (heap_size = 1; css; css = css->parent)
+ heap_size++;
+
+ for_each_possible_cpu(cpu) {
+ cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu);
+ if (heap_size <= cpuctx->heap_size)
+ continue;
+
+ storage = kmalloc_node(heap_size * sizeof(struct perf_event *),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!storage) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ raw_spin_lock_irq(&cpuctx->ctx.lock);
+ if (cpuctx->heap_size < heap_size) {
+ swap(cpuctx->heap, storage);
+ if (storage == cpuctx->heap_default)
+ storage = NULL;
+ cpuctx->heap_size = heap_size;
+ }
+ raw_spin_unlock_irq(&cpuctx->ctx.lock);
+
+ kfree(storage);
+ }
+
+ return ret;
+}
+
static inline int perf_cgroup_connect(int fd, struct perf_event *event,
struct perf_event_attr *attr,
struct perf_event *group_leader)
@@ -910,6 +954,10 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
goto out;
}
+ ret = perf_cgroup_ensure_storage(event, css);
+ if (ret)
+ goto out;
+
cgrp = container_of(css, struct perf_cgroup, css);
event->cgrp = cgrp;
@@ -935,16 +983,10 @@ perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
event->shadow_ctx_time = now - t->timestamp;
}
-/*
- * Update cpuctx->cgrp so that it is set when first cgroup event is added and
- * cleared when last cgroup event is removed.
- */
static inline void
-list_update_cgroup_event(struct perf_event *event,
- struct perf_event_context *ctx, bool add)
+perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
{
struct perf_cpu_context *cpuctx;
- struct list_head *cpuctx_entry;
if (!is_cgroup_event(event))
return;
@@ -961,28 +1003,41 @@ list_update_cgroup_event(struct perf_event *event,
* because if the first would mismatch, the second would not try again
* and we would leave cpuctx->cgrp unset.
*/
- if (add && !cpuctx->cgrp) {
+ if (ctx->is_active && !cpuctx->cgrp) {
struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
cpuctx->cgrp = cgrp;
}
- if (add && ctx->nr_cgroups++)
+ if (ctx->nr_cgroups++)
+ return;
+
+ list_add(&cpuctx->cgrp_cpuctx_entry,
+ per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
+}
+
+static inline void
+perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
+{
+ struct perf_cpu_context *cpuctx;
+
+ if (!is_cgroup_event(event))
return;
- else if (!add && --ctx->nr_cgroups)
+
+ /*
+ * Because cgroup events are always per-cpu events,
+ * @ctx == &cpuctx->ctx.
+ */
+ cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
+
+ if (--ctx->nr_cgroups)
return;
- /* no cgroup running */
- if (!add)
+ if (ctx->is_active && cpuctx->cgrp)
cpuctx->cgrp = NULL;
- cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
- if (add)
- list_add(cpuctx_entry,
- per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
- else
- list_del(cpuctx_entry);
+ list_del(&cpuctx->cgrp_cpuctx_entry);
}
#else /* !CONFIG_CGROUP_PERF */
@@ -1048,11 +1103,14 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event)
}
static inline void
-list_update_cgroup_event(struct perf_event *event,
- struct perf_event_context *ctx, bool add)
+perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
{
}
+static inline void
+perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
+{
+}
#endif
/*
@@ -1249,7 +1307,7 @@ static void put_ctx(struct perf_event_context *ctx)
* function.
*
* Lock order:
- * cred_guard_mutex
+ * exec_update_mutex
* task_struct::perf_event_mutex
* perf_event_context::mutex
* perf_event::child_mutex;
@@ -1531,6 +1589,30 @@ perf_event_groups_less(struct perf_event *left, struct perf_event *right)
if (left->cpu > right->cpu)
return false;
+#ifdef CONFIG_CGROUP_PERF
+ if (left->cgrp != right->cgrp) {
+ if (!left->cgrp || !left->cgrp->css.cgroup) {
+ /*
+ * Left has no cgroup but right does, no cgroups come
+ * first.
+ */
+ return true;
+ }
+ if (!right->cgrp || !right->cgrp->css.cgroup) {
+ /*
+ * Right has no cgroup but left does, no cgroups come
+ * first.
+ */
+ return false;
+ }
+ /* Two dissimilar cgroups, order by id. */
+ if (left->cgrp->css.cgroup->kn->id < right->cgrp->css.cgroup->kn->id)
+ return true;
+
+ return false;
+ }
+#endif
+
if (left->group_index < right->group_index)
return true;
if (left->group_index > right->group_index)
@@ -1610,25 +1692,48 @@ del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
}
/*
- * Get the leftmost event in the @cpu subtree.
+ * Get the leftmost event in the cpu/cgroup subtree.
*/
static struct perf_event *
-perf_event_groups_first(struct perf_event_groups *groups, int cpu)
+perf_event_groups_first(struct perf_event_groups *groups, int cpu,
+ struct cgroup *cgrp)
{
struct perf_event *node_event = NULL, *match = NULL;
struct rb_node *node = groups->tree.rb_node;
+#ifdef CONFIG_CGROUP_PERF
+ u64 node_cgrp_id, cgrp_id = 0;
+
+ if (cgrp)
+ cgrp_id = cgrp->kn->id;
+#endif
while (node) {
node_event = container_of(node, struct perf_event, group_node);
if (cpu < node_event->cpu) {
node = node->rb_left;
- } else if (cpu > node_event->cpu) {
+ continue;
+ }
+ if (cpu > node_event->cpu) {
node = node->rb_right;
- } else {
- match = node_event;
+ continue;
+ }
+#ifdef CONFIG_CGROUP_PERF
+ node_cgrp_id = 0;
+ if (node_event->cgrp && node_event->cgrp->css.cgroup)
+ node_cgrp_id = node_event->cgrp->css.cgroup->kn->id;
+
+ if (cgrp_id < node_cgrp_id) {
node = node->rb_left;
+ continue;
}
+ if (cgrp_id > node_cgrp_id) {
+ node = node->rb_right;
+ continue;
+ }
+#endif
+ match = node_event;
+ node = node->rb_left;
}
return match;
@@ -1641,12 +1746,26 @@ static struct perf_event *
perf_event_groups_next(struct perf_event *event)
{
struct perf_event *next;
+#ifdef CONFIG_CGROUP_PERF
+ u64 curr_cgrp_id = 0;
+ u64 next_cgrp_id = 0;
+#endif
next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
- if (next && next->cpu == event->cpu)
- return next;
+ if (next == NULL || next->cpu != event->cpu)
+ return NULL;
- return NULL;
+#ifdef CONFIG_CGROUP_PERF
+ if (event->cgrp && event->cgrp->css.cgroup)
+ curr_cgrp_id = event->cgrp->css.cgroup->kn->id;
+
+ if (next->cgrp && next->cgrp->css.cgroup)
+ next_cgrp_id = next->cgrp->css.cgroup->kn->id;
+
+ if (curr_cgrp_id != next_cgrp_id)
+ return NULL;
+#endif
+ return next;
}
/*
@@ -1682,13 +1801,14 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
add_event_to_groups(event, ctx);
}
- list_update_cgroup_event(event, ctx, true);
-
list_add_rcu(&event->event_entry, &ctx->event_list);
ctx->nr_events++;
if (event->attr.inherit_stat)
ctx->nr_stat++;
+ if (event->state > PERF_EVENT_STATE_OFF)
+ perf_cgroup_event_enable(event, ctx);
+
ctx->generation++;
}
@@ -1754,6 +1874,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
size += sizeof(data->phys_addr);
+ if (sample_type & PERF_SAMPLE_CGROUP)
+ size += sizeof(data->cgroup);
+
event->header_size = size;
}
@@ -1864,8 +1987,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
event->attach_state &= ~PERF_ATTACH_CONTEXT;
- list_update_cgroup_event(event, ctx, false);
-
ctx->nr_events--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
@@ -1882,8 +2003,10 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
* of error state is by explicit re-enabling
* of the event
*/
- if (event->state > PERF_EVENT_STATE_OFF)
+ if (event->state > PERF_EVENT_STATE_OFF) {
+ perf_cgroup_event_disable(event, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_OFF);
+ }
ctx->generation++;
}
@@ -1986,6 +2109,12 @@ static int perf_get_aux_event(struct perf_event *event,
return 1;
}
+static inline struct list_head *get_event_list(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active;
+}
+
static void perf_group_detach(struct perf_event *event)
{
struct perf_event *sibling, *tmp;
@@ -2028,12 +2157,8 @@ static void perf_group_detach(struct perf_event *event)
if (!RB_EMPTY_NODE(&event->group_node)) {
add_event_to_groups(sibling, event->ctx);
- if (sibling->state == PERF_EVENT_STATE_ACTIVE) {
- struct list_head *list = sibling->attr.pinned ?
- &ctx->pinned_active : &ctx->flexible_active;
-
- list_add_tail(&sibling->active_list, list);
- }
+ if (sibling->state == PERF_EVENT_STATE_ACTIVE)
+ list_add_tail(&sibling->active_list, get_event_list(sibling));
}
WARN_ON_ONCE(sibling->ctx != event->ctx);
@@ -2112,6 +2237,7 @@ event_sched_out(struct perf_event *event,
if (READ_ONCE(event->pending_disable) >= 0) {
WRITE_ONCE(event->pending_disable, -1);
+ perf_cgroup_event_disable(event, ctx);
state = PERF_EVENT_STATE_OFF;
}
perf_event_set_state(event, state);
@@ -2182,6 +2308,7 @@ __perf_remove_from_context(struct perf_event *event,
if (!ctx->nr_events && ctx->is_active) {
ctx->is_active = 0;
+ ctx->rotate_necessary = 0;
if (ctx->task) {
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
cpuctx->task_ctx = NULL;
@@ -2248,6 +2375,7 @@ static void __perf_event_disable(struct perf_event *event,
event_sched_out(event, cpuctx, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_OFF);
+ perf_cgroup_event_disable(event, ctx);
}
/*
@@ -2350,6 +2478,8 @@ event_sched_in(struct perf_event *event,
{
int ret = 0;
+ WARN_ON_ONCE(event->ctx != ctx);
+
lockdep_assert_held(&ctx->lock);
if (event->state <= PERF_EVENT_STATE_OFF)
@@ -2629,7 +2759,7 @@ static int __perf_install_in_context(void *info)
}
#ifdef CONFIG_CGROUP_PERF
- if (is_cgroup_event(event)) {
+ if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) {
/*
* If the current cgroup doesn't match the event's
* cgroup, we should not try to schedule it.
@@ -2789,6 +2919,7 @@ static void __perf_event_enable(struct perf_event *event,
ctx_sched_out(ctx, cpuctx, EVENT_TIME);
perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
+ perf_cgroup_event_enable(event, ctx);
if (!ctx->is_active)
return;
@@ -3077,12 +3208,6 @@ static void ctx_sched_out(struct perf_event_context *ctx,
if (!ctx->nr_active || !(is_active & EVENT_ALL))
return;
- /*
- * If we had been multiplexing, no rotations are necessary, now no events
- * are active.
- */
- ctx->rotate_necessary = 0;
-
perf_pmu_disable(ctx->pmu);
if (is_active & EVENT_PINNED) {
list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
@@ -3092,6 +3217,13 @@ static void ctx_sched_out(struct perf_event_context *ctx,
if (is_active & EVENT_FLEXIBLE) {
list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
group_sched_out(event, cpuctx, ctx);
+
+ /*
+ * Since we cleared EVENT_FLEXIBLE, also clear
+ * rotate_necessary, is will be reset by
+ * ctx_flexible_sched_in() when needed.
+ */
+ ctx->rotate_necessary = 0;
}
perf_pmu_enable(ctx->pmu);
}
@@ -3388,71 +3520,104 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
}
-static int visit_groups_merge(struct perf_event_groups *groups, int cpu,
- int (*func)(struct perf_event *, void *), void *data)
+static bool perf_less_group_idx(const void *l, const void *r)
{
- struct perf_event **evt, *evt1, *evt2;
- int ret;
-
- evt1 = perf_event_groups_first(groups, -1);
- evt2 = perf_event_groups_first(groups, cpu);
+ const struct perf_event *le = *(const struct perf_event **)l;
+ const struct perf_event *re = *(const struct perf_event **)r;
- while (evt1 || evt2) {
- if (evt1 && evt2) {
- if (evt1->group_index < evt2->group_index)
- evt = &evt1;
- else
- evt = &evt2;
- } else if (evt1) {
- evt = &evt1;
- } else {
- evt = &evt2;
- }
-
- ret = func(*evt, data);
- if (ret)
- return ret;
+ return le->group_index < re->group_index;
+}
- *evt = perf_event_groups_next(*evt);
- }
+static void swap_ptr(void *l, void *r)
+{
+ void **lp = l, **rp = r;
- return 0;
+ swap(*lp, *rp);
}
-struct sched_in_data {
- struct perf_event_context *ctx;
- struct perf_cpu_context *cpuctx;
- int can_add_hw;
+static const struct min_heap_callbacks perf_min_heap = {
+ .elem_size = sizeof(struct perf_event *),
+ .less = perf_less_group_idx,
+ .swp = swap_ptr,
};
-static int pinned_sched_in(struct perf_event *event, void *data)
+static void __heap_add(struct min_heap *heap, struct perf_event *event)
{
- struct sched_in_data *sid = data;
+ struct perf_event **itrs = heap->data;
- if (event->state <= PERF_EVENT_STATE_OFF)
- return 0;
+ if (event) {
+ itrs[heap->nr] = event;
+ heap->nr++;
+ }
+}
- if (!event_filter_match(event))
- return 0;
+static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
+ struct perf_event_groups *groups, int cpu,
+ int (*func)(struct perf_event *, void *),
+ void *data)
+{
+#ifdef CONFIG_CGROUP_PERF
+ struct cgroup_subsys_state *css = NULL;
+#endif
+ /* Space for per CPU and/or any CPU event iterators. */
+ struct perf_event *itrs[2];
+ struct min_heap event_heap;
+ struct perf_event **evt;
+ int ret;
+
+ if (cpuctx) {
+ event_heap = (struct min_heap){
+ .data = cpuctx->heap,
+ .nr = 0,
+ .size = cpuctx->heap_size,
+ };
- if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
- if (!group_sched_in(event, sid->cpuctx, sid->ctx))
- list_add_tail(&event->active_list, &sid->ctx->pinned_active);
+ lockdep_assert_held(&cpuctx->ctx.lock);
+
+#ifdef CONFIG_CGROUP_PERF
+ if (cpuctx->cgrp)
+ css = &cpuctx->cgrp->css;
+#endif
+ } else {
+ event_heap = (struct min_heap){
+ .data = itrs,
+ .nr = 0,
+ .size = ARRAY_SIZE(itrs),
+ };
+ /* Events not within a CPU context may be on any CPU. */
+ __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL));
}
+ evt = event_heap.data;
- /*
- * If this pinned group hasn't been scheduled,
- * put it in error state.
- */
- if (event->state == PERF_EVENT_STATE_INACTIVE)
- perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+ __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL));
+
+#ifdef CONFIG_CGROUP_PERF
+ for (; css; css = css->parent)
+ __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup));
+#endif
+
+ min_heapify_all(&event_heap, &perf_min_heap);
+
+ while (event_heap.nr) {
+ ret = func(*evt, data);
+ if (ret)
+ return ret;
+
+ *evt = perf_event_groups_next(*evt);
+ if (*evt)
+ min_heapify(&event_heap, 0, &perf_min_heap);
+ else
+ min_heap_pop(&event_heap, &perf_min_heap);
+ }
return 0;
}
-static int flexible_sched_in(struct perf_event *event, void *data)
+static int merge_sched_in(struct perf_event *event, void *data)
{
- struct sched_in_data *sid = data;
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ int *can_add_hw = data;
if (event->state <= PERF_EVENT_STATE_OFF)
return 0;
@@ -3460,14 +3625,19 @@ static int flexible_sched_in(struct perf_event *event, void *data)
if (!event_filter_match(event))
return 0;
- if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
- int ret = group_sched_in(event, sid->cpuctx, sid->ctx);
- if (ret) {
- sid->can_add_hw = 0;
- sid->ctx->rotate_necessary = 1;
- return 0;
+ if (group_can_go_on(event, cpuctx, *can_add_hw)) {
+ if (!group_sched_in(event, cpuctx, ctx))
+ list_add_tail(&event->active_list, get_event_list(event));
+ }
+
+ if (event->state == PERF_EVENT_STATE_INACTIVE) {
+ if (event->attr.pinned) {
+ perf_cgroup_event_disable(event, ctx);
+ perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
}
- list_add_tail(&event->active_list, &sid->ctx->flexible_active);
+
+ *can_add_hw = 0;
+ ctx->rotate_necessary = 1;
}
return 0;
@@ -3477,30 +3647,28 @@ static void
ctx_pinned_sched_in(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx)
{
- struct sched_in_data sid = {
- .ctx = ctx,
- .cpuctx = cpuctx,
- .can_add_hw = 1,
- };
+ int can_add_hw = 1;
+
+ if (ctx != &cpuctx->ctx)
+ cpuctx = NULL;
- visit_groups_merge(&ctx->pinned_groups,
+ visit_groups_merge(cpuctx, &ctx->pinned_groups,
smp_processor_id(),
- pinned_sched_in, &sid);
+ merge_sched_in, &can_add_hw);
}
static void
ctx_flexible_sched_in(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx)
{
- struct sched_in_data sid = {
- .ctx = ctx,
- .cpuctx = cpuctx,
- .can_add_hw = 1,
- };
+ int can_add_hw = 1;
+
+ if (ctx != &cpuctx->ctx)
+ cpuctx = NULL;
- visit_groups_merge(&ctx->flexible_groups,
+ visit_groups_merge(cpuctx, &ctx->flexible_groups,
smp_processor_id(),
- flexible_sched_in, &sid);
+ merge_sched_in, &can_add_hw);
}
static void
@@ -3841,6 +4009,12 @@ ctx_event_to_rotate(struct perf_event_context *ctx)
typeof(*event), group_node);
}
+ /*
+ * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
+ * finds there are unschedulable events, it will set it again.
+ */
+ ctx->rotate_necessary = 0;
+
return event;
}
@@ -4456,6 +4630,8 @@ static void unaccount_event(struct perf_event *event)
atomic_dec(&nr_comm_events);
if (event->attr.namespaces)
atomic_dec(&nr_namespaces_events);
+ if (event->attr.cgroup)
+ atomic_dec(&nr_cgroup_events);
if (event->attr.task)
atomic_dec(&nr_task_events);
if (event->attr.freq)
@@ -6555,6 +6731,11 @@ static void perf_output_read(struct perf_output_handle *handle,
perf_output_read_one(handle, event, enabled, running);
}
+static inline bool perf_sample_save_hw_index(struct perf_event *event)
+{
+ return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX;
+}
+
void perf_output_sample(struct perf_output_handle *handle,
struct perf_event_header *header,
struct perf_sample_data *data,
@@ -6643,6 +6824,8 @@ void perf_output_sample(struct perf_output_handle *handle,
* sizeof(struct perf_branch_entry);
perf_output_put(handle, data->br_stack->nr);
+ if (perf_sample_save_hw_index(event))
+ perf_output_put(handle, data->br_stack->hw_idx);
perf_output_copy(handle, data->br_stack->entries, size);
} else {
/*
@@ -6705,6 +6888,9 @@ void perf_output_sample(struct perf_output_handle *handle,
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
perf_output_put(handle, data->phys_addr);
+ if (sample_type & PERF_SAMPLE_CGROUP)
+ perf_output_put(handle, data->cgroup);
+
if (sample_type & PERF_SAMPLE_AUX) {
perf_output_put(handle, data->aux_size);
@@ -6748,9 +6934,12 @@ static u64 perf_virt_to_phys(u64 virt)
* Try IRQ-safe __get_user_pages_fast first.
* If failed, leave phys_addr as 0.
*/
- if ((current->mm != NULL) &&
- (__get_user_pages_fast(virt, 1, 0, &p) == 1))
- phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
+ if (current->mm != NULL) {
+ pagefault_disable();
+ if (__get_user_pages_fast(virt, 1, 0, &p) == 1)
+ phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
+ pagefault_enable();
+ }
if (p)
put_page(p);
@@ -6836,6 +7025,9 @@ void perf_prepare_sample(struct perf_event_header *header,
if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
int size = sizeof(u64); /* nr */
if (data->br_stack) {
+ if (perf_sample_save_hw_index(event))
+ size += sizeof(u64);
+
size += data->br_stack->nr
* sizeof(struct perf_branch_entry);
}
@@ -6901,6 +7093,16 @@ void perf_prepare_sample(struct perf_event_header *header,
if (sample_type & PERF_SAMPLE_PHYS_ADDR)
data->phys_addr = perf_virt_to_phys(data->addr);
+#ifdef CONFIG_CGROUP_PERF
+ if (sample_type & PERF_SAMPLE_CGROUP) {
+ struct cgroup *cgrp;
+
+ /* protected by RCU */
+ cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
+ data->cgroup = cgroup_id(cgrp);
+ }
+#endif
+
if (sample_type & PERF_SAMPLE_AUX) {
u64 size;
@@ -7574,6 +7776,105 @@ void perf_event_namespaces(struct task_struct *task)
}
/*
+ * cgroup tracking
+ */
+#ifdef CONFIG_CGROUP_PERF
+
+struct perf_cgroup_event {
+ char *path;
+ int path_size;
+ struct {
+ struct perf_event_header header;
+ u64 id;
+ char path[];
+ } event_id;
+};
+
+static int perf_event_cgroup_match(struct perf_event *event)
+{
+ return event->attr.cgroup;
+}
+
+static void perf_event_cgroup_output(struct perf_event *event, void *data)
+{
+ struct perf_cgroup_event *cgroup_event = data;
+ struct perf_output_handle handle;
+ struct perf_sample_data sample;
+ u16 header_size = cgroup_event->event_id.header.size;
+ int ret;
+
+ if (!perf_event_cgroup_match(event))
+ return;
+
+ perf_event_header__init_id(&cgroup_event->event_id.header,
+ &sample, event);
+ ret = perf_output_begin(&handle, event,
+ cgroup_event->event_id.header.size);
+ if (ret)
+ goto out;
+
+ perf_output_put(&handle, cgroup_event->event_id);
+ __output_copy(&handle, cgroup_event->path, cgroup_event->path_size);
+
+ perf_event__output_id_sample(event, &handle, &sample);
+
+ perf_output_end(&handle);
+out:
+ cgroup_event->event_id.header.size = header_size;
+}
+
+static void perf_event_cgroup(struct cgroup *cgrp)
+{
+ struct perf_cgroup_event cgroup_event;
+ char path_enomem[16] = "//enomem";
+ char *pathname;
+ size_t size;
+
+ if (!atomic_read(&nr_cgroup_events))
+ return;
+
+ cgroup_event = (struct perf_cgroup_event){
+ .event_id = {
+ .header = {
+ .type = PERF_RECORD_CGROUP,
+ .misc = 0,
+ .size = sizeof(cgroup_event.event_id),
+ },
+ .id = cgroup_id(cgrp),
+ },
+ };
+
+ pathname = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (pathname == NULL) {
+ cgroup_event.path = path_enomem;
+ } else {
+ /* just to be sure to have enough space for alignment */
+ cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64));
+ cgroup_event.path = pathname;
+ }
+
+ /*
+ * Since our buffer works in 8 byte units we need to align our string
+ * size to a multiple of 8. However, we must guarantee the tail end is
+ * zero'd out to avoid leaking random bits to userspace.
+ */
+ size = strlen(cgroup_event.path) + 1;
+ while (!IS_ALIGNED(size, sizeof(u64)))
+ cgroup_event.path[size++] = '\0';
+
+ cgroup_event.event_id.header.size += size;
+ cgroup_event.path_size = size;
+
+ perf_iterate_sb(perf_event_cgroup_output,
+ &cgroup_event,
+ NULL);
+
+ kfree(pathname);
+}
+
+#endif
+
+/*
* mmap tracking
*/
@@ -7693,7 +7994,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
flags |= MAP_EXECUTABLE;
if (vma->vm_flags & VM_LOCKED)
flags |= MAP_LOCKED;
- if (vma->vm_flags & VM_HUGETLB)
+ if (is_vm_hugetlb_page(vma))
flags |= MAP_HUGETLB;
if (file) {
@@ -8255,23 +8556,22 @@ static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
enum perf_bpf_event_type type)
{
bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
- char sym[KSYM_NAME_LEN];
int i;
if (prog->aux->func_cnt == 0) {
- bpf_get_prog_name(prog, sym);
perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
(u64)(unsigned long)prog->bpf_func,
- prog->jited_len, unregister, sym);
+ prog->jited_len, unregister,
+ prog->aux->ksym.name);
} else {
for (i = 0; i < prog->aux->func_cnt; i++) {
struct bpf_prog *subprog = prog->aux->func[i];
- bpf_get_prog_name(subprog, sym);
perf_event_ksymbol(
PERF_RECORD_KSYMBOL_TYPE_BPF,
(u64)(unsigned long)subprog->bpf_func,
- subprog->jited_len, unregister, sym);
+ subprog->jited_len, unregister,
+ prog->aux->ksym.name);
}
}
}
@@ -9206,7 +9506,6 @@ static void bpf_overflow_handler(struct perf_event *event,
int ret = 0;
ctx.regs = perf_arch_bpf_user_pt_regs(regs);
- preempt_disable();
if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
goto out;
rcu_read_lock();
@@ -9214,7 +9513,6 @@ static void bpf_overflow_handler(struct perf_event *event,
rcu_read_unlock();
out:
__this_cpu_dec(bpf_prog_active);
- preempt_enable();
if (!ret)
return;
@@ -10349,6 +10647,9 @@ skip_type:
cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
__perf_mux_hrtimer_init(cpuctx, cpu);
+
+ cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
+ cpuctx->heap = cpuctx->heap_default;
}
got_cpu_context:
@@ -10616,6 +10917,8 @@ static void account_event(struct perf_event *event)
atomic_inc(&nr_comm_events);
if (event->attr.namespaces)
atomic_inc(&nr_namespaces_events);
+ if (event->attr.cgroup)
+ atomic_inc(&nr_cgroup_events);
if (event->attr.task)
atomic_inc(&nr_task_events);
if (event->attr.freq)
@@ -10794,12 +11097,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (!has_branch_stack(event))
event->attr.branch_sample_type = 0;
- if (cgroup_fd != -1) {
- err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
- if (err)
- goto err_ns;
- }
-
pmu = perf_init_event(event);
if (IS_ERR(pmu)) {
err = PTR_ERR(pmu);
@@ -10821,6 +11118,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
goto err_pmu;
}
+ if (cgroup_fd != -1) {
+ err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
+ if (err)
+ goto err_pmu;
+ }
+
err = exclusive_event_init(event);
if (err)
goto err_pmu;
@@ -10881,12 +11184,12 @@ err_per_task:
exclusive_event_destroy(event);
err_pmu:
+ if (is_cgroup_event(event))
+ perf_detach_cgroup(event);
if (event->destroy)
event->destroy(event);
module_put(pmu->module);
err_ns:
- if (is_cgroup_event(event))
- perf_detach_cgroup(event);
if (event->ns)
put_pid_ns(event->ns);
if (event->hw.target)
@@ -10995,6 +11298,12 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
ret = perf_reg_validate(attr->sample_regs_intr);
+
+#ifndef CONFIG_CGROUP_PERF
+ if (attr->sample_type & PERF_SAMPLE_CGROUP)
+ return -EINVAL;
+#endif
+
out:
return ret;
@@ -11263,14 +11572,14 @@ SYSCALL_DEFINE5(perf_event_open,
}
if (task) {
- err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
+ err = mutex_lock_interruptible(&task->signal->exec_update_mutex);
if (err)
goto err_task;
/*
* Reuse ptrace permission checks for now.
*
- * We must hold cred_guard_mutex across this and any potential
+ * We must hold exec_update_mutex across this and any potential
* perf_install_in_context() call for this new event to
* serialize against exec() altering our credentials (and the
* perf_event_exit_task() that could imply).
@@ -11559,7 +11868,7 @@ SYSCALL_DEFINE5(perf_event_open,
mutex_unlock(&ctx->mutex);
if (task) {
- mutex_unlock(&task->signal->cred_guard_mutex);
+ mutex_unlock(&task->signal->exec_update_mutex);
put_task_struct(task);
}
@@ -11595,7 +11904,7 @@ err_alloc:
free_event(event);
err_cred:
if (task)
- mutex_unlock(&task->signal->cred_guard_mutex);
+ mutex_unlock(&task->signal->exec_update_mutex);
err_task:
if (task)
put_task_struct(task);
@@ -11900,7 +12209,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
/*
* When a child task exits, feed back event values to parent events.
*
- * Can be called with cred_guard_mutex held when called from
+ * Can be called with exec_update_mutex held when called from
* install_exec_creds().
*/
void perf_event_exit_task(struct task_struct *child)
@@ -12592,6 +12901,12 @@ static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
kfree(jc);
}
+static int perf_cgroup_css_online(struct cgroup_subsys_state *css)
+{
+ perf_event_cgroup(css->cgroup);
+ return 0;
+}
+
static int __perf_cgroup_move(void *info)
{
struct task_struct *task = info;
@@ -12613,6 +12928,7 @@ static void perf_cgroup_attach(struct cgroup_taskset *tset)
struct cgroup_subsys perf_event_cgrp_subsys = {
.css_alloc = perf_cgroup_css_alloc,
.css_free = perf_cgroup_css_free,
+ .css_online = perf_cgroup_css_online,
.attach = perf_cgroup_attach,
/*
* Implicitly enable on dfl hierarchy so that perf events can