aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/tools/perf/util/bpf_skel
diff options
context:
space:
mode:
Diffstat (limited to 'tools/perf/util/bpf_skel')
-rw-r--r--tools/perf/util/bpf_skel/.gitignore4
-rw-r--r--tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c579
-rw-r--r--tools/perf/util/bpf_skel/bench_uprobe.bpf.c39
-rw-r--r--tools/perf/util/bpf_skel/bperf_cgroup.bpf.c227
-rw-r--r--tools/perf/util/bpf_skel/bperf_follower.bpf.c162
-rw-r--r--tools/perf/util/bpf_skel/bperf_leader.bpf.c55
-rw-r--r--tools/perf/util/bpf_skel/bperf_u.h19
-rw-r--r--tools/perf/util/bpf_skel/bpf_prog_profiler.bpf.c92
-rw-r--r--tools/perf/util/bpf_skel/func_latency.bpf.c157
-rw-r--r--tools/perf/util/bpf_skel/kwork_top.bpf.c340
-rw-r--r--tools/perf/util/bpf_skel/kwork_trace.bpf.c384
-rw-r--r--tools/perf/util/bpf_skel/lock_contention.bpf.c989
-rw-r--r--tools/perf/util/bpf_skel/lock_data.h78
-rw-r--r--tools/perf/util/bpf_skel/off_cpu.bpf.c372
-rw-r--r--tools/perf/util/bpf_skel/sample-filter.h72
-rw-r--r--tools/perf/util/bpf_skel/sample_filter.bpf.c298
-rw-r--r--tools/perf/util/bpf_skel/syscall_summary.bpf.c153
-rw-r--r--tools/perf/util/bpf_skel/syscall_summary.h27
-rw-r--r--tools/perf/util/bpf_skel/vmlinux/.gitignore1
-rw-r--r--tools/perf/util/bpf_skel/vmlinux/vmlinux.h215
20 files changed, 4263 insertions, 0 deletions
diff --git a/tools/perf/util/bpf_skel/.gitignore b/tools/perf/util/bpf_skel/.gitignore
new file mode 100644
index 000000000000..cd01455e1b53
--- /dev/null
+++ b/tools/perf/util/bpf_skel/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+.tmp
+*.skel.h
+vmlinux.h
diff --git a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
new file mode 100644
index 000000000000..e4352881e3fa
--- /dev/null
+++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c
@@ -0,0 +1,579 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Augment the raw_syscalls tracepoints with the contents of the pointer arguments.
+ *
+ * This exactly matches what is marshalled into the raw_syscall:sys_enter
+ * payload expected by the 'perf trace' beautifiers.
+ */
+
+#include "vmlinux.h"
+#include "../trace_augment.h"
+
+#include <bpf/bpf_helpers.h>
+#include <linux/limits.h>
+
+/*
+ * Round x up to the next multiple of a. The mask arithmetic below is only
+ * correct when a is a power of two.
+ */
+#define PERF_ALIGN(x, a) __PERF_ALIGN_MASK(x, (typeof(x))(a)-1)
+/* Add mask to x, then clear every bit that is set in mask. */
+#define __PERF_ALIGN_MASK(x, mask) (((x)+(mask))&~(mask))
+
+/**
+ * is_power_of_2() - check if a value is a power of two
+ * @n: the value to check
+ *
+ * Determine whether some value is a power of two, where zero is *not*
+ * considered a power of two. @n is fully parenthesized in the expansion so
+ * that expressions such as (a & b) are tested as a whole, but it is evaluated
+ * more than once and therefore must not have side effects.
+ * Return: true if @n is a power of 2, otherwise false.
+ */
+#define is_power_of_2(n) ((n) != 0 && (((n) & ((n) - 1)) == 0))
+
+/* Number of entries in the __augmented_syscalls__ perf event array. */
+#define MAX_CPUS 4096
+
+/*
+ * bpf-output associated map: the perf event array that augmented__output()
+ * and augmented__beauty_output() write their records to.
+ */
+struct __augmented_syscalls__ {
+ __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+ __type(key, int);
+ __type(value, __u32);
+ __uint(max_entries, MAX_CPUS);
+} __augmented_syscalls__ SEC(".maps");
+
+/*
+ * What to augment at entry?
+ *
+ * Pointer arg payloads (filenames, etc) passed from userspace to the kernel
+ *
+ * sys_enter() tail-calls into this program array, using the syscall number
+ * as index, whenever augment_sys_enter() returns non-zero.
+ */
+struct syscalls_sys_enter {
+ __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+ __type(key, __u32);
+ __type(value, __u32);
+ __uint(max_entries, 512);
+} syscalls_sys_enter SEC(".maps");
+
+/*
+ * What to augment at exit?
+ *
+ * Pointer arg payloads returned from the kernel (struct stat, etc) to userspace.
+ *
+ * sys_exit() tail-calls into this program array, using the syscall number
+ * as index.
+ */
+struct syscalls_sys_exit {
+ __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+ __type(key, __u32);
+ __type(value, __u32);
+ __uint(max_entries, 512);
+} syscalls_sys_exit SEC(".maps");
+
+/*
+ * Context layout received by the sys_enter handlers in this file: a header,
+ * the syscall number and the six raw syscall arguments.
+ */
+struct syscall_enter_args {
+ unsigned long long common_tp_fields; /* NOTE(review): presumably the common tracepoint header -- confirm */
+ long syscall_nr;
+ unsigned long args[6];
+};
+
+/*
+ * Context layout received by sys_exit(): a header, the syscall number and
+ * the syscall return value.
+ */
+struct syscall_exit_args {
+ unsigned long long common_tp_fields; /* NOTE(review): presumably the common tracepoint header -- confirm */
+ long syscall_nr;
+ long ret;
+};
+
+/*
+ * Desired design of maximum size and alignment (see RFC2553)
+ */
+#define SS_MAXSIZE 128 /* Implementation specific max size */
+
+/* Type of the address family field at the start of struct sockaddr_storage. */
+typedef unsigned short sa_family_t;
+
+/*
+ * FIXME: Should come from system headers
+ *
+ * The definition uses anonymous union and struct in order to control the
+ * default alignment.
+ *
+ * In this file it is the socket address alternative inside the
+ * struct augmented_arg union, filled by the connect and sendto handlers.
+ */
+struct sockaddr_storage {
+ union {
+ struct {
+ sa_family_t ss_family; /* address family */
+ /* Following field(s) are implementation specific */
+ char __data[SS_MAXSIZE - sizeof(unsigned short)];
+ /* space to achieve desired size, */
+ /* _SS_MAXSIZE value minus size of ss_family */
+ };
+ void *__align; /* implementation specific desired alignment */
+ };
+};
+
+/*
+ * One augmented argument as emitted to userspace: a size/err header followed
+ * by the bytes copied from the traced process.
+ */
+struct augmented_arg {
+ unsigned int size; /* payload length as recorded by the handler that filled it */
+ int err; /* negative helper return value when reading a string failed, else 0 */
+ union {
+ char value[PATH_MAX]; /* strings, structs and generic buffers */
+ struct sockaddr_storage saddr; /* socket addresses (connect, sendto) */
+ };
+};
+
+/*
+ * Set of pids whose syscalls are dropped: sys_enter() and sys_exit() return 0
+ * right away when the current pid is a key here. Only the presence of the key
+ * is tested (see pid_filter__has()); the bool value is not read by this file.
+ */
+struct pids_filtered {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, pid_t);
+ __type(value, bool);
+ __uint(max_entries, 64);
+} pids_filtered SEC(".maps");
+
+/*
+ * Record layout used by the per-syscall handlers: the raw sys_enter args
+ * followed by the augmented argument(s).
+ */
+struct augmented_args_payload {
+ struct syscall_enter_args args;
+ struct augmented_arg arg, arg2; // We have to reserve space for two arguments (rename, etc)
+};
+
+// We need more tmp space than the BPF stack can give us
+// Single-entry per-CPU array holding one struct augmented_args_payload; it is
+// fetched through augmented_args_payload() and reused by every handler.
+struct augmented_args_tmp {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __type(key, int);
+ __type(value, struct augmented_args_payload);
+ __uint(max_entries, 1);
+} augmented_args_tmp SEC(".maps");
+
+/*
+ * Looked up by syscall number in augment_sys_enter(): six values, one per
+ * syscall argument, telling what (if anything) to copy for that argument.
+ * The encoding is described in the comment inside augment_sys_enter().
+ */
+struct beauty_map_enter {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, int);
+ __type(value, __u32[6]);
+ __uint(max_entries, 512);
+} beauty_map_enter SEC(".maps");
+
+/*
+ * Record layout used by augment_sys_enter(): the raw sys_enter args followed
+ * by room for up to six augmented arguments. augment_sys_enter() packs the
+ * arguments it captures back to back, so aug_args[] is used as a byte area
+ * rather than indexed as an array.
+ */
+struct beauty_payload_enter {
+ struct syscall_enter_args args;
+ struct augmented_arg aug_args[6];
+};
+
+/*
+ * Single-entry per-CPU array providing the struct beauty_payload_enter that
+ * augment_sys_enter() builds its record in.
+ */
+struct beauty_payload_enter_map {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __type(key, int);
+ __type(value, struct beauty_payload_enter);
+ __uint(max_entries, 1);
+} beauty_payload_enter_map SEC(".maps");
+
+/*
+ * Fetch the scratch payload stored in slot 0 of the augmented_args_tmp
+ * per-CPU array. Returns NULL when the lookup fails.
+ */
+static inline struct augmented_args_payload *augmented_args_payload(void)
+{
+	int slot = 0;
+	struct augmented_args_payload *scratch = bpf_map_lookup_elem(&augmented_args_tmp, &slot);
+
+	return scratch;
+}
+
+/*
+ * Emit the first @len bytes of @args on the current CPU's slot of the
+ * __augmented_syscalls__ perf event array.
+ *
+ * Returns the bpf_perf_event_output() result.
+ */
+static inline int augmented__output(void *ctx, struct augmented_args_payload *args, int len)
+{
+ /* If perf_event_output fails, return non-zero so that it gets recorded unaugmented */
+ return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, args, len);
+}
+
+/*
+ * Same as augmented__output(), but takes an untyped buffer; used by
+ * augment_sys_enter() to emit a struct beauty_payload_enter.
+ *
+ * Returns the bpf_perf_event_output() result.
+ */
+static inline int augmented__beauty_output(void *ctx, void *data, int len)
+{
+ return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, data, len);
+}
+
+/*
+ * Copy a NUL terminated string from user address @arg into
+ * @augmented_arg->value, reading at most @arg_len bytes.
+ *
+ * On success ->size is set to the helper's return value and ->err to 0.
+ * On failure ->size is 0 and ->err holds the helper's return value.
+ *
+ * Returns the number of bytes of @augmented_arg that are meaningful: the
+ * header plus the string on success (masked, see below), or just the header
+ * on failure.
+ */
+static inline
+unsigned int augmented_arg__read_str(struct augmented_arg *augmented_arg, const void *arg, unsigned int arg_len)
+{
+ unsigned int augmented_len = sizeof(*augmented_arg);
+ int string_len = bpf_probe_read_user_str(&augmented_arg->value, arg_len, arg);
+
+ augmented_arg->size = augmented_arg->err = 0;
+ /*
+ * bpf_probe_read_user_str() may return < 0, e.g. -EFAULT. In that case
+ * ->size stays 0 and the error is reported to userspace through ->err,
+ * see the else branch below.
+ */
+ if (string_len > 0) {
+ /* drop the unused tail of ->value from the full struct size */
+ augmented_len -= sizeof(augmented_arg->value) - string_len;
+ _Static_assert(is_power_of_2(sizeof(augmented_arg->value)), "sizeof(augmented_arg->value) needs to be a power of two");
+ /* keep the length below sizeof(->value); the mask needs a power of two size */
+ augmented_len &= sizeof(augmented_arg->value) - 1;
+ augmented_arg->size = string_len;
+ } else {
+ /*
+ * So that userspace notices the error while still being able
+ * to skip this augmented arg record
+ */
+ augmented_arg->err = string_len;
+ augmented_len = offsetof(struct augmented_arg, value);
+ }
+
+ return augmented_len;
+}
+
+/*
+ * Default handler that does no augmentation: it only returns 1, the same
+ * value the other handlers in this file return for "Failure: don't filter".
+ */
+SEC("tp/raw_syscalls/sys_enter")
+int syscall_unaugmented(struct syscall_enter_args *args)
+{
+ return 1;
+}
+
+/*
+ * These will be tail_called from SEC("raw_syscalls:sys_enter"), so will find in
+ * augmented_args_tmp what was read by that raw_syscalls:sys_enter and go
+ * on from there, reading the first syscall arg as a string, i.e. open's
+ * filename.
+ */
+/*
+ * Augment connect(): copy the sockaddr pointed to by args[1], whose length
+ * is args[2], into the scratch payload and emit it after the raw args.
+ *
+ * Returns 1 when the scratch payload is unavailable, otherwise the result
+ * of augmented__output().
+ */
+SEC("tp/syscalls/sys_enter_connect")
+int sys_enter_connect(struct syscall_enter_args *args)
+{
+ struct augmented_args_payload *augmented_args = augmented_args_payload();
+ const void *sockaddr_arg = (const void *)args->args[1];
+ unsigned int socklen = args->args[2];
+ unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs
+
+ if (augmented_args == NULL)
+ return 1; /* Failure: don't filter */
+
+ _Static_assert(is_power_of_2(sizeof(augmented_args->arg.saddr)), "sizeof(augmented_args->arg.saddr) needs to be a power of two");
+ /* bound the read: the mask keeps socklen below sizeof(saddr) */
+ socklen &= sizeof(augmented_args->arg.saddr) - 1;
+
+ /* the return value of the read is not checked */
+ bpf_probe_read_user(&augmented_args->arg.saddr, socklen, sockaddr_arg);
+ augmented_args->arg.size = socklen;
+ augmented_args->arg.err = 0;
+
+ return augmented__output(args, augmented_args, len + socklen);
+}
+
+/*
+ * Augment sendto(): copy the destination sockaddr pointed to by args[4],
+ * whose length is args[5], into the scratch payload and emit it after the
+ * raw args.
+ *
+ * The scratch payload is a per-CPU buffer shared by all handlers, so the
+ * size/err header is written explicitly, as sys_enter_connect() does;
+ * otherwise whatever a previous syscall left there would be emitted.
+ *
+ * Returns 1 when the scratch payload is unavailable, otherwise the result
+ * of augmented__output().
+ */
+SEC("tp/syscalls/sys_enter_sendto")
+int sys_enter_sendto(struct syscall_enter_args *args)
+{
+	struct augmented_args_payload *augmented_args = augmented_args_payload();
+	const void *sockaddr_arg = (const void *)args->args[4];
+	unsigned int socklen = args->args[5];
+	unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs
+
+	if (augmented_args == NULL)
+		return 1; /* Failure: don't filter */
+
+	_Static_assert(is_power_of_2(sizeof(augmented_args->arg.saddr)), "sizeof(augmented_args->arg.saddr) needs to be a power of two");
+	/* bound the read: the mask keeps socklen below sizeof(saddr) */
+	socklen &= sizeof(augmented_args->arg.saddr) - 1;
+
+	bpf_probe_read_user(&augmented_args->arg.saddr, socklen, sockaddr_arg);
+	augmented_args->arg.size = socklen;
+	augmented_args->arg.err = 0;
+
+	return augmented__output(args, augmented_args, len + socklen);
+}
+
+/*
+ * Augment open(): read the filename string pointed to by args[0] into the
+ * scratch payload and emit it after the raw args.
+ *
+ * Returns 1 when the scratch payload is unavailable, otherwise the result
+ * of augmented__output().
+ */
+SEC("tp/syscalls/sys_enter_open")
+int sys_enter_open(struct syscall_enter_args *args)
+{
+ struct augmented_args_payload *augmented_args = augmented_args_payload();
+ const void *filename_arg = (const void *)args->args[0];
+ unsigned int len = sizeof(augmented_args->args);
+
+ if (augmented_args == NULL)
+ return 1; /* Failure: don't filter */
+
+ /* the helper's return value already accounts for the size/err header */
+ len += augmented_arg__read_str(&augmented_args->arg, filename_arg, sizeof(augmented_args->arg.value));
+
+ return augmented__output(args, augmented_args, len);
+}
+
+/*
+ * Augment openat(): read the filename string pointed to by args[1] into the
+ * scratch payload and emit it after the raw args.
+ *
+ * Returns 1 when the scratch payload is unavailable, otherwise the result
+ * of augmented__output().
+ */
+SEC("tp/syscalls/sys_enter_openat")
+int sys_enter_openat(struct syscall_enter_args *args)
+{
+ struct augmented_args_payload *augmented_args = augmented_args_payload();
+ const void *filename_arg = (const void *)args->args[1];
+ unsigned int len = sizeof(augmented_args->args);
+
+ if (augmented_args == NULL)
+ return 1; /* Failure: don't filter */
+
+ /* the helper's return value already accounts for the size/err header */
+ len += augmented_arg__read_str(&augmented_args->arg, filename_arg, sizeof(augmented_args->arg.value));
+
+ return augmented__output(args, augmented_args, len);
+}
+
+/*
+ * Augment rename(): read the old path (args[0]) and the new path (args[1])
+ * as two augmented args packed one after the other, then emit them after
+ * the raw args.
+ *
+ * Returns 1 when the scratch payload is unavailable or the first argument
+ * would not leave a valid position for the second one, otherwise the result
+ * of augmented__output().
+ */
+SEC("tp/syscalls/sys_enter_rename")
+int sys_enter_rename(struct syscall_enter_args *args)
+{
+ struct augmented_args_payload *augmented_args = augmented_args_payload();
+ const void *oldpath_arg = (const void *)args->args[0],
+ *newpath_arg = (const void *)args->args[1];
+ unsigned int len = sizeof(augmented_args->args), oldpath_len, newpath_len;
+
+ if (augmented_args == NULL)
+ return 1; /* Failure: don't filter */
+
+ len += 2 * sizeof(u64); // The overhead of size and err, just before the payload...
+
+ oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value));
+ /* overwrite the size stored by the helper with the u64-aligned length */
+ augmented_args->arg.size = PERF_ALIGN(oldpath_len + 1, sizeof(u64));
+ len += augmented_args->arg.size;
+
+ /* Every read from userspace is limited to value size */
+ if (augmented_args->arg.size > sizeof(augmented_args->arg.value))
+ return 1; /* Failure: don't filter */
+
+ /* the second augmented arg starts arg.size bytes into the first one's value */
+ struct augmented_arg *arg2 = (void *)&augmented_args->arg.value + augmented_args->arg.size;
+
+ newpath_len = augmented_arg__read_str(arg2, newpath_arg, sizeof(augmented_args->arg.value));
+ /* replaces the size the helper stored with the helper's return value */
+ arg2->size = newpath_len;
+
+ len += newpath_len;
+
+ return augmented__output(args, augmented_args, len);
+}
+
+/*
+ * Augment renameat2(): read the old path (args[1]) and the new path
+ * (args[3]) as two augmented args packed one after the other, then emit
+ * them after the raw args. Apart from the argument indexes this mirrors
+ * sys_enter_rename().
+ *
+ * Returns 1 when the scratch payload is unavailable or the first argument
+ * would not leave a valid position for the second one, otherwise the result
+ * of augmented__output().
+ */
+SEC("tp/syscalls/sys_enter_renameat2")
+int sys_enter_renameat2(struct syscall_enter_args *args)
+{
+ struct augmented_args_payload *augmented_args = augmented_args_payload();
+ const void *oldpath_arg = (const void *)args->args[1],
+ *newpath_arg = (const void *)args->args[3];
+ unsigned int len = sizeof(augmented_args->args), oldpath_len, newpath_len;
+
+ if (augmented_args == NULL)
+ return 1; /* Failure: don't filter */
+
+ len += 2 * sizeof(u64); // The overhead of size and err, just before the payload...
+
+ oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value));
+ /* overwrite the size stored by the helper with the u64-aligned length */
+ augmented_args->arg.size = PERF_ALIGN(oldpath_len + 1, sizeof(u64));
+ len += augmented_args->arg.size;
+
+ /* Every read from userspace is limited to value size */
+ if (augmented_args->arg.size > sizeof(augmented_args->arg.value))
+ return 1; /* Failure: don't filter */
+
+ /* the second augmented arg starts arg.size bytes into the first one's value */
+ struct augmented_arg *arg2 = (void *)&augmented_args->arg.value + augmented_args->arg.size;
+
+ newpath_len = augmented_arg__read_str(arg2, newpath_arg, sizeof(augmented_args->arg.value));
+ /* replaces the size the helper stored with the helper's return value */
+ arg2->size = newpath_len;
+
+ len += newpath_len;
+
+ return augmented__output(args, augmented_args, len);
+}
+
+/* Size assumed by sys_enter_perf_event_open() when the attr's size field is 0. */
+#define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */
+
+// we need just the start, get the size to then copy it
+// (only the first two fields of the attr passed by userspace are declared here)
+struct perf_event_attr_size {
+ __u32 type;
+ /*
+ * Size of the attr structure, for fwd/bwd compat.
+ */
+ __u32 size;
+};
+
+/*
+ * Augment perf_event_open(): copy the attr structure pointed to by args[0].
+ * The copy is done in two steps: first just the leading type/size fields to
+ * learn how big the structure is, then the whole structure.
+ *
+ * Returns 1 when the scratch payload is unavailable, a user read fails or
+ * the advertised size does not fit in arg.value; otherwise the result of
+ * augmented__output().
+ */
+SEC("tp/syscalls/sys_enter_perf_event_open")
+int sys_enter_perf_event_open(struct syscall_enter_args *args)
+{
+ struct augmented_args_payload *augmented_args = augmented_args_payload();
+ const struct perf_event_attr_size *attr = (const struct perf_event_attr_size *)args->args[0], *attr_read;
+ unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs
+
+ if (augmented_args == NULL)
+ goto failure;
+
+ /* step 1: fetch only the type/size prefix */
+ if (bpf_probe_read_user(&augmented_args->arg.value, sizeof(*attr), attr) < 0)
+ goto failure;
+
+ attr_read = (const struct perf_event_attr_size *)augmented_args->arg.value;
+
+ __u32 size = attr_read->size;
+
+ /* a zero size is treated as the first published layout */
+ if (!size)
+ size = PERF_ATTR_SIZE_VER0;
+
+ if (size > sizeof(augmented_args->arg.value))
+ goto failure;
+
+ // Now that we read attr->size and tested it against the size limits, read it completely
+ if (bpf_probe_read_user(&augmented_args->arg.value, size, attr) < 0)
+ goto failure;
+
+ return augmented__output(args, augmented_args, len + size);
+failure:
+ return 1; /* Failure: don't filter */
+}
+
+/*
+ * Augment clock_nanosleep(): copy the struct timespec64 pointed to by
+ * args[2] into arg.value and emit it after the raw args.
+ *
+ * Returns 1 when the scratch payload is unavailable (or the struct would not
+ * fit in arg.value), otherwise the result of augmented__output().
+ */
+SEC("tp/syscalls/sys_enter_clock_nanosleep")
+int sys_enter_clock_nanosleep(struct syscall_enter_args *args)
+{
+ struct augmented_args_payload *augmented_args = augmented_args_payload();
+ const void *rqtp_arg = (const void *)args->args[2];
+ unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs
+ __u32 size = sizeof(struct timespec64);
+
+ if (augmented_args == NULL)
+ goto failure;
+
+ if (size > sizeof(augmented_args->arg.value))
+ goto failure;
+
+ /* the return value of the read is not checked */
+ bpf_probe_read_user(&augmented_args->arg.value, size, rqtp_arg);
+
+ return augmented__output(args, augmented_args, len + size);
+failure:
+ return 1; /* Failure: don't filter */
+}
+
+/*
+ * Augment nanosleep(): copy the struct timespec64 pointed to by args[0]
+ * into arg.value and emit it after the raw args.
+ *
+ * arg.value sits behind the size/err header of struct augmented_arg, so the
+ * emitted length has to include that header (sizeof(u64)), exactly as
+ * sys_enter_clock_nanosleep() does. Without it the record ends sizeof(u64)
+ * bytes early and the tail of the timespec is cut off.
+ *
+ * Returns 1 when the scratch payload is unavailable (or the struct would not
+ * fit in arg.value), otherwise the result of augmented__output().
+ */
+SEC("tp/syscalls/sys_enter_nanosleep")
+int sys_enter_nanosleep(struct syscall_enter_args *args)
+{
+	struct augmented_args_payload *augmented_args = augmented_args_payload();
+	const void *req_arg = (const void *)args->args[0];
+	unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs
+	__u32 size = sizeof(struct timespec64);
+
+	if (augmented_args == NULL)
+		goto failure;
+
+	if (size > sizeof(augmented_args->arg.value))
+		goto failure;
+
+	bpf_probe_read_user(&augmented_args->arg.value, size, req_arg);
+
+	return augmented__output(args, augmented_args, len + size);
+failure:
+	return 1; /* Failure: don't filter */
+}
+
+/*
+ * Return bpf_get_current_pid_tgid() converted to pid_t.
+ * NOTE(review): the conversion presumably keeps only the low 32 bits of the
+ * helper's 64-bit result -- confirm that half is the intended filter key.
+ */
+static pid_t getpid(void)
+{
+ return bpf_get_current_pid_tgid();
+}
+
+/* Tell whether @pid is a key of the @pids map; the stored value is ignored. */
+static bool pid_filter__has(struct pids_filtered *pids, pid_t pid)
+{
+	void *entry = bpf_map_lookup_elem(pids, &pid);
+
+	return entry != NULL;
+}
+
+/*
+ * Generic, table driven augmenter for sys_enter.
+ *
+ * @ctx: tracepoint context, handed to bpf_perf_event_output()
+ * @args: copy of the raw sys_enter args (syscall number + six arguments)
+ *
+ * Looks up the syscall number in beauty_map_enter to learn, for each of the
+ * six arguments, whether it is a string, a fixed size struct or a buffer
+ * whose length is another argument, copies the matching bytes from user
+ * space and packs them back to back after the raw args in the per-CPU
+ * beauty_payload_enter_map entry.
+ *
+ * Returns 1 when nothing was emitted here (no table entry, no payload
+ * buffer, nothing augmented, size check failed), so that the caller falls
+ * back to the predefined tail call. Otherwise returns the result of
+ * augmented__beauty_output().
+ */
+static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
+{
+	bool augmented, do_output = false;
+	int zero = 0, index, value_size = sizeof(struct augmented_arg) - offsetof(struct augmented_arg, value);
+	u64 output = 0; /* has to be u64, otherwise it won't pass the verifier */
+	s64 aug_size, size;
+	unsigned int nr, *beauty_map;
+	struct beauty_payload_enter *payload;
+	void *arg, *payload_offset;
+
+	/* fall back to do predefined tail call */
+	if (args == NULL)
+		return 1;
+
+	/* use syscall number to get beauty_map entry */
+	nr = (__u32)args->syscall_nr;
+	beauty_map = bpf_map_lookup_elem(&beauty_map_enter, &nr);
+
+	/* set up payload for output */
+	payload = bpf_map_lookup_elem(&beauty_payload_enter_map, &zero);
+	payload_offset = (void *)&payload->aug_args;
+
+	if (beauty_map == NULL || payload == NULL)
+		return 1;
+
+	/* copy the sys_enter header, which has the syscall_nr */
+	__builtin_memcpy(&payload->args, args, sizeof(struct syscall_enter_args));
+
+	/*
+	 * Determine what type of argument and how many bytes to read from user space, using the
+	 * value in the beauty_map. This is the relation of parameter type and its corresponding
+	 * value in the beauty map, and how many bytes we read eventually:
+	 *
+	 * string: 1 -> size of string
+	 * struct: size of struct -> size of struct
+	 * buffer: -1 * (index of paired len) -> value of paired len (maximum: TRACE_AUG_MAX_BUF)
+	 */
+	for (int i = 0; i < 6; i++) {
+		arg = (void *)args->args[i];
+		augmented = false;
+		size = beauty_map[i];
+		aug_size = size; /* size of the augmented data read from user space */
+
+		if (size == 0 || arg == NULL)
+			continue;
+
+		if (size == 1) { /* string */
+			aug_size = bpf_probe_read_user_str(((struct augmented_arg *)payload_offset)->value, value_size, arg);
+			/* minimum of 0 to pass the verifier */
+			if (aug_size < 0)
+				aug_size = 0;
+
+			augmented = true;
+		} else if (size > 0 && size <= value_size) { /* struct */
+			if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, size, arg))
+				augmented = true;
+		/*
+		 * 'size' was loaded from an unsigned int, so as an s64 it is never
+		 * negative: both range checks have to be done on the 32-bit signed
+		 * view, otherwise the lower bound is always true and values below
+		 * -6 would select an index past args[5] after the '&= 7' below.
+		 */
+		} else if ((int)size < 0 && (int)size >= -6) { /* buffer */
+			index = -(size + 1);
+			barrier_var(index); // Prevent clang (noticed with v18) from removing the &= 7 trick.
+			index &= 7;	    // Satisfy the bounds checking with the verifier in some kernels.
+			aug_size = args->args[index] > TRACE_AUG_MAX_BUF ? TRACE_AUG_MAX_BUF : args->args[index];
+
+			if (aug_size > 0) {
+				if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, aug_size, arg))
+					augmented = true;
+			}
+		}
+
+		/* Augmented data size is limited to sizeof(augmented_arg->unnamed union with value field) */
+		if (aug_size > value_size)
+			aug_size = value_size;
+
+		/* write data to payload */
+		if (augmented) {
+			/* header (size + err) plus the bytes just captured */
+			int written = offsetof(struct augmented_arg, value) + aug_size;
+
+			if (written < 0 || written > sizeof(struct augmented_arg))
+				return 1;
+
+			((struct augmented_arg *)payload_offset)->size = aug_size;
+			output += written;
+			/* the next augmented arg is packed right after this one */
+			payload_offset += written;
+			do_output = true;
+		}
+	}
+
+	if (!do_output || (sizeof(struct syscall_enter_args) + output) > sizeof(struct beauty_payload_enter))
+		return 1;
+
+	return augmented__beauty_output(ctx, payload, sizeof(struct syscall_enter_args) + output);
+}
+
+/*
+ * Entry point for every syscall entry: drop filtered pids, stash the raw
+ * args in the per-CPU scratch payload, try the table driven augmenter and,
+ * if that did not emit anything, tail-call the per-syscall handler.
+ *
+ * Returns 0 for filtered pids and when execution continues past the tail
+ * call; returns 1 when the scratch payload is unavailable.
+ */
+SEC("tp/raw_syscalls/sys_enter")
+int sys_enter(struct syscall_enter_args *args)
+{
+ struct augmented_args_payload *augmented_args;
+ /*
+ * The record that ends up in the perf ring buffer, if this is not
+ * filtered out by pid_filter__has(), starts with the non-augmented
+ * raw syscall payload, i.e. augmented_args->args.
+ *
+ * The handlers append the augmented arguments right after that
+ * initial, non-augmented raw_syscalls:sys_enter payload.
+ */
+
+ if (pid_filter__has(&pids_filtered, getpid()))
+ return 0;
+
+ augmented_args = augmented_args_payload();
+ if (augmented_args == NULL)
+ return 1;
+
+ /* keep a copy of the raw args where the tail-called handlers will find it */
+ bpf_probe_read_kernel(&augmented_args->args, sizeof(augmented_args->args), args);
+
+ /*
+ * Jump to syscall specific augmenter, even if the default one,
+ * "!raw_syscalls:unaugmented" that will just return 1 to return the
+ * unaugmented tracepoint payload.
+ */
+ if (augment_sys_enter(args, &augmented_args->args))
+ bpf_tail_call(args, &syscalls_sys_enter, augmented_args->args.syscall_nr);
+
+ // If not found on the PROG_ARRAY syscalls map, then we're filtering it:
+ // (this point is also reached when augment_sys_enter() returned 0)
+ return 0;
+}
+
+/*
+ * Entry point for every syscall exit: drop filtered pids, otherwise
+ * tail-call the per-syscall exit handler selected by the syscall number.
+ *
+ * Returns 0 for filtered pids and when execution continues past the tail
+ * call.
+ */
+SEC("tp/raw_syscalls/sys_exit")
+int sys_exit(struct syscall_exit_args *args)
+{
+ struct syscall_exit_args exit_args;
+
+ if (pid_filter__has(&pids_filtered, getpid()))
+ return 0;
+
+ /* local copy of the context, used to get at the syscall number */
+ bpf_probe_read_kernel(&exit_args, sizeof(exit_args), args);
+ /*
+ * Jump to syscall specific return augmenter, even if the default one,
+ * "!raw_syscalls:unaugmented" that will just return 1 to return the
+ * unaugmented tracepoint payload.
+ */
+ bpf_tail_call(args, &syscalls_sys_exit, exit_args.syscall_nr);
+ /*
+ * If not found on the PROG_ARRAY syscalls map, then we're filtering it:
+ */
+ return 0;
+}
+
+/* License string, placed in the "license" section of the object. */
+char _license[] SEC("license") = "GPL";
diff --git a/tools/perf/util/bpf_skel/bench_uprobe.bpf.c b/tools/perf/util/bpf_skel/bench_uprobe.bpf.c
new file mode 100644
index 000000000000..a01c7f791fcd
--- /dev/null
+++ b/tools/perf/util/bpf_skel/bench_uprobe.bpf.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (c) 2023 Red Hat
+#include "vmlinux.h"
+#include <bpf/bpf_tracing.h>
+
+/* Hit counters, incremented by the trace_printk and trace_printk_ret programs. */
+unsigned int nr_uprobes;
+unsigned int nr_uretprobes;
+
+/* uprobe program that does nothing but return 0. */
+SEC("uprobe")
+int BPF_UPROBE(empty)
+{
+ return 0;
+}
+
+/* uprobe program that bumps nr_uprobes and prints the new count via bpf_trace_printk(). */
+SEC("uprobe")
+int BPF_UPROBE(trace_printk)
+{
+ char fmt[] = "perf bench uprobe %u";
+
+ bpf_trace_printk(fmt, sizeof(fmt), ++nr_uprobes);
+ return 0;
+}
+
+/* uretprobe program that does nothing but return 0. */
+SEC("uretprobe")
+int BPF_URETPROBE(empty_ret)
+{
+ return 0;
+}
+
+/* uretprobe program that bumps nr_uretprobes and prints the new count via bpf_trace_printk(). */
+SEC("uretprobe")
+int BPF_URETPROBE(trace_printk_ret)
+{
+ char fmt[] = "perf bench uretprobe %u";
+
+ bpf_trace_printk(fmt, sizeof(fmt), ++nr_uretprobes);
+ return 0;
+}
+
+/* License string, placed in the "license" section of the object. */
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
diff --git a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
new file mode 100644
index 000000000000..57cab7647a9a
--- /dev/null
+++ b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c
@@ -0,0 +1,227 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (c) 2021 Facebook
+// Copyright (c) 2021 Google
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+/* Upper bounds for the loops in this file; both are compile time constants. */
+#define MAX_LEVELS 10 // max cgroup hierarchy level: arbitrary
+#define MAX_EVENTS 32 // max events per cgroup: arbitrary
+
+// NOTE: many of map and global data will be modified before loading
+// from the userspace (perf tool) using the skeleton helpers.
+
+// single set of global perf events to measure
+// (read in bperf_cgroup_count() with key = event index * num_cpus + cpu)
+struct {
+ __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(int));
+ __uint(max_entries, 1);
+} events SEC(".maps");
+
+// from cgroup id to event index
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(key_size, sizeof(__u64));
+ __uint(value_size, sizeof(__u32));
+ __uint(max_entries, 1);
+} cgrp_idx SEC(".maps");
+
+// per-cpu event snapshots to calculate delta
+// (no max_entries here; per the NOTE above it is presumably set from
+// userspace before loading -- confirm)
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(struct bpf_perf_event_value));
+} prev_readings SEC(".maps");
+
+// aggregated event values for each cgroup (per-cpu)
+// will be read from the user-space
+// (indexed in bperf_cgroup_count() with key = cgroup index * num_events + event index)
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(struct bpf_perf_event_value));
+} cgrp_readings SEC(".maps");
+
+/*
+ * new kernel cgroup definition: ancestors are stored as cgroup pointers.
+ * Only the fields read by this file are declared.
+ */
+struct cgroup___new {
+ int level;
+ struct cgroup *ancestors[];
+} __attribute__((preserve_access_index));
+
+/*
+ * old kernel cgroup definition: ancestors are stored as 64-bit ids.
+ * Only the fields read by this file are declared.
+ */
+struct cgroup___old {
+ int level;
+ u64 ancestor_ids[];
+} __attribute__((preserve_access_index));
+
+/* Read-only configuration; the values below are defaults (see the NOTE near the maps). */
+const volatile __u32 num_events = 1; /* number of events processed per call */
+const volatile __u32 num_cpus = 1; /* stride used to index the events map */
+const volatile int use_cgroup_v2 = 0; /* selects get_cgroup_v2_idx() over the v1 variant */
+
+/* When zero, readings are still snapshotted but not added to cgrp_readings. */
+int enabled = 0;
+/* Cached perf_event cgroup subsystem id; -1 until get_cgroup_v1_idx() resolves it. */
+int perf_subsys_id = -1;
+
+/*
+ * Return the id of the ancestor of @cgrp at depth @level, whichever of the
+ * two struct cgroup layouts (ancestors[] pointers or ancestor_ids[]) the
+ * running kernel has.
+ */
+static inline __u64 get_cgroup_v1_ancestor_id(struct cgroup *cgrp, int level)
+{
+	/* recast the pointer so the compiler sees each layout's type */
+	struct cgroup___new *new_layout = (void *)cgrp;
+	struct cgroup___old *old_layout = (void *)cgrp;
+
+	if (!bpf_core_field_exists(new_layout->ancestors))
+		return BPF_CORE_READ(old_layout, ancestor_ids[level]);
+
+	return BPF_CORE_READ(new_layout, ancestors[level], kn, id);
+}
+
+/*
+ * cgroup v1: walk the current task's perf_event cgroup from depth 0 down to
+ * its own level (at most MAX_LEVELS entries) and store in @cgrps the
+ * cgrp_idx map value of every ancestor that has one.
+ *
+ * @cgrps: output array
+ * @size: capacity of @cgrps
+ *
+ * Returns the number of entries stored.
+ */
+static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
+{
+ struct task_struct *p = (void *)bpf_get_current_task();
+ struct cgroup *cgrp;
+ register int i = 0;
+ __u32 *elem;
+ int level;
+ int cnt;
+
+ /* resolve the subsystem id once and cache it in the global */
+ if (perf_subsys_id == -1) {
+#if __has_builtin(__builtin_preserve_enum_value)
+ perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
+ perf_event_cgrp_id);
+#else
+ perf_subsys_id = perf_event_cgrp_id;
+#endif
+ }
+ cgrp = BPF_CORE_READ(p, cgroups, subsys[perf_subsys_id], cgroup);
+ level = BPF_CORE_READ(cgrp, level);
+
+ for (cnt = 0; i < MAX_LEVELS; i++) {
+ __u64 cgrp_id;
+
+ if (i > level)
+ break;
+
+ // convert cgroup-id to a map index
+ cgrp_id = get_cgroup_v1_ancestor_id(cgrp, i);
+ elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
+ /* ancestors without a cgrp_idx entry are skipped */
+ if (!elem)
+ continue;
+
+ cgrps[cnt++] = *elem;
+ if (cnt == size)
+ break;
+ }
+
+ return cnt;
+}
+
+/*
+ * cgroup v2: query the current task's ancestor cgroup ids for levels
+ * 0..MAX_LEVELS-1 and store in @cgrps the cgrp_idx map value of every
+ * ancestor that has one. The walk stops at the first level whose id is 0.
+ *
+ * @cgrps: output array
+ * @size: capacity of @cgrps
+ *
+ * Returns the number of entries stored.
+ */
+static inline int get_cgroup_v2_idx(__u32 *cgrps, int size)
+{
+ register int i = 0;
+ __u32 *elem;
+ int cnt;
+
+ for (cnt = 0; i < MAX_LEVELS; i++) {
+ __u64 cgrp_id = bpf_get_current_ancestor_cgroup_id(i);
+
+ if (cgrp_id == 0)
+ break;
+
+ // convert cgroup-id to a map index
+ elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
+ /* ancestors without a cgrp_idx entry are skipped */
+ if (!elem)
+ continue;
+
+ cgrps[cnt++] = *elem;
+ if (cnt == size)
+ break;
+ }
+
+ return cnt;
+}
+
+/*
+ * For each configured event: read its current value on this CPU, compute the
+ * delta against the previous snapshot and, when counting is enabled, add
+ * that delta to every tracked cgroup the current task belongs to. The
+ * snapshot is refreshed whether or not counting is enabled.
+ *
+ * Always returns 0.
+ */
+static int bperf_cgroup_count(void)
+{
+ register __u32 idx = 0; // to have it in a register to pass BPF verifier
+ register int c = 0;
+ struct bpf_perf_event_value val, delta, *prev_val, *cgrp_val;
+ __u32 cpu = bpf_get_smp_processor_id();
+ /* NOTE: this local array shadows the global cgrp_idx map inside this function */
+ __u32 cgrp_idx[MAX_LEVELS];
+ int cgrp_cnt;
+ __u32 key, cgrp;
+ long err;
+
+ if (use_cgroup_v2)
+ cgrp_cnt = get_cgroup_v2_idx(cgrp_idx, MAX_LEVELS);
+ else
+ cgrp_cnt = get_cgroup_v1_idx(cgrp_idx, MAX_LEVELS);
+
+ for ( ; idx < MAX_EVENTS; idx++) {
+ if (idx == num_events)
+ break;
+
+ // XXX: do not pass idx directly (for verifier)
+ key = idx;
+ // this is per-cpu array for diff
+ prev_val = bpf_map_lookup_elem(&prev_readings, &key);
+ if (!prev_val) {
+ /* no snapshot yet: try to create a zeroed one and look it up again */
+ val.counter = val.enabled = val.running = 0;
+ bpf_map_update_elem(&prev_readings, &key, &val, BPF_ANY);
+
+ prev_val = bpf_map_lookup_elem(&prev_readings, &key);
+ if (!prev_val)
+ continue;
+ }
+
+ // read from global perf_event array
+ key = idx * num_cpus + cpu;
+ err = bpf_perf_event_read_value(&events, key, &val, sizeof(val));
+ if (err)
+ continue;
+
+ if (enabled) {
+ delta.counter = val.counter - prev_val->counter;
+ delta.enabled = val.enabled - prev_val->enabled;
+ delta.running = val.running - prev_val->running;
+
+ for (c = 0; c < MAX_LEVELS; c++) {
+ if (c == cgrp_cnt)
+ break;
+
+ cgrp = cgrp_idx[c];
+
+ // aggregate the result by cgroup
+ key = cgrp * num_events + idx;
+ cgrp_val = bpf_map_lookup_elem(&cgrp_readings, &key);
+ if (cgrp_val) {
+ cgrp_val->counter += delta.counter;
+ cgrp_val->enabled += delta.enabled;
+ cgrp_val->running += delta.running;
+ } else {
+ bpf_map_update_elem(&cgrp_readings, &key,
+ &delta, BPF_ANY);
+ }
+ }
+ }
+
+ /* remember this reading as the base for the next delta */
+ *prev_val = val;
+ }
+ return 0;
+}
+
+// This will be attached to cgroup-switches event for each cpu
+// Thin wrapper: all the work is done by bperf_cgroup_count().
+SEC("perf_event")
+int BPF_PROG(on_cgrp_switch)
+{
+ return bperf_cgroup_count();
+}
+
+/* Second entry point into bperf_cgroup_count(), declared as a sched_switch raw tracepoint program. */
+SEC("raw_tp/sched_switch")
+int BPF_PROG(trigger_read)
+{
+ return bperf_cgroup_count();
+}
+
+/* License string, placed in the "license" section of the object. */
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
diff --git a/tools/perf/util/bpf_skel/bperf_follower.bpf.c b/tools/perf/util/bpf_skel/bperf_follower.bpf.c
new file mode 100644
index 000000000000..0595063139a3
--- /dev/null
+++ b/tools/perf/util/bpf_skel/bperf_follower.bpf.c
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (c) 2021 Facebook
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bperf_u.h"
+
+/* Capacity of the filter hash map below. */
+#define MAX_ENTRIES 102400
+
+/* Per-CPU delta that fexit_XXX adds to the accumulated readings; this file only reads it. */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(struct bpf_perf_event_value));
+ __uint(max_entries, 1);
+} diff_readings SEC(".maps");
+
+/* Per-CPU accumulated readings, indexed by accum key (see fexit_XXX). */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(struct bpf_perf_event_value));
+ __uint(max_entries, 1);
+} accum_readings SEC(".maps");
+
+/*
+ * Targets being counted: maps a cpu, pid or tgid (depending on the filter
+ * type) to a struct bperf_filter_value carrying the accum key and an
+ * "exited" flag.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(struct bperf_filter_value));
+ __uint(max_entries, MAX_ENTRIES);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+} filter SEC(".maps");
+
+/* Filter type in use; 0 matches none of the cases handled by the programs below. */
+enum bperf_filter_type type = 0;
+/* All programs in this file return immediately while this is zero. */
+int enabled = 0;
+/* For BPERF_FILTER_TGID: non-zero keys the filter on the tgid, zero on the pid. */
+int inherit;
+
+/*
+ * Add this CPU's latest delta (diff_readings[0]) to the accumulator selected
+ * by the filter type: slot 0 for the global filter, otherwise the accum key
+ * stored in the filter map entry of the current cpu/pid/tgid. Nothing is
+ * added when the current cpu/pid/tgid has no filter entry.
+ *
+ * Always returns 0.
+ */
+SEC("fexit/XXX")
+int BPF_PROG(fexit_XXX)
+{
+ struct bpf_perf_event_value *diff_val, *accum_val;
+ __u32 filter_key, zero = 0;
+ __u32 accum_key;
+ struct bperf_filter_value *fval;
+
+ if (!enabled)
+ return 0;
+
+ switch (type) {
+ case BPERF_FILTER_GLOBAL:
+ /* no filter lookup needed: everything goes to slot 0 */
+ accum_key = zero;
+ goto do_add;
+ case BPERF_FILTER_CPU:
+ filter_key = bpf_get_smp_processor_id();
+ break;
+ case BPERF_FILTER_PID:
+ filter_key = bpf_get_current_pid_tgid() & 0xffffffff;
+ break;
+ case BPERF_FILTER_TGID:
+ /* Use pid as the filter_key to exclude new task counts
+ * when inherit is disabled. Don't worry about the existing
+ * children in TGID losing their counts, bpf_counter has
+ * already added them to the filter map via perf_thread_map
+ * before this bpf prog runs.
+ */
+ filter_key = inherit ?
+ bpf_get_current_pid_tgid() >> 32 :
+ bpf_get_current_pid_tgid() & 0xffffffff;
+ break;
+ default:
+ return 0;
+ }
+
+ fval = bpf_map_lookup_elem(&filter, &filter_key);
+ if (!fval)
+ return 0;
+
+ accum_key = fval->accum_key;
+ /* entries flagged by on_exittask are removed here, after one last accumulation */
+ if (fval->exited)
+ bpf_map_delete_elem(&filter, &filter_key);
+
+do_add:
+ diff_val = bpf_map_lookup_elem(&diff_readings, &zero);
+ if (!diff_val)
+ return 0;
+
+ accum_val = bpf_map_lookup_elem(&accum_readings, &accum_key);
+ if (!accum_val)
+ return 0;
+
+ accum_val->counter += diff_val->counter;
+ accum_val->enabled += diff_val->enabled;
+ accum_val->running += diff_val->running;
+
+ return 0;
+}
+
+/* The program is only used for PID or TGID filter types.
+ *
+ * When the current task is a counted target, add the newly created task to
+ * the filter map with the same accum key. For the TGID type nothing is done
+ * when the new task shares the current tgid.
+ *
+ * Always returns 0.
+ */
+SEC("tp_btf/task_newtask")
+int BPF_PROG(on_newtask, struct task_struct *task, __u64 clone_flags)
+{
+ __u32 parent_key, child_key;
+ struct bperf_filter_value *parent_fval;
+ struct bperf_filter_value child_fval = { 0 };
+
+ if (!enabled)
+ return 0;
+
+ switch (type) {
+ case BPERF_FILTER_PID:
+ parent_key = bpf_get_current_pid_tgid() & 0xffffffff;
+ child_key = task->pid;
+ break;
+ case BPERF_FILTER_TGID:
+ parent_key = bpf_get_current_pid_tgid() >> 32;
+ child_key = task->tgid;
+ if (child_key == parent_key)
+ return 0;
+ break;
+ default:
+ return 0;
+ }
+
+ /* Check if the current task is one of the target tasks to be counted */
+ parent_fval = bpf_map_lookup_elem(&filter, &parent_key);
+ if (!parent_fval)
+ return 0;
+
+ /* Start counting for the new task by adding it into filter map,
+ * inherit the accum key of its parent task so that they can be
+ * counted together.
+ */
+ child_fval.accum_key = parent_fval->accum_key;
+ child_fval.exited = 0;
+ /* BPF_NOEXIST: an entry already present for child_key is left untouched */
+ bpf_map_update_elem(&filter, &child_key, &child_fval, BPF_NOEXIST);
+
+ return 0;
+}
+
+/* The program is only used for PID or TGID filter types.
+ *
+ * Flags the exiting task's filter entry, if it has one, as exited. The
+ * entry itself is deleted by fexit_XXX the next time it looks it up, which
+ * is what finally stops the counting for this task.
+ *
+ * For the TGID type, finding the pid in the map means that this pid belongs
+ * to the leader task. After the task exits, the tgid of its child tasks
+ * (if any) will be 1, so the pid can be safely removed.
+ */
+SEC("tp_btf/sched_process_exit")
+int BPF_PROG(on_exittask, struct task_struct *task)
+{
+	struct bperf_filter_value *entry;
+	__u32 key;
+
+	if (enabled) {
+		key = task->pid;
+		entry = bpf_map_lookup_elem(&filter, &key);
+		if (entry != NULL)
+			entry->exited = 1;
+	}
+
+	return 0;
+}
+
+/* License string, placed in the "license" section of the object. */
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
diff --git a/tools/perf/util/bpf_skel/bperf_leader.bpf.c b/tools/perf/util/bpf_skel/bperf_leader.bpf.c
new file mode 100644
index 000000000000..e2a2d4cd7779
--- /dev/null
+++ b/tools/perf/util/bpf_skel/bperf_leader.bpf.c
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (c) 2021 Facebook
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(int));
+ __uint(map_flags, BPF_F_PRESERVE_ELEMS);
+} events SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(struct bpf_perf_event_value));
+ __uint(max_entries, 1);
+} prev_readings SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(struct bpf_perf_event_value));
+ __uint(max_entries, 1);
+} diff_readings SEC(".maps");
+
+SEC("raw_tp/sched_switch")
+int BPF_PROG(on_switch)
+{
+	struct bpf_perf_event_value *last, *delta;
+	struct bpf_perf_event_value now;
+	__u32 cpu = bpf_get_smp_processor_id();
+	__u32 slot = 0;
+	long ret;
+
+	/* Both per-CPU slots must be available before touching the event. */
+	last = bpf_map_lookup_elem(&prev_readings, &slot);
+	if (!last)
+		return 0;
+
+	delta = bpf_map_lookup_elem(&diff_readings, &slot);
+	if (!delta)
+		return 0;
+
+	/* Read the event stored at this CPU's index. */
+	ret = bpf_perf_event_read_value(&events, cpu, &now, sizeof(now));
+	if (ret)
+		return 0;
+
+	/* Publish the change since the previous reading, then remember it. */
+	delta->counter = now.counter - last->counter;
+	delta->enabled = now.enabled - last->enabled;
+	delta->running = now.running - last->running;
+	*last = now;
+	return 0;
+}
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
diff --git a/tools/perf/util/bpf_skel/bperf_u.h b/tools/perf/util/bpf_skel/bperf_u.h
new file mode 100644
index 000000000000..4a4a753980be
--- /dev/null
+++ b/tools/perf/util/bpf_skel/bperf_u.h
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (c) 2021 Facebook
+
+#ifndef __BPERF_STAT_U_H
+#define __BPERF_STAT_U_H
+
+/*
+ * Kind of filter applied by the bperf programs. Values start at 1, so 0 is
+ * not a valid filter type.
+ */
+enum bperf_filter_type {
+ BPERF_FILTER_GLOBAL = 1,
+ BPERF_FILTER_CPU,
+ BPERF_FILTER_PID, /* keyed by task pid in on_newtask */
+ BPERF_FILTER_TGID, /* keyed by task tgid in on_newtask */
+};
+
+/* Value stored in the filter map for each tracked key. */
+struct bperf_filter_value {
+ __u32 accum_key; /* copied from the creating task's entry for new tasks */
+ __u8 exited; /* set to 1 when the task exits, 0 when the entry is added */
+};
+
+#endif /* __BPERF_STAT_U_H */
diff --git a/tools/perf/util/bpf_skel/bpf_prog_profiler.bpf.c b/tools/perf/util/bpf_skel/bpf_prog_profiler.bpf.c
new file mode 100644
index 000000000000..97037d3b3d9f
--- /dev/null
+++ b/tools/perf/util/bpf_skel/bpf_prog_profiler.bpf.c
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (c) 2020 Facebook
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+/* map of perf event fds, num_cpu * num_metric entries */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(int));
+} events SEC(".maps");
+
+/* readings at fentry */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(struct bpf_perf_event_value));
+ __uint(max_entries, 1);
+} fentry_readings SEC(".maps");
+
+/* accumulated readings */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(struct bpf_perf_event_value));
+ __uint(max_entries, 1);
+} accum_readings SEC(".maps");
+
+const volatile __u32 num_cpu = 1;
+
+SEC("fentry/XXX")
+int BPF_PROG(fentry_XXX)
+{
+	struct bpf_perf_event_value *slot;
+	__u32 cpu = bpf_get_smp_processor_id();
+	__u32 idx = 0;
+
+	/* look up before reading, to reduce error */
+	slot = bpf_map_lookup_elem(&fentry_readings, &idx);
+	if (slot)
+		bpf_perf_event_read_value(&events, cpu, slot, sizeof(*slot));
+
+	/* The result is 0 whether or not the read succeeded. */
+	return 0;
+}
+
+/*
+ * Add the difference between @after and this CPU's fentry reading to the
+ * accumulated readings.
+ */
+static inline void
+fexit_update_maps(struct bpf_perf_event_value *after)
+{
+	struct bpf_perf_event_value *entry, *sum;
+	struct bpf_perf_event_value delta;
+	__u32 idx = 0;
+
+	/* only account samples with a valid fentry_reading */
+	entry = bpf_map_lookup_elem(&fentry_readings, &idx);
+	if (!entry || !entry->counter)
+		return;
+
+	delta.counter = after->counter - entry->counter;
+	delta.enabled = after->enabled - entry->enabled;
+	delta.running = after->running - entry->running;
+
+	sum = bpf_map_lookup_elem(&accum_readings, &idx);
+	if (!sum)
+		return;
+
+	sum->counter += delta.counter;
+	sum->enabled += delta.enabled;
+	sum->running += delta.running;
+}
+
+SEC("fexit/XXX")
+int BPF_PROG(fexit_XXX)
+{
+	struct bpf_perf_event_value now;
+	__u32 key = bpf_get_smp_processor_id();
+	int ret;
+
+	/* read all events before updating the maps, to reduce error */
+	ret = bpf_perf_event_read_value(&events, key, &now, sizeof(now));
+	if (!ret)
+		fexit_update_maps(&now);
+
+	return 0;
+}
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
diff --git a/tools/perf/util/bpf_skel/func_latency.bpf.c b/tools/perf/util/bpf_skel/func_latency.bpf.c
new file mode 100644
index 000000000000..e731a79a753a
--- /dev/null
+++ b/tools/perf/util/bpf_skel/func_latency.bpf.c
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (c) 2021 Google
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+// This should be in sync with "util/ftrace.h"
+#define NUM_BUCKET 22
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(key_size, sizeof(__u64));
+ __uint(value_size, sizeof(__u64));
+ __uint(max_entries, 10000);
+} functime SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(__u8));
+ __uint(max_entries, 1);
+} cpu_filter SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(__u8));
+ __uint(max_entries, 1);
+} task_filter SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(__u64));
+ __uint(max_entries, NUM_BUCKET);
+} latency SEC(".maps");
+
+
+int enabled = 0;
+
+// stats
+__s64 total;
+__s64 count;
+__s64 max;
+__s64 min;
+
+const volatile int has_cpu = 0;
+const volatile int has_task = 0;
+const volatile int use_nsec = 0;
+const volatile unsigned int bucket_range;
+const volatile unsigned int min_latency;
+const volatile unsigned int max_latency;
+const volatile unsigned int bucket_num = NUM_BUCKET;
+
+SEC("kprobe/func")
+int BPF_PROG(func_begin)
+{
+	__u64 pid_tgid, ts;
+
+	if (!enabled)
+		return 0;
+
+	pid_tgid = bpf_get_current_pid_tgid();
+
+	/* Optional CPU filter: ignore CPUs that are not in the map. */
+	if (has_cpu) {
+		__u32 cpu = bpf_get_smp_processor_id();
+
+		if (!bpf_map_lookup_elem(&cpu_filter, &cpu))
+			return 0;
+	}
+
+	/* Optional task filter, keyed by the low 32 bits of pid_tgid. */
+	if (has_task) {
+		__u32 pid = pid_tgid & 0xffffffff;
+
+		if (!bpf_map_lookup_elem(&task_filter, &pid))
+			return 0;
+	}
+
+	ts = bpf_ktime_get_ns();
+
+	// overwrite timestamp for nested functions
+	bpf_map_update_elem(&functime, &pid_tgid, &ts, BPF_ANY);
+	return 0;
+}
+
+/*
+ * On function return: compute the elapsed time since the timestamp stored in
+ * functime for this pid_tgid, bump the matching histogram bucket in latency
+ * and update the total/count/max/min statistics.
+ */
+SEC("kretprobe/func")
+int BPF_PROG(func_end)
+{
+ __u64 tid;
+ __u64 *start;
+ /* divisor from nsec to the bucket unit: 1 (nsec) or 1000 (usec) */
+ __u64 cmp_base = use_nsec ? 1 : 1000;
+
+ if (!enabled)
+ return 0;
+
+ tid = bpf_get_current_pid_tgid();
+
+ /* nothing to do unless a start timestamp exists for this pid_tgid */
+ start = bpf_map_lookup_elem(&functime, &tid);
+ if (start) {
+ /* delta is read before the entry is deleted below */
+ __s64 delta = bpf_ktime_get_ns() - *start;
+ __u64 val = delta;
+ __u32 key = 0;
+ __u64 *hist;
+
+ bpf_map_delete_elem(&functime, &tid);
+
+ if (delta < 0)
+ return 0;
+
+ /* linear buckets of bucket_range units when a range is set */
+ if (bucket_range != 0) {
+ val = delta / cmp_base;
+
+ /* values at or below min_latency land in bucket 0 */
+ if (min_latency > 0) {
+ if (val > min_latency)
+ val -= min_latency;
+ else
+ goto do_lookup;
+ }
+
+ // Less than 1 unit (us or ns), or, in the future,
+ // than the min latency desired.
+ if (val > 0) { // 1st entry: [ 1 unit .. bucket_range units )
+ key = val / bucket_range + 1;
+ /* clamp to the last bucket */
+ if (key >= bucket_num)
+ key = bucket_num - 1;
+ }
+
+ goto do_lookup;
+ }
+ // calculate index using delta
+ /* first key with delta < cmp_base * 2^key, capped at the last bucket */
+ for (key = 0; key < (bucket_num - 1); key++) {
+ if (delta < (cmp_base << key))
+ break;
+ }
+
+do_lookup:
+ hist = bpf_map_lookup_elem(&latency, &key);
+ if (!hist)
+ return 0;
+
+ __sync_fetch_and_add(hist, 1);
+
+ __sync_fetch_and_add(&total, delta); // always in nsec
+ __sync_fetch_and_add(&count, 1);
+
+ /* max/min are plain compare-and-store updates, not atomic */
+ if (delta > max)
+ max = delta;
+ if (delta < min)
+ min = delta;
+ }
+
+ return 0;
+}
diff --git a/tools/perf/util/bpf_skel/kwork_top.bpf.c b/tools/perf/util/bpf_skel/kwork_top.bpf.c
new file mode 100644
index 000000000000..73e32e063030
--- /dev/null
+++ b/tools/perf/util/bpf_skel/kwork_top.bpf.c
@@ -0,0 +1,340 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (c) 2022, Huawei
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+/*
+ * This should be in sync with "util/kwork.h"
+ */
+enum kwork_class_type {
+ KWORK_CLASS_IRQ,
+ KWORK_CLASS_SOFTIRQ,
+ KWORK_CLASS_WORKQUEUE,
+ KWORK_CLASS_SCHED,
+ KWORK_CLASS_MAX,
+};
+
+#define MAX_ENTRIES 102400
+#ifndef MAX_NR_CPUS
+#define MAX_NR_CPUS 4096
+#endif
+#define PF_KTHREAD 0x00200000
+#define MAX_COMMAND_LEN 16
+
+struct time_data {
+ __u64 timestamp;
+};
+
+struct work_data {
+ __u64 runtime;
+};
+
+struct task_data {
+ __u32 tgid;
+ __u32 is_kthread;
+ char comm[MAX_COMMAND_LEN];
+};
+
+struct work_key {
+ __u32 type;
+ __u32 pid;
+ __u64 task_p;
+};
+
+struct task_key {
+ __u32 pid;
+ __u32 cpu;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, struct time_data);
+} kwork_top_task_time SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_HASH);
+ __uint(key_size, sizeof(struct work_key));
+ __uint(value_size, sizeof(struct time_data));
+ __uint(max_entries, MAX_ENTRIES);
+} kwork_top_irq_time SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(key_size, sizeof(struct task_key));
+ __uint(value_size, sizeof(struct task_data));
+ __uint(max_entries, MAX_ENTRIES);
+} kwork_top_tasks SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_HASH);
+ __uint(key_size, sizeof(struct work_key));
+ __uint(value_size, sizeof(struct work_data));
+ __uint(max_entries, MAX_ENTRIES);
+} kwork_top_works SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(key_size, sizeof(u32));
+ __uint(value_size, sizeof(u8));
+ __uint(max_entries, MAX_NR_CPUS);
+} kwork_top_cpu_filter SEC(".maps");
+
+int enabled = 0;
+
+const volatile int has_cpu_filter = 0;
+
+__u64 from_timestamp = 0;
+__u64 to_timestamp = 0;
+
+/* Return 1 when events on @cpu must be ignored, 0 otherwise. */
+static __always_inline int cpu_is_filtered(__u32 cpu)
+{
+	/* Without a CPU filter every CPU is accepted. */
+	if (!has_cpu_filter)
+		return 0;
+
+	/* With a filter, a CPU missing from the map is rejected. */
+	return bpf_map_lookup_elem(&kwork_top_cpu_filter, &cpu) ? 0 : 1;
+}
+
+/* Record tgid, kthread flag and comm for (@task->pid, @cpu) once. */
+static __always_inline void update_task_info(struct task_struct *task, __u32 cpu)
+{
+	struct task_key key = {
+		.pid = task->pid,
+		.cpu = cpu,
+	};
+
+	/* An existing entry is left as it is. */
+	if (bpf_map_lookup_elem(&kwork_top_tasks, &key))
+		return;
+
+	struct task_data data = {
+		.tgid = task->tgid,
+		.is_kthread = task->flags & PF_KTHREAD ? 1 : 0,
+	};
+
+	BPF_CORE_READ_STR_INTO(&data.comm, task, comm);
+	bpf_map_update_elem(&kwork_top_tasks, &key, &data, BPF_ANY);
+}
+
+/* Add @delta to the runtime stored for @key, creating the entry if needed. */
+static __always_inline void update_work(struct work_key *key, __u64 delta)
+{
+	struct work_data *cur = bpf_map_lookup_elem(&kwork_top_works, key);
+
+	if (cur) {
+		cur->runtime += delta;
+		return;
+	}
+
+	/* First sample for this key: start the runtime at @delta. */
+	struct work_data first = {
+		.runtime = delta,
+	};
+
+	bpf_map_update_elem(&kwork_top_works, key, &first, BPF_ANY);
+}
+
+/* Account the time @task ran until @ts and record its task info. */
+static void on_sched_out(struct task_struct *task, __u64 ts, __u32 cpu)
+{
+	struct time_data *sched_in;
+	__u64 since;
+
+	/* Fall back to from_timestamp when no start time is stored for the task. */
+	sched_in = bpf_task_storage_get(&kwork_top_task_time, task, NULL, 0);
+	since = sched_in ? sched_in->timestamp : from_timestamp;
+
+	struct work_key key = {
+		.type = KWORK_CLASS_SCHED,
+		.pid = task->pid,
+		.task_p = (__u64)task,
+	};
+
+	update_work(&key, ts - since);
+	update_task_info(task, cpu);
+}
+
+/* Store @ts as the start time of @task, creating its storage if needed. */
+static void on_sched_in(struct task_struct *task, __u64 ts)
+{
+	struct time_data *slot;
+
+	slot = bpf_task_storage_get(&kwork_top_task_time, task, NULL,
+				    BPF_LOCAL_STORAGE_GET_F_CREATE);
+	if (!slot)
+		return;
+
+	slot->timestamp = ts;
+}
+
+SEC("tp_btf/sched_switch")
+int on_switch(u64 *ctx)
+{
+	/* ctx[1] and ctx[2] carry the outgoing and incoming tasks. */
+	struct task_struct *prev = (struct task_struct *)ctx[1];
+	struct task_struct *next = (struct task_struct *)ctx[2];
+	__u32 cpu;
+	__u64 now;
+
+	if (!enabled)
+		return 0;
+
+	cpu = bpf_get_smp_processor_id();
+	if (cpu_is_filtered(cpu))
+		return 0;
+
+	/* One timestamp closes prev's interval and opens next's. */
+	now = bpf_ktime_get_ns();
+	on_sched_out(prev, now, cpu);
+	on_sched_in(next, now);
+
+	return 0;
+}
+
+/*
+ * IRQ handler entry: store the current time in kwork_top_irq_time under an
+ * IRQ-class key built from the current task.
+ */
+SEC("tp_btf/irq_handler_entry")
+int on_irq_handler_entry(u64 *cxt)
+{
+ struct task_struct *task;
+
+ if (!enabled)
+ return 0;
+
+ __u32 cpu = bpf_get_smp_processor_id();
+
+ if (cpu_is_filtered(cpu))
+ return 0;
+
+ __u64 ts = bpf_ktime_get_ns();
+
+ task = (struct task_struct *)bpf_get_current_task();
+ if (!task)
+ return 0;
+
+ /* the key identifies the interrupted task, not the irq number */
+ struct work_key key = {
+ .type = KWORK_CLASS_IRQ,
+ .pid = BPF_CORE_READ(task, pid),
+ .task_p = (__u64)task,
+ };
+
+ struct time_data data = {
+ .timestamp = ts,
+ };
+
+ /* BPF_ANY: a previous timestamp for the same key is overwritten */
+ bpf_map_update_elem(&kwork_top_irq_time, &key, &data, BPF_ANY);
+
+ return 0;
+}
+
+/*
+ * IRQ handler exit: add the time since the matching entry timestamp to the
+ * IRQ-class runtime of the current task.
+ */
+SEC("tp_btf/irq_handler_exit")
+int on_irq_handler_exit(u64 *cxt)
+{
+ __u64 delta;
+ struct task_struct *task;
+ struct time_data *pelem;
+
+ if (!enabled)
+ return 0;
+
+ __u32 cpu = bpf_get_smp_processor_id();
+
+ if (cpu_is_filtered(cpu))
+ return 0;
+
+ __u64 ts = bpf_ktime_get_ns();
+
+ task = (struct task_struct *)bpf_get_current_task();
+ if (!task)
+ return 0;
+
+ /* same key layout as on_irq_handler_entry */
+ struct work_key key = {
+ .type = KWORK_CLASS_IRQ,
+ .pid = BPF_CORE_READ(task, pid),
+ .task_p = (__u64)task,
+ };
+
+ /*
+ * A missing entry or a zero timestamp is treated as "no entry seen";
+ * the interval is then measured from from_timestamp.
+ */
+ pelem = bpf_map_lookup_elem(&kwork_top_irq_time, &key);
+ if (pelem && pelem->timestamp != 0)
+ delta = ts - pelem->timestamp;
+ else
+ delta = ts - from_timestamp;
+
+ update_work(&key, delta);
+
+ return 0;
+}
+
+/*
+ * Softirq entry: store the current time in kwork_top_irq_time under a
+ * softirq-class key built from the current task.
+ */
+SEC("tp_btf/softirq_entry")
+int on_softirq_entry(u64 *cxt)
+{
+ struct task_struct *task;
+
+ if (!enabled)
+ return 0;
+
+ __u32 cpu = bpf_get_smp_processor_id();
+
+ if (cpu_is_filtered(cpu))
+ return 0;
+
+ __u64 ts = bpf_ktime_get_ns();
+
+ task = (struct task_struct *)bpf_get_current_task();
+ if (!task)
+ return 0;
+
+ /* the key identifies the current task, not the softirq vector */
+ struct work_key key = {
+ .type = KWORK_CLASS_SOFTIRQ,
+ .pid = BPF_CORE_READ(task, pid),
+ .task_p = (__u64)task,
+ };
+
+ struct time_data data = {
+ .timestamp = ts,
+ };
+
+ /* BPF_ANY: a previous timestamp for the same key is overwritten */
+ bpf_map_update_elem(&kwork_top_irq_time, &key, &data, BPF_ANY);
+
+ return 0;
+}
+
+/*
+ * Softirq exit: add the time since the matching entry timestamp to the
+ * softirq-class runtime of the current task.
+ */
+SEC("tp_btf/softirq_exit")
+int on_softirq_exit(u64 *cxt)
+{
+	__u64 delta;
+	struct task_struct *task;
+	struct time_data *pelem;
+
+	if (!enabled)
+		return 0;
+
+	__u32 cpu = bpf_get_smp_processor_id();
+
+	if (cpu_is_filtered(cpu))
+		return 0;
+
+	__u64 ts = bpf_ktime_get_ns();
+
+	task = (struct task_struct *)bpf_get_current_task();
+	if (!task)
+		return 0;
+
+	/* same key layout as on_softirq_entry */
+	struct work_key key = {
+		.type = KWORK_CLASS_SOFTIRQ,
+		.pid = BPF_CORE_READ(task, pid),
+		.task_p = (__u64)task,
+	};
+
+	/*
+	 * kwork_top_irq_time is a per-CPU hash, so the key may exist while
+	 * this CPU's slot never received a timestamp (presumably it then
+	 * reads as 0). Treat that like a missing entry, as
+	 * on_irq_handler_exit does, instead of accounting the whole "ts - 0".
+	 */
+	pelem = bpf_map_lookup_elem(&kwork_top_irq_time, &key);
+	if (pelem && pelem->timestamp != 0)
+		delta = ts - pelem->timestamp;
+	else
+		delta = ts - from_timestamp;
+
+	update_work(&key, delta);
+
+	return 0;
+}
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
diff --git a/tools/perf/util/bpf_skel/kwork_trace.bpf.c b/tools/perf/util/bpf_skel/kwork_trace.bpf.c
new file mode 100644
index 000000000000..9ce9c8dddc4b
--- /dev/null
+++ b/tools/perf/util/bpf_skel/kwork_trace.bpf.c
@@ -0,0 +1,384 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (c) 2022, Huawei
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#define KWORK_COUNT 100
+#define MAX_KWORKNAME 128
+
+/*
+ * This should be in sync with "util/kwork.h"
+ */
+enum kwork_class_type {
+ KWORK_CLASS_IRQ,
+ KWORK_CLASS_SOFTIRQ,
+ KWORK_CLASS_WORKQUEUE,
+ KWORK_CLASS_MAX,
+};
+
+struct work_key {
+ __u32 type;
+ __u32 cpu;
+ __u64 id;
+};
+
+struct report_data {
+ __u64 nr;
+ __u64 total_time;
+ __u64 max_time;
+ __u64 max_time_start;
+ __u64 max_time_end;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(key_size, sizeof(struct work_key));
+ __uint(value_size, MAX_KWORKNAME);
+ __uint(max_entries, KWORK_COUNT);
+} perf_kwork_names SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(key_size, sizeof(struct work_key));
+ __uint(value_size, sizeof(__u64));
+ __uint(max_entries, KWORK_COUNT);
+} perf_kwork_time SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(key_size, sizeof(struct work_key));
+ __uint(value_size, sizeof(struct report_data));
+ __uint(max_entries, KWORK_COUNT);
+} perf_kwork_report SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(__u8));
+ __uint(max_entries, 1);
+} perf_kwork_cpu_filter SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, MAX_KWORKNAME);
+ __uint(max_entries, 1);
+} perf_kwork_name_filter SEC(".maps");
+
+int enabled = 0;
+
+const volatile int has_cpu_filter = 0;
+const volatile int has_name_filter = 0;
+
+/*
+ * Compare at most @sz bytes of @s1 and @s2 as unsigned chars. Returns the
+ * difference at the first mismatch, or 0 when the strings agree up to the
+ * terminator of @s1 or up to @sz bytes.
+ */
+static __always_inline int local_strncmp(const char *s1,
+					 unsigned int sz, const char *s2)
+{
+	unsigned int pos = 0;
+	int diff = 0;
+
+	while (pos < sz) {
+		diff = (unsigned char)s1[pos] - (unsigned char)s2[pos];
+		if (diff != 0 || s1[pos] == '\0')
+			break;
+		pos++;
+	}
+
+	return diff;
+}
+
+/*
+ * Return 1 when the current event passes the enabled flag, the CPU filter
+ * and (when @name is given) the name filter; 0 otherwise. @key is unused.
+ */
+static __always_inline int trace_event_match(struct work_key *key, char *name)
+{
+	__u32 cpu = bpf_get_smp_processor_id();
+	__u32 idx = 0;
+	char *wanted;
+
+	if (!enabled)
+		return 0;
+
+	if (has_cpu_filter &&
+	    !bpf_map_lookup_elem(&perf_kwork_cpu_filter, &cpu))
+		return 0;
+
+	/* No name filter, or nothing to compare against it: accept. */
+	if (!has_name_filter || name == NULL)
+		return 1;
+
+	wanted = bpf_map_lookup_elem(&perf_kwork_name_filter, &idx);
+	if (!wanted)
+		return 1;
+
+	return local_strncmp(wanted, MAX_KWORKNAME, name) == 0;
+}
+
+/*
+ * Fold the interval [@time_start, @time_end] into the report entry stored
+ * for @key in @map, creating a zeroed entry first when none exists.
+ */
+static __always_inline void do_update_time(void *map, struct work_key *key,
+ __u64 time_start, __u64 time_end)
+{
+ struct report_data zero, *data;
+ __s64 delta = time_end - time_start;
+
+ /* ignore intervals whose end lies before their start */
+ if (delta < 0)
+ return;
+
+ data = bpf_map_lookup_elem(map, key);
+ if (!data) {
+ /* insert a zeroed entry, then look it up again to get a pointer */
+ __builtin_memset(&zero, 0, sizeof(zero));
+ bpf_map_update_elem(map, key, &zero, BPF_NOEXIST);
+ data = bpf_map_lookup_elem(map, key);
+ if (!data)
+ return;
+ }
+
+ /* track the longest interval together with its start and end time */
+ if ((delta > data->max_time) ||
+ (data->max_time == 0)) {
+ data->max_time = delta;
+ data->max_time_start = time_start;
+ data->max_time_end = time_end;
+ }
+
+ data->total_time += delta;
+ data->nr++;
+}
+
+/* Store the current time for @key in @map, replacing any earlier value. */
+static __always_inline void do_update_timestart(void *map, struct work_key *key)
+{
+	__u64 now;
+
+	now = bpf_ktime_get_ns();
+	bpf_map_update_elem(map, key, &now, BPF_ANY);
+}
+
+/*
+ * Close the interval opened for @key: take its start time out of @time_map
+ * and account [start, now] in @report_map. Does nothing when no start time
+ * is stored for @key.
+ */
+static __always_inline void do_update_timeend(void *report_map, void *time_map,
+					      struct work_key *key)
+{
+	__u64 *time = bpf_map_lookup_elem(time_map, key);
+	__u64 start;
+
+	if (!time)
+		return;
+
+	/*
+	 * Copy the value before deleting the element: once deleted, the
+	 * storage behind @time may be handed to another key, so it must not
+	 * be read afterwards.
+	 */
+	start = *time;
+	bpf_map_delete_elem(time_map, key);
+	do_update_time(report_map, key, start, bpf_ktime_get_ns());
+}
+
+/* Store @name for @key in @map unless the key already has an entry. */
+static __always_inline void do_update_name(void *map,
+					   struct work_key *key, char *name)
+{
+	if (bpf_map_lookup_elem(map, key))
+		return;
+
+	bpf_map_update_elem(map, key, name, BPF_ANY);
+}
+
+/* Record a start time for @key when the event passes the filters. */
+static __always_inline int update_timestart(void *map, struct work_key *key)
+{
+	if (trace_event_match(key, NULL))
+		do_update_timestart(map, key);
+
+	return 0;
+}
+
+/*
+ * Record a start time and the work name for @key when the event passes the
+ * filters, including the name filter on @name.
+ */
+static __always_inline int update_timestart_and_name(void *time_map,
+						     void *names_map,
+						     struct work_key *key,
+						     char *name)
+{
+	if (trace_event_match(key, name)) {
+		do_update_timestart(time_map, key);
+		do_update_name(names_map, key, name);
+	}
+
+	return 0;
+}
+
+/* Close the interval for @key when the event passes the filters. */
+static __always_inline int update_timeend(void *report_map,
+					  void *time_map, struct work_key *key)
+{
+	if (trace_event_match(key, NULL))
+		do_update_timeend(report_map, time_map, key);
+
+	return 0;
+}
+
+/*
+ * Close the interval for @key and record the work name when the event
+ * passes the filters, including the name filter on @name.
+ */
+static __always_inline int update_timeend_and_name(void *report_map,
+						   void *time_map,
+						   void *names_map,
+						   struct work_key *key,
+						   char *name)
+{
+	if (trace_event_match(key, name)) {
+		do_update_timeend(report_map, time_map, key);
+		do_update_name(names_map, key, name);
+	}
+
+	return 0;
+}
+
+/*
+ * IRQ handler entry: record the start time and the handler name for the
+ * (IRQ class, cpu, irq number) key.
+ */
+SEC("tracepoint/irq/irq_handler_entry")
+int report_irq_handler_entry(struct trace_event_raw_irq_handler_entry *ctx)
+{
+	char name[MAX_KWORKNAME];
+	struct work_key key = {
+		.type = KWORK_CLASS_IRQ,
+		.cpu = bpf_get_smp_processor_id(),
+		.id = (__u64)ctx->irq,
+	};
+	/* the low 16 bits of __data_loc_name are the name's offset in ctx */
+	void *name_addr = (void *)ctx + (ctx->__data_loc_name & 0xffff);
+
+	/*
+	 * Zero the buffer first, as the workqueue handlers do: all
+	 * MAX_KWORKNAME bytes are later copied into the names map, and the
+	 * string read below may fill only part of them.
+	 */
+	__builtin_memset(name, 0, sizeof(name));
+	bpf_probe_read_kernel_str(name, sizeof(name), name_addr);
+
+	return update_timestart_and_name(&perf_kwork_time,
+					 &perf_kwork_names, &key, name);
+}
+
+/* IRQ handler exit: close the interval opened at entry for this irq. */
+SEC("tracepoint/irq/irq_handler_exit")
+int report_irq_handler_exit(struct trace_event_raw_irq_handler_exit *ctx)
+{
+	struct work_key key;
+
+	key.type = KWORK_CLASS_IRQ;
+	key.cpu = bpf_get_smp_processor_id();
+	key.id = (__u64)ctx->irq;
+
+	return update_timeend(&perf_kwork_report, &perf_kwork_time, &key);
+}
+
+/*
+ * Softirq names, indexed by the tracepoint's vec value.
+ * NOTE(review): the order presumably mirrors the kernel's softirq vector
+ * numbering - confirm against the kernel headers.
+ */
+static char softirq_name_list[NR_SOFTIRQS][MAX_KWORKNAME] = {
+ { "HI" },
+ { "TIMER" },
+ { "NET_TX" },
+ { "NET_RX" },
+ { "BLOCK" },
+ { "IRQ_POLL" },
+ { "TASKLET" },
+ { "SCHED" },
+ { "HRTIMER" },
+ { "RCU" },
+};
+
+/* Softirq entry (report mode): record start time and name for the vector. */
+SEC("tracepoint/irq/softirq_entry")
+int report_softirq_entry(struct trace_event_raw_softirq *ctx)
+{
+	unsigned int vec = ctx->vec;
+	struct work_key key = {
+		.type = KWORK_CLASS_SOFTIRQ,
+		.cpu = bpf_get_smp_processor_id(),
+		.id = (__u64)vec,
+	};
+
+	/* Vectors outside the name table are ignored. */
+	if (vec >= NR_SOFTIRQS)
+		return 0;
+
+	return update_timestart_and_name(&perf_kwork_time,
+					 &perf_kwork_names, &key,
+					 softirq_name_list[vec]);
+}
+
+/* Softirq exit (report mode): close the interval opened at softirq entry. */
+SEC("tracepoint/irq/softirq_exit")
+int report_softirq_exit(struct trace_event_raw_softirq *ctx)
+{
+	struct work_key key;
+
+	key.type = KWORK_CLASS_SOFTIRQ;
+	key.cpu = bpf_get_smp_processor_id();
+	key.id = (__u64)ctx->vec;
+
+	return update_timeend(&perf_kwork_report, &perf_kwork_time, &key);
+}
+
+/* Softirq raise (latency mode): record start time and name for the vector. */
+SEC("tracepoint/irq/softirq_raise")
+int latency_softirq_raise(struct trace_event_raw_softirq *ctx)
+{
+	unsigned int vec = ctx->vec;
+	struct work_key key = {
+		.type = KWORK_CLASS_SOFTIRQ,
+		.cpu = bpf_get_smp_processor_id(),
+		.id = (__u64)vec,
+	};
+
+	/* Vectors outside the name table are ignored. */
+	if (vec >= NR_SOFTIRQS)
+		return 0;
+
+	return update_timestart_and_name(&perf_kwork_time,
+					 &perf_kwork_names, &key,
+					 softirq_name_list[vec]);
+}
+
+/* Softirq entry (latency mode): close the interval opened at softirq raise. */
+SEC("tracepoint/irq/softirq_entry")
+int latency_softirq_entry(struct trace_event_raw_softirq *ctx)
+{
+	struct work_key key;
+
+	key.type = KWORK_CLASS_SOFTIRQ;
+	key.cpu = bpf_get_smp_processor_id();
+	key.id = (__u64)ctx->vec;
+
+	return update_timeend(&perf_kwork_report, &perf_kwork_time, &key);
+}
+
+/* Work execution start (report mode): record the start time for the work. */
+SEC("tracepoint/workqueue/workqueue_execute_start")
+int report_workqueue_execute_start(struct trace_event_raw_workqueue_execute_start *ctx)
+{
+	struct work_key key;
+
+	key.type = KWORK_CLASS_WORKQUEUE;
+	key.cpu = bpf_get_smp_processor_id();
+	key.id = (__u64)ctx->work;
+
+	return update_timestart(&perf_kwork_time, &key);
+}
+
+/*
+ * Work execution end (report mode): close the interval for the work and
+ * record its name, formatted from the work function address with "%ps".
+ */
+SEC("tracepoint/workqueue/workqueue_execute_end")
+int report_workqueue_execute_end(struct trace_event_raw_workqueue_execute_end *ctx)
+{
+	unsigned long long fn = (unsigned long long)ctx->function;
+	char name[MAX_KWORKNAME];
+	struct work_key key = {
+		.type = KWORK_CLASS_WORKQUEUE,
+		.cpu = bpf_get_smp_processor_id(),
+		.id = (__u64)ctx->work,
+	};
+
+	/* The buffer is cleared before the name is formatted into it. */
+	__builtin_memset(name, 0, sizeof(name));
+	bpf_snprintf(name, sizeof(name), "%ps", &fn, sizeof(fn));
+
+	return update_timeend_and_name(&perf_kwork_report, &perf_kwork_time,
+				       &perf_kwork_names, &key, name);
+}
+
+/* Work activation (latency mode): record the start time for the work. */
+SEC("tracepoint/workqueue/workqueue_activate_work")
+int latency_workqueue_activate_work(struct trace_event_raw_workqueue_activate_work *ctx)
+{
+	struct work_key key;
+
+	key.type = KWORK_CLASS_WORKQUEUE;
+	key.cpu = bpf_get_smp_processor_id();
+	key.id = (__u64)ctx->work;
+
+	return update_timestart(&perf_kwork_time, &key);
+}
+
+/*
+ * Work execution start (latency mode): close the interval opened at
+ * activation and record the work name, formatted from the work function
+ * address with "%ps".
+ */
+SEC("tracepoint/workqueue/workqueue_execute_start")
+int latency_workqueue_execute_start(struct trace_event_raw_workqueue_execute_start *ctx)
+{
+	unsigned long long fn = (unsigned long long)ctx->function;
+	char name[MAX_KWORKNAME];
+	struct work_key key = {
+		.type = KWORK_CLASS_WORKQUEUE,
+		.cpu = bpf_get_smp_processor_id(),
+		.id = (__u64)ctx->work,
+	};
+
+	/* The buffer is cleared before the name is formatted into it. */
+	__builtin_memset(name, 0, sizeof(name));
+	bpf_snprintf(name, sizeof(name), "%ps", &fn, sizeof(fn));
+
+	return update_timeend_and_name(&perf_kwork_report, &perf_kwork_time,
+				       &perf_kwork_names, &key, name);
+}
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
new file mode 100644
index 000000000000..96e7d853b9ed
--- /dev/null
+++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
@@ -0,0 +1,989 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (c) 2022 Google
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+#include <asm-generic/errno-base.h>
+
+#include "lock_data.h"
+
+/* for collect_lock_syms(). 4096 was rejected by the verifier */
+#define MAX_CPUS 1024
+
+/* for collect_zone_lock(). It should be more than the actual zones. */
+#define MAX_ZONES 10
+
+/* for do_lock_delay(). Arbitrarily set to 1 million. */
+#define MAX_LOOP (1U << 20)
+
+/* lock contention flags from include/trace/events/lock.h */
+#define LCB_F_SPIN (1U << 0)
+#define LCB_F_READ (1U << 1)
+#define LCB_F_WRITE (1U << 2)
+#define LCB_F_RT (1U << 3)
+#define LCB_F_PERCPU (1U << 4)
+#define LCB_F_MUTEX (1U << 5)
+
+/* callstack storage */
+struct {
+ __uint(type, BPF_MAP_TYPE_STACK_TRACE);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(__u64));
+ __uint(max_entries, MAX_ENTRIES);
+} stacks SEC(".maps");
+
+/* buffer for owner stacktrace */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(__u64));
+ __uint(max_entries, 1);
+} stack_buf SEC(".maps");
+
+/* a map for tracing owner stacktrace to owner stack id */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(key_size, sizeof(__u64)); // owner stacktrace
+ __uint(value_size, sizeof(__s32)); // owner stack id
+ __uint(max_entries, 1);
+} owner_stacks SEC(".maps");
+
+/* a map for tracing lock address to owner data */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(key_size, sizeof(__u64)); // lock address
+ __uint(value_size, sizeof(struct owner_tracing_data));
+ __uint(max_entries, 1);
+} owner_data SEC(".maps");
+
+/* a map for contention_key (stores owner stack id) to contention data */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(key_size, sizeof(struct contention_key));
+ __uint(value_size, sizeof(struct contention_data));
+ __uint(max_entries, 1);
+} owner_stat SEC(".maps");
+
+/* maintain timestamp at the beginning of contention */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, int);
+ __type(value, struct tstamp_data);
+ __uint(max_entries, MAX_ENTRIES);
+} tstamp SEC(".maps");
+
+/* maintain per-CPU timestamp at the beginning of contention */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(struct tstamp_data));
+ __uint(max_entries, 1);
+} tstamp_cpu SEC(".maps");
+
+/* actual lock contention statistics */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(key_size, sizeof(struct contention_key));
+ __uint(value_size, sizeof(struct contention_data));
+ __uint(max_entries, MAX_ENTRIES);
+} lock_stat SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(struct contention_task_data));
+ __uint(max_entries, MAX_ENTRIES);
+} task_data SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(key_size, sizeof(__u64));
+ __uint(value_size, sizeof(__u32));
+ __uint(max_entries, MAX_ENTRIES);
+} lock_syms SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(__u8));
+ __uint(max_entries, 1);
+} cpu_filter SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(__u8));
+ __uint(max_entries, 1);
+} task_filter SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(__u8));
+ __uint(max_entries, 1);
+} type_filter SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(key_size, sizeof(__u64));
+ __uint(value_size, sizeof(__u8));
+ __uint(max_entries, 1);
+} addr_filter SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(key_size, sizeof(__u64));
+ __uint(value_size, sizeof(__u8));
+ __uint(max_entries, 1);
+} cgroup_filter SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(key_size, sizeof(long));
+ __uint(value_size, sizeof(__u8));
+ __uint(max_entries, 1);
+} slab_filter SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(key_size, sizeof(long));
+ __uint(value_size, sizeof(struct slab_cache_data));
+ __uint(max_entries, 1);
+} slab_caches SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(key_size, sizeof(__u64));
+ __uint(value_size, sizeof(__u64));
+ __uint(max_entries, 1);
+} lock_delays SEC(".maps");
+
+struct rw_semaphore___old {
+ struct task_struct *owner;
+} __attribute__((preserve_access_index));
+
+struct rw_semaphore___new {
+ atomic_long_t owner;
+} __attribute__((preserve_access_index));
+
+struct mm_struct___old {
+ struct rw_semaphore mmap_sem;
+} __attribute__((preserve_access_index));
+
+struct mm_struct___new {
+ struct rw_semaphore mmap_lock;
+} __attribute__((preserve_access_index));
+
+extern struct kmem_cache *bpf_get_kmem_cache(u64 addr) __ksym __weak;
+
+/* control flags */
+const volatile int has_cpu;
+const volatile int has_task;
+const volatile int has_type;
+const volatile int has_addr;
+const volatile int has_cgroup;
+const volatile int has_slab;
+const volatile int needs_callstack;
+const volatile int stack_skip;
+const volatile int lock_owner;
+const volatile int use_cgroup_v2;
+const volatile int max_stack;
+const volatile int lock_delay;
+
+/* determine the key of lock stat */
+const volatile int aggr_mode;
+
+int enabled;
+
+int perf_subsys_id = -1;
+
+__u64 end_ts;
+
+__u32 slab_cache_id;
+
+/* error stat */
+int task_fail;
+int stack_fail;
+int time_fail;
+int data_fail;
+
+int task_map_full;
+int data_map_full;
+
+struct task_struct *bpf_task_from_pid(s32 pid) __ksym __weak;
+void bpf_task_release(struct task_struct *p) __ksym __weak;
+
+/*
+ * Return the cgroup id of the current task: via the helper when
+ * use_cgroup_v2 is set, otherwise by reading the id of the cgroup attached
+ * to the task's perf_event subsystem state.
+ */
+static inline __u64 get_current_cgroup_id(void)
+{
+ struct task_struct *task;
+ struct cgroup *cgrp;
+
+ if (use_cgroup_v2)
+ return bpf_get_current_cgroup_id();
+
+ task = bpf_get_current_task_btf();
+
+ /* resolve the perf_event subsystem index once and cache it */
+ if (perf_subsys_id == -1) {
+ /*
+ * NOTE(review): __has_builtin is used here before the file's
+ * "#ifndef __has_builtin" fallback, which is defined further down;
+ * a compiler without __has_builtin would fail on this line.
+ */
+#if __has_builtin(__builtin_preserve_enum_value)
+ perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
+ perf_event_cgrp_id);
+#else
+ perf_subsys_id = perf_event_cgrp_id;
+#endif
+ }
+
+ cgrp = BPF_CORE_READ(task, cgroups, subsys[perf_subsys_id], cgroup);
+ return BPF_CORE_READ(cgrp, kn, id);
+}
+
+/*
+ * Apply every enabled filter to the current event. @ctx[0] is used as the
+ * lock address and @ctx[1] as the lock flags. Returns 1 when the event
+ * should be recorded, 0 when any filter rejects it.
+ */
+static inline int can_record(u64 *ctx)
+{
+ if (has_cpu) {
+ __u32 cpu = bpf_get_smp_processor_id();
+ __u8 *ok;
+
+ ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
+ if (!ok)
+ return 0;
+ }
+
+ if (has_task) {
+ __u8 *ok;
+ /* truncation keeps the low 32 bits of pid_tgid */
+ __u32 pid = bpf_get_current_pid_tgid();
+
+ ok = bpf_map_lookup_elem(&task_filter, &pid);
+ if (!ok)
+ return 0;
+ }
+
+ if (has_type) {
+ __u8 *ok;
+ __u32 flags = (__u32)ctx[1];
+
+ ok = bpf_map_lookup_elem(&type_filter, &flags);
+ if (!ok)
+ return 0;
+ }
+
+ if (has_addr) {
+ __u8 *ok;
+ __u64 addr = ctx[0];
+
+ /*
+ * An address miss only rejects the event when no slab filter is
+ * set; otherwise the decision is left to the slab check below.
+ * NOTE(review): an address hit does not skip the slab check, so a
+ * matching address can still be rejected there - confirm intended.
+ */
+ ok = bpf_map_lookup_elem(&addr_filter, &addr);
+ if (!ok && !has_slab)
+ return 0;
+ }
+
+ if (has_cgroup) {
+ __u8 *ok;
+ __u64 cgrp = get_current_cgroup_id();
+
+ ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp);
+ if (!ok)
+ return 0;
+ }
+
+ /* the slab filter is skipped when the weak bpf_get_kmem_cache is absent */
+ if (has_slab && bpf_get_kmem_cache) {
+ __u8 *ok;
+ __u64 addr = ctx[0];
+ long kmem_cache_addr;
+
+ kmem_cache_addr = (long)bpf_get_kmem_cache(addr);
+ ok = bpf_map_lookup_elem(&slab_filter, &kmem_cache_addr);
+ if (!ok)
+ return 0;
+ }
+
+ return 1;
+}
+/*
+ * Make sure task_data has an entry (the task's comm) for @task's pid.
+ *
+ * Returns -1 when the pid cannot be read from @task and 0 otherwise; a
+ * failed insert still returns 0. Once an insert has failed with -E2BIG,
+ * task_map_full is set and no further inserts are attempted.
+ */
+static inline int update_task_data(struct task_struct *task)
+{
+ struct contention_task_data *p;
+ int pid, err;
+
+ err = bpf_core_read(&pid, sizeof(pid), &task->pid);
+ if (err)
+ return -1;
+
+ p = bpf_map_lookup_elem(&task_data, &pid);
+ if (p == NULL && !task_map_full) {
+ struct contention_task_data data = {};
+
+ BPF_CORE_READ_STR_INTO(&data.comm, task, comm);
+ if (bpf_map_update_elem(&task_data, &pid, &data, BPF_NOEXIST) == -E2BIG)
+ task_map_full = 1;
+ }
+
+ return 0;
+}
+
+#ifndef __has_builtin
+# define __has_builtin(x) 0
+#endif
+/*
+ * Read the owner task pointer stored in the lock at address @lock.
+ *
+ * Handled lock types: mutex (LCB_F_MUTEX bit set in @flags) and rwsem
+ * (@flags equal to LCB_F_READ or LCB_F_WRITE). For anything else, or when
+ * the owner word reads as 0, NULL is returned. The low three bits of the
+ * owner word are masked off before it is used as a task pointer.
+ */
+static inline struct task_struct *get_lock_owner(__u64 lock, __u32 flags)
+{
+ struct task_struct *task;
+ __u64 owner = 0;
+
+ if (flags & LCB_F_MUTEX) {
+ struct mutex *mutex = (void *)lock;
+ owner = BPF_CORE_READ(mutex, owner.counter);
+ } else if (flags == LCB_F_READ || flags == LCB_F_WRITE) {
+ /*
+ * Support for the BPF_TYPE_MATCHES argument to the
+ * __builtin_preserve_type_info builtin was added at some point during
+ * development of clang 15 and it's what is needed for
+ * bpf_core_type_matches.
+ */
+#if __has_builtin(__builtin_preserve_type_info) && __clang_major__ >= 15
+ if (bpf_core_type_matches(struct rw_semaphore___old)) {
+ struct rw_semaphore___old *rwsem = (void *)lock;
+ owner = (unsigned long)BPF_CORE_READ(rwsem, owner);
+ } else if (bpf_core_type_matches(struct rw_semaphore___new)) {
+ struct rw_semaphore___new *rwsem = (void *)lock;
+ owner = BPF_CORE_READ(rwsem, owner.counter);
+ }
+#else
+ /* assume new struct */
+ struct rw_semaphore *rwsem = (void *)lock;
+ owner = BPF_CORE_READ(rwsem, owner.counter);
+#endif
+ }
+
+ if (!owner)
+ return NULL;
+
+ task = (void *)(owner & ~7UL); /* presumably strips flag bits kept in the owner word - TODO confirm */
+ return task;
+}
+/*
+ * Recognize well-known locks that belong to the current task.
+ *
+ * For LCB_F_READ/LCB_F_WRITE, returns LCD_F_MMAP_LOCK when @lock is the
+ * address of current->mm's mmap_lock (or mmap_sem when mmap_lock does not
+ * exist). For LCB_F_SPIN, returns LCD_F_SIGHAND_LOCK when @lock is the
+ * address of current->sighand->siglock. Returns 0 in every other case.
+ */
+static inline __u32 check_lock_type(__u64 lock, __u32 flags)
+{
+ struct task_struct *curr;
+ struct mm_struct___old *mm_old;
+ struct mm_struct___new *mm_new;
+ struct sighand_struct *sighand;
+
+ switch (flags) {
+ case LCB_F_READ: /* rwsem */
+ case LCB_F_WRITE:
+ curr = bpf_get_current_task_btf();
+ if (curr->mm == NULL)
+ break;
+ mm_new = (void *)curr->mm;
+ if (bpf_core_field_exists(mm_new->mmap_lock)) {
+ if (&mm_new->mmap_lock == (void *)lock)
+ return LCD_F_MMAP_LOCK;
+ break;
+ }
+ mm_old = (void *)curr->mm; /* same pointer viewed through the old layout */
+ if (bpf_core_field_exists(mm_old->mmap_sem)) {
+ if (&mm_old->mmap_sem == (void *)lock)
+ return LCD_F_MMAP_LOCK;
+ }
+ break;
+ case LCB_F_SPIN: /* spinlock */
+ curr = bpf_get_current_task_btf();
+ sighand = curr->sighand;
+
+ if (sighand && &sighand->siglock == (void *)lock)
+ return LCD_F_SIGHAND_LOCK;
+ break;
+ default:
+ break;
+ }
+ return 0;
+}
+
+/*
+ * bpf_loop() callback used by do_lock_delay().
+ * @arg points to the deadline, in bpf_ktime_get_ns() units.
+ * Returns 1 once the deadline has been reached and 0 before that.
+ */
+static inline long delay_callback(__u64 idx, void *arg)
+{
+	const __u64 deadline = *(__u64 *)arg;
+
+	if (bpf_ktime_get_ns() < deadline) {
+		/* just to kill time */
+		(void)bpf_get_prandom_u32();
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Run delay_callback() through bpf_loop() (at most MAX_LOOP iterations)
+ * with a deadline of now + @duration.
+ */
+static inline void do_lock_delay(__u64 duration)
+{
+	__u64 deadline;
+
+	deadline = bpf_ktime_get_ns() + duration;
+	bpf_loop(MAX_LOOP, delay_callback, &deadline, /*flags=*/0);
+}
+
+/*
+ * If lock_delays has an entry for @lock, delay for the stored duration.
+ */
+static inline void check_lock_delay(__u64 lock)
+{
+	__u64 *duration = bpf_map_lookup_elem(&lock_delays, &lock);
+
+	if (duration == NULL)
+		return;
+
+	do_lock_delay(*duration);
+}
+/*
+ * Return the tstamp_data slot in which a new contention should be recorded.
+ *
+ * When @flags has LCB_F_SPIN set and LCB_F_MUTEX clear, slot 0 of tstamp_cpu
+ * is used. Otherwise the slot lives in the tstamp map keyed by the low 32
+ * bits of bpf_get_current_pid_tgid() and is created on first use.
+ * Returns NULL when the slot already tracks a lock (nested contention) or
+ * when the slot cannot be created or found (task_fail is incremented).
+ */
+static inline struct tstamp_data *get_tstamp_elem(__u32 flags)
+{
+ __u32 pid;
+ struct tstamp_data *pelem;
+
+ /* Use per-cpu array map for spinlock and rwlock */
+ if ((flags & (LCB_F_SPIN | LCB_F_MUTEX)) == LCB_F_SPIN) {
+ __u32 idx = 0;
+
+ pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx);
+ /* Do not update the element for nested locks */
+ if (pelem && pelem->lock)
+ pelem = NULL;
+ return pelem;
+ }
+
+ pid = bpf_get_current_pid_tgid();
+ pelem = bpf_map_lookup_elem(&tstamp, &pid);
+ /* Do not update the element for nested locks */
+ if (pelem && pelem->lock)
+ return NULL;
+
+ if (pelem == NULL) {
+ struct tstamp_data zero = {};
+
+ if (bpf_map_update_elem(&tstamp, &pid, &zero, BPF_NOEXIST) < 0) {
+ __sync_fetch_and_add(&task_fail, 1);
+ return NULL;
+ }
+
+ pelem = bpf_map_lookup_elem(&tstamp, &pid); /* fetch a pointer to the entry just inserted */
+ if (pelem == NULL) {
+ __sync_fetch_and_add(&task_fail, 1);
+ return NULL;
+ }
+ }
+ return pelem;
+}
+/*
+ * Translate a stack trace into an integer ID using the owner_stacks map.
+ *
+ * A stack seen for the first time gets the next value of id_gen. Returns
+ * the ID stored in the map, or -1 when no entry can be found even after
+ * the insert attempt.
+ */
+static inline s32 get_owner_stack_id(u64 *stacktrace)
+{
+ s32 *id, new_id;
+ static s64 id_gen = 1;
+
+ id = bpf_map_lookup_elem(&owner_stacks, stacktrace);
+ if (id)
+ return *id;
+
+ new_id = (s32)__sync_fetch_and_add(&id_gen, 1);
+
+ bpf_map_update_elem(&owner_stacks, stacktrace, &new_id, BPF_NOEXIST); /* result unchecked; the lookup below decides */
+
+ id = bpf_map_lookup_elem(&owner_stacks, stacktrace);
+ if (id)
+ return *id;
+
+ return -1;
+}
+/*
+ * Fold one measurement into an existing contention_data entry: add
+ * @duration to total_time and @count to count, and widen min_time/max_time.
+ * Only the two additions use atomic operations.
+ */
+static inline void update_contention_data(struct contention_data *data, u64 duration, u32 count)
+{
+ __sync_fetch_and_add(&data->total_time, duration);
+ __sync_fetch_and_add(&data->count, count);
+
+ /* FIXME: need atomic operations */
+ if (data->max_time < duration)
+ data->max_time = duration;
+ if (data->min_time > duration)
+ data->min_time = duration;
+}
+
+/*
+ * Account @duration to the owner stack @id in the owner_stat map.
+ * An existing entry is updated in place; otherwise a new entry holding this
+ * single sample (with @flags) is inserted.
+ */
+static inline void update_owner_stat(u32 id, u64 duration, u32 flags)
+{
+	struct contention_data *stat;
+	struct contention_key key = {
+		.stack_id = id,
+		.pid = 0,
+		.lock_addr_or_cgroup = 0,
+	};
+
+	stat = bpf_map_lookup_elem(&owner_stat, &key);
+	if (stat) {
+		update_contention_data(stat, duration, 1);
+		return;
+	}
+
+	struct contention_data initial = {
+		.total_time = duration,
+		.max_time = duration,
+		.min_time = duration,
+		.count = 1,
+		.flags = flags,
+	};
+
+	bpf_map_update_elem(&owner_stat, &key, &initial, BPF_NOEXIST);
+}
+/*
+ * Handler for the contention_begin tracepoint.
+ *
+ * ctx[0] is used as the lock address and ctx[1] as the lock flags. When the
+ * event passes can_record(), the start time, lock and flags are saved in the
+ * slot returned by get_tstamp_elem(). With needs_callstack set it also
+ * stores the waiter's stack id and, if lock_owner is set, tracks the
+ * owner's stack in owner_data. In LOCK_AGGR_TASK mode it saves the task
+ * comm through update_task_data() instead. Always returns 0.
+ */
+SEC("tp_btf/contention_begin")
+int contention_begin(u64 *ctx)
+{
+ struct tstamp_data *pelem;
+
+ if (!enabled || !can_record(ctx))
+ return 0;
+
+ pelem = get_tstamp_elem(ctx[1]);
+ if (pelem == NULL)
+ return 0;
+
+ pelem->timestamp = bpf_ktime_get_ns();
+ pelem->lock = (__u64)ctx[0];
+ pelem->flags = (__u32)ctx[1];
+
+ if (needs_callstack) {
+ u32 i = 0;
+ u32 id = 0;
+ int owner_pid;
+ u64 *buf;
+ struct task_struct *task;
+ struct owner_tracing_data *otdata;
+
+ if (!lock_owner)
+ goto skip_owner;
+
+ task = get_lock_owner(pelem->lock, pelem->flags);
+ if (!task)
+ goto skip_owner;
+
+ owner_pid = BPF_CORE_READ(task, pid);
+
+ buf = bpf_map_lookup_elem(&stack_buf, &i);
+ if (!buf)
+ goto skip_owner;
+ for (i = 0; i < max_stack; i++) /* clear the scratch buffer before reuse */
+ buf[i] = 0x0;
+
+ if (!bpf_task_from_pid) /* weak kfunc: NULL when not available */
+ goto skip_owner;
+
+ task = bpf_task_from_pid(owner_pid); /* paired with bpf_task_release() below */
+ if (!task)
+ goto skip_owner;
+
+ bpf_get_task_stack(task, buf, max_stack * sizeof(unsigned long), 0);
+ bpf_task_release(task);
+
+ otdata = bpf_map_lookup_elem(&owner_data, &pelem->lock);
+ id = get_owner_stack_id(buf);
+
+ /*
+ * Contention just happens, or corner case `lock` is owned by process not
+ * `owner_pid`. For the corner case we treat it as unexpected internal error and
+ * just ignore the previous tracing record.
+ */
+ if (!otdata || otdata->pid != owner_pid) {
+ struct owner_tracing_data first = {
+ .pid = owner_pid,
+ .timestamp = pelem->timestamp,
+ .count = 1,
+ .stack_id = id,
+ };
+ bpf_map_update_elem(&owner_data, &pelem->lock, &first, BPF_ANY);
+ }
+ /* Contention is ongoing and new waiter joins */
+ else {
+ __sync_fetch_and_add(&otdata->count, 1);
+
+ /*
+ * The owner is the same, but stacktrace might be changed. In this case we
+ * store/update `owner_stat` based on current owner stack id.
+ */
+ if (id != otdata->stack_id) {
+ update_owner_stat(id, pelem->timestamp - otdata->timestamp,
+ pelem->flags);
+
+ otdata->timestamp = pelem->timestamp;
+ otdata->stack_id = id;
+ }
+ }
+skip_owner:
+ pelem->stack_id = bpf_get_stackid(ctx, &stacks,
+ BPF_F_FAST_STACK_CMP | stack_skip);
+ if (pelem->stack_id < 0)
+ __sync_fetch_and_add(&stack_fail, 1);
+ } else if (aggr_mode == LOCK_AGGR_TASK) {
+ struct task_struct *task;
+
+ if (lock_owner) {
+ task = get_lock_owner(pelem->lock, pelem->flags);
+
+ /* The flags is not used anymore. Pass the owner pid. */
+ if (task)
+ pelem->flags = BPF_CORE_READ(task, pid);
+ else
+ pelem->flags = -1U;
+
+ } else {
+ task = bpf_get_current_task_btf();
+ }
+
+ if (task) {
+ if (update_task_data(task) < 0 && lock_owner) /* owner pid could not be read */
+ pelem->flags = -1U;
+ }
+ }
+
+ return 0;
+}
+/*
+ * Handler for the contention_end tracepoint.
+ *
+ * ctx[0] is used as the lock address and ctx[1] as the return code of the
+ * lock function. It finds the slot written by contention_begin(), computes
+ * the wait duration, updates owner_stat/owner_data when owner tracking is
+ * enabled, and accumulates the duration in lock_stat under a key selected
+ * by aggr_mode. The slot is released at the 'out' label. Always returns 0.
+ */
+SEC("tp_btf/contention_end")
+int contention_end(u64 *ctx)
+{
+ __u32 pid = 0, idx = 0;
+ struct tstamp_data *pelem;
+ struct contention_key key = {};
+ struct contention_data *data;
+ __u64 timestamp;
+ __u64 duration;
+ bool need_delete = false; /* true when pelem comes from the tstamp hash map */
+
+ if (!enabled)
+ return 0;
+
+ /*
+ * For spinlock and rwlock, it needs to get the timestamp for the
+ * per-cpu map. However, contention_end does not have the flags
+ * so it cannot know whether it reads percpu or hash map.
+ *
+ * Try per-cpu map first and check if there's active contention.
+ * If it is, do not read hash map because it cannot go to sleeping
+ * locks before releasing the spinning locks.
+ */
+ pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx);
+ if (pelem && pelem->lock) {
+ if (pelem->lock != ctx[0])
+ return 0;
+ } else {
+ pid = bpf_get_current_pid_tgid();
+ pelem = bpf_map_lookup_elem(&tstamp, &pid);
+ if (!pelem || pelem->lock != ctx[0])
+ return 0;
+ need_delete = true;
+ }
+
+ timestamp = bpf_ktime_get_ns();
+ duration = timestamp - pelem->timestamp;
+ if ((__s64)duration < 0) {
+ __sync_fetch_and_add(&time_fail, 1);
+ goto out;
+ }
+
+ if (needs_callstack && lock_owner) {
+ struct owner_tracing_data *otdata = bpf_map_lookup_elem(&owner_data, &pelem->lock);
+
+ if (!otdata)
+ goto skip_owner;
+
+ /* Update `owner_stat` */
+ update_owner_stat(otdata->stack_id, timestamp - otdata->timestamp, pelem->flags);
+
+ /* No contention is occurring, delete `lock` entry in `owner_data` */
+ if (otdata->count <= 1)
+ bpf_map_delete_elem(&owner_data, &pelem->lock);
+ /*
+ * Contention is still ongoing, with a new owner (current task). `owner_data`
+ * should be updated accordingly.
+ */
+ else {
+ u32 i = 0;
+ s32 ret = (s32)ctx[1];
+ u64 *buf;
+
+ otdata->timestamp = timestamp;
+ __sync_fetch_and_add(&otdata->count, -1);
+
+ buf = bpf_map_lookup_elem(&stack_buf, &i);
+ if (!buf)
+ goto skip_owner;
+ for (i = 0; i < (u32)max_stack; i++)
+ buf[i] = 0x0;
+
+ /*
+ * `ret` has the return code of the lock function.
+ * If `ret` is negative, the current task terminates lock waiting without
+ * acquiring it. Owner is not changed, but we still need to update the owner
+ * stack.
+ */
+ if (ret < 0) {
+ s32 id = 0;
+ struct task_struct *task;
+
+ if (!bpf_task_from_pid)
+ goto skip_owner;
+
+ task = bpf_task_from_pid(otdata->pid);
+ if (!task)
+ goto skip_owner;
+
+ bpf_get_task_stack(task, buf,
+ max_stack * sizeof(unsigned long), 0);
+ bpf_task_release(task);
+
+ id = get_owner_stack_id(buf);
+
+ /*
+ * If owner stack is changed, update owner stack id for this lock.
+ */
+ if (id != otdata->stack_id)
+ otdata->stack_id = id;
+ }
+ /*
+ * Otherwise, update tracing data with the current task, which is the new
+ * owner.
+ */
+ else {
+ otdata->pid = pid; /* pid is still 0 if pelem came from tstamp_cpu */
+ /*
+ * We don't want to retrieve callstack here, since it is where the
+ * current task acquires the lock and provides no additional
+ * information. We simply assign -1 to invalidate it.
+ */
+ otdata->stack_id = -1;
+ }
+ }
+ }
+skip_owner:
+ switch (aggr_mode) {
+ case LOCK_AGGR_CALLER:
+ key.stack_id = pelem->stack_id;
+ break;
+ case LOCK_AGGR_TASK:
+ if (lock_owner)
+ key.pid = pelem->flags; /* contention_begin() stored the owner pid here */
+ else {
+ if (!need_delete)
+ pid = bpf_get_current_pid_tgid();
+ key.pid = pid;
+ }
+ if (needs_callstack)
+ key.stack_id = pelem->stack_id;
+ break;
+ case LOCK_AGGR_ADDR:
+ key.lock_addr_or_cgroup = pelem->lock;
+ if (needs_callstack)
+ key.stack_id = pelem->stack_id;
+ break;
+ case LOCK_AGGR_CGROUP:
+ key.lock_addr_or_cgroup = get_current_cgroup_id();
+ break;
+ default:
+ /* should not happen */
+ return 0; /* NOTE(review): skips the cleanup at 'out', so pelem->lock stays set */
+ }
+
+ data = bpf_map_lookup_elem(&lock_stat, &key);
+ if (!data) {
+ if (data_map_full) {
+ __sync_fetch_and_add(&data_fail, 1);
+ goto out;
+ }
+
+ struct contention_data first = {
+ .total_time = duration,
+ .max_time = duration,
+ .min_time = duration,
+ .count = 1,
+ .flags = pelem->flags,
+ };
+ int err;
+
+ if (aggr_mode == LOCK_AGGR_ADDR) {
+ first.flags |= check_lock_type(pelem->lock,
+ pelem->flags & LCB_F_TYPE_MASK);
+
+ /* Check if it's from a slab object */
+ if (bpf_get_kmem_cache) {
+ struct kmem_cache *s;
+ struct slab_cache_data *d;
+
+ s = bpf_get_kmem_cache(pelem->lock);
+ if (s != NULL) {
+ /*
+ * Save the ID of the slab cache in the flags
+ * (instead of full address) to reduce the
+ * space in the contention_data.
+ */
+ d = bpf_map_lookup_elem(&slab_caches, &s);
+ if (d != NULL)
+ first.flags |= d->id;
+ }
+ }
+ }
+
+ err = bpf_map_update_elem(&lock_stat, &key, &first, BPF_NOEXIST);
+ if (err < 0) {
+ if (err == -EEXIST) {
+ /* it lost the race, try to get it again */
+ data = bpf_map_lookup_elem(&lock_stat, &key);
+ if (data != NULL)
+ goto found;
+ }
+ if (err == -E2BIG)
+ data_map_full = 1;
+ __sync_fetch_and_add(&data_fail, 1);
+ }
+ goto out; /* on success 'first' already holds this sample */
+ }
+
+found:
+ update_contention_data(data, duration, 1);
+
+out:
+ if (lock_delay)
+ check_lock_delay(pelem->lock);
+
+ pelem->lock = 0; /* marks the slot as free for the next contention */
+ if (need_delete)
+ bpf_map_delete_elem(&tstamp, &pid);
+ return 0;
+}
+
+extern struct rq runqueues __ksym; /* per-cpu symbol read with bpf_per_cpu_ptr() in collect_lock_syms() */
+
+const volatile __u64 contig_page_data_addr; /* address of a single pglist_data; 0 selects the node_data path */
+const volatile __u64 node_data_addr; /* address of an array of pglist_data pointers */
+const volatile int nr_nodes; /* number of node_data entries to walk */
+const volatile int sizeof_zone; /* stride between consecutive zones in node_zones */
+
+struct rq___old { /* CO-RE flavor: runqueue lock field named 'lock' */
+ raw_spinlock_t lock;
+} __attribute__((preserve_access_index));
+
+struct rq___new { /* CO-RE flavor: runqueue lock field named '__lock' */
+ raw_spinlock_t __lock;
+} __attribute__((preserve_access_index));
+/*
+ * Record the address of every zone lock in lock_syms with the class
+ * LOCK_CLASS_ZONE_LOCK.
+ *
+ * The single pglist_data at contig_page_data_addr is used when that address
+ * is non-zero; otherwise up to nr_nodes pointers are read from the array at
+ * node_data_addr. A zone lock address is computed as
+ * pgdat + offsetof(node_zones) + index * sizeof_zone + offsetof(lock).
+ */
+static void collect_zone_lock(void)
+{
+ __u64 nr_zones, zone_off;
+ __u64 lock_addr, lock_off;
+ __u32 lock_flag = LOCK_CLASS_ZONE_LOCK;
+
+ zone_off = offsetof(struct pglist_data, node_zones);
+ lock_off = offsetof(struct zone, lock);
+
+ if (contig_page_data_addr) {
+ struct pglist_data *contig_page_data;
+
+ contig_page_data = (void *)(long)contig_page_data_addr;
+ nr_zones = BPF_CORE_READ(contig_page_data, nr_zones);
+
+ for (int i = 0; i < MAX_ZONES; i++) { /* constant bound; real count checked below */
+ __u64 zone_addr;
+
+ if (i >= nr_zones)
+ break;
+
+ zone_addr = contig_page_data_addr + (sizeof_zone * i) + zone_off;
+ lock_addr = zone_addr + lock_off;
+
+ bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
+ }
+ } else if (nr_nodes > 0) {
+ struct pglist_data **node_data = (void *)(long)node_data_addr;
+
+ for (int i = 0; i < nr_nodes; i++) {
+ struct pglist_data *pgdat = NULL;
+ int err;
+
+ err = bpf_core_read(&pgdat, sizeof(pgdat), &node_data[i]);
+ if (err < 0 || pgdat == NULL) /* stops at the first unreadable or NULL entry */
+ break;
+
+ nr_zones = BPF_CORE_READ(pgdat, nr_zones);
+ for (int k = 0; k < MAX_ZONES; k++) {
+ __u64 zone_addr;
+
+ if (k >= nr_zones)
+ break;
+
+ zone_addr = (__u64)(void *)pgdat + (sizeof_zone * k) + zone_off;
+ lock_addr = zone_addr + lock_off;
+
+ bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
+ }
+ }
+ }
+}
+/*
+ * Record the address of each CPU's runqueue lock in lock_syms with the
+ * class LOCK_CLASS_RQLOCK, then add the zone locks via collect_zone_lock().
+ * The lock offset is taken from whichever rq flavor has the field. The CPU
+ * loop ends at MAX_CPUS or at the first CPU for which bpf_per_cpu_ptr()
+ * returns NULL.
+ */
+SEC("raw_tp/bpf_test_finish")
+int BPF_PROG(collect_lock_syms)
+{
+ __u64 lock_addr, lock_off;
+ __u32 lock_flag;
+
+ if (bpf_core_field_exists(struct rq___new, __lock))
+ lock_off = offsetof(struct rq___new, __lock);
+ else
+ lock_off = offsetof(struct rq___old, lock);
+
+ for (int i = 0; i < MAX_CPUS; i++) {
+ struct rq *rq = bpf_per_cpu_ptr(&runqueues, i);
+
+ if (rq == NULL)
+ break;
+
+ lock_addr = (__u64)(void *)rq + lock_off;
+ lock_flag = LOCK_CLASS_RQLOCK;
+ bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
+ }
+
+ collect_zone_lock();
+
+ return 0;
+}
+/* Store the current bpf_ktime_get_ns() value in end_ts. */
+SEC("raw_tp/bpf_test_finish")
+int BPF_PROG(end_timestamp)
+{
+ end_ts = bpf_ktime_get_ns();
+ return 0;
+}
+
+/*
+ * bpf_iter__kmem_cache was added recently, so old kernels don't have it in
+ * their vmlinux.h. But we cannot define it here since that would cause a
+ * compiler error due to redefinition of the struct on later kernels.
+ *
+ * So it uses a CO-RE trick to access the member only if it has the type.
+ * This will support both old and new kernels without compiler errors.
+ */
+struct bpf_iter__kmem_cache___new {
+ struct kmem_cache *s;
+} __attribute__((preserve_access_index));
+
+/*
+ * kmem_cache iterator program: give each slab cache an ID and remember its
+ * name in the slab_caches map, keyed by the kmem_cache pointer.
+ *
+ * The kmem_cache pointer is read from the iterator context only when the
+ * running kernel has struct bpf_iter__kmem_cache. The ID is the next value
+ * of slab_cache_id shifted by LCB_F_SLAB_ID_SHIFT; caches whose ID would
+ * reach LCB_F_SLAB_ID_END are not stored. Always returns 0.
+ */
+SEC("iter/kmem_cache")
+int slab_cache_iter(void *ctx)
+{
+	struct kmem_cache *s = NULL;
+	/*
+	 * Zero the whole value: bpf_probe_read_kernel_str() does not pad the
+	 * buffer after the terminating NUL, so without this the tail of
+	 * d.name written to the map would be uninitialized stack bytes.
+	 */
+	struct slab_cache_data d = {};
+	const char *nameptr;
+
+	if (bpf_core_type_exists(struct bpf_iter__kmem_cache)) {
+		struct bpf_iter__kmem_cache___new *iter = ctx;
+
+		s = iter->s;
+	}
+
+	if (s == NULL)
+		return 0;
+
+	nameptr = s->name;
+	bpf_probe_read_kernel_str(d.name, sizeof(d.name), nameptr);
+
+	d.id = ++slab_cache_id << LCB_F_SLAB_ID_SHIFT;
+	if (d.id >= LCB_F_SLAB_ID_END)
+		return 0;
+
+	bpf_map_update_elem(&slab_caches, &s, &d, BPF_NOEXIST);
+	return 0;
+}
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
diff --git a/tools/perf/util/bpf_skel/lock_data.h b/tools/perf/util/bpf_skel/lock_data.h
new file mode 100644
index 000000000000..28c5e5aced7f
--- /dev/null
+++ b/tools/perf/util/bpf_skel/lock_data.h
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+/* Data structures shared between BPF and tools. */
+#ifndef UTIL_BPF_SKEL_LOCK_DATA_H
+#define UTIL_BPF_SKEL_LOCK_DATA_H
+
+struct owner_tracing_data {
+ u32 pid; // Who has the lock.
+ u32 count; // How many waiters for this lock.
+ u64 timestamp; // Start of the interval not yet added to `owner_stat` (reset when the owner stack changes or a waiter finishes).
+ s32 stack_id; // Owner stack ID (a value stored in `owner_stacks`), used as the key for `owner_stat`; -1 when invalidated.
+};
+
+struct tstamp_data {
+ u64 timestamp; // bpf_ktime_get_ns() taken in contention_begin.
+ u64 lock; // Lock address; 0 means the slot is not in use.
+ u32 flags; // Tracepoint flags; replaced by the owner pid (or -1U) in LOCK_AGGR_TASK mode with lock_owner.
+ s32 stack_id; // Result of bpf_get_stackid(); negative on failure.
+};
+
+struct contention_key {
+ s32 stack_id; // Set for LOCK_AGGR_CALLER, or for TASK/ADDR when callstacks are needed; also the owner stack ID in `owner_stat`.
+ u32 pid; // Set for LOCK_AGGR_TASK only.
+ u64 lock_addr_or_cgroup; // Lock address for LOCK_AGGR_ADDR, cgroup ID for LOCK_AGGR_CGROUP.
+};
+
+#define TASK_COMM_LEN 16
+
+struct contention_task_data {
+ char comm[TASK_COMM_LEN]; // Copied from task->comm by update_task_data().
+};
+
+/* default buffer size */
+#define MAX_ENTRIES 16384
+
+/*
+ * Upper bits of the flags in the contention_data are used to identify
+ * some well-known locks which do not have symbols (non-global locks).
+ */
+#define LCD_F_MMAP_LOCK (1U << 31)
+#define LCD_F_SIGHAND_LOCK (1U << 30)
+
+#define LCB_F_SLAB_ID_SHIFT 16
+#define LCB_F_SLAB_ID_START (1U << 16)
+#define LCB_F_SLAB_ID_END (1U << 26)
+#define LCB_F_SLAB_ID_MASK 0x03FF0000U
+
+#define LCB_F_TYPE_MAX (1U << 7)
+#define LCB_F_TYPE_MASK 0x0000007FU
+
+#define SLAB_NAME_MAX 28
+
+struct contention_data {
+ u64 total_time; // Sum of all recorded durations.
+ u64 min_time; // Shortest recorded duration.
+ u64 max_time; // Longest recorded duration.
+ u32 count; // Number of recorded samples.
+ u32 flags; // Flags of the first sample, optionally OR-ed with LCD_F_* bits and a slab ID.
+};
+
+enum lock_aggr_mode {
+ LOCK_AGGR_ADDR = 0, // key holds the lock address
+ LOCK_AGGR_TASK, // key holds a pid
+ LOCK_AGGR_CALLER, // key holds the waiter's stack ID
+ LOCK_AGGR_CGROUP, // key holds the current cgroup ID
+};
+
+enum lock_class_sym {
+ LOCK_CLASS_NONE,
+ LOCK_CLASS_RQLOCK, // stored in lock_syms for runqueue locks by collect_lock_syms()
+ LOCK_CLASS_ZONE_LOCK, // stored in lock_syms for zone locks by collect_zone_lock()
+};
+
+struct slab_cache_data {
+ u32 id; // Slab ID already shifted by LCB_F_SLAB_ID_SHIFT; OR-ed into contention_data.flags.
+ char name[SLAB_NAME_MAX]; // Cache name copied from kmem_cache->name.
+};
+
+#endif /* UTIL_BPF_SKEL_LOCK_DATA_H */
diff --git a/tools/perf/util/bpf_skel/off_cpu.bpf.c b/tools/perf/util/bpf_skel/off_cpu.bpf.c
new file mode 100644
index 000000000000..72763bb8d1de
--- /dev/null
+++ b/tools/perf/util/bpf_skel/off_cpu.bpf.c
@@ -0,0 +1,372 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (c) 2022 Google
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+/* task->flags for off-cpu analysis */
+#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
+
+/* task->state for off-cpu analysis */
+#define TASK_INTERRUPTIBLE 0x0001
+#define TASK_UNINTERRUPTIBLE 0x0002
+
+/* create a new thread */
+#define CLONE_THREAD 0x10000
+
+#define MAX_STACKS 32
+#define MAX_ENTRIES 102400
+
+#define MAX_CPUS 4096
+#define MAX_OFFCPU_LEN 37
+
+// We have a 'struct stack' in vmlinux.h when building with GEN_VMLINUX_H=1
+struct __stack {
+ u64 array[MAX_STACKS];
+};
+
+struct tstamp_data {
+ __u32 stack_id; /* value returned by bpf_get_stackid() at switch-out */
+ __u32 state; /* task state passed to off_cpu_stat() at switch-out */
+ __u64 timestamp; /* bpf_ktime_get_ns() at switch-out; 0 means nothing pending */
+ struct __stack stack; /* user stack copied by bpf_get_stack() */
+};
+
+struct offcpu_key {
+ __u32 pid; /* next->pid */
+ __u32 tgid; /* next->tgid */
+ __u32 stack_id; /* stack id saved when the task was switched out */
+ __u32 state; /* task state saved when the task was switched out */
+ __u64 cgroup_id; /* get_cgroup_id(next) when needs_cgroup is set, else 0 */
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_STACK_TRACE);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, MAX_STACKS * sizeof(__u64));
+ __uint(max_entries, MAX_ENTRIES);
+} stacks SEC(".maps"); /* filled by bpf_get_stackid() in off_cpu_stat() */
+
+struct offcpu_data { /* raw sample payload assembled by off_cpu_dump() */
+ u64 array[MAX_OFFCPU_LEN];
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(int));
+ __uint(max_entries, MAX_CPUS);
+} offcpu_output SEC(".maps"); /* target of bpf_perf_event_output() in off_cpu_dump() */
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(struct offcpu_data));
+ __uint(max_entries, 1);
+} offcpu_payload SEC(".maps"); /* scratch offcpu_data buffer, looked up with key 0 */
+
+struct {
+ __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, struct tstamp_data);
+} tstamp SEC(".maps"); /* per-task state written at switch-out and read at switch-in */
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(key_size, sizeof(struct offcpu_key));
+ __uint(value_size, sizeof(__u64));
+ __uint(max_entries, MAX_ENTRIES);
+} off_cpu SEC(".maps"); /* accumulated off-cpu time per offcpu_key */
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(__u8));
+ __uint(max_entries, 1);
+} cpu_filter SEC(".maps"); /* a cpu passes can_record() when it is present as a key */
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(__u8));
+ __uint(max_entries, 1);
+} task_filter SEC(".maps"); /* keyed by pid, or by tgid when uses_tgid is set */
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(key_size, sizeof(__u64));
+ __uint(value_size, sizeof(__u8));
+ __uint(max_entries, 1);
+} cgroup_filter SEC(".maps"); /* keyed by the id returned from get_cgroup_id() */
+
+/* new kernel task_struct definition */
+struct task_struct___new {
+ long __state;
+} __attribute__((preserve_access_index));
+
+/* old kernel task_struct definition */
+struct task_struct___old {
+ long state;
+} __attribute__((preserve_access_index));
+
+int enabled = 0; /* on_switch() does nothing while this is 0 */
+
+const volatile int has_cpu = 0; /* apply cpu_filter in can_record() */
+const volatile int has_task = 0; /* apply task_filter in can_record() */
+const volatile int has_cgroup = 0; /* apply cgroup_filter in can_record() */
+const volatile int uses_tgid = 0; /* task_filter is keyed by tgid; also enables on_newtask() */
+
+const volatile bool has_prev_state = false; /* on_switch() reads the previous state from ctx[3] */
+const volatile bool needs_cgroup = false; /* fill offcpu_key.cgroup_id */
+const volatile bool uses_cgroup_v1 = false; /* read the cgroup id via the perf_event subsystem */
+
+int perf_subsys_id = -1; /* cached perf_event cgroup subsys index; -1 until resolved */
+
+__u64 offcpu_thresh_ns; /* periods at least this long are dumped as samples instead of summed */
+
+/*
+ * Old kernel used to call it task_struct->state and now it's '__state'.
+ * Use BPF CO-RE "ignored suffix rule" to deal with it like below:
+ *
+ * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes
+ */
+static inline int get_task_state(struct task_struct *t)
+{
+	/* view the same task through both CO-RE flavors of task_struct */
+	struct task_struct___new *new_view = (void *)t;
+	struct task_struct___old *old_view = (void *)t;
+
+	/* fall back to the old field name when '__state' does not exist */
+	if (!bpf_core_field_exists(new_view->__state))
+		return BPF_CORE_READ(old_view, state);
+
+	return BPF_CORE_READ(new_view, __state);
+}
+/*
+ * Return the cgroup ID of task @t.
+ *
+ * Unless uses_cgroup_v1 is set, the ID comes from t->cgroups->dfl_cgrp.
+ * Otherwise it comes from the cgroup reached through
+ * t->cgroups->subsys[perf_subsys_id].
+ */
+static inline __u64 get_cgroup_id(struct task_struct *t)
+{
+ struct cgroup *cgrp;
+
+ if (!uses_cgroup_v1)
+ return BPF_CORE_READ(t, cgroups, dfl_cgrp, kn, id);
+
+ if (perf_subsys_id == -1) { /* resolve the subsystem index once, then reuse it */
+#if __has_builtin(__builtin_preserve_enum_value)
+ perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
+ perf_event_cgrp_id);
+#else
+ perf_subsys_id = perf_event_cgrp_id;
+#endif
+ }
+
+ cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_subsys_id], cgroup);
+ return BPF_CORE_READ(cgrp, kn, id);
+}
+/*
+ * Decide whether the switch-out of task @t in state @state is recorded.
+ *
+ * Kernel threads and states other than TASK_INTERRUPTIBLE or
+ * TASK_UNINTERRUPTIBLE are rejected; after that every enabled filter map
+ * must contain the matching key. Returns 1 to record, 0 to skip.
+ */
+static inline int can_record(struct task_struct *t, int state)
+{
+ /* kernel threads don't have user stack */
+ if (t->flags & PF_KTHREAD)
+ return 0;
+
+ if (state != TASK_INTERRUPTIBLE &&
+ state != TASK_UNINTERRUPTIBLE)
+ return 0;
+
+ if (has_cpu) {
+ __u32 cpu = bpf_get_smp_processor_id();
+ __u8 *ok;
+
+ ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
+ if (!ok)
+ return 0;
+ }
+
+ if (has_task) {
+ __u8 *ok;
+ __u32 pid;
+
+ if (uses_tgid)
+ pid = t->tgid;
+ else
+ pid = t->pid;
+
+ ok = bpf_map_lookup_elem(&task_filter, &pid);
+ if (!ok)
+ return 0;
+ }
+
+ if (has_cgroup) {
+ __u8 *ok;
+ __u64 cgrp_id = get_cgroup_id(t);
+
+ ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id);
+ if (!ok)
+ return 0;
+ }
+
+ return 1;
+}
+/*
+ * Copy stack entries from @from into @to->array starting at index @n + 2,
+ * stopping at the first zero entry or after MAX_STACKS entries.
+ * Returns the number of entries copied.
+ */
+static inline int copy_stack(struct __stack *from, struct offcpu_data *to, int n)
+{
+ int len = 0;
+
+ for (int i = 0; i < MAX_STACKS && from->array[i]; ++i, ++len)
+ to->array[n + 2 + i] = from->array[i];
+
+ return len;
+}
+
+/**
+ * off_cpu_dump - dump off-cpu samples to ring buffer
+ * @ctx: program context, passed through to bpf_perf_event_output()
+ * @data: payload for dumping off-cpu samples
+ * @key: off-cpu data
+ * @stack: stack trace of the task before being scheduled out
+ * @delta: off-cpu time computed by the caller, stored as the second payload word
+ *
+ * If the threshold of off-cpu time is reached, acquire tid, period, callchain, and cgroup id
+ * information of the task, and dump it as a raw sample to perf ring buffer
+ *
+ * Payload layout in @data->array: [0] tgid << 32 | pid, [1] @delta,
+ * [2] callchain length (copied entries + 1), [3] PERF_CONTEXT_USER,
+ * [4...] the copied stack entries, followed by the cgroup id.
+ *
+ * Return: the result of bpf_perf_event_output().
+ */
+static int off_cpu_dump(void *ctx, struct offcpu_data *data, struct offcpu_key *key,
+ struct __stack *stack, __u64 delta)
+{
+ int n = 0, len = 0;
+
+ data->array[n++] = (u64)key->tgid << 32 | key->pid;
+ data->array[n++] = delta;
+
+ /* data->array[n] is callchain->nr (updated later) */
+ data->array[n + 1] = PERF_CONTEXT_USER;
+ data->array[n + 2] = 0;
+ len = copy_stack(stack, data, n);
+
+ /* update length of callchain */
+ data->array[n] = len + 1; /* + 1 for the PERF_CONTEXT_USER entry */
+ n += len + 2; /* skip the length word, the context marker and the stack */
+
+ data->array[n++] = key->cgroup_id;
+
+ return bpf_perf_event_output(ctx, &offcpu_output, BPF_F_CURRENT_CPU, data, n * sizeof(u64));
+}
+/*
+ * Core of the sched_switch handling.
+ *
+ * For @prev (being switched out): if can_record() accepts it, save the
+ * timestamp, @state, user stack id and a copy of the user stack in its task
+ * storage. For @next (being switched in): if it has a pending timestamp,
+ * compute the off-cpu time; a time of at least offcpu_thresh_ns is dumped
+ * as a sample via off_cpu_dump(), a shorter one is added to the off_cpu
+ * map. The pending timestamp is then cleared. Always returns 0.
+ */
+static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
+ struct task_struct *next, int state)
+{
+ __u64 ts;
+ __u32 stack_id;
+ struct tstamp_data *pelem;
+
+ ts = bpf_ktime_get_ns();
+
+ if (!can_record(prev, state))
+ goto next;
+
+ stack_id = bpf_get_stackid(ctx, &stacks,
+ BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK);
+
+ pelem = bpf_task_storage_get(&tstamp, prev, NULL,
+ BPF_LOCAL_STORAGE_GET_F_CREATE);
+ if (!pelem)
+ goto next;
+
+ pelem->timestamp = ts;
+ pelem->state = state;
+ pelem->stack_id = stack_id;
+
+ /*
+ * If stacks are successfully collected by bpf_get_stackid(), collect them once more
+ * in task_storage for direct off-cpu sample dumping
+ *
+ * NOTE(review): stack_id is a __u32, so a negative error code from
+ * bpf_get_stackid() also satisfies `> 0`, while a stack id of 0 does not.
+ */
+ if (stack_id > 0 && bpf_get_stack(ctx, &pelem->stack, MAX_STACKS * sizeof(u64), BPF_F_USER_STACK)) {
+ /*
+ * This empty if block is used to avoid 'result unused warning' from bpf_get_stack().
+ * If the collection fails, continue with the logic for the next task.
+ */
+ }
+next:
+ pelem = bpf_task_storage_get(&tstamp, next, NULL, 0);
+
+ if (pelem && pelem->timestamp) {
+ struct offcpu_key key = {
+ .pid = next->pid,
+ .tgid = next->tgid,
+ .stack_id = pelem->stack_id,
+ .state = pelem->state,
+ .cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0,
+ };
+ __u64 delta = ts - pelem->timestamp;
+ __u64 *total;
+
+ if (delta >= offcpu_thresh_ns) {
+ int zero = 0;
+ struct offcpu_data *data = bpf_map_lookup_elem(&offcpu_payload, &zero);
+
+ if (data)
+ off_cpu_dump(ctx, data, &key, &pelem->stack, delta);
+ } else {
+ total = bpf_map_lookup_elem(&off_cpu, &key);
+ if (total)
+ *total += delta;
+ else
+ bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);
+ }
+
+ /* prevent to reuse the timestamp later */
+ pelem->timestamp = 0;
+ }
+
+ return 0;
+}
+/*
+ * Handler for the task_newtask tracepoint; only active when uses_tgid is set.
+ *
+ * ctx[0] is used as the new task and ctx[1] as the clone flags. If the
+ * current task's tgid is in task_filter and the new task was not created
+ * with CLONE_THREAD, the new task's tgid is added to task_filter as well.
+ */
+SEC("tp_btf/task_newtask")
+int on_newtask(u64 *ctx)
+{
+ struct task_struct *task;
+ u64 clone_flags;
+ u32 pid;
+ u8 val = 1;
+
+ if (!uses_tgid)
+ return 0;
+
+ task = (struct task_struct *)bpf_get_current_task();
+
+ pid = BPF_CORE_READ(task, tgid);
+ if (!bpf_map_lookup_elem(&task_filter, &pid)) /* the creating task is not being followed */
+ return 0;
+
+ task = (struct task_struct *)ctx[0];
+ clone_flags = ctx[1];
+
+ pid = task->tgid;
+ if (!(clone_flags & CLONE_THREAD))
+ bpf_map_update_elem(&task_filter, &pid, &val, BPF_NOEXIST);
+
+ return 0;
+}
+/*
+ * Handler for the sched_switch tracepoint.
+ *
+ * ctx[1] is used as the previous task and ctx[2] as the next task. The
+ * previous task's state is taken from ctx[3] when has_prev_state is set and
+ * read from the task otherwise; only its low 8 bits are passed on to
+ * off_cpu_stat(). Does nothing while 'enabled' is 0.
+ */
+SEC("tp_btf/sched_switch")
+int on_switch(u64 *ctx)
+{
+ struct task_struct *prev, *next;
+ int prev_state;
+
+ if (!enabled)
+ return 0;
+
+ prev = (struct task_struct *)ctx[1];
+ next = (struct task_struct *)ctx[2];
+
+ if (has_prev_state)
+ prev_state = (int)ctx[3];
+ else
+ prev_state = get_task_state(prev);
+
+ return off_cpu_stat(ctx, prev, next, prev_state & 0xff);
+}
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
diff --git a/tools/perf/util/bpf_skel/sample-filter.h b/tools/perf/util/bpf_skel/sample-filter.h
new file mode 100644
index 000000000000..683fec85e71e
--- /dev/null
+++ b/tools/perf/util/bpf_skel/sample-filter.h
@@ -0,0 +1,72 @@
+#ifndef PERF_UTIL_BPF_SKEL_SAMPLE_FILTER_H
+#define PERF_UTIL_BPF_SKEL_SAMPLE_FILTER_H
+
+#define MAX_FILTERS 64
+#define MAX_IDX_HASH (16 * 1024)
+#define MAX_EVT_HASH (1024 * 1024)
+
+/* supported filter operations */
+enum perf_bpf_filter_op {
+ PBF_OP_EQ,
+ PBF_OP_NEQ,
+ PBF_OP_GT,
+ PBF_OP_GE,
+ PBF_OP_LT,
+ PBF_OP_LE,
+ PBF_OP_AND,
+ PBF_OP_GROUP_BEGIN,
+ PBF_OP_GROUP_END,
+ PBF_OP_DONE,
+};
+
+enum perf_bpf_filter_term {
+ /* No term is in use. */
+ PBF_TERM_NONE = 0,
+ /* Terms that correspond to PERF_SAMPLE_xx values. */
+ PBF_TERM_SAMPLE_START = PBF_TERM_NONE + 1,
+ PBF_TERM_IP = PBF_TERM_SAMPLE_START + 0, /* SAMPLE_IP = 1U << 0 */
+ PBF_TERM_TID = PBF_TERM_SAMPLE_START + 1, /* SAMPLE_TID = 1U << 1 */
+ PBF_TERM_TIME = PBF_TERM_SAMPLE_START + 2, /* SAMPLE_TIME = 1U << 2 */
+ PBF_TERM_ADDR = PBF_TERM_SAMPLE_START + 3, /* SAMPLE_ADDR = 1U << 3 */
+ __PBF_UNUSED_TERM4 = PBF_TERM_SAMPLE_START + 4, /* SAMPLE_READ = 1U << 4 */
+ __PBF_UNUSED_TERM5 = PBF_TERM_SAMPLE_START + 5, /* SAMPLE_CALLCHAIN = 1U << 5 */
+ PBF_TERM_ID = PBF_TERM_SAMPLE_START + 6, /* SAMPLE_ID = 1U << 6 */
+ PBF_TERM_CPU = PBF_TERM_SAMPLE_START + 7, /* SAMPLE_CPU = 1U << 7 */
+ PBF_TERM_PERIOD = PBF_TERM_SAMPLE_START + 8, /* SAMPLE_PERIOD = 1U << 8 */
+ __PBF_UNUSED_TERM9 = PBF_TERM_SAMPLE_START + 9, /* SAMPLE_STREAM_ID = 1U << 9 */
+ __PBF_UNUSED_TERM10 = PBF_TERM_SAMPLE_START + 10, /* SAMPLE_RAW = 1U << 10 */
+ __PBF_UNUSED_TERM11 = PBF_TERM_SAMPLE_START + 11, /* SAMPLE_BRANCH_STACK = 1U << 11 */
+ __PBF_UNUSED_TERM12 = PBF_TERM_SAMPLE_START + 12, /* SAMPLE_REGS_USER = 1U << 12 */
+ __PBF_UNUSED_TERM13 = PBF_TERM_SAMPLE_START + 13, /* SAMPLE_STACK_USER = 1U << 13 */
+ PBF_TERM_WEIGHT = PBF_TERM_SAMPLE_START + 14, /* SAMPLE_WEIGHT = 1U << 14 */
+ PBF_TERM_DATA_SRC = PBF_TERM_SAMPLE_START + 15, /* SAMPLE_DATA_SRC = 1U << 15 */
+ __PBF_UNUSED_TERM16 = PBF_TERM_SAMPLE_START + 16, /* SAMPLE_IDENTIFIER = 1U << 16 */
+ PBF_TERM_TRANSACTION = PBF_TERM_SAMPLE_START + 17, /* SAMPLE_TRANSACTION = 1U << 17 */
+ __PBF_UNUSED_TERM18 = PBF_TERM_SAMPLE_START + 18, /* SAMPLE_REGS_INTR = 1U << 18 */
+ PBF_TERM_PHYS_ADDR = PBF_TERM_SAMPLE_START + 19, /* SAMPLE_PHYS_ADDR = 1U << 19 */
+ __PBF_UNUSED_TERM20 = PBF_TERM_SAMPLE_START + 20, /* SAMPLE_AUX = 1U << 20 */
+ PBF_TERM_CGROUP = PBF_TERM_SAMPLE_START + 21, /* SAMPLE_CGROUP = 1U << 21 */
+ PBF_TERM_DATA_PAGE_SIZE = PBF_TERM_SAMPLE_START + 22, /* SAMPLE_DATA_PAGE_SIZE = 1U << 22 */
+ PBF_TERM_CODE_PAGE_SIZE = PBF_TERM_SAMPLE_START + 23, /* SAMPLE_CODE_PAGE_SIZE = 1U << 23 */
+ PBF_TERM_WEIGHT_STRUCT = PBF_TERM_SAMPLE_START + 24, /* SAMPLE_WEIGHT_STRUCT = 1U << 24 */
+ PBF_TERM_SAMPLE_END = PBF_TERM_WEIGHT_STRUCT,
+ /* Terms computed from BPF helpers. */
+ PBF_TERM_UID,
+ PBF_TERM_GID,
+};
+
+/* BPF map entry for filtering */
+struct perf_bpf_filter_entry {
+ enum perf_bpf_filter_op op;
+ __u32 part; /* sub-sample type info when it has multiple values */
+ enum perf_bpf_filter_term term;
+ __u64 value;
+};
+
+struct idx_hash_key { /* key type of the idx_hash map (tgid/evtid to filter index) */
+ __u64 evt_id; /* presumably the representative event ID from event_hash - TODO confirm */
+ __u32 tgid;
+ __u32 reserved;
+};
+
+#endif /* PERF_UTIL_BPF_SKEL_SAMPLE_FILTER_H */
diff --git a/tools/perf/util/bpf_skel/sample_filter.bpf.c b/tools/perf/util/bpf_skel/sample_filter.bpf.c
new file mode 100644
index 000000000000..b195e6efeb8b
--- /dev/null
+++ b/tools/perf/util/bpf_skel/sample_filter.bpf.c
@@ -0,0 +1,298 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (c) 2023 Google
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+#include "sample-filter.h"
+
+/* BPF map that will be filled by user space */
+struct filters {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, int); /* filter index: 0, or the value found in idx_hash */
+ __type(value, struct perf_bpf_filter_entry[MAX_FILTERS]); /* terms, evaluated in order */
+ __uint(max_entries, 1); /* NOTE(review): presumably resized from user space - confirm */
+} filters SEC(".maps");
+
+/*
+ * An evsel has multiple instances for each CPU or task but we need a single
+ * id to be used as a key for the idx_hash. This hashmap translates the
+ * instance's ID to a representative ID.
+ */
+struct event_hash {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, __u64); /* id of the event, or of its parent when it has one */
+ __type(value, __u64); /* representative id, copied into idx_hash_key.evt_id */
+ __uint(max_entries, 1);
+} event_hash SEC(".maps");
+
+/* tgid/evtid to filter index */
+struct idx_hash {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, struct idx_hash_key);
+ __type(value, int); /* used as the key into the filters map */
+ __uint(max_entries, 1);
+} idx_hash SEC(".maps");
+
+/* filter index to number of samples dropped by that filter */
+struct lost_count {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __type(key, int); /* same index that selects the entry in filters */
+ __type(value, int); /* incremented atomically on the drop path */
+ __uint(max_entries, 1);
+} dropped SEC(".maps");
+
+volatile const int use_idx_hash; /* non-zero: pick the filter via event_hash and idx_hash */
+
+void *bpf_cast_to_kern_ctx(void *) __ksym;
+
+/* new kernel perf_sample_data definition */
+struct perf_sample_data___new {
+ __u64 sample_flags; /* presence is probed with bpf_core_field_exists() before use */
+} __attribute__((preserve_access_index));
+
+/* new kernel perf_mem_data_src definition; only mem_hops is read through this type */
+union perf_mem_data_src___new {
+ __u64 val;
+ struct {
+ __u64 mem_op:5, /* type of opcode */
+ mem_lvl:14, /* memory hierarchy level */
+ mem_snoop:5, /* snoop mode */
+ mem_lock:2, /* lock instr */
+ mem_dtlb:7, /* tlb access */
+ mem_lvl_num:4, /* memory hierarchy level number */
+ mem_remote:1, /* remote */
+ mem_snoopx:2, /* snoop mode, ext */
+ mem_blk:3, /* access blocked */
+ mem_hops:3, /* hop level */
+ mem_rsvd:18;
+ };
+};
+
+/* Fetch the sample field selected by entry->term (and ->part); 0 if unavailable or unsupported. */
+static inline __u64 perf_get_sample(struct bpf_perf_event_data_kern *kctx,
+ struct perf_bpf_filter_entry *entry)
+{
+ struct perf_sample_data___new *data = (void *)kctx->data;
+
+ if (!bpf_core_field_exists(data->sample_flags)) /* cannot tell which fields are valid */
+ return 0;
+
+#define BUILD_CHECK_SAMPLE(x) \
+ _Static_assert((1ULL << (PBF_TERM_##x - PBF_TERM_SAMPLE_START)) == PERF_SAMPLE_##x, \
+ "Mismatched PBF term to sample bit " #x)
+ BUILD_CHECK_SAMPLE(IP);
+ BUILD_CHECK_SAMPLE(TID);
+ BUILD_CHECK_SAMPLE(TIME);
+ BUILD_CHECK_SAMPLE(ADDR);
+ BUILD_CHECK_SAMPLE(ID);
+ BUILD_CHECK_SAMPLE(CPU);
+ BUILD_CHECK_SAMPLE(PERIOD);
+ BUILD_CHECK_SAMPLE(WEIGHT);
+ BUILD_CHECK_SAMPLE(DATA_SRC);
+ BUILD_CHECK_SAMPLE(TRANSACTION);
+ BUILD_CHECK_SAMPLE(PHYS_ADDR);
+ BUILD_CHECK_SAMPLE(CGROUP);
+ BUILD_CHECK_SAMPLE(DATA_PAGE_SIZE);
+ BUILD_CHECK_SAMPLE(CODE_PAGE_SIZE);
+ BUILD_CHECK_SAMPLE(WEIGHT_STRUCT);
+#undef BUILD_CHECK_SAMPLE
+
+ /* For sample terms check the sample bit is set; the mask is 64-bit like sample_flags. */
+ if (entry->term >= PBF_TERM_SAMPLE_START && entry->term <= PBF_TERM_SAMPLE_END &&
+ (data->sample_flags & (1ULL << (entry->term - PBF_TERM_SAMPLE_START))) == 0)
+ return 0;
+
+ switch (entry->term) {
+ case PBF_TERM_IP:
+ return kctx->data->ip;
+ case PBF_TERM_ID:
+ return kctx->data->id;
+ case PBF_TERM_TID:
+ if (entry->part) /* non-zero part selects the pid half of tid_entry */
+ return kctx->data->tid_entry.pid;
+ else
+ return kctx->data->tid_entry.tid;
+ case PBF_TERM_CPU:
+ return kctx->data->cpu_entry.cpu;
+ case PBF_TERM_TIME:
+ return kctx->data->time;
+ case PBF_TERM_ADDR:
+ return kctx->data->addr;
+ case PBF_TERM_PERIOD:
+ return kctx->data->period;
+ case PBF_TERM_TRANSACTION:
+ return kctx->data->txn;
+ case PBF_TERM_WEIGHT_STRUCT:
+ if (entry->part == 1)
+ return kctx->data->weight.var1_dw;
+ if (entry->part == 2)
+ return kctx->data->weight.var2_w;
+ if (entry->part == 3)
+ return kctx->data->weight.var3_w;
+ /* fall through: any other part returns the full weight */
+ case PBF_TERM_WEIGHT:
+ return kctx->data->weight.full;
+ case PBF_TERM_PHYS_ADDR:
+ return kctx->data->phys_addr;
+ case PBF_TERM_CGROUP:
+ return kctx->data->cgroup;
+ case PBF_TERM_CODE_PAGE_SIZE:
+ return kctx->data->code_page_size;
+ case PBF_TERM_DATA_PAGE_SIZE:
+ return kctx->data->data_page_size;
+ case PBF_TERM_DATA_SRC:
+ if (entry->part == 1)
+ return kctx->data->data_src.mem_op;
+ if (entry->part == 2)
+ return kctx->data->data_src.mem_lvl_num;
+ if (entry->part == 3) {
+ __u32 snoop = kctx->data->data_src.mem_snoop;
+ __u32 snoopx = kctx->data->data_src.mem_snoopx;
+
+ return (snoopx << 5) | snoop; /* snoopx sits above the 5 mem_snoop bits */
+ }
+ if (entry->part == 4)
+ return kctx->data->data_src.mem_remote;
+ if (entry->part == 5)
+ return kctx->data->data_src.mem_lock;
+ if (entry->part == 6)
+ return kctx->data->data_src.mem_dtlb;
+ if (entry->part == 7)
+ return kctx->data->data_src.mem_blk;
+ if (entry->part == 8) {
+ union perf_mem_data_src___new *src = (void *)&kctx->data->data_src;
+
+ if (bpf_core_field_exists(src->mem_hops))
+ return src->mem_hops;
+
+ return 0;
+ }
+ /* return the whole word */
+ return kctx->data->data_src.val;
+ case PBF_TERM_UID:
+ return bpf_get_current_uid_gid() & 0xFFFFFFFF;
+ case PBF_TERM_GID:
+ return bpf_get_current_uid_gid() >> 32;
+ case PBF_TERM_NONE:
+ case __PBF_UNUSED_TERM4:
+ case __PBF_UNUSED_TERM5:
+ case __PBF_UNUSED_TERM9:
+ case __PBF_UNUSED_TERM10:
+ case __PBF_UNUSED_TERM11:
+ case __PBF_UNUSED_TERM12:
+ case __PBF_UNUSED_TERM13:
+ case __PBF_UNUSED_TERM16:
+ case __PBF_UNUSED_TERM18:
+ case __PBF_UNUSED_TERM20:
+ default:
+ break;
+ }
+ return 0; /* term without a backing sample field */
+}
+
+#define CHECK_RESULT(data, op, val) /* uses the caller's in_group, group_result and drop label */ \
+ if (!(data op val)) { \
+ if (!in_group) /* outside a group a failed term drops the sample */ \
+ goto drop; \
+ } else if (in_group) { /* inside a group one passing term is enough */ \
+ group_result = 1; \
+ }
+
+/* BPF program to be called from perf event overflow handler; returns 1 to keep, 0 to drop */
+SEC("perf_event")
+int perf_sample_filter(void *ctx)
+{
+ struct bpf_perf_event_data_kern *kctx;
+ struct perf_bpf_filter_entry *entry;
+ __u64 sample_data;
+ int in_group = 0;
+ int group_result = 0;
+ int i, k;
+ int *losts;
+
+ kctx = bpf_cast_to_kern_ctx(ctx);
+
+ k = 0; /* filter index used when use_idx_hash is off */
+
+ if (use_idx_hash) {
+ struct idx_hash_key key = {
+ .tgid = bpf_get_current_pid_tgid() >> 32, /* .reserved stays zero */
+ };
+ __u64 eid = kctx->event->id;
+ __u64 *key_id;
+ int *idx;
+
+ /* get primary_event_id */
+ if (kctx->event->parent)
+ eid = kctx->event->parent->id;
+
+ key_id = bpf_map_lookup_elem(&event_hash, &eid);
+ if (key_id == NULL) /* event id unknown to event_hash */
+ goto drop;
+
+ key.evt_id = *key_id;
+
+ idx = bpf_map_lookup_elem(&idx_hash, &key);
+ if (idx)
+ k = *idx;
+ else /* no filter for this tgid/event pair */
+ goto drop;
+ }
+
+ entry = bpf_map_lookup_elem(&filters, &k);
+ if (entry == NULL)
+ goto drop;
+
+ for (i = 0; i < MAX_FILTERS; i++) { /* terms are evaluated in order until PBF_OP_DONE */
+ sample_data = perf_get_sample(kctx, &entry[i]);
+
+ switch (entry[i].op) {
+ case PBF_OP_EQ:
+ CHECK_RESULT(sample_data, ==, entry[i].value)
+ break;
+ case PBF_OP_NEQ:
+ CHECK_RESULT(sample_data, !=, entry[i].value)
+ break;
+ case PBF_OP_GT:
+ CHECK_RESULT(sample_data, >, entry[i].value)
+ break;
+ case PBF_OP_GE:
+ CHECK_RESULT(sample_data, >=, entry[i].value)
+ break;
+ case PBF_OP_LT:
+ CHECK_RESULT(sample_data, <, entry[i].value)
+ break;
+ case PBF_OP_LE:
+ CHECK_RESULT(sample_data, <=, entry[i].value)
+ break;
+ case PBF_OP_AND:
+ CHECK_RESULT(sample_data, &, entry[i].value)
+ break;
+ case PBF_OP_GROUP_BEGIN:
+ in_group = 1;
+ group_result = 0;
+ break;
+ case PBF_OP_GROUP_END:
+ if (group_result == 0) /* no term inside the group matched */
+ goto drop;
+ in_group = 0;
+ break;
+ case PBF_OP_DONE:
+ /* no failures so far, accept it */
+ return 1;
+ }
+ }
+ /* generate sample data */
+ return 1;
+
+drop:
+ losts = bpf_map_lookup_elem(&dropped, &k); /* k is still 0 if the index lookup failed */
+ if (losts != NULL)
+ __sync_fetch_and_add(losts, 1);
+
+ return 0;
+}
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
diff --git a/tools/perf/util/bpf_skel/syscall_summary.bpf.c b/tools/perf/util/bpf_skel/syscall_summary.bpf.c
new file mode 100644
index 000000000000..1bcd066a5199
--- /dev/null
+++ b/tools/perf/util/bpf_skel/syscall_summary.bpf.c
@@ -0,0 +1,153 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Trace raw_syscalls tracepoints to collect system call statistics.
+ */
+
+#include "vmlinux.h"
+#include "syscall_summary.h"
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+/* This is to calculate a delta between sys-enter and sys-exit for each thread */
+struct syscall_trace {
+ int nr; /* syscall number is only available at sys-enter */
+ int unused; /* always written as 0 by sys_enter */
+ u64 timestamp; /* bpf_ktime_get_ns() taken at sys-enter */
+};
+
+#define MAX_ENTRIES (128 * 1024)
+
+struct syscall_trace_map {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, int); /* tid */
+ __type(value, struct syscall_trace); /* written by sys_enter, deleted by sys_exit */
+ __uint(max_entries, MAX_ENTRIES);
+} syscall_trace_map SEC(".maps");
+
+struct syscall_stats_map {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, struct syscall_key); /* {cpu or tid, cgroup id, syscall nr} */
+ __type(value, struct syscall_stats); /* accumulated by update_stats() */
+ __uint(max_entries, MAX_ENTRIES);
+} syscall_stats_map SEC(".maps");
+
+int enabled; /* controlled from userspace */
+
+const volatile enum syscall_aggr_mode aggr_mode; /* selects the stats key in sys_exit */
+const volatile int use_cgroup_v2; /* non-zero: use bpf_get_current_cgroup_id() */
+
+int perf_subsys_id = -1; /* -1 until get_current_cgroup_id() resolves it */
+
+static inline __u64 get_current_cgroup_id(void) /* cgroup id of the current task */
+{
+ struct task_struct *task;
+ struct cgroup *cgrp;
+
+ if (use_cgroup_v2) /* v2: the helper returns the id directly */
+ return bpf_get_current_cgroup_id();
+
+ task = bpf_get_current_task_btf();
+
+ if (perf_subsys_id == -1) { /* not resolved yet: look it up once and cache it */
+#if __has_builtin(__builtin_preserve_enum_value)
+ perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
+ perf_event_cgrp_id);
+#else
+ perf_subsys_id = perf_event_cgrp_id; /* compile-time value from vmlinux.h */
+#endif
+ }
+
+ cgrp = BPF_CORE_READ(task, cgroups, subsys[perf_subsys_id], cgroup); /* perf_event css */
+ return BPF_CORE_READ(cgrp, kn, id); /* id of that cgroup's kernfs node */
+}
+
+/* Account one finished syscall in syscall_stats_map under {cpu_or_tid, cgroup_id, nr}. */
+/* ret < 0 counts as an error; timing is recorded only when duration (a ns delta) is > 0. */
+static void update_stats(int cpu_or_tid, u64 cgroup_id, int nr, s64 duration,
+ long ret)
+{
+ struct syscall_key key = {
+ .cpu_or_tid = cpu_or_tid,
+ .cgroup = cgroup_id,
+ .nr = nr,
+ };
+ struct syscall_stats *stats;
+
+ stats = bpf_map_lookup_elem(&syscall_stats_map, &key);
+ if (stats == NULL) {
+ struct syscall_stats zero = {};
+
+ bpf_map_update_elem(&syscall_stats_map, &key, &zero, BPF_NOEXIST);
+ stats = bpf_map_lookup_elem(&syscall_stats_map, &key); /* result of the insert is checked here */
+ if (stats == NULL)
+ return;
+ }
+
+ __sync_fetch_and_add(&stats->count, 1);
+ if (ret < 0)
+ __sync_fetch_and_add(&stats->error, 1);
+
+ if (duration > 0) { /* positive, so the u64 casts below are exact */
+ __sync_fetch_and_add(&stats->total_time, duration);
+ __sync_fetch_and_add(&stats->squared_sum, (u64)duration * (u64)duration);
+ if (stats->max_time < duration)
+ stats->max_time = duration;
+ if (stats->min_time > duration || stats->min_time == 0) /* 0 means not set yet */
+ stats->min_time = duration;
+ }
+}
+
+SEC("tp_btf/sys_enter")
+int sys_enter(u64 *ctx) /* records the syscall number and entry time for the current thread */
+{
+ int tid;
+ struct syscall_trace st;
+
+ if (!enabled)
+ return 0;
+
+ st.nr = ctx[1]; /* syscall number */
+ st.unused = 0; /* every member is set before st is copied into the map */
+ st.timestamp = bpf_ktime_get_ns();
+
+ tid = bpf_get_current_pid_tgid(); /* conversion to int keeps the low 32 bits */
+ bpf_map_update_elem(&syscall_trace_map, &tid, &st, BPF_ANY);
+
+ return 0;
+}
+
+SEC("tp_btf/sys_exit")
+int sys_exit(u64 *ctx) /* pairs with sys_enter and feeds the elapsed time to update_stats() */
+{
+ int tid;
+ int key = 0; /* stays 0 when aggregating by cgroup */
+ u64 cgroup = 0; /* stays 0 unless aggregating by cgroup */
+ long ret = ctx[1]; /* return value of the syscall */
+ struct syscall_trace *st;
+ s64 delta;
+
+ if (!enabled)
+ return 0;
+
+ tid = bpf_get_current_pid_tgid();
+ st = bpf_map_lookup_elem(&syscall_trace_map, &tid);
+ if (st == NULL) /* nothing recorded by sys_enter for this thread */
+ return 0;
+
+ if (aggr_mode == SYSCALL_AGGR_THREAD)
+ key = tid;
+ else if (aggr_mode == SYSCALL_AGGR_CGROUP)
+ cgroup = get_current_cgroup_id();
+ else /* every other mode aggregates per CPU */
+ key = bpf_get_smp_processor_id();
+
+ delta = bpf_ktime_get_ns() - st->timestamp;
+ update_stats(key, cgroup, st->nr, delta, ret);
+
+ bpf_map_delete_elem(&syscall_trace_map, &tid); /* the entry has been consumed */
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/perf/util/bpf_skel/syscall_summary.h b/tools/perf/util/bpf_skel/syscall_summary.h
new file mode 100644
index 000000000000..72ccccb45925
--- /dev/null
+++ b/tools/perf/util/bpf_skel/syscall_summary.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+/* Data structures shared between BPF and tools. */
+#ifndef UTIL_BPF_SKEL_SYSCALL_SUMMARY_H
+#define UTIL_BPF_SKEL_SYSCALL_SUMMARY_H
+
+enum syscall_aggr_mode { /* how sys_exit builds the syscall_key */
+ SYSCALL_AGGR_THREAD, /* cpu_or_tid holds the thread id */
+ SYSCALL_AGGR_CPU, /* cpu_or_tid holds the CPU number */
+ SYSCALL_AGGR_CGROUP, /* cgroup holds the cgroup id, cpu_or_tid is 0 */
+};
+
+struct syscall_key { /* key of syscall_stats_map */
+ u64 cgroup; /* cgroup id, or 0 when not aggregating by cgroup */
+ int cpu_or_tid; /* thread id or CPU number, depending on the aggregation mode */
+ int nr; /* syscall number captured at sys-enter */
+};
+
+struct syscall_stats { /* value of syscall_stats_map; times are bpf_ktime_get_ns() deltas */
+ u64 total_time; /* sum of positive durations */
+ u64 squared_sum; /* sum of squared positive durations */
+ u64 max_time; /* largest duration seen */
+ u64 min_time; /* smallest duration seen; 0 means not set yet */
+ u32 count; /* number of completed syscalls */
+ u32 error; /* number of syscalls that returned a negative value */
+};
+
+#endif /* UTIL_BPF_SKEL_SYSCALL_SUMMARY_H */
diff --git a/tools/perf/util/bpf_skel/vmlinux/.gitignore b/tools/perf/util/bpf_skel/vmlinux/.gitignore
new file mode 100644
index 000000000000..49502c04183a
--- /dev/null
+++ b/tools/perf/util/bpf_skel/vmlinux/.gitignore
@@ -0,0 +1 @@
+!vmlinux.h
diff --git a/tools/perf/util/bpf_skel/vmlinux/vmlinux.h b/tools/perf/util/bpf_skel/vmlinux/vmlinux.h
new file mode 100644
index 000000000000..a59ce912be18
--- /dev/null
+++ b/tools/perf/util/bpf_skel/vmlinux/vmlinux.h
@@ -0,0 +1,215 @@
+#ifndef __VMLINUX_H
+#define __VMLINUX_H
+
+#include <linux/stddef.h> // for the definition of __always_inline
+#include <linux/bpf.h>
+#include <linux/types.h>
+#include <linux/perf_event.h>
+#include <stdbool.h>
+
+// non-UAPI kernel data structures, used in the .bpf.c BPF tool component.
+
+// Just the fields used in these tools preserving the access index so that
+// libbpf can fixup offsets with the ones used in the kernel when loading the
+// BPF bytecode, if they differ from what is used here.
+
+typedef __u8 u8;
+typedef __u32 u32;
+typedef __s32 s32;
+typedef __u64 u64;
+typedef __s64 s64;
+
+typedef int pid_t;
+
+typedef __s64 time64_t;
+
+struct timespec64 {
+ time64_t tv_sec;
+ long int tv_nsec;
+};
+
+enum cgroup_subsys_id {
+ perf_event_cgrp_id = 8, /* syscall_summary.bpf.c uses this literal only without __builtin_preserve_enum_value */
+};
+
+enum {
+ HI_SOFTIRQ = 0,
+ TIMER_SOFTIRQ,
+ NET_TX_SOFTIRQ,
+ NET_RX_SOFTIRQ,
+ BLOCK_SOFTIRQ,
+ IRQ_POLL_SOFTIRQ,
+ TASKLET_SOFTIRQ,
+ SCHED_SOFTIRQ,
+ HRTIMER_SOFTIRQ,
+ RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */
+
+ NR_SOFTIRQS
+};
+
+typedef struct {
+ s64 counter;
+} __attribute__((preserve_access_index)) atomic64_t;
+
+typedef atomic64_t atomic_long_t;
+
+struct raw_spinlock {
+ int rawlock;
+} __attribute__((preserve_access_index));
+
+typedef struct raw_spinlock raw_spinlock_t;
+
+typedef struct {
+ struct raw_spinlock rlock;
+} __attribute__((preserve_access_index)) spinlock_t;
+
+struct sighand_struct {
+ spinlock_t siglock;
+} __attribute__((preserve_access_index));
+
+struct rw_semaphore {
+ atomic_long_t owner;
+} __attribute__((preserve_access_index));
+
+struct mutex {
+ atomic_long_t owner;
+} __attribute__((preserve_access_index));
+
+struct kernfs_node {
+ u64 id; /* value returned by get_current_cgroup_id() on its non-v2 path */
+} __attribute__((preserve_access_index));
+
+struct cgroup {
+ struct kernfs_node *kn;
+ int level;
+} __attribute__((preserve_access_index));
+
+struct cgroup_subsys_state {
+ struct cgroup *cgroup;
+} __attribute__((preserve_access_index));
+
+struct css_set {
+ struct cgroup_subsys_state *subsys[13]; /* indexed by a cgroup_subsys_id value */
+ struct cgroup *dfl_cgrp;
+} __attribute__((preserve_access_index));
+
+struct mm_struct {
+ struct rw_semaphore mmap_lock;
+} __attribute__((preserve_access_index));
+
+struct task_struct {
+ unsigned int flags;
+ struct mm_struct *mm;
+ pid_t pid;
+ pid_t tgid;
+ char comm[16];
+ struct sighand_struct *sighand;
+ struct css_set *cgroups;
+} __attribute__((preserve_access_index));
+
+struct trace_entry {
+ short unsigned int type;
+ unsigned char flags;
+ unsigned char preempt_count;
+ int pid;
+} __attribute__((preserve_access_index));
+
+struct trace_event_raw_irq_handler_entry {
+ struct trace_entry ent;
+ int irq;
+ u32 __data_loc_name;
+ char __data[];
+} __attribute__((preserve_access_index));
+
+struct trace_event_raw_irq_handler_exit {
+ struct trace_entry ent;
+ int irq;
+ int ret;
+ char __data[];
+} __attribute__((preserve_access_index));
+
+struct trace_event_raw_softirq {
+ struct trace_entry ent;
+ unsigned int vec;
+ char __data[];
+} __attribute__((preserve_access_index));
+
+struct trace_event_raw_workqueue_execute_start {
+ struct trace_entry ent;
+ void *work;
+ void *function;
+ char __data[];
+} __attribute__((preserve_access_index));
+
+struct trace_event_raw_workqueue_execute_end {
+ struct trace_entry ent;
+ void *work;
+ void *function;
+ char __data[];
+} __attribute__((preserve_access_index));
+
+struct trace_event_raw_workqueue_activate_work {
+ struct trace_entry ent;
+ void *work;
+ char __data[];
+} __attribute__((preserve_access_index));
+
+struct perf_sample_data { /* each member is read by perf_get_sample() for the term noted */
+ u64 addr; /* PBF_TERM_ADDR */
+ u64 period; /* PBF_TERM_PERIOD */
+ union perf_sample_weight weight; /* PBF_TERM_WEIGHT, PBF_TERM_WEIGHT_STRUCT */
+ u64 txn; /* PBF_TERM_TRANSACTION */
+ union perf_mem_data_src data_src; /* PBF_TERM_DATA_SRC */
+ u64 ip; /* PBF_TERM_IP */
+ struct {
+ u32 pid;
+ u32 tid;
+ } tid_entry; /* PBF_TERM_TID */
+ u64 time; /* PBF_TERM_TIME */
+ u64 id; /* PBF_TERM_ID */
+ struct {
+ u32 cpu;
+ } cpu_entry; /* PBF_TERM_CPU */
+ u64 phys_addr; /* PBF_TERM_PHYS_ADDR */
+ u64 cgroup; /* PBF_TERM_CGROUP */
+ u64 data_page_size; /* PBF_TERM_DATA_PAGE_SIZE */
+ u64 code_page_size; /* PBF_TERM_CODE_PAGE_SIZE */
+} __attribute__((__aligned__(64))) __attribute__((preserve_access_index));
+
+struct perf_event {
+ struct perf_event *parent; /* when set, perf_sample_filter() uses parent->id instead of id */
+ u64 id; /* looked up in event_hash by perf_sample_filter() */
+} __attribute__((preserve_access_index));
+
+struct bpf_perf_event_data_kern { /* type perf_sample_filter() gets from bpf_cast_to_kern_ctx() */
+ struct perf_sample_data *data; /* sample fields consumed by perf_get_sample() */
+ struct perf_event *event; /* source of the event id used for filter selection */
+} __attribute__((preserve_access_index));
+
+/*
+ * If 'struct rq' isn't defined for lock_contention.bpf.c, for the sake of
+ * rq___old and rq___new, then the type for the 'runqueue' variable ends up
+ * being a forward declaration (BTF_KIND_FWD) while the kernel has it defined
+ * (BTF_KIND_STRUCT). The definition appears in vmlinux.h rather than
+ * lock_contention.bpf.c for consistency with a generated vmlinux.h.
+ */
+struct rq {};
+
+struct kmem_cache {
+ const char *name;
+} __attribute__((preserve_access_index));
+
+struct bpf_iter__kmem_cache {
+ struct kmem_cache *s;
+} __attribute__((preserve_access_index));
+
+struct zone {
+ spinlock_t lock;
+} __attribute__((preserve_access_index));
+
+struct pglist_data {
+ struct zone node_zones[6]; /* value for all possible config */
+ int nr_zones;
+} __attribute__((preserve_access_index));
+
+#endif // __VMLINUX_H