Diffstat (limited to 'tools/perf/util/bpf_skel')
20 files changed, 4263 insertions, 0 deletions
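The objects in the hunks below are compiled into libbpf skeletons (the generated *.skel.h headers appear in the .gitignore hunk that follows). As a minimal sketch of how the perf side typically drives one of them — taking kwork_trace.bpf.c as the example and assuming the usual bpftool-generated skeleton names, which are not part of this diff — read-only constants are set between open and load, writable globals after load:

	/* Illustrative sketch only, not part of this diff. */
	#include "bpf_skel/kwork_trace.skel.h"	/* generated by bpftool gen skeleton */

	static struct kwork_trace_bpf *skel;

	static int kwork_trace_prepare(void)
	{
		int err;

		skel = kwork_trace_bpf__open();		/* parse the embedded BPF object */
		if (!skel)
			return -1;

		/* const volatile globals live in .rodata and must be set before load */
		skel->rodata->has_cpu_filter = 0;

		err = kwork_trace_bpf__load(skel);	/* create maps, verify and load progs */
		if (err)
			goto out_destroy;

		err = kwork_trace_bpf__attach(skel);	/* auto-attach the tracepoint programs */
		if (err)
			goto out_destroy;

		skel->bss->enabled = 1;			/* writable global: toggled at runtime */
		return 0;

	out_destroy:
		kwork_trace_bpf__destroy(skel);
		skel = NULL;
		return err;
	}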
diff --git a/tools/perf/util/bpf_skel/.gitignore b/tools/perf/util/bpf_skel/.gitignore new file mode 100644 index 000000000000..cd01455e1b53 --- /dev/null +++ b/tools/perf/util/bpf_skel/.gitignore @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +.tmp +*.skel.h +vmlinux.h diff --git a/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c new file mode 100644 index 000000000000..e4352881e3fa --- /dev/null +++ b/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c @@ -0,0 +1,579 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Augment the raw_syscalls tracepoints with the contents of the pointer arguments. + * + * This exactly matches what is marshalled into the raw_syscall:sys_enter + * payload expected by the 'perf trace' beautifiers. + */ + +#include "vmlinux.h" +#include "../trace_augment.h" + +#include <bpf/bpf_helpers.h> +#include <linux/limits.h> + +#define PERF_ALIGN(x, a) __PERF_ALIGN_MASK(x, (typeof(x))(a)-1) +#define __PERF_ALIGN_MASK(x, mask) (((x)+(mask))&~(mask)) + +/** + * is_power_of_2() - check if a value is a power of two + * @n: the value to check + * + * Determine whether some value is a power of two, where zero is *not* + * considered a power of two. Return: true if @n is a power of 2, otherwise + * false. + */ +#define is_power_of_2(n) (n != 0 && ((n & (n - 1)) == 0)) + +#define MAX_CPUS 4096 + +/* bpf-output associated map */ +struct __augmented_syscalls__ { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __type(key, int); + __type(value, __u32); + __uint(max_entries, MAX_CPUS); +} __augmented_syscalls__ SEC(".maps"); + +/* + * What to augment at entry? + * + * Pointer arg payloads (filenames, etc) passed from userspace to the kernel + */ +struct syscalls_sys_enter { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __type(key, __u32); + __type(value, __u32); + __uint(max_entries, 512); +} syscalls_sys_enter SEC(".maps"); + +/* + * What to augment at exit? + * + * Pointer arg payloads returned from the kernel (struct stat, etc) to userspace. + */ +struct syscalls_sys_exit { + __uint(type, BPF_MAP_TYPE_PROG_ARRAY); + __type(key, __u32); + __type(value, __u32); + __uint(max_entries, 512); +} syscalls_sys_exit SEC(".maps"); + +struct syscall_enter_args { + unsigned long long common_tp_fields; + long syscall_nr; + unsigned long args[6]; +}; + +struct syscall_exit_args { + unsigned long long common_tp_fields; + long syscall_nr; + long ret; +}; + +/* + * Desired design of maximum size and alignment (see RFC2553) + */ +#define SS_MAXSIZE 128 /* Implementation specific max size */ + +typedef unsigned short sa_family_t; + +/* + * FIXME: Should come from system headers + * + * The definition uses anonymous union and struct in order to control the + * default alignment. 
+ */ +struct sockaddr_storage { + union { + struct { + sa_family_t ss_family; /* address family */ + /* Following field(s) are implementation specific */ + char __data[SS_MAXSIZE - sizeof(unsigned short)]; + /* space to achieve desired size, */ + /* _SS_MAXSIZE value minus size of ss_family */ + }; + void *__align; /* implementation specific desired alignment */ + }; +}; + +struct augmented_arg { + unsigned int size; + int err; + union { + char value[PATH_MAX]; + struct sockaddr_storage saddr; + }; +}; + +struct pids_filtered { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, pid_t); + __type(value, bool); + __uint(max_entries, 64); +} pids_filtered SEC(".maps"); + +struct augmented_args_payload { + struct syscall_enter_args args; + struct augmented_arg arg, arg2; // We have to reserve space for two arguments (rename, etc) +}; + +// We need more tmp space than the BPF stack can give us +struct augmented_args_tmp { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, int); + __type(value, struct augmented_args_payload); + __uint(max_entries, 1); +} augmented_args_tmp SEC(".maps"); + +struct beauty_map_enter { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, int); + __type(value, __u32[6]); + __uint(max_entries, 512); +} beauty_map_enter SEC(".maps"); + +struct beauty_payload_enter { + struct syscall_enter_args args; + struct augmented_arg aug_args[6]; +}; + +struct beauty_payload_enter_map { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __type(key, int); + __type(value, struct beauty_payload_enter); + __uint(max_entries, 1); +} beauty_payload_enter_map SEC(".maps"); + +static inline struct augmented_args_payload *augmented_args_payload(void) +{ + int key = 0; + return bpf_map_lookup_elem(&augmented_args_tmp, &key); +} + +static inline int augmented__output(void *ctx, struct augmented_args_payload *args, int len) +{ + /* If perf_event_output fails, return non-zero so that it gets recorded unaugmented */ + return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, args, len); +} + +static inline int augmented__beauty_output(void *ctx, void *data, int len) +{ + return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, data, len); +} + +static inline +unsigned int augmented_arg__read_str(struct augmented_arg *augmented_arg, const void *arg, unsigned int arg_len) +{ + unsigned int augmented_len = sizeof(*augmented_arg); + int string_len = bpf_probe_read_user_str(&augmented_arg->value, arg_len, arg); + + augmented_arg->size = augmented_arg->err = 0; + /* + * probe_read_str may return < 0, e.g. -EFAULT + * So we leave that in the augmented_arg->size that userspace will + */ + if (string_len > 0) { + augmented_len -= sizeof(augmented_arg->value) - string_len; + _Static_assert(is_power_of_2(sizeof(augmented_arg->value)), "sizeof(augmented_arg->value) needs to be a power of two"); + augmented_len &= sizeof(augmented_arg->value) - 1; + augmented_arg->size = string_len; + } else { + /* + * So that username notice the error while still being able + * to skip this augmented arg record + */ + augmented_arg->err = string_len; + augmented_len = offsetof(struct augmented_arg, value); + } + + return augmented_len; +} + +SEC("tp/raw_syscalls/sys_enter") +int syscall_unaugmented(struct syscall_enter_args *args) +{ + return 1; +} + +/* + * These will be tail_called from SEC("raw_syscalls:sys_enter"), so will find in + * augmented_args_tmp what was read by that raw_syscalls:sys_enter and go + * on from there, reading the first syscall arg as a string, i.e. 
open's + * filename. + */ +SEC("tp/syscalls/sys_enter_connect") +int sys_enter_connect(struct syscall_enter_args *args) +{ + struct augmented_args_payload *augmented_args = augmented_args_payload(); + const void *sockaddr_arg = (const void *)args->args[1]; + unsigned int socklen = args->args[2]; + unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs + + if (augmented_args == NULL) + return 1; /* Failure: don't filter */ + + _Static_assert(is_power_of_2(sizeof(augmented_args->arg.saddr)), "sizeof(augmented_args->arg.saddr) needs to be a power of two"); + socklen &= sizeof(augmented_args->arg.saddr) - 1; + + bpf_probe_read_user(&augmented_args->arg.saddr, socklen, sockaddr_arg); + augmented_args->arg.size = socklen; + augmented_args->arg.err = 0; + + return augmented__output(args, augmented_args, len + socklen); +} + +SEC("tp/syscalls/sys_enter_sendto") +int sys_enter_sendto(struct syscall_enter_args *args) +{ + struct augmented_args_payload *augmented_args = augmented_args_payload(); + const void *sockaddr_arg = (const void *)args->args[4]; + unsigned int socklen = args->args[5]; + unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs + + if (augmented_args == NULL) + return 1; /* Failure: don't filter */ + + socklen &= sizeof(augmented_args->arg.saddr) - 1; + + bpf_probe_read_user(&augmented_args->arg.saddr, socklen, sockaddr_arg); + + return augmented__output(args, augmented_args, len + socklen); +} + +SEC("tp/syscalls/sys_enter_open") +int sys_enter_open(struct syscall_enter_args *args) +{ + struct augmented_args_payload *augmented_args = augmented_args_payload(); + const void *filename_arg = (const void *)args->args[0]; + unsigned int len = sizeof(augmented_args->args); + + if (augmented_args == NULL) + return 1; /* Failure: don't filter */ + + len += augmented_arg__read_str(&augmented_args->arg, filename_arg, sizeof(augmented_args->arg.value)); + + return augmented__output(args, augmented_args, len); +} + +SEC("tp/syscalls/sys_enter_openat") +int sys_enter_openat(struct syscall_enter_args *args) +{ + struct augmented_args_payload *augmented_args = augmented_args_payload(); + const void *filename_arg = (const void *)args->args[1]; + unsigned int len = sizeof(augmented_args->args); + + if (augmented_args == NULL) + return 1; /* Failure: don't filter */ + + len += augmented_arg__read_str(&augmented_args->arg, filename_arg, sizeof(augmented_args->arg.value)); + + return augmented__output(args, augmented_args, len); +} + +SEC("tp/syscalls/sys_enter_rename") +int sys_enter_rename(struct syscall_enter_args *args) +{ + struct augmented_args_payload *augmented_args = augmented_args_payload(); + const void *oldpath_arg = (const void *)args->args[0], + *newpath_arg = (const void *)args->args[1]; + unsigned int len = sizeof(augmented_args->args), oldpath_len, newpath_len; + + if (augmented_args == NULL) + return 1; /* Failure: don't filter */ + + len += 2 * sizeof(u64); // The overhead of size and err, just before the payload... 
+ + oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value)); + augmented_args->arg.size = PERF_ALIGN(oldpath_len + 1, sizeof(u64)); + len += augmented_args->arg.size; + + /* Every read from userspace is limited to value size */ + if (augmented_args->arg.size > sizeof(augmented_args->arg.value)) + return 1; /* Failure: don't filter */ + + struct augmented_arg *arg2 = (void *)&augmented_args->arg.value + augmented_args->arg.size; + + newpath_len = augmented_arg__read_str(arg2, newpath_arg, sizeof(augmented_args->arg.value)); + arg2->size = newpath_len; + + len += newpath_len; + + return augmented__output(args, augmented_args, len); +} + +SEC("tp/syscalls/sys_enter_renameat2") +int sys_enter_renameat2(struct syscall_enter_args *args) +{ + struct augmented_args_payload *augmented_args = augmented_args_payload(); + const void *oldpath_arg = (const void *)args->args[1], + *newpath_arg = (const void *)args->args[3]; + unsigned int len = sizeof(augmented_args->args), oldpath_len, newpath_len; + + if (augmented_args == NULL) + return 1; /* Failure: don't filter */ + + len += 2 * sizeof(u64); // The overhead of size and err, just before the payload... + + oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value)); + augmented_args->arg.size = PERF_ALIGN(oldpath_len + 1, sizeof(u64)); + len += augmented_args->arg.size; + + /* Every read from userspace is limited to value size */ + if (augmented_args->arg.size > sizeof(augmented_args->arg.value)) + return 1; /* Failure: don't filter */ + + struct augmented_arg *arg2 = (void *)&augmented_args->arg.value + augmented_args->arg.size; + + newpath_len = augmented_arg__read_str(arg2, newpath_arg, sizeof(augmented_args->arg.value)); + arg2->size = newpath_len; + + len += newpath_len; + + return augmented__output(args, augmented_args, len); +} + +#define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ + +// we need just the start, get the size to then copy it +struct perf_event_attr_size { + __u32 type; + /* + * Size of the attr structure, for fwd/bwd compat. 
+ */ + __u32 size; +}; + +SEC("tp/syscalls/sys_enter_perf_event_open") +int sys_enter_perf_event_open(struct syscall_enter_args *args) +{ + struct augmented_args_payload *augmented_args = augmented_args_payload(); + const struct perf_event_attr_size *attr = (const struct perf_event_attr_size *)args->args[0], *attr_read; + unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs + + if (augmented_args == NULL) + goto failure; + + if (bpf_probe_read_user(&augmented_args->arg.value, sizeof(*attr), attr) < 0) + goto failure; + + attr_read = (const struct perf_event_attr_size *)augmented_args->arg.value; + + __u32 size = attr_read->size; + + if (!size) + size = PERF_ATTR_SIZE_VER0; + + if (size > sizeof(augmented_args->arg.value)) + goto failure; + + // Now that we read attr->size and tested it against the size limits, read it completely + if (bpf_probe_read_user(&augmented_args->arg.value, size, attr) < 0) + goto failure; + + return augmented__output(args, augmented_args, len + size); +failure: + return 1; /* Failure: don't filter */ +} + +SEC("tp/syscalls/sys_enter_clock_nanosleep") +int sys_enter_clock_nanosleep(struct syscall_enter_args *args) +{ + struct augmented_args_payload *augmented_args = augmented_args_payload(); + const void *rqtp_arg = (const void *)args->args[2]; + unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs + __u32 size = sizeof(struct timespec64); + + if (augmented_args == NULL) + goto failure; + + if (size > sizeof(augmented_args->arg.value)) + goto failure; + + bpf_probe_read_user(&augmented_args->arg.value, size, rqtp_arg); + + return augmented__output(args, augmented_args, len + size); +failure: + return 1; /* Failure: don't filter */ +} + +SEC("tp/syscalls/sys_enter_nanosleep") +int sys_enter_nanosleep(struct syscall_enter_args *args) +{ + struct augmented_args_payload *augmented_args = augmented_args_payload(); + const void *req_arg = (const void *)args->args[0]; + unsigned int len = sizeof(augmented_args->args); + __u32 size = sizeof(struct timespec64); + + if (augmented_args == NULL) + goto failure; + + if (size > sizeof(augmented_args->arg.value)) + goto failure; + + bpf_probe_read_user(&augmented_args->arg.value, size, req_arg); + + return augmented__output(args, augmented_args, len + size); +failure: + return 1; /* Failure: don't filter */ +} + +static pid_t getpid(void) +{ + return bpf_get_current_pid_tgid(); +} + +static bool pid_filter__has(struct pids_filtered *pids, pid_t pid) +{ + return bpf_map_lookup_elem(pids, &pid) != NULL; +} + +static int augment_sys_enter(void *ctx, struct syscall_enter_args *args) +{ + bool augmented, do_output = false; + int zero = 0, index, value_size = sizeof(struct augmented_arg) - offsetof(struct augmented_arg, value); + u64 output = 0; /* has to be u64, otherwise it won't pass the verifier */ + s64 aug_size, size; + unsigned int nr, *beauty_map; + struct beauty_payload_enter *payload; + void *arg, *payload_offset; + + /* fall back to do predefined tail call */ + if (args == NULL) + return 1; + + /* use syscall number to get beauty_map entry */ + nr = (__u32)args->syscall_nr; + beauty_map = bpf_map_lookup_elem(&beauty_map_enter, &nr); + + /* set up payload for output */ + payload = bpf_map_lookup_elem(&beauty_payload_enter_map, &zero); + payload_offset = (void *)&payload->aug_args; + + if (beauty_map == NULL || payload == NULL) + return 1; + + /* copy the sys_enter header, which has the syscall_nr */ + 
__builtin_memcpy(&payload->args, args, sizeof(struct syscall_enter_args)); + + /* + * Determine what type of argument and how many bytes to read from user space, using the + * value in the beauty_map. This is the relation of parameter type and its corresponding + * value in the beauty map, and how many bytes we read eventually: + * + * string: 1 -> size of string + * struct: size of struct -> size of struct + * buffer: -1 * (index of paired len) -> value of paired len (maximum: TRACE_AUG_MAX_BUF) + */ + for (int i = 0; i < 6; i++) { + arg = (void *)args->args[i]; + augmented = false; + size = beauty_map[i]; + aug_size = size; /* size of the augmented data read from user space */ + + if (size == 0 || arg == NULL) + continue; + + if (size == 1) { /* string */ + aug_size = bpf_probe_read_user_str(((struct augmented_arg *)payload_offset)->value, value_size, arg); + /* minimum of 0 to pass the verifier */ + if (aug_size < 0) + aug_size = 0; + + augmented = true; + } else if (size > 0 && size <= value_size) { /* struct */ + if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, size, arg)) + augmented = true; + } else if ((int)size < 0 && size >= -6) { /* buffer */ + index = -(size + 1); + barrier_var(index); // Prevent clang (noticed with v18) from removing the &= 7 trick. + index &= 7; // Satisfy the bounds checking with the verifier in some kernels. + aug_size = args->args[index] > TRACE_AUG_MAX_BUF ? TRACE_AUG_MAX_BUF : args->args[index]; + + if (aug_size > 0) { + if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, aug_size, arg)) + augmented = true; + } + } + + /* Augmented data size is limited to sizeof(augmented_arg->unnamed union with value field) */ + if (aug_size > value_size) + aug_size = value_size; + + /* write data to payload */ + if (augmented) { + int written = offsetof(struct augmented_arg, value) + aug_size; + + if (written < 0 || written > sizeof(struct augmented_arg)) + return 1; + + ((struct augmented_arg *)payload_offset)->size = aug_size; + output += written; + payload_offset += written; + do_output = true; + } + } + + if (!do_output || (sizeof(struct syscall_enter_args) + output) > sizeof(struct beauty_payload_enter)) + return 1; + + return augmented__beauty_output(ctx, payload, sizeof(struct syscall_enter_args) + output); +} + +SEC("tp/raw_syscalls/sys_enter") +int sys_enter(struct syscall_enter_args *args) +{ + struct augmented_args_payload *augmented_args; + /* + * We start len, the amount of data that will be in the perf ring + * buffer, if this is not filtered out by one of pid_filter__has(), + * syscall->enabled, etc, with the non-augmented raw syscall payload, + * i.e. sizeof(augmented_args->args). + * + * We'll add to this as we add augmented syscalls right after that + * initial, non-augmented raw_syscalls:sys_enter payload. + */ + + if (pid_filter__has(&pids_filtered, getpid())) + return 0; + + augmented_args = augmented_args_payload(); + if (augmented_args == NULL) + return 1; + + bpf_probe_read_kernel(&augmented_args->args, sizeof(augmented_args->args), args); + + /* + * Jump to syscall specific augmenter, even if the default one, + * "!raw_syscalls:unaugmented" that will just return 1 to return the + * unaugmented tracepoint payload. 
+ */ + if (augment_sys_enter(args, &augmented_args->args)) + bpf_tail_call(args, &syscalls_sys_enter, augmented_args->args.syscall_nr); + + // If not found on the PROG_ARRAY syscalls map, then we're filtering it: + return 0; +} + +SEC("tp/raw_syscalls/sys_exit") +int sys_exit(struct syscall_exit_args *args) +{ + struct syscall_exit_args exit_args; + + if (pid_filter__has(&pids_filtered, getpid())) + return 0; + + bpf_probe_read_kernel(&exit_args, sizeof(exit_args), args); + /* + * Jump to syscall specific return augmenter, even if the default one, + * "!raw_syscalls:unaugmented" that will just return 1 to return the + * unaugmented tracepoint payload. + */ + bpf_tail_call(args, &syscalls_sys_exit, exit_args.syscall_nr); + /* + * If not found on the PROG_ARRAY syscalls map, then we're filtering it: + */ + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/perf/util/bpf_skel/bench_uprobe.bpf.c b/tools/perf/util/bpf_skel/bench_uprobe.bpf.c new file mode 100644 index 000000000000..a01c7f791fcd --- /dev/null +++ b/tools/perf/util/bpf_skel/bench_uprobe.bpf.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (c) 2023 Red Hat +#include "vmlinux.h" +#include <bpf/bpf_tracing.h> + +unsigned int nr_uprobes; +unsigned int nr_uretprobes; + +SEC("uprobe") +int BPF_UPROBE(empty) +{ + return 0; +} + +SEC("uprobe") +int BPF_UPROBE(trace_printk) +{ + char fmt[] = "perf bench uprobe %u"; + + bpf_trace_printk(fmt, sizeof(fmt), ++nr_uprobes); + return 0; +} + +SEC("uretprobe") +int BPF_URETPROBE(empty_ret) +{ + return 0; +} + +SEC("uretprobe") +int BPF_URETPROBE(trace_printk_ret) +{ + char fmt[] = "perf bench uretprobe %u"; + + bpf_trace_printk(fmt, sizeof(fmt), ++nr_uretprobes); + return 0; +} + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; diff --git a/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c new file mode 100644 index 000000000000..57cab7647a9a --- /dev/null +++ b/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c @@ -0,0 +1,227 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (c) 2021 Facebook +// Copyright (c) 2021 Google +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> + +#define MAX_LEVELS 10 // max cgroup hierarchy level: arbitrary +#define MAX_EVENTS 32 // max events per cgroup: arbitrary + +// NOTE: many of map and global data will be modified before loading +// from the userspace (perf tool) using the skeleton helpers. 
+ +// single set of global perf events to measure +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(int)); + __uint(max_entries, 1); +} events SEC(".maps"); + +// from cgroup id to event index +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u64)); + __uint(value_size, sizeof(__u32)); + __uint(max_entries, 1); +} cgrp_idx SEC(".maps"); + +// per-cpu event snapshots to calculate delta +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct bpf_perf_event_value)); +} prev_readings SEC(".maps"); + +// aggregated event values for each cgroup (per-cpu) +// will be read from the user-space +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct bpf_perf_event_value)); +} cgrp_readings SEC(".maps"); + +/* new kernel cgroup definition */ +struct cgroup___new { + int level; + struct cgroup *ancestors[]; +} __attribute__((preserve_access_index)); + +/* old kernel cgroup definition */ +struct cgroup___old { + int level; + u64 ancestor_ids[]; +} __attribute__((preserve_access_index)); + +const volatile __u32 num_events = 1; +const volatile __u32 num_cpus = 1; +const volatile int use_cgroup_v2 = 0; + +int enabled = 0; +int perf_subsys_id = -1; + +static inline __u64 get_cgroup_v1_ancestor_id(struct cgroup *cgrp, int level) +{ + /* recast pointer to capture new type for compiler */ + struct cgroup___new *cgrp_new = (void *)cgrp; + + if (bpf_core_field_exists(cgrp_new->ancestors)) { + return BPF_CORE_READ(cgrp_new, ancestors[level], kn, id); + } else { + /* recast pointer to capture old type for compiler */ + struct cgroup___old *cgrp_old = (void *)cgrp; + + return BPF_CORE_READ(cgrp_old, ancestor_ids[level]); + } +} + +static inline int get_cgroup_v1_idx(__u32 *cgrps, int size) +{ + struct task_struct *p = (void *)bpf_get_current_task(); + struct cgroup *cgrp; + register int i = 0; + __u32 *elem; + int level; + int cnt; + + if (perf_subsys_id == -1) { +#if __has_builtin(__builtin_preserve_enum_value) + perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id, + perf_event_cgrp_id); +#else + perf_subsys_id = perf_event_cgrp_id; +#endif + } + cgrp = BPF_CORE_READ(p, cgroups, subsys[perf_subsys_id], cgroup); + level = BPF_CORE_READ(cgrp, level); + + for (cnt = 0; i < MAX_LEVELS; i++) { + __u64 cgrp_id; + + if (i > level) + break; + + // convert cgroup-id to a map index + cgrp_id = get_cgroup_v1_ancestor_id(cgrp, i); + elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id); + if (!elem) + continue; + + cgrps[cnt++] = *elem; + if (cnt == size) + break; + } + + return cnt; +} + +static inline int get_cgroup_v2_idx(__u32 *cgrps, int size) +{ + register int i = 0; + __u32 *elem; + int cnt; + + for (cnt = 0; i < MAX_LEVELS; i++) { + __u64 cgrp_id = bpf_get_current_ancestor_cgroup_id(i); + + if (cgrp_id == 0) + break; + + // convert cgroup-id to a map index + elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id); + if (!elem) + continue; + + cgrps[cnt++] = *elem; + if (cnt == size) + break; + } + + return cnt; +} + +static int bperf_cgroup_count(void) +{ + register __u32 idx = 0; // to have it in a register to pass BPF verifier + register int c = 0; + struct bpf_perf_event_value val, delta, *prev_val, *cgrp_val; + __u32 cpu = bpf_get_smp_processor_id(); + __u32 cgrp_idx[MAX_LEVELS]; + int cgrp_cnt; + __u32 key, cgrp; + long err; + + if (use_cgroup_v2) + cgrp_cnt = get_cgroup_v2_idx(cgrp_idx, 
MAX_LEVELS); + else + cgrp_cnt = get_cgroup_v1_idx(cgrp_idx, MAX_LEVELS); + + for ( ; idx < MAX_EVENTS; idx++) { + if (idx == num_events) + break; + + // XXX: do not pass idx directly (for verifier) + key = idx; + // this is per-cpu array for diff + prev_val = bpf_map_lookup_elem(&prev_readings, &key); + if (!prev_val) { + val.counter = val.enabled = val.running = 0; + bpf_map_update_elem(&prev_readings, &key, &val, BPF_ANY); + + prev_val = bpf_map_lookup_elem(&prev_readings, &key); + if (!prev_val) + continue; + } + + // read from global perf_event array + key = idx * num_cpus + cpu; + err = bpf_perf_event_read_value(&events, key, &val, sizeof(val)); + if (err) + continue; + + if (enabled) { + delta.counter = val.counter - prev_val->counter; + delta.enabled = val.enabled - prev_val->enabled; + delta.running = val.running - prev_val->running; + + for (c = 0; c < MAX_LEVELS; c++) { + if (c == cgrp_cnt) + break; + + cgrp = cgrp_idx[c]; + + // aggregate the result by cgroup + key = cgrp * num_events + idx; + cgrp_val = bpf_map_lookup_elem(&cgrp_readings, &key); + if (cgrp_val) { + cgrp_val->counter += delta.counter; + cgrp_val->enabled += delta.enabled; + cgrp_val->running += delta.running; + } else { + bpf_map_update_elem(&cgrp_readings, &key, + &delta, BPF_ANY); + } + } + } + + *prev_val = val; + } + return 0; +} + +// This will be attached to cgroup-switches event for each cpu +SEC("perf_event") +int BPF_PROG(on_cgrp_switch) +{ + return bperf_cgroup_count(); +} + +SEC("raw_tp/sched_switch") +int BPF_PROG(trigger_read) +{ + return bperf_cgroup_count(); +} + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; diff --git a/tools/perf/util/bpf_skel/bperf_follower.bpf.c b/tools/perf/util/bpf_skel/bperf_follower.bpf.c new file mode 100644 index 000000000000..0595063139a3 --- /dev/null +++ b/tools/perf/util/bpf_skel/bperf_follower.bpf.c @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (c) 2021 Facebook +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include "bperf_u.h" + +#define MAX_ENTRIES 102400 + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct bpf_perf_event_value)); + __uint(max_entries, 1); +} diff_readings SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct bpf_perf_event_value)); + __uint(max_entries, 1); +} accum_readings SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct bperf_filter_value)); + __uint(max_entries, MAX_ENTRIES); + __uint(map_flags, BPF_F_NO_PREALLOC); +} filter SEC(".maps"); + +enum bperf_filter_type type = 0; +int enabled = 0; +int inherit; + +SEC("fexit/XXX") +int BPF_PROG(fexit_XXX) +{ + struct bpf_perf_event_value *diff_val, *accum_val; + __u32 filter_key, zero = 0; + __u32 accum_key; + struct bperf_filter_value *fval; + + if (!enabled) + return 0; + + switch (type) { + case BPERF_FILTER_GLOBAL: + accum_key = zero; + goto do_add; + case BPERF_FILTER_CPU: + filter_key = bpf_get_smp_processor_id(); + break; + case BPERF_FILTER_PID: + filter_key = bpf_get_current_pid_tgid() & 0xffffffff; + break; + case BPERF_FILTER_TGID: + /* Use pid as the filter_key to exclude new task counts + * when inherit is disabled. 
Don't worry about the existing + * children in TGID losing their counts, bpf_counter has + * already added them to the filter map via perf_thread_map + * before this bpf prog runs. + */ + filter_key = inherit ? + bpf_get_current_pid_tgid() >> 32 : + bpf_get_current_pid_tgid() & 0xffffffff; + break; + default: + return 0; + } + + fval = bpf_map_lookup_elem(&filter, &filter_key); + if (!fval) + return 0; + + accum_key = fval->accum_key; + if (fval->exited) + bpf_map_delete_elem(&filter, &filter_key); + +do_add: + diff_val = bpf_map_lookup_elem(&diff_readings, &zero); + if (!diff_val) + return 0; + + accum_val = bpf_map_lookup_elem(&accum_readings, &accum_key); + if (!accum_val) + return 0; + + accum_val->counter += diff_val->counter; + accum_val->enabled += diff_val->enabled; + accum_val->running += diff_val->running; + + return 0; +} + +/* The program is only used for PID or TGID filter types. */ +SEC("tp_btf/task_newtask") +int BPF_PROG(on_newtask, struct task_struct *task, __u64 clone_flags) +{ + __u32 parent_key, child_key; + struct bperf_filter_value *parent_fval; + struct bperf_filter_value child_fval = { 0 }; + + if (!enabled) + return 0; + + switch (type) { + case BPERF_FILTER_PID: + parent_key = bpf_get_current_pid_tgid() & 0xffffffff; + child_key = task->pid; + break; + case BPERF_FILTER_TGID: + parent_key = bpf_get_current_pid_tgid() >> 32; + child_key = task->tgid; + if (child_key == parent_key) + return 0; + break; + default: + return 0; + } + + /* Check if the current task is one of the target tasks to be counted */ + parent_fval = bpf_map_lookup_elem(&filter, &parent_key); + if (!parent_fval) + return 0; + + /* Start counting for the new task by adding it into filter map, + * inherit the accum key of its parent task so that they can be + * counted together. + */ + child_fval.accum_key = parent_fval->accum_key; + child_fval.exited = 0; + bpf_map_update_elem(&filter, &child_key, &child_fval, BPF_NOEXIST); + + return 0; +} + +/* The program is only used for PID or TGID filter types. */ +SEC("tp_btf/sched_process_exit") +int BPF_PROG(on_exittask, struct task_struct *task) +{ + __u32 pid; + struct bperf_filter_value *fval; + + if (!enabled) + return 0; + + /* Stop counting for this task by removing it from filter map. + * For TGID type, if the pid can be found in the map, it means that + * this pid belongs to the leader task. After the task exits, the + * tgid of its child tasks (if any) will be 1, so the pid can be + * safely removed. 
+ */ + pid = task->pid; + fval = bpf_map_lookup_elem(&filter, &pid); + if (fval) + fval->exited = 1; + + return 0; +} + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; diff --git a/tools/perf/util/bpf_skel/bperf_leader.bpf.c b/tools/perf/util/bpf_skel/bperf_leader.bpf.c new file mode 100644 index 000000000000..e2a2d4cd7779 --- /dev/null +++ b/tools/perf/util/bpf_skel/bperf_leader.bpf.c @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (c) 2021 Facebook +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(int)); + __uint(map_flags, BPF_F_PRESERVE_ELEMS); +} events SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct bpf_perf_event_value)); + __uint(max_entries, 1); +} prev_readings SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct bpf_perf_event_value)); + __uint(max_entries, 1); +} diff_readings SEC(".maps"); + +SEC("raw_tp/sched_switch") +int BPF_PROG(on_switch) +{ + struct bpf_perf_event_value val, *prev_val, *diff_val; + __u32 key = bpf_get_smp_processor_id(); + __u32 zero = 0; + long err; + + prev_val = bpf_map_lookup_elem(&prev_readings, &zero); + if (!prev_val) + return 0; + + diff_val = bpf_map_lookup_elem(&diff_readings, &zero); + if (!diff_val) + return 0; + + err = bpf_perf_event_read_value(&events, key, &val, sizeof(val)); + if (err) + return 0; + + diff_val->counter = val.counter - prev_val->counter; + diff_val->enabled = val.enabled - prev_val->enabled; + diff_val->running = val.running - prev_val->running; + *prev_val = val; + return 0; +} + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; diff --git a/tools/perf/util/bpf_skel/bperf_u.h b/tools/perf/util/bpf_skel/bperf_u.h new file mode 100644 index 000000000000..4a4a753980be --- /dev/null +++ b/tools/perf/util/bpf_skel/bperf_u.h @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (c) 2021 Facebook + +#ifndef __BPERF_STAT_U_H +#define __BPERF_STAT_U_H + +enum bperf_filter_type { + BPERF_FILTER_GLOBAL = 1, + BPERF_FILTER_CPU, + BPERF_FILTER_PID, + BPERF_FILTER_TGID, +}; + +struct bperf_filter_value { + __u32 accum_key; + __u8 exited; +}; + +#endif /* __BPERF_STAT_U_H */ diff --git a/tools/perf/util/bpf_skel/bpf_prog_profiler.bpf.c b/tools/perf/util/bpf_skel/bpf_prog_profiler.bpf.c new file mode 100644 index 000000000000..97037d3b3d9f --- /dev/null +++ b/tools/perf/util/bpf_skel/bpf_prog_profiler.bpf.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (c) 2020 Facebook +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +/* map of perf event fds, num_cpu * num_metric entries */ +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(int)); +} events SEC(".maps"); + +/* readings at fentry */ +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct bpf_perf_event_value)); + __uint(max_entries, 1); +} fentry_readings SEC(".maps"); + +/* accumulated readings */ +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct bpf_perf_event_value)); + __uint(max_entries, 1); +} 
accum_readings SEC(".maps"); + +const volatile __u32 num_cpu = 1; + +SEC("fentry/XXX") +int BPF_PROG(fentry_XXX) +{ + __u32 key = bpf_get_smp_processor_id(); + struct bpf_perf_event_value *ptr; + __u32 zero = 0; + long err; + + /* look up before reading, to reduce error */ + ptr = bpf_map_lookup_elem(&fentry_readings, &zero); + if (!ptr) + return 0; + + err = bpf_perf_event_read_value(&events, key, ptr, sizeof(*ptr)); + if (err) + return 0; + + return 0; +} + +static inline void +fexit_update_maps(struct bpf_perf_event_value *after) +{ + struct bpf_perf_event_value *before, diff; + __u32 zero = 0; + + before = bpf_map_lookup_elem(&fentry_readings, &zero); + /* only account samples with a valid fentry_reading */ + if (before && before->counter) { + struct bpf_perf_event_value *accum; + + diff.counter = after->counter - before->counter; + diff.enabled = after->enabled - before->enabled; + diff.running = after->running - before->running; + + accum = bpf_map_lookup_elem(&accum_readings, &zero); + if (accum) { + accum->counter += diff.counter; + accum->enabled += diff.enabled; + accum->running += diff.running; + } + } +} + +SEC("fexit/XXX") +int BPF_PROG(fexit_XXX) +{ + struct bpf_perf_event_value reading; + __u32 cpu = bpf_get_smp_processor_id(); + int err; + + /* read all events before updating the maps, to reduce error */ + err = bpf_perf_event_read_value(&events, cpu, &reading, sizeof(reading)); + if (err) + return 0; + + fexit_update_maps(&reading); + return 0; +} + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; diff --git a/tools/perf/util/bpf_skel/func_latency.bpf.c b/tools/perf/util/bpf_skel/func_latency.bpf.c new file mode 100644 index 000000000000..e731a79a753a --- /dev/null +++ b/tools/perf/util/bpf_skel/func_latency.bpf.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (c) 2021 Google +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +// This should be in sync with "util/ftrace.h" +#define NUM_BUCKET 22 + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u64)); + __uint(value_size, sizeof(__u64)); + __uint(max_entries, 10000); +} functime SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u8)); + __uint(max_entries, 1); +} cpu_filter SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u8)); + __uint(max_entries, 1); +} task_filter SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u64)); + __uint(max_entries, NUM_BUCKET); +} latency SEC(".maps"); + + +int enabled = 0; + +// stats +__s64 total; +__s64 count; +__s64 max; +__s64 min; + +const volatile int has_cpu = 0; +const volatile int has_task = 0; +const volatile int use_nsec = 0; +const volatile unsigned int bucket_range; +const volatile unsigned int min_latency; +const volatile unsigned int max_latency; +const volatile unsigned int bucket_num = NUM_BUCKET; + +SEC("kprobe/func") +int BPF_PROG(func_begin) +{ + __u64 key, now; + + if (!enabled) + return 0; + + key = bpf_get_current_pid_tgid(); + + if (has_cpu) { + __u32 cpu = bpf_get_smp_processor_id(); + __u8 *ok; + + ok = bpf_map_lookup_elem(&cpu_filter, &cpu); + if (!ok) + return 0; + } + + if (has_task) { + __u32 pid = key & 0xffffffff; + __u8 *ok; + + ok = bpf_map_lookup_elem(&task_filter, &pid); + if (!ok) + return 0; + } + + now = 
bpf_ktime_get_ns(); + + // overwrite timestamp for nested functions + bpf_map_update_elem(&functime, &key, &now, BPF_ANY); + return 0; +} + +SEC("kretprobe/func") +int BPF_PROG(func_end) +{ + __u64 tid; + __u64 *start; + __u64 cmp_base = use_nsec ? 1 : 1000; + + if (!enabled) + return 0; + + tid = bpf_get_current_pid_tgid(); + + start = bpf_map_lookup_elem(&functime, &tid); + if (start) { + __s64 delta = bpf_ktime_get_ns() - *start; + __u64 val = delta; + __u32 key = 0; + __u64 *hist; + + bpf_map_delete_elem(&functime, &tid); + + if (delta < 0) + return 0; + + if (bucket_range != 0) { + val = delta / cmp_base; + + if (min_latency > 0) { + if (val > min_latency) + val -= min_latency; + else + goto do_lookup; + } + + // Less than 1 unit (ms or ns), or, in the future, + // than the min latency desired. + if (val > 0) { // 1st entry: [ 1 unit .. bucket_range units ) + key = val / bucket_range + 1; + if (key >= bucket_num) + key = bucket_num - 1; + } + + goto do_lookup; + } + // calculate index using delta + for (key = 0; key < (bucket_num - 1); key++) { + if (delta < (cmp_base << key)) + break; + } + +do_lookup: + hist = bpf_map_lookup_elem(&latency, &key); + if (!hist) + return 0; + + __sync_fetch_and_add(hist, 1); + + __sync_fetch_and_add(&total, delta); // always in nsec + __sync_fetch_and_add(&count, 1); + + if (delta > max) + max = delta; + if (delta < min) + min = delta; + } + + return 0; +} diff --git a/tools/perf/util/bpf_skel/kwork_top.bpf.c b/tools/perf/util/bpf_skel/kwork_top.bpf.c new file mode 100644 index 000000000000..73e32e063030 --- /dev/null +++ b/tools/perf/util/bpf_skel/kwork_top.bpf.c @@ -0,0 +1,340 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (c) 2022, Huawei + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> + +/* + * This should be in sync with "util/kwork.h" + */ +enum kwork_class_type { + KWORK_CLASS_IRQ, + KWORK_CLASS_SOFTIRQ, + KWORK_CLASS_WORKQUEUE, + KWORK_CLASS_SCHED, + KWORK_CLASS_MAX, +}; + +#define MAX_ENTRIES 102400 +#ifndef MAX_NR_CPUS +#define MAX_NR_CPUS 4096 +#endif +#define PF_KTHREAD 0x00200000 +#define MAX_COMMAND_LEN 16 + +struct time_data { + __u64 timestamp; +}; + +struct work_data { + __u64 runtime; +}; + +struct task_data { + __u32 tgid; + __u32 is_kthread; + char comm[MAX_COMMAND_LEN]; +}; + +struct work_key { + __u32 type; + __u32 pid; + __u64 task_p; +}; + +struct task_key { + __u32 pid; + __u32 cpu; +}; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct time_data); +} kwork_top_task_time SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(key_size, sizeof(struct work_key)); + __uint(value_size, sizeof(struct time_data)); + __uint(max_entries, MAX_ENTRIES); +} kwork_top_irq_time SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(struct task_key)); + __uint(value_size, sizeof(struct task_data)); + __uint(max_entries, MAX_ENTRIES); +} kwork_top_tasks SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(key_size, sizeof(struct work_key)); + __uint(value_size, sizeof(struct work_data)); + __uint(max_entries, MAX_ENTRIES); +} kwork_top_works SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(u32)); + __uint(value_size, sizeof(u8)); + __uint(max_entries, MAX_NR_CPUS); +} kwork_top_cpu_filter SEC(".maps"); + +int enabled = 0; + +const volatile int 
has_cpu_filter = 0; + +__u64 from_timestamp = 0; +__u64 to_timestamp = 0; + +static __always_inline int cpu_is_filtered(__u32 cpu) +{ + __u8 *cpu_val; + + if (has_cpu_filter) { + cpu_val = bpf_map_lookup_elem(&kwork_top_cpu_filter, &cpu); + if (!cpu_val) + return 1; + } + + return 0; +} + +static __always_inline void update_task_info(struct task_struct *task, __u32 cpu) +{ + struct task_key key = { + .pid = task->pid, + .cpu = cpu, + }; + + if (!bpf_map_lookup_elem(&kwork_top_tasks, &key)) { + struct task_data data = { + .tgid = task->tgid, + .is_kthread = task->flags & PF_KTHREAD ? 1 : 0, + }; + BPF_CORE_READ_STR_INTO(&data.comm, task, comm); + + bpf_map_update_elem(&kwork_top_tasks, &key, &data, BPF_ANY); + } +} + +static __always_inline void update_work(struct work_key *key, __u64 delta) +{ + struct work_data *data; + + data = bpf_map_lookup_elem(&kwork_top_works, key); + if (data) { + data->runtime += delta; + } else { + struct work_data new_data = { + .runtime = delta, + }; + + bpf_map_update_elem(&kwork_top_works, key, &new_data, BPF_ANY); + } +} + +static void on_sched_out(struct task_struct *task, __u64 ts, __u32 cpu) +{ + __u64 delta; + struct time_data *pelem; + + pelem = bpf_task_storage_get(&kwork_top_task_time, task, NULL, 0); + if (pelem) + delta = ts - pelem->timestamp; + else + delta = ts - from_timestamp; + + struct work_key key = { + .type = KWORK_CLASS_SCHED, + .pid = task->pid, + .task_p = (__u64)task, + }; + + update_work(&key, delta); + update_task_info(task, cpu); +} + +static void on_sched_in(struct task_struct *task, __u64 ts) +{ + struct time_data *pelem; + + pelem = bpf_task_storage_get(&kwork_top_task_time, task, NULL, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (pelem) + pelem->timestamp = ts; +} + +SEC("tp_btf/sched_switch") +int on_switch(u64 *ctx) +{ + struct task_struct *prev, *next; + + prev = (struct task_struct *)ctx[1]; + next = (struct task_struct *)ctx[2]; + + if (!enabled) + return 0; + + __u32 cpu = bpf_get_smp_processor_id(); + + if (cpu_is_filtered(cpu)) + return 0; + + __u64 ts = bpf_ktime_get_ns(); + + on_sched_out(prev, ts, cpu); + on_sched_in(next, ts); + + return 0; +} + +SEC("tp_btf/irq_handler_entry") +int on_irq_handler_entry(u64 *cxt) +{ + struct task_struct *task; + + if (!enabled) + return 0; + + __u32 cpu = bpf_get_smp_processor_id(); + + if (cpu_is_filtered(cpu)) + return 0; + + __u64 ts = bpf_ktime_get_ns(); + + task = (struct task_struct *)bpf_get_current_task(); + if (!task) + return 0; + + struct work_key key = { + .type = KWORK_CLASS_IRQ, + .pid = BPF_CORE_READ(task, pid), + .task_p = (__u64)task, + }; + + struct time_data data = { + .timestamp = ts, + }; + + bpf_map_update_elem(&kwork_top_irq_time, &key, &data, BPF_ANY); + + return 0; +} + +SEC("tp_btf/irq_handler_exit") +int on_irq_handler_exit(u64 *cxt) +{ + __u64 delta; + struct task_struct *task; + struct time_data *pelem; + + if (!enabled) + return 0; + + __u32 cpu = bpf_get_smp_processor_id(); + + if (cpu_is_filtered(cpu)) + return 0; + + __u64 ts = bpf_ktime_get_ns(); + + task = (struct task_struct *)bpf_get_current_task(); + if (!task) + return 0; + + struct work_key key = { + .type = KWORK_CLASS_IRQ, + .pid = BPF_CORE_READ(task, pid), + .task_p = (__u64)task, + }; + + pelem = bpf_map_lookup_elem(&kwork_top_irq_time, &key); + if (pelem && pelem->timestamp != 0) + delta = ts - pelem->timestamp; + else + delta = ts - from_timestamp; + + update_work(&key, delta); + + return 0; +} + +SEC("tp_btf/softirq_entry") +int on_softirq_entry(u64 *cxt) +{ + struct task_struct *task; + + 
if (!enabled) + return 0; + + __u32 cpu = bpf_get_smp_processor_id(); + + if (cpu_is_filtered(cpu)) + return 0; + + __u64 ts = bpf_ktime_get_ns(); + + task = (struct task_struct *)bpf_get_current_task(); + if (!task) + return 0; + + struct work_key key = { + .type = KWORK_CLASS_SOFTIRQ, + .pid = BPF_CORE_READ(task, pid), + .task_p = (__u64)task, + }; + + struct time_data data = { + .timestamp = ts, + }; + + bpf_map_update_elem(&kwork_top_irq_time, &key, &data, BPF_ANY); + + return 0; +} + +SEC("tp_btf/softirq_exit") +int on_softirq_exit(u64 *cxt) +{ + __u64 delta; + struct task_struct *task; + struct time_data *pelem; + + if (!enabled) + return 0; + + __u32 cpu = bpf_get_smp_processor_id(); + + if (cpu_is_filtered(cpu)) + return 0; + + __u64 ts = bpf_ktime_get_ns(); + + task = (struct task_struct *)bpf_get_current_task(); + if (!task) + return 0; + + struct work_key key = { + .type = KWORK_CLASS_SOFTIRQ, + .pid = BPF_CORE_READ(task, pid), + .task_p = (__u64)task, + }; + + pelem = bpf_map_lookup_elem(&kwork_top_irq_time, &key); + if (pelem) + delta = ts - pelem->timestamp; + else + delta = ts - from_timestamp; + + update_work(&key, delta); + + return 0; +} + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; diff --git a/tools/perf/util/bpf_skel/kwork_trace.bpf.c b/tools/perf/util/bpf_skel/kwork_trace.bpf.c new file mode 100644 index 000000000000..9ce9c8dddc4b --- /dev/null +++ b/tools/perf/util/bpf_skel/kwork_trace.bpf.c @@ -0,0 +1,384 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (c) 2022, Huawei + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +#define KWORK_COUNT 100 +#define MAX_KWORKNAME 128 + +/* + * This should be in sync with "util/kwork.h" + */ +enum kwork_class_type { + KWORK_CLASS_IRQ, + KWORK_CLASS_SOFTIRQ, + KWORK_CLASS_WORKQUEUE, + KWORK_CLASS_MAX, +}; + +struct work_key { + __u32 type; + __u32 cpu; + __u64 id; +}; + +struct report_data { + __u64 nr; + __u64 total_time; + __u64 max_time; + __u64 max_time_start; + __u64 max_time_end; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(struct work_key)); + __uint(value_size, MAX_KWORKNAME); + __uint(max_entries, KWORK_COUNT); +} perf_kwork_names SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(struct work_key)); + __uint(value_size, sizeof(__u64)); + __uint(max_entries, KWORK_COUNT); +} perf_kwork_time SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(struct work_key)); + __uint(value_size, sizeof(struct report_data)); + __uint(max_entries, KWORK_COUNT); +} perf_kwork_report SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u8)); + __uint(max_entries, 1); +} perf_kwork_cpu_filter SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, MAX_KWORKNAME); + __uint(max_entries, 1); +} perf_kwork_name_filter SEC(".maps"); + +int enabled = 0; + +const volatile int has_cpu_filter = 0; +const volatile int has_name_filter = 0; + +static __always_inline int local_strncmp(const char *s1, + unsigned int sz, const char *s2) +{ + int ret = 0; + unsigned int i; + + for (i = 0; i < sz; i++) { + ret = (unsigned char)s1[i] - (unsigned char)s2[i]; + if (ret || !s1[i]) + break; + } + + return ret; +} + +static __always_inline int trace_event_match(struct work_key *key, char *name) +{ + __u8 *cpu_val; + char *name_val; + __u32 zero = 0; + __u32 cpu = 
bpf_get_smp_processor_id(); + + if (!enabled) + return 0; + + if (has_cpu_filter) { + cpu_val = bpf_map_lookup_elem(&perf_kwork_cpu_filter, &cpu); + if (!cpu_val) + return 0; + } + + if (has_name_filter && (name != NULL)) { + name_val = bpf_map_lookup_elem(&perf_kwork_name_filter, &zero); + if (name_val && + (local_strncmp(name_val, MAX_KWORKNAME, name) != 0)) { + return 0; + } + } + + return 1; +} + +static __always_inline void do_update_time(void *map, struct work_key *key, + __u64 time_start, __u64 time_end) +{ + struct report_data zero, *data; + __s64 delta = time_end - time_start; + + if (delta < 0) + return; + + data = bpf_map_lookup_elem(map, key); + if (!data) { + __builtin_memset(&zero, 0, sizeof(zero)); + bpf_map_update_elem(map, key, &zero, BPF_NOEXIST); + data = bpf_map_lookup_elem(map, key); + if (!data) + return; + } + + if ((delta > data->max_time) || + (data->max_time == 0)) { + data->max_time = delta; + data->max_time_start = time_start; + data->max_time_end = time_end; + } + + data->total_time += delta; + data->nr++; +} + +static __always_inline void do_update_timestart(void *map, struct work_key *key) +{ + __u64 ts = bpf_ktime_get_ns(); + + bpf_map_update_elem(map, key, &ts, BPF_ANY); +} + +static __always_inline void do_update_timeend(void *report_map, void *time_map, + struct work_key *key) +{ + __u64 *time = bpf_map_lookup_elem(time_map, key); + + if (time) { + bpf_map_delete_elem(time_map, key); + do_update_time(report_map, key, *time, bpf_ktime_get_ns()); + } +} + +static __always_inline void do_update_name(void *map, + struct work_key *key, char *name) +{ + if (!bpf_map_lookup_elem(map, key)) + bpf_map_update_elem(map, key, name, BPF_ANY); +} + +static __always_inline int update_timestart(void *map, struct work_key *key) +{ + if (!trace_event_match(key, NULL)) + return 0; + + do_update_timestart(map, key); + return 0; +} + +static __always_inline int update_timestart_and_name(void *time_map, + void *names_map, + struct work_key *key, + char *name) +{ + if (!trace_event_match(key, name)) + return 0; + + do_update_timestart(time_map, key); + do_update_name(names_map, key, name); + + return 0; +} + +static __always_inline int update_timeend(void *report_map, + void *time_map, struct work_key *key) +{ + if (!trace_event_match(key, NULL)) + return 0; + + do_update_timeend(report_map, time_map, key); + + return 0; +} + +static __always_inline int update_timeend_and_name(void *report_map, + void *time_map, + void *names_map, + struct work_key *key, + char *name) +{ + if (!trace_event_match(key, name)) + return 0; + + do_update_timeend(report_map, time_map, key); + do_update_name(names_map, key, name); + + return 0; +} + +SEC("tracepoint/irq/irq_handler_entry") +int report_irq_handler_entry(struct trace_event_raw_irq_handler_entry *ctx) +{ + char name[MAX_KWORKNAME]; + struct work_key key = { + .type = KWORK_CLASS_IRQ, + .cpu = bpf_get_smp_processor_id(), + .id = (__u64)ctx->irq, + }; + void *name_addr = (void *)ctx + (ctx->__data_loc_name & 0xffff); + + bpf_probe_read_kernel_str(name, sizeof(name), name_addr); + + return update_timestart_and_name(&perf_kwork_time, + &perf_kwork_names, &key, name); +} + +SEC("tracepoint/irq/irq_handler_exit") +int report_irq_handler_exit(struct trace_event_raw_irq_handler_exit *ctx) +{ + struct work_key key = { + .type = KWORK_CLASS_IRQ, + .cpu = bpf_get_smp_processor_id(), + .id = (__u64)ctx->irq, + }; + + return update_timeend(&perf_kwork_report, &perf_kwork_time, &key); +} + +static char softirq_name_list[NR_SOFTIRQS][MAX_KWORKNAME] = { 
+ { "HI" }, + { "TIMER" }, + { "NET_TX" }, + { "NET_RX" }, + { "BLOCK" }, + { "IRQ_POLL" }, + { "TASKLET" }, + { "SCHED" }, + { "HRTIMER" }, + { "RCU" }, +}; + +SEC("tracepoint/irq/softirq_entry") +int report_softirq_entry(struct trace_event_raw_softirq *ctx) +{ + unsigned int vec = ctx->vec; + struct work_key key = { + .type = KWORK_CLASS_SOFTIRQ, + .cpu = bpf_get_smp_processor_id(), + .id = (__u64)vec, + }; + + if (vec < NR_SOFTIRQS) { + return update_timestart_and_name(&perf_kwork_time, + &perf_kwork_names, &key, + softirq_name_list[vec]); + } + + return 0; +} + +SEC("tracepoint/irq/softirq_exit") +int report_softirq_exit(struct trace_event_raw_softirq *ctx) +{ + struct work_key key = { + .type = KWORK_CLASS_SOFTIRQ, + .cpu = bpf_get_smp_processor_id(), + .id = (__u64)ctx->vec, + }; + + return update_timeend(&perf_kwork_report, &perf_kwork_time, &key); +} + +SEC("tracepoint/irq/softirq_raise") +int latency_softirq_raise(struct trace_event_raw_softirq *ctx) +{ + unsigned int vec = ctx->vec; + struct work_key key = { + .type = KWORK_CLASS_SOFTIRQ, + .cpu = bpf_get_smp_processor_id(), + .id = (__u64)vec, + }; + + if (vec < NR_SOFTIRQS) { + return update_timestart_and_name(&perf_kwork_time, + &perf_kwork_names, &key, + softirq_name_list[vec]); + } + + return 0; +} + +SEC("tracepoint/irq/softirq_entry") +int latency_softirq_entry(struct trace_event_raw_softirq *ctx) +{ + struct work_key key = { + .type = KWORK_CLASS_SOFTIRQ, + .cpu = bpf_get_smp_processor_id(), + .id = (__u64)ctx->vec, + }; + + return update_timeend(&perf_kwork_report, &perf_kwork_time, &key); +} + +SEC("tracepoint/workqueue/workqueue_execute_start") +int report_workqueue_execute_start(struct trace_event_raw_workqueue_execute_start *ctx) +{ + struct work_key key = { + .type = KWORK_CLASS_WORKQUEUE, + .cpu = bpf_get_smp_processor_id(), + .id = (__u64)ctx->work, + }; + + return update_timestart(&perf_kwork_time, &key); +} + +SEC("tracepoint/workqueue/workqueue_execute_end") +int report_workqueue_execute_end(struct trace_event_raw_workqueue_execute_end *ctx) +{ + char name[MAX_KWORKNAME]; + struct work_key key = { + .type = KWORK_CLASS_WORKQUEUE, + .cpu = bpf_get_smp_processor_id(), + .id = (__u64)ctx->work, + }; + unsigned long long func_addr = (unsigned long long)ctx->function; + + __builtin_memset(name, 0, sizeof(name)); + bpf_snprintf(name, sizeof(name), "%ps", &func_addr, sizeof(func_addr)); + + return update_timeend_and_name(&perf_kwork_report, &perf_kwork_time, + &perf_kwork_names, &key, name); +} + +SEC("tracepoint/workqueue/workqueue_activate_work") +int latency_workqueue_activate_work(struct trace_event_raw_workqueue_activate_work *ctx) +{ + struct work_key key = { + .type = KWORK_CLASS_WORKQUEUE, + .cpu = bpf_get_smp_processor_id(), + .id = (__u64)ctx->work, + }; + + return update_timestart(&perf_kwork_time, &key); +} + +SEC("tracepoint/workqueue/workqueue_execute_start") +int latency_workqueue_execute_start(struct trace_event_raw_workqueue_execute_start *ctx) +{ + char name[MAX_KWORKNAME]; + struct work_key key = { + .type = KWORK_CLASS_WORKQUEUE, + .cpu = bpf_get_smp_processor_id(), + .id = (__u64)ctx->work, + }; + unsigned long long func_addr = (unsigned long long)ctx->function; + + __builtin_memset(name, 0, sizeof(name)); + bpf_snprintf(name, sizeof(name), "%ps", &func_addr, sizeof(func_addr)); + + return update_timeend_and_name(&perf_kwork_report, &perf_kwork_time, + &perf_kwork_names, &key, name); +} + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c 
b/tools/perf/util/bpf_skel/lock_contention.bpf.c new file mode 100644 index 000000000000..96e7d853b9ed --- /dev/null +++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c @@ -0,0 +1,989 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (c) 2022 Google +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> +#include <asm-generic/errno-base.h> + +#include "lock_data.h" + +/* for collect_lock_syms(). 4096 was rejected by the verifier */ +#define MAX_CPUS 1024 + +/* for collect_zone_lock(). It should be more than the actual zones. */ +#define MAX_ZONES 10 + +/* for do_lock_delay(). Arbitrarily set to 1 million. */ +#define MAX_LOOP (1U << 20) + +/* lock contention flags from include/trace/events/lock.h */ +#define LCB_F_SPIN (1U << 0) +#define LCB_F_READ (1U << 1) +#define LCB_F_WRITE (1U << 2) +#define LCB_F_RT (1U << 3) +#define LCB_F_PERCPU (1U << 4) +#define LCB_F_MUTEX (1U << 5) + +/* callstack storage */ +struct { + __uint(type, BPF_MAP_TYPE_STACK_TRACE); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u64)); + __uint(max_entries, MAX_ENTRIES); +} stacks SEC(".maps"); + +/* buffer for owner stacktrace */ +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u64)); + __uint(max_entries, 1); +} stack_buf SEC(".maps"); + +/* a map for tracing owner stacktrace to owner stack id */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u64)); // owner stacktrace + __uint(value_size, sizeof(__s32)); // owner stack id + __uint(max_entries, 1); +} owner_stacks SEC(".maps"); + +/* a map for tracing lock address to owner data */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u64)); // lock address + __uint(value_size, sizeof(struct owner_tracing_data)); + __uint(max_entries, 1); +} owner_data SEC(".maps"); + +/* a map for contention_key (stores owner stack id) to contention data */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(struct contention_key)); + __uint(value_size, sizeof(struct contention_data)); + __uint(max_entries, 1); +} owner_stat SEC(".maps"); + +/* maintain timestamp at the beginning of contention */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, int); + __type(value, struct tstamp_data); + __uint(max_entries, MAX_ENTRIES); +} tstamp SEC(".maps"); + +/* maintain per-CPU timestamp at the beginning of contention */ +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct tstamp_data)); + __uint(max_entries, 1); +} tstamp_cpu SEC(".maps"); + +/* actual lock contention statistics */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(struct contention_key)); + __uint(value_size, sizeof(struct contention_data)); + __uint(max_entries, MAX_ENTRIES); +} lock_stat SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct contention_task_data)); + __uint(max_entries, MAX_ENTRIES); +} task_data SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u64)); + __uint(value_size, sizeof(__u32)); + __uint(max_entries, MAX_ENTRIES); +} lock_syms SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u8)); + __uint(max_entries, 1); +} cpu_filter SEC(".maps"); + +struct { + __uint(type, 
BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u8)); + __uint(max_entries, 1); +} task_filter SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u8)); + __uint(max_entries, 1); +} type_filter SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u64)); + __uint(value_size, sizeof(__u8)); + __uint(max_entries, 1); +} addr_filter SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u64)); + __uint(value_size, sizeof(__u8)); + __uint(max_entries, 1); +} cgroup_filter SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(long)); + __uint(value_size, sizeof(__u8)); + __uint(max_entries, 1); +} slab_filter SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(long)); + __uint(value_size, sizeof(struct slab_cache_data)); + __uint(max_entries, 1); +} slab_caches SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u64)); + __uint(value_size, sizeof(__u64)); + __uint(max_entries, 1); +} lock_delays SEC(".maps"); + +struct rw_semaphore___old { + struct task_struct *owner; +} __attribute__((preserve_access_index)); + +struct rw_semaphore___new { + atomic_long_t owner; +} __attribute__((preserve_access_index)); + +struct mm_struct___old { + struct rw_semaphore mmap_sem; +} __attribute__((preserve_access_index)); + +struct mm_struct___new { + struct rw_semaphore mmap_lock; +} __attribute__((preserve_access_index)); + +extern struct kmem_cache *bpf_get_kmem_cache(u64 addr) __ksym __weak; + +/* control flags */ +const volatile int has_cpu; +const volatile int has_task; +const volatile int has_type; +const volatile int has_addr; +const volatile int has_cgroup; +const volatile int has_slab; +const volatile int needs_callstack; +const volatile int stack_skip; +const volatile int lock_owner; +const volatile int use_cgroup_v2; +const volatile int max_stack; +const volatile int lock_delay; + +/* determine the key of lock stat */ +const volatile int aggr_mode; + +int enabled; + +int perf_subsys_id = -1; + +__u64 end_ts; + +__u32 slab_cache_id; + +/* error stat */ +int task_fail; +int stack_fail; +int time_fail; +int data_fail; + +int task_map_full; +int data_map_full; + +struct task_struct *bpf_task_from_pid(s32 pid) __ksym __weak; +void bpf_task_release(struct task_struct *p) __ksym __weak; + +static inline __u64 get_current_cgroup_id(void) +{ + struct task_struct *task; + struct cgroup *cgrp; + + if (use_cgroup_v2) + return bpf_get_current_cgroup_id(); + + task = bpf_get_current_task_btf(); + + if (perf_subsys_id == -1) { +#if __has_builtin(__builtin_preserve_enum_value) + perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id, + perf_event_cgrp_id); +#else + perf_subsys_id = perf_event_cgrp_id; +#endif + } + + cgrp = BPF_CORE_READ(task, cgroups, subsys[perf_subsys_id], cgroup); + return BPF_CORE_READ(cgrp, kn, id); +} + +static inline int can_record(u64 *ctx) +{ + if (has_cpu) { + __u32 cpu = bpf_get_smp_processor_id(); + __u8 *ok; + + ok = bpf_map_lookup_elem(&cpu_filter, &cpu); + if (!ok) + return 0; + } + + if (has_task) { + __u8 *ok; + __u32 pid = bpf_get_current_pid_tgid(); + + ok = bpf_map_lookup_elem(&task_filter, &pid); + if (!ok) + return 0; + } + + if (has_type) { + __u8 *ok; + __u32 flags = (__u32)ctx[1]; + + ok = bpf_map_lookup_elem(&type_filter, &flags); + if (!ok) + return 0; + } + + if (has_addr) { + __u8 
*ok; + __u64 addr = ctx[0]; + + ok = bpf_map_lookup_elem(&addr_filter, &addr); + if (!ok && !has_slab) + return 0; + } + + if (has_cgroup) { + __u8 *ok; + __u64 cgrp = get_current_cgroup_id(); + + ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp); + if (!ok) + return 0; + } + + if (has_slab && bpf_get_kmem_cache) { + __u8 *ok; + __u64 addr = ctx[0]; + long kmem_cache_addr; + + kmem_cache_addr = (long)bpf_get_kmem_cache(addr); + ok = bpf_map_lookup_elem(&slab_filter, &kmem_cache_addr); + if (!ok) + return 0; + } + + return 1; +} + +static inline int update_task_data(struct task_struct *task) +{ + struct contention_task_data *p; + int pid, err; + + err = bpf_core_read(&pid, sizeof(pid), &task->pid); + if (err) + return -1; + + p = bpf_map_lookup_elem(&task_data, &pid); + if (p == NULL && !task_map_full) { + struct contention_task_data data = {}; + + BPF_CORE_READ_STR_INTO(&data.comm, task, comm); + if (bpf_map_update_elem(&task_data, &pid, &data, BPF_NOEXIST) == -E2BIG) + task_map_full = 1; + } + + return 0; +} + +#ifndef __has_builtin +# define __has_builtin(x) 0 +#endif + +static inline struct task_struct *get_lock_owner(__u64 lock, __u32 flags) +{ + struct task_struct *task; + __u64 owner = 0; + + if (flags & LCB_F_MUTEX) { + struct mutex *mutex = (void *)lock; + owner = BPF_CORE_READ(mutex, owner.counter); + } else if (flags == LCB_F_READ || flags == LCB_F_WRITE) { + /* + * Support for the BPF_TYPE_MATCHES argument to the + * __builtin_preserve_type_info builtin was added at some point during + * development of clang 15 and it's what is needed for + * bpf_core_type_matches. + */ +#if __has_builtin(__builtin_preserve_type_info) && __clang_major__ >= 15 + if (bpf_core_type_matches(struct rw_semaphore___old)) { + struct rw_semaphore___old *rwsem = (void *)lock; + owner = (unsigned long)BPF_CORE_READ(rwsem, owner); + } else if (bpf_core_type_matches(struct rw_semaphore___new)) { + struct rw_semaphore___new *rwsem = (void *)lock; + owner = BPF_CORE_READ(rwsem, owner.counter); + } +#else + /* assume new struct */ + struct rw_semaphore *rwsem = (void *)lock; + owner = BPF_CORE_READ(rwsem, owner.counter); +#endif + } + + if (!owner) + return NULL; + + task = (void *)(owner & ~7UL); + return task; +} + +static inline __u32 check_lock_type(__u64 lock, __u32 flags) +{ + struct task_struct *curr; + struct mm_struct___old *mm_old; + struct mm_struct___new *mm_new; + struct sighand_struct *sighand; + + switch (flags) { + case LCB_F_READ: /* rwsem */ + case LCB_F_WRITE: + curr = bpf_get_current_task_btf(); + if (curr->mm == NULL) + break; + mm_new = (void *)curr->mm; + if (bpf_core_field_exists(mm_new->mmap_lock)) { + if (&mm_new->mmap_lock == (void *)lock) + return LCD_F_MMAP_LOCK; + break; + } + mm_old = (void *)curr->mm; + if (bpf_core_field_exists(mm_old->mmap_sem)) { + if (&mm_old->mmap_sem == (void *)lock) + return LCD_F_MMAP_LOCK; + } + break; + case LCB_F_SPIN: /* spinlock */ + curr = bpf_get_current_task_btf(); + sighand = curr->sighand; + + if (sighand && &sighand->siglock == (void *)lock) + return LCD_F_SIGHAND_LOCK; + break; + default: + break; + } + return 0; +} + +static inline long delay_callback(__u64 idx, void *arg) +{ + __u64 target = *(__u64 *)arg; + + if (target <= bpf_ktime_get_ns()) + return 1; + + /* just to kill time */ + (void)bpf_get_prandom_u32(); + + return 0; +} + +static inline void do_lock_delay(__u64 duration) +{ + __u64 target = bpf_ktime_get_ns() + duration; + + bpf_loop(MAX_LOOP, delay_callback, &target, /*flags=*/0); +} + +static inline void check_lock_delay(__u64 
lock) +{ + __u64 *delay; + + delay = bpf_map_lookup_elem(&lock_delays, &lock); + if (delay) + do_lock_delay(*delay); +} + +static inline struct tstamp_data *get_tstamp_elem(__u32 flags) +{ + __u32 pid; + struct tstamp_data *pelem; + + /* Use per-cpu array map for spinlock and rwlock */ + if ((flags & (LCB_F_SPIN | LCB_F_MUTEX)) == LCB_F_SPIN) { + __u32 idx = 0; + + pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx); + /* Do not update the element for nested locks */ + if (pelem && pelem->lock) + pelem = NULL; + return pelem; + } + + pid = bpf_get_current_pid_tgid(); + pelem = bpf_map_lookup_elem(&tstamp, &pid); + /* Do not update the element for nested locks */ + if (pelem && pelem->lock) + return NULL; + + if (pelem == NULL) { + struct tstamp_data zero = {}; + + if (bpf_map_update_elem(&tstamp, &pid, &zero, BPF_NOEXIST) < 0) { + __sync_fetch_and_add(&task_fail, 1); + return NULL; + } + + pelem = bpf_map_lookup_elem(&tstamp, &pid); + if (pelem == NULL) { + __sync_fetch_and_add(&task_fail, 1); + return NULL; + } + } + return pelem; +} + +static inline s32 get_owner_stack_id(u64 *stacktrace) +{ + s32 *id, new_id; + static s64 id_gen = 1; + + id = bpf_map_lookup_elem(&owner_stacks, stacktrace); + if (id) + return *id; + + new_id = (s32)__sync_fetch_and_add(&id_gen, 1); + + bpf_map_update_elem(&owner_stacks, stacktrace, &new_id, BPF_NOEXIST); + + id = bpf_map_lookup_elem(&owner_stacks, stacktrace); + if (id) + return *id; + + return -1; +} + +static inline void update_contention_data(struct contention_data *data, u64 duration, u32 count) +{ + __sync_fetch_and_add(&data->total_time, duration); + __sync_fetch_and_add(&data->count, count); + + /* FIXME: need atomic operations */ + if (data->max_time < duration) + data->max_time = duration; + if (data->min_time > duration) + data->min_time = duration; +} + +static inline void update_owner_stat(u32 id, u64 duration, u32 flags) +{ + struct contention_key key = { + .stack_id = id, + .pid = 0, + .lock_addr_or_cgroup = 0, + }; + struct contention_data *data = bpf_map_lookup_elem(&owner_stat, &key); + + if (!data) { + struct contention_data first = { + .total_time = duration, + .max_time = duration, + .min_time = duration, + .count = 1, + .flags = flags, + }; + bpf_map_update_elem(&owner_stat, &key, &first, BPF_NOEXIST); + } else { + update_contention_data(data, duration, 1); + } +} + +SEC("tp_btf/contention_begin") +int contention_begin(u64 *ctx) +{ + struct tstamp_data *pelem; + + if (!enabled || !can_record(ctx)) + return 0; + + pelem = get_tstamp_elem(ctx[1]); + if (pelem == NULL) + return 0; + + pelem->timestamp = bpf_ktime_get_ns(); + pelem->lock = (__u64)ctx[0]; + pelem->flags = (__u32)ctx[1]; + + if (needs_callstack) { + u32 i = 0; + u32 id = 0; + int owner_pid; + u64 *buf; + struct task_struct *task; + struct owner_tracing_data *otdata; + + if (!lock_owner) + goto skip_owner; + + task = get_lock_owner(pelem->lock, pelem->flags); + if (!task) + goto skip_owner; + + owner_pid = BPF_CORE_READ(task, pid); + + buf = bpf_map_lookup_elem(&stack_buf, &i); + if (!buf) + goto skip_owner; + for (i = 0; i < max_stack; i++) + buf[i] = 0x0; + + if (!bpf_task_from_pid) + goto skip_owner; + + task = bpf_task_from_pid(owner_pid); + if (!task) + goto skip_owner; + + bpf_get_task_stack(task, buf, max_stack * sizeof(unsigned long), 0); + bpf_task_release(task); + + otdata = bpf_map_lookup_elem(&owner_data, &pelem->lock); + id = get_owner_stack_id(buf); + + /* + * Contention just happens, or corner case `lock` is owned by process not + * `owner_pid`. 
For the corner case we treat it as an unexpected internal error and + * just ignore the previous tracing record. + */ + if (!otdata || otdata->pid != owner_pid) { + struct owner_tracing_data first = { + .pid = owner_pid, + .timestamp = pelem->timestamp, + .count = 1, + .stack_id = id, + }; + bpf_map_update_elem(&owner_data, &pelem->lock, &first, BPF_ANY); + } + /* Contention is ongoing and new waiter joins */ + else { + __sync_fetch_and_add(&otdata->count, 1); + + /* + * The owner is the same, but stacktrace might be changed. In this case we + * store/update `owner_stat` based on current owner stack id. + */ + if (id != otdata->stack_id) { + update_owner_stat(id, pelem->timestamp - otdata->timestamp, + pelem->flags); + + otdata->timestamp = pelem->timestamp; + otdata->stack_id = id; + } + } +skip_owner: + pelem->stack_id = bpf_get_stackid(ctx, &stacks, + BPF_F_FAST_STACK_CMP | stack_skip); + if (pelem->stack_id < 0) + __sync_fetch_and_add(&stack_fail, 1); + } else if (aggr_mode == LOCK_AGGR_TASK) { + struct task_struct *task; + + if (lock_owner) { + task = get_lock_owner(pelem->lock, pelem->flags); + + /* The flags field is not used anymore. Pass the owner pid. */ + if (task) + pelem->flags = BPF_CORE_READ(task, pid); + else + pelem->flags = -1U; + + } else { + task = bpf_get_current_task_btf(); + } + + if (task) { + if (update_task_data(task) < 0 && lock_owner) + pelem->flags = -1U; + } + } + + return 0; +} + +SEC("tp_btf/contention_end") +int contention_end(u64 *ctx) +{ + __u32 pid = 0, idx = 0; + struct tstamp_data *pelem; + struct contention_key key = {}; + struct contention_data *data; + __u64 timestamp; + __u64 duration; + bool need_delete = false; + + if (!enabled) + return 0; + + /* + * For spinlock and rwlock, it needs to get the timestamp for the + * per-cpu map. However, contention_end does not have the flags + * so it cannot know whether to read the percpu or the hash map. + * + * Try the per-cpu map first and check if there's active contention. + * If there is, do not read the hash map because it cannot go to sleeping + * locks before releasing the spinning locks. + */ + pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx); + if (pelem && pelem->lock) { + if (pelem->lock != ctx[0]) + return 0; + } else { + pid = bpf_get_current_pid_tgid(); + pelem = bpf_map_lookup_elem(&tstamp, &pid); + if (!pelem || pelem->lock != ctx[0]) + return 0; + need_delete = true; + } + + timestamp = bpf_ktime_get_ns(); + duration = timestamp - pelem->timestamp; + if ((__s64)duration < 0) { + __sync_fetch_and_add(&time_fail, 1); + goto out; + } + + if (needs_callstack && lock_owner) { + struct owner_tracing_data *otdata = bpf_map_lookup_elem(&owner_data, &pelem->lock); + + if (!otdata) + goto skip_owner; + + /* Update `owner_stat` */ + update_owner_stat(otdata->stack_id, timestamp - otdata->timestamp, pelem->flags); + + /* No contention is occurring, delete the `lock` entry in `owner_data` */ + if (otdata->count <= 1) + bpf_map_delete_elem(&owner_data, &pelem->lock); + /* + * Contention is still ongoing, with a new owner (current task). `owner_data` + * should be updated accordingly. + */ + else { + u32 i = 0; + s32 ret = (s32)ctx[1]; + u64 *buf; + + otdata->timestamp = timestamp; + __sync_fetch_and_add(&otdata->count, -1); + + buf = bpf_map_lookup_elem(&stack_buf, &i); + if (!buf) + goto skip_owner; + for (i = 0; i < (u32)max_stack; i++) + buf[i] = 0x0; + + /* + * `ret` has the return code of the lock function. + * If `ret` is negative, the current task terminates lock waiting without + * acquiring it. 
Owner is not changed, but we still need to update the owner + * stack. + */ + if (ret < 0) { + s32 id = 0; + struct task_struct *task; + + if (!bpf_task_from_pid) + goto skip_owner; + + task = bpf_task_from_pid(otdata->pid); + if (!task) + goto skip_owner; + + bpf_get_task_stack(task, buf, + max_stack * sizeof(unsigned long), 0); + bpf_task_release(task); + + id = get_owner_stack_id(buf); + + /* + * If owner stack is changed, update owner stack id for this lock. + */ + if (id != otdata->stack_id) + otdata->stack_id = id; + } + /* + * Otherwise, update tracing data with the current task, which is the new + * owner. + */ + else { + otdata->pid = pid; + /* + * We don't want to retrieve callstack here, since it is where the + * current task acquires the lock and provides no additional + * information. We simply assign -1 to invalidate it. + */ + otdata->stack_id = -1; + } + } + } +skip_owner: + switch (aggr_mode) { + case LOCK_AGGR_CALLER: + key.stack_id = pelem->stack_id; + break; + case LOCK_AGGR_TASK: + if (lock_owner) + key.pid = pelem->flags; + else { + if (!need_delete) + pid = bpf_get_current_pid_tgid(); + key.pid = pid; + } + if (needs_callstack) + key.stack_id = pelem->stack_id; + break; + case LOCK_AGGR_ADDR: + key.lock_addr_or_cgroup = pelem->lock; + if (needs_callstack) + key.stack_id = pelem->stack_id; + break; + case LOCK_AGGR_CGROUP: + key.lock_addr_or_cgroup = get_current_cgroup_id(); + break; + default: + /* should not happen */ + return 0; + } + + data = bpf_map_lookup_elem(&lock_stat, &key); + if (!data) { + if (data_map_full) { + __sync_fetch_and_add(&data_fail, 1); + goto out; + } + + struct contention_data first = { + .total_time = duration, + .max_time = duration, + .min_time = duration, + .count = 1, + .flags = pelem->flags, + }; + int err; + + if (aggr_mode == LOCK_AGGR_ADDR) { + first.flags |= check_lock_type(pelem->lock, + pelem->flags & LCB_F_TYPE_MASK); + + /* Check if it's from a slab object */ + if (bpf_get_kmem_cache) { + struct kmem_cache *s; + struct slab_cache_data *d; + + s = bpf_get_kmem_cache(pelem->lock); + if (s != NULL) { + /* + * Save the ID of the slab cache in the flags + * (instead of full address) to reduce the + * space in the contention_data. 
+ */ + d = bpf_map_lookup_elem(&slab_caches, &s); + if (d != NULL) + first.flags |= d->id; + } + } + } + + err = bpf_map_update_elem(&lock_stat, &key, &first, BPF_NOEXIST); + if (err < 0) { + if (err == -EEXIST) { + /* it lost the race, try to get it again */ + data = bpf_map_lookup_elem(&lock_stat, &key); + if (data != NULL) + goto found; + } + if (err == -E2BIG) + data_map_full = 1; + __sync_fetch_and_add(&data_fail, 1); + } + goto out; + } + +found: + update_contention_data(data, duration, 1); + +out: + if (lock_delay) + check_lock_delay(pelem->lock); + + pelem->lock = 0; + if (need_delete) + bpf_map_delete_elem(&tstamp, &pid); + return 0; +} + +extern struct rq runqueues __ksym; + +const volatile __u64 contig_page_data_addr; +const volatile __u64 node_data_addr; +const volatile int nr_nodes; +const volatile int sizeof_zone; + +struct rq___old { + raw_spinlock_t lock; +} __attribute__((preserve_access_index)); + +struct rq___new { + raw_spinlock_t __lock; +} __attribute__((preserve_access_index)); + +static void collect_zone_lock(void) +{ + __u64 nr_zones, zone_off; + __u64 lock_addr, lock_off; + __u32 lock_flag = LOCK_CLASS_ZONE_LOCK; + + zone_off = offsetof(struct pglist_data, node_zones); + lock_off = offsetof(struct zone, lock); + + if (contig_page_data_addr) { + struct pglist_data *contig_page_data; + + contig_page_data = (void *)(long)contig_page_data_addr; + nr_zones = BPF_CORE_READ(contig_page_data, nr_zones); + + for (int i = 0; i < MAX_ZONES; i++) { + __u64 zone_addr; + + if (i >= nr_zones) + break; + + zone_addr = contig_page_data_addr + (sizeof_zone * i) + zone_off; + lock_addr = zone_addr + lock_off; + + bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY); + } + } else if (nr_nodes > 0) { + struct pglist_data **node_data = (void *)(long)node_data_addr; + + for (int i = 0; i < nr_nodes; i++) { + struct pglist_data *pgdat = NULL; + int err; + + err = bpf_core_read(&pgdat, sizeof(pgdat), &node_data[i]); + if (err < 0 || pgdat == NULL) + break; + + nr_zones = BPF_CORE_READ(pgdat, nr_zones); + for (int k = 0; k < MAX_ZONES; k++) { + __u64 zone_addr; + + if (k >= nr_zones) + break; + + zone_addr = (__u64)(void *)pgdat + (sizeof_zone * k) + zone_off; + lock_addr = zone_addr + lock_off; + + bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY); + } + } + } +} + +SEC("raw_tp/bpf_test_finish") +int BPF_PROG(collect_lock_syms) +{ + __u64 lock_addr, lock_off; + __u32 lock_flag; + + if (bpf_core_field_exists(struct rq___new, __lock)) + lock_off = offsetof(struct rq___new, __lock); + else + lock_off = offsetof(struct rq___old, lock); + + for (int i = 0; i < MAX_CPUS; i++) { + struct rq *rq = bpf_per_cpu_ptr(&runqueues, i); + + if (rq == NULL) + break; + + lock_addr = (__u64)(void *)rq + lock_off; + lock_flag = LOCK_CLASS_RQLOCK; + bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY); + } + + collect_zone_lock(); + + return 0; +} + +SEC("raw_tp/bpf_test_finish") +int BPF_PROG(end_timestamp) +{ + end_ts = bpf_ktime_get_ns(); + return 0; +} + +/* + * bpf_iter__kmem_cache added recently so old kernels don't have it in the + * vmlinux.h. But we cannot add it here since it will cause a compiler error + * due to redefinition of the struct on later kernels. + * + * So it uses a CO-RE trick to access the member only if it has the type. + * This will support both old and new kernels without compiler errors. 
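+ * (libbpf ignores the ___new suffix when matching this type against the
+ * kernel BTF, and bpf_core_type_exists() below is resolved at load time,
+ * so the local definition never clashes with kernels that already provide
+ * struct bpf_iter__kmem_cache.)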
+ */ +struct bpf_iter__kmem_cache___new { + struct kmem_cache *s; +} __attribute__((preserve_access_index)); + +SEC("iter/kmem_cache") +int slab_cache_iter(void *ctx) +{ + struct kmem_cache *s = NULL; + struct slab_cache_data d; + const char *nameptr; + + if (bpf_core_type_exists(struct bpf_iter__kmem_cache)) { + struct bpf_iter__kmem_cache___new *iter = ctx; + + s = iter->s; + } + + if (s == NULL) + return 0; + + nameptr = s->name; + bpf_probe_read_kernel_str(d.name, sizeof(d.name), nameptr); + + d.id = ++slab_cache_id << LCB_F_SLAB_ID_SHIFT; + if (d.id >= LCB_F_SLAB_ID_END) + return 0; + + bpf_map_update_elem(&slab_caches, &s, &d, BPF_NOEXIST); + return 0; +} + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; diff --git a/tools/perf/util/bpf_skel/lock_data.h b/tools/perf/util/bpf_skel/lock_data.h new file mode 100644 index 000000000000..28c5e5aced7f --- /dev/null +++ b/tools/perf/util/bpf_skel/lock_data.h @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Data structures shared between BPF and tools. */ +#ifndef UTIL_BPF_SKEL_LOCK_DATA_H +#define UTIL_BPF_SKEL_LOCK_DATA_H + +struct owner_tracing_data { + u32 pid; // Who has the lock. + u32 count; // How many waiters for this lock. + u64 timestamp; // The time while the owner acquires lock and contention is going on. + s32 stack_id; // Identifier for `owner_stat`, which stores as value in `owner_stacks` +}; + +struct tstamp_data { + u64 timestamp; + u64 lock; + u32 flags; + s32 stack_id; +}; + +struct contention_key { + s32 stack_id; + u32 pid; + u64 lock_addr_or_cgroup; +}; + +#define TASK_COMM_LEN 16 + +struct contention_task_data { + char comm[TASK_COMM_LEN]; +}; + +/* default buffer size */ +#define MAX_ENTRIES 16384 + +/* + * Upper bits of the flags in the contention_data are used to identify + * some well-known locks which do not have symbols (non-global locks). 
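+ *
+ * Layout of contention_data.flags:
+ *   bit  31     LCD_F_MMAP_LOCK
+ *   bit  30     LCD_F_SIGHAND_LOCK
+ *   bits 25-16  slab cache id (LCB_F_SLAB_ID_MASK)
+ *   bits 6-0    lock type from the tracepoint (LCB_F_TYPE_MASK)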
+ */ +#define LCD_F_MMAP_LOCK (1U << 31) +#define LCD_F_SIGHAND_LOCK (1U << 30) + +#define LCB_F_SLAB_ID_SHIFT 16 +#define LCB_F_SLAB_ID_START (1U << 16) +#define LCB_F_SLAB_ID_END (1U << 26) +#define LCB_F_SLAB_ID_MASK 0x03FF0000U + +#define LCB_F_TYPE_MAX (1U << 7) +#define LCB_F_TYPE_MASK 0x0000007FU + +#define SLAB_NAME_MAX 28 + +struct contention_data { + u64 total_time; + u64 min_time; + u64 max_time; + u32 count; + u32 flags; +}; + +enum lock_aggr_mode { + LOCK_AGGR_ADDR = 0, + LOCK_AGGR_TASK, + LOCK_AGGR_CALLER, + LOCK_AGGR_CGROUP, +}; + +enum lock_class_sym { + LOCK_CLASS_NONE, + LOCK_CLASS_RQLOCK, + LOCK_CLASS_ZONE_LOCK, +}; + +struct slab_cache_data { + u32 id; + char name[SLAB_NAME_MAX]; +}; + +#endif /* UTIL_BPF_SKEL_LOCK_DATA_H */ diff --git a/tools/perf/util/bpf_skel/off_cpu.bpf.c b/tools/perf/util/bpf_skel/off_cpu.bpf.c new file mode 100644 index 000000000000..72763bb8d1de --- /dev/null +++ b/tools/perf/util/bpf_skel/off_cpu.bpf.c @@ -0,0 +1,372 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (c) 2022 Google +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> + +/* task->flags for off-cpu analysis */ +#define PF_KTHREAD 0x00200000 /* I am a kernel thread */ + +/* task->state for off-cpu analysis */ +#define TASK_INTERRUPTIBLE 0x0001 +#define TASK_UNINTERRUPTIBLE 0x0002 + +/* create a new thread */ +#define CLONE_THREAD 0x10000 + +#define MAX_STACKS 32 +#define MAX_ENTRIES 102400 + +#define MAX_CPUS 4096 +#define MAX_OFFCPU_LEN 37 + +// We have a 'struct stack' in vmlinux.h when building with GEN_VMLINUX_H=1 +struct __stack { + u64 array[MAX_STACKS]; +}; + +struct tstamp_data { + __u32 stack_id; + __u32 state; + __u64 timestamp; + struct __stack stack; +}; + +struct offcpu_key { + __u32 pid; + __u32 tgid; + __u32 stack_id; + __u32 state; + __u64 cgroup_id; +}; + +struct { + __uint(type, BPF_MAP_TYPE_STACK_TRACE); + __uint(key_size, sizeof(__u32)); + __uint(value_size, MAX_STACKS * sizeof(__u64)); + __uint(max_entries, MAX_ENTRIES); +} stacks SEC(".maps"); + +struct offcpu_data { + u64 array[MAX_OFFCPU_LEN]; +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); + __uint(max_entries, MAX_CPUS); +} offcpu_output SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(struct offcpu_data)); + __uint(max_entries, 1); +} offcpu_payload SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct tstamp_data); +} tstamp SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(struct offcpu_key)); + __uint(value_size, sizeof(__u64)); + __uint(max_entries, MAX_ENTRIES); +} off_cpu SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u8)); + __uint(max_entries, 1); +} cpu_filter SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u32)); + __uint(value_size, sizeof(__u8)); + __uint(max_entries, 1); +} task_filter SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(key_size, sizeof(__u64)); + __uint(value_size, sizeof(__u8)); + __uint(max_entries, 1); +} cgroup_filter SEC(".maps"); + +/* new kernel task_struct definition */ +struct task_struct___new { + long __state; +} 
__attribute__((preserve_access_index)); + +/* old kernel task_struct definition */ +struct task_struct___old { + long state; +} __attribute__((preserve_access_index)); + +int enabled = 0; + +const volatile int has_cpu = 0; +const volatile int has_task = 0; +const volatile int has_cgroup = 0; +const volatile int uses_tgid = 0; + +const volatile bool has_prev_state = false; +const volatile bool needs_cgroup = false; +const volatile bool uses_cgroup_v1 = false; + +int perf_subsys_id = -1; + +__u64 offcpu_thresh_ns; + +/* + * Old kernel used to call it task_struct->state and now it's '__state'. + * Use BPF CO-RE "ignored suffix rule" to deal with it like below: + * + * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes + */ +static inline int get_task_state(struct task_struct *t) +{ + /* recast pointer to capture new type for compiler */ + struct task_struct___new *t_new = (void *)t; + + if (bpf_core_field_exists(t_new->__state)) { + return BPF_CORE_READ(t_new, __state); + } else { + /* recast pointer to capture old type for compiler */ + struct task_struct___old *t_old = (void *)t; + + return BPF_CORE_READ(t_old, state); + } +} + +static inline __u64 get_cgroup_id(struct task_struct *t) +{ + struct cgroup *cgrp; + + if (!uses_cgroup_v1) + return BPF_CORE_READ(t, cgroups, dfl_cgrp, kn, id); + + if (perf_subsys_id == -1) { +#if __has_builtin(__builtin_preserve_enum_value) + perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id, + perf_event_cgrp_id); +#else + perf_subsys_id = perf_event_cgrp_id; +#endif + } + + cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_subsys_id], cgroup); + return BPF_CORE_READ(cgrp, kn, id); +} + +static inline int can_record(struct task_struct *t, int state) +{ + /* kernel threads don't have user stack */ + if (t->flags & PF_KTHREAD) + return 0; + + if (state != TASK_INTERRUPTIBLE && + state != TASK_UNINTERRUPTIBLE) + return 0; + + if (has_cpu) { + __u32 cpu = bpf_get_smp_processor_id(); + __u8 *ok; + + ok = bpf_map_lookup_elem(&cpu_filter, &cpu); + if (!ok) + return 0; + } + + if (has_task) { + __u8 *ok; + __u32 pid; + + if (uses_tgid) + pid = t->tgid; + else + pid = t->pid; + + ok = bpf_map_lookup_elem(&task_filter, &pid); + if (!ok) + return 0; + } + + if (has_cgroup) { + __u8 *ok; + __u64 cgrp_id = get_cgroup_id(t); + + ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id); + if (!ok) + return 0; + } + + return 1; +} + +static inline int copy_stack(struct __stack *from, struct offcpu_data *to, int n) +{ + int len = 0; + + for (int i = 0; i < MAX_STACKS && from->array[i]; ++i, ++len) + to->array[n + 2 + i] = from->array[i]; + + return len; +} + +/** + * off_cpu_dump - dump off-cpu samples to ring buffer + * @data: payload for dumping off-cpu samples + * @key: off-cpu data + * @stack: stack trace of the task before being scheduled out + * + * If the threshold of off-cpu time is reached, acquire tid, period, callchain, and cgroup id + * information of the task, and dump it as a raw sample to perf ring buffer + */ +static int off_cpu_dump(void *ctx, struct offcpu_data *data, struct offcpu_key *key, + struct __stack *stack, __u64 delta) +{ + int n = 0, len = 0; + + data->array[n++] = (u64)key->tgid << 32 | key->pid; + data->array[n++] = delta; + + /* data->array[n] is callchain->nr (updated later) */ + data->array[n + 1] = PERF_CONTEXT_USER; + data->array[n + 2] = 0; + len = copy_stack(stack, data, n); + + /* update length of callchain */ + data->array[n] = len + 1; + n += len + 2; + + data->array[n++] = key->cgroup_id; + + 
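+	/*
+	 * At this point data->array[] holds the complete raw sample:
+	 *   [0]         tgid << 32 | pid
+	 *   [1]         off-cpu time (delta), used as the period
+	 *   [2]         callchain nr (number of ips plus the context marker)
+	 *   [3]         PERF_CONTEXT_USER
+	 *   [4..3+len]  user stack entries copied from task storage
+	 *   [last]      cgroup id
+	 * Emit it through the offcpu_output perf event array below.
+	 */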
return bpf_perf_event_output(ctx, &offcpu_output, BPF_F_CURRENT_CPU, data, n * sizeof(u64)); +} + +static int off_cpu_stat(u64 *ctx, struct task_struct *prev, + struct task_struct *next, int state) +{ + __u64 ts; + __u32 stack_id; + struct tstamp_data *pelem; + + ts = bpf_ktime_get_ns(); + + if (!can_record(prev, state)) + goto next; + + stack_id = bpf_get_stackid(ctx, &stacks, + BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK); + + pelem = bpf_task_storage_get(&tstamp, prev, NULL, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!pelem) + goto next; + + pelem->timestamp = ts; + pelem->state = state; + pelem->stack_id = stack_id; + + /* + * If stacks are successfully collected by bpf_get_stackid(), collect them once more + * in task_storage for direct off-cpu sample dumping + */ + if (stack_id > 0 && bpf_get_stack(ctx, &pelem->stack, MAX_STACKS * sizeof(u64), BPF_F_USER_STACK)) { + /* + * This empty if block is used to avoid 'result unused warning' from bpf_get_stack(). + * If the collection fails, continue with the logic for the next task. + */ + } +next: + pelem = bpf_task_storage_get(&tstamp, next, NULL, 0); + + if (pelem && pelem->timestamp) { + struct offcpu_key key = { + .pid = next->pid, + .tgid = next->tgid, + .stack_id = pelem->stack_id, + .state = pelem->state, + .cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0, + }; + __u64 delta = ts - pelem->timestamp; + __u64 *total; + + if (delta >= offcpu_thresh_ns) { + int zero = 0; + struct offcpu_data *data = bpf_map_lookup_elem(&offcpu_payload, &zero); + + if (data) + off_cpu_dump(ctx, data, &key, &pelem->stack, delta); + } else { + total = bpf_map_lookup_elem(&off_cpu, &key); + if (total) + *total += delta; + else + bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY); + } + + /* prevent to reuse the timestamp later */ + pelem->timestamp = 0; + } + + return 0; +} + +SEC("tp_btf/task_newtask") +int on_newtask(u64 *ctx) +{ + struct task_struct *task; + u64 clone_flags; + u32 pid; + u8 val = 1; + + if (!uses_tgid) + return 0; + + task = (struct task_struct *)bpf_get_current_task(); + + pid = BPF_CORE_READ(task, tgid); + if (!bpf_map_lookup_elem(&task_filter, &pid)) + return 0; + + task = (struct task_struct *)ctx[0]; + clone_flags = ctx[1]; + + pid = task->tgid; + if (!(clone_flags & CLONE_THREAD)) + bpf_map_update_elem(&task_filter, &pid, &val, BPF_NOEXIST); + + return 0; +} + +SEC("tp_btf/sched_switch") +int on_switch(u64 *ctx) +{ + struct task_struct *prev, *next; + int prev_state; + + if (!enabled) + return 0; + + prev = (struct task_struct *)ctx[1]; + next = (struct task_struct *)ctx[2]; + + if (has_prev_state) + prev_state = (int)ctx[3]; + else + prev_state = get_task_state(prev); + + return off_cpu_stat(ctx, prev, next, prev_state & 0xff); +} + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; diff --git a/tools/perf/util/bpf_skel/sample-filter.h b/tools/perf/util/bpf_skel/sample-filter.h new file mode 100644 index 000000000000..683fec85e71e --- /dev/null +++ b/tools/perf/util/bpf_skel/sample-filter.h @@ -0,0 +1,72 @@ +#ifndef PERF_UTIL_BPF_SKEL_SAMPLE_FILTER_H +#define PERF_UTIL_BPF_SKEL_SAMPLE_FILTER_H + +#define MAX_FILTERS 64 +#define MAX_IDX_HASH (16 * 1024) +#define MAX_EVT_HASH (1024 * 1024) + +/* supported filter operations */ +enum perf_bpf_filter_op { + PBF_OP_EQ, + PBF_OP_NEQ, + PBF_OP_GT, + PBF_OP_GE, + PBF_OP_LT, + PBF_OP_LE, + PBF_OP_AND, + PBF_OP_GROUP_BEGIN, + PBF_OP_GROUP_END, + PBF_OP_DONE, +}; + +enum perf_bpf_filter_term { + /* No term is in use. */ + PBF_TERM_NONE = 0, + /* Terms that correspond to PERF_SAMPLE_xx values. 
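+    The offset of each term from PBF_TERM_SAMPLE_START equals the bit position
+    of the matching PERF_SAMPLE_* flag; perf_get_sample() in sample_filter.bpf.c
+    checks this with _Static_assert.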
*/ + PBF_TERM_SAMPLE_START = PBF_TERM_NONE + 1, + PBF_TERM_IP = PBF_TERM_SAMPLE_START + 0, /* SAMPLE_IP = 1U << 0 */ + PBF_TERM_TID = PBF_TERM_SAMPLE_START + 1, /* SAMPLE_TID = 1U << 1 */ + PBF_TERM_TIME = PBF_TERM_SAMPLE_START + 2, /* SAMPLE_TIME = 1U << 2 */ + PBF_TERM_ADDR = PBF_TERM_SAMPLE_START + 3, /* SAMPLE_ADDR = 1U << 3 */ + __PBF_UNUSED_TERM4 = PBF_TERM_SAMPLE_START + 4, /* SAMPLE_READ = 1U << 4 */ + __PBF_UNUSED_TERM5 = PBF_TERM_SAMPLE_START + 5, /* SAMPLE_CALLCHAIN = 1U << 5 */ + PBF_TERM_ID = PBF_TERM_SAMPLE_START + 6, /* SAMPLE_ID = 1U << 6 */ + PBF_TERM_CPU = PBF_TERM_SAMPLE_START + 7, /* SAMPLE_CPU = 1U << 7 */ + PBF_TERM_PERIOD = PBF_TERM_SAMPLE_START + 8, /* SAMPLE_PERIOD = 1U << 8 */ + __PBF_UNUSED_TERM9 = PBF_TERM_SAMPLE_START + 9, /* SAMPLE_STREAM_ID = 1U << 9 */ + __PBF_UNUSED_TERM10 = PBF_TERM_SAMPLE_START + 10, /* SAMPLE_RAW = 1U << 10 */ + __PBF_UNUSED_TERM11 = PBF_TERM_SAMPLE_START + 11, /* SAMPLE_BRANCH_STACK = 1U << 11 */ + __PBF_UNUSED_TERM12 = PBF_TERM_SAMPLE_START + 12, /* SAMPLE_REGS_USER = 1U << 12 */ + __PBF_UNUSED_TERM13 = PBF_TERM_SAMPLE_START + 13, /* SAMPLE_STACK_USER = 1U << 13 */ + PBF_TERM_WEIGHT = PBF_TERM_SAMPLE_START + 14, /* SAMPLE_WEIGHT = 1U << 14 */ + PBF_TERM_DATA_SRC = PBF_TERM_SAMPLE_START + 15, /* SAMPLE_DATA_SRC = 1U << 15 */ + __PBF_UNUSED_TERM16 = PBF_TERM_SAMPLE_START + 16, /* SAMPLE_IDENTIFIER = 1U << 16 */ + PBF_TERM_TRANSACTION = PBF_TERM_SAMPLE_START + 17, /* SAMPLE_TRANSACTION = 1U << 17 */ + __PBF_UNUSED_TERM18 = PBF_TERM_SAMPLE_START + 18, /* SAMPLE_REGS_INTR = 1U << 18 */ + PBF_TERM_PHYS_ADDR = PBF_TERM_SAMPLE_START + 19, /* SAMPLE_PHYS_ADDR = 1U << 19 */ + __PBF_UNUSED_TERM20 = PBF_TERM_SAMPLE_START + 20, /* SAMPLE_AUX = 1U << 20 */ + PBF_TERM_CGROUP = PBF_TERM_SAMPLE_START + 21, /* SAMPLE_CGROUP = 1U << 21 */ + PBF_TERM_DATA_PAGE_SIZE = PBF_TERM_SAMPLE_START + 22, /* SAMPLE_DATA_PAGE_SIZE = 1U << 22 */ + PBF_TERM_CODE_PAGE_SIZE = PBF_TERM_SAMPLE_START + 23, /* SAMPLE_CODE_PAGE_SIZE = 1U << 23 */ + PBF_TERM_WEIGHT_STRUCT = PBF_TERM_SAMPLE_START + 24, /* SAMPLE_WEIGHT_STRUCT = 1U << 24 */ + PBF_TERM_SAMPLE_END = PBF_TERM_WEIGHT_STRUCT, + /* Terms computed from BPF helpers. */ + PBF_TERM_UID, + PBF_TERM_GID, +}; + +/* BPF map entry for filtering */ +struct perf_bpf_filter_entry { + enum perf_bpf_filter_op op; + __u32 part; /* sub-sample type info when it has multiple values */ + enum perf_bpf_filter_term term; + __u64 value; +}; + +struct idx_hash_key { + __u64 evt_id; + __u32 tgid; + __u32 reserved; +}; + +#endif /* PERF_UTIL_BPF_SKEL_SAMPLE_FILTER_H */ diff --git a/tools/perf/util/bpf_skel/sample_filter.bpf.c b/tools/perf/util/bpf_skel/sample_filter.bpf.c new file mode 100644 index 000000000000..b195e6efeb8b --- /dev/null +++ b/tools/perf/util/bpf_skel/sample_filter.bpf.c @@ -0,0 +1,298 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +// Copyright (c) 2023 Google +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> + +#include "sample-filter.h" + +/* BPF map that will be filled by user space */ +struct filters { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, int); + __type(value, struct perf_bpf_filter_entry[MAX_FILTERS]); + __uint(max_entries, 1); +} filters SEC(".maps"); + +/* + * An evsel has multiple instances for each CPU or task but we need a single + * id to be used as a key for the idx_hash. This hashmap would translate the + * instance's ID to a representative ID. 
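+ * The key used for the event_hash lookup in perf_sample_filter() is the
+ * event's primary id: event->parent->id when the event has a parent,
+ * otherwise its own id.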
+ */ +struct event_hash { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, __u64); + __type(value, __u64); + __uint(max_entries, 1); +} event_hash SEC(".maps"); + +/* tgid/evtid to filter index */ +struct idx_hash { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct idx_hash_key); + __type(value, int); + __uint(max_entries, 1); +} idx_hash SEC(".maps"); + +/* tgid to filter index */ +struct lost_count { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, int); + __uint(max_entries, 1); +} dropped SEC(".maps"); + +volatile const int use_idx_hash; + +void *bpf_cast_to_kern_ctx(void *) __ksym; + +/* new kernel perf_sample_data definition */ +struct perf_sample_data___new { + __u64 sample_flags; +} __attribute__((preserve_access_index)); + +/* new kernel perf_mem_data_src definition */ +union perf_mem_data_src___new { + __u64 val; + struct { + __u64 mem_op:5, /* type of opcode */ + mem_lvl:14, /* memory hierarchy level */ + mem_snoop:5, /* snoop mode */ + mem_lock:2, /* lock instr */ + mem_dtlb:7, /* tlb access */ + mem_lvl_num:4, /* memory hierarchy level number */ + mem_remote:1, /* remote */ + mem_snoopx:2, /* snoop mode, ext */ + mem_blk:3, /* access blocked */ + mem_hops:3, /* hop level */ + mem_rsvd:18; + }; +}; + +/* helper function to return the given perf sample data */ +static inline __u64 perf_get_sample(struct bpf_perf_event_data_kern *kctx, + struct perf_bpf_filter_entry *entry) +{ + struct perf_sample_data___new *data = (void *)kctx->data; + + if (!bpf_core_field_exists(data->sample_flags)) + return 0; + +#define BUILD_CHECK_SAMPLE(x) \ + _Static_assert((1 << (PBF_TERM_##x - PBF_TERM_SAMPLE_START)) == PERF_SAMPLE_##x, \ + "Mismatched PBF term to sample bit " #x) + BUILD_CHECK_SAMPLE(IP); + BUILD_CHECK_SAMPLE(TID); + BUILD_CHECK_SAMPLE(TIME); + BUILD_CHECK_SAMPLE(ADDR); + BUILD_CHECK_SAMPLE(ID); + BUILD_CHECK_SAMPLE(CPU); + BUILD_CHECK_SAMPLE(PERIOD); + BUILD_CHECK_SAMPLE(WEIGHT); + BUILD_CHECK_SAMPLE(DATA_SRC); + BUILD_CHECK_SAMPLE(TRANSACTION); + BUILD_CHECK_SAMPLE(PHYS_ADDR); + BUILD_CHECK_SAMPLE(CGROUP); + BUILD_CHECK_SAMPLE(DATA_PAGE_SIZE); + BUILD_CHECK_SAMPLE(CODE_PAGE_SIZE); + BUILD_CHECK_SAMPLE(WEIGHT_STRUCT); +#undef BUILD_CHECK_SAMPLE + + /* For sample terms check the sample bit is set. 
*/ + if (entry->term >= PBF_TERM_SAMPLE_START && entry->term <= PBF_TERM_SAMPLE_END && + (data->sample_flags & (1 << (entry->term - PBF_TERM_SAMPLE_START))) == 0) + return 0; + + switch (entry->term) { + case PBF_TERM_IP: + return kctx->data->ip; + case PBF_TERM_ID: + return kctx->data->id; + case PBF_TERM_TID: + if (entry->part) + return kctx->data->tid_entry.pid; + else + return kctx->data->tid_entry.tid; + case PBF_TERM_CPU: + return kctx->data->cpu_entry.cpu; + case PBF_TERM_TIME: + return kctx->data->time; + case PBF_TERM_ADDR: + return kctx->data->addr; + case PBF_TERM_PERIOD: + return kctx->data->period; + case PBF_TERM_TRANSACTION: + return kctx->data->txn; + case PBF_TERM_WEIGHT_STRUCT: + if (entry->part == 1) + return kctx->data->weight.var1_dw; + if (entry->part == 2) + return kctx->data->weight.var2_w; + if (entry->part == 3) + return kctx->data->weight.var3_w; + /* fall through */ + case PBF_TERM_WEIGHT: + return kctx->data->weight.full; + case PBF_TERM_PHYS_ADDR: + return kctx->data->phys_addr; + case PBF_TERM_CGROUP: + return kctx->data->cgroup; + case PBF_TERM_CODE_PAGE_SIZE: + return kctx->data->code_page_size; + case PBF_TERM_DATA_PAGE_SIZE: + return kctx->data->data_page_size; + case PBF_TERM_DATA_SRC: + if (entry->part == 1) + return kctx->data->data_src.mem_op; + if (entry->part == 2) + return kctx->data->data_src.mem_lvl_num; + if (entry->part == 3) { + __u32 snoop = kctx->data->data_src.mem_snoop; + __u32 snoopx = kctx->data->data_src.mem_snoopx; + + return (snoopx << 5) | snoop; + } + if (entry->part == 4) + return kctx->data->data_src.mem_remote; + if (entry->part == 5) + return kctx->data->data_src.mem_lock; + if (entry->part == 6) + return kctx->data->data_src.mem_dtlb; + if (entry->part == 7) + return kctx->data->data_src.mem_blk; + if (entry->part == 8) { + union perf_mem_data_src___new *data = (void *)&kctx->data->data_src; + + if (bpf_core_field_exists(data->mem_hops)) + return data->mem_hops; + + return 0; + } + /* return the whole word */ + return kctx->data->data_src.val; + case PBF_TERM_UID: + return bpf_get_current_uid_gid() & 0xFFFFFFFF; + case PBF_TERM_GID: + return bpf_get_current_uid_gid() >> 32; + case PBF_TERM_NONE: + case __PBF_UNUSED_TERM4: + case __PBF_UNUSED_TERM5: + case __PBF_UNUSED_TERM9: + case __PBF_UNUSED_TERM10: + case __PBF_UNUSED_TERM11: + case __PBF_UNUSED_TERM12: + case __PBF_UNUSED_TERM13: + case __PBF_UNUSED_TERM16: + case __PBF_UNUSED_TERM18: + case __PBF_UNUSED_TERM20: + default: + break; + } + return 0; +} + +#define CHECK_RESULT(data, op, val) \ + if (!(data op val)) { \ + if (!in_group) \ + goto drop; \ + } else if (in_group) { \ + group_result = 1; \ + } + +/* BPF program to be called from perf event overflow handler */ +SEC("perf_event") +int perf_sample_filter(void *ctx) +{ + struct bpf_perf_event_data_kern *kctx; + struct perf_bpf_filter_entry *entry; + __u64 sample_data; + int in_group = 0; + int group_result = 0; + int i, k; + int *losts; + + kctx = bpf_cast_to_kern_ctx(ctx); + + k = 0; + + if (use_idx_hash) { + struct idx_hash_key key = { + .tgid = bpf_get_current_pid_tgid() >> 32, + }; + __u64 eid = kctx->event->id; + __u64 *key_id; + int *idx; + + /* get primary_event_id */ + if (kctx->event->parent) + eid = kctx->event->parent->id; + + key_id = bpf_map_lookup_elem(&event_hash, &eid); + if (key_id == NULL) + goto drop; + + key.evt_id = *key_id; + + idx = bpf_map_lookup_elem(&idx_hash, &key); + if (idx) + k = *idx; + else + goto drop; + } + + entry = bpf_map_lookup_elem(&filters, &k); + if (entry == NULL) + goto drop; 
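+	/*
+	 * Evaluate the filter entries in order. Outside of a group every
+	 * comparison must pass, otherwise CHECK_RESULT() branches to drop.
+	 * Between PBF_OP_GROUP_BEGIN and PBF_OP_GROUP_END it is enough for
+	 * one comparison to pass (group_result), and PBF_OP_DONE accepts
+	 * the sample immediately.
+	 */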
+ + for (i = 0; i < MAX_FILTERS; i++) { + sample_data = perf_get_sample(kctx, &entry[i]); + + switch (entry[i].op) { + case PBF_OP_EQ: + CHECK_RESULT(sample_data, ==, entry[i].value) + break; + case PBF_OP_NEQ: + CHECK_RESULT(sample_data, !=, entry[i].value) + break; + case PBF_OP_GT: + CHECK_RESULT(sample_data, >, entry[i].value) + break; + case PBF_OP_GE: + CHECK_RESULT(sample_data, >=, entry[i].value) + break; + case PBF_OP_LT: + CHECK_RESULT(sample_data, <, entry[i].value) + break; + case PBF_OP_LE: + CHECK_RESULT(sample_data, <=, entry[i].value) + break; + case PBF_OP_AND: + CHECK_RESULT(sample_data, &, entry[i].value) + break; + case PBF_OP_GROUP_BEGIN: + in_group = 1; + group_result = 0; + break; + case PBF_OP_GROUP_END: + if (group_result == 0) + goto drop; + in_group = 0; + break; + case PBF_OP_DONE: + /* no failures so far, accept it */ + return 1; + } + } + /* generate sample data */ + return 1; + +drop: + losts = bpf_map_lookup_elem(&dropped, &k); + if (losts != NULL) + __sync_fetch_and_add(losts, 1); + + return 0; +} + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; diff --git a/tools/perf/util/bpf_skel/syscall_summary.bpf.c b/tools/perf/util/bpf_skel/syscall_summary.bpf.c new file mode 100644 index 000000000000..1bcd066a5199 --- /dev/null +++ b/tools/perf/util/bpf_skel/syscall_summary.bpf.c @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Trace raw_syscalls tracepoints to collect system call statistics. + */ + +#include "vmlinux.h" +#include "syscall_summary.h" + +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> + +/* This is to calculate a delta between sys-enter and sys-exit for each thread */ +struct syscall_trace { + int nr; /* syscall number is only available at sys-enter */ + int unused; + u64 timestamp; +}; + +#define MAX_ENTRIES (128 * 1024) + +struct syscall_trace_map { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, int); /* tid */ + __type(value, struct syscall_trace); + __uint(max_entries, MAX_ENTRIES); +} syscall_trace_map SEC(".maps"); + +struct syscall_stats_map { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, struct syscall_key); + __type(value, struct syscall_stats); + __uint(max_entries, MAX_ENTRIES); +} syscall_stats_map SEC(".maps"); + +int enabled; /* controlled from userspace */ + +const volatile enum syscall_aggr_mode aggr_mode; +const volatile int use_cgroup_v2; + +int perf_subsys_id = -1; + +static inline __u64 get_current_cgroup_id(void) +{ + struct task_struct *task; + struct cgroup *cgrp; + + if (use_cgroup_v2) + return bpf_get_current_cgroup_id(); + + task = bpf_get_current_task_btf(); + + if (perf_subsys_id == -1) { +#if __has_builtin(__builtin_preserve_enum_value) + perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id, + perf_event_cgrp_id); +#else + perf_subsys_id = perf_event_cgrp_id; +#endif + } + + cgrp = BPF_CORE_READ(task, cgroups, subsys[perf_subsys_id], cgroup); + return BPF_CORE_READ(cgrp, kn, id); +} + +static void update_stats(int cpu_or_tid, u64 cgroup_id, int nr, s64 duration, + long ret) +{ + struct syscall_key key = { + .cpu_or_tid = cpu_or_tid, + .cgroup = cgroup_id, + .nr = nr, + }; + struct syscall_stats *stats; + + stats = bpf_map_lookup_elem(&syscall_stats_map, &key); + if (stats == NULL) { + struct syscall_stats zero = {}; + + bpf_map_update_elem(&syscall_stats_map, &key, &zero, BPF_NOEXIST); + stats = bpf_map_lookup_elem(&syscall_stats_map, &key); + if (stats == NULL) + return; + } + + __sync_fetch_and_add(&stats->count, 1); + if (ret < 0) + 
__sync_fetch_and_add(&stats->error, 1); + + if (duration > 0) { + __sync_fetch_and_add(&stats->total_time, duration); + __sync_fetch_and_add(&stats->squared_sum, duration * duration); + if (stats->max_time < duration) + stats->max_time = duration; + if (stats->min_time > duration || stats->min_time == 0) + stats->min_time = duration; + } + + return; +} + +SEC("tp_btf/sys_enter") +int sys_enter(u64 *ctx) +{ + int tid; + struct syscall_trace st; + + if (!enabled) + return 0; + + st.nr = ctx[1]; /* syscall number */ + st.unused = 0; + st.timestamp = bpf_ktime_get_ns(); + + tid = bpf_get_current_pid_tgid(); + bpf_map_update_elem(&syscall_trace_map, &tid, &st, BPF_ANY); + + return 0; +} + +SEC("tp_btf/sys_exit") +int sys_exit(u64 *ctx) +{ + int tid; + int key = 0; + u64 cgroup = 0; + long ret = ctx[1]; /* return value of the syscall */ + struct syscall_trace *st; + s64 delta; + + if (!enabled) + return 0; + + tid = bpf_get_current_pid_tgid(); + st = bpf_map_lookup_elem(&syscall_trace_map, &tid); + if (st == NULL) + return 0; + + if (aggr_mode == SYSCALL_AGGR_THREAD) + key = tid; + else if (aggr_mode == SYSCALL_AGGR_CGROUP) + cgroup = get_current_cgroup_id(); + else + key = bpf_get_smp_processor_id(); + + delta = bpf_ktime_get_ns() - st->timestamp; + update_stats(key, cgroup, st->nr, delta, ret); + + bpf_map_delete_elem(&syscall_trace_map, &tid); + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/perf/util/bpf_skel/syscall_summary.h b/tools/perf/util/bpf_skel/syscall_summary.h new file mode 100644 index 000000000000..72ccccb45925 --- /dev/null +++ b/tools/perf/util/bpf_skel/syscall_summary.h @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Data structures shared between BPF and tools. */ +#ifndef UTIL_BPF_SKEL_SYSCALL_SUMMARY_H +#define UTIL_BPF_SKEL_SYSCALL_SUMMARY_H + +enum syscall_aggr_mode { + SYSCALL_AGGR_THREAD, + SYSCALL_AGGR_CPU, + SYSCALL_AGGR_CGROUP, +}; + +struct syscall_key { + u64 cgroup; + int cpu_or_tid; + int nr; +}; + +struct syscall_stats { + u64 total_time; + u64 squared_sum; + u64 max_time; + u64 min_time; + u32 count; + u32 error; +}; + +#endif /* UTIL_BPF_SKEL_SYSCALL_SUMMARY_H */ diff --git a/tools/perf/util/bpf_skel/vmlinux/.gitignore b/tools/perf/util/bpf_skel/vmlinux/.gitignore new file mode 100644 index 000000000000..49502c04183a --- /dev/null +++ b/tools/perf/util/bpf_skel/vmlinux/.gitignore @@ -0,0 +1 @@ +!vmlinux.h diff --git a/tools/perf/util/bpf_skel/vmlinux/vmlinux.h b/tools/perf/util/bpf_skel/vmlinux/vmlinux.h new file mode 100644 index 000000000000..a59ce912be18 --- /dev/null +++ b/tools/perf/util/bpf_skel/vmlinux/vmlinux.h @@ -0,0 +1,215 @@ +#ifndef __VMLINUX_H +#define __VMLINUX_H + +#include <linux/stddef.h> // for define __always_inline +#include <linux/bpf.h> +#include <linux/types.h> +#include <linux/perf_event.h> +#include <stdbool.h> + +// non-UAPI kernel data structures, used in the .bpf.c BPF tool component. + +// Just the fields used in these tools preserving the access index so that +// libbpf can fixup offsets with the ones used in the kernel when loading the +// BPF bytecode, if they differ from what is used here. 
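+// The preserve_access_index attribute on these definitions makes clang emit
+// CO-RE relocations for plain field accesses (e.g. task->pid), which is what
+// lets libbpf adjust the offsets at load time.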
+ +typedef __u8 u8; +typedef __u32 u32; +typedef __s32 s32; +typedef __u64 u64; +typedef __s64 s64; + +typedef int pid_t; + +typedef __s64 time64_t; + +struct timespec64 { + time64_t tv_sec; + long int tv_nsec; +}; + +enum cgroup_subsys_id { + perf_event_cgrp_id = 8, +}; + +enum { + HI_SOFTIRQ = 0, + TIMER_SOFTIRQ, + NET_TX_SOFTIRQ, + NET_RX_SOFTIRQ, + BLOCK_SOFTIRQ, + IRQ_POLL_SOFTIRQ, + TASKLET_SOFTIRQ, + SCHED_SOFTIRQ, + HRTIMER_SOFTIRQ, + RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */ + + NR_SOFTIRQS +}; + +typedef struct { + s64 counter; +} __attribute__((preserve_access_index)) atomic64_t; + +typedef atomic64_t atomic_long_t; + +struct raw_spinlock { + int rawlock; +} __attribute__((preserve_access_index)); + +typedef struct raw_spinlock raw_spinlock_t; + +typedef struct { + struct raw_spinlock rlock; +} __attribute__((preserve_access_index)) spinlock_t; + +struct sighand_struct { + spinlock_t siglock; +} __attribute__((preserve_access_index)); + +struct rw_semaphore { + atomic_long_t owner; +} __attribute__((preserve_access_index)); + +struct mutex { + atomic_long_t owner; +} __attribute__((preserve_access_index)); + +struct kernfs_node { + u64 id; +} __attribute__((preserve_access_index)); + +struct cgroup { + struct kernfs_node *kn; + int level; +} __attribute__((preserve_access_index)); + +struct cgroup_subsys_state { + struct cgroup *cgroup; +} __attribute__((preserve_access_index)); + +struct css_set { + struct cgroup_subsys_state *subsys[13]; + struct cgroup *dfl_cgrp; +} __attribute__((preserve_access_index)); + +struct mm_struct { + struct rw_semaphore mmap_lock; +} __attribute__((preserve_access_index)); + +struct task_struct { + unsigned int flags; + struct mm_struct *mm; + pid_t pid; + pid_t tgid; + char comm[16]; + struct sighand_struct *sighand; + struct css_set *cgroups; +} __attribute__((preserve_access_index)); + +struct trace_entry { + short unsigned int type; + unsigned char flags; + unsigned char preempt_count; + int pid; +} __attribute__((preserve_access_index)); + +struct trace_event_raw_irq_handler_entry { + struct trace_entry ent; + int irq; + u32 __data_loc_name; + char __data[]; +} __attribute__((preserve_access_index)); + +struct trace_event_raw_irq_handler_exit { + struct trace_entry ent; + int irq; + int ret; + char __data[]; +} __attribute__((preserve_access_index)); + +struct trace_event_raw_softirq { + struct trace_entry ent; + unsigned int vec; + char __data[]; +} __attribute__((preserve_access_index)); + +struct trace_event_raw_workqueue_execute_start { + struct trace_entry ent; + void *work; + void *function; + char __data[]; +} __attribute__((preserve_access_index)); + +struct trace_event_raw_workqueue_execute_end { + struct trace_entry ent; + void *work; + void *function; + char __data[]; +} __attribute__((preserve_access_index)); + +struct trace_event_raw_workqueue_activate_work { + struct trace_entry ent; + void *work; + char __data[]; +} __attribute__((preserve_access_index)); + +struct perf_sample_data { + u64 addr; + u64 period; + union perf_sample_weight weight; + u64 txn; + union perf_mem_data_src data_src; + u64 ip; + struct { + u32 pid; + u32 tid; + } tid_entry; + u64 time; + u64 id; + struct { + u32 cpu; + } cpu_entry; + u64 phys_addr; + u64 cgroup; + u64 data_page_size; + u64 code_page_size; +} __attribute__((__aligned__(64))) __attribute__((preserve_access_index)); + +struct perf_event { + struct perf_event *parent; + u64 id; +} __attribute__((preserve_access_index)); + +struct bpf_perf_event_data_kern { + struct 
perf_sample_data *data; + struct perf_event *event; +} __attribute__((preserve_access_index)); + +/* + * If 'struct rq' isn't defined for lock_contention.bpf.c, for the sake of + * rq___old and rq___new, then the type for the 'runqueue' variable ends up + * being a forward declaration (BTF_KIND_FWD) while the kernel has it defined + * (BTF_KIND_STRUCT). The definition appears in vmlinux.h rather than + * lock_contention.bpf.c for consistency with a generated vmlinux.h. + */ +struct rq {}; + +struct kmem_cache { + const char *name; +} __attribute__((preserve_access_index)); + +struct bpf_iter__kmem_cache { + struct kmem_cache *s; +} __attribute__((preserve_access_index)); + +struct zone { + spinlock_t lock; +} __attribute__((preserve_access_index)); + +struct pglist_data { + struct zone node_zones[6]; /* value for all possible config */ + int nr_zones; +} __attribute__((preserve_access_index)); + +#endif // __VMLINUX_H
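The headers above define the map layouts shared between the BPF programs and the tool. As a minimal, illustrative sketch only (not the actual perf implementation, which goes through its generated skeletons and its own aggregation and symbolization code), a userspace consumer of the lock_stat map from lock_contention.bpf.c could walk it with the generic libbpf accessors roughly as below; the function name, the way the map fd is obtained and the output format are assumptions for the example, and the kernel-style u32/u64 typedefs are assumed to be available as they are in tools/perf.

/* Hypothetical reader of the lock_stat hash map (illustrative only). */
#include <stdio.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>
#include "lock_data.h"

static void dump_lock_stat(struct bpf_object *obj)
{
	struct contention_key key, next_key;
	struct contention_data data;
	void *prev = NULL;
	int fd = bpf_object__find_map_fd_by_name(obj, "lock_stat");

	if (fd < 0)
		return;

	/* iterate every key currently present in the hash map */
	while (bpf_map_get_next_key(fd, prev, &next_key) == 0) {
		if (bpf_map_lookup_elem(fd, &next_key, &data) == 0)
			printf("stack=%d pid=%u addr/cgrp=%llu count=%u avg=%llu ns\n",
			       next_key.stack_id, next_key.pid,
			       (unsigned long long)next_key.lock_addr_or_cgroup,
			       data.count,
			       data.count ? (unsigned long long)(data.total_time / data.count) : 0ULL);
		key = next_key;
		prev = &key;
	}
}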