diff options
Diffstat (limited to '')
-rw-r--r-- | tools/perf/builtin-trace.c | 2048 |
1 files changed, 1367 insertions, 681 deletions
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 44a75f234db1..2ab1b8e05ad3 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -15,9 +15,15 @@ */ #include "util/record.h" -#include <traceevent/event-parse.h> #include <api/fs/tracing_path.h> +#ifdef HAVE_LIBBPF_SUPPORT #include <bpf/bpf.h> +#include <bpf/libbpf.h> +#include <bpf/btf.h> +#ifdef HAVE_BPF_SKEL +#include "bpf_skel/augmented_raw_syscalls.skel.h" +#endif +#endif #include "util/bpf_map.h" #include "util/rlimit.h" #include "builtin.h" @@ -33,6 +39,7 @@ #include "util/synthetic-events.h" #include "util/evlist.h" #include "util/evswitch.h" +#include "util/hashmap.h" #include "util/mmap.h" #include <subcmd/pager.h> #include <subcmd/exec-cmd.h> @@ -48,17 +55,19 @@ #include "util/thread_map.h" #include "util/stat.h" #include "util/tool.h" +#include "util/trace.h" #include "util/util.h" #include "trace/beauty/beauty.h" #include "trace-event.h" #include "util/parse-events.h" -#include "util/bpf-loader.h" +#include "util/tracepoint.h" #include "callchain.h" #include "print_binary.h" #include "string2.h" #include "syscalltbl.h" -#include "rb_resort.h" #include "../perf.h" +#include "trace_augment.h" +#include "dwarf-regs.h" #include <errno.h> #include <inttypes.h> @@ -69,6 +78,7 @@ #include <linux/err.h> #include <linux/filter.h> #include <linux/kernel.h> +#include <linux/list_sort.h> #include <linux/random.h> #include <linux/stringify.h> #include <linux/time64.h> @@ -78,6 +88,11 @@ #include <linux/ctype.h> #include <perf/mmap.h> +#include <tools/libc_compat.h> + +#ifdef HAVE_LIBTRACEEVENT +#include <event-parse.h> +#endif #ifndef O_CLOEXEC # define O_CLOEXEC 02000000 @@ -87,8 +102,16 @@ # define F_LINUX_SPECIFIC_BASE 1024 #endif +#define RAW_SYSCALL_ARGS_NUM 6 + /* * strtoul: Go from a string to a value, i.e. for msr: MSR_FS_BASE to 0xc0000100 + * + * We have to explicitely mark the direction of the flow of data, if from the + * kernel to user space or the other way around, since the BPF collector we + * have so far copies only from user to kernel space, mark the arguments that + * go that direction, so that we don“t end up collecting the previous contents + * for syscall args that goes from kernel to user space. */ struct syscall_arg_fmt { size_t (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg); @@ -97,7 +120,12 @@ struct syscall_arg_fmt { void *parm; const char *name; u16 nr_entries; // for arrays + bool from_user; bool show_zero; +#ifdef HAVE_LIBBPF_SUPPORT + const struct btf_type *type; + int type_id; /* used in btf_dump */ +#endif }; struct syscall_fmt { @@ -107,7 +135,7 @@ struct syscall_fmt { const char *sys_enter, *sys_exit; } bpf_prog_name; - struct syscall_arg_fmt arg[6]; + struct syscall_arg_fmt arg[RAW_SYSCALL_ARGS_NUM]; u8 nr_args; bool errpid; bool timeout; @@ -116,29 +144,27 @@ struct syscall_fmt { struct trace { struct perf_tool tool; - struct syscalltbl *sctbl; struct { - struct syscall *table; - struct bpf_map *map; - struct { // per syscall BPF_MAP_TYPE_PROG_ARRAY - struct bpf_map *sys_enter, - *sys_exit; - } prog_array; + /** Sorted sycall numbers used by the trace. */ + struct syscall **table; + /** Size of table. */ + size_t table_size; struct { struct evsel *sys_enter, - *sys_exit, - *augmented; + *sys_exit, + *bpf_output; } events; - struct bpf_program *unaugmented_prog; } syscalls; - struct { - struct bpf_map *map; - } dump; +#ifdef HAVE_BPF_SKEL + struct augmented_raw_syscalls_bpf *skel; +#endif +#ifdef HAVE_LIBBPF_SUPPORT + struct btf *btf; +#endif struct record_opts opts; struct evlist *evlist; struct machine *host; struct thread *current; - struct bpf_object *bpf_obj; struct cgroup *cgroup; u64 base_time; FILE *output; @@ -156,14 +182,25 @@ struct trace { pid_t *entries; struct bpf_map *map; } filter_pids; + /* + * TODO: The map is from an ID (aka system call number) to struct + * syscall_stats. If there is >1 e_machine, such as i386 and x86-64 + * processes, then the stats here will gather wrong the statistics for + * the non EM_HOST system calls. A fix would be to add the e_machine + * into the key, but this would make the code inconsistent with the + * per-thread version. + */ + struct hashmap *syscall_stats; double duration_filter; double runtime_ms; + unsigned long pfmaj, pfmin; struct { u64 vfs_getname, proc_getname; } stats; unsigned int max_stack; unsigned int min_stack; + enum trace_summary_mode summary_mode; int raw_augmented_syscalls_args_size; bool raw_augmented_syscalls; bool fd_path_disabled; @@ -191,6 +228,8 @@ struct trace { bool show_string_prefix; bool force; bool vfs_getname; + bool force_btf; + bool summary_bpf; int trace_pgfaults; char *perfconfig_events; struct { @@ -199,6 +238,20 @@ struct trace { } oe; }; +static void trace__load_vmlinux_btf(struct trace *trace __maybe_unused) +{ +#ifdef HAVE_LIBBPF_SUPPORT + if (trace->btf != NULL) + return; + + trace->btf = btf__load_vmlinux_btf(); + if (verbose > 0) { + fprintf(trace->output, trace->btf ? "vmlinux BTF loaded\n" : + "Failed to load vmlinux BTF\n"); + } +#endif +} + struct tp_field { int offset; union { @@ -353,7 +406,12 @@ static struct syscall_arg_fmt *evsel__syscall_arg_fmt(struct evsel *evsel) } if (et->fmt == NULL) { - et->fmt = calloc(evsel->tp_format->format.nr_fields, sizeof(struct syscall_arg_fmt)); + const struct tep_event *tp_format = evsel__tp_format(evsel); + + if (tp_format == NULL) + goto out_delete; + + et->fmt = calloc(tp_format->format.nr_fields, sizeof(struct syscall_arg_fmt)); if (et->fmt == NULL) goto out_delete; } @@ -408,6 +466,7 @@ static int evsel__init_syscall_tp(struct evsel *evsel) if (evsel__init_tp_uint_field(evsel, &sc->id, "__syscall_nr") && evsel__init_tp_uint_field(evsel, &sc->id, "nr")) return -ENOENT; + return 0; } @@ -614,11 +673,8 @@ bool strarray__strtoul_flags(struct strarray *sa, char *bf, size_t size, u64 *re if (isalpha(*tok) || *tok == '_') { if (!strarray__strtoul(sa, tok, toklen, &val)) return false; - } else { - bool is_hexa = tok[0] == 0 && (tok[1] = 'x' || tok[1] == 'X'); - - val = strtoul(tok, NULL, is_hexa ? 16 : 0); - } + } else + val = strtoul(tok, NULL, 0); *ret |= (1 << (val - 1)); @@ -707,7 +763,15 @@ static size_t syscall_arg__scnprintf_char_array(char *bf, size_t size, struct sy static const char *bpf_cmd[] = { "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM", - "MAP_GET_NEXT_KEY", "PROG_LOAD", + "MAP_GET_NEXT_KEY", "PROG_LOAD", "OBJ_PIN", "OBJ_GET", "PROG_ATTACH", + "PROG_DETACH", "PROG_TEST_RUN", "PROG_GET_NEXT_ID", "MAP_GET_NEXT_ID", + "PROG_GET_FD_BY_ID", "MAP_GET_FD_BY_ID", "OBJ_GET_INFO_BY_FD", + "PROG_QUERY", "RAW_TRACEPOINT_OPEN", "BTF_LOAD", "BTF_GET_FD_BY_ID", + "TASK_FD_QUERY", "MAP_LOOKUP_AND_DELETE_ELEM", "MAP_FREEZE", + "BTF_GET_NEXT_ID", "MAP_LOOKUP_BATCH", "MAP_LOOKUP_AND_DELETE_BATCH", + "MAP_UPDATE_BATCH", "MAP_DELETE_BATCH", "LINK_CREATE", "LINK_UPDATE", + "LINK_GET_FD_BY_ID", "LINK_GET_NEXT_ID", "ENABLE_STATS", "ITER_CREATE", + "LINK_DETACH", "PROG_BIND_MAP", }; static DEFINE_STRARRAY(bpf_cmd, "BPF_"); @@ -754,7 +818,7 @@ static const char *fcntl_cmds[] = { static DEFINE_STRARRAY(fcntl_cmds, "F_"); static const char *fcntl_linux_specific_cmds[] = { - "SETLEASE", "GETLEASE", "NOTIFY", [5] = "CANCELLK", "DUPFD_CLOEXEC", + "SETLEASE", "GETLEASE", "NOTIFY", "DUPFD_QUERY", [5] = "CANCELLK", "DUPFD_CLOEXEC", "SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS", "GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT", }; @@ -819,6 +883,15 @@ static size_t syscall_arg__scnprintf_filename(char *bf, size_t size, #define SCA_FILENAME syscall_arg__scnprintf_filename +// 'argname' is just documentational at this point, to remove the previous comment with that info +#define SCA_FILENAME_FROM_USER(argname) \ + { .scnprintf = SCA_FILENAME, \ + .from_user = true, } + +static size_t syscall_arg__scnprintf_buf(char *bf, size_t size, struct syscall_arg *arg); + +#define SCA_BUF syscall_arg__scnprintf_buf + static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size, struct syscall_arg *arg) { @@ -876,6 +949,177 @@ static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size, #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags +#ifdef HAVE_LIBBPF_SUPPORT +static void syscall_arg_fmt__cache_btf_enum(struct syscall_arg_fmt *arg_fmt, struct btf *btf, char *type) +{ + int id; + + type = strstr(type, "enum "); + if (type == NULL) + return; + + type += 5; // skip "enum " to get the enumeration name + + id = btf__find_by_name(btf, type); + if (id < 0) + return; + + arg_fmt->type = btf__type_by_id(btf, id); +} + +static bool syscall_arg__strtoul_btf_enum(char *bf, size_t size, struct syscall_arg *arg, u64 *val) +{ + const struct btf_type *bt = arg->fmt->type; + struct btf *btf = arg->trace->btf; + struct btf_enum *be = btf_enum(bt); + + for (int i = 0; i < btf_vlen(bt); ++i, ++be) { + const char *name = btf__name_by_offset(btf, be->name_off); + int max_len = max(size, strlen(name)); + + if (strncmp(name, bf, max_len) == 0) { + *val = be->val; + return true; + } + } + + return false; +} + +static bool syscall_arg__strtoul_btf_type(char *bf, size_t size, struct syscall_arg *arg, u64 *val) +{ + const struct btf_type *bt; + char *type = arg->type_name; + struct btf *btf; + + trace__load_vmlinux_btf(arg->trace); + + btf = arg->trace->btf; + if (btf == NULL) + return false; + + if (arg->fmt->type == NULL) { + // See if this is an enum + syscall_arg_fmt__cache_btf_enum(arg->fmt, btf, type); + } + + // Now let's see if we have a BTF type resolved + bt = arg->fmt->type; + if (bt == NULL) + return false; + + // If it is an enum: + if (btf_is_enum(arg->fmt->type)) + return syscall_arg__strtoul_btf_enum(bf, size, arg, val); + + return false; +} + +static size_t btf_enum_scnprintf(const struct btf_type *type, struct btf *btf, char *bf, size_t size, int val) +{ + struct btf_enum *be = btf_enum(type); + const int nr_entries = btf_vlen(type); + + for (int i = 0; i < nr_entries; ++i, ++be) { + if (be->val == val) { + return scnprintf(bf, size, "%s", + btf__name_by_offset(btf, be->name_off)); + } + } + + return 0; +} + +struct trace_btf_dump_snprintf_ctx { + char *bf; + size_t printed, size; +}; + +static void trace__btf_dump_snprintf(void *vctx, const char *fmt, va_list args) +{ + struct trace_btf_dump_snprintf_ctx *ctx = vctx; + + ctx->printed += vscnprintf(ctx->bf + ctx->printed, ctx->size - ctx->printed, fmt, args); +} + +static size_t btf_struct_scnprintf(const struct btf_type *type, struct btf *btf, char *bf, size_t size, struct syscall_arg *arg) +{ + struct trace_btf_dump_snprintf_ctx ctx = { + .bf = bf, + .size = size, + }; + struct augmented_arg *augmented_arg = arg->augmented.args; + int type_id = arg->fmt->type_id, consumed; + struct btf_dump *btf_dump; + + LIBBPF_OPTS(btf_dump_opts, dump_opts); + LIBBPF_OPTS(btf_dump_type_data_opts, dump_data_opts); + + if (arg == NULL || arg->augmented.args == NULL) + return 0; + + dump_data_opts.compact = true; + dump_data_opts.skip_names = !arg->trace->show_arg_names; + + btf_dump = btf_dump__new(btf, trace__btf_dump_snprintf, &ctx, &dump_opts); + if (btf_dump == NULL) + return 0; + + /* pretty print the struct data here */ + if (btf_dump__dump_type_data(btf_dump, type_id, arg->augmented.args->value, type->size, &dump_data_opts) == 0) + return 0; + + consumed = sizeof(*augmented_arg) + augmented_arg->size; + arg->augmented.args = ((void *)arg->augmented.args) + consumed; + arg->augmented.size -= consumed; + + btf_dump__free(btf_dump); + + return ctx.printed; +} + +static size_t trace__btf_scnprintf(struct trace *trace, struct syscall_arg *arg, char *bf, + size_t size, int val, char *type) +{ + struct syscall_arg_fmt *arg_fmt = arg->fmt; + + if (trace->btf == NULL) + return 0; + + if (arg_fmt->type == NULL) { + // Check if this is an enum and if we have the BTF type for it. + syscall_arg_fmt__cache_btf_enum(arg_fmt, trace->btf, type); + } + + // Did we manage to find a BTF type for the syscall/tracepoint argument? + if (arg_fmt->type == NULL) + return 0; + + if (btf_is_enum(arg_fmt->type)) + return btf_enum_scnprintf(arg_fmt->type, trace->btf, bf, size, val); + else if (btf_is_struct(arg_fmt->type) || btf_is_union(arg_fmt->type)) + return btf_struct_scnprintf(arg_fmt->type, trace->btf, bf, size, arg); + + return 0; +} + +#else // HAVE_LIBBPF_SUPPORT +static size_t trace__btf_scnprintf(struct trace *trace __maybe_unused, struct syscall_arg *arg __maybe_unused, + char *bf __maybe_unused, size_t size __maybe_unused, int val __maybe_unused, + char *type __maybe_unused) +{ + return 0; +} + +static bool syscall_arg__strtoul_btf_type(char *bf __maybe_unused, size_t size __maybe_unused, + struct syscall_arg *arg __maybe_unused, u64 *val __maybe_unused) +{ + return false; +} +#endif // HAVE_LIBBPF_SUPPORT + +#define STUL_BTF_TYPE syscall_arg__strtoul_btf_type + #define STRARRAY(name, array) \ { .scnprintf = SCA_STRARRAY, \ .strtoul = STUL_STRARRAY, \ @@ -886,7 +1130,6 @@ static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size, .strtoul = STUL_STRARRAY_FLAGS, \ .parm = &strarray__##array, } -#include "trace/beauty/arch_errno_names.c" #include "trace/beauty/eventfd.c" #include "trace/beauty/futex_op.c" #include "trace/beauty/futex_val3.c" @@ -902,7 +1145,7 @@ static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size, #include "trace/beauty/socket_type.c" #include "trace/beauty/waitid_options.c" -static struct syscall_fmt syscall_fmts[] = { +static const struct syscall_fmt syscall_fmts[] = { { .name = "access", .arg = { [1] = { .scnprintf = SCA_ACCMODE, /* mode */ }, }, }, { .name = "arch_prctl", @@ -910,14 +1153,17 @@ static struct syscall_fmt syscall_fmts[] = { [1] = { .scnprintf = SCA_PTR, /* arg2 */ }, }, }, { .name = "bind", .arg = { [0] = { .scnprintf = SCA_INT, /* fd */ }, - [1] = { .scnprintf = SCA_SOCKADDR, /* umyaddr */ }, + [1] = SCA_SOCKADDR_FROM_USER(umyaddr), [2] = { .scnprintf = SCA_INT, /* addrlen */ }, }, }, { .name = "bpf", - .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, }, + .arg = { [0] = STRARRAY(cmd, bpf_cmd), + [1] = { .from_user = true /* attr */, }, } }, { .name = "brk", .hexret = true, .arg = { [0] = { .scnprintf = SCA_PTR, /* brk */ }, }, }, { .name = "clock_gettime", .arg = { [0] = STRARRAY(clk_id, clockid), }, }, + { .name = "clock_nanosleep", + .arg = { [2] = SCA_TIMESPEC_FROM_USER(req), }, }, { .name = "clone", .errpid = true, .nr_args = 5, .arg = { [0] = { .name = "flags", .scnprintf = SCA_CLONE_FLAGS, }, [1] = { .name = "child_stack", .scnprintf = SCA_HEX, }, @@ -928,12 +1174,21 @@ static struct syscall_fmt syscall_fmts[] = { .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, }, { .name = "connect", .arg = { [0] = { .scnprintf = SCA_INT, /* fd */ }, - [1] = { .scnprintf = SCA_SOCKADDR, /* servaddr */ }, + [1] = SCA_SOCKADDR_FROM_USER(servaddr), [2] = { .scnprintf = SCA_INT, /* addrlen */ }, }, }, { .name = "epoll_ctl", .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, }, { .name = "eventfd2", .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, }, + { .name = "faccessat", + .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, + [1] = SCA_FILENAME_FROM_USER(pathname), + [2] = { .scnprintf = SCA_ACCMODE, /* mode */ }, }, }, + { .name = "faccessat2", + .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, + [1] = SCA_FILENAME_FROM_USER(pathname), + [2] = { .scnprintf = SCA_ACCMODE, /* mode */ }, + [3] = { .scnprintf = SCA_FACCESSAT2_FLAGS, /* flags */ }, }, }, { .name = "fchmodat", .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, }, { .name = "fchownat", @@ -953,10 +1208,9 @@ static struct syscall_fmt syscall_fmts[] = { [2] = { .scnprintf = SCA_FSMOUNT_ATTR_FLAGS, /* attr_flags */ }, }, }, { .name = "fspick", .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, - [1] = { .scnprintf = SCA_FILENAME, /* path */ }, + [1] = SCA_FILENAME_FROM_USER(path), [2] = { .scnprintf = SCA_FSPICK_FLAGS, /* flags */ }, }, }, { .name = "fstat", .alias = "newfstat", }, - { .name = "fstatat", .alias = "newfstatat", }, { .name = "futex", .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ }, [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, }, @@ -971,6 +1225,8 @@ static struct syscall_fmt syscall_fmts[] = { .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, }, { .name = "getrlimit", .arg = { [0] = STRARRAY(resource, rlimit_resources), }, }, + { .name = "getsockopt", + .arg = { [1] = STRARRAY(level, socket_level), }, }, { .name = "gettid", .errpid = true, }, { .name = "ioctl", .arg = { @@ -1010,32 +1266,36 @@ static struct syscall_fmt syscall_fmts[] = { #if defined(__s390x__) .alias = "old_mmap", #endif - .arg = { [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ }, + .arg = { [2] = { .scnprintf = SCA_MMAP_PROT, .show_zero = true, /* prot */ }, [3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */ .strtoul = STUL_STRARRAY_FLAGS, .parm = &strarray__mmap_flags, }, [5] = { .scnprintf = SCA_HEX, /* offset */ }, }, }, { .name = "mount", - .arg = { [0] = { .scnprintf = SCA_FILENAME, /* dev_name */ }, + .arg = { [0] = SCA_FILENAME_FROM_USER(devname), [3] = { .scnprintf = SCA_MOUNT_FLAGS, /* flags */ .mask_val = SCAMV_MOUNT_FLAGS, /* flags */ }, }, }, { .name = "move_mount", .arg = { [0] = { .scnprintf = SCA_FDAT, /* from_dfd */ }, - [1] = { .scnprintf = SCA_FILENAME, /* from_pathname */ }, + [1] = SCA_FILENAME_FROM_USER(pathname), [2] = { .scnprintf = SCA_FDAT, /* to_dfd */ }, - [3] = { .scnprintf = SCA_FILENAME, /* to_pathname */ }, + [3] = SCA_FILENAME_FROM_USER(pathname), [4] = { .scnprintf = SCA_MOVE_MOUNT_FLAGS, /* flags */ }, }, }, { .name = "mprotect", .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ }, - [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ }, }, }, + [2] = { .scnprintf = SCA_MMAP_PROT, .show_zero = true, /* prot */ }, }, }, { .name = "mq_unlink", - .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, }, + .arg = { [0] = SCA_FILENAME_FROM_USER(u_name), }, }, { .name = "mremap", .hexret = true, .arg = { [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ }, }, }, { .name = "name_to_handle_at", .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, - { .name = "newfstatat", - .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, + { .name = "nanosleep", + .arg = { [0] = SCA_TIMESPEC_FROM_USER(req), }, }, + { .name = "newfstatat", .alias = "fstatat", + .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, + [1] = SCA_FILENAME_FROM_USER(pathname), + [3] = { .scnprintf = SCA_FS_AT_FLAGS, /* flags */ }, }, }, { .name = "open", .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, }, { .name = "open_by_handle_at", @@ -1045,7 +1305,8 @@ static struct syscall_fmt syscall_fmts[] = { .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, }, { .name = "perf_event_open", - .arg = { [2] = { .scnprintf = SCA_INT, /* cpu */ }, + .arg = { [0] = SCA_PERF_ATTR_FROM_USER(attr), + [2] = { .scnprintf = SCA_INT, /* cpu */ }, [3] = { .scnprintf = SCA_FD, /* group_fd */ }, [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, }, { .name = "pipe2", @@ -1056,7 +1317,7 @@ static struct syscall_fmt syscall_fmts[] = { .arg = { [0] = { .scnprintf = SCA_INT, /* key */ }, }, }, { .name = "pkey_mprotect", .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ }, - [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ }, + [2] = { .scnprintf = SCA_MMAP_PROT, .show_zero = true, /* prot */ }, [3] = { .scnprintf = SCA_INT, /* pkey */ }, }, }, { .name = "poll", .timeout = true, }, { .name = "ppoll", .timeout = true, }, @@ -1069,7 +1330,8 @@ static struct syscall_fmt syscall_fmts[] = { { .name = "pread", .alias = "pread64", }, { .name = "preadv", .alias = "pread", }, { .name = "prlimit64", - .arg = { [1] = STRARRAY(resource, rlimit_resources), }, }, + .arg = { [1] = STRARRAY(resource, rlimit_resources), + [2] = { .from_user = true /* new_rlim */, }, }, }, { .name = "pwrite", .alias = "pwrite64", }, { .name = "readlinkat", .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, @@ -1086,6 +1348,8 @@ static struct syscall_fmt syscall_fmts[] = { .arg = { [0] = { .scnprintf = SCA_FDAT, /* olddirfd */ }, [2] = { .scnprintf = SCA_FDAT, /* newdirfd */ }, [4] = { .scnprintf = SCA_RENAMEAT2_FLAGS, /* flags */ }, }, }, + { .name = "rseq", + .arg = { [0] = { .from_user = true /* rseq */, }, }, }, { .name = "rt_sigaction", .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, }, { .name = "rt_sigprocmask", @@ -1107,12 +1371,17 @@ static struct syscall_fmt syscall_fmts[] = { .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, }, { .name = "sendto", .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, - [4] = { .scnprintf = SCA_SOCKADDR, /* addr */ }, }, }, + [4] = SCA_SOCKADDR_FROM_USER(addr), }, }, + { .name = "set_robust_list", + .arg = { [0] = { .from_user = true /* head */, }, }, }, { .name = "set_tid_address", .errpid = true, }, { .name = "setitimer", .arg = { [0] = STRARRAY(which, itimers), }, }, { .name = "setrlimit", - .arg = { [0] = STRARRAY(resource, rlimit_resources), }, }, + .arg = { [0] = STRARRAY(resource, rlimit_resources), + [1] = { .from_user = true /* rlim */, }, }, }, + { .name = "setsockopt", + .arg = { [1] = STRARRAY(level, socket_level), }, }, { .name = "socket", .arg = { [0] = STRARRAY(family, socket_families), [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, @@ -1124,12 +1393,12 @@ static struct syscall_fmt syscall_fmts[] = { { .name = "stat", .alias = "newstat", }, { .name = "statx", .arg = { [0] = { .scnprintf = SCA_FDAT, /* fdat */ }, - [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } , + [2] = { .scnprintf = SCA_FS_AT_FLAGS, /* flags */ } , [3] = { .scnprintf = SCA_STATX_MASK, /* mask */ }, }, }, { .name = "swapoff", - .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, }, + .arg = { [0] = SCA_FILENAME_FROM_USER(specialfile), }, }, { .name = "swapon", - .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, }, + .arg = { [0] = SCA_FILENAME_FROM_USER(specialfile), }, }, { .name = "symlinkat", .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, { .name = "sync_file_range", @@ -1139,16 +1408,20 @@ static struct syscall_fmt syscall_fmts[] = { { .name = "tkill", .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, }, { .name = "umount2", .alias = "umount", - .arg = { [0] = { .scnprintf = SCA_FILENAME, /* name */ }, }, }, + .arg = { [0] = SCA_FILENAME_FROM_USER(name), }, }, { .name = "uname", .alias = "newuname", }, { .name = "unlinkat", - .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, + .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, + [1] = SCA_FILENAME_FROM_USER(pathname), + [2] = { .scnprintf = SCA_FS_AT_FLAGS, /* flags */ }, }, }, { .name = "utimensat", .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, }, { .name = "wait4", .errpid = true, .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, }, { .name = "waitid", .errpid = true, .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, }, + { .name = "write", + .arg = { [1] = { .scnprintf = SCA_BUF /* buf */, .from_user = true, }, }, }, }; static int syscall_fmt__cmp(const void *name, const void *fmtp) @@ -1157,18 +1430,21 @@ static int syscall_fmt__cmp(const void *name, const void *fmtp) return strcmp(name, fmt->name); } -static struct syscall_fmt *__syscall_fmt__find(struct syscall_fmt *fmts, const int nmemb, const char *name) +static const struct syscall_fmt *__syscall_fmt__find(const struct syscall_fmt *fmts, + const int nmemb, + const char *name) { return bsearch(name, fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp); } -static struct syscall_fmt *syscall_fmt__find(const char *name) +static const struct syscall_fmt *syscall_fmt__find(const char *name) { const int nmemb = ARRAY_SIZE(syscall_fmts); return __syscall_fmt__find(syscall_fmts, nmemb, name); } -static struct syscall_fmt *__syscall_fmt__find_by_alias(struct syscall_fmt *fmts, const int nmemb, const char *alias) +static const struct syscall_fmt *__syscall_fmt__find_by_alias(const struct syscall_fmt *fmts, + const int nmemb, const char *alias) { int i; @@ -1180,46 +1456,52 @@ static struct syscall_fmt *__syscall_fmt__find_by_alias(struct syscall_fmt *fmts return NULL; } -static struct syscall_fmt *syscall_fmt__find_by_alias(const char *alias) +static const struct syscall_fmt *syscall_fmt__find_by_alias(const char *alias) { const int nmemb = ARRAY_SIZE(syscall_fmts); return __syscall_fmt__find_by_alias(syscall_fmts, nmemb, alias); } -/* - * is_exit: is this "exit" or "exit_group"? - * is_open: is this "open" or "openat"? To associate the fd returned in sys_exit with the pathname in sys_enter. - * args_size: sum of the sizes of the syscall arguments, anything after that is augmented stuff: pathname for openat, etc. - * nonexistent: Just a hole in the syscall table, syscall id not allocated +/** + * struct syscall */ struct syscall { + /** @e_machine: The ELF machine associated with the entry. */ + int e_machine; + /** @id: id value from the tracepoint, the system call number. */ + int id; struct tep_event *tp_format; int nr_args; + /** + * @args_size: sum of the sizes of the syscall arguments, anything + * after that is augmented stuff: pathname for openat, etc. + */ + int args_size; struct { struct bpf_program *sys_enter, *sys_exit; } bpf_prog; + /** @is_exit: is this "exit" or "exit_group"? */ bool is_exit; + /** + * @is_open: is this "open" or "openat"? To associate the fd returned in + * sys_exit with the pathname in sys_enter. + */ bool is_open; + /** + * @nonexistent: Name lookup failed. Just a hole in the syscall table, + * syscall id not allocated. + */ bool nonexistent; + bool use_btf; struct tep_format_field *args; const char *name; - struct syscall_fmt *fmt; + const struct syscall_fmt *fmt; struct syscall_arg_fmt *arg_fmt; }; /* - * Must match what is in the BPF program: - * - * tools/perf/examples/bpf/augmented_raw_syscalls.c - */ -struct bpf_map_syscall_entry { - bool enabled; - u16 string_args_len[6]; -}; - -/* * We need to have this 'calculated' boolean because in some cases we really * don't know what is the duration of a syscall, for instance, when we start * a session and some threads are waiting for a syscall to finish, say 'poll', @@ -1269,22 +1551,70 @@ struct thread_trace { struct file *table; } files; - struct intlist *syscall_stats; + struct hashmap *syscall_stats; }; -static struct thread_trace *thread_trace__new(void) +static size_t syscall_id_hash(long key, void *ctx __maybe_unused) +{ + return key; +} + +static bool syscall_id_equal(long key1, long key2, void *ctx __maybe_unused) +{ + return key1 == key2; +} + +static struct hashmap *alloc_syscall_stats(void) +{ + return hashmap__new(syscall_id_hash, syscall_id_equal, NULL); +} + +static void delete_syscall_stats(struct hashmap *syscall_stats) +{ + struct hashmap_entry *pos; + size_t bkt; + + if (syscall_stats == NULL) + return; + + hashmap__for_each_entry(syscall_stats, pos, bkt) + zfree(&pos->pvalue); + hashmap__free(syscall_stats); +} + +static struct thread_trace *thread_trace__new(struct trace *trace) { struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace)); if (ttrace) { ttrace->files.max = -1; - ttrace->syscall_stats = intlist__new(NULL); + if (trace->summary) { + ttrace->syscall_stats = alloc_syscall_stats(); + if (IS_ERR(ttrace->syscall_stats)) + zfree(&ttrace); + } } return ttrace; } -static struct thread_trace *thread__trace(struct thread *thread, FILE *fp) +static void thread_trace__free_files(struct thread_trace *ttrace); + +static void thread_trace__delete(void *pttrace) +{ + struct thread_trace *ttrace = pttrace; + + if (!ttrace) + return; + + delete_syscall_stats(ttrace->syscall_stats); + ttrace->syscall_stats = NULL; + thread_trace__free_files(ttrace); + zfree(&ttrace->entry_str); + free(ttrace); +} + +static struct thread_trace *thread__trace(struct thread *thread, struct trace *trace) { struct thread_trace *ttrace; @@ -1292,7 +1622,7 @@ static struct thread_trace *thread__trace(struct thread *thread, FILE *fp) goto fail; if (thread__priv(thread) == NULL) - thread__set_priv(thread, thread_trace__new()); + thread__set_priv(thread, thread_trace__new(trace)); if (thread__priv(thread) == NULL) goto fail; @@ -1302,7 +1632,7 @@ static struct thread_trace *thread__trace(struct thread *thread, FILE *fp) return ttrace; fail: - color_fprintf(fp, PERF_COLOR_RED, + color_fprintf(trace->output, PERF_COLOR_RED, "WARNING: not enough memory, dropping samples!\n"); return NULL; } @@ -1321,6 +1651,17 @@ void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg, static const size_t trace__entry_str_size = 2048; +static void thread_trace__free_files(struct thread_trace *ttrace) +{ + for (int i = 0; i <= ttrace->files.max; ++i) { + struct file *file = ttrace->files.table + i; + zfree(&file->pathname); + } + + zfree(&ttrace->files.table); + ttrace->files.max = -1; +} + static struct file *thread_trace__files_entry(struct thread_trace *ttrace, int fd) { if (fd < 0) @@ -1358,6 +1699,7 @@ static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pat if (file != NULL) { struct stat st; + if (stat(pathname, &st) == 0) file->dev_maj = major(st.st_rdev); file->pathname = strdup(pathname); @@ -1374,12 +1716,13 @@ static int thread__read_fd_path(struct thread *thread, int fd) struct stat st; int ret; - if (thread->pid_ == thread->tid) { + if (thread__pid(thread) == thread__tid(thread)) { scnprintf(linkname, sizeof(linkname), - "/proc/%d/fd/%d", thread->pid_, fd); + "/proc/%d/fd/%d", thread__pid(thread), fd); } else { scnprintf(linkname, sizeof(linkname), - "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd); + "/proc/%d/task/%d/fd/%d", + thread__pid(thread), thread__tid(thread), fd); } if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname)) @@ -1498,6 +1841,32 @@ static size_t syscall_arg__scnprintf_filename(char *bf, size_t size, return 0; } +#define MAX_CONTROL_CHAR 31 +#define MAX_ASCII 127 + +static size_t syscall_arg__scnprintf_buf(char *bf, size_t size, struct syscall_arg *arg) +{ + struct augmented_arg *augmented_arg = arg->augmented.args; + unsigned char *orig = (unsigned char *)augmented_arg->value; + size_t printed = 0; + int consumed; + + if (augmented_arg == NULL) + return 0; + + for (int j = 0; j < augmented_arg->size; ++j) { + bool control_char = orig[j] <= MAX_CONTROL_CHAR || orig[j] >= MAX_ASCII; + /* print control characters (0~31 and 127), and non-ascii characters in \(digits) */ + printed += scnprintf(bf + printed, size - printed, control_char ? "\\%d" : "%c", (int)orig[j]); + } + + consumed = sizeof(*augmented_arg) + augmented_arg->size; + arg->augmented.args = ((void *)arg->augmented.args) + consumed; + arg->augmented.size -= consumed; + + return printed; +} + static bool trace__filter_duration(struct trace *trace, double t) { return t < (trace->duration_filter * NSEC_PER_MSEC); @@ -1524,13 +1893,20 @@ static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp) return fprintf(fp, " ? "); } -static bool done = false; -static bool interrupted = false; +static pid_t workload_pid = -1; +static volatile sig_atomic_t done = false; +static volatile sig_atomic_t interrupted = false; -static void sig_handler(int sig) +static void sighandler_interrupt(int sig __maybe_unused) { - done = true; - interrupted = sig == SIGINT; + done = interrupted = true; +} + +static void sighandler_chld(int sig __maybe_unused, siginfo_t *info, + void *context __maybe_unused) +{ + if (info->si_pid == workload_pid) + done = true; } static size_t trace__fprintf_comm_tid(struct trace *trace, struct thread *thread, FILE *fp) @@ -1540,7 +1916,7 @@ static size_t trace__fprintf_comm_tid(struct trace *trace, struct thread *thread if (trace->multiple_threads) { if (trace->show_comm) printed += fprintf(fp, "%.14s/", thread__comm_str(thread)); - printed += fprintf(fp, "%d ", thread->tid); + printed += fprintf(fp, "%d ", thread__tid(thread)); } return printed; @@ -1566,7 +1942,7 @@ static int trace__process_event(struct trace *trace, struct machine *machine, switch (event->header.type) { case PERF_RECORD_LOST: color_fprintf(trace->output, PERF_COLOR_RED, - "LOST %" PRIu64 " events!\n", event->lost.lost); + "LOST %" PRIu64 " events!\n", (u64)event->lost.lost); ret = machine__process_lost_event(machine, event, sample); break; default: @@ -1577,7 +1953,7 @@ static int trace__process_event(struct trace *trace, struct machine *machine, return ret; } -static int trace__tool_process(struct perf_tool *tool, +static int trace__tool_process(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -1615,13 +1991,15 @@ static int trace__symbols_init(struct trace *trace, struct evlist *evlist) if (trace->host == NULL) return -ENOMEM; + thread__set_priv_destructor(thread_trace__delete); + err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr); if (err < 0) goto out; err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target, - evlist->core.threads, trace__tool_process, false, - 1); + evlist->core.threads, trace__tool_process, + true, false, 1); out: if (err) symbol__exit(); @@ -1641,7 +2019,7 @@ static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args) { int idx; - if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0) + if (nr_args == RAW_SYSCALL_ARGS_NUM && sc->fmt && sc->fmt->nr_args != 0) nr_args = sc->fmt->nr_args; sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt)); @@ -1657,7 +2035,7 @@ static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args) return 0; } -static struct syscall_arg_fmt syscall_arg_fmts__by_name[] = { +static const struct syscall_arg_fmt syscall_arg_fmts__by_name[] = { { .name = "msr", .scnprintf = SCA_X86_MSR, .strtoul = STUL_X86_MSR, }, { .name = "vector", .scnprintf = SCA_X86_IRQ_VECTORS, .strtoul = STUL_X86_IRQ_VECTORS, }, }; @@ -1668,20 +2046,22 @@ static int syscall_arg_fmt__cmp(const void *name, const void *fmtp) return strcmp(name, fmt->name); } -static struct syscall_arg_fmt * -__syscall_arg_fmt__find_by_name(struct syscall_arg_fmt *fmts, const int nmemb, const char *name) +static const struct syscall_arg_fmt * +__syscall_arg_fmt__find_by_name(const struct syscall_arg_fmt *fmts, const int nmemb, + const char *name) { return bsearch(name, fmts, nmemb, sizeof(struct syscall_arg_fmt), syscall_arg_fmt__cmp); } -static struct syscall_arg_fmt *syscall_arg_fmt__find_by_name(const char *name) +static const struct syscall_arg_fmt *syscall_arg_fmt__find_by_name(const char *name) { const int nmemb = ARRAY_SIZE(syscall_arg_fmts__by_name); return __syscall_arg_fmt__find_by_name(syscall_arg_fmts__by_name, nmemb, name); } static struct tep_format_field * -syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field *field) +syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field *field, + bool *use_btf) { struct tep_format_field *last_field = NULL; int len; @@ -1694,11 +2074,15 @@ syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field len = strlen(field->name); + // As far as heuristics (or intention) goes this seems to hold true, and makes sense! + if ((field->flags & TEP_FIELD_IS_POINTER) && strstarts(field->type, "const ")) + arg->from_user = true; + if (strcmp(field->type, "const char *") == 0 && ((len >= 4 && strcmp(field->name + len - 4, "name") == 0) || - strstr(field->name, "path") != NULL)) + strstr(field->name, "path") != NULL)) { arg->scnprintf = SCA_FILENAME; - else if ((field->flags & TEP_FIELD_IS_POINTER) || strstr(field->name, "addr")) + } else if ((field->flags & TEP_FIELD_IS_POINTER) || strstr(field->name, "addr")) arg->scnprintf = SCA_PTR; else if (strcmp(field->type, "pid_t") == 0) arg->scnprintf = SCA_PID; @@ -1713,14 +2097,18 @@ syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field len >= 2 && strcmp(field->name + len - 2, "fd") == 0) { /* * /sys/kernel/tracing/events/syscalls/sys_enter* - * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c + * grep -E 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c * 65 int * 23 unsigned int * 7 unsigned long */ arg->scnprintf = SCA_FD; - } else { - struct syscall_arg_fmt *fmt = syscall_arg_fmt__find_by_name(field->name); + } else if (strstr(field->type, "enum") && use_btf != NULL) { + *use_btf = true; + arg->strtoul = STUL_BTF_TYPE; + } else { + const struct syscall_arg_fmt *fmt = + syscall_arg_fmt__find_by_name(field->name); if (fmt) { arg->scnprintf = fmt->scnprintf; @@ -1734,7 +2122,8 @@ syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field static int syscall__set_arg_fmts(struct syscall *sc) { - struct tep_format_field *last_field = syscall_arg_fmt__init_array(sc->arg_fmt, sc->args); + struct tep_format_field *last_field = syscall_arg_fmt__init_array(sc->arg_fmt, sc->args, + &sc->use_btf); if (last_field) sc->args_size = last_field->offset + last_field->size; @@ -1742,43 +2131,24 @@ static int syscall__set_arg_fmts(struct syscall *sc) return 0; } -static int trace__read_syscall_info(struct trace *trace, int id) +static int syscall__read_info(struct syscall *sc, struct trace *trace) { char tp_name[128]; - struct syscall *sc; - const char *name = syscalltbl__name(trace->sctbl, id); - -#ifdef HAVE_SYSCALL_TABLE_SUPPORT - if (trace->syscalls.table == NULL) { - trace->syscalls.table = calloc(trace->sctbl->syscalls.max_id + 1, sizeof(*sc)); - if (trace->syscalls.table == NULL) - return -ENOMEM; - } -#else - if (id > trace->sctbl->syscalls.max_id || (id == 0 && trace->syscalls.table == NULL)) { - // When using libaudit we don't know beforehand what is the max syscall id - struct syscall *table = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc)); - - if (table == NULL) - return -ENOMEM; - - // Need to memset from offset 0 and +1 members if brand new - if (trace->syscalls.table == NULL) - memset(table, 0, (id + 1) * sizeof(*sc)); - else - memset(table + trace->sctbl->syscalls.max_id + 1, 0, (id - trace->sctbl->syscalls.max_id) * sizeof(*sc)); + const char *name; + int err; - trace->syscalls.table = table; - trace->sctbl->syscalls.max_id = id; - } -#endif - sc = trace->syscalls.table + id; if (sc->nonexistent) + return -EEXIST; + + if (sc->name) { + /* Info already read. */ return 0; + } + name = syscalltbl__name(sc->e_machine, sc->id); if (name == NULL) { sc->nonexistent = true; - return 0; + return -EEXIST; } sc->name = name; @@ -1792,11 +2162,23 @@ static int trace__read_syscall_info(struct trace *trace, int id) sc->tp_format = trace_event__tp_format("syscalls", tp_name); } - if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields)) - return -ENOMEM; + /* + * Fails to read trace point format via sysfs node, so the trace point + * doesn't exist. Set the 'nonexistent' flag as true. + */ + if (IS_ERR(sc->tp_format)) { + sc->nonexistent = true; + err = PTR_ERR(sc->tp_format); + sc->tp_format = NULL; + return err; + } - if (IS_ERR(sc->tp_format)) - return PTR_ERR(sc->tp_format); + /* + * The tracepoint format contains __syscall_nr field, so it's one more + * than the actual number of syscall arguments. + */ + if (syscall__alloc_arg_fmts(sc, sc->tp_format->format.nr_fields - 1)) + return -ENOMEM; sc->args = sc->tp_format->format.fields; /* @@ -1812,16 +2194,26 @@ static int trace__read_syscall_info(struct trace *trace, int id) sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit"); sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat"); - return syscall__set_arg_fmts(sc); + err = syscall__set_arg_fmts(sc); + + /* after calling syscall__set_arg_fmts() we'll know whether use_btf is true */ + if (sc->use_btf) + trace__load_vmlinux_btf(trace); + + return err; } -static int evsel__init_tp_arg_scnprintf(struct evsel *evsel) +static int evsel__init_tp_arg_scnprintf(struct evsel *evsel, bool *use_btf) { struct syscall_arg_fmt *fmt = evsel__syscall_arg_fmt(evsel); if (fmt != NULL) { - syscall_arg_fmt__init_array(fmt, evsel->tp_format->format.fields); - return 0; + const struct tep_event *tp_format = evsel__tp_format(evsel); + + if (tp_format) { + syscall_arg_fmt__init_array(fmt, tp_format->format.fields, use_btf); + return 0; + } } return -ENOMEM; @@ -1853,10 +2245,14 @@ static int trace__validate_ev_qualifier(struct trace *trace) strlist__for_each_entry(pos, trace->ev_qualifier) { const char *sc = pos->s; - int id = syscalltbl__id(trace->sctbl, sc), match_next = -1; + /* + * TODO: Assume more than the validation/warnings are all for + * the same binary type as perf. + */ + int id = syscalltbl__id(EM_HOST, sc), match_next = -1; if (id < 0) { - id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next); + id = syscalltbl__strglobmatch_first(EM_HOST, sc, &match_next); if (id >= 0) goto matches; @@ -1876,7 +2272,7 @@ matches: continue; while (1) { - id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next); + id = syscalltbl__strglobmatch_next(EM_HOST, sc, &match_next); if (id < 0) break; if (nr_allocated == nr_used) { @@ -1979,7 +2375,7 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, unsigned char *args, void *augmented_args, int augmented_args_size, struct trace *trace, struct thread *thread) { - size_t printed = 0; + size_t printed = 0, btf_printed; unsigned long val; u8 bit = 1; struct syscall_arg arg = { @@ -1995,6 +2391,7 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, .show_string_prefix = trace->show_string_prefix, }; struct thread_trace *ttrace = thread__priv(thread); + void *default_scnprintf; /* * Things like fcntl will set this in its 'cmd' formatter to pick the @@ -2020,17 +2417,15 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, val = syscall_arg_fmt__mask_val(&sc->arg_fmt[arg.idx], &arg, val); /* - * Suppress this argument if its value is zero and - * and we don't have a string associated in an - * strarray for it. - */ - if (val == 0 && - !trace->show_zeros && - !(sc->arg_fmt && - (sc->arg_fmt[arg.idx].show_zero || - sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY || - sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) && - sc->arg_fmt[arg.idx].parm)) + * Suppress this argument if its value is zero and show_zero + * property isn't set. + * + * If it has a BTF type, then override the zero suppression knob + * as the common case is for zero in an enum to have an associated entry. + */ + if (val == 0 && !trace->show_zeros && + !(sc->arg_fmt && sc->arg_fmt[arg.idx].show_zero) && + !(sc->arg_fmt && sc->arg_fmt[arg.idx].strtoul == STUL_BTF_TYPE)) continue; printed += scnprintf(bf + printed, size - printed, "%s", printed ? ", " : ""); @@ -2038,6 +2433,17 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, if (trace->show_arg_names) printed += scnprintf(bf + printed, size - printed, "%s: ", field->name); + default_scnprintf = sc->arg_fmt[arg.idx].scnprintf; + + if (trace->force_btf || default_scnprintf == NULL || default_scnprintf == SCA_PTR) { + btf_printed = trace__btf_scnprintf(trace, &arg, bf + printed, + size - printed, val, field->type); + if (btf_printed) { + printed += btf_printed; + continue; + } + } + printed += syscall_arg_fmt__scnprintf_val(&sc->arg_fmt[arg.idx], bf + printed, size - printed, &arg, val); } @@ -2064,13 +2470,92 @@ next_arg: return printed; } +static struct syscall *syscall__new(int e_machine, int id) +{ + struct syscall *sc = zalloc(sizeof(*sc)); + + if (!sc) + return NULL; + + sc->e_machine = e_machine; + sc->id = id; + return sc; +} + +static void syscall__delete(struct syscall *sc) +{ + if (!sc) + return; + + free(sc->arg_fmt); + free(sc); +} + +static int syscall__bsearch_cmp(const void *key, const void *entry) +{ + const struct syscall *a = key, *b = *((const struct syscall **)entry); + + if (a->e_machine != b->e_machine) + return a->e_machine - b->e_machine; + + return a->id - b->id; +} + +static int syscall__cmp(const void *va, const void *vb) +{ + const struct syscall *a = *((const struct syscall **)va); + const struct syscall *b = *((const struct syscall **)vb); + + if (a->e_machine != b->e_machine) + return a->e_machine - b->e_machine; + + return a->id - b->id; +} + +static struct syscall *trace__find_syscall(struct trace *trace, int e_machine, int id) +{ + struct syscall key = { + .e_machine = e_machine, + .id = id, + }; + struct syscall *sc, **tmp; + + if (trace->syscalls.table) { + struct syscall **sc_entry = bsearch(&key, trace->syscalls.table, + trace->syscalls.table_size, + sizeof(trace->syscalls.table[0]), + syscall__bsearch_cmp); + + if (sc_entry) + return *sc_entry; + } + + sc = syscall__new(e_machine, id); + if (!sc) + return NULL; + + tmp = reallocarray(trace->syscalls.table, trace->syscalls.table_size + 1, + sizeof(trace->syscalls.table[0])); + if (!tmp) { + syscall__delete(sc); + return NULL; + } + + trace->syscalls.table = tmp; + trace->syscalls.table[trace->syscalls.table_size++] = sc; + qsort(trace->syscalls.table, trace->syscalls.table_size, sizeof(trace->syscalls.table[0]), + syscall__cmp); + return sc; +} + typedef int (*tracepoint_handler)(struct trace *trace, struct evsel *evsel, union perf_event *event, struct perf_sample *sample); -static struct syscall *trace__syscall_info(struct trace *trace, - struct evsel *evsel, int id) +static struct syscall *trace__syscall_info(struct trace *trace, struct evsel *evsel, + int e_machine, int id) { + struct syscall *sc; int err = 0; if (id < 0) { @@ -2095,42 +2580,20 @@ static struct syscall *trace__syscall_info(struct trace *trace, err = -EINVAL; -#ifdef HAVE_SYSCALL_TABLE_SUPPORT - if (id > trace->sctbl->syscalls.max_id) { -#else - if (id >= trace->sctbl->syscalls.max_id) { - /* - * With libaudit we don't know beforehand what is the max_id, - * so we let trace__read_syscall_info() figure that out as we - * go on reading syscalls. - */ - err = trace__read_syscall_info(trace, id); - if (err) -#endif - goto out_cant_read; - } - - if ((trace->syscalls.table == NULL || trace->syscalls.table[id].name == NULL) && - (err = trace__read_syscall_info(trace, id)) != 0) - goto out_cant_read; + sc = trace__find_syscall(trace, e_machine, id); + if (sc) + err = syscall__read_info(sc, trace); - if (trace->syscalls.table[id].name == NULL) { - if (trace->syscalls.table[id].nonexistent) - return NULL; - goto out_cant_read; - } - - return &trace->syscalls.table[id]; - -out_cant_read: - if (verbose > 0) { + if (err && verbose > 0) { char sbuf[STRERR_BUFSIZE]; - fprintf(trace->output, "Problems reading syscall %d: %d (%s)", id, -err, str_error_r(-err, sbuf, sizeof(sbuf))); - if (id <= trace->sctbl->syscalls.max_id && trace->syscalls.table[id].name != NULL) - fprintf(trace->output, "(%s)", trace->syscalls.table[id].name); + + fprintf(trace->output, "Problems reading syscall %d: %d (%s)", id, -err, + str_error_r(-err, sbuf, sizeof(sbuf))); + if (sc && sc->name) + fprintf(trace->output, "(%s)", sc->name); fputs(" information\n", trace->output); } - return NULL; + return err ? NULL : sc; } struct syscall_stats { @@ -2141,27 +2604,29 @@ struct syscall_stats { }; static void thread__update_stats(struct thread *thread, struct thread_trace *ttrace, - int id, struct perf_sample *sample, long err, bool errno_summary) + int id, struct perf_sample *sample, long err, + struct trace *trace) { - struct int_node *inode; - struct syscall_stats *stats; + struct hashmap *syscall_stats = ttrace->syscall_stats; + struct syscall_stats *stats = NULL; u64 duration = 0; - inode = intlist__findnew(ttrace->syscall_stats, id); - if (inode == NULL) + if (trace->summary_bpf) return; - stats = inode->priv; - if (stats == NULL) { - stats = malloc(sizeof(*stats)); + if (trace->summary_mode == SUMMARY__BY_TOTAL) + syscall_stats = trace->syscall_stats; + + if (!hashmap__find(syscall_stats, id, &stats)) { + stats = zalloc(sizeof(*stats)); if (stats == NULL) return; - stats->nr_failures = 0; - stats->max_errno = 0; - stats->errnos = NULL; init_stats(&stats->stats); - inode->priv = stats; + if (hashmap__add(syscall_stats, id, stats) < 0) { + free(stats); + return; + } } if (ttrace->entry_time && sample->time > ttrace->entry_time) @@ -2172,7 +2637,7 @@ static void thread__update_stats(struct thread *thread, struct thread_trace *ttr if (err < 0) { ++stats->nr_failures; - if (!errno_summary) + if (!trace->errno_summary) return; err = -err; @@ -2183,7 +2648,8 @@ static void thread__update_stats(struct thread *thread, struct thread_trace *ttr memset(new_errnos + stats->max_errno, 0, (err - stats->max_errno) * sizeof(u32)); } else { pr_debug("Not enough memory for errno stats for thread \"%s\"(%d/%d), results will be incomplete\n", - thread__comm_str(thread), thread->pid_, thread->tid); + thread__comm_str(thread), thread__pid(thread), + thread__tid(thread)); return; } @@ -2242,7 +2708,6 @@ static int trace__fprintf_sample(struct trace *trace, struct evsel *evsel, static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sample, int *augmented_args_size, int raw_augmented_args_size) { - void *augmented_args = NULL; /* * For now with BPF raw_augmented we hook into raw_syscalls:sys_enter * and there we get all 6 syscall args plus the tracepoint common fields @@ -2260,10 +2725,24 @@ static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sam int args_size = raw_augmented_args_size ?: sc->args_size; *augmented_args_size = sample->raw_size - args_size; - if (*augmented_args_size > 0) - augmented_args = sample->raw_data + args_size; + if (*augmented_args_size > 0) { + static uintptr_t argbuf[1024]; /* assuming single-threaded */ + + if ((size_t)(*augmented_args_size) > sizeof(argbuf)) + return NULL; - return augmented_args; + /* + * The perf ring-buffer is 8-byte aligned but sample->raw_data + * is not because it's preceded by u32 size. Later, beautifier + * will use the augmented args with stricter alignments like in + * some struct. To make sure it's aligned, let's copy the args + * into a static buffer as it's single-threaded for now. + */ + memcpy(argbuf, sample->raw_data + args_size, *augmented_args_size); + + return argbuf; + } + return NULL; } static int trace__sys_enter(struct trace *trace, struct evsel *evsel, @@ -2275,16 +2754,17 @@ static int trace__sys_enter(struct trace *trace, struct evsel *evsel, int printed = 0; struct thread *thread; int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1; - int augmented_args_size = 0; + int augmented_args_size = 0, e_machine; void *augmented_args = NULL; - struct syscall *sc = trace__syscall_info(trace, evsel, id); + struct syscall *sc; struct thread_trace *ttrace; - if (sc == NULL) - return -1; - thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); - ttrace = thread__trace(thread, trace->output); + e_machine = thread__e_machine(thread, trace->host); + sc = trace__syscall_info(trace, evsel, e_machine, id); + if (sc == NULL) + goto out_put; + ttrace = thread__trace(thread, trace); if (ttrace == NULL) goto out_put; @@ -2351,16 +2831,19 @@ static int trace__fprintf_sys_enter(struct trace *trace, struct evsel *evsel, struct thread_trace *ttrace; struct thread *thread; int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1; - struct syscall *sc = trace__syscall_info(trace, evsel, id); + struct syscall *sc; char msg[1024]; void *args, *augmented_args = NULL; - int augmented_args_size; + int augmented_args_size, e_machine; + size_t printed = 0; - if (sc == NULL) - return -1; thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); - ttrace = thread__trace(thread, trace->output); + e_machine = thread__e_machine(thread, trace->host); + sc = trace__syscall_info(trace, evsel, e_machine, id); + if (sc == NULL) + goto out_put; + ttrace = thread__trace(thread, trace); /* * We need to get ttrace just to make sure it is there when syscall__scnprintf_args() * and the rest of the beautifiers accessing it via struct syscall_arg touches it. @@ -2370,8 +2853,8 @@ static int trace__fprintf_sys_enter(struct trace *trace, struct evsel *evsel, args = perf_evsel__sc_tp_ptr(evsel, args, sample); augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size); - syscall__scnprintf_args(sc, msg, sizeof(msg), args, augmented_args, augmented_args_size, trace, thread); - fprintf(trace->output, "%s", msg); + printed += syscall__scnprintf_args(sc, msg, sizeof(msg), args, augmented_args, augmented_args_size, trace, thread); + fprintf(trace->output, "%.*s", (int)printed, msg); err = 0; out_put: thread__put(thread); @@ -2386,13 +2869,15 @@ static int trace__resolve_callchain(struct trace *trace, struct evsel *evsel, int max_stack = evsel->core.attr.sample_max_stack ? evsel->core.attr.sample_max_stack : trace->max_stack; - int err; + int err = -1; + addr_location__init(&al); if (machine__resolve(trace->host, &al, sample) < 0) - return -1; + goto out; err = thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack); - addr_location__put(&al); +out: + addr_location__exit(&al); return err; } @@ -2403,15 +2888,14 @@ static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sam EVSEL__PRINT_DSO | EVSEL__PRINT_UNKNOWN_AS_ADDR; - return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, symbol_conf.bt_stop_list, trace->output); + return sample__fprintf_callchain(sample, 38, print_opts, get_tls_callchain_cursor(), symbol_conf.bt_stop_list, trace->output); } static const char *errno_to_name(struct evsel *evsel, int err) { struct perf_env *env = evsel__env(evsel); - const char *arch_name = perf_env__arch(env); - return arch_syscalls__strerrno(arch_name, err); + return perf_env__arch_strerrno(env, err); } static int trace__sys_exit(struct trace *trace, struct evsel *evsel, @@ -2423,15 +2907,16 @@ static int trace__sys_exit(struct trace *trace, struct evsel *evsel, bool duration_calculated = false; struct thread *thread; int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0, printed = 0; - int alignment = trace->args_alignment; - struct syscall *sc = trace__syscall_info(trace, evsel, id); + int alignment = trace->args_alignment, e_machine; + struct syscall *sc; struct thread_trace *ttrace; - if (sc == NULL) - return -1; - thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); - ttrace = thread__trace(thread, trace->output); + e_machine = thread__e_machine(thread, trace->host); + sc = trace__syscall_info(trace, evsel, e_machine, id); + if (sc == NULL) + goto out_put; + ttrace = thread__trace(thread, trace); if (ttrace == NULL) goto out_put; @@ -2440,7 +2925,7 @@ static int trace__sys_exit(struct trace *trace, struct evsel *evsel, ret = perf_evsel__sc_tp_uint(evsel, ret, sample); if (trace->summary) - thread__update_stats(thread, ttrace, id, sample, ret, trace->errno_summary); + thread__update_stats(thread, ttrace, id, sample, ret, trace); if (!trace->fd_path_disabled && sc->is_open && ret >= 0 && ttrace->filename.pending_open) { trace__set_fd_pathname(thread, ret, ttrace->filename.name); @@ -2457,9 +2942,11 @@ static int trace__sys_exit(struct trace *trace, struct evsel *evsel, goto out; if (sample->callchain) { - callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); + struct callchain_cursor *cursor = get_tls_callchain_cursor(); + + callchain_ret = trace__resolve_callchain(trace, evsel, sample, cursor); if (callchain_ret == 0) { - if (callchain_cursor.nr < trace->min_stack) + if (cursor->nr < trace->min_stack) goto out; callchain_ret = 1; } @@ -2518,9 +3005,9 @@ errno_print: { else if (sc->fmt->errpid) { struct thread *child = machine__find_thread(trace->host, ret, ret); + fprintf(trace->output, "%ld", ret); if (child != NULL) { - fprintf(trace->output, "%ld", ret); - if (child->comm_set) + if (thread__comm_set(child)) fprintf(trace->output, " (%s)", thread__comm_str(child)); thread__put(child); } @@ -2618,7 +3105,7 @@ static int trace__sched_stat_runtime(struct trace *trace, struct evsel *evsel, struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); - struct thread_trace *ttrace = thread__trace(thread, trace->output); + struct thread_trace *ttrace = thread__trace(thread, trace); if (ttrace == NULL) goto out_dump; @@ -2676,9 +3163,10 @@ static size_t trace__fprintf_tp_fields(struct trace *trace, struct evsel *evsel, { char bf[2048]; size_t size = sizeof(bf); - struct tep_format_field *field = evsel->tp_format->format.fields; + const struct tep_event *tp_format = evsel__tp_format(evsel); + struct tep_format_field *field = tp_format ? tp_format->format.fields : NULL; struct syscall_arg_fmt *arg = __evsel__syscall_arg_fmt(evsel); - size_t printed = 0; + size_t printed = 0, btf_printed; unsigned long val; u8 bit = 1; struct syscall_arg syscall_arg = { @@ -2706,6 +3194,8 @@ static size_t trace__fprintf_tp_fields(struct trace *trace, struct evsel *evsel, offset = format_field__intval(field, sample, evsel->needs_swap); syscall_arg.len = offset >> 16; offset &= 0xffff; + if (tep_field_is_relative(field->flags)) + offset += field->offset + field->size; } val = (uintptr_t)(sample->raw_data + offset); @@ -2717,32 +3207,25 @@ static size_t trace__fprintf_tp_fields(struct trace *trace, struct evsel *evsel, */ val = syscall_arg_fmt__mask_val(arg, &syscall_arg, val); - /* - * Suppress this argument if its value is zero and - * and we don't have a string associated in an - * strarray for it. - */ - if (val == 0 && - !trace->show_zeros && - !((arg->show_zero || - arg->scnprintf == SCA_STRARRAY || - arg->scnprintf == SCA_STRARRAYS) && - arg->parm)) + /* Suppress this argument if its value is zero and show_zero property isn't set. */ + if (val == 0 && !trace->show_zeros && !arg->show_zero && arg->strtoul != STUL_BTF_TYPE) continue; printed += scnprintf(bf + printed, size - printed, "%s", printed ? ", " : ""); - /* - * XXX Perhaps we should have a show_tp_arg_names, - * leaving show_arg_names just for syscalls? - */ - if (1 || trace->show_arg_names) + if (trace->show_arg_names) printed += scnprintf(bf + printed, size - printed, "%s: ", field->name); + btf_printed = trace__btf_scnprintf(trace, &syscall_arg, bf + printed, size - printed, val, field->type); + if (btf_printed) { + printed += btf_printed; + continue; + } + printed += syscall_arg_fmt__scnprintf_val(arg, bf + printed, size - printed, &syscall_arg, val); } - return printed + fprintf(trace->output, "%s", bf); + return fprintf(trace->output, "%.*s", (int)printed, bf); } static int trace__event_handler(struct trace *trace, struct evsel *evsel, @@ -2751,21 +3234,18 @@ static int trace__event_handler(struct trace *trace, struct evsel *evsel, { struct thread *thread; int callchain_ret = 0; - /* - * Check if we called perf_evsel__disable(evsel) due to, for instance, - * this event's max_events having been hit and this is an entry coming - * from the ring buffer that we should discard, since the max events - * have already been considered/printed. - */ - if (evsel->disabled) + + if (evsel->nr_events_printed >= evsel->max_events) return 0; thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); if (sample->callchain) { - callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); + struct callchain_cursor *cursor = get_tls_callchain_cursor(); + + callchain_ret = trace__resolve_callchain(trace, evsel, sample, cursor); if (callchain_ret == 0) { - if (callchain_cursor.nr < trace->min_stack) + if (cursor->nr < trace->min_stack) goto out; callchain_ret = 1; } @@ -2780,9 +3260,10 @@ static int trace__event_handler(struct trace *trace, struct evsel *evsel, if (thread) trace__fprintf_comm_tid(trace, thread, trace->output); - if (evsel == trace->syscalls.events.augmented) { + if (evsel == trace->syscalls.events.bpf_output) { int id = perf_evsel__sc_tp_uint(evsel, id, sample); - struct syscall *sc = trace__syscall_info(trace, evsel, id); + int e_machine = thread ? thread__e_machine(thread, trace->host) : EM_HOST; + struct syscall *sc = trace__syscall_info(trace, evsel, e_machine, id); if (sc) { fprintf(trace->output, "%s(", sc->name); @@ -2802,11 +3283,13 @@ static int trace__event_handler(struct trace *trace, struct evsel *evsel, if (evsel__is_bpf_output(evsel)) { bpf_output__fprintf(trace, sample); - } else if (evsel->tp_format) { - if (strncmp(evsel->tp_format->name, "sys_enter_", 10) || - trace__fprintf_sys_enter(trace, evsel, sample)) { + } else { + const struct tep_event *tp_format = evsel__tp_format(evsel); + + if (tp_format && (strncmp(tp_format->name, "sys_enter_", 10) || + trace__fprintf_sys_enter(trace, evsel, sample))) { if (trace->libtraceevent_print) { - event_format__fprintf(evsel->tp_format, sample->cpu, + event_format__fprintf(tp_format, sample->cpu, sample->raw_data, sample->raw_size, trace->output); } else { @@ -2840,7 +3323,7 @@ static void print_location(FILE *f, struct perf_sample *sample, { if ((verbose > 0 || print_dso) && al->map) - fprintf(f, "%s@", al->map->dso->long_name); + fprintf(f, "%s@", dso__long_name(map__dso(al->map))); if ((verbose > 0 || print_sym) && al->sym) fprintf(f, "%s+0x%" PRIx64, al->sym->name, @@ -2863,25 +3346,31 @@ static int trace__pgfault(struct trace *trace, int err = -1; int callchain_ret = 0; + addr_location__init(&al); thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); if (sample->callchain) { - callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); + struct callchain_cursor *cursor = get_tls_callchain_cursor(); + + callchain_ret = trace__resolve_callchain(trace, evsel, sample, cursor); if (callchain_ret == 0) { - if (callchain_cursor.nr < trace->min_stack) + if (cursor->nr < trace->min_stack) goto out_put; callchain_ret = 1; } } - ttrace = thread__trace(thread, trace->output); + ttrace = thread__trace(thread, trace); if (ttrace == NULL) goto out_put; - if (evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ) + if (evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ) { ttrace->pfmaj++; - else + trace->pfmaj++; + } else { ttrace->pfmin++; + trace->pfmin++; + } if (trace->summary_only) goto out; @@ -2923,6 +3412,7 @@ out: err = 0; out_put: thread__put(thread); + addr_location__exit(&al); return err; } @@ -2943,7 +3433,7 @@ static void trace__set_base_time(struct trace *trace, trace->base_time = sample->time; } -static int trace__process_sample(struct perf_tool *tool, +static int trace__process_sample(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, @@ -3039,6 +3529,7 @@ out_free: } static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp); +static size_t trace__fprintf_total_summary(struct trace *trace, FILE *fp); static bool evlist__add_vfs_getname(struct evlist *evlist) { @@ -3047,15 +3538,11 @@ static bool evlist__add_vfs_getname(struct evlist *evlist) struct parse_events_error err; int ret; - bzero(&err, sizeof(err)); + parse_events_error__init(&err); ret = parse_events(evlist, "probe:vfs_getname*", &err); - if (ret) { - free(err.str); - free(err.help); - free(err.first_str); - free(err.first_help); + parse_events_error__exit(&err); + if (ret) return false; - } evlist__for_each_entry_safe(evlist, evsel, tmp) { if (!strstarts(evsel__name(evsel), "probe:vfs_getname")) @@ -3095,6 +3582,16 @@ static struct evsel *evsel__new_pgfault(u64 config) return evsel; } +static void evlist__free_syscall_tp_fields(struct evlist *evlist) +{ + struct evsel *evsel; + + evlist__for_each_entry(evlist, evsel) { + evsel_trace__delete(evsel->priv); + evsel->priv = NULL; + } +} + static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample) { const u32 type = event->header.type; @@ -3105,7 +3602,7 @@ static void trace__handle_event(struct trace *trace, union perf_event *event, st return; } - evsel = perf_evlist__id2evsel(trace->evlist, sample->id); + evsel = evlist__id2evsel(trace->evlist, sample->id); if (evsel == NULL) { fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id); return; @@ -3203,33 +3700,41 @@ out_enomem: goto out; } -#ifdef HAVE_LIBBPF_SUPPORT -static struct bpf_map *trace__find_bpf_map_by_name(struct trace *trace, const char *name) +#ifdef HAVE_BPF_SKEL +static int syscall_arg_fmt__cache_btf_struct(struct syscall_arg_fmt *arg_fmt, struct btf *btf, char *type) { - if (trace->bpf_obj == NULL) - return NULL; + int id; - return bpf_object__find_map_by_name(trace->bpf_obj, name); -} + if (arg_fmt->type != NULL) + return -1; -static void trace__set_bpf_map_filtered_pids(struct trace *trace) -{ - trace->filter_pids.map = trace__find_bpf_map_by_name(trace, "pids_filtered"); -} + id = btf__find_by_name(btf, type); + if (id < 0) + return -1; -static void trace__set_bpf_map_syscalls(struct trace *trace) -{ - trace->syscalls.map = trace__find_bpf_map_by_name(trace, "syscalls"); - trace->syscalls.prog_array.sys_enter = trace__find_bpf_map_by_name(trace, "syscalls_sys_enter"); - trace->syscalls.prog_array.sys_exit = trace__find_bpf_map_by_name(trace, "syscalls_sys_exit"); + arg_fmt->type = btf__type_by_id(btf, id); + arg_fmt->type_id = id; + + return 0; } static struct bpf_program *trace__find_bpf_program_by_title(struct trace *trace, const char *name) { - if (trace->bpf_obj == NULL) + struct bpf_program *pos, *prog = NULL; + const char *sec_name; + + if (trace->skel->obj == NULL) return NULL; - return bpf_object__find_program_by_title(trace->bpf_obj, name); + bpf_object__for_each_program(pos, trace->skel->obj) { + sec_name = bpf_program__section_name(pos); + if (sec_name && !strcmp(sec_name, name)) { + prog = pos; + break; + } + } + + return prog; } static struct bpf_program *trace__find_syscall_bpf_prog(struct trace *trace, struct syscall *sc, @@ -3239,12 +3744,12 @@ static struct bpf_program *trace__find_syscall_bpf_prog(struct trace *trace, str if (prog_name == NULL) { char default_prog_name[256]; - scnprintf(default_prog_name, sizeof(default_prog_name), "!syscalls:sys_%s_%s", type, sc->name); + scnprintf(default_prog_name, sizeof(default_prog_name), "tp/syscalls/sys_%s_%s", type, sc->name); prog = trace__find_bpf_program_by_title(trace, default_prog_name); if (prog != NULL) goto out_found; if (sc->fmt && sc->fmt->alias) { - scnprintf(default_prog_name, sizeof(default_prog_name), "!syscalls:sys_%s_%s", type, sc->fmt->alias); + scnprintf(default_prog_name, sizeof(default_prog_name), "tp/syscalls/sys_%s_%s", type, sc->fmt->alias); prog = trace__find_bpf_program_by_title(trace, default_prog_name); if (prog != NULL) goto out_found; @@ -3262,12 +3767,12 @@ out_found: pr_debug("Couldn't find BPF prog \"%s\" to associate with syscalls:sys_%s_%s, not augmenting it\n", prog_name, type, sc->name); out_unaugmented: - return trace->syscalls.unaugmented_prog; + return trace->skel->progs.syscall_unaugmented; } -static void trace__init_syscall_bpf_progs(struct trace *trace, int id) +static void trace__init_syscall_bpf_progs(struct trace *trace, int e_machine, int id) { - struct syscall *sc = trace__syscall_info(trace, NULL, id); + struct syscall *sc = trace__syscall_info(trace, NULL, e_machine, id); if (sc == NULL) return; @@ -3276,97 +3781,107 @@ static void trace__init_syscall_bpf_progs(struct trace *trace, int id) sc->bpf_prog.sys_exit = trace__find_syscall_bpf_prog(trace, sc, sc->fmt ? sc->fmt->bpf_prog_name.sys_exit : NULL, "exit"); } -static int trace__bpf_prog_sys_enter_fd(struct trace *trace, int id) +static int trace__bpf_prog_sys_enter_fd(struct trace *trace, int e_machine, int id) { - struct syscall *sc = trace__syscall_info(trace, NULL, id); - return sc ? bpf_program__fd(sc->bpf_prog.sys_enter) : bpf_program__fd(trace->syscalls.unaugmented_prog); + struct syscall *sc = trace__syscall_info(trace, NULL, e_machine, id); + return sc ? bpf_program__fd(sc->bpf_prog.sys_enter) : bpf_program__fd(trace->skel->progs.syscall_unaugmented); } -static int trace__bpf_prog_sys_exit_fd(struct trace *trace, int id) +static int trace__bpf_prog_sys_exit_fd(struct trace *trace, int e_machine, int id) { - struct syscall *sc = trace__syscall_info(trace, NULL, id); - return sc ? bpf_program__fd(sc->bpf_prog.sys_exit) : bpf_program__fd(trace->syscalls.unaugmented_prog); + struct syscall *sc = trace__syscall_info(trace, NULL, e_machine, id); + return sc ? bpf_program__fd(sc->bpf_prog.sys_exit) : bpf_program__fd(trace->skel->progs.syscall_unaugmented); } -static void trace__init_bpf_map_syscall_args(struct trace *trace, int id, struct bpf_map_syscall_entry *entry) +static int trace__bpf_sys_enter_beauty_map(struct trace *trace, int e_machine, int key, unsigned int *beauty_array) { - struct syscall *sc = trace__syscall_info(trace, NULL, id); - int arg = 0; + struct tep_format_field *field; + struct syscall *sc = trace__syscall_info(trace, NULL, e_machine, key); + const struct btf_type *bt; + char *struct_offset, *tmp, name[32]; + bool can_augment = false; + int i, cnt; if (sc == NULL) - goto out; + return -1; - for (; arg < sc->nr_args; ++arg) { - entry->string_args_len[arg] = 0; - if (sc->arg_fmt[arg].scnprintf == SCA_FILENAME) { - /* Should be set like strace -s strsize */ - entry->string_args_len[arg] = PATH_MAX; - } - } -out: - for (; arg < 6; ++arg) - entry->string_args_len[arg] = 0; -} -static int trace__set_ev_qualifier_bpf_filter(struct trace *trace) -{ - int fd = bpf_map__fd(trace->syscalls.map); - struct bpf_map_syscall_entry value = { - .enabled = !trace->not_ev_qualifier, - }; - int err = 0; - size_t i; + trace__load_vmlinux_btf(trace); + if (trace->btf == NULL) + return -1; - for (i = 0; i < trace->ev_qualifier_ids.nr; ++i) { - int key = trace->ev_qualifier_ids.entries[i]; + for (i = 0, field = sc->args; field; ++i, field = field->next) { + // XXX We're only collecting pointer payloads _from_ user space + if (!sc->arg_fmt[i].from_user) + continue; - if (value.enabled) { - trace__init_bpf_map_syscall_args(trace, key, &value); - trace__init_syscall_bpf_progs(trace, key); - } + struct_offset = strstr(field->type, "struct "); + if (struct_offset == NULL) + struct_offset = strstr(field->type, "union "); + else + struct_offset++; // "union" is shorter - err = bpf_map_update_elem(fd, &key, &value, BPF_EXIST); - if (err) - break; - } + if (field->flags & TEP_FIELD_IS_POINTER && struct_offset) { /* struct or union (think BPF's attr arg) */ + struct_offset += 6; - return err; -} + /* for 'struct foo *', we only want 'foo' */ + for (tmp = struct_offset, cnt = 0; *tmp != ' ' && *tmp != '\0'; ++tmp, ++cnt) { + } -static int __trace__init_syscalls_bpf_map(struct trace *trace, bool enabled) -{ - int fd = bpf_map__fd(trace->syscalls.map); - struct bpf_map_syscall_entry value = { - .enabled = enabled, - }; - int err = 0, key; + strncpy(name, struct_offset, cnt); + name[cnt] = '\0'; - for (key = 0; key < trace->sctbl->syscalls.nr_entries; ++key) { - if (enabled) - trace__init_bpf_map_syscall_args(trace, key, &value); + /* cache struct's btf_type and type_id */ + if (syscall_arg_fmt__cache_btf_struct(&sc->arg_fmt[i], trace->btf, name)) + continue; - err = bpf_map_update_elem(fd, &key, &value, BPF_ANY); - if (err) - break; + bt = sc->arg_fmt[i].type; + beauty_array[i] = bt->size; + can_augment = true; + } else if (field->flags & TEP_FIELD_IS_POINTER && /* string */ + strcmp(field->type, "const char *") == 0 && + (strstr(field->name, "name") || + strstr(field->name, "path") || + strstr(field->name, "file") || + strstr(field->name, "root") || + strstr(field->name, "key") || + strstr(field->name, "special") || + strstr(field->name, "type") || + strstr(field->name, "description"))) { + beauty_array[i] = 1; + can_augment = true; + } else if (field->flags & TEP_FIELD_IS_POINTER && /* buffer */ + strstr(field->type, "char *") && + (strstr(field->name, "buf") || + strstr(field->name, "val") || + strstr(field->name, "msg"))) { + int j; + struct tep_format_field *field_tmp; + + /* find the size of the buffer that appears in pairs with buf */ + for (j = 0, field_tmp = sc->args; field_tmp; ++j, field_tmp = field_tmp->next) { + if (!(field_tmp->flags & TEP_FIELD_IS_POINTER) && /* only integers */ + (strstr(field_tmp->name, "count") || + strstr(field_tmp->name, "siz") || /* size, bufsiz */ + (strstr(field_tmp->name, "len") && strcmp(field_tmp->name, "filename")))) { + /* filename's got 'len' in it, we don't want that */ + beauty_array[i] = -(j + 1); + can_augment = true; + break; + } + } + } } - return err; -} - -static int trace__init_syscalls_bpf_map(struct trace *trace) -{ - bool enabled = true; - - if (trace->ev_qualifier_ids.nr) - enabled = trace->not_ev_qualifier; + if (can_augment) + return 0; - return __trace__init_syscalls_bpf_map(trace, enabled); + return -1; } -static struct bpf_program *trace__find_usable_bpf_prog_entry(struct trace *trace, struct syscall *sc) +static struct bpf_program *trace__find_usable_bpf_prog_entry(struct trace *trace, + struct syscall *sc) { struct tep_format_field *field, *candidate_field; - int id; - /* * We're only interested in syscalls that have a pointer: */ @@ -3378,13 +3893,14 @@ static struct bpf_program *trace__find_usable_bpf_prog_entry(struct trace *trace return NULL; try_to_find_pair: - for (id = 0; id < trace->sctbl->syscalls.nr_entries; ++id) { - struct syscall *pair = trace__syscall_info(trace, NULL, id); + for (int i = 0, num_idx = syscalltbl__num_idx(sc->e_machine); i < num_idx; ++i) { + int id = syscalltbl__id_at_idx(sc->e_machine, i); + struct syscall *pair = trace__syscall_info(trace, NULL, sc->e_machine, id); struct bpf_program *pair_prog; bool is_candidate = false; - if (pair == NULL || pair == sc || - pair->bpf_prog.sys_enter == trace->syscalls.unaugmented_prog) + if (pair == NULL || pair->id == sc->id || + pair->bpf_prog.sys_enter == trace->skel->progs.syscall_unaugmented) continue; for (field = sc->args, candidate_field = pair->args; @@ -3408,6 +3924,19 @@ try_to_find_pair: if (strcmp(field->type, candidate_field->type)) goto next_candidate; + /* + * This is limited in the BPF program but sys_write + * uses "const char *" for its "buf" arg so we need to + * use some heuristic that is kinda future proof... + */ + if (strcmp(field->type, "const char *") == 0 && + !(strstr(field->name, "name") || + strstr(field->name, "path") || + strstr(field->name, "file") || + strstr(field->name, "root") || + strstr(field->name, "description"))) + goto next_candidate; + is_candidate = true; } @@ -3437,11 +3966,12 @@ try_to_find_pair: */ if (pair_prog == NULL) { pair_prog = trace__find_syscall_bpf_prog(trace, pair, pair->fmt ? pair->fmt->bpf_prog_name.sys_enter : NULL, "enter"); - if (pair_prog == trace->syscalls.unaugmented_prog) + if (pair_prog == trace->skel->progs.syscall_unaugmented) goto next_candidate; } - pr_debug("Reusing \"%s\" BPF sys_enter augmenter for \"%s\"\n", pair->name, sc->name); + pr_debug("Reusing \"%s\" BPF sys_enter augmenter for \"%s\"\n", pair->name, + sc->name); return pair_prog; next_candidate: continue; @@ -3450,29 +3980,40 @@ try_to_find_pair: return NULL; } -static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace) +static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace, int e_machine) { - int map_enter_fd = bpf_map__fd(trace->syscalls.prog_array.sys_enter), - map_exit_fd = bpf_map__fd(trace->syscalls.prog_array.sys_exit); - int err = 0, key; + int map_enter_fd = bpf_map__fd(trace->skel->maps.syscalls_sys_enter); + int map_exit_fd = bpf_map__fd(trace->skel->maps.syscalls_sys_exit); + int beauty_map_fd = bpf_map__fd(trace->skel->maps.beauty_map_enter); + int err = 0; + unsigned int beauty_array[6]; - for (key = 0; key < trace->sctbl->syscalls.nr_entries; ++key) { - int prog_fd; + for (int i = 0, num_idx = syscalltbl__num_idx(e_machine); i < num_idx; ++i) { + int prog_fd, key = syscalltbl__id_at_idx(e_machine, i); if (!trace__syscall_enabled(trace, key)) continue; - trace__init_syscall_bpf_progs(trace, key); + trace__init_syscall_bpf_progs(trace, e_machine, key); // It'll get at least the "!raw_syscalls:unaugmented" - prog_fd = trace__bpf_prog_sys_enter_fd(trace, key); + prog_fd = trace__bpf_prog_sys_enter_fd(trace, e_machine, key); err = bpf_map_update_elem(map_enter_fd, &key, &prog_fd, BPF_ANY); if (err) break; - prog_fd = trace__bpf_prog_sys_exit_fd(trace, key); + prog_fd = trace__bpf_prog_sys_exit_fd(trace, e_machine, key); err = bpf_map_update_elem(map_exit_fd, &key, &prog_fd, BPF_ANY); if (err) break; + + /* use beauty_map to tell BPF how many bytes to collect, set beauty_map's value here */ + memset(beauty_array, 0, sizeof(beauty_array)); + err = trace__bpf_sys_enter_beauty_map(trace, e_machine, key, (unsigned int *)beauty_array); + if (err) + continue; + err = bpf_map_update_elem(beauty_map_fd, &key, beauty_array, BPF_ANY); + if (err) + break; } /* @@ -3503,8 +4044,9 @@ static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace) * first and second arg (this one on the raw_syscalls:sys_exit prog * array tail call, then that one will be used. */ - for (key = 0; key < trace->sctbl->syscalls.nr_entries; ++key) { - struct syscall *sc = trace__syscall_info(trace, NULL, key); + for (int i = 0, num_idx = syscalltbl__num_idx(e_machine); i < num_idx; ++i) { + int key = syscalltbl__id_at_idx(e_machine, i); + struct syscall *sc = trace__syscall_info(trace, NULL, e_machine, key); struct bpf_program *pair_prog; int prog_fd; @@ -3515,7 +4057,7 @@ static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace) * For now we're just reusing the sys_enter prog, and if it * already has an augmenter, we don't need to find one. */ - if (sc->bpf_prog.sys_enter != trace->syscalls.unaugmented_prog) + if (sc->bpf_prog.sys_enter != trace->skel->progs.syscall_unaugmented) continue; /* @@ -3538,89 +4080,12 @@ static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace) break; } - return err; } - -static void trace__delete_augmented_syscalls(struct trace *trace) -{ - struct evsel *evsel, *tmp; - - evlist__remove(trace->evlist, trace->syscalls.events.augmented); - evsel__delete(trace->syscalls.events.augmented); - trace->syscalls.events.augmented = NULL; - - evlist__for_each_entry_safe(trace->evlist, tmp, evsel) { - if (evsel->bpf_obj == trace->bpf_obj) { - evlist__remove(trace->evlist, evsel); - evsel__delete(evsel); - } - - } - - bpf_object__close(trace->bpf_obj); - trace->bpf_obj = NULL; -} -#else // HAVE_LIBBPF_SUPPORT -static struct bpf_map *trace__find_bpf_map_by_name(struct trace *trace __maybe_unused, - const char *name __maybe_unused) -{ - return NULL; -} - -static void trace__set_bpf_map_filtered_pids(struct trace *trace __maybe_unused) -{ -} - -static void trace__set_bpf_map_syscalls(struct trace *trace __maybe_unused) -{ -} - -static int trace__set_ev_qualifier_bpf_filter(struct trace *trace __maybe_unused) -{ - return 0; -} - -static int trace__init_syscalls_bpf_map(struct trace *trace __maybe_unused) -{ - return 0; -} - -static struct bpf_program *trace__find_bpf_program_by_title(struct trace *trace __maybe_unused, - const char *name __maybe_unused) -{ - return NULL; -} - -static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace __maybe_unused) -{ - return 0; -} - -static void trace__delete_augmented_syscalls(struct trace *trace __maybe_unused) -{ -} -#endif // HAVE_LIBBPF_SUPPORT - -static bool trace__only_augmented_syscalls_evsels(struct trace *trace) -{ - struct evsel *evsel; - - evlist__for_each_entry(trace->evlist, evsel) { - if (evsel == trace->syscalls.events.augmented || - evsel->bpf_obj == trace->bpf_obj) - continue; - - return false; - } - - return true; -} +#endif // HAVE_BPF_SKEL static int trace__set_ev_qualifier_filter(struct trace *trace) { - if (trace->syscalls.map) - return trace__set_ev_qualifier_bpf_filter(trace); if (trace->syscalls.events.sys_enter) return trace__set_ev_qualifier_tp_filter(trace); return 0; @@ -3653,20 +4118,25 @@ static int trace__set_filter_loop_pids(struct trace *trace) struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]); while (thread && nr < ARRAY_SIZE(pids)) { - struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid); + struct thread *parent = machine__find_thread(trace->host, + thread__ppid(thread), + thread__ppid(thread)); if (parent == NULL) break; if (!strcmp(thread__comm_str(parent), "sshd") || strstarts(thread__comm_str(parent), "gnome-terminal")) { - pids[nr++] = parent->tid; + pids[nr++] = thread__tid(parent); + thread__put(parent); break; } + thread__put(thread); thread = parent; } + thread__put(thread); - err = perf_evlist__append_tp_filter_pids(trace->evlist, nr, pids); + err = evlist__append_tp_filter_pids(trace->evlist, nr, pids); if (!err && trace->filter_pids.map) err = bpf_map__set_filter_pids(trace->filter_pids.map, nr, pids); @@ -3680,11 +4150,11 @@ static int trace__set_filter_pids(struct trace *trace) * Better not use !target__has_task() here because we need to cover the * case where no threads were specified in the command line, but a * workload was, and in that case we will fill in the thread_map when - * we fork the workload in perf_evlist__prepare_workload. + * we fork the workload in evlist__prepare_workload. */ if (trace->filter_pids.nr > 0) { - err = perf_evlist__append_tp_filter_pids(trace->evlist, trace->filter_pids.nr, - trace->filter_pids.entries); + err = evlist__append_tp_filter_pids(trace->evlist, trace->filter_pids.nr, + trace->filter_pids.entries); if (!err && trace->filter_pids.map) { err = bpf_map__set_filter_pids(trace->filter_pids.map, trace->filter_pids.nr, trace->filter_pids.entries); @@ -3702,12 +4172,14 @@ static int __trace__deliver_event(struct trace *trace, union perf_event *event) struct perf_sample sample; int err; - err = perf_evlist__parse_sample(evlist, event, &sample); + perf_sample__init(&sample, /*all=*/false); + err = evlist__parse_sample(evlist, event, &sample); if (err) fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err); else trace__handle_event(trace, event, &sample); + perf_sample__exit(&sample); return 0; } @@ -3735,11 +4207,11 @@ static int trace__deliver_event(struct trace *trace, union perf_event *event) if (!trace->sort_events) return __trace__deliver_event(trace, event); - err = perf_evlist__parse_sample_timestamp(trace->evlist, event, &trace->oe.last); + err = evlist__parse_sample_timestamp(trace->evlist, event, &trace->oe.last); if (err && err != -1) return err; - err = ordered_events__queue(&trace->oe.data, event, trace->oe.last, 0); + err = ordered_events__queue(&trace->oe.data, event, trace->oe.last, 0, NULL); if (err) return err; @@ -3754,22 +4226,31 @@ static int ordered_events__deliver_event(struct ordered_events *oe, return __trace__deliver_event(trace, event->event); } -static struct syscall_arg_fmt *evsel__find_syscall_arg_fmt_by_name(struct evsel *evsel, char *arg) +static struct syscall_arg_fmt *evsel__find_syscall_arg_fmt_by_name(struct evsel *evsel, char *arg, + char **type) { - struct tep_format_field *field; struct syscall_arg_fmt *fmt = __evsel__syscall_arg_fmt(evsel); + const struct tep_event *tp_format; - if (evsel->tp_format == NULL || fmt == NULL) + if (!fmt) return NULL; - for (field = evsel->tp_format->format.fields; field; field = field->next, ++fmt) - if (strcmp(field->name, arg) == 0) + tp_format = evsel__tp_format(evsel); + if (!tp_format) + return NULL; + + for (const struct tep_format_field *field = tp_format->format.fields; field; + field = field->next, ++fmt) { + if (strcmp(field->name, arg) == 0) { + *type = field->type; return fmt; + } + } return NULL; } -static int trace__expand_filter(struct trace *trace __maybe_unused, struct evsel *evsel) +static int trace__expand_filter(struct trace *trace, struct evsel *evsel) { char *tok, *left = evsel->filter, *new_filter = evsel->filter; @@ -3802,14 +4283,14 @@ static int trace__expand_filter(struct trace *trace __maybe_unused, struct evsel struct syscall_arg_fmt *fmt; int left_size = tok - left, right_size = right_end - right; - char arg[128]; + char arg[128], *type; while (isspace(left[left_size - 1])) --left_size; scnprintf(arg, sizeof(arg), "%.*s", left_size, left); - fmt = evsel__find_syscall_arg_fmt_by_name(evsel, arg); + fmt = evsel__find_syscall_arg_fmt_by_name(evsel, arg, &type); if (fmt == NULL) { pr_err("\"%s\" not found in \"%s\", can't set filter \"%s\"\n", arg, evsel->name, evsel->filter); @@ -3822,6 +4303,9 @@ static int trace__expand_filter(struct trace *trace __maybe_unused, struct evsel if (fmt->strtoul) { u64 val; struct syscall_arg syscall_arg = { + .trace = trace, + .fmt = fmt, + .type_name = type, .parm = fmt->parm, }; @@ -3896,6 +4380,14 @@ static int trace__run(struct trace *trace, int argc, const char **argv) trace->live = true; + if (trace->summary_bpf) { + if (trace_prepare_bpf_summary(trace->summary_mode) < 0) + goto out_delete_evlist; + + if (trace->summary_only) + goto create_maps; + } + if (!trace->raw_augmented_syscalls) { if (trace->trace_syscalls && trace__add_syscall_newtp(trace)) goto out_error_raw_syscalls; @@ -3920,6 +4412,9 @@ static int trace__run(struct trace *trace, int argc, const char **argv) evlist__add(evlist, pgfault_min); } + /* Enable ignoring missing threads when -u/-p option is defined. */ + trace->opts.ignore_missing_thread = trace->opts.target.uid != UINT_MAX || trace->opts.target.pid; + if (trace->sched && evlist__add_newtp(evlist, "sched", "sched_stat_runtime", trace__sched_stat_runtime)) goto out_error_sched_stat_runtime; @@ -3951,7 +4446,8 @@ static int trace__run(struct trace *trace, int argc, const char **argv) if (trace->cgroup) evlist__set_default_cgroup(trace->evlist, trace->cgroup); - err = perf_evlist__create_maps(evlist, &trace->opts.target); +create_maps: + err = evlist__create_maps(evlist, &trace->opts.target); if (err < 0) { fprintf(trace->output, "Problems parsing the target to trace, check your options!\n"); goto out_delete_evlist; @@ -3963,43 +4459,61 @@ static int trace__run(struct trace *trace, int argc, const char **argv) goto out_delete_evlist; } - perf_evlist__config(evlist, &trace->opts, &callchain_param); + if (trace->summary_mode == SUMMARY__BY_TOTAL && !trace->summary_bpf) { + trace->syscall_stats = alloc_syscall_stats(); + if (trace->syscall_stats == NULL) + goto out_delete_evlist; + } - signal(SIGCHLD, sig_handler); - signal(SIGINT, sig_handler); + evlist__config(evlist, &trace->opts, &callchain_param); if (forks) { - err = perf_evlist__prepare_workload(evlist, &trace->opts.target, - argv, false, NULL); + err = evlist__prepare_workload(evlist, &trace->opts.target, argv, false, NULL); if (err < 0) { fprintf(trace->output, "Couldn't run the workload!\n"); goto out_delete_evlist; } + workload_pid = evlist->workload.pid; } err = evlist__open(evlist); if (err < 0) goto out_error_open; +#ifdef HAVE_BPF_SKEL + if (trace->syscalls.events.bpf_output) { + struct perf_cpu cpu; - err = bpf__apply_obj_config(); - if (err) { - char errbuf[BUFSIZ]; - - bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf)); - pr_err("ERROR: Apply config to BPF failed: %s\n", - errbuf); - goto out_error_open; + /* + * Set up the __augmented_syscalls__ BPF map to hold for each + * CPU the bpf-output event's file descriptor. + */ + perf_cpu_map__for_each_cpu(cpu, i, trace->syscalls.events.bpf_output->core.cpus) { + int mycpu = cpu.cpu; + + bpf_map__update_elem(trace->skel->maps.__augmented_syscalls__, + &mycpu, sizeof(mycpu), + xyarray__entry(trace->syscalls.events.bpf_output->core.fd, + mycpu, 0), + sizeof(__u32), BPF_ANY); + } } + if (trace->skel) + trace->filter_pids.map = trace->skel->maps.pids_filtered; +#endif err = trace__set_filter_pids(trace); if (err < 0) goto out_error_mem; - if (trace->syscalls.map) - trace__init_syscalls_bpf_map(trace); - - if (trace->syscalls.prog_array.sys_enter) - trace__init_syscalls_bpf_prog_array_maps(trace); +#ifdef HAVE_BPF_SKEL + if (trace->skel && trace->skel->progs.sys_enter) { + /* + * TODO: Initialize for all host binary machine types, not just + * those matching the perf binary. + */ + trace__init_syscalls_bpf_prog_array_maps(trace, EM_HOST); + } +#endif if (trace->ev_qualifier_ids.nr > 0) { err = trace__set_ev_qualifier_filter(trace); @@ -4023,36 +4537,39 @@ static int trace__run(struct trace *trace, int argc, const char **argv) * So just disable this beautifier (SCA_FD, SCA_FDAT) when 'close' is * not in use. */ - trace->fd_path_disabled = !trace__syscall_enabled(trace, syscalltbl__id(trace->sctbl, "close")); + /* TODO: support for more than just perf binary machine type close. */ + trace->fd_path_disabled = !trace__syscall_enabled(trace, syscalltbl__id(EM_HOST, "close")); err = trace__expand_filters(trace, &evsel); if (err) goto out_delete_evlist; - err = perf_evlist__apply_filters(evlist, &evsel); + err = evlist__apply_filters(evlist, &evsel, &trace->opts.target); if (err < 0) goto out_error_apply_filters; - if (trace->dump.map) - bpf_map__fprintf(trace->dump.map, trace->output); - - err = evlist__mmap(evlist, trace->opts.mmap_pages); - if (err < 0) - goto out_error_mmap; + if (!trace->summary_only || !trace->summary_bpf) { + err = evlist__mmap(evlist, trace->opts.mmap_pages); + if (err < 0) + goto out_error_mmap; + } - if (!target__none(&trace->opts.target) && !trace->opts.initial_delay) + if (!target__none(&trace->opts.target) && !trace->opts.target.initial_delay) evlist__enable(evlist); if (forks) - perf_evlist__start_workload(evlist); + evlist__start_workload(evlist); - if (trace->opts.initial_delay) { - usleep(trace->opts.initial_delay * 1000); + if (trace->opts.target.initial_delay) { + usleep(trace->opts.target.initial_delay * 1000); evlist__enable(evlist); } + if (trace->summary_bpf) + trace_start_bpf_summary(); + trace->multiple_threads = perf_thread_map__pid(evlist->core.threads, 0) == -1 || - evlist->core.threads->nr > 1 || - evlist__first(evlist)->core.attr.inherit; + perf_thread_map__nr(evlist->core.threads) > 1 || + evlist__first(evlist)->core.attr.inherit; /* * Now that we already used evsel->core.attr to ask the kernel to setup the @@ -4117,12 +4634,21 @@ out_disable: evlist__disable(evlist); + if (trace->summary_bpf) + trace_end_bpf_summary(); + if (trace->sort_events) ordered_events__flush(&trace->oe.data, OE_FLUSH__FINAL); if (!err) { - if (trace->summary) - trace__fprintf_thread_summary(trace, trace->output); + if (trace->summary) { + if (trace->summary_bpf) + trace_print_bpf_summary(trace->output); + else if (trace->summary_mode == SUMMARY__BY_TOTAL) + trace__fprintf_total_summary(trace, trace->output); + else + trace__fprintf_thread_summary(trace, trace->output); + } if (trace->show_tool_stats) { fprintf(trace->output, "Stats:\n " @@ -4134,8 +4660,10 @@ out_disable: } out_delete_evlist: + trace_cleanup_bpf_summary(); + delete_syscall_stats(trace->syscall_stats); trace__symbols__exit(trace); - + evlist__free_syscall_tp_fields(evlist); evlist__delete(evlist); cgroup__put(trace->cgroup); trace->evlist = NULL; @@ -4193,6 +4721,7 @@ static int trace__replay(struct trace *trace) struct evsel *evsel; int err = -1; + perf_tool__init(&trace->tool, /*ordered_events=*/true); trace->tool.sample = trace__process_sample; trace->tool.mmap = perf_event__process_mmap; trace->tool.mmap2 = perf_event__process_mmap2; @@ -4210,7 +4739,7 @@ static int trace__replay(struct trace *trace) /* add tid to output */ trace->multiple_threads = true; - session = perf_session__new(&data, false, &trace->tool); + session = perf_session__new(&data, &trace->tool); if (IS_ERR(session)) return PTR_ERR(session); @@ -4229,12 +4758,11 @@ static int trace__replay(struct trace *trace) if (err) goto out; - evsel = perf_evlist__find_tracepoint_by_name(session->evlist, - "raw_syscalls:sys_enter"); + evsel = evlist__find_tracepoint_by_name(session->evlist, "raw_syscalls:sys_enter"); + trace->syscalls.events.sys_enter = evsel; /* older kernels have syscalls tp versus raw_syscalls */ if (evsel == NULL) - evsel = perf_evlist__find_tracepoint_by_name(session->evlist, - "syscalls:sys_enter"); + evsel = evlist__find_tracepoint_by_name(session->evlist, "syscalls:sys_enter"); if (evsel && (evsel__init_raw_syscall_tp(evsel, trace__sys_enter) < 0 || @@ -4243,11 +4771,10 @@ static int trace__replay(struct trace *trace) goto out; } - evsel = perf_evlist__find_tracepoint_by_name(session->evlist, - "raw_syscalls:sys_exit"); + evsel = evlist__find_tracepoint_by_name(session->evlist, "raw_syscalls:sys_exit"); + trace->syscalls.events.sys_exit = evsel; if (evsel == NULL) - evsel = perf_evlist__find_tracepoint_by_name(session->evlist, - "syscalls:sys_exit"); + evsel = evlist__find_tracepoint_by_name(session->evlist, "syscalls:sys_exit"); if (evsel && (evsel__init_raw_syscall_tp(evsel, trace__sys_exit) < 0 || perf_evsel__init_sc_tp_uint_field(evsel, ret))) { @@ -4263,6 +4790,12 @@ static int trace__replay(struct trace *trace) evsel->handler = trace__pgfault; } + if (trace->summary_mode == SUMMARY__BY_TOTAL) { + trace->syscall_stats = alloc_syscall_stats(); + if (trace->syscall_stats == NULL) + goto out; + } + setup_pager(); err = perf_session__process_events(session); @@ -4273,12 +4806,13 @@ static int trace__replay(struct trace *trace) trace__fprintf_thread_summary(trace, trace->output); out: + delete_syscall_stats(trace->syscall_stats); perf_session__delete(session); return err; } -static size_t trace__fprintf_threads_header(FILE *fp) +static size_t trace__fprintf_summary_header(FILE *fp) { size_t printed; @@ -4287,29 +4821,56 @@ static size_t trace__fprintf_threads_header(FILE *fp) return printed; } -DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs, +struct syscall_entry { struct syscall_stats *stats; double msecs; int syscall; -) +}; + +static int entry_cmp(const void *e1, const void *e2) { - struct int_node *source = rb_entry(nd, struct int_node, rb_node); - struct syscall_stats *stats = source->priv; + const struct syscall_entry *entry1 = e1; + const struct syscall_entry *entry2 = e2; - entry->syscall = source->i; - entry->stats = stats; - entry->msecs = stats ? (u64)stats->stats.n * (avg_stats(&stats->stats) / NSEC_PER_MSEC) : 0; + return entry1->msecs > entry2->msecs ? -1 : 1; } -static size_t thread__dump_stats(struct thread_trace *ttrace, - struct trace *trace, FILE *fp) +static struct syscall_entry *syscall__sort_stats(struct hashmap *syscall_stats) +{ + struct syscall_entry *entry; + struct hashmap_entry *pos; + unsigned bkt, i, nr; + + nr = syscall_stats->sz; + entry = malloc(nr * sizeof(*entry)); + if (entry == NULL) + return NULL; + + i = 0; + hashmap__for_each_entry(syscall_stats, pos, bkt) { + struct syscall_stats *ss = pos->pvalue; + struct stats *st = &ss->stats; + + entry[i].stats = ss; + entry[i].msecs = (u64)st->n * (avg_stats(st) / NSEC_PER_MSEC); + entry[i].syscall = pos->key; + i++; + } + assert(i == nr); + + qsort(entry, nr, sizeof(*entry), entry_cmp); + return entry; +} + +static size_t syscall__dump_stats(struct trace *trace, int e_machine, FILE *fp, + struct hashmap *syscall_stats) { size_t printed = 0; struct syscall *sc; - struct rb_node *nd; - DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats); + struct syscall_entry *entries; - if (syscall_stats == NULL) + entries = syscall__sort_stats(syscall_stats); + if (entries == NULL) return 0; printed += fprintf(fp, "\n"); @@ -4318,8 +4879,10 @@ static size_t thread__dump_stats(struct thread_trace *ttrace, printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n"); printed += fprintf(fp, " --------------- -------- ------ -------- --------- --------- --------- ------\n"); - resort_rb__for_each_entry(nd, syscall_stats) { - struct syscall_stats *stats = syscall_stats_entry->stats; + for (size_t i = 0; i < syscall_stats->sz; i++) { + struct syscall_entry *entry = &entries[i]; + struct syscall_stats *stats = entry->stats; + if (stats) { double min = (double)(stats->stats.min) / NSEC_PER_MSEC; double max = (double)(stats->stats.max) / NSEC_PER_MSEC; @@ -4330,34 +4893,48 @@ static size_t thread__dump_stats(struct thread_trace *ttrace, pct = avg ? 100.0 * stddev_stats(&stats->stats) / avg : 0.0; avg /= NSEC_PER_MSEC; - sc = &trace->syscalls.table[syscall_stats_entry->syscall]; + sc = trace__syscall_info(trace, /*evsel=*/NULL, e_machine, entry->syscall); + if (!sc) + continue; + printed += fprintf(fp, " %-15s", sc->name); printed += fprintf(fp, " %8" PRIu64 " %6" PRIu64 " %9.3f %9.3f %9.3f", - n, stats->nr_failures, syscall_stats_entry->msecs, min, avg); + n, stats->nr_failures, entry->msecs, min, avg); printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct); if (trace->errno_summary && stats->nr_failures) { - const char *arch_name = perf_env__arch(trace->host->env); int e; for (e = 0; e < stats->max_errno; ++e) { if (stats->errnos[e] != 0) - fprintf(fp, "\t\t\t\t%s: %d\n", arch_syscalls__strerrno(arch_name, e + 1), stats->errnos[e]); + fprintf(fp, "\t\t\t\t%s: %d\n", perf_env__arch_strerrno(trace->host->env, e + 1), stats->errnos[e]); } } } } - resort_rb__delete(syscall_stats); + free(entries); printed += fprintf(fp, "\n\n"); return printed; } +static size_t thread__dump_stats(struct thread_trace *ttrace, + struct trace *trace, int e_machine, FILE *fp) +{ + return syscall__dump_stats(trace, e_machine, fp, ttrace->syscall_stats); +} + +static size_t system__dump_stats(struct trace *trace, int e_machine, FILE *fp) +{ + return syscall__dump_stats(trace, e_machine, fp, trace->syscall_stats); +} + static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace) { size_t printed = 0; struct thread_trace *ttrace = thread__priv(thread); + int e_machine = thread__e_machine(thread, trace->host); double ratio; if (ttrace == NULL) @@ -4365,7 +4942,7 @@ static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trac ratio = (double)ttrace->nr_events / trace->nr_events * 100.0; - printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid); + printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread__tid(thread)); printed += fprintf(fp, "%lu events, ", ttrace->nr_events); printed += fprintf(fp, "%.1f%%", ratio); if (ttrace->pfmaj) @@ -4377,7 +4954,7 @@ static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trac else if (fputc('\n', fp) != EOF) ++printed; - printed += thread__dump_stats(ttrace, trace, fp); + printed += thread__dump_stats(ttrace, trace, e_machine, fp); return printed; } @@ -4387,32 +4964,60 @@ static unsigned long thread__nr_events(struct thread_trace *ttrace) return ttrace ? ttrace->nr_events : 0; } -DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)), - struct thread *thread; -) +static int trace_nr_events_cmp(void *priv __maybe_unused, + const struct list_head *la, + const struct list_head *lb) { - entry->thread = rb_entry(nd, struct thread, rb_node); + struct thread_list *a = list_entry(la, struct thread_list, list); + struct thread_list *b = list_entry(lb, struct thread_list, list); + unsigned long a_nr_events = thread__nr_events(thread__priv(a->thread)); + unsigned long b_nr_events = thread__nr_events(thread__priv(b->thread)); + + if (a_nr_events != b_nr_events) + return a_nr_events < b_nr_events ? -1 : 1; + + /* Identical number of threads, place smaller tids first. */ + return thread__tid(a->thread) < thread__tid(b->thread) + ? -1 + : (thread__tid(a->thread) > thread__tid(b->thread) ? 1 : 0); } static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp) { - size_t printed = trace__fprintf_threads_header(fp); - struct rb_node *nd; - int i; + size_t printed = trace__fprintf_summary_header(fp); + LIST_HEAD(threads); - for (i = 0; i < THREADS__TABLE_SIZE; i++) { - DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i); + if (machine__thread_list(trace->host, &threads) == 0) { + struct thread_list *pos; - if (threads == NULL) { - fprintf(fp, "%s", "Error sorting output by nr_events!\n"); - return 0; - } - - resort_rb__for_each_entry(nd, threads) - printed += trace__fprintf_thread(fp, threads_entry->thread, trace); + list_sort(NULL, &threads, trace_nr_events_cmp); - resort_rb__delete(threads); + list_for_each_entry(pos, &threads, list) + printed += trace__fprintf_thread(fp, pos->thread, trace); } + thread_list__delete(&threads); + return printed; +} + +static size_t trace__fprintf_total_summary(struct trace *trace, FILE *fp) +{ + size_t printed = trace__fprintf_summary_header(fp); + + printed += fprintf(fp, " total, "); + printed += fprintf(fp, "%lu events", trace->nr_events); + + if (trace->pfmaj) + printed += fprintf(fp, ", %lu majfaults", trace->pfmaj); + if (trace->pfmin) + printed += fprintf(fp, ", %lu minfaults", trace->pfmin); + if (trace->sched) + printed += fprintf(fp, ", %.3f msec\n", trace->runtime_ms); + else if (fputc('\n', fp) != EOF) + ++printed; + + /* TODO: get all system e_machines. */ + printed += system__dump_stats(trace, EM_HOST, fp); + return printed; } @@ -4506,50 +5111,65 @@ static void evsel__set_syscall_arg_fmt(struct evsel *evsel, const char *name) struct syscall_arg_fmt *fmt = evsel__syscall_arg_fmt(evsel); if (fmt) { - struct syscall_fmt *scfmt = syscall_fmt__find(name); + const struct syscall_fmt *scfmt = syscall_fmt__find(name); if (scfmt) { - int skip = 0; + const struct tep_event *tp_format = evsel__tp_format(evsel); - if (strcmp(evsel->tp_format->format.fields->name, "__syscall_nr") == 0 || - strcmp(evsel->tp_format->format.fields->name, "nr") == 0) - ++skip; + if (tp_format) { + int skip = 0; - memcpy(fmt + skip, scfmt->arg, (evsel->tp_format->format.nr_fields - skip) * sizeof(*fmt)); + if (strcmp(tp_format->format.fields->name, "__syscall_nr") == 0 || + strcmp(tp_format->format.fields->name, "nr") == 0) + ++skip; + + memcpy(fmt + skip, scfmt->arg, + (tp_format->format.nr_fields - skip) * sizeof(*fmt)); + } } } } -static int evlist__set_syscall_tp_fields(struct evlist *evlist) +static int evlist__set_syscall_tp_fields(struct evlist *evlist, bool *use_btf) { struct evsel *evsel; evlist__for_each_entry(evlist, evsel) { - if (evsel->priv || !evsel->tp_format) + const struct tep_event *tp_format; + + if (evsel->priv) + continue; + + tp_format = evsel__tp_format(evsel); + if (!tp_format) continue; - if (strcmp(evsel->tp_format->system, "syscalls")) { - evsel__init_tp_arg_scnprintf(evsel); + if (strcmp(tp_format->system, "syscalls")) { + evsel__init_tp_arg_scnprintf(evsel, use_btf); continue; } if (evsel__init_syscall_tp(evsel)) return -1; - if (!strncmp(evsel->tp_format->name, "sys_enter_", 10)) { + if (!strncmp(tp_format->name, "sys_enter_", 10)) { struct syscall_tp *sc = __evsel__syscall_tp(evsel); if (__tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64))) return -1; - evsel__set_syscall_arg_fmt(evsel, evsel->tp_format->name + sizeof("sys_enter_") - 1); - } else if (!strncmp(evsel->tp_format->name, "sys_exit_", 9)) { + evsel__set_syscall_arg_fmt(evsel, + tp_format->name + sizeof("sys_enter_") - 1); + } else if (!strncmp(tp_format->name, "sys_exit_", 9)) { struct syscall_tp *sc = __evsel__syscall_tp(evsel); - if (__tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap)) + if (__tp_field__init_uint(&sc->ret, sizeof(u64), + sc->id.offset + sizeof(u64), + evsel->needs_swap)) return -1; - evsel__set_syscall_arg_fmt(evsel, evsel->tp_format->name + sizeof("sys_exit_") - 1); + evsel__set_syscall_arg_fmt(evsel, + tp_format->name + sizeof("sys_exit_") - 1); } } @@ -4573,7 +5193,7 @@ static int trace__parse_events_option(const struct option *opt, const char *str, int len = strlen(str) + 1, err = -1, list, idx; char *strace_groups_dir = system_path(STRACE_GROUPS_DIR); char group_name[PATH_MAX]; - struct syscall_fmt *fmt; + const struct syscall_fmt *fmt; if (strace_groups_dir == NULL) return -1; @@ -4588,8 +5208,9 @@ static int trace__parse_events_option(const struct option *opt, const char *str, *sep = '\0'; list = 0; - if (syscalltbl__id(trace->sctbl, s) >= 0 || - syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) { + /* TODO: support for more than just perf binary machine type syscalls. */ + if (syscalltbl__id(EM_HOST, s) >= 0 || + syscalltbl__strglobmatch_first(EM_HOST, s, &idx) >= 0) { list = 1; goto do_concat; } @@ -4639,12 +5260,18 @@ do_concat: err = 0; if (lists[0]) { - struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event", - "event selector. use 'perf list' to list available events", - parse_events_option); + struct parse_events_option_args parse_events_option_args = { + .evlistp = &trace->evlist, + }; + struct option o = { + .value = &parse_events_option_args, + }; err = parse_events_option(&o, lists[0], 0); } out: + free(strace_groups_dir); + free(lists[0]); + free(lists[1]); if (sep) *sep = ','; @@ -4655,14 +5282,36 @@ static int trace__parse_cgroups(const struct option *opt, const char *str, int u { struct trace *trace = opt->value; - if (!list_empty(&trace->evlist->core.entries)) - return parse_cgroups(opt, str, unset); - + if (!list_empty(&trace->evlist->core.entries)) { + struct option o = { + .value = &trace->evlist, + }; + return parse_cgroups(&o, str, unset); + } trace->cgroup = evlist__findnew_cgroup(trace->evlist, str); return 0; } +static int trace__parse_summary_mode(const struct option *opt, const char *str, + int unset __maybe_unused) +{ + struct trace *trace = opt->value; + + if (!strcmp(str, "thread")) { + trace->summary_mode = SUMMARY__BY_THREAD; + } else if (!strcmp(str, "total")) { + trace->summary_mode = SUMMARY__BY_TOTAL; + } else if (!strcmp(str, "cgroup")) { + trace->summary_mode = SUMMARY__BY_CGROUP; + } else { + pr_err("Unknown summary mode: %s\n", str); + return -1; + } + + return 0; +} + static int trace__config(const char *var, const char *value, void *arg) { struct trace *trace = arg; @@ -4707,6 +5356,36 @@ out: return err; } +static void trace__exit(struct trace *trace) +{ + strlist__delete(trace->ev_qualifier); + zfree(&trace->ev_qualifier_ids.entries); + if (trace->syscalls.table) { + for (size_t i = 0; i < trace->syscalls.table_size; i++) + syscall__delete(trace->syscalls.table[i]); + zfree(&trace->syscalls.table); + } + zfree(&trace->perfconfig_events); + evlist__delete(trace->evlist); + trace->evlist = NULL; +#ifdef HAVE_LIBBPF_SUPPORT + btf__free(trace->btf); + trace->btf = NULL; +#endif +} + +#ifdef HAVE_BPF_SKEL +static int bpf__setup_bpf_output(struct evlist *evlist) +{ + int err = parse_event(evlist, "bpf-output/no-inherit=1,name=__augmented_syscalls__/"); + + if (err) + pr_debug("ERROR: failed to create the \"__augmented_syscalls__\" bpf-output event\n"); + + return err; +} +#endif + int cmd_trace(int argc, const char **argv) { const char *trace_usage[] = { @@ -4738,7 +5417,6 @@ int cmd_trace(int argc, const char **argv) .max_stack = UINT_MAX, .max_events = ULONG_MAX, }; - const char *map_dump_str = NULL; const char *output_name = NULL; const struct option trace_options[] = { OPT_CALLBACK('e', "event", &trace, "event", @@ -4766,16 +5444,12 @@ int cmd_trace(int argc, const char **argv) OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit, "child tasks do not inherit counters"), OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages", - "number of mmap data pages", - perf_evlist__parse_mmap_pages), + "number of mmap data pages", evlist__parse_mmap_pages), OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user", "user to profile"), OPT_CALLBACK(0, "duration", &trace, "float", "show only events with duration > N.M ms", trace__set_duration), -#ifdef HAVE_LIBBPF_SUPPORT - OPT_STRING(0, "map-dump", &map_dump_str, "BPF map", "BPF map to periodically dump"), -#endif OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"), OPT_INCR('v', "verbose", &verbose, "be more verbose"), OPT_BOOLEAN('T', "time", &trace.full_time, @@ -4788,6 +5462,9 @@ int cmd_trace(int argc, const char **argv) "Show all syscalls and summary with statistics"), OPT_BOOLEAN(0, "errno-summary", &trace.errno_summary, "Show errno stats per syscall, use with -s or -S"), + OPT_CALLBACK(0, "summary-mode", &trace, "mode", + "How to show summary: select thread (default), total or cgroup", + trace__parse_summary_mode), OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min", "Trace pagefaults", parse_pagefaults, "maj"), OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"), @@ -4816,9 +5493,12 @@ int cmd_trace(int argc, const char **argv) "per thread proc mmap processing timeout in ms"), OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only", trace__parse_cgroups), - OPT_INTEGER('D', "delay", &trace.opts.initial_delay, + OPT_INTEGER('D', "delay", &trace.opts.target.initial_delay, "ms to wait before starting measurement after program " "start"), + OPT_BOOLEAN(0, "force-btf", &trace.force_btf, "Prefer btf_dump general pretty printer" + "to customized ones"), + OPT_BOOLEAN(0, "bpf-summary", &trace.summary_bpf, "Summary syscall stats in BPF"), OPTS_EVSWITCH(&trace.evswitch), OPT_END() }; @@ -4828,14 +5508,20 @@ int cmd_trace(int argc, const char **argv) const char * const trace_subcommands[] = { "record", NULL }; int err = -1; char bf[BUFSIZ]; + struct sigaction sigchld_act; signal(SIGSEGV, sighandler_dump_stack); signal(SIGFPE, sighandler_dump_stack); + signal(SIGINT, sighandler_interrupt); + + memset(&sigchld_act, 0, sizeof(sigchld_act)); + sigchld_act.sa_flags = SA_SIGINFO; + sigchld_act.sa_sigaction = sighandler_chld; + sigaction(SIGCHLD, &sigchld_act, NULL); trace.evlist = evlist__new(); - trace.sctbl = syscalltbl__new(); - if (trace.evlist == NULL || trace.sctbl == NULL) { + if (trace.evlist == NULL) { pr_err("Not enough memory to run!\n"); err = -ENOMEM; goto out; @@ -4881,12 +5567,13 @@ int cmd_trace(int argc, const char **argv) if (trace.perfconfig_events != NULL) { struct parse_events_error parse_err; - bzero(&parse_err, sizeof(parse_err)); + parse_events_error__init(&parse_err); err = parse_events(trace.evlist, trace.perfconfig_events, &parse_err); - if (err) { - parse_events_print_error(&parse_err, trace.perfconfig_events); + if (err) + parse_events_error__print(&parse_err, trace.perfconfig_events); + parse_events_error__exit(&parse_err); + if (err) goto out; - } } if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) { @@ -4894,87 +5581,63 @@ int cmd_trace(int argc, const char **argv) "cgroup monitoring only available in system-wide mode"); } - evsel = bpf__setup_output_event(trace.evlist, "__augmented_syscalls__"); - if (IS_ERR(evsel)) { - bpf__strerror_setup_output_event(trace.evlist, PTR_ERR(evsel), bf, sizeof(bf)); - pr_err("ERROR: Setup trace syscalls enter failed: %s\n", bf); - goto out; - } +#ifdef HAVE_BPF_SKEL + if (!trace.trace_syscalls) + goto skip_augmentation; - if (evsel) { - trace.syscalls.events.augmented = evsel; + if ((argc >= 1) && (strcmp(argv[0], "record") == 0)) { + pr_debug("Syscall augmentation fails with record, disabling augmentation"); + goto skip_augmentation; + } - evsel = perf_evlist__find_tracepoint_by_name(trace.evlist, "raw_syscalls:sys_enter"); - if (evsel == NULL) { - pr_err("ERROR: raw_syscalls:sys_enter not found in the augmented BPF object\n"); + if (trace.summary_bpf) { + if (!trace.opts.target.system_wide) { + /* TODO: Add filters in the BPF to support other targets. */ + pr_err("Error: --bpf-summary only works for system-wide mode.\n"); goto out; } + if (trace.summary_only) + goto skip_augmentation; + } - if (evsel->bpf_obj == NULL) { - pr_err("ERROR: raw_syscalls:sys_enter not associated to a BPF object\n"); - goto out; + trace.skel = augmented_raw_syscalls_bpf__open(); + if (!trace.skel) { + pr_debug("Failed to open augmented syscalls BPF skeleton"); + } else { + /* + * Disable attaching the BPF programs except for sys_enter and + * sys_exit that tail call into this as necessary. + */ + struct bpf_program *prog; + + bpf_object__for_each_program(prog, trace.skel->obj) { + if (prog != trace.skel->progs.sys_enter && prog != trace.skel->progs.sys_exit) + bpf_program__set_autoattach(prog, /*autoattach=*/false); } - trace.bpf_obj = evsel->bpf_obj; + err = augmented_raw_syscalls_bpf__load(trace.skel); - /* - * If we have _just_ the augmenter event but don't have a - * explicit --syscalls, then assume we want all strace-like - * syscalls: - */ - if (!trace.trace_syscalls && trace__only_augmented_syscalls_evsels(&trace)) - trace.trace_syscalls = true; - /* - * So, if we have a syscall augmenter, but trace_syscalls, aka - * strace-like syscall tracing is not set, then we need to trow - * away the augmenter, i.e. all the events that were created - * from that BPF object file. - * - * This is more to fix the current .perfconfig trace.add_events - * style of setting up the strace-like eBPF based syscall point - * payload augmenter. - * - * All this complexity will be avoided by adding an alternative - * to trace.add_events in the form of - * trace.bpf_augmented_syscalls, that will be only parsed if we - * need it. - * - * .perfconfig trace.add_events is still useful if we want, for - * instance, have msr_write.msr in some .perfconfig profile based - * 'perf trace --config determinism.profile' mode, where for some - * particular goal/workload type we want a set of events and - * output mode (with timings, etc) instead of having to add - * all via the command line. - * - * Also --config to specify an alternate .perfconfig file needs - * to be implemented. - */ - if (!trace.trace_syscalls) { - trace__delete_augmented_syscalls(&trace); + if (err < 0) { + libbpf_strerror(err, bf, sizeof(bf)); + pr_debug("Failed to load augmented syscalls BPF skeleton: %s\n", bf); } else { - trace__set_bpf_map_filtered_pids(&trace); - trace__set_bpf_map_syscalls(&trace); - trace.syscalls.unaugmented_prog = trace__find_bpf_program_by_title(&trace, "!raw_syscalls:unaugmented"); + augmented_raw_syscalls_bpf__attach(trace.skel); + trace__add_syscall_newtp(&trace); } } - err = bpf__setup_stdout(trace.evlist); + err = bpf__setup_bpf_output(trace.evlist); if (err) { - bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf)); - pr_err("ERROR: Setup BPF stdout failed: %s\n", bf); + libbpf_strerror(err, bf, sizeof(bf)); + pr_err("ERROR: Setup BPF output event failed: %s\n", bf); goto out; } - + trace.syscalls.events.bpf_output = evlist__last(trace.evlist); + assert(evsel__name_is(trace.syscalls.events.bpf_output, "__augmented_syscalls__")); +skip_augmentation: +#endif err = -1; - if (map_dump_str) { - trace.dump.map = trace__find_bpf_map_by_name(&trace, map_dump_str); - if (trace.dump.map == NULL) { - pr_err("ERROR: BPF map \"%s\" not found\n", map_dump_str); - goto out; - } - } - if (trace.trace_pgfaults) { trace.opts.sample_address = true; trace.opts.sample_time = true; @@ -5002,11 +5665,16 @@ int cmd_trace(int argc, const char **argv) } if (trace.evlist->core.nr_entries > 0) { + bool use_btf = false; + evlist__set_default_evsel_handler(trace.evlist, trace__event_handler); - if (evlist__set_syscall_tp_fields(trace.evlist)) { + if (evlist__set_syscall_tp_fields(trace.evlist, &use_btf)) { perror("failed to set syscalls:* tracepoint fields"); goto out; } + + if (use_btf) + trace__load_vmlinux_btf(&trace); } if (trace.sort_events) { @@ -5025,18 +5693,18 @@ int cmd_trace(int argc, const char **argv) * buffers that are being copied from kernel to userspace, think 'read' * syscall. */ - if (trace.syscalls.events.augmented) { + if (trace.syscalls.events.bpf_output) { evlist__for_each_entry(trace.evlist, evsel) { - bool raw_syscalls_sys_exit = strcmp(evsel__name(evsel), "raw_syscalls:sys_exit") == 0; + bool raw_syscalls_sys_exit = evsel__name_is(evsel, "raw_syscalls:sys_exit"); if (raw_syscalls_sys_exit) { trace.raw_augmented_syscalls = true; goto init_augmented_syscall_tp; } - if (trace.syscalls.events.augmented->priv == NULL && + if (trace.syscalls.events.bpf_output->priv == NULL && strstr(evsel__name(evsel), "syscalls:sys_enter")) { - struct evsel *augmented = trace.syscalls.events.augmented; + struct evsel *augmented = trace.syscalls.events.bpf_output; if (evsel__init_augmented_syscall_tp(augmented, evsel) || evsel__init_augmented_syscall_tp_args(augmented)) goto out; @@ -5091,8 +5759,10 @@ init_augmented_syscall_tp: } } - if ((argc >= 1) && (strcmp(argv[0], "record") == 0)) - return trace__record(&trace, argc-1, &argv[1]); + if ((argc >= 1) && (strcmp(argv[0], "record") == 0)) { + err = trace__record(&trace, argc-1, &argv[1]); + goto out; + } /* Using just --errno-summary will trigger --summary */ if (trace.errno_summary && !trace.summary && !trace.summary_only) @@ -5102,6 +5772,19 @@ init_augmented_syscall_tp: if (trace.summary_only) trace.summary = trace.summary_only; + /* Keep exited threads, otherwise information might be lost for summary */ + if (trace.summary) { + symbol_conf.keep_exited_threads = true; + if (trace.summary_mode == SUMMARY__NONE) + trace.summary_mode = SUMMARY__BY_THREAD; + + if (!trace.summary_bpf && trace.summary_mode == SUMMARY__BY_CGROUP) { + pr_err("Error: --summary-mode=cgroup only works with --bpf-summary\n"); + err = -EINVAL; + goto out; + } + } + if (output_name != NULL) { err = trace__open_output(&trace, output_name); if (err < 0) { @@ -5140,6 +5823,9 @@ out_close: if (output_name != NULL) fclose(trace.output); out: - zfree(&trace.perfconfig_events); + trace__exit(&trace); +#ifdef HAVE_BPF_SKEL + augmented_raw_syscalls_bpf__destroy(trace.skel); +#endif return err; } |