Diffstat
 -rw-r--r--   tools/perf/builtin-trace.c | 825
 1 file changed, 380 insertions(+), 445 deletions(-)
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 44a75f234db1..90eaff8c0f6e 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -15,9 +15,14 @@ */ #include "util/record.h" -#include <traceevent/event-parse.h> #include <api/fs/tracing_path.h> +#ifdef HAVE_LIBBPF_SUPPORT #include <bpf/bpf.h> +#include <bpf/libbpf.h> +#ifdef HAVE_BPF_SKEL +#include "bpf_skel/augmented_raw_syscalls.skel.h" +#endif +#endif #include "util/bpf_map.h" #include "util/rlimit.h" #include "builtin.h" @@ -52,7 +57,7 @@ #include "trace/beauty/beauty.h" #include "trace-event.h" #include "util/parse-events.h" -#include "util/bpf-loader.h" +#include "util/tracepoint.h" #include "callchain.h" #include "print_binary.h" #include "string2.h" @@ -69,6 +74,7 @@ #include <linux/err.h> #include <linux/filter.h> #include <linux/kernel.h> +#include <linux/list_sort.h> #include <linux/random.h> #include <linux/stringify.h> #include <linux/time64.h> @@ -79,6 +85,10 @@ #include <linux/ctype.h> #include <perf/mmap.h> +#ifdef HAVE_LIBTRACEEVENT +#include <traceevent/event-parse.h> +#endif + #ifndef O_CLOEXEC # define O_CLOEXEC 02000000 #endif @@ -87,6 +97,8 @@ # define F_LINUX_SPECIFIC_BASE 1024 #endif +#define RAW_SYSCALL_ARGS_NUM 6 + /* * strtoul: Go from a string to a value, i.e. for msr: MSR_FS_BASE to 0xc0000100 */ @@ -107,7 +119,7 @@ struct syscall_fmt { const char *sys_enter, *sys_exit; } bpf_prog_name; - struct syscall_arg_fmt arg[6]; + struct syscall_arg_fmt arg[RAW_SYSCALL_ARGS_NUM]; u8 nr_args; bool errpid; bool timeout; @@ -119,26 +131,19 @@ struct trace { struct syscalltbl *sctbl; struct { struct syscall *table; - struct bpf_map *map; - struct { // per syscall BPF_MAP_TYPE_PROG_ARRAY - struct bpf_map *sys_enter, - *sys_exit; - } prog_array; struct { struct evsel *sys_enter, - *sys_exit, - *augmented; + *sys_exit, + *bpf_output; } events; - struct bpf_program *unaugmented_prog; } syscalls; - struct { - struct bpf_map *map; - } dump; +#ifdef HAVE_BPF_SKEL + struct augmented_raw_syscalls_bpf *skel; +#endif struct record_opts opts; struct evlist *evlist; struct machine *host; struct thread *current; - struct bpf_object *bpf_obj; struct cgroup *cgroup; u64 base_time; FILE *output; @@ -408,6 +413,7 @@ static int evsel__init_syscall_tp(struct evsel *evsel) if (evsel__init_tp_uint_field(evsel, &sc->id, "__syscall_nr") && evsel__init_tp_uint_field(evsel, &sc->id, "nr")) return -ENOENT; + return 0; } @@ -614,11 +620,8 @@ bool strarray__strtoul_flags(struct strarray *sa, char *bf, size_t size, u64 *re if (isalpha(*tok) || *tok == '_') { if (!strarray__strtoul(sa, tok, toklen, &val)) return false; - } else { - bool is_hexa = tok[0] == 0 && (tok[1] = 'x' || tok[1] == 'X'); - - val = strtoul(tok, NULL, is_hexa ? 
16 : 0); - } + } else + val = strtoul(tok, NULL, 0); *ret |= (1 << (val - 1)); @@ -707,7 +710,15 @@ static size_t syscall_arg__scnprintf_char_array(char *bf, size_t size, struct sy static const char *bpf_cmd[] = { "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM", - "MAP_GET_NEXT_KEY", "PROG_LOAD", + "MAP_GET_NEXT_KEY", "PROG_LOAD", "OBJ_PIN", "OBJ_GET", "PROG_ATTACH", + "PROG_DETACH", "PROG_TEST_RUN", "PROG_GET_NEXT_ID", "MAP_GET_NEXT_ID", + "PROG_GET_FD_BY_ID", "MAP_GET_FD_BY_ID", "OBJ_GET_INFO_BY_FD", + "PROG_QUERY", "RAW_TRACEPOINT_OPEN", "BTF_LOAD", "BTF_GET_FD_BY_ID", + "TASK_FD_QUERY", "MAP_LOOKUP_AND_DELETE_ELEM", "MAP_FREEZE", + "BTF_GET_NEXT_ID", "MAP_LOOKUP_BATCH", "MAP_LOOKUP_AND_DELETE_BATCH", + "MAP_UPDATE_BATCH", "MAP_DELETE_BATCH", "LINK_CREATE", "LINK_UPDATE", + "LINK_GET_FD_BY_ID", "LINK_GET_NEXT_ID", "ENABLE_STATS", "ITER_CREATE", + "LINK_DETACH", "PROG_BIND_MAP", }; static DEFINE_STRARRAY(bpf_cmd, "BPF_"); @@ -902,7 +913,7 @@ static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size, #include "trace/beauty/socket_type.c" #include "trace/beauty/waitid_options.c" -static struct syscall_fmt syscall_fmts[] = { +static const struct syscall_fmt syscall_fmts[] = { { .name = "access", .arg = { [1] = { .scnprintf = SCA_ACCMODE, /* mode */ }, }, }, { .name = "arch_prctl", @@ -918,6 +929,8 @@ static struct syscall_fmt syscall_fmts[] = { .arg = { [0] = { .scnprintf = SCA_PTR, /* brk */ }, }, }, { .name = "clock_gettime", .arg = { [0] = STRARRAY(clk_id, clockid), }, }, + { .name = "clock_nanosleep", + .arg = { [2] = { .scnprintf = SCA_TIMESPEC, /* rqtp */ }, }, }, { .name = "clone", .errpid = true, .nr_args = 5, .arg = { [0] = { .name = "flags", .scnprintf = SCA_CLONE_FLAGS, }, [1] = { .name = "child_stack", .scnprintf = SCA_HEX, }, @@ -971,6 +984,8 @@ static struct syscall_fmt syscall_fmts[] = { .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, }, { .name = "getrlimit", .arg = { [0] = STRARRAY(resource, rlimit_resources), }, }, + { .name = "getsockopt", + .arg = { [1] = STRARRAY(level, socket_level), }, }, { .name = "gettid", .errpid = true, }, { .name = "ioctl", .arg = { @@ -1045,7 +1060,8 @@ static struct syscall_fmt syscall_fmts[] = { .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, }, { .name = "perf_event_open", - .arg = { [2] = { .scnprintf = SCA_INT, /* cpu */ }, + .arg = { [0] = { .scnprintf = SCA_PERF_ATTR, /* attr */ }, + [2] = { .scnprintf = SCA_INT, /* cpu */ }, [3] = { .scnprintf = SCA_FD, /* group_fd */ }, [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, }, { .name = "pipe2", @@ -1113,6 +1129,8 @@ static struct syscall_fmt syscall_fmts[] = { .arg = { [0] = STRARRAY(which, itimers), }, }, { .name = "setrlimit", .arg = { [0] = STRARRAY(resource, rlimit_resources), }, }, + { .name = "setsockopt", + .arg = { [1] = STRARRAY(level, socket_level), }, }, { .name = "socket", .arg = { [0] = STRARRAY(family, socket_families), [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, @@ -1157,18 +1175,21 @@ static int syscall_fmt__cmp(const void *name, const void *fmtp) return strcmp(name, fmt->name); } -static struct syscall_fmt *__syscall_fmt__find(struct syscall_fmt *fmts, const int nmemb, const char *name) +static const struct syscall_fmt *__syscall_fmt__find(const struct syscall_fmt *fmts, + const int nmemb, + const char *name) { return bsearch(name, fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp); } -static struct syscall_fmt *syscall_fmt__find(const 
char *name) +static const struct syscall_fmt *syscall_fmt__find(const char *name) { const int nmemb = ARRAY_SIZE(syscall_fmts); return __syscall_fmt__find(syscall_fmts, nmemb, name); } -static struct syscall_fmt *__syscall_fmt__find_by_alias(struct syscall_fmt *fmts, const int nmemb, const char *alias) +static const struct syscall_fmt *__syscall_fmt__find_by_alias(const struct syscall_fmt *fmts, + const int nmemb, const char *alias) { int i; @@ -1180,7 +1201,7 @@ static struct syscall_fmt *__syscall_fmt__find_by_alias(struct syscall_fmt *fmts return NULL; } -static struct syscall_fmt *syscall_fmt__find_by_alias(const char *alias) +static const struct syscall_fmt *syscall_fmt__find_by_alias(const char *alias) { const int nmemb = ARRAY_SIZE(syscall_fmts); return __syscall_fmt__find_by_alias(syscall_fmts, nmemb, alias); @@ -1205,21 +1226,11 @@ struct syscall { bool nonexistent; struct tep_format_field *args; const char *name; - struct syscall_fmt *fmt; + const struct syscall_fmt *fmt; struct syscall_arg_fmt *arg_fmt; }; /* - * Must match what is in the BPF program: - * - * tools/perf/examples/bpf/augmented_raw_syscalls.c - */ -struct bpf_map_syscall_entry { - bool enabled; - u16 string_args_len[6]; -}; - -/* * We need to have this 'calculated' boolean because in some cases we really * don't know what is the duration of a syscall, for instance, when we start * a session and some threads are waiting for a syscall to finish, say 'poll', @@ -1284,6 +1295,22 @@ static struct thread_trace *thread_trace__new(void) return ttrace; } +static void thread_trace__free_files(struct thread_trace *ttrace); + +static void thread_trace__delete(void *pttrace) +{ + struct thread_trace *ttrace = pttrace; + + if (!ttrace) + return; + + intlist__delete(ttrace->syscall_stats); + ttrace->syscall_stats = NULL; + thread_trace__free_files(ttrace); + zfree(&ttrace->entry_str); + free(ttrace); +} + static struct thread_trace *thread__trace(struct thread *thread, FILE *fp) { struct thread_trace *ttrace; @@ -1321,6 +1348,17 @@ void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg, static const size_t trace__entry_str_size = 2048; +static void thread_trace__free_files(struct thread_trace *ttrace) +{ + for (int i = 0; i < ttrace->files.max; ++i) { + struct file *file = ttrace->files.table + i; + zfree(&file->pathname); + } + + zfree(&ttrace->files.table); + ttrace->files.max = -1; +} + static struct file *thread_trace__files_entry(struct thread_trace *ttrace, int fd) { if (fd < 0) @@ -1374,12 +1412,13 @@ static int thread__read_fd_path(struct thread *thread, int fd) struct stat st; int ret; - if (thread->pid_ == thread->tid) { + if (thread__pid(thread) == thread__tid(thread)) { scnprintf(linkname, sizeof(linkname), - "/proc/%d/fd/%d", thread->pid_, fd); + "/proc/%d/fd/%d", thread__pid(thread), fd); } else { scnprintf(linkname, sizeof(linkname), - "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd); + "/proc/%d/task/%d/fd/%d", + thread__pid(thread), thread__tid(thread), fd); } if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname)) @@ -1524,13 +1563,20 @@ static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp) return fprintf(fp, " ? 
"); } -static bool done = false; -static bool interrupted = false; +static pid_t workload_pid = -1; +static volatile sig_atomic_t done = false; +static volatile sig_atomic_t interrupted = false; -static void sig_handler(int sig) +static void sighandler_interrupt(int sig __maybe_unused) { - done = true; - interrupted = sig == SIGINT; + done = interrupted = true; +} + +static void sighandler_chld(int sig __maybe_unused, siginfo_t *info, + void *context __maybe_unused) +{ + if (info->si_pid == workload_pid) + done = true; } static size_t trace__fprintf_comm_tid(struct trace *trace, struct thread *thread, FILE *fp) @@ -1540,7 +1586,7 @@ static size_t trace__fprintf_comm_tid(struct trace *trace, struct thread *thread if (trace->multiple_threads) { if (trace->show_comm) printed += fprintf(fp, "%.14s/", thread__comm_str(thread)); - printed += fprintf(fp, "%d ", thread->tid); + printed += fprintf(fp, "%d ", thread__tid(thread)); } return printed; @@ -1615,13 +1661,15 @@ static int trace__symbols_init(struct trace *trace, struct evlist *evlist) if (trace->host == NULL) return -ENOMEM; + thread__set_priv_destructor(thread_trace__delete); + err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr); if (err < 0) goto out; err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target, - evlist->core.threads, trace__tool_process, false, - 1); + evlist->core.threads, trace__tool_process, + true, false, 1); out: if (err) symbol__exit(); @@ -1641,7 +1689,7 @@ static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args) { int idx; - if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0) + if (nr_args == RAW_SYSCALL_ARGS_NUM && sc->fmt && sc->fmt->nr_args != 0) nr_args = sc->fmt->nr_args; sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt)); @@ -1657,7 +1705,7 @@ static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args) return 0; } -static struct syscall_arg_fmt syscall_arg_fmts__by_name[] = { +static const struct syscall_arg_fmt syscall_arg_fmts__by_name[] = { { .name = "msr", .scnprintf = SCA_X86_MSR, .strtoul = STUL_X86_MSR, }, { .name = "vector", .scnprintf = SCA_X86_IRQ_VECTORS, .strtoul = STUL_X86_IRQ_VECTORS, }, }; @@ -1668,13 +1716,14 @@ static int syscall_arg_fmt__cmp(const void *name, const void *fmtp) return strcmp(name, fmt->name); } -static struct syscall_arg_fmt * -__syscall_arg_fmt__find_by_name(struct syscall_arg_fmt *fmts, const int nmemb, const char *name) +static const struct syscall_arg_fmt * +__syscall_arg_fmt__find_by_name(const struct syscall_arg_fmt *fmts, const int nmemb, + const char *name) { return bsearch(name, fmts, nmemb, sizeof(struct syscall_arg_fmt), syscall_arg_fmt__cmp); } -static struct syscall_arg_fmt *syscall_arg_fmt__find_by_name(const char *name) +static const struct syscall_arg_fmt *syscall_arg_fmt__find_by_name(const char *name) { const int nmemb = ARRAY_SIZE(syscall_arg_fmts__by_name); return __syscall_arg_fmt__find_by_name(syscall_arg_fmts__by_name, nmemb, name); @@ -1713,14 +1762,15 @@ syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field len >= 2 && strcmp(field->name + len - 2, "fd") == 0) { /* * /sys/kernel/tracing/events/syscalls/sys_enter* - * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c + * grep -E 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c * 65 int * 23 unsigned int * 7 unsigned long */ arg->scnprintf = SCA_FD; - } else { - struct syscall_arg_fmt *fmt = 
syscall_arg_fmt__find_by_name(field->name); + } else { + const struct syscall_arg_fmt *fmt = + syscall_arg_fmt__find_by_name(field->name); if (fmt) { arg->scnprintf = fmt->scnprintf; @@ -1774,11 +1824,11 @@ static int trace__read_syscall_info(struct trace *trace, int id) #endif sc = trace->syscalls.table + id; if (sc->nonexistent) - return 0; + return -EEXIST; if (name == NULL) { sc->nonexistent = true; - return 0; + return -EEXIST; } sc->name = name; @@ -1792,11 +1842,18 @@ static int trace__read_syscall_info(struct trace *trace, int id) sc->tp_format = trace_event__tp_format("syscalls", tp_name); } - if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields)) - return -ENOMEM; - - if (IS_ERR(sc->tp_format)) + /* + * Fails to read trace point format via sysfs node, so the trace point + * doesn't exist. Set the 'nonexistent' flag as true. + */ + if (IS_ERR(sc->tp_format)) { + sc->nonexistent = true; return PTR_ERR(sc->tp_format); + } + + if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? + RAW_SYSCALL_ARGS_NUM : sc->tp_format->format.nr_fields)) + return -ENOMEM; sc->args = sc->tp_format->format.fields; /* @@ -2114,11 +2171,8 @@ static struct syscall *trace__syscall_info(struct trace *trace, (err = trace__read_syscall_info(trace, id)) != 0) goto out_cant_read; - if (trace->syscalls.table[id].name == NULL) { - if (trace->syscalls.table[id].nonexistent) - return NULL; + if (trace->syscalls.table && trace->syscalls.table[id].nonexistent) goto out_cant_read; - } return &trace->syscalls.table[id]; @@ -2153,13 +2207,10 @@ static void thread__update_stats(struct thread *thread, struct thread_trace *ttr stats = inode->priv; if (stats == NULL) { - stats = malloc(sizeof(*stats)); + stats = zalloc(sizeof(*stats)); if (stats == NULL) return; - stats->nr_failures = 0; - stats->max_errno = 0; - stats->errnos = NULL; init_stats(&stats->stats); inode->priv = stats; } @@ -2183,7 +2234,8 @@ static void thread__update_stats(struct thread *thread, struct thread_trace *ttr memset(new_errnos + stats->max_errno, 0, (err - stats->max_errno) * sizeof(u32)); } else { pr_debug("Not enough memory for errno stats for thread \"%s\"(%d/%d), results will be incomplete\n", - thread__comm_str(thread), thread->pid_, thread->tid); + thread__comm_str(thread), thread__pid(thread), + thread__tid(thread)); return; } @@ -2266,6 +2318,14 @@ static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sam return augmented_args; } +static void syscall__exit(struct syscall *sc) +{ + if (!sc) + return; + + zfree(&sc->arg_fmt); +} + static int trace__sys_enter(struct trace *trace, struct evsel *evsel, union perf_event *event __maybe_unused, struct perf_sample *sample) @@ -2386,13 +2446,15 @@ static int trace__resolve_callchain(struct trace *trace, struct evsel *evsel, int max_stack = evsel->core.attr.sample_max_stack ? 
evsel->core.attr.sample_max_stack : trace->max_stack; - int err; + int err = -1; + addr_location__init(&al); if (machine__resolve(trace->host, &al, sample) < 0) - return -1; + goto out; err = thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack); - addr_location__put(&al); +out: + addr_location__exit(&al); return err; } @@ -2403,15 +2465,14 @@ static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sam EVSEL__PRINT_DSO | EVSEL__PRINT_UNKNOWN_AS_ADDR; - return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, symbol_conf.bt_stop_list, trace->output); + return sample__fprintf_callchain(sample, 38, print_opts, get_tls_callchain_cursor(), symbol_conf.bt_stop_list, trace->output); } static const char *errno_to_name(struct evsel *evsel, int err) { struct perf_env *env = evsel__env(evsel); - const char *arch_name = perf_env__arch(env); - return arch_syscalls__strerrno(arch_name, err); + return perf_env__arch_strerrno(env, err); } static int trace__sys_exit(struct trace *trace, struct evsel *evsel, @@ -2457,9 +2518,11 @@ static int trace__sys_exit(struct trace *trace, struct evsel *evsel, goto out; if (sample->callchain) { - callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); + struct callchain_cursor *cursor = get_tls_callchain_cursor(); + + callchain_ret = trace__resolve_callchain(trace, evsel, sample, cursor); if (callchain_ret == 0) { - if (callchain_cursor.nr < trace->min_stack) + if (cursor->nr < trace->min_stack) goto out; callchain_ret = 1; } @@ -2520,7 +2583,7 @@ errno_print: { if (child != NULL) { fprintf(trace->output, "%ld", ret); - if (child->comm_set) + if (thread__comm_set(child)) fprintf(trace->output, " (%s)", thread__comm_str(child)); thread__put(child); } @@ -2706,6 +2769,8 @@ static size_t trace__fprintf_tp_fields(struct trace *trace, struct evsel *evsel, offset = format_field__intval(field, sample, evsel->needs_swap); syscall_arg.len = offset >> 16; offset &= 0xffff; + if (tep_field_is_relative(field->flags)) + offset += field->offset + field->size; } val = (uintptr_t)(sample->raw_data + offset); @@ -2719,7 +2784,7 @@ static size_t trace__fprintf_tp_fields(struct trace *trace, struct evsel *evsel, /* * Suppress this argument if its value is zero and - * and we don't have a string associated in an + * we don't have a string associated in an * strarray for it. */ if (val == 0 && @@ -2732,11 +2797,7 @@ static size_t trace__fprintf_tp_fields(struct trace *trace, struct evsel *evsel, printed += scnprintf(bf + printed, size - printed, "%s", printed ? ", " : ""); - /* - * XXX Perhaps we should have a show_tp_arg_names, - * leaving show_arg_names just for syscalls? 
- */ - if (1 || trace->show_arg_names) + if (trace->show_arg_names) printed += scnprintf(bf + printed, size - printed, "%s: ", field->name); printed += syscall_arg_fmt__scnprintf_val(arg, bf + printed, size - printed, &syscall_arg, val); @@ -2763,9 +2824,11 @@ static int trace__event_handler(struct trace *trace, struct evsel *evsel, thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); if (sample->callchain) { - callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); + struct callchain_cursor *cursor = get_tls_callchain_cursor(); + + callchain_ret = trace__resolve_callchain(trace, evsel, sample, cursor); if (callchain_ret == 0) { - if (callchain_cursor.nr < trace->min_stack) + if (cursor->nr < trace->min_stack) goto out; callchain_ret = 1; } @@ -2780,7 +2843,7 @@ static int trace__event_handler(struct trace *trace, struct evsel *evsel, if (thread) trace__fprintf_comm_tid(trace, thread, trace->output); - if (evsel == trace->syscalls.events.augmented) { + if (evsel == trace->syscalls.events.bpf_output) { int id = perf_evsel__sc_tp_uint(evsel, id, sample); struct syscall *sc = trace__syscall_info(trace, evsel, id); @@ -2840,7 +2903,7 @@ static void print_location(FILE *f, struct perf_sample *sample, { if ((verbose > 0 || print_dso) && al->map) - fprintf(f, "%s@", al->map->dso->long_name); + fprintf(f, "%s@", map__dso(al->map)->long_name); if ((verbose > 0 || print_sym) && al->sym) fprintf(f, "%s+0x%" PRIx64, al->sym->name, @@ -2863,12 +2926,15 @@ static int trace__pgfault(struct trace *trace, int err = -1; int callchain_ret = 0; + addr_location__init(&al); thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); if (sample->callchain) { - callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); + struct callchain_cursor *cursor = get_tls_callchain_cursor(); + + callchain_ret = trace__resolve_callchain(trace, evsel, sample, cursor); if (callchain_ret == 0) { - if (callchain_cursor.nr < trace->min_stack) + if (cursor->nr < trace->min_stack) goto out_put; callchain_ret = 1; } @@ -2923,6 +2989,7 @@ out: err = 0; out_put: thread__put(thread); + addr_location__exit(&al); return err; } @@ -3047,15 +3114,11 @@ static bool evlist__add_vfs_getname(struct evlist *evlist) struct parse_events_error err; int ret; - bzero(&err, sizeof(err)); + parse_events_error__init(&err); ret = parse_events(evlist, "probe:vfs_getname*", &err); - if (ret) { - free(err.str); - free(err.help); - free(err.first_str); - free(err.first_help); + parse_events_error__exit(&err); + if (ret) return false; - } evlist__for_each_entry_safe(evlist, evsel, tmp) { if (!strstarts(evsel__name(evsel), "probe:vfs_getname")) @@ -3095,6 +3158,16 @@ static struct evsel *evsel__new_pgfault(u64 config) return evsel; } +static void evlist__free_syscall_tp_fields(struct evlist *evlist) +{ + struct evsel *evsel; + + evlist__for_each_entry(evlist, evsel) { + evsel_trace__delete(evsel->priv); + evsel->priv = NULL; + } +} + static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample) { const u32 type = event->header.type; @@ -3105,7 +3178,7 @@ static void trace__handle_event(struct trace *trace, union perf_event *event, st return; } - evsel = perf_evlist__id2evsel(trace->evlist, sample->id); + evsel = evlist__id2evsel(trace->evlist, sample->id); if (evsel == NULL) { fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id); return; @@ -3203,33 +3276,24 @@ out_enomem: goto out; } -#ifdef 
HAVE_LIBBPF_SUPPORT -static struct bpf_map *trace__find_bpf_map_by_name(struct trace *trace, const char *name) -{ - if (trace->bpf_obj == NULL) - return NULL; - - return bpf_object__find_map_by_name(trace->bpf_obj, name); -} - -static void trace__set_bpf_map_filtered_pids(struct trace *trace) -{ - trace->filter_pids.map = trace__find_bpf_map_by_name(trace, "pids_filtered"); -} - -static void trace__set_bpf_map_syscalls(struct trace *trace) -{ - trace->syscalls.map = trace__find_bpf_map_by_name(trace, "syscalls"); - trace->syscalls.prog_array.sys_enter = trace__find_bpf_map_by_name(trace, "syscalls_sys_enter"); - trace->syscalls.prog_array.sys_exit = trace__find_bpf_map_by_name(trace, "syscalls_sys_exit"); -} - +#ifdef HAVE_BPF_SKEL static struct bpf_program *trace__find_bpf_program_by_title(struct trace *trace, const char *name) { - if (trace->bpf_obj == NULL) + struct bpf_program *pos, *prog = NULL; + const char *sec_name; + + if (trace->skel->obj == NULL) return NULL; - return bpf_object__find_program_by_title(trace->bpf_obj, name); + bpf_object__for_each_program(pos, trace->skel->obj) { + sec_name = bpf_program__section_name(pos); + if (sec_name && !strcmp(sec_name, name)) { + prog = pos; + break; + } + } + + return prog; } static struct bpf_program *trace__find_syscall_bpf_prog(struct trace *trace, struct syscall *sc, @@ -3239,12 +3303,12 @@ static struct bpf_program *trace__find_syscall_bpf_prog(struct trace *trace, str if (prog_name == NULL) { char default_prog_name[256]; - scnprintf(default_prog_name, sizeof(default_prog_name), "!syscalls:sys_%s_%s", type, sc->name); + scnprintf(default_prog_name, sizeof(default_prog_name), "tp/syscalls/sys_%s_%s", type, sc->name); prog = trace__find_bpf_program_by_title(trace, default_prog_name); if (prog != NULL) goto out_found; if (sc->fmt && sc->fmt->alias) { - scnprintf(default_prog_name, sizeof(default_prog_name), "!syscalls:sys_%s_%s", type, sc->fmt->alias); + scnprintf(default_prog_name, sizeof(default_prog_name), "tp/syscalls/sys_%s_%s", type, sc->fmt->alias); prog = trace__find_bpf_program_by_title(trace, default_prog_name); if (prog != NULL) goto out_found; @@ -3262,7 +3326,7 @@ out_found: pr_debug("Couldn't find BPF prog \"%s\" to associate with syscalls:sys_%s_%s, not augmenting it\n", prog_name, type, sc->name); out_unaugmented: - return trace->syscalls.unaugmented_prog; + return trace->skel->progs.syscall_unaugmented; } static void trace__init_syscall_bpf_progs(struct trace *trace, int id) @@ -3279,87 +3343,13 @@ static void trace__init_syscall_bpf_progs(struct trace *trace, int id) static int trace__bpf_prog_sys_enter_fd(struct trace *trace, int id) { struct syscall *sc = trace__syscall_info(trace, NULL, id); - return sc ? bpf_program__fd(sc->bpf_prog.sys_enter) : bpf_program__fd(trace->syscalls.unaugmented_prog); + return sc ? bpf_program__fd(sc->bpf_prog.sys_enter) : bpf_program__fd(trace->skel->progs.syscall_unaugmented); } static int trace__bpf_prog_sys_exit_fd(struct trace *trace, int id) { struct syscall *sc = trace__syscall_info(trace, NULL, id); - return sc ? 
bpf_program__fd(sc->bpf_prog.sys_exit) : bpf_program__fd(trace->syscalls.unaugmented_prog); -} - -static void trace__init_bpf_map_syscall_args(struct trace *trace, int id, struct bpf_map_syscall_entry *entry) -{ - struct syscall *sc = trace__syscall_info(trace, NULL, id); - int arg = 0; - - if (sc == NULL) - goto out; - - for (; arg < sc->nr_args; ++arg) { - entry->string_args_len[arg] = 0; - if (sc->arg_fmt[arg].scnprintf == SCA_FILENAME) { - /* Should be set like strace -s strsize */ - entry->string_args_len[arg] = PATH_MAX; - } - } -out: - for (; arg < 6; ++arg) - entry->string_args_len[arg] = 0; -} -static int trace__set_ev_qualifier_bpf_filter(struct trace *trace) -{ - int fd = bpf_map__fd(trace->syscalls.map); - struct bpf_map_syscall_entry value = { - .enabled = !trace->not_ev_qualifier, - }; - int err = 0; - size_t i; - - for (i = 0; i < trace->ev_qualifier_ids.nr; ++i) { - int key = trace->ev_qualifier_ids.entries[i]; - - if (value.enabled) { - trace__init_bpf_map_syscall_args(trace, key, &value); - trace__init_syscall_bpf_progs(trace, key); - } - - err = bpf_map_update_elem(fd, &key, &value, BPF_EXIST); - if (err) - break; - } - - return err; -} - -static int __trace__init_syscalls_bpf_map(struct trace *trace, bool enabled) -{ - int fd = bpf_map__fd(trace->syscalls.map); - struct bpf_map_syscall_entry value = { - .enabled = enabled, - }; - int err = 0, key; - - for (key = 0; key < trace->sctbl->syscalls.nr_entries; ++key) { - if (enabled) - trace__init_bpf_map_syscall_args(trace, key, &value); - - err = bpf_map_update_elem(fd, &key, &value, BPF_ANY); - if (err) - break; - } - - return err; -} - -static int trace__init_syscalls_bpf_map(struct trace *trace) -{ - bool enabled = true; - - if (trace->ev_qualifier_ids.nr) - enabled = trace->not_ev_qualifier; - - return __trace__init_syscalls_bpf_map(trace, enabled); + return sc ? bpf_program__fd(sc->bpf_prog.sys_exit) : bpf_program__fd(trace->skel->progs.syscall_unaugmented); } static struct bpf_program *trace__find_usable_bpf_prog_entry(struct trace *trace, struct syscall *sc) @@ -3384,7 +3374,7 @@ try_to_find_pair: bool is_candidate = false; if (pair == NULL || pair == sc || - pair->bpf_prog.sys_enter == trace->syscalls.unaugmented_prog) + pair->bpf_prog.sys_enter == trace->skel->progs.syscall_unaugmented) continue; for (field = sc->args, candidate_field = pair->args; @@ -3408,6 +3398,19 @@ try_to_find_pair: if (strcmp(field->type, candidate_field->type)) goto next_candidate; + /* + * This is limited in the BPF program but sys_write + * uses "const char *" for its "buf" arg so we need to + * use some heuristic that is kinda future proof... + */ + if (strcmp(field->type, "const char *") == 0 && + !(strstr(field->name, "name") || + strstr(field->name, "path") || + strstr(field->name, "file") || + strstr(field->name, "root") || + strstr(field->name, "description"))) + goto next_candidate; + is_candidate = true; } @@ -3437,7 +3440,7 @@ try_to_find_pair: */ if (pair_prog == NULL) { pair_prog = trace__find_syscall_bpf_prog(trace, pair, pair->fmt ? 
pair->fmt->bpf_prog_name.sys_enter : NULL, "enter"); - if (pair_prog == trace->syscalls.unaugmented_prog) + if (pair_prog == trace->skel->progs.syscall_unaugmented) goto next_candidate; } @@ -3452,8 +3455,8 @@ try_to_find_pair: static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace) { - int map_enter_fd = bpf_map__fd(trace->syscalls.prog_array.sys_enter), - map_exit_fd = bpf_map__fd(trace->syscalls.prog_array.sys_exit); + int map_enter_fd = bpf_map__fd(trace->skel->maps.syscalls_sys_enter); + int map_exit_fd = bpf_map__fd(trace->skel->maps.syscalls_sys_exit); int err = 0, key; for (key = 0; key < trace->sctbl->syscalls.nr_entries; ++key) { @@ -3515,7 +3518,7 @@ static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace) * For now we're just reusing the sys_enter prog, and if it * already has an augmenter, we don't need to find one. */ - if (sc->bpf_prog.sys_enter != trace->syscalls.unaugmented_prog) + if (sc->bpf_prog.sys_enter != trace->skel->progs.syscall_unaugmented) continue; /* @@ -3538,89 +3541,12 @@ static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace) break; } - return err; } - -static void trace__delete_augmented_syscalls(struct trace *trace) -{ - struct evsel *evsel, *tmp; - - evlist__remove(trace->evlist, trace->syscalls.events.augmented); - evsel__delete(trace->syscalls.events.augmented); - trace->syscalls.events.augmented = NULL; - - evlist__for_each_entry_safe(trace->evlist, tmp, evsel) { - if (evsel->bpf_obj == trace->bpf_obj) { - evlist__remove(trace->evlist, evsel); - evsel__delete(evsel); - } - - } - - bpf_object__close(trace->bpf_obj); - trace->bpf_obj = NULL; -} -#else // HAVE_LIBBPF_SUPPORT -static struct bpf_map *trace__find_bpf_map_by_name(struct trace *trace __maybe_unused, - const char *name __maybe_unused) -{ - return NULL; -} - -static void trace__set_bpf_map_filtered_pids(struct trace *trace __maybe_unused) -{ -} - -static void trace__set_bpf_map_syscalls(struct trace *trace __maybe_unused) -{ -} - -static int trace__set_ev_qualifier_bpf_filter(struct trace *trace __maybe_unused) -{ - return 0; -} - -static int trace__init_syscalls_bpf_map(struct trace *trace __maybe_unused) -{ - return 0; -} - -static struct bpf_program *trace__find_bpf_program_by_title(struct trace *trace __maybe_unused, - const char *name __maybe_unused) -{ - return NULL; -} - -static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace __maybe_unused) -{ - return 0; -} - -static void trace__delete_augmented_syscalls(struct trace *trace __maybe_unused) -{ -} -#endif // HAVE_LIBBPF_SUPPORT - -static bool trace__only_augmented_syscalls_evsels(struct trace *trace) -{ - struct evsel *evsel; - - evlist__for_each_entry(trace->evlist, evsel) { - if (evsel == trace->syscalls.events.augmented || - evsel->bpf_obj == trace->bpf_obj) - continue; - - return false; - } - - return true; -} +#endif // HAVE_BPF_SKEL static int trace__set_ev_qualifier_filter(struct trace *trace) { - if (trace->syscalls.map) - return trace__set_ev_qualifier_bpf_filter(trace); if (trace->syscalls.events.sys_enter) return trace__set_ev_qualifier_tp_filter(trace); return 0; @@ -3653,20 +3579,22 @@ static int trace__set_filter_loop_pids(struct trace *trace) struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]); while (thread && nr < ARRAY_SIZE(pids)) { - struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid); + struct thread *parent = machine__find_thread(trace->host, + thread__ppid(thread), + thread__ppid(thread)); if 
(parent == NULL) break; if (!strcmp(thread__comm_str(parent), "sshd") || strstarts(thread__comm_str(parent), "gnome-terminal")) { - pids[nr++] = parent->tid; + pids[nr++] = thread__tid(parent); break; } thread = parent; } - err = perf_evlist__append_tp_filter_pids(trace->evlist, nr, pids); + err = evlist__append_tp_filter_pids(trace->evlist, nr, pids); if (!err && trace->filter_pids.map) err = bpf_map__set_filter_pids(trace->filter_pids.map, nr, pids); @@ -3680,11 +3608,11 @@ static int trace__set_filter_pids(struct trace *trace) * Better not use !target__has_task() here because we need to cover the * case where no threads were specified in the command line, but a * workload was, and in that case we will fill in the thread_map when - * we fork the workload in perf_evlist__prepare_workload. + * we fork the workload in evlist__prepare_workload. */ if (trace->filter_pids.nr > 0) { - err = perf_evlist__append_tp_filter_pids(trace->evlist, trace->filter_pids.nr, - trace->filter_pids.entries); + err = evlist__append_tp_filter_pids(trace->evlist, trace->filter_pids.nr, + trace->filter_pids.entries); if (!err && trace->filter_pids.map) { err = bpf_map__set_filter_pids(trace->filter_pids.map, trace->filter_pids.nr, trace->filter_pids.entries); @@ -3700,9 +3628,8 @@ static int __trace__deliver_event(struct trace *trace, union perf_event *event) { struct evlist *evlist = trace->evlist; struct perf_sample sample; - int err; + int err = evlist__parse_sample(evlist, event, &sample); - err = perf_evlist__parse_sample(evlist, event, &sample); if (err) fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err); else @@ -3735,11 +3662,11 @@ static int trace__deliver_event(struct trace *trace, union perf_event *event) if (!trace->sort_events) return __trace__deliver_event(trace, event); - err = perf_evlist__parse_sample_timestamp(trace->evlist, event, &trace->oe.last); + err = evlist__parse_sample_timestamp(trace->evlist, event, &trace->oe.last); if (err && err != -1) return err; - err = ordered_events__queue(&trace->oe.data, event, trace->oe.last, 0); + err = ordered_events__queue(&trace->oe.data, event, trace->oe.last, 0, NULL); if (err) return err; @@ -3920,6 +3847,9 @@ static int trace__run(struct trace *trace, int argc, const char **argv) evlist__add(evlist, pgfault_min); } + /* Enable ignoring missing threads when -u/-p option is defined. 
*/ + trace->opts.ignore_missing_thread = trace->opts.target.uid != UINT_MAX || trace->opts.target.pid; + if (trace->sched && evlist__add_newtp(evlist, "sched", "sched_stat_runtime", trace__sched_stat_runtime)) goto out_error_sched_stat_runtime; @@ -3951,7 +3881,7 @@ static int trace__run(struct trace *trace, int argc, const char **argv) if (trace->cgroup) evlist__set_default_cgroup(trace->evlist, trace->cgroup); - err = perf_evlist__create_maps(evlist, &trace->opts.target); + err = evlist__create_maps(evlist, &trace->opts.target); if (err < 0) { fprintf(trace->output, "Problems parsing the target to trace, check your options!\n"); goto out_delete_evlist; @@ -3963,43 +3893,45 @@ static int trace__run(struct trace *trace, int argc, const char **argv) goto out_delete_evlist; } - perf_evlist__config(evlist, &trace->opts, &callchain_param); - - signal(SIGCHLD, sig_handler); - signal(SIGINT, sig_handler); + evlist__config(evlist, &trace->opts, &callchain_param); if (forks) { - err = perf_evlist__prepare_workload(evlist, &trace->opts.target, - argv, false, NULL); + err = evlist__prepare_workload(evlist, &trace->opts.target, argv, false, NULL); if (err < 0) { fprintf(trace->output, "Couldn't run the workload!\n"); goto out_delete_evlist; } + workload_pid = evlist->workload.pid; } err = evlist__open(evlist); if (err < 0) goto out_error_open; +#ifdef HAVE_BPF_SKEL + if (trace->syscalls.events.bpf_output) { + struct perf_cpu cpu; - err = bpf__apply_obj_config(); - if (err) { - char errbuf[BUFSIZ]; - - bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf)); - pr_err("ERROR: Apply config to BPF failed: %s\n", - errbuf); - goto out_error_open; + /* + * Set up the __augmented_syscalls__ BPF map to hold for each + * CPU the bpf-output event's file descriptor. 
+ */ + perf_cpu_map__for_each_cpu(cpu, i, trace->syscalls.events.bpf_output->core.cpus) { + bpf_map__update_elem(trace->skel->maps.__augmented_syscalls__, + &cpu.cpu, sizeof(int), + xyarray__entry(trace->syscalls.events.bpf_output->core.fd, + cpu.cpu, 0), + sizeof(__u32), BPF_ANY); + } } - +#endif err = trace__set_filter_pids(trace); if (err < 0) goto out_error_mem; - if (trace->syscalls.map) - trace__init_syscalls_bpf_map(trace); - - if (trace->syscalls.prog_array.sys_enter) +#ifdef HAVE_BPF_SKEL + if (trace->skel && trace->skel->progs.sys_enter) trace__init_syscalls_bpf_prog_array_maps(trace); +#endif if (trace->ev_qualifier_ids.nr > 0) { err = trace__set_ev_qualifier_filter(trace); @@ -4028,31 +3960,28 @@ static int trace__run(struct trace *trace, int argc, const char **argv) err = trace__expand_filters(trace, &evsel); if (err) goto out_delete_evlist; - err = perf_evlist__apply_filters(evlist, &evsel); + err = evlist__apply_filters(evlist, &evsel); if (err < 0) goto out_error_apply_filters; - if (trace->dump.map) - bpf_map__fprintf(trace->dump.map, trace->output); - err = evlist__mmap(evlist, trace->opts.mmap_pages); if (err < 0) goto out_error_mmap; - if (!target__none(&trace->opts.target) && !trace->opts.initial_delay) + if (!target__none(&trace->opts.target) && !trace->opts.target.initial_delay) evlist__enable(evlist); if (forks) - perf_evlist__start_workload(evlist); + evlist__start_workload(evlist); - if (trace->opts.initial_delay) { - usleep(trace->opts.initial_delay * 1000); + if (trace->opts.target.initial_delay) { + usleep(trace->opts.target.initial_delay * 1000); evlist__enable(evlist); } trace->multiple_threads = perf_thread_map__pid(evlist->core.threads, 0) == -1 || - evlist->core.threads->nr > 1 || - evlist__first(evlist)->core.attr.inherit; + perf_thread_map__nr(evlist->core.threads) > 1 || + evlist__first(evlist)->core.attr.inherit; /* * Now that we already used evsel->core.attr to ask the kernel to setup the @@ -4135,7 +4064,7 @@ out_disable: out_delete_evlist: trace__symbols__exit(trace); - + evlist__free_syscall_tp_fields(evlist); evlist__delete(evlist); cgroup__put(trace->cgroup); trace->evlist = NULL; @@ -4210,7 +4139,7 @@ static int trace__replay(struct trace *trace) /* add tid to output */ trace->multiple_threads = true; - session = perf_session__new(&data, false, &trace->tool); + session = perf_session__new(&data, &trace->tool); if (IS_ERR(session)) return PTR_ERR(session); @@ -4229,12 +4158,11 @@ static int trace__replay(struct trace *trace) if (err) goto out; - evsel = perf_evlist__find_tracepoint_by_name(session->evlist, - "raw_syscalls:sys_enter"); + evsel = evlist__find_tracepoint_by_name(session->evlist, "raw_syscalls:sys_enter"); + trace->syscalls.events.sys_enter = evsel; /* older kernels have syscalls tp versus raw_syscalls */ if (evsel == NULL) - evsel = perf_evlist__find_tracepoint_by_name(session->evlist, - "syscalls:sys_enter"); + evsel = evlist__find_tracepoint_by_name(session->evlist, "syscalls:sys_enter"); if (evsel && (evsel__init_raw_syscall_tp(evsel, trace__sys_enter) < 0 || @@ -4243,11 +4171,10 @@ static int trace__replay(struct trace *trace) goto out; } - evsel = perf_evlist__find_tracepoint_by_name(session->evlist, - "raw_syscalls:sys_exit"); + evsel = evlist__find_tracepoint_by_name(session->evlist, "raw_syscalls:sys_exit"); + trace->syscalls.events.sys_exit = evsel; if (evsel == NULL) - evsel = perf_evlist__find_tracepoint_by_name(session->evlist, - "syscalls:sys_exit"); + evsel = evlist__find_tracepoint_by_name(session->evlist, 
"syscalls:sys_exit"); if (evsel && (evsel__init_raw_syscall_tp(evsel, trace__sys_exit) < 0 || perf_evsel__init_sc_tp_uint_field(evsel, ret))) { @@ -4337,12 +4264,11 @@ static size_t thread__dump_stats(struct thread_trace *ttrace, printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct); if (trace->errno_summary && stats->nr_failures) { - const char *arch_name = perf_env__arch(trace->host->env); int e; for (e = 0; e < stats->max_errno; ++e) { if (stats->errnos[e] != 0) - fprintf(fp, "\t\t\t\t%s: %d\n", arch_syscalls__strerrno(arch_name, e + 1), stats->errnos[e]); + fprintf(fp, "\t\t\t\t%s: %d\n", perf_env__arch_strerrno(trace->host->env, e + 1), stats->errnos[e]); } } } @@ -4365,7 +4291,7 @@ static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trac ratio = (double)ttrace->nr_events / trace->nr_events * 100.0; - printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid); + printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread__tid(thread)); printed += fprintf(fp, "%lu events, ", ttrace->nr_events); printed += fprintf(fp, "%.1f%%", ratio); if (ttrace->pfmaj) @@ -4387,32 +4313,38 @@ static unsigned long thread__nr_events(struct thread_trace *ttrace) return ttrace ? ttrace->nr_events : 0; } -DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)), - struct thread *thread; -) +static int trace_nr_events_cmp(void *priv __maybe_unused, + const struct list_head *la, + const struct list_head *lb) { - entry->thread = rb_entry(nd, struct thread, rb_node); + struct thread_list *a = list_entry(la, struct thread_list, list); + struct thread_list *b = list_entry(lb, struct thread_list, list); + unsigned long a_nr_events = thread__nr_events(thread__priv(a->thread)); + unsigned long b_nr_events = thread__nr_events(thread__priv(b->thread)); + + if (a_nr_events != b_nr_events) + return a_nr_events < b_nr_events ? -1 : 1; + + /* Identical number of threads, place smaller tids first. */ + return thread__tid(a->thread) < thread__tid(b->thread) + ? -1 + : (thread__tid(a->thread) > thread__tid(b->thread) ? 
1 : 0); } static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp) { size_t printed = trace__fprintf_threads_header(fp); - struct rb_node *nd; - int i; + LIST_HEAD(threads); - for (i = 0; i < THREADS__TABLE_SIZE; i++) { - DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i); - - if (threads == NULL) { - fprintf(fp, "%s", "Error sorting output by nr_events!\n"); - return 0; - } + if (machine__thread_list(trace->host, &threads) == 0) { + struct thread_list *pos; - resort_rb__for_each_entry(nd, threads) - printed += trace__fprintf_thread(fp, threads_entry->thread, trace); + list_sort(NULL, &threads, trace_nr_events_cmp); - resort_rb__delete(threads); + list_for_each_entry(pos, &threads, list) + printed += trace__fprintf_thread(fp, pos->thread, trace); } + thread_list__delete(&threads); return printed; } @@ -4506,7 +4438,7 @@ static void evsel__set_syscall_arg_fmt(struct evsel *evsel, const char *name) struct syscall_arg_fmt *fmt = evsel__syscall_arg_fmt(evsel); if (fmt) { - struct syscall_fmt *scfmt = syscall_fmt__find(name); + const struct syscall_fmt *scfmt = syscall_fmt__find(name); if (scfmt) { int skip = 0; @@ -4573,7 +4505,7 @@ static int trace__parse_events_option(const struct option *opt, const char *str, int len = strlen(str) + 1, err = -1, list, idx; char *strace_groups_dir = system_path(STRACE_GROUPS_DIR); char group_name[PATH_MAX]; - struct syscall_fmt *fmt; + const struct syscall_fmt *fmt; if (strace_groups_dir == NULL) return -1; @@ -4639,12 +4571,18 @@ do_concat: err = 0; if (lists[0]) { - struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event", - "event selector. use 'perf list' to list available events", - parse_events_option); + struct parse_events_option_args parse_events_option_args = { + .evlistp = &trace->evlist, + }; + struct option o = { + .value = &parse_events_option_args, + }; err = parse_events_option(&o, lists[0], 0); } out: + free(strace_groups_dir); + free(lists[0]); + free(lists[1]); if (sep) *sep = ','; @@ -4655,9 +4593,12 @@ static int trace__parse_cgroups(const struct option *opt, const char *str, int u { struct trace *trace = opt->value; - if (!list_empty(&trace->evlist->core.entries)) - return parse_cgroups(opt, str, unset); - + if (!list_empty(&trace->evlist->core.entries)) { + struct option o = { + .value = &trace->evlist, + }; + return parse_cgroups(&o, str, unset); + } trace->cgroup = evlist__findnew_cgroup(trace->evlist, str); return 0; @@ -4707,6 +4648,33 @@ out: return err; } +static void trace__exit(struct trace *trace) +{ + int i; + + strlist__delete(trace->ev_qualifier); + zfree(&trace->ev_qualifier_ids.entries); + if (trace->syscalls.table) { + for (i = 0; i <= trace->sctbl->syscalls.max_id; i++) + syscall__exit(&trace->syscalls.table[i]); + zfree(&trace->syscalls.table); + } + syscalltbl__delete(trace->sctbl); + zfree(&trace->perfconfig_events); +} + +#ifdef HAVE_BPF_SKEL +static int bpf__setup_bpf_output(struct evlist *evlist) +{ + int err = parse_event(evlist, "bpf-output/no-inherit=1,name=__augmented_syscalls__/"); + + if (err) + pr_debug("ERROR: failed to create the \"__augmented_syscalls__\" bpf-output event\n"); + + return err; +} +#endif + int cmd_trace(int argc, const char **argv) { const char *trace_usage[] = { @@ -4738,7 +4706,6 @@ int cmd_trace(int argc, const char **argv) .max_stack = UINT_MAX, .max_events = ULONG_MAX, }; - const char *map_dump_str = NULL; const char *output_name = NULL; const struct option trace_options[] = { OPT_CALLBACK('e', "event", &trace, "event", @@ -4766,16 +4733,12 @@ 
int cmd_trace(int argc, const char **argv) OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit, "child tasks do not inherit counters"), OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages", - "number of mmap data pages", - perf_evlist__parse_mmap_pages), + "number of mmap data pages", evlist__parse_mmap_pages), OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user", "user to profile"), OPT_CALLBACK(0, "duration", &trace, "float", "show only events with duration > N.M ms", trace__set_duration), -#ifdef HAVE_LIBBPF_SUPPORT - OPT_STRING(0, "map-dump", &map_dump_str, "BPF map", "BPF map to periodically dump"), -#endif OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"), OPT_INCR('v', "verbose", &verbose, "be more verbose"), OPT_BOOLEAN('T', "time", &trace.full_time, @@ -4816,7 +4779,7 @@ int cmd_trace(int argc, const char **argv) "per thread proc mmap processing timeout in ms"), OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only", trace__parse_cgroups), - OPT_INTEGER('D', "delay", &trace.opts.initial_delay, + OPT_INTEGER('D', "delay", &trace.opts.target.initial_delay, "ms to wait before starting measurement after program " "start"), OPTS_EVSWITCH(&trace.evswitch), @@ -4828,9 +4791,16 @@ int cmd_trace(int argc, const char **argv) const char * const trace_subcommands[] = { "record", NULL }; int err = -1; char bf[BUFSIZ]; + struct sigaction sigchld_act; signal(SIGSEGV, sighandler_dump_stack); signal(SIGFPE, sighandler_dump_stack); + signal(SIGINT, sighandler_interrupt); + + memset(&sigchld_act, 0, sizeof(sigchld_act)); + sigchld_act.sa_flags = SA_SIGINFO; + sigchld_act.sa_sigaction = sighandler_chld; + sigaction(SIGCHLD, &sigchld_act, NULL); trace.evlist = evlist__new(); trace.sctbl = syscalltbl__new(); @@ -4881,12 +4851,13 @@ int cmd_trace(int argc, const char **argv) if (trace.perfconfig_events != NULL) { struct parse_events_error parse_err; - bzero(&parse_err, sizeof(parse_err)); + parse_events_error__init(&parse_err); err = parse_events(trace.evlist, trace.perfconfig_events, &parse_err); - if (err) { - parse_events_print_error(&parse_err, trace.perfconfig_events); + if (err) + parse_events_error__print(&parse_err, trace.perfconfig_events); + parse_events_error__exit(&parse_err); + if (err) goto out; - } } if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) { @@ -4894,87 +4865,48 @@ int cmd_trace(int argc, const char **argv) "cgroup monitoring only available in system-wide mode"); } - evsel = bpf__setup_output_event(trace.evlist, "__augmented_syscalls__"); - if (IS_ERR(evsel)) { - bpf__strerror_setup_output_event(trace.evlist, PTR_ERR(evsel), bf, sizeof(bf)); - pr_err("ERROR: Setup trace syscalls enter failed: %s\n", bf); - goto out; - } - - if (evsel) { - trace.syscalls.events.augmented = evsel; +#ifdef HAVE_BPF_SKEL + if (!trace.trace_syscalls) + goto skip_augmentation; - evsel = perf_evlist__find_tracepoint_by_name(trace.evlist, "raw_syscalls:sys_enter"); - if (evsel == NULL) { - pr_err("ERROR: raw_syscalls:sys_enter not found in the augmented BPF object\n"); - goto out; - } + trace.skel = augmented_raw_syscalls_bpf__open(); + if (!trace.skel) { + pr_debug("Failed to open augmented syscalls BPF skeleton"); + } else { + /* + * Disable attaching the BPF programs except for sys_enter and + * sys_exit that tail call into this as necessary. 
+ */ + struct bpf_program *prog; - if (evsel->bpf_obj == NULL) { - pr_err("ERROR: raw_syscalls:sys_enter not associated to a BPF object\n"); - goto out; + bpf_object__for_each_program(prog, trace.skel->obj) { + if (prog != trace.skel->progs.sys_enter && prog != trace.skel->progs.sys_exit) + bpf_program__set_autoattach(prog, /*autoattach=*/false); } - trace.bpf_obj = evsel->bpf_obj; + err = augmented_raw_syscalls_bpf__load(trace.skel); - /* - * If we have _just_ the augmenter event but don't have a - * explicit --syscalls, then assume we want all strace-like - * syscalls: - */ - if (!trace.trace_syscalls && trace__only_augmented_syscalls_evsels(&trace)) - trace.trace_syscalls = true; - /* - * So, if we have a syscall augmenter, but trace_syscalls, aka - * strace-like syscall tracing is not set, then we need to trow - * away the augmenter, i.e. all the events that were created - * from that BPF object file. - * - * This is more to fix the current .perfconfig trace.add_events - * style of setting up the strace-like eBPF based syscall point - * payload augmenter. - * - * All this complexity will be avoided by adding an alternative - * to trace.add_events in the form of - * trace.bpf_augmented_syscalls, that will be only parsed if we - * need it. - * - * .perfconfig trace.add_events is still useful if we want, for - * instance, have msr_write.msr in some .perfconfig profile based - * 'perf trace --config determinism.profile' mode, where for some - * particular goal/workload type we want a set of events and - * output mode (with timings, etc) instead of having to add - * all via the command line. - * - * Also --config to specify an alternate .perfconfig file needs - * to be implemented. - */ - if (!trace.trace_syscalls) { - trace__delete_augmented_syscalls(&trace); + if (err < 0) { + libbpf_strerror(err, bf, sizeof(bf)); + pr_debug("Failed to load augmented syscalls BPF skeleton: %s\n", bf); } else { - trace__set_bpf_map_filtered_pids(&trace); - trace__set_bpf_map_syscalls(&trace); - trace.syscalls.unaugmented_prog = trace__find_bpf_program_by_title(&trace, "!raw_syscalls:unaugmented"); + augmented_raw_syscalls_bpf__attach(trace.skel); + trace__add_syscall_newtp(&trace); } } - err = bpf__setup_stdout(trace.evlist); + err = bpf__setup_bpf_output(trace.evlist); if (err) { - bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf)); - pr_err("ERROR: Setup BPF stdout failed: %s\n", bf); + libbpf_strerror(err, bf, sizeof(bf)); + pr_err("ERROR: Setup BPF output event failed: %s\n", bf); goto out; } - + trace.syscalls.events.bpf_output = evlist__last(trace.evlist); + assert(!strcmp(evsel__name(trace.syscalls.events.bpf_output), "__augmented_syscalls__")); +skip_augmentation: +#endif err = -1; - if (map_dump_str) { - trace.dump.map = trace__find_bpf_map_by_name(&trace, map_dump_str); - if (trace.dump.map == NULL) { - pr_err("ERROR: BPF map \"%s\" not found\n", map_dump_str); - goto out; - } - } - if (trace.trace_pgfaults) { trace.opts.sample_address = true; trace.opts.sample_time = true; @@ -5025,7 +4957,7 @@ int cmd_trace(int argc, const char **argv) * buffers that are being copied from kernel to userspace, think 'read' * syscall. 
*/ - if (trace.syscalls.events.augmented) { + if (trace.syscalls.events.bpf_output) { evlist__for_each_entry(trace.evlist, evsel) { bool raw_syscalls_sys_exit = strcmp(evsel__name(evsel), "raw_syscalls:sys_exit") == 0; @@ -5034,9 +4966,9 @@ int cmd_trace(int argc, const char **argv) goto init_augmented_syscall_tp; } - if (trace.syscalls.events.augmented->priv == NULL && + if (trace.syscalls.events.bpf_output->priv == NULL && strstr(evsel__name(evsel), "syscalls:sys_enter")) { - struct evsel *augmented = trace.syscalls.events.augmented; + struct evsel *augmented = trace.syscalls.events.bpf_output; if (evsel__init_augmented_syscall_tp(augmented, evsel) || evsel__init_augmented_syscall_tp_args(augmented)) goto out; @@ -5140,6 +5072,9 @@ out_close: if (output_name != NULL) fclose(trace.output); out: - zfree(&trace.perfconfig_events); + trace__exit(&trace); +#ifdef HAVE_BPF_SKEL + augmented_raw_syscalls_bpf__destroy(trace.skel); +#endif return err; } |
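
The patch replaces perf's old bpf-loader plumbing (bpf__setup_output_event(), trace__find_bpf_map_by_name() and friends) with a libbpf skeleton generated from augmented_raw_syscalls.bpf.o. The sketch below condenses the open/load/attach lifecycle that cmd_trace() now follows, under the assumption that "bpf_skel/augmented_raw_syscalls.skel.h" was produced by `bpftool gen skeleton`; the helper name open_and_load_augmenter() is illustrative, error handling is abbreviated, and the per-syscall tail-call wiring done by trace__init_syscalls_bpf_prog_array_maps() is omitted.

/*
 * Minimal sketch of the skeleton lifecycle adopted by this patch.
 * All skeleton and libbpf calls below appear in the diff itself.
 */
#include <bpf/libbpf.h>
#include "bpf_skel/augmented_raw_syscalls.skel.h"

static struct augmented_raw_syscalls_bpf *open_and_load_augmenter(void)
{
	struct augmented_raw_syscalls_bpf *skel;
	struct bpf_program *prog;

	skel = augmented_raw_syscalls_bpf__open();
	if (!skel)
		return NULL;

	/*
	 * Only sys_enter/sys_exit attach directly; every other program
	 * is reached through tail calls from the syscalls_sys_enter and
	 * syscalls_sys_exit PROG_ARRAY maps, so keep libbpf from
	 * attaching those programs on its own.
	 */
	bpf_object__for_each_program(prog, skel->obj) {
		if (prog != skel->progs.sys_enter && prog != skel->progs.sys_exit)
			bpf_program__set_autoattach(prog, /*autoattach=*/false);
	}

	if (augmented_raw_syscalls_bpf__load(skel) != 0 ||
	    augmented_raw_syscalls_bpf__attach(skel) != 0) {
		augmented_raw_syscalls_bpf__destroy(skel);
		return NULL;
	}

	return skel;
}

Once loaded, trace__run() publishes the bpf-output event to the BPF side by storing one perf-event file descriptor per CPU in the __augmented_syscalls__ map, via bpf_map__update_elem(skel->maps.__augmented_syscalls__, &cpu.cpu, sizeof(int), &fd, sizeof(__u32), BPF_ANY), exactly as shown in the hunk above.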
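The signal-handling change is also worth isolating: instead of a plain SIGCHLD handler that sets done unconditionally, the patch installs an SA_SIGINFO handler so the siginfo_t can be inspected and only the traced workload's exit terminates the session. Below is a minimal, self-contained sketch of that pattern, assuming hypothetical fork/waitpid scaffolding around it; workload_pid, done and sighandler_chld mirror the names in the diff.

/*
 * SA_SIGINFO requests the three-argument handler, which receives a
 * siginfo_t carrying the sender's pid in si_pid.
 */
#include <signal.h>
#include <string.h>
#include <sys/wait.h>
#include <unistd.h>

static pid_t workload_pid = -1;
static volatile sig_atomic_t done;

static void sighandler_chld(int sig, siginfo_t *info, void *context)
{
	(void)sig; (void)context;
	if (info->si_pid == workload_pid)  /* ignore other children */
		done = 1;
}

int main(void)
{
	struct sigaction act;

	memset(&act, 0, sizeof(act));
	act.sa_flags = SA_SIGINFO;
	act.sa_sigaction = sighandler_chld;
	sigaction(SIGCHLD, &act, NULL);

	workload_pid = fork();
	if (workload_pid == 0)
		_exit(execlp("true", "true", (char *)NULL));

	while (!done)
		pause();  /* stand-in for the trace event loop */

	return waitpid(workload_pid, NULL, 0) == workload_pid ? 0 : 1;
}

Using volatile sig_atomic_t for done/interrupted (also part of this patch) is the async-signal-safe way to share a flag between a handler and the main loop.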