diff options
author | 2014-12-08 07:45:45 +0100 | |
---|---|---|
committer | 2014-12-08 07:45:45 +0100 | |
commit | cfa0bd52d0ba9b852f76c7b3f1055edd5e5c7846 (patch) | |
tree | 1076abb283310578da29e08b0eda281388ce22ea /tools/perf/util | |
parent | Merge tag 'perf-core-for-mingo' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux into perf/core (diff) | |
parent | perf report: In branch stack mode use address history sorting (diff) | |
download | linux-dev-cfa0bd52d0ba9b852f76c7b3f1055edd5e5c7846.tar.xz linux-dev-cfa0bd52d0ba9b852f76c7b3f1055edd5e5c7846.zip |
Merge tag 'perf-core-for-mingo-2' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux into perf/core
Pull perf/core improvements from Arnaldo Carvalho de Melo:

User visible changes:

  - Support handling complete branch stacks as histograms (Andi Kleen)

Infrastructure changes:

  - Prep work for supporting per-pkg and snapshot counters in 'perf stat' (Jiri Olsa)

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'tools/perf/util')
-rw-r--r-- | tools/perf/util/callchain.c | 4 |
-rw-r--r-- | tools/perf/util/callchain.h | 1 |
-rw-r--r-- | tools/perf/util/evsel.c | 34 |
-rw-r--r-- | tools/perf/util/evsel.h | 30 |
-rw-r--r-- | tools/perf/util/machine.c | 126 |
-rw-r--r-- | tools/perf/util/symbol.h | 3 |
6 files changed, 125 insertions, 73 deletions
diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c
index 517ed84db97a..cf524a35cc84 100644
--- a/tools/perf/util/callchain.c
+++ b/tools/perf/util/callchain.c
@@ -149,6 +149,10 @@ static int parse_callchain_sort_key(const char *value)
 		callchain_param.key = CCKEY_ADDRESS;
 		return 0;
 	}
+	if (!strncmp(value, "branch", strlen(value))) {
+		callchain_param.branch_callstack = 1;
+		return 0;
+	}
 	return -1;
 }
 
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index 3f158474c892..dbc08cf5f970 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -63,6 +63,7 @@ struct callchain_param {
 	sort_chain_func_t sort;
 	enum chain_order order;
 	enum chain_key key;
+	bool branch_callstack;
 };
 
 extern struct callchain_param callchain_param;
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 2d26b7ad6fe0..1e90c8557ede 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -954,40 +954,6 @@ int __perf_evsel__read_on_cpu(struct perf_evsel *evsel,
 	return 0;
 }
 
-int __perf_evsel__read(struct perf_evsel *evsel,
-		       int ncpus, int nthreads, bool scale)
-{
-	size_t nv = scale ? 3 : 1;
-	int cpu, thread;
-	struct perf_counts_values *aggr = &evsel->counts->aggr, count;
-
-	if (evsel->system_wide)
-		nthreads = 1;
-
-	aggr->val = aggr->ena = aggr->run = 0;
-
-	for (cpu = 0; cpu < ncpus; cpu++) {
-		for (thread = 0; thread < nthreads; thread++) {
-			if (FD(evsel, cpu, thread) < 0)
-				continue;
-
-			if (readn(FD(evsel, cpu, thread),
-				  &count, nv * sizeof(u64)) < 0)
-				return -errno;
-
-			aggr->val += count.val;
-			if (scale) {
-				aggr->ena += count.ena;
-				aggr->run += count.run;
-			}
-		}
-	}
-
-	perf_evsel__compute_deltas(evsel, -1, aggr);
-	perf_counts_values__scale(aggr, scale, &evsel->counts->scaled);
-	return 0;
-}
-
 static int get_group_fd(struct perf_evsel *evsel, int cpu, int thread)
 {
 	struct perf_evsel *leader = evsel->leader;
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index b18d58da580b..38622747d130 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -93,6 +93,7 @@ struct perf_evsel {
 	bool system_wide;
 	bool tracking;
 	bool per_pkg;
+	unsigned long *per_pkg_mask;
 	/* parse modifier helper */
 	int exclude_GH;
 	int nr_members;
@@ -271,35 +272,6 @@ static inline int perf_evsel__read_on_cpu_scaled(struct perf_evsel *evsel,
 	return __perf_evsel__read_on_cpu(evsel, cpu, thread, true);
 }
 
-int __perf_evsel__read(struct perf_evsel *evsel, int ncpus, int nthreads,
-		       bool scale);
-
-/**
- * perf_evsel__read - Read the aggregate results on all CPUs
- *
- * @evsel - event selector to read value
- * @ncpus - Number of cpus affected, from zero
- * @nthreads - Number of threads affected, from zero
- */
-static inline int perf_evsel__read(struct perf_evsel *evsel,
-				   int ncpus, int nthreads)
-{
-	return __perf_evsel__read(evsel, ncpus, nthreads, false);
-}
-
-/**
- * perf_evsel__read_scaled - Read the aggregate results on all CPUs, scaled
- *
- * @evsel - event selector to read value
- * @ncpus - Number of cpus affected, from zero
- * @nthreads - Number of threads affected, from zero
- */
-static inline int perf_evsel__read_scaled(struct perf_evsel *evsel,
-					  int ncpus, int nthreads)
-{
-	return __perf_evsel__read(evsel, ncpus, nthreads, true);
-}
-
 int perf_evsel__parse_sample(struct perf_evsel *evsel, union perf_event *event,
 			     struct perf_sample *sample);
 
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index b75b487574c7..15dd0a9691ce 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -12,6 +12,7 @@
 #include <stdbool.h>
 #include <symbol/kallsyms.h>
 #include "unwind.h"
+#include "linux/hash.h"
 
 static void dsos__init(struct dsos *dsos)
 {
@@ -1391,7 +1392,11 @@ static int add_callchain_ip(struct thread *thread,
 
 	al.filtered = 0;
 	al.sym = NULL;
-	thread__find_addr_location(thread, cpumode, MAP__FUNCTION,
+	if (cpumode == -1)
+		thread__find_cpumode_addr_location(thread, MAP__FUNCTION,
+						   ip, &al);
+	else
+		thread__find_addr_location(thread, cpumode, MAP__FUNCTION,
 				   ip, &al);
 	if (al.sym != NULL) {
 		if (sort__has_parent && !*parent &&
@@ -1427,8 +1432,50 @@ struct branch_info *sample__resolve_bstack(struct perf_sample *sample,
 	return bi;
 }
 
+#define CHASHSZ 127
+#define CHASHBITS 7
+#define NO_ENTRY 0xff
+
+#define PERF_MAX_BRANCH_DEPTH 127
+
+/* Remove loops. */
+static int remove_loops(struct branch_entry *l, int nr)
+{
+	int i, j, off;
+	unsigned char chash[CHASHSZ];
+
+	memset(chash, NO_ENTRY, sizeof(chash));
+
+	BUG_ON(PERF_MAX_BRANCH_DEPTH > 255);
+
+	for (i = 0; i < nr; i++) {
+		int h = hash_64(l[i].from, CHASHBITS) % CHASHSZ;
+
+		/* no collision handling for now */
+		if (chash[h] == NO_ENTRY) {
+			chash[h] = i;
+		} else if (l[chash[h]].from == l[i].from) {
+			bool is_loop = true;
+			/* check if it is a real loop */
+			off = 0;
+			for (j = chash[h]; j < i && i + off < nr; j++, off++)
+				if (l[j].from != l[i + off].from) {
+					is_loop = false;
+					break;
+				}
+			if (is_loop) {
+				memmove(l + i, l + i + off,
+					(nr - (i + off)) * sizeof(*l));
+				nr -= off;
+			}
+		}
+	}
+	return nr;
+}
+
 static int thread__resolve_callchain_sample(struct thread *thread,
 					    struct ip_callchain *chain,
+					    struct branch_stack *branch,
 					    struct symbol **parent,
 					    struct addr_location *root_al,
 					    int max_stack)
@@ -1438,22 +1485,82 @@ static int thread__resolve_callchain_sample(struct thread *thread,
 	int i;
 	int j;
 	int err;
-	int skip_idx __maybe_unused;
+	int skip_idx = -1;
+	int first_call = 0;
+
+	/*
+	 * Based on DWARF debug information, some architectures skip
+	 * a callchain entry saved by the kernel.
+	 */
+	if (chain->nr < PERF_MAX_STACK_DEPTH)
+		skip_idx = arch_skip_callchain_idx(thread, chain);
 
 	callchain_cursor_reset(&callchain_cursor);
 
+	/*
+	 * Add branches to call stack for easier browsing. This gives
+	 * more context for a sample than just the callers.
+	 *
+	 * This uses individual histograms of paths compared to the
+	 * aggregated histograms the normal LBR mode uses.
+	 *
+	 * Limitations for now:
+	 * - No extra filters
+	 * - No annotations (should annotate somehow)
+	 */
+
+	if (branch && callchain_param.branch_callstack) {
+		int nr = min(max_stack, (int)branch->nr);
+		struct branch_entry be[nr];
+
+		if (branch->nr > PERF_MAX_BRANCH_DEPTH) {
+			pr_warning("corrupted branch chain. skipping...\n");
+			goto check_calls;
+		}
+
+		for (i = 0; i < nr; i++) {
+			if (callchain_param.order == ORDER_CALLEE) {
+				be[i] = branch->entries[i];
+				/*
+				 * Check for overlap into the callchain.
+				 * The return address is one off compared to
+				 * the branch entry. To adjust for this
+				 * assume the calling instruction is not longer
+				 * than 8 bytes.
+				 */
+				if (i == skip_idx ||
+				    chain->ips[first_call] >= PERF_CONTEXT_MAX)
+					first_call++;
+				else if (be[i].from < chain->ips[first_call] &&
+				    be[i].from >= chain->ips[first_call] - 8)
+					first_call++;
+			} else
+				be[i] = branch->entries[branch->nr - i - 1];
+		}
+
+		nr = remove_loops(be, nr);
+
+		for (i = 0; i < nr; i++) {
+			err = add_callchain_ip(thread, parent, root_al,
+					       -1, be[i].to);
+			if (!err)
+				err = add_callchain_ip(thread, parent, root_al,
+						       -1, be[i].from);
+			if (err == -EINVAL)
+				break;
+			if (err)
+				return err;
+		}
+		chain_nr -= nr;
+	}
+
+check_calls:
 	if (chain->nr > PERF_MAX_STACK_DEPTH) {
 		pr_warning("corrupted callchain. skipping...\n");
 		return 0;
 	}
 
-	/*
-	 * Based on DWARF debug information, some architectures skip
-	 * a callchain entry saved by the kernel.
-	 */
-	skip_idx = arch_skip_callchain_idx(thread, chain);
-
-	for (i = 0; i < chain_nr; i++) {
+	for (i = first_call; i < chain_nr; i++) {
 		u64 ip;
 
 		if (callchain_param.order == ORDER_CALLEE)
@@ -1517,6 +1624,7 @@ int thread__resolve_callchain(struct thread *thread,
 			      int max_stack)
 {
 	int ret = thread__resolve_callchain_sample(thread, sample->callchain,
+						   sample->branch_stack,
 						   parent, root_al, max_stack);
 	if (ret)
 		return ret;
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index e0b297c50f9d..9d602e9c6f59 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -102,7 +102,8 @@ struct symbol_conf {
 			demangle,
 			demangle_kernel,
 			filter_relative,
-			show_hist_headers;
+			show_hist_headers,
+			branch_callstack;
 	const char *vmlinux_name,
 			*kallsyms_name,
 			*source_prefix,