aboutsummaryrefslogtreecommitdiffstats
path: root/tools/perf/util/stat-shadow.c
diff options
context:
space:
mode:
Diffstat (limited to 'tools/perf/util/stat-shadow.c')
-rw-r--r--tools/perf/util/stat-shadow.c162
1 files changed, 162 insertions, 0 deletions
diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c
index aa9efe08762b..8a2bbd2a4d82 100644
--- a/tools/perf/util/stat-shadow.c
+++ b/tools/perf/util/stat-shadow.c
@@ -36,6 +36,11 @@ static struct stats runtime_dtlb_cache_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_cycles_in_tx_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_transaction_stats[NUM_CTX][MAX_NR_CPUS];
static struct stats runtime_elision_stats[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_topdown_total_slots[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_topdown_slots_issued[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_topdown_slots_retired[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_topdown_fetch_bubbles[NUM_CTX][MAX_NR_CPUS];
+static struct stats runtime_topdown_recovery_bubbles[NUM_CTX][MAX_NR_CPUS];
static bool have_frontend_stalled;
struct stats walltime_nsecs_stats;
@@ -82,6 +87,11 @@ void perf_stat__reset_shadow_stats(void)
sizeof(runtime_transaction_stats));
memset(runtime_elision_stats, 0, sizeof(runtime_elision_stats));
memset(&walltime_nsecs_stats, 0, sizeof(walltime_nsecs_stats));
+ memset(runtime_topdown_total_slots, 0, sizeof(runtime_topdown_total_slots));
+ memset(runtime_topdown_slots_retired, 0, sizeof(runtime_topdown_slots_retired));
+ memset(runtime_topdown_slots_issued, 0, sizeof(runtime_topdown_slots_issued));
+ memset(runtime_topdown_fetch_bubbles, 0, sizeof(runtime_topdown_fetch_bubbles));
+ memset(runtime_topdown_recovery_bubbles, 0, sizeof(runtime_topdown_recovery_bubbles));
}
/*
@@ -105,6 +115,16 @@ void perf_stat__update_shadow_stats(struct perf_evsel *counter, u64 *count,
update_stats(&runtime_transaction_stats[ctx][cpu], count[0]);
else if (perf_stat_evsel__is(counter, ELISION_START))
update_stats(&runtime_elision_stats[ctx][cpu], count[0]);
+ else if (perf_stat_evsel__is(counter, TOPDOWN_TOTAL_SLOTS))
+ update_stats(&runtime_topdown_total_slots[ctx][cpu], count[0]);
+ else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_ISSUED))
+ update_stats(&runtime_topdown_slots_issued[ctx][cpu], count[0]);
+ else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_RETIRED))
+ update_stats(&runtime_topdown_slots_retired[ctx][cpu], count[0]);
+ else if (perf_stat_evsel__is(counter, TOPDOWN_FETCH_BUBBLES))
+ update_stats(&runtime_topdown_fetch_bubbles[ctx][cpu],count[0]);
+ else if (perf_stat_evsel__is(counter, TOPDOWN_RECOVERY_BUBBLES))
+ update_stats(&runtime_topdown_recovery_bubbles[ctx][cpu], count[0]);
else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
update_stats(&runtime_stalled_cycles_front_stats[ctx][cpu], count[0]);
else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND))
@@ -302,6 +322,107 @@ static void print_ll_cache_misses(int cpu,
out->print_metric(out->ctx, color, "%7.2f%%", "of all LL-cache hits", ratio);
}
+/*
+ * High level "TopDown" CPU core pipe line bottleneck break down.
+ *
+ * Basic concept following
+ * Yasin, A Top Down Method for Performance analysis and Counter architecture
+ * ISPASS14
+ *
+ * The CPU pipeline is divided into 4 areas that can be bottlenecks:
+ *
+ * Frontend -> Backend -> Retiring
+ * BadSpeculation in addition means out of order execution that is thrown away
+ * (for example branch mispredictions)
+ * Frontend is instruction decoding.
+ * Backend is execution, like computation and accessing data in memory
+ * Retiring is good execution that is not directly bottlenecked
+ *
+ * The formulas are computed in slots.
+ * A slot is an entry in the pipeline each for the pipeline width
+ * (for example a 4-wide pipeline has 4 slots for each cycle)
+ *
+ * Formulas:
+ * BadSpeculation = ((SlotsIssued - SlotsRetired) + RecoveryBubbles) /
+ * TotalSlots
+ * Retiring = SlotsRetired / TotalSlots
+ * FrontendBound = FetchBubbles / TotalSlots
+ * BackendBound = 1.0 - BadSpeculation - Retiring - FrontendBound
+ *
+ * The kernel provides the mapping to the low level CPU events and any scaling
+ * needed for the CPU pipeline width, for example:
+ *
+ * TotalSlots = Cycles * 4
+ *
+ * The scaling factor is communicated in the sysfs unit.
+ *
+ * In some cases the CPU may not be able to measure all the formulas due to
+ * missing events. In this case multiple formulas are combined, as possible.
+ *
+ * Full TopDown supports more levels to sub-divide each area: for example
+ * BackendBound into computing bound and memory bound. For now we only
+ * support Level 1 TopDown.
+ */
+
+static double sanitize_val(double x)
+{
+ if (x < 0 && x >= -0.02)
+ return 0.0;
+ return x;
+}
+
+static double td_total_slots(int ctx, int cpu)
+{
+ return avg_stats(&runtime_topdown_total_slots[ctx][cpu]);
+}
+
+static double td_bad_spec(int ctx, int cpu)
+{
+ double bad_spec = 0;
+ double total_slots;
+ double total;
+
+ total = avg_stats(&runtime_topdown_slots_issued[ctx][cpu]) -
+ avg_stats(&runtime_topdown_slots_retired[ctx][cpu]) +
+ avg_stats(&runtime_topdown_recovery_bubbles[ctx][cpu]);
+ total_slots = td_total_slots(ctx, cpu);
+ if (total_slots)
+ bad_spec = total / total_slots;
+ return sanitize_val(bad_spec);
+}
+
+static double td_retiring(int ctx, int cpu)
+{
+ double retiring = 0;
+ double total_slots = td_total_slots(ctx, cpu);
+ double ret_slots = avg_stats(&runtime_topdown_slots_retired[ctx][cpu]);
+
+ if (total_slots)
+ retiring = ret_slots / total_slots;
+ return retiring;
+}
+
+static double td_fe_bound(int ctx, int cpu)
+{
+ double fe_bound = 0;
+ double total_slots = td_total_slots(ctx, cpu);
+ double fetch_bub = avg_stats(&runtime_topdown_fetch_bubbles[ctx][cpu]);
+
+ if (total_slots)
+ fe_bound = fetch_bub / total_slots;
+ return fe_bound;
+}
+
+static double td_be_bound(int ctx, int cpu)
+{
+ double sum = (td_fe_bound(ctx, cpu) +
+ td_bad_spec(ctx, cpu) +
+ td_retiring(ctx, cpu));
+ if (sum == 0)
+ return 0;
+ return sanitize_val(1.0 - sum);
+}
+
void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
double avg, int cpu,
struct perf_stat_output_ctx *out)
@@ -309,6 +430,7 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
void *ctxp = out->ctx;
print_metric_t print_metric = out->print_metric;
double total, ratio = 0.0, total2;
+ const char *color = NULL;
int ctx = evsel_context(evsel);
if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
@@ -452,6 +574,46 @@ void perf_stat__print_shadow_stats(struct perf_evsel *evsel,
avg / ratio);
else
print_metric(ctxp, NULL, NULL, "CPUs utilized", 0);
+ } else if (perf_stat_evsel__is(evsel, TOPDOWN_FETCH_BUBBLES)) {
+ double fe_bound = td_fe_bound(ctx, cpu);
+
+ if (fe_bound > 0.2)
+ color = PERF_COLOR_RED;
+ print_metric(ctxp, color, "%8.1f%%", "frontend bound",
+ fe_bound * 100.);
+ } else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_RETIRED)) {
+ double retiring = td_retiring(ctx, cpu);
+
+ if (retiring > 0.7)
+ color = PERF_COLOR_GREEN;
+ print_metric(ctxp, color, "%8.1f%%", "retiring",
+ retiring * 100.);
+ } else if (perf_stat_evsel__is(evsel, TOPDOWN_RECOVERY_BUBBLES)) {
+ double bad_spec = td_bad_spec(ctx, cpu);
+
+ if (bad_spec > 0.1)
+ color = PERF_COLOR_RED;
+ print_metric(ctxp, color, "%8.1f%%", "bad speculation",
+ bad_spec * 100.);
+ } else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_ISSUED)) {
+ double be_bound = td_be_bound(ctx, cpu);
+ const char *name = "backend bound";
+ static int have_recovery_bubbles = -1;
+
+ /* In case the CPU does not support topdown-recovery-bubbles */
+ if (have_recovery_bubbles < 0)
+ have_recovery_bubbles = pmu_have_event("cpu",
+ "topdown-recovery-bubbles");
+ if (!have_recovery_bubbles)
+ name = "backend bound/bad spec";
+
+ if (be_bound > 0.2)
+ color = PERF_COLOR_RED;
+ if (td_total_slots(ctx, cpu) > 0)
+ print_metric(ctxp, color, "%8.1f%%", name,
+ be_bound * 100.);
+ else
+ print_metric(ctxp, NULL, NULL, name, 0);
} else if (runtime_nsecs_stats[cpu].n != 0) {
char unit = 'M';
char unit_buf[10];