aboutsummaryrefslogtreecommitdiffstats
path: root/tools/perf
diff options
context:
space:
mode:
Diffstat (limited to 'tools/perf')
-rw-r--r--tools/perf/.gitignore7
-rw-r--r--tools/perf/Documentation/arm-coresight.txt5
-rw-r--r--tools/perf/Documentation/itrace.txt1
-rw-r--r--tools/perf/Documentation/perf-c2c.txt14
-rw-r--r--tools/perf/Documentation/perf-config.txt7
-rw-r--r--tools/perf/Documentation/perf-inject.txt13
-rw-r--r--tools/perf/Documentation/perf-intel-pt.txt13
-rw-r--r--tools/perf/Documentation/perf-lock.txt20
-rw-r--r--tools/perf/Documentation/perf-mem.txt3
-rw-r--r--tools/perf/Documentation/perf-record.txt8
-rw-r--r--tools/perf/Documentation/perf-report.txt3
-rw-r--r--tools/perf/Makefile.config29
-rw-r--r--tools/perf/Makefile.perf18
-rw-r--r--tools/perf/arch/arm/util/auxtrace.c116
-rw-r--r--tools/perf/arch/arm/util/pmu.c3
-rw-r--r--tools/perf/arch/arm64/annotate/instructions.c2
-rw-r--r--tools/perf/arch/arm64/util/Build2
-rw-r--r--tools/perf/arch/arm64/util/hisi-ptt.c188
-rw-r--r--tools/perf/arch/powerpc/entry/syscalls/syscall.tbl28
-rw-r--r--tools/perf/arch/x86/util/intel-pt.c17
-rw-r--r--tools/perf/arch/x86/util/mem-events.c31
-rw-r--r--tools/perf/bench/epoll-ctl.c33
-rw-r--r--tools/perf/bench/epoll-wait.c33
-rw-r--r--tools/perf/bench/futex-hash.c33
-rw-r--r--tools/perf/bench/futex-lock-pi.c33
-rw-r--r--tools/perf/bench/futex-requeue.c33
-rw-r--r--tools/perf/bench/futex-wake-parallel.c33
-rw-r--r--tools/perf/bench/futex-wake.c33
-rw-r--r--tools/perf/bench/numa.c93
-rw-r--r--tools/perf/builtin-c2c.c66
-rw-r--r--tools/perf/builtin-inject.c89
-rw-r--r--tools/perf/builtin-list.c2
-rw-r--r--tools/perf/builtin-lock.c274
-rw-r--r--tools/perf/builtin-mem.c9
-rw-r--r--tools/perf/builtin-record.c251
-rw-r--r--tools/perf/builtin-report.c17
-rw-r--r--tools/perf/builtin-sched.c125
-rw-r--r--tools/perf/builtin-script.c12
-rw-r--r--tools/perf/builtin-stat.c126
-rw-r--r--tools/perf/builtin-timechart.c65
-rw-r--r--tools/perf/builtin-top.c48
-rw-r--r--tools/perf/builtin-trace.c18
-rwxr-xr-xtools/perf/check-headers.sh2
-rw-r--r--tools/perf/perf.c12
-rw-r--r--tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/branch.json (renamed from tools/perf/pmu-events/arch/arm64/arm/cortex-a65/branch.json)0
-rw-r--r--tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/bus.json (renamed from tools/perf/pmu-events/arch/arm64/arm/cortex-a65/bus.json)0
-rw-r--r--tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/cache.json (renamed from tools/perf/pmu-events/arch/arm64/arm/cortex-a65/cache.json)0
-rw-r--r--tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/dpu.json (renamed from tools/perf/pmu-events/arch/arm64/arm/cortex-a65/dpu.json)0
-rw-r--r--tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/exception.json (renamed from tools/perf/pmu-events/arch/arm64/arm/cortex-a65/exception.json)0
-rw-r--r--tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/ifu.json (renamed from tools/perf/pmu-events/arch/arm64/arm/cortex-a65/ifu.json)0
-rw-r--r--tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/instruction.json (renamed from tools/perf/pmu-events/arch/arm64/arm/cortex-a65/instruction.json)0
-rw-r--r--tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/memory.json (renamed from tools/perf/pmu-events/arch/arm64/arm/cortex-a65/memory.json)0
-rw-r--r--tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/pipeline.json (renamed from tools/perf/pmu-events/arch/arm64/arm/cortex-a65/pipeline.json)0
-rw-r--r--tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/memory.json3
-rw-r--r--tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/other.json5
-rw-r--r--tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/branch.json17
-rw-r--r--tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/bus.json17
-rw-r--r--tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/cache.json107
-rw-r--r--tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/exception.json14
-rw-r--r--tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/instruction.json65
-rw-r--r--tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/memory.json23
-rw-r--r--tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/pipeline.json8
-rw-r--r--tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/spe.json14
-rw-r--r--tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/memory.json3
-rw-r--r--tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/other.json5
-rw-r--r--tools/perf/pmu-events/arch/arm64/arm/neoverse-v1/instruction.json30
-rw-r--r--tools/perf/pmu-events/arch/arm64/arm/neoverse-v1/memory.json3
-rw-r--r--tools/perf/pmu-events/arch/arm64/arm/neoverse-v1/other.json5
-rw-r--r--tools/perf/pmu-events/arch/arm64/hisilicon/hip08/metrics.json6
-rw-r--r--tools/perf/pmu-events/arch/arm64/mapfile.csv4
-rw-r--r--tools/perf/pmu-events/arch/powerpc/power10/nest_metrics.json72
-rw-r--r--tools/perf/pmu-events/arch/s390/cf_z16/pai_crypto.json (renamed from tools/perf/pmu-events/arch/s390/cf_z16/pai.json)0
-rw-r--r--tools/perf/pmu-events/arch/test/test_soc/cpu/metrics.json6
-rw-r--r--tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json1353
-rw-r--r--tools/perf/pmu-events/arch/x86/alderlake/cache.json129
-rw-r--r--tools/perf/pmu-events/arch/x86/alderlake/frontend.json12
-rw-r--r--tools/perf/pmu-events/arch/x86/alderlake/memory.json22
-rw-r--r--tools/perf/pmu-events/arch/x86/alderlake/other.json22
-rw-r--r--tools/perf/pmu-events/arch/x86/alderlake/pipeline.json14
-rw-r--r--tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json679
-rw-r--r--tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json711
-rw-r--r--tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json965
-rw-r--r--tools/perf/pmu-events/arch/x86/broadwellx/uncore-cache.json10
-rw-r--r--tools/perf/pmu-events/arch/x86/broadwellx/uncore-interconnect.json18
-rw-r--r--tools/perf/pmu-events/arch/x86/broadwellx/uncore-memory.json18
-rw-r--r--tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json1285
-rw-r--r--tools/perf/pmu-events/arch/x86/cascadelakex/uncore-memory.json18
-rw-r--r--tools/perf/pmu-events/arch/x86/cascadelakex/uncore-other.json10
-rw-r--r--tools/perf/pmu-events/arch/x86/haswell/cache.json4
-rw-r--r--tools/perf/pmu-events/arch/x86/haswell/frontend.json12
-rw-r--r--tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json570
-rw-r--r--tools/perf/pmu-events/arch/x86/haswellx/cache.json2
-rw-r--r--tools/perf/pmu-events/arch/x86/haswellx/frontend.json12
-rw-r--r--tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json919
-rw-r--r--tools/perf/pmu-events/arch/x86/haswellx/uncore-interconnect.json18
-rw-r--r--tools/perf/pmu-events/arch/x86/haswellx/uncore-memory.json18
-rw-r--r--tools/perf/pmu-events/arch/x86/icelake/cache.json6
-rw-r--r--tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json808
-rw-r--r--tools/perf/pmu-events/arch/x86/icelake/pipeline.json2
-rw-r--r--tools/perf/pmu-events/arch/x86/icelakex/cache.json6
-rw-r--r--tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json1155
-rw-r--r--tools/perf/pmu-events/arch/x86/icelakex/pipeline.json2
-rw-r--r--tools/perf/pmu-events/arch/x86/icelakex/uncore-other.json2
-rw-r--r--tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json594
-rw-r--r--tools/perf/pmu-events/arch/x86/ivytown/cache.json4
-rw-r--r--tools/perf/pmu-events/arch/x86/ivytown/floating-point.json2
-rw-r--r--tools/perf/pmu-events/arch/x86/ivytown/frontend.json18
-rw-r--r--tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json630
-rw-r--r--tools/perf/pmu-events/arch/x86/ivytown/uncore-cache.json58
-rw-r--r--tools/perf/pmu-events/arch/x86/ivytown/uncore-interconnect.json84
-rw-r--r--tools/perf/pmu-events/arch/x86/ivytown/uncore-memory.json2
-rw-r--r--tools/perf/pmu-events/arch/x86/ivytown/uncore-other.json6
-rw-r--r--tools/perf/pmu-events/arch/x86/ivytown/uncore-power.json8
-rw-r--r--tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json327
-rw-r--r--tools/perf/pmu-events/arch/x86/mapfile.csv18
-rw-r--r--tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json315
-rw-r--r--tools/perf/pmu-events/arch/x86/sapphirerapids/cache.json4
-rw-r--r--tools/perf/pmu-events/arch/x86/sapphirerapids/frontend.json11
-rw-r--r--tools/perf/pmu-events/arch/x86/sapphirerapids/pipeline.json4
-rw-r--r--tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json1249
-rw-r--r--tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json861
-rw-r--r--tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json1262
-rw-r--r--tools/perf/pmu-events/arch/x86/skylakex/uncore-memory.json18
-rw-r--r--tools/perf/pmu-events/arch/x86/skylakex/uncore-other.json19
-rw-r--r--tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json810
-rw-r--r--tools/perf/pmu-events/empty-pmu-events.c6
-rw-r--r--tools/perf/tests/attr/base-record2
-rw-r--r--tools/perf/tests/attr/system-wide-dummy2
-rw-r--r--tools/perf/tests/attr/test-record-group4
-rw-r--r--tools/perf/tests/attr/test-record-group-sampling6
-rw-r--r--tools/perf/tests/attr/test-record-group14
-rw-r--r--tools/perf/tests/attr/test-record-group24
-rw-r--r--tools/perf/tests/cpumap.c52
-rw-r--r--tools/perf/tests/event_update.c14
-rw-r--r--tools/perf/tests/expr.c41
-rw-r--r--tools/perf/tests/mmap-basic.c5
-rw-r--r--tools/perf/tests/openat-syscall-all-cpus.c2
-rw-r--r--tools/perf/tests/perf-record.c4
-rw-r--r--tools/perf/tests/shell/coresight/Makefile29
-rw-r--r--tools/perf/tests/shell/coresight/Makefile.miniconfig14
-rwxr-xr-xtools/perf/tests/shell/coresight/asm_pure_loop.sh18
-rw-r--r--tools/perf/tests/shell/coresight/asm_pure_loop/.gitignore1
-rw-r--r--tools/perf/tests/shell/coresight/asm_pure_loop/Makefile34
-rw-r--r--tools/perf/tests/shell/coresight/asm_pure_loop/asm_pure_loop.S28
-rw-r--r--tools/perf/tests/shell/coresight/memcpy_thread/.gitignore1
-rw-r--r--tools/perf/tests/shell/coresight/memcpy_thread/Makefile33
-rw-r--r--tools/perf/tests/shell/coresight/memcpy_thread/memcpy_thread.c79
-rwxr-xr-xtools/perf/tests/shell/coresight/memcpy_thread_16k_10.sh18
-rw-r--r--tools/perf/tests/shell/coresight/thread_loop/.gitignore1
-rw-r--r--tools/perf/tests/shell/coresight/thread_loop/Makefile33
-rw-r--r--tools/perf/tests/shell/coresight/thread_loop/thread_loop.c86
-rwxr-xr-xtools/perf/tests/shell/coresight/thread_loop_check_tid_10.sh19
-rwxr-xr-xtools/perf/tests/shell/coresight/thread_loop_check_tid_2.sh19
-rw-r--r--tools/perf/tests/shell/coresight/unroll_loop_thread/.gitignore1
-rw-r--r--tools/perf/tests/shell/coresight/unroll_loop_thread/Makefile33
-rw-r--r--tools/perf/tests/shell/coresight/unroll_loop_thread/unroll_loop_thread.c74
-rwxr-xr-xtools/perf/tests/shell/coresight/unroll_loop_thread_10.sh18
-rw-r--r--tools/perf/tests/shell/lib/coresight.sh132
-rw-r--r--tools/perf/tests/shell/lib/probe_vfs_getname.sh2
-rw-r--r--tools/perf/tests/shell/lib/waiting.sh77
-rwxr-xr-xtools/perf/tests/shell/lock_contention.sh73
-rwxr-xr-xtools/perf/tests/shell/record.sh2
-rwxr-xr-xtools/perf/tests/shell/stat+csv_output.sh43
-rwxr-xr-xtools/perf/tests/shell/stat+json_output.sh43
-rwxr-xr-xtools/perf/tests/shell/stat_bpf_counters_cgrp.sh83
-rwxr-xr-xtools/perf/tests/shell/test_arm_coresight.sh2
-rwxr-xr-xtools/perf/tests/shell/test_brstack.sh6
-rwxr-xr-xtools/perf/tests/shell/test_data_symbol.sh93
-rwxr-xr-xtools/perf/tests/shell/test_intel_pt.sh622
-rwxr-xr-xtools/perf/tests/shell/test_java_symbol.sh75
-rw-r--r--tools/perf/tests/sigtrap.c65
-rw-r--r--tools/perf/tests/switch-tracking.c15
-rw-r--r--tools/perf/tests/topology.c10
-rw-r--r--tools/perf/tests/vmlinux-kallsyms.c5
-rw-r--r--tools/perf/tests/wp.c10
-rw-r--r--tools/perf/trace/beauty/statx.c1
-rw-r--r--tools/perf/ui/browser.c20
-rw-r--r--tools/perf/ui/browsers/annotate.c23
-rw-r--r--tools/perf/ui/setup.c5
-rw-r--r--tools/perf/ui/tui/helpline.c5
-rw-r--r--tools/perf/ui/tui/progress.c8
-rw-r--r--tools/perf/ui/tui/setup.c8
-rw-r--r--tools/perf/ui/tui/util.c18
-rw-r--r--tools/perf/ui/ui.h4
-rw-r--r--tools/perf/util/Build5
-rwxr-xr-xtools/perf/util/PERF-VERSION-GEN10
-rw-r--r--tools/perf/util/annotate.c34
-rw-r--r--tools/perf/util/annotate.h8
-rw-r--r--tools/perf/util/arm-spe.c2
-rw-r--r--tools/perf/util/auxtrace.c27
-rw-r--r--tools/perf/util/auxtrace.h4
-rw-r--r--tools/perf/util/bpf-event.c5
-rw-r--r--tools/perf/util/bpf-event.h1
-rw-r--r--tools/perf/util/bpf-loader.c24
-rw-r--r--tools/perf/util/bpf_counter_cgroup.c10
-rw-r--r--tools/perf/util/bpf_lock_contention.c26
-rw-r--r--tools/perf/util/bpf_skel/bperf_cgroup.bpf.c42
-rw-r--r--tools/perf/util/bpf_skel/lock_contention.bpf.c5
-rw-r--r--tools/perf/util/bpf_skel/off_cpu.bpf.c18
-rw-r--r--tools/perf/util/branch.c70
-rw-r--r--tools/perf/util/branch.h7
-rw-r--r--tools/perf/util/build-id.c12
-rw-r--r--tools/perf/util/callchain.c12
-rw-r--r--tools/perf/util/config.c31
-rw-r--r--tools/perf/util/config.h1
-rw-r--r--tools/perf/util/cpumap.c39
-rw-r--r--tools/perf/util/cpumap.h2
-rw-r--r--tools/perf/util/cputopo.c61
-rw-r--r--tools/perf/util/cputopo.h5
-rw-r--r--tools/perf/util/dso.c19
-rw-r--r--tools/perf/util/dso.h4
-rw-r--r--tools/perf/util/events_stats.h1
-rw-r--r--tools/perf/util/evlist.c316
-rw-r--r--tools/perf/util/evlist.h13
-rw-r--r--tools/perf/util/evsel.c30
-rw-r--r--tools/perf/util/evsel.h1
-rw-r--r--tools/perf/util/expr.c40
-rw-r--r--tools/perf/util/expr.h25
-rw-r--r--tools/perf/util/expr.l6
-rw-r--r--tools/perf/util/expr.y2
-rw-r--r--tools/perf/util/genelf.c15
-rw-r--r--tools/perf/util/genelf.h8
-rw-r--r--tools/perf/util/header.c24
-rw-r--r--tools/perf/util/hisi-ptt-decoder/Build1
-rw-r--r--tools/perf/util/hisi-ptt-decoder/hisi-ptt-pkt-decoder.c164
-rw-r--r--tools/perf/util/hisi-ptt-decoder/hisi-ptt-pkt-decoder.h31
-rw-r--r--tools/perf/util/hisi-ptt.c192
-rw-r--r--tools/perf/util/hisi-ptt.h19
-rw-r--r--tools/perf/util/hist.c22
-rw-r--r--tools/perf/util/hist.h6
-rw-r--r--tools/perf/util/include/linux/linkage.h13
-rw-r--r--tools/perf/util/intel-pt-decoder/intel-pt-log.c117
-rw-r--r--tools/perf/util/intel-pt-decoder/intel-pt-log.h3
-rw-r--r--tools/perf/util/intel-pt.c32
-rw-r--r--tools/perf/util/jitdump.c7
-rw-r--r--tools/perf/util/lock-contention.h5
-rw-r--r--tools/perf/util/machine.c4
-rw-r--r--tools/perf/util/map.c3
-rw-r--r--tools/perf/util/mem-events.c17
-rw-r--r--tools/perf/util/metricgroup.c145
-rw-r--r--tools/perf/util/metricgroup.h4
-rw-r--r--tools/perf/util/mmap.h1
-rw-r--r--tools/perf/util/mutex.c119
-rw-r--r--tools/perf/util/mutex.h108
-rw-r--r--tools/perf/util/parse-branch-options.c5
-rw-r--r--tools/perf/util/parse-events-hybrid.c21
-rw-r--r--tools/perf/util/parse-events.c46
-rw-r--r--tools/perf/util/parse-events.h1
-rw-r--r--tools/perf/util/perf_event_attr_fprintf.c4
-rw-r--r--tools/perf/util/pmu.c19
-rw-r--r--tools/perf/util/pmu.h2
-rw-r--r--tools/perf/util/pmu.l2
-rw-r--r--tools/perf/util/pmu.y17
-rw-r--r--tools/perf/util/print-events.c39
-rw-r--r--tools/perf/util/probe-event.c3
-rw-r--r--tools/perf/util/scripting-engines/Build2
-rw-r--r--tools/perf/util/session.c7
-rw-r--r--tools/perf/util/smt.c110
-rw-r--r--tools/perf/util/smt.h19
-rw-r--r--tools/perf/util/sort.c38
-rw-r--r--tools/perf/util/sort.h3
-rw-r--r--tools/perf/util/stat-display.c42
-rw-r--r--tools/perf/util/stat-shadow.c338
-rw-r--r--tools/perf/util/stat.c29
-rw-r--r--tools/perf/util/stat.h12
-rw-r--r--tools/perf/util/string.c1
-rw-r--r--tools/perf/util/symbol-elf.c7
-rw-r--r--tools/perf/util/symbol.c4
-rw-r--r--tools/perf/util/synthetic-events.c201
-rw-r--r--tools/perf/util/top.h5
270 files changed, 17122 insertions, 5449 deletions
diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore
index 4b9c71faa01a..fd7a6ff9e7aa 100644
--- a/tools/perf/.gitignore
+++ b/tools/perf/.gitignore
@@ -4,6 +4,7 @@ PERF-GUI-VARS
PERF-VERSION-FILE
FEATURE-DUMP
perf
+!include/perf/
perf-read-vdso32
perf-read-vdsox32
perf-help
@@ -15,13 +16,14 @@ perf*.1
perf*.xml
perf*.html
common-cmds.h
-perf.data
-perf.data.old
+perf*.data
+perf*.data.old
output.svg
perf-archive
perf-iostat
tags
TAGS
+stats-*.csv
cscope*
config.mak
config.mak.autogen
@@ -29,6 +31,7 @@ config.mak.autogen
*-flex.*
*.pyc
*.pyo
+*.stdout
.config-detected
util/intel-pt-decoder/inat-tables.c
arch/*/include/generated/
diff --git a/tools/perf/Documentation/arm-coresight.txt b/tools/perf/Documentation/arm-coresight.txt
new file mode 100644
index 000000000000..c117fc50a2a9
--- /dev/null
+++ b/tools/perf/Documentation/arm-coresight.txt
@@ -0,0 +1,5 @@
+Arm CoreSight Support
+=====================
+
+For full documentation, see Documentation/trace/coresight/coresight-perf.rst
+in the kernel tree.
diff --git a/tools/perf/Documentation/itrace.txt b/tools/perf/Documentation/itrace.txt
index 6b189669c450..0916bbfe64cb 100644
--- a/tools/perf/Documentation/itrace.txt
+++ b/tools/perf/Documentation/itrace.txt
@@ -64,6 +64,7 @@
debug messages will or will not be logged. Each flag must be preceded
by either '+' or '-'. The flags are:
a all perf events
+ e output only on errors (size configurable - see linkperf:perf-config[1])
o output to stdout
If supported, the 'q' option may be repeated to increase the effect.
diff --git a/tools/perf/Documentation/perf-c2c.txt b/tools/perf/Documentation/perf-c2c.txt
index f1f7ae6b08d1..5c5eb2def83e 100644
--- a/tools/perf/Documentation/perf-c2c.txt
+++ b/tools/perf/Documentation/perf-c2c.txt
@@ -19,9 +19,10 @@ C2C stands for Cache To Cache.
The perf c2c tool provides means for Shared Data C2C/HITM analysis. It allows
you to track down the cacheline contentions.
-On x86, the tool is based on load latency and precise store facility events
+On Intel, the tool is based on load latency and precise store facility events
provided by Intel CPUs. On PowerPC, the tool uses random instruction sampling
-with thresholding feature.
+with thresholding feature. On AMD, the tool uses IBS op pmu (due to hardware
+limitations, perf c2c is not supported on Zen3 cpus).
These events provide:
- memory address of the access
@@ -49,7 +50,8 @@ RECORD OPTIONS
-l::
--ldlat::
- Configure mem-loads latency. (x86 only)
+ Configure mem-loads latency. Supported on Intel and Arm64 processors
+ only. Ignored on other archs.
-k::
--all-kernel::
@@ -135,11 +137,15 @@ Following perf record options are configured by default:
-W,-d,--phys-data,--sample-cpu
Unless specified otherwise with '-e' option, following events are monitored by
-default on x86:
+default on Intel:
cpu/mem-loads,ldlat=30/P
cpu/mem-stores/P
+following on AMD:
+
+ ibs_op//
+
and following on PowerPC:
cpu/mem-loads/
diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt
index 0420e71698ee..39c890ead2dc 100644
--- a/tools/perf/Documentation/perf-config.txt
+++ b/tools/perf/Documentation/perf-config.txt
@@ -729,6 +729,13 @@ auxtrace.*::
If the directory does not exist or has the wrong file type,
the current directory is used.
+itrace.*::
+
+ debug-log-buffer-size::
+ Log size in bytes to output when using the option --itrace=d+e
+ Refer 'itrace' option of linkperf:perf-script[1] or
+ linkperf:perf-report[1]. The default is 16384.
+
daemon.*::
daemon.base::
diff --git a/tools/perf/Documentation/perf-inject.txt b/tools/perf/Documentation/perf-inject.txt
index ffc293fdf61d..c972032f4ca0 100644
--- a/tools/perf/Documentation/perf-inject.txt
+++ b/tools/perf/Documentation/perf-inject.txt
@@ -25,10 +25,17 @@ OPTIONS
-------
-b::
--build-ids::
- Inject build-ids into the output stream
+ Inject build-ids of DSOs hit by samples into the output stream.
+ This means it needs to process all SAMPLE records to find the DSOs.
---buildid-all:
- Inject build-ids of all DSOs into the output stream
+--buildid-all::
+ Inject build-ids of all DSOs into the output stream regardless of hits
+ and skip SAMPLE processing.
+
+--known-build-ids=::
+ Override build-ids to inject using these comma-separated pairs of
+ build-id and path. Understands file://filename to read these pairs
+ from a file, which can be generated with perf buildid-list.
-v::
--verbose::
diff --git a/tools/perf/Documentation/perf-intel-pt.txt b/tools/perf/Documentation/perf-intel-pt.txt
index 3dc3f0ccbd51..92464a5d7eaf 100644
--- a/tools/perf/Documentation/perf-intel-pt.txt
+++ b/tools/perf/Documentation/perf-intel-pt.txt
@@ -943,12 +943,15 @@ event packets are recorded only if the "pwr_evt" config term was used. Refer to
the config terms section above. The power events record information about
C-state changes, whereas CBR is indicative of CPU frequency. perf script
"event,synth" fields display information like this:
+
cbr: cbr: 22 freq: 2189 MHz (200%)
mwait: hints: 0x60 extensions: 0x1
pwre: hw: 0 cstate: 2 sub-cstate: 0
exstop: ip: 1
pwrx: deepest cstate: 2 last cstate: 2 wake reason: 0x4
+
Where:
+
"cbr" includes the frequency and the percentage of maximum non-turbo
"mwait" shows mwait hints and extensions
"pwre" shows C-state transitions (to a C-state deeper than C0) and
@@ -956,6 +959,7 @@ Where:
"exstop" indicates execution stopped and whether the IP was recorded
exactly,
"pwrx" indicates return to C0
+
For more details refer to the Intel 64 and IA-32 Architectures Software
Developer Manuals.
@@ -969,8 +973,10 @@ are quite important. Users must know if what they are seeing is a complete
picture or not. The "e" option may be followed by flags which affect what errors
will or will not be reported. Each flag must be preceded by either '+' or '-'.
The flags supported by Intel PT are:
+
-o Suppress overflow errors
-l Suppress trace data lost errors
+
For example, for errors but not overflow or data lost errors:
--itrace=e-o-l
@@ -980,11 +986,16 @@ decoded packets and instructions. Note that this option slows down the decoder
and that the resulting file may be very large. The "d" option may be followed
by flags which affect what debug messages will or will not be logged. Each flag
must be preceded by either '+' or '-'. The flags support by Intel PT are:
+
-a Suppress logging of perf events
+a Log all perf events
+ +e Output only on decoding errors (size configurable)
+o Output to stdout instead of "intel_pt.log"
+
By default, logged perf events are filtered by any specified time ranges, but
-flag +a overrides that.
+flag +a overrides that. The +e flag can be useful for analyzing errors. By
+default, the log size in that case is 16384 bytes, but can be altered by
+linkperf:perf-config[1] e.g. perf config itrace.debug-log-buffer-size=30000
In addition, the period of the "instructions" event can be specified. e.g.
diff --git a/tools/perf/Documentation/perf-lock.txt b/tools/perf/Documentation/perf-lock.txt
index 193c5d8b8db9..3b1e16563b79 100644
--- a/tools/perf/Documentation/perf-lock.txt
+++ b/tools/perf/Documentation/perf-lock.txt
@@ -40,6 +40,10 @@ COMMON OPTIONS
--verbose::
Be more verbose (show symbol address, etc).
+-q::
+--quiet::
+ Do not show any message. (Suppress -v)
+
-D::
--dump-raw-trace::
Dump raw trace in ASCII.
@@ -94,6 +98,11 @@ REPORT OPTIONS
EventManager_De 1845 1 636
futex-default-S 1609 0 0
+-E::
+--entries=<value>::
+ Display this many entries.
+
+
INFO OPTIONS
------------
@@ -105,6 +114,7 @@ INFO OPTIONS
--map::
dump map of lock instances (address:name table)
+
CONTENTION OPTIONS
--------------
@@ -148,6 +158,16 @@ CONTENTION OPTIONS
--map-nr-entries::
Maximum number of BPF map entries (default: 10240).
+--max-stack::
+ Maximum stack depth when collecting lock contention (default: 8).
+
+--stack-skip
+ Number of stack depth to skip when finding a lock caller (default: 3).
+
+-E::
+--entries=<value>::
+ Display this many entries.
+
SEE ALSO
--------
diff --git a/tools/perf/Documentation/perf-mem.txt b/tools/perf/Documentation/perf-mem.txt
index 66177511c5c4..005c95580b1e 100644
--- a/tools/perf/Documentation/perf-mem.txt
+++ b/tools/perf/Documentation/perf-mem.txt
@@ -85,7 +85,8 @@ RECORD OPTIONS
Be more verbose (show counter open errors, etc)
--ldlat <n>::
- Specify desired latency for loads event. (x86 only)
+ Specify desired latency for loads event. Supported on Intel and Arm64
+ processors only. Ignored on other archs.
In addition, for report all perf report options are valid, and for record
all perf record options.
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index 0228efc96686..e41ae950fdc3 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -400,6 +400,7 @@ following filters are defined:
For the platforms with Intel Arch LBR support (12th-Gen+ client or
4th-Gen Xeon+ server), the save branch type is unconditionally enabled
when the taken branch stack sampling is enabled.
+ - priv: save privilege state during sampling in case binary is not available later
+
The option requires at least one branch type among any, any_call, any_ret, ind_call, cond.
@@ -410,6 +411,7 @@ is enabled for all the sampling events. The sampled branch type is the same for
The various filters must be specified as a comma separated list: --branch-filter any_ret,u,k
Note that this feature may not be available on all processors.
+-W::
--weight::
Enable weightened sampling. An additional weight is recorded per sample and can be
displayed with the weight and local_weight sort keys. This currently works for TSX
@@ -433,8 +435,10 @@ if combined with -a or -C options.
-D::
--delay=::
After starting the program, wait msecs before measuring (-1: start with events
-disabled). This is useful to filter out the startup phase of the program, which
-is often very different.
+disabled), or enable events only for specified ranges of msecs (e.g.
+-D 10-20,30-40 means wait 10 msecs, enable for 10 msecs, wait 10 msecs, enable
+for 10 msecs, then stop). Note, delaying enabling of events is useful to filter
+out the startup phase of the program, which is often very different.
-I::
--intr-regs::
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index 24efc0583c93..4533db2ee56b 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -73,7 +73,7 @@ OPTIONS
Sort histogram entries by given key(s) - multiple keys can be specified
in CSV format. Following sort keys are available:
pid, comm, dso, symbol, parent, cpu, socket, srcline, weight,
- local_weight, cgroup_id.
+ local_weight, cgroup_id, addr.
Each key has following meaning:
@@ -114,6 +114,7 @@ OPTIONS
- local_ins_lat: Local instruction latency version
- p_stage_cyc: On powerpc, this presents the number of cycles spent in a
pipeline stage. And currently supported only on powerpc.
+ - addr: (Full) virtual address of the sampled instruction
By default, comm, dso and symbol keys are used.
(i.e. --sort comm,dso,symbol)
diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
index 2171f02daf59..898226ea8cad 100644
--- a/tools/perf/Makefile.config
+++ b/tools/perf/Makefile.config
@@ -19,6 +19,11 @@ detected_var = $(shell echo "$(1)=$($(1))" >> $(OUTPUT).config-detected)
CFLAGS := $(EXTRA_CFLAGS) $(filter-out -Wnested-externs,$(EXTRA_WARNINGS))
HOSTCFLAGS := $(filter-out -Wnested-externs,$(EXTRA_WARNINGS))
+# Enabled Wthread-safety analysis for clang builds.
+ifeq ($(CC_NO_CLANG), 0)
+ CFLAGS += -Wthread-safety
+endif
+
include $(srctree)/tools/scripts/Makefile.arch
$(call detected_var,SRCARCH)
@@ -583,6 +588,10 @@ ifndef NO_LIBELF
ifeq ($(feature-libbpf-bpf_object__next_map), 1)
CFLAGS += -DHAVE_LIBBPF_BPF_OBJECT__NEXT_MAP
endif
+ $(call feature_check,libbpf-bpf_program__set_insns)
+ ifeq ($(feature-libbpf-bpf_program__set_insns), 1)
+ CFLAGS += -DHAVE_LIBBPF_BPF_PROGRAM__SET_INSNS
+ endif
$(call feature_check,libbpf-btf__raw_data)
ifeq ($(feature-libbpf-btf__raw_data), 1)
CFLAGS += -DHAVE_LIBBPF_BTF__RAW_DATA
@@ -599,6 +608,7 @@ ifndef NO_LIBELF
CFLAGS += -DHAVE_LIBBPF_BPF_PROG_LOAD
CFLAGS += -DHAVE_LIBBPF_BPF_OBJECT__NEXT_PROGRAM
CFLAGS += -DHAVE_LIBBPF_BPF_OBJECT__NEXT_MAP
+ CFLAGS += -DHAVE_LIBBPF_BPF_PROGRAM__SET_INSNS
CFLAGS += -DHAVE_LIBBPF_BTF__RAW_DATA
CFLAGS += -DHAVE_LIBBPF_BPF_MAP_CREATE
endif
@@ -1291,6 +1301,8 @@ perf_examples_instdir_SQ = $(subst ','\'',$(perf_examples_instdir))
STRACE_GROUPS_INSTDIR_SQ = $(subst ','\'',$(STRACE_GROUPS_INSTDIR))
tip_instdir_SQ = $(subst ','\'',$(tip_instdir))
+export perfexec_instdir_SQ
+
# If we install to $(HOME) we keep the traceevent default:
# $(HOME)/.traceevent/plugins
# Otherwise we install plugins into the global $(libdir).
@@ -1301,14 +1313,18 @@ endif
print_var = $(eval $(print_var_code)) $(info $(MSG))
define print_var_code
- MSG = $(shell printf '...%30s: %s' $(1) $($(1)))
+ MSG = $(shell printf '...%40s: %s' $(1) $($(1)))
endef
+ifeq ($(feature_display),1)
+ $(call feature_display_entries)
+endif
+
ifeq ($(VF),1)
# Display EXTRA features which are detected manualy
# from here with feature_check call and thus cannot
# be partof global state output.
- $(foreach feat,$(FEATURE_TESTS_EXTRA),$(call feature_print_status,$(feat),))
+ $(foreach feat,$(FEATURE_TESTS_EXTRA),$(call feature_print_status,$(feat),) $(info $(MSG)))
$(call print_var,prefix)
$(call print_var,bindir)
$(call print_var,libdir)
@@ -1318,11 +1334,12 @@ ifeq ($(VF),1)
$(call print_var,JDIR)
ifeq ($(dwarf-post-unwind),1)
- $(call feature_print_text,"DWARF post unwind library", $(dwarf-post-unwind-text))
+ $(call feature_print_text,"DWARF post unwind library", $(dwarf-post-unwind-text)) $(info $(MSG))
endif
- $(info )
endif
+$(info )
+
$(call detected_var,bindir_SQ)
$(call detected_var,PYTHON_WORD)
ifneq ($(OUTPUT),)
@@ -1352,7 +1369,3 @@ endif
# tests.
$(shell rm -f $(FEATURE_DUMP_FILENAME))
$(foreach feat,$(FEATURE_TESTS),$(shell echo "$(call feature_assign,$(feat))" >> $(FEATURE_DUMP_FILENAME)))
-
-ifeq ($(feature_display),1)
- $(call feature_display_entries)
-endif
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index bd947885a639..a432e59afc42 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -629,7 +629,16 @@ sync_file_range_tbls := $(srctree)/tools/perf/trace/beauty/sync_file_range.sh
$(sync_file_range_arrays): $(linux_uapi_dir)/fs.h $(sync_file_range_tbls)
$(Q)$(SHELL) '$(sync_file_range_tbls)' $(linux_uapi_dir) > $@
-all: shell_compatibility_test $(ALL_PROGRAMS) $(LANG_BINDINGS) $(OTHER_PROGRAMS)
+TESTS_CORESIGHT_DIR := $(srctree)/tools/perf/tests/shell/coresight
+
+tests-coresight-targets: FORCE
+ $(Q)$(MAKE) -C $(TESTS_CORESIGHT_DIR)
+
+tests-coresight-targets-clean:
+ $(call QUIET_CLEAN, coresight)
+ $(Q)$(MAKE) -C $(TESTS_CORESIGHT_DIR) O=$(OUTPUT) clean >/dev/null
+
+all: shell_compatibility_test $(ALL_PROGRAMS) $(LANG_BINDINGS) $(OTHER_PROGRAMS) tests-coresight-targets
# Create python binding output directory if not already present
_dummy := $(shell [ -d '$(OUTPUT)python' ] || mkdir -p '$(OUTPUT)python')
@@ -1006,7 +1015,10 @@ install-tests: all install-gtk
$(INSTALL) tests/shell/*.sh '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell'; \
$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/lib'; \
$(INSTALL) tests/shell/lib/*.sh -m 644 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/lib'; \
- $(INSTALL) tests/shell/lib/*.py -m 644 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/lib'
+ $(INSTALL) tests/shell/lib/*.py -m 644 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/lib'; \
+ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/coresight' ; \
+ $(INSTALL) tests/shell/coresight/*.sh '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/shell/coresight'
+ $(Q)$(MAKE) -C tests/shell/coresight install-tests
install-bin: install-tools install-tests install-traceevent-plugins
@@ -1077,7 +1089,7 @@ endif # BUILD_BPF_SKEL
bpf-skel-clean:
$(call QUIET_CLEAN, bpf-skel) $(RM) -r $(SKEL_TMP_OUT) $(SKELETONS)
-clean:: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean $(LIBPERF)-clean fixdep-clean python-clean bpf-skel-clean
+clean:: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean $(LIBPERF)-clean fixdep-clean python-clean bpf-skel-clean tests-coresight-targets-clean
$(call QUIET_CLEAN, core-objs) $(RM) $(LIBPERF_A) $(OUTPUT)perf-archive $(OUTPUT)perf-iostat $(LANG_BINDINGS)
$(Q)find $(or $(OUTPUT),.) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete
$(Q)$(RM) $(OUTPUT).config-detected
diff --git a/tools/perf/arch/arm/util/auxtrace.c b/tools/perf/arch/arm/util/auxtrace.c
index 5fc6a2a3dbc5..deeb163999ce 100644
--- a/tools/perf/arch/arm/util/auxtrace.c
+++ b/tools/perf/arch/arm/util/auxtrace.c
@@ -4,9 +4,11 @@
* Author: Mathieu Poirier <mathieu.poirier@linaro.org>
*/
+#include <dirent.h>
#include <stdbool.h>
#include <linux/coresight-pmu.h>
#include <linux/zalloc.h>
+#include <api/fs/fs.h>
#include "../../../util/auxtrace.h"
#include "../../../util/debug.h"
@@ -14,6 +16,7 @@
#include "../../../util/pmu.h"
#include "cs-etm.h"
#include "arm-spe.h"
+#include "hisi-ptt.h"
static struct perf_pmu **find_all_arm_spe_pmus(int *nr_spes, int *err)
{
@@ -50,42 +53,114 @@ static struct perf_pmu **find_all_arm_spe_pmus(int *nr_spes, int *err)
return arm_spe_pmus;
}
+static struct perf_pmu **find_all_hisi_ptt_pmus(int *nr_ptts, int *err)
+{
+ const char *sysfs = sysfs__mountpoint();
+ struct perf_pmu **hisi_ptt_pmus = NULL;
+ struct dirent *dent;
+ char path[PATH_MAX];
+ DIR *dir = NULL;
+ int idx = 0;
+
+ snprintf(path, PATH_MAX, "%s" EVENT_SOURCE_DEVICE_PATH, sysfs);
+ dir = opendir(path);
+ if (!dir) {
+ pr_err("can't read directory '%s'\n", EVENT_SOURCE_DEVICE_PATH);
+ *err = -EINVAL;
+ return NULL;
+ }
+
+ while ((dent = readdir(dir))) {
+ if (strstr(dent->d_name, HISI_PTT_PMU_NAME))
+ (*nr_ptts)++;
+ }
+
+ if (!(*nr_ptts))
+ goto out;
+
+ hisi_ptt_pmus = zalloc(sizeof(struct perf_pmu *) * (*nr_ptts));
+ if (!hisi_ptt_pmus) {
+ pr_err("hisi_ptt alloc failed\n");
+ *err = -ENOMEM;
+ goto out;
+ }
+
+ rewinddir(dir);
+ while ((dent = readdir(dir))) {
+ if (strstr(dent->d_name, HISI_PTT_PMU_NAME) && idx < *nr_ptts) {
+ hisi_ptt_pmus[idx] = perf_pmu__find(dent->d_name);
+ if (hisi_ptt_pmus[idx])
+ idx++;
+ }
+ }
+
+out:
+ closedir(dir);
+ return hisi_ptt_pmus;
+}
+
+static struct perf_pmu *find_pmu_for_event(struct perf_pmu **pmus,
+ int pmu_nr, struct evsel *evsel)
+{
+ int i;
+
+ if (!pmus)
+ return NULL;
+
+ for (i = 0; i < pmu_nr; i++) {
+ if (evsel->core.attr.type == pmus[i]->type)
+ return pmus[i];
+ }
+
+ return NULL;
+}
+
struct auxtrace_record
*auxtrace_record__init(struct evlist *evlist, int *err)
{
- struct perf_pmu *cs_etm_pmu;
+ struct perf_pmu *cs_etm_pmu = NULL;
+ struct perf_pmu **arm_spe_pmus = NULL;
+ struct perf_pmu **hisi_ptt_pmus = NULL;
struct evsel *evsel;
- bool found_etm = false;
+ struct perf_pmu *found_etm = NULL;
struct perf_pmu *found_spe = NULL;
- struct perf_pmu **arm_spe_pmus = NULL;
+ struct perf_pmu *found_ptt = NULL;
+ int auxtrace_event_cnt = 0;
int nr_spes = 0;
- int i = 0;
+ int nr_ptts = 0;
if (!evlist)
return NULL;
cs_etm_pmu = perf_pmu__find(CORESIGHT_ETM_PMU_NAME);
arm_spe_pmus = find_all_arm_spe_pmus(&nr_spes, err);
+ hisi_ptt_pmus = find_all_hisi_ptt_pmus(&nr_ptts, err);
evlist__for_each_entry(evlist, evsel) {
- if (cs_etm_pmu &&
- evsel->core.attr.type == cs_etm_pmu->type)
- found_etm = true;
-
- if (!nr_spes || found_spe)
- continue;
-
- for (i = 0; i < nr_spes; i++) {
- if (evsel->core.attr.type == arm_spe_pmus[i]->type) {
- found_spe = arm_spe_pmus[i];
- break;
- }
- }
+ if (cs_etm_pmu && !found_etm)
+ found_etm = find_pmu_for_event(&cs_etm_pmu, 1, evsel);
+
+ if (arm_spe_pmus && !found_spe)
+ found_spe = find_pmu_for_event(arm_spe_pmus, nr_spes, evsel);
+
+ if (hisi_ptt_pmus && !found_ptt)
+ found_ptt = find_pmu_for_event(hisi_ptt_pmus, nr_ptts, evsel);
}
+
free(arm_spe_pmus);
+ free(hisi_ptt_pmus);
+
+ if (found_etm)
+ auxtrace_event_cnt++;
- if (found_etm && found_spe) {
- pr_err("Concurrent ARM Coresight ETM and SPE operation not currently supported\n");
+ if (found_spe)
+ auxtrace_event_cnt++;
+
+ if (found_ptt)
+ auxtrace_event_cnt++;
+
+ if (auxtrace_event_cnt > 1) {
+ pr_err("Concurrent AUX trace operation not currently supported\n");
*err = -EOPNOTSUPP;
return NULL;
}
@@ -96,6 +171,9 @@ struct auxtrace_record
#if defined(__aarch64__)
if (found_spe)
return arm_spe_recording_init(err, found_spe);
+
+ if (found_ptt)
+ return hisi_ptt_recording_init(err, found_ptt);
#endif
/*
diff --git a/tools/perf/arch/arm/util/pmu.c b/tools/perf/arch/arm/util/pmu.c
index b8b23b9dc598..887c8addc491 100644
--- a/tools/perf/arch/arm/util/pmu.c
+++ b/tools/perf/arch/arm/util/pmu.c
@@ -10,6 +10,7 @@
#include <linux/string.h>
#include "arm-spe.h"
+#include "hisi-ptt.h"
#include "../../../util/pmu.h"
struct perf_event_attr
@@ -22,6 +23,8 @@ struct perf_event_attr
#if defined(__aarch64__)
} else if (strstarts(pmu->name, ARM_SPE_PMU_NAME)) {
return arm_spe_pmu_default_config(pmu);
+ } else if (strstarts(pmu->name, HISI_PTT_PMU_NAME)) {
+ pmu->selectable = true;
#endif
}
diff --git a/tools/perf/arch/arm64/annotate/instructions.c b/tools/perf/arch/arm64/annotate/instructions.c
index 037e292ecd8e..4af0c3a0f86e 100644
--- a/tools/perf/arch/arm64/annotate/instructions.c
+++ b/tools/perf/arch/arm64/annotate/instructions.c
@@ -102,7 +102,7 @@ static int arm64__annotate_init(struct arch *arch, char *cpuid __maybe_unused)
if (err)
goto out_free_arm;
/* b, b.cond, br, cbz/cbnz, tbz/tbnz */
- err = regcomp(&arm->jump_insn, "^[ct]?br?\\.?(cc|cs|eq|ge|gt|hi|le|ls|lt|mi|ne|pl)?n?z?$",
+ err = regcomp(&arm->jump_insn, "^[ct]?br?\\.?(cc|cs|eq|ge|gt|hi|hs|le|lo|ls|lt|mi|ne|pl|vc|vs)?n?z?$",
REG_EXTENDED);
if (err)
goto out_free_call;
diff --git a/tools/perf/arch/arm64/util/Build b/tools/perf/arch/arm64/util/Build
index 9fcb4e68add9..337aa9bdf905 100644
--- a/tools/perf/arch/arm64/util/Build
+++ b/tools/perf/arch/arm64/util/Build
@@ -11,4 +11,4 @@ perf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
perf-$(CONFIG_AUXTRACE) += ../../arm/util/pmu.o \
../../arm/util/auxtrace.o \
../../arm/util/cs-etm.o \
- arm-spe.o mem-events.o
+ arm-spe.o mem-events.o hisi-ptt.o
diff --git a/tools/perf/arch/arm64/util/hisi-ptt.c b/tools/perf/arch/arm64/util/hisi-ptt.c
new file mode 100644
index 000000000000..ba97c8a562a0
--- /dev/null
+++ b/tools/perf/arch/arm64/util/hisi-ptt.c
@@ -0,0 +1,188 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * HiSilicon PCIe Trace and Tuning (PTT) support
+ * Copyright (c) 2022 HiSilicon Technologies Co., Ltd.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/bitops.h>
+#include <linux/log2.h>
+#include <linux/zalloc.h>
+#include <time.h>
+
+#include <internal/lib.h> // page_size
+#include "../../../util/auxtrace.h"
+#include "../../../util/cpumap.h"
+#include "../../../util/debug.h"
+#include "../../../util/event.h"
+#include "../../../util/evlist.h"
+#include "../../../util/evsel.h"
+#include "../../../util/hisi-ptt.h"
+#include "../../../util/pmu.h"
+#include "../../../util/record.h"
+#include "../../../util/session.h"
+#include "../../../util/tsc.h"
+
+#define KiB(x) ((x) * 1024)
+#define MiB(x) ((x) * 1024 * 1024)
+
+struct hisi_ptt_recording {
+ struct auxtrace_record itr;
+ struct perf_pmu *hisi_ptt_pmu;
+ struct evlist *evlist;
+};
+
+static size_t
+hisi_ptt_info_priv_size(struct auxtrace_record *itr __maybe_unused,
+ struct evlist *evlist __maybe_unused)
+{
+ return HISI_PTT_AUXTRACE_PRIV_SIZE;
+}
+
+static int hisi_ptt_info_fill(struct auxtrace_record *itr,
+ struct perf_session *session,
+ struct perf_record_auxtrace_info *auxtrace_info,
+ size_t priv_size)
+{
+ struct hisi_ptt_recording *pttr =
+ container_of(itr, struct hisi_ptt_recording, itr);
+ struct perf_pmu *hisi_ptt_pmu = pttr->hisi_ptt_pmu;
+
+ if (priv_size != HISI_PTT_AUXTRACE_PRIV_SIZE)
+ return -EINVAL;
+
+ if (!session->evlist->core.nr_mmaps)
+ return -EINVAL;
+
+ auxtrace_info->type = PERF_AUXTRACE_HISI_PTT;
+ auxtrace_info->priv[0] = hisi_ptt_pmu->type;
+
+ return 0;
+}
+
+static int hisi_ptt_set_auxtrace_mmap_page(struct record_opts *opts)
+{
+ bool privileged = perf_event_paranoid_check(-1);
+
+ if (!opts->full_auxtrace)
+ return 0;
+
+ if (opts->full_auxtrace && !opts->auxtrace_mmap_pages) {
+ if (privileged) {
+ opts->auxtrace_mmap_pages = MiB(16) / page_size;
+ } else {
+ opts->auxtrace_mmap_pages = KiB(128) / page_size;
+ if (opts->mmap_pages == UINT_MAX)
+ opts->mmap_pages = KiB(256) / page_size;
+ }
+ }
+
+ /* Validate auxtrace_mmap_pages */
+ if (opts->auxtrace_mmap_pages) {
+ size_t sz = opts->auxtrace_mmap_pages * (size_t)page_size;
+ size_t min_sz = KiB(8);
+
+ if (sz < min_sz || !is_power_of_2(sz)) {
+ pr_err("Invalid mmap size for HISI PTT: must be at least %zuKiB and a power of 2\n",
+ min_sz / 1024);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int hisi_ptt_recording_options(struct auxtrace_record *itr,
+ struct evlist *evlist,
+ struct record_opts *opts)
+{
+ struct hisi_ptt_recording *pttr =
+ container_of(itr, struct hisi_ptt_recording, itr);
+ struct perf_pmu *hisi_ptt_pmu = pttr->hisi_ptt_pmu;
+ struct evsel *evsel, *hisi_ptt_evsel = NULL;
+ struct evsel *tracking_evsel;
+ int err;
+
+ pttr->evlist = evlist;
+ evlist__for_each_entry(evlist, evsel) {
+ if (evsel->core.attr.type == hisi_ptt_pmu->type) {
+ if (hisi_ptt_evsel) {
+ pr_err("There may be only one " HISI_PTT_PMU_NAME "x event\n");
+ return -EINVAL;
+ }
+ evsel->core.attr.freq = 0;
+ evsel->core.attr.sample_period = 1;
+ evsel->needs_auxtrace_mmap = true;
+ hisi_ptt_evsel = evsel;
+ opts->full_auxtrace = true;
+ }
+ }
+
+ err = hisi_ptt_set_auxtrace_mmap_page(opts);
+ if (err)
+ return err;
+ /*
+ * To obtain the auxtrace buffer file descriptor, the auxtrace event
+ * must come first.
+ */
+ evlist__to_front(evlist, hisi_ptt_evsel);
+ evsel__set_sample_bit(hisi_ptt_evsel, TIME);
+
+ /* Add dummy event to keep tracking */
+ err = parse_event(evlist, "dummy:u");
+ if (err)
+ return err;
+
+ tracking_evsel = evlist__last(evlist);
+ evlist__set_tracking_event(evlist, tracking_evsel);
+
+ tracking_evsel->core.attr.freq = 0;
+ tracking_evsel->core.attr.sample_period = 1;
+ evsel__set_sample_bit(tracking_evsel, TIME);
+
+ return 0;
+}
+
+static u64 hisi_ptt_reference(struct auxtrace_record *itr __maybe_unused)
+{
+ return rdtsc();
+}
+
+static void hisi_ptt_recording_free(struct auxtrace_record *itr)
+{
+ struct hisi_ptt_recording *pttr =
+ container_of(itr, struct hisi_ptt_recording, itr);
+
+ free(pttr);
+}
+
+struct auxtrace_record *hisi_ptt_recording_init(int *err,
+ struct perf_pmu *hisi_ptt_pmu)
+{
+ struct hisi_ptt_recording *pttr;
+
+ if (!hisi_ptt_pmu) {
+ *err = -ENODEV;
+ return NULL;
+ }
+
+ pttr = zalloc(sizeof(*pttr));
+ if (!pttr) {
+ *err = -ENOMEM;
+ return NULL;
+ }
+
+ pttr->hisi_ptt_pmu = hisi_ptt_pmu;
+ pttr->itr.pmu = hisi_ptt_pmu;
+ pttr->itr.recording_options = hisi_ptt_recording_options;
+ pttr->itr.info_priv_size = hisi_ptt_info_priv_size;
+ pttr->itr.info_fill = hisi_ptt_info_fill;
+ pttr->itr.free = hisi_ptt_recording_free;
+ pttr->itr.reference = hisi_ptt_reference;
+ pttr->itr.read_finish = auxtrace_record__read_finish;
+ pttr->itr.alignment = 0;
+
+ *err = 0;
+ return &pttr->itr;
+}
diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
index 2600b4237292..e9e0df4f9a61 100644
--- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
+++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl
@@ -110,7 +110,7 @@
79 common settimeofday sys_settimeofday compat_sys_settimeofday
80 common getgroups sys_getgroups
81 common setgroups sys_setgroups
-82 32 select ppc_select sys_ni_syscall
+82 32 select sys_old_select compat_sys_old_select
82 64 select sys_ni_syscall
82 spu select sys_ni_syscall
83 common symlink sys_symlink
@@ -178,9 +178,9 @@
133 common fchdir sys_fchdir
134 common bdflush sys_ni_syscall
135 common sysfs sys_sysfs
-136 32 personality sys_personality ppc64_personality
-136 64 personality ppc64_personality
-136 spu personality ppc64_personality
+136 32 personality sys_personality compat_sys_ppc64_personality
+136 64 personality sys_ppc64_personality
+136 spu personality sys_ppc64_personality
137 common afs_syscall sys_ni_syscall
138 common setfsuid sys_setfsuid
139 common setfsgid sys_setfsgid
@@ -228,8 +228,10 @@
176 64 rt_sigtimedwait sys_rt_sigtimedwait
177 nospu rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo
178 nospu rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend
-179 common pread64 sys_pread64 compat_sys_pread64
-180 common pwrite64 sys_pwrite64 compat_sys_pwrite64
+179 32 pread64 sys_ppc_pread64 compat_sys_ppc_pread64
+179 64 pread64 sys_pread64
+180 32 pwrite64 sys_ppc_pwrite64 compat_sys_ppc_pwrite64
+180 64 pwrite64 sys_pwrite64
181 common chown sys_chown
182 common getcwd sys_getcwd
183 common capget sys_capget
@@ -242,10 +244,11 @@
188 common putpmsg sys_ni_syscall
189 nospu vfork sys_vfork
190 common ugetrlimit sys_getrlimit compat_sys_getrlimit
-191 common readahead sys_readahead compat_sys_readahead
+191 32 readahead sys_ppc_readahead compat_sys_ppc_readahead
+191 64 readahead sys_readahead
192 32 mmap2 sys_mmap2 compat_sys_mmap2
-193 32 truncate64 sys_truncate64 compat_sys_truncate64
-194 32 ftruncate64 sys_ftruncate64 compat_sys_ftruncate64
+193 32 truncate64 sys_ppc_truncate64 compat_sys_ppc_truncate64
+194 32 ftruncate64 sys_ppc_ftruncate64 compat_sys_ppc_ftruncate64
195 32 stat64 sys_stat64
196 32 lstat64 sys_lstat64
197 32 fstat64 sys_fstat64
@@ -288,7 +291,8 @@
230 common io_submit sys_io_submit compat_sys_io_submit
231 common io_cancel sys_io_cancel
232 nospu set_tid_address sys_set_tid_address
-233 common fadvise64 sys_fadvise64 ppc32_fadvise64
+233 32 fadvise64 sys_ppc32_fadvise64 compat_sys_ppc32_fadvise64
+233 64 fadvise64 sys_fadvise64
234 nospu exit_group sys_exit_group
235 nospu lookup_dcookie sys_lookup_dcookie compat_sys_lookup_dcookie
236 common epoll_create sys_epoll_create
@@ -323,7 +327,7 @@
251 spu utimes sys_utimes
252 common statfs64 sys_statfs64 compat_sys_statfs64
253 common fstatfs64 sys_fstatfs64 compat_sys_fstatfs64
-254 32 fadvise64_64 ppc_fadvise64_64
+254 32 fadvise64_64 sys_ppc_fadvise64_64
254 spu fadvise64_64 sys_ni_syscall
255 common rtas sys_rtas
256 32 sys_debug_setcontext sys_debug_setcontext sys_ni_syscall
@@ -390,7 +394,7 @@
305 common signalfd sys_signalfd compat_sys_signalfd
306 common timerfd_create sys_timerfd_create
307 common eventfd sys_eventfd
-308 common sync_file_range2 sys_sync_file_range2 compat_sys_sync_file_range2
+308 common sync_file_range2 sys_sync_file_range2 compat_sys_ppc_sync_file_range2
309 nospu fallocate sys_fallocate compat_sys_fallocate
310 nospu subpage_prot sys_subpage_prot
311 32 timerfd_settime sys_timerfd_settime32
diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c
index 13933020a79e..af102f471e9f 100644
--- a/tools/perf/arch/x86/util/intel-pt.c
+++ b/tools/perf/arch/x86/util/intel-pt.c
@@ -11,6 +11,7 @@
#include <linux/bitops.h>
#include <linux/log2.h>
#include <linux/zalloc.h>
+#include <linux/err.h>
#include <cpuid.h>
#include "../../../util/session.h"
@@ -426,20 +427,14 @@ static int intel_pt_track_switches(struct evlist *evlist)
if (!evlist__can_select_event(evlist, sched_switch))
return -EPERM;
- err = parse_event(evlist, sched_switch);
- if (err) {
- pr_debug2("%s: failed to parse %s, error %d\n",
+ evsel = evlist__add_sched_switch(evlist, true);
+ if (IS_ERR(evsel)) {
+ err = PTR_ERR(evsel);
+ pr_debug2("%s: failed to create %s, error = %d\n",
__func__, sched_switch, err);
return err;
}
- evsel = evlist__last(evlist);
-
- evsel__set_sample_bit(evsel, CPU);
- evsel__set_sample_bit(evsel, TIME);
-
- evsel->core.system_wide = true;
- evsel->no_aux_samples = true;
evsel->immediate = true;
return 0;
@@ -871,7 +866,7 @@ static int intel_pt_recording_options(struct auxtrace_record *itr,
* User space tasks can migrate between CPUs, so when tracing
* selected CPUs, sideband for all CPUs is still needed.
*/
- need_system_wide_tracking = evlist->core.has_user_cpus &&
+ need_system_wide_tracking = opts->target.cpu_list &&
!intel_pt_evsel->core.attr.exclude_user;
tracking_evsel = evlist__add_aux_dummy(evlist, need_system_wide_tracking);
diff --git a/tools/perf/arch/x86/util/mem-events.c b/tools/perf/arch/x86/util/mem-events.c
index 5214370ca4e4..f683ac702247 100644
--- a/tools/perf/arch/x86/util/mem-events.c
+++ b/tools/perf/arch/x86/util/mem-events.c
@@ -1,7 +1,9 @@
// SPDX-License-Identifier: GPL-2.0
#include "util/pmu.h"
+#include "util/env.h"
#include "map_symbol.h"
#include "mem-events.h"
+#include "linux/string.h"
static char mem_loads_name[100];
static bool mem_loads_name__init;
@@ -12,18 +14,43 @@ static char mem_stores_name[100];
#define E(t, n, s) { .tag = t, .name = n, .sysfs_name = s }
-static struct perf_mem_event perf_mem_events[PERF_MEM_EVENTS__MAX] = {
+static struct perf_mem_event perf_mem_events_intel[PERF_MEM_EVENTS__MAX] = {
E("ldlat-loads", "%s/mem-loads,ldlat=%u/P", "%s/events/mem-loads"),
E("ldlat-stores", "%s/mem-stores/P", "%s/events/mem-stores"),
E(NULL, NULL, NULL),
};
+static struct perf_mem_event perf_mem_events_amd[PERF_MEM_EVENTS__MAX] = {
+ E(NULL, NULL, NULL),
+ E(NULL, NULL, NULL),
+ E("mem-ldst", "ibs_op//", "ibs_op"),
+};
+
+static int perf_mem_is_amd_cpu(void)
+{
+ struct perf_env env = { .total_mem = 0, };
+
+ perf_env__cpuid(&env);
+ if (env.cpuid && strstarts(env.cpuid, "AuthenticAMD"))
+ return 1;
+ return -1;
+}
+
struct perf_mem_event *perf_mem_events__ptr(int i)
{
+ /* 0: Uninitialized, 1: Yes, -1: No */
+ static int is_amd;
+
if (i >= PERF_MEM_EVENTS__MAX)
return NULL;
- return &perf_mem_events[i];
+ if (!is_amd)
+ is_amd = perf_mem_is_amd_cpu();
+
+ if (is_amd == 1)
+ return &perf_mem_events_amd[i];
+
+ return &perf_mem_events_intel[i];
}
bool is_mem_loads_aux_event(struct evsel *leader)
diff --git a/tools/perf/bench/epoll-ctl.c b/tools/perf/bench/epoll-ctl.c
index 4256dc5d6236..521d1ff97b06 100644
--- a/tools/perf/bench/epoll-ctl.c
+++ b/tools/perf/bench/epoll-ctl.c
@@ -23,6 +23,7 @@
#include <sys/eventfd.h>
#include <perf/cpumap.h>
+#include "../util/mutex.h"
#include "../util/stat.h"
#include <subcmd/parse-options.h>
#include "bench.h"
@@ -58,10 +59,10 @@ static unsigned int nested = 0;
/* amount of fds to monitor, per thread */
static unsigned int nfds = 64;
-static pthread_mutex_t thread_lock;
+static struct mutex thread_lock;
static unsigned int threads_starting;
static struct stats all_stats[EPOLL_NR_OPS];
-static pthread_cond_t thread_parent, thread_worker;
+static struct cond thread_parent, thread_worker;
struct worker {
int tid;
@@ -174,12 +175,12 @@ static void *workerfn(void *arg)
struct timespec ts = { .tv_sec = 0,
.tv_nsec = 250 };
- pthread_mutex_lock(&thread_lock);
+ mutex_lock(&thread_lock);
threads_starting--;
if (!threads_starting)
- pthread_cond_signal(&thread_parent);
- pthread_cond_wait(&thread_worker, &thread_lock);
- pthread_mutex_unlock(&thread_lock);
+ cond_signal(&thread_parent);
+ cond_wait(&thread_worker, &thread_lock);
+ mutex_unlock(&thread_lock);
/* Let 'em loose */
do {
@@ -367,9 +368,9 @@ int bench_epoll_ctl(int argc, const char **argv)
for (i = 0; i < EPOLL_NR_OPS; i++)
init_stats(&all_stats[i]);
- pthread_mutex_init(&thread_lock, NULL);
- pthread_cond_init(&thread_parent, NULL);
- pthread_cond_init(&thread_worker, NULL);
+ mutex_init(&thread_lock);
+ cond_init(&thread_parent);
+ cond_init(&thread_worker);
threads_starting = nthreads;
@@ -377,11 +378,11 @@ int bench_epoll_ctl(int argc, const char **argv)
do_threads(worker, cpu);
- pthread_mutex_lock(&thread_lock);
+ mutex_lock(&thread_lock);
while (threads_starting)
- pthread_cond_wait(&thread_parent, &thread_lock);
- pthread_cond_broadcast(&thread_worker);
- pthread_mutex_unlock(&thread_lock);
+ cond_wait(&thread_parent, &thread_lock);
+ cond_broadcast(&thread_worker);
+ mutex_unlock(&thread_lock);
sleep(nsecs);
toggle_done(0, NULL, NULL);
@@ -394,9 +395,9 @@ int bench_epoll_ctl(int argc, const char **argv)
}
/* cleanup & report results */
- pthread_cond_destroy(&thread_parent);
- pthread_cond_destroy(&thread_worker);
- pthread_mutex_destroy(&thread_lock);
+ cond_destroy(&thread_parent);
+ cond_destroy(&thread_worker);
+ mutex_destroy(&thread_lock);
for (i = 0; i < nthreads; i++) {
unsigned long t[EPOLL_NR_OPS];
diff --git a/tools/perf/bench/epoll-wait.c b/tools/perf/bench/epoll-wait.c
index 2728b0140853..c1cdf03c075d 100644
--- a/tools/perf/bench/epoll-wait.c
+++ b/tools/perf/bench/epoll-wait.c
@@ -79,6 +79,7 @@
#include <perf/cpumap.h>
#include "../util/stat.h"
+#include "../util/mutex.h"
#include <subcmd/parse-options.h>
#include "bench.h"
@@ -109,10 +110,10 @@ static bool multiq; /* use an epoll instance per thread */
/* amount of fds to monitor, per thread */
static unsigned int nfds = 64;
-static pthread_mutex_t thread_lock;
+static struct mutex thread_lock;
static unsigned int threads_starting;
static struct stats throughput_stats;
-static pthread_cond_t thread_parent, thread_worker;
+static struct cond thread_parent, thread_worker;
struct worker {
int tid;
@@ -189,12 +190,12 @@ static void *workerfn(void *arg)
int to = nonblocking? 0 : -1;
int efd = multiq ? w->epollfd : epollfd;
- pthread_mutex_lock(&thread_lock);
+ mutex_lock(&thread_lock);
threads_starting--;
if (!threads_starting)
- pthread_cond_signal(&thread_parent);
- pthread_cond_wait(&thread_worker, &thread_lock);
- pthread_mutex_unlock(&thread_lock);
+ cond_signal(&thread_parent);
+ cond_wait(&thread_worker, &thread_lock);
+ mutex_unlock(&thread_lock);
do {
/*
@@ -485,9 +486,9 @@ int bench_epoll_wait(int argc, const char **argv)
getpid(), nthreads, oneshot ? " (EPOLLONESHOT semantics)": "", nfds, nsecs);
init_stats(&throughput_stats);
- pthread_mutex_init(&thread_lock, NULL);
- pthread_cond_init(&thread_parent, NULL);
- pthread_cond_init(&thread_worker, NULL);
+ mutex_init(&thread_lock);
+ cond_init(&thread_parent);
+ cond_init(&thread_worker);
threads_starting = nthreads;
@@ -495,11 +496,11 @@ int bench_epoll_wait(int argc, const char **argv)
do_threads(worker, cpu);
- pthread_mutex_lock(&thread_lock);
+ mutex_lock(&thread_lock);
while (threads_starting)
- pthread_cond_wait(&thread_parent, &thread_lock);
- pthread_cond_broadcast(&thread_worker);
- pthread_mutex_unlock(&thread_lock);
+ cond_wait(&thread_parent, &thread_lock);
+ cond_broadcast(&thread_worker);
+ mutex_unlock(&thread_lock);
/*
* At this point the workers should be blocked waiting for read events
@@ -522,9 +523,9 @@ int bench_epoll_wait(int argc, const char **argv)
err(EXIT_FAILURE, "pthread_join");
/* cleanup & report results */
- pthread_cond_destroy(&thread_parent);
- pthread_cond_destroy(&thread_worker);
- pthread_mutex_destroy(&thread_lock);
+ cond_destroy(&thread_parent);
+ cond_destroy(&thread_worker);
+ mutex_destroy(&thread_lock);
/* sort the array back before reporting */
if (randomize)
diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c
index f05db4cf983d..2005a3fa3026 100644
--- a/tools/perf/bench/futex-hash.c
+++ b/tools/perf/bench/futex-hash.c
@@ -23,6 +23,7 @@
#include <sys/mman.h>
#include <perf/cpumap.h>
+#include "../util/mutex.h"
#include "../util/stat.h"
#include <subcmd/parse-options.h>
#include "bench.h"
@@ -34,10 +35,10 @@ static bool done = false;
static int futex_flag = 0;
struct timeval bench__start, bench__end, bench__runtime;
-static pthread_mutex_t thread_lock;
+static struct mutex thread_lock;
static unsigned int threads_starting;
static struct stats throughput_stats;
-static pthread_cond_t thread_parent, thread_worker;
+static struct cond thread_parent, thread_worker;
struct worker {
int tid;
@@ -73,12 +74,12 @@ static void *workerfn(void *arg)
unsigned int i;
unsigned long ops = w->ops; /* avoid cacheline bouncing */
- pthread_mutex_lock(&thread_lock);
+ mutex_lock(&thread_lock);
threads_starting--;
if (!threads_starting)
- pthread_cond_signal(&thread_parent);
- pthread_cond_wait(&thread_worker, &thread_lock);
- pthread_mutex_unlock(&thread_lock);
+ cond_signal(&thread_parent);
+ cond_wait(&thread_worker, &thread_lock);
+ mutex_unlock(&thread_lock);
do {
for (i = 0; i < params.nfutexes; i++, ops++) {
@@ -165,9 +166,9 @@ int bench_futex_hash(int argc, const char **argv)
getpid(), params.nthreads, params.nfutexes, params.fshared ? "shared":"private", params.runtime);
init_stats(&throughput_stats);
- pthread_mutex_init(&thread_lock, NULL);
- pthread_cond_init(&thread_parent, NULL);
- pthread_cond_init(&thread_worker, NULL);
+ mutex_init(&thread_lock);
+ cond_init(&thread_parent);
+ cond_init(&thread_worker);
threads_starting = params.nthreads;
pthread_attr_init(&thread_attr);
@@ -203,11 +204,11 @@ int bench_futex_hash(int argc, const char **argv)
CPU_FREE(cpuset);
pthread_attr_destroy(&thread_attr);
- pthread_mutex_lock(&thread_lock);
+ mutex_lock(&thread_lock);
while (threads_starting)
- pthread_cond_wait(&thread_parent, &thread_lock);
- pthread_cond_broadcast(&thread_worker);
- pthread_mutex_unlock(&thread_lock);
+ cond_wait(&thread_parent, &thread_lock);
+ cond_broadcast(&thread_worker);
+ mutex_unlock(&thread_lock);
sleep(params.runtime);
toggle_done(0, NULL, NULL);
@@ -219,9 +220,9 @@ int bench_futex_hash(int argc, const char **argv)
}
/* cleanup & report results */
- pthread_cond_destroy(&thread_parent);
- pthread_cond_destroy(&thread_worker);
- pthread_mutex_destroy(&thread_lock);
+ cond_destroy(&thread_parent);
+ cond_destroy(&thread_worker);
+ mutex_destroy(&thread_lock);
for (i = 0; i < params.nthreads; i++) {
unsigned long t = bench__runtime.tv_sec > 0 ?
diff --git a/tools/perf/bench/futex-lock-pi.c b/tools/perf/bench/futex-lock-pi.c
index 0abb3f7ee24f..2d0417949727 100644
--- a/tools/perf/bench/futex-lock-pi.c
+++ b/tools/perf/bench/futex-lock-pi.c
@@ -8,6 +8,7 @@
#include <pthread.h>
#include <signal.h>
+#include "../util/mutex.h"
#include "../util/stat.h"
#include <subcmd/parse-options.h>
#include <linux/compiler.h>
@@ -34,10 +35,10 @@ static u_int32_t global_futex = 0;
static struct worker *worker;
static bool done = false;
static int futex_flag = 0;
-static pthread_mutex_t thread_lock;
+static struct mutex thread_lock;
static unsigned int threads_starting;
static struct stats throughput_stats;
-static pthread_cond_t thread_parent, thread_worker;
+static struct cond thread_parent, thread_worker;
static struct bench_futex_parameters params = {
.runtime = 10,
@@ -83,12 +84,12 @@ static void *workerfn(void *arg)
struct worker *w = (struct worker *) arg;
unsigned long ops = w->ops;
- pthread_mutex_lock(&thread_lock);
+ mutex_lock(&thread_lock);
threads_starting--;
if (!threads_starting)
- pthread_cond_signal(&thread_parent);
- pthread_cond_wait(&thread_worker, &thread_lock);
- pthread_mutex_unlock(&thread_lock);
+ cond_signal(&thread_parent);
+ cond_wait(&thread_worker, &thread_lock);
+ mutex_unlock(&thread_lock);
do {
int ret;
@@ -197,9 +198,9 @@ int bench_futex_lock_pi(int argc, const char **argv)
getpid(), params.nthreads, params.runtime);
init_stats(&throughput_stats);
- pthread_mutex_init(&thread_lock, NULL);
- pthread_cond_init(&thread_parent, NULL);
- pthread_cond_init(&thread_worker, NULL);
+ mutex_init(&thread_lock);
+ cond_init(&thread_parent);
+ cond_init(&thread_worker);
threads_starting = params.nthreads;
pthread_attr_init(&thread_attr);
@@ -208,11 +209,11 @@ int bench_futex_lock_pi(int argc, const char **argv)
create_threads(worker, thread_attr, cpu);
pthread_attr_destroy(&thread_attr);
- pthread_mutex_lock(&thread_lock);
+ mutex_lock(&thread_lock);
while (threads_starting)
- pthread_cond_wait(&thread_parent, &thread_lock);
- pthread_cond_broadcast(&thread_worker);
- pthread_mutex_unlock(&thread_lock);
+ cond_wait(&thread_parent, &thread_lock);
+ cond_broadcast(&thread_worker);
+ mutex_unlock(&thread_lock);
sleep(params.runtime);
toggle_done(0, NULL, NULL);
@@ -224,9 +225,9 @@ int bench_futex_lock_pi(int argc, const char **argv)
}
/* cleanup & report results */
- pthread_cond_destroy(&thread_parent);
- pthread_cond_destroy(&thread_worker);
- pthread_mutex_destroy(&thread_lock);
+ cond_destroy(&thread_parent);
+ cond_destroy(&thread_worker);
+ mutex_destroy(&thread_lock);
for (i = 0; i < params.nthreads; i++) {
unsigned long t = bench__runtime.tv_sec > 0 ?
diff --git a/tools/perf/bench/futex-requeue.c b/tools/perf/bench/futex-requeue.c
index b6faabfafb8e..69ad896f556c 100644
--- a/tools/perf/bench/futex-requeue.c
+++ b/tools/perf/bench/futex-requeue.c
@@ -15,6 +15,7 @@
#include <pthread.h>
#include <signal.h>
+#include "../util/mutex.h"
#include "../util/stat.h"
#include <subcmd/parse-options.h>
#include <linux/compiler.h>
@@ -34,8 +35,8 @@ static u_int32_t futex1 = 0, futex2 = 0;
static pthread_t *worker;
static bool done = false;
-static pthread_mutex_t thread_lock;
-static pthread_cond_t thread_parent, thread_worker;
+static struct mutex thread_lock;
+static struct cond thread_parent, thread_worker;
static struct stats requeuetime_stats, requeued_stats;
static unsigned int threads_starting;
static int futex_flag = 0;
@@ -82,12 +83,12 @@ static void *workerfn(void *arg __maybe_unused)
{
int ret;
- pthread_mutex_lock(&thread_lock);
+ mutex_lock(&thread_lock);
threads_starting--;
if (!threads_starting)
- pthread_cond_signal(&thread_parent);
- pthread_cond_wait(&thread_worker, &thread_lock);
- pthread_mutex_unlock(&thread_lock);
+ cond_signal(&thread_parent);
+ cond_wait(&thread_worker, &thread_lock);
+ mutex_unlock(&thread_lock);
while (1) {
if (!params.pi) {
@@ -209,9 +210,9 @@ int bench_futex_requeue(int argc, const char **argv)
init_stats(&requeued_stats);
init_stats(&requeuetime_stats);
pthread_attr_init(&thread_attr);
- pthread_mutex_init(&thread_lock, NULL);
- pthread_cond_init(&thread_parent, NULL);
- pthread_cond_init(&thread_worker, NULL);
+ mutex_init(&thread_lock);
+ cond_init(&thread_parent);
+ cond_init(&thread_worker);
for (j = 0; j < bench_repeat && !done; j++) {
unsigned int nrequeued = 0, wakeups = 0;
@@ -221,11 +222,11 @@ int bench_futex_requeue(int argc, const char **argv)
block_threads(worker, thread_attr, cpu);
/* make sure all threads are already blocked */
- pthread_mutex_lock(&thread_lock);
+ mutex_lock(&thread_lock);
while (threads_starting)
- pthread_cond_wait(&thread_parent, &thread_lock);
- pthread_cond_broadcast(&thread_worker);
- pthread_mutex_unlock(&thread_lock);
+ cond_wait(&thread_parent, &thread_lock);
+ cond_broadcast(&thread_worker);
+ mutex_unlock(&thread_lock);
usleep(100000);
@@ -297,9 +298,9 @@ int bench_futex_requeue(int argc, const char **argv)
}
/* cleanup & report results */
- pthread_cond_destroy(&thread_parent);
- pthread_cond_destroy(&thread_worker);
- pthread_mutex_destroy(&thread_lock);
+ cond_destroy(&thread_parent);
+ cond_destroy(&thread_worker);
+ mutex_destroy(&thread_lock);
pthread_attr_destroy(&thread_attr);
print_summary();
diff --git a/tools/perf/bench/futex-wake-parallel.c b/tools/perf/bench/futex-wake-parallel.c
index e47f46a3a47e..6682e49d0ee0 100644
--- a/tools/perf/bench/futex-wake-parallel.c
+++ b/tools/perf/bench/futex-wake-parallel.c
@@ -10,6 +10,7 @@
#include "bench.h"
#include <linux/compiler.h>
#include "../util/debug.h"
+#include "../util/mutex.h"
#ifndef HAVE_PTHREAD_BARRIER
int bench_futex_wake_parallel(int argc __maybe_unused, const char **argv __maybe_unused)
@@ -49,8 +50,8 @@ static u_int32_t futex = 0;
static pthread_t *blocked_worker;
static bool done = false;
-static pthread_mutex_t thread_lock;
-static pthread_cond_t thread_parent, thread_worker;
+static struct mutex thread_lock;
+static struct cond thread_parent, thread_worker;
static pthread_barrier_t barrier;
static struct stats waketime_stats, wakeup_stats;
static unsigned int threads_starting;
@@ -125,12 +126,12 @@ static void wakeup_threads(struct thread_data *td, pthread_attr_t thread_attr)
static void *blocked_workerfn(void *arg __maybe_unused)
{
- pthread_mutex_lock(&thread_lock);
+ mutex_lock(&thread_lock);
threads_starting--;
if (!threads_starting)
- pthread_cond_signal(&thread_parent);
- pthread_cond_wait(&thread_worker, &thread_lock);
- pthread_mutex_unlock(&thread_lock);
+ cond_signal(&thread_parent);
+ cond_wait(&thread_worker, &thread_lock);
+ mutex_unlock(&thread_lock);
while (1) { /* handle spurious wakeups */
if (futex_wait(&futex, 0, NULL, futex_flag) != EINTR)
@@ -294,9 +295,9 @@ int bench_futex_wake_parallel(int argc, const char **argv)
init_stats(&waketime_stats);
pthread_attr_init(&thread_attr);
- pthread_mutex_init(&thread_lock, NULL);
- pthread_cond_init(&thread_parent, NULL);
- pthread_cond_init(&thread_worker, NULL);
+ mutex_init(&thread_lock);
+ cond_init(&thread_parent);
+ cond_init(&thread_worker);
for (j = 0; j < bench_repeat && !done; j++) {
waking_worker = calloc(params.nwakes, sizeof(*waking_worker));
@@ -307,11 +308,11 @@ int bench_futex_wake_parallel(int argc, const char **argv)
block_threads(blocked_worker, thread_attr, cpu);
/* make sure all threads are already blocked */
- pthread_mutex_lock(&thread_lock);
+ mutex_lock(&thread_lock);
while (threads_starting)
- pthread_cond_wait(&thread_parent, &thread_lock);
- pthread_cond_broadcast(&thread_worker);
- pthread_mutex_unlock(&thread_lock);
+ cond_wait(&thread_parent, &thread_lock);
+ cond_broadcast(&thread_worker);
+ mutex_unlock(&thread_lock);
usleep(100000);
@@ -332,9 +333,9 @@ int bench_futex_wake_parallel(int argc, const char **argv)
}
/* cleanup & report results */
- pthread_cond_destroy(&thread_parent);
- pthread_cond_destroy(&thread_worker);
- pthread_mutex_destroy(&thread_lock);
+ cond_destroy(&thread_parent);
+ cond_destroy(&thread_worker);
+ mutex_destroy(&thread_lock);
pthread_attr_destroy(&thread_attr);
print_summary();
diff --git a/tools/perf/bench/futex-wake.c b/tools/perf/bench/futex-wake.c
index 201a3555f09a..9ecab6620a87 100644
--- a/tools/perf/bench/futex-wake.c
+++ b/tools/perf/bench/futex-wake.c
@@ -14,6 +14,7 @@
#include <pthread.h>
#include <signal.h>
+#include "../util/mutex.h"
#include "../util/stat.h"
#include <subcmd/parse-options.h>
#include <linux/compiler.h>
@@ -34,8 +35,8 @@ static u_int32_t futex1 = 0;
static pthread_t *worker;
static bool done = false;
-static pthread_mutex_t thread_lock;
-static pthread_cond_t thread_parent, thread_worker;
+static struct mutex thread_lock;
+static struct cond thread_parent, thread_worker;
static struct stats waketime_stats, wakeup_stats;
static unsigned int threads_starting;
static int futex_flag = 0;
@@ -65,12 +66,12 @@ static const char * const bench_futex_wake_usage[] = {
static void *workerfn(void *arg __maybe_unused)
{
- pthread_mutex_lock(&thread_lock);
+ mutex_lock(&thread_lock);
threads_starting--;
if (!threads_starting)
- pthread_cond_signal(&thread_parent);
- pthread_cond_wait(&thread_worker, &thread_lock);
- pthread_mutex_unlock(&thread_lock);
+ cond_signal(&thread_parent);
+ cond_wait(&thread_worker, &thread_lock);
+ mutex_unlock(&thread_lock);
while (1) {
if (futex_wait(&futex1, 0, NULL, futex_flag) != EINTR)
@@ -178,9 +179,9 @@ int bench_futex_wake(int argc, const char **argv)
init_stats(&wakeup_stats);
init_stats(&waketime_stats);
pthread_attr_init(&thread_attr);
- pthread_mutex_init(&thread_lock, NULL);
- pthread_cond_init(&thread_parent, NULL);
- pthread_cond_init(&thread_worker, NULL);
+ mutex_init(&thread_lock);
+ cond_init(&thread_parent);
+ cond_init(&thread_worker);
for (j = 0; j < bench_repeat && !done; j++) {
unsigned int nwoken = 0;
@@ -190,11 +191,11 @@ int bench_futex_wake(int argc, const char **argv)
block_threads(worker, thread_attr, cpu);
/* make sure all threads are already blocked */
- pthread_mutex_lock(&thread_lock);
+ mutex_lock(&thread_lock);
while (threads_starting)
- pthread_cond_wait(&thread_parent, &thread_lock);
- pthread_cond_broadcast(&thread_worker);
- pthread_mutex_unlock(&thread_lock);
+ cond_wait(&thread_parent, &thread_lock);
+ cond_broadcast(&thread_worker);
+ mutex_unlock(&thread_lock);
usleep(100000);
@@ -224,9 +225,9 @@ int bench_futex_wake(int argc, const char **argv)
}
/* cleanup & report results */
- pthread_cond_destroy(&thread_parent);
- pthread_cond_destroy(&thread_worker);
- pthread_mutex_destroy(&thread_lock);
+ cond_destroy(&thread_parent);
+ cond_destroy(&thread_worker);
+ mutex_destroy(&thread_lock);
pthread_attr_destroy(&thread_attr);
print_summary();
diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c
index 20eed1e53f80..e78dedf9e682 100644
--- a/tools/perf/bench/numa.c
+++ b/tools/perf/bench/numa.c
@@ -6,8 +6,6 @@
*/
#include <inttypes.h>
-/* For the CLR_() macros */
-#include <pthread.h>
#include <subcmd/parse-options.h>
#include "../util/cloexec.h"
@@ -35,6 +33,7 @@
#include <linux/zalloc.h>
#include "../util/header.h"
+#include "../util/mutex.h"
#include <numa.h>
#include <numaif.h>
@@ -67,7 +66,7 @@ struct thread_data {
u64 system_time_ns;
u64 user_time_ns;
double speed_gbs;
- pthread_mutex_t *process_lock;
+ struct mutex *process_lock;
};
/* Parameters set by options: */
@@ -137,16 +136,16 @@ struct params {
struct global_info {
u8 *data;
- pthread_mutex_t startup_mutex;
- pthread_cond_t startup_cond;
+ struct mutex startup_mutex;
+ struct cond startup_cond;
int nr_tasks_started;
- pthread_mutex_t start_work_mutex;
- pthread_cond_t start_work_cond;
+ struct mutex start_work_mutex;
+ struct cond start_work_cond;
int nr_tasks_working;
bool start_work;
- pthread_mutex_t stop_work_mutex;
+ struct mutex stop_work_mutex;
u64 bytes_done;
struct thread_data *threads;
@@ -524,30 +523,6 @@ static void * setup_private_data(ssize_t bytes)
return alloc_data(bytes, MAP_PRIVATE, 0, g->p.init_cpu0, g->p.thp, g->p.init_random);
}
-/*
- * Return a process-shared (global) mutex:
- */
-static void init_global_mutex(pthread_mutex_t *mutex)
-{
- pthread_mutexattr_t attr;
-
- pthread_mutexattr_init(&attr);
- pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
- pthread_mutex_init(mutex, &attr);
-}
-
-/*
- * Return a process-shared (global) condition variable:
- */
-static void init_global_cond(pthread_cond_t *cond)
-{
- pthread_condattr_t attr;
-
- pthread_condattr_init(&attr);
- pthread_condattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
- pthread_cond_init(cond, &attr);
-}
-
static int parse_cpu_list(const char *arg)
{
p0.cpu_list_str = strdup(arg);
@@ -1220,22 +1195,22 @@ static void *worker_thread(void *__tdata)
}
if (g->p.serialize_startup) {
- pthread_mutex_lock(&g->startup_mutex);
+ mutex_lock(&g->startup_mutex);
g->nr_tasks_started++;
/* The last thread wakes the main process. */
if (g->nr_tasks_started == g->p.nr_tasks)
- pthread_cond_signal(&g->startup_cond);
+ cond_signal(&g->startup_cond);
- pthread_mutex_unlock(&g->startup_mutex);
+ mutex_unlock(&g->startup_mutex);
/* Here we will wait for the main process to start us all at once: */
- pthread_mutex_lock(&g->start_work_mutex);
+ mutex_lock(&g->start_work_mutex);
g->start_work = false;
g->nr_tasks_working++;
while (!g->start_work)
- pthread_cond_wait(&g->start_work_cond, &g->start_work_mutex);
+ cond_wait(&g->start_work_cond, &g->start_work_mutex);
- pthread_mutex_unlock(&g->start_work_mutex);
+ mutex_unlock(&g->start_work_mutex);
}
gettimeofday(&start0, NULL);
@@ -1254,17 +1229,17 @@ static void *worker_thread(void *__tdata)
val += do_work(thread_data, g->p.bytes_thread, 0, 1, l, val);
if (g->p.sleep_usecs) {
- pthread_mutex_lock(td->process_lock);
+ mutex_lock(td->process_lock);
usleep(g->p.sleep_usecs);
- pthread_mutex_unlock(td->process_lock);
+ mutex_unlock(td->process_lock);
}
/*
* Amount of work to be done under a process-global lock:
*/
if (g->p.bytes_process_locked) {
- pthread_mutex_lock(td->process_lock);
+ mutex_lock(td->process_lock);
val += do_work(process_data, g->p.bytes_process_locked, thread_nr, g->p.nr_threads, l, val);
- pthread_mutex_unlock(td->process_lock);
+ mutex_unlock(td->process_lock);
}
work_done = g->p.bytes_global + g->p.bytes_process +
@@ -1361,9 +1336,9 @@ static void *worker_thread(void *__tdata)
free_data(thread_data, g->p.bytes_thread);
- pthread_mutex_lock(&g->stop_work_mutex);
+ mutex_lock(&g->stop_work_mutex);
g->bytes_done += bytes_done;
- pthread_mutex_unlock(&g->stop_work_mutex);
+ mutex_unlock(&g->stop_work_mutex);
return NULL;
}
@@ -1373,7 +1348,7 @@ static void *worker_thread(void *__tdata)
*/
static void worker_process(int process_nr)
{
- pthread_mutex_t process_lock;
+ struct mutex process_lock;
struct thread_data *td;
pthread_t *pthreads;
u8 *process_data;
@@ -1381,7 +1356,7 @@ static void worker_process(int process_nr)
int ret;
int t;
- pthread_mutex_init(&process_lock, NULL);
+ mutex_init(&process_lock);
set_taskname("process %d", process_nr);
/*
@@ -1540,11 +1515,11 @@ static int init(void)
g->data = setup_shared_data(g->p.bytes_global);
/* Startup serialization: */
- init_global_mutex(&g->start_work_mutex);
- init_global_cond(&g->start_work_cond);
- init_global_mutex(&g->startup_mutex);
- init_global_cond(&g->startup_cond);
- init_global_mutex(&g->stop_work_mutex);
+ mutex_init_pshared(&g->start_work_mutex);
+ cond_init_pshared(&g->start_work_cond);
+ mutex_init_pshared(&g->startup_mutex);
+ cond_init_pshared(&g->startup_cond);
+ mutex_init_pshared(&g->stop_work_mutex);
init_thread_data();
@@ -1633,17 +1608,17 @@ static int __bench_numa(const char *name)
* Wait for all the threads to start up. The last thread will
* signal this process.
*/
- pthread_mutex_lock(&g->startup_mutex);
+ mutex_lock(&g->startup_mutex);
while (g->nr_tasks_started != g->p.nr_tasks)
- pthread_cond_wait(&g->startup_cond, &g->startup_mutex);
+ cond_wait(&g->startup_cond, &g->startup_mutex);
- pthread_mutex_unlock(&g->startup_mutex);
+ mutex_unlock(&g->startup_mutex);
/* Wait for all threads to be at the start_work_cond. */
while (!threads_ready) {
- pthread_mutex_lock(&g->start_work_mutex);
+ mutex_lock(&g->start_work_mutex);
threads_ready = (g->nr_tasks_working == g->p.nr_tasks);
- pthread_mutex_unlock(&g->start_work_mutex);
+ mutex_unlock(&g->start_work_mutex);
if (!threads_ready)
usleep(1);
}
@@ -1661,10 +1636,10 @@ static int __bench_numa(const char *name)
start = stop;
/* Start all threads running. */
- pthread_mutex_lock(&g->start_work_mutex);
+ mutex_lock(&g->start_work_mutex);
g->start_work = true;
- pthread_mutex_unlock(&g->start_work_mutex);
- pthread_cond_broadcast(&g->start_work_cond);
+ mutex_unlock(&g->start_work_mutex);
+ cond_broadcast(&g->start_work_cond);
} else {
gettimeofday(&start, NULL);
}
diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c
index 438fc222e213..a9190458d2d5 100644
--- a/tools/perf/builtin-c2c.c
+++ b/tools/perf/builtin-c2c.c
@@ -679,28 +679,35 @@ STAT_FN(ld_l2hit)
STAT_FN(ld_llchit)
STAT_FN(rmt_hit)
-static uint64_t total_records(struct c2c_stats *stats)
+static uint64_t get_load_llc_misses(struct c2c_stats *stats)
{
- uint64_t lclmiss, ldcnt, total;
-
- lclmiss = stats->lcl_dram +
- stats->rmt_dram +
- stats->rmt_hitm +
- stats->rmt_hit;
+ return stats->lcl_dram +
+ stats->rmt_dram +
+ stats->rmt_hitm +
+ stats->rmt_hit;
+}
- ldcnt = lclmiss +
- stats->ld_fbhit +
- stats->ld_l1hit +
- stats->ld_l2hit +
- stats->ld_llchit +
- stats->lcl_hitm;
+static uint64_t get_load_cache_hits(struct c2c_stats *stats)
+{
+ return stats->ld_fbhit +
+ stats->ld_l1hit +
+ stats->ld_l2hit +
+ stats->ld_llchit +
+ stats->lcl_hitm;
+}
- total = ldcnt +
- stats->st_l1hit +
- stats->st_l1miss +
- stats->st_na;
+static uint64_t get_stores(struct c2c_stats *stats)
+{
+ return stats->st_l1hit +
+ stats->st_l1miss +
+ stats->st_na;
+}
- return total;
+static uint64_t total_records(struct c2c_stats *stats)
+{
+ return get_load_llc_misses(stats) +
+ get_load_cache_hits(stats) +
+ get_stores(stats);
}
static int
@@ -737,21 +744,8 @@ tot_recs_cmp(struct perf_hpp_fmt *fmt __maybe_unused,
static uint64_t total_loads(struct c2c_stats *stats)
{
- uint64_t lclmiss, ldcnt;
-
- lclmiss = stats->lcl_dram +
- stats->rmt_dram +
- stats->rmt_hitm +
- stats->rmt_hit;
-
- ldcnt = lclmiss +
- stats->ld_fbhit +
- stats->ld_l1hit +
- stats->ld_l2hit +
- stats->ld_llchit +
- stats->lcl_hitm;
-
- return ldcnt;
+ return get_load_llc_misses(stats) +
+ get_load_cache_hits(stats);
}
static int
@@ -2376,10 +2370,7 @@ static void print_c2c__display_stats(FILE *out)
int llc_misses;
struct c2c_stats *stats = &c2c.hists.stats;
- llc_misses = stats->lcl_dram +
- stats->rmt_dram +
- stats->rmt_hit +
- stats->rmt_hitm;
+ llc_misses = get_load_llc_misses(stats);
fprintf(out, "=================================================\n");
fprintf(out, " Trace Event Information \n");
@@ -3290,6 +3281,7 @@ static int perf_c2c__record(int argc, const char **argv)
*/
if (e->tag) {
e->record = true;
+ rec_argv[i++] = "-W";
} else {
e = perf_mem_events__ptr(PERF_MEM_EVENTS__LOAD);
e->record = true;
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
index 2a0f992ca0be..e254f18986f7 100644
--- a/tools/perf/builtin-inject.c
+++ b/tools/perf/builtin-inject.c
@@ -21,6 +21,7 @@
#include "util/data.h"
#include "util/auxtrace.h"
#include "util/jit.h"
+#include "util/string2.h"
#include "util/symbol.h"
#include "util/synthetic-events.h"
#include "util/thread.h"
@@ -38,6 +39,7 @@
#include <linux/string.h>
#include <linux/zalloc.h>
#include <linux/hash.h>
+#include <ctype.h>
#include <errno.h>
#include <signal.h>
#include <inttypes.h>
@@ -123,6 +125,7 @@ struct perf_inject {
char event_copy[PERF_SAMPLE_MAX_SIZE];
struct perf_file_section secs[HEADER_FEAT_BITS];
struct guest_session guest_session;
+ struct strlist *known_build_ids;
};
struct event_entry {
@@ -433,8 +436,10 @@ static struct dso *findnew_dso(int pid, int tid, const char *filename,
}
if (dso) {
+ mutex_lock(&dso->lock);
nsinfo__put(dso->nsinfo);
dso->nsinfo = nsi;
+ mutex_unlock(&dso->lock);
} else
nsinfo__put(nsi);
@@ -617,6 +622,7 @@ static int dso__read_build_id(struct dso *dso)
if (dso->has_build_id)
return 0;
+ mutex_lock(&dso->lock);
nsinfo__mountns_enter(dso->nsinfo, &nsc);
if (filename__read_build_id(dso->long_name, &dso->bid) > 0)
dso->has_build_id = true;
@@ -630,13 +636,78 @@ static int dso__read_build_id(struct dso *dso)
free(new_name);
}
nsinfo__mountns_exit(&nsc);
+ mutex_unlock(&dso->lock);
return dso->has_build_id ? 0 : -1;
}
+static struct strlist *perf_inject__parse_known_build_ids(
+ const char *known_build_ids_string)
+{
+ struct str_node *pos, *tmp;
+ struct strlist *known_build_ids;
+ int bid_len;
+
+ known_build_ids = strlist__new(known_build_ids_string, NULL);
+ if (known_build_ids == NULL)
+ return NULL;
+ strlist__for_each_entry_safe(pos, tmp, known_build_ids) {
+ const char *build_id, *dso_name;
+
+ build_id = skip_spaces(pos->s);
+ dso_name = strchr(build_id, ' ');
+ if (dso_name == NULL) {
+ strlist__remove(known_build_ids, pos);
+ continue;
+ }
+ bid_len = dso_name - pos->s;
+ dso_name = skip_spaces(dso_name);
+ if (bid_len % 2 != 0 || bid_len >= SBUILD_ID_SIZE) {
+ strlist__remove(known_build_ids, pos);
+ continue;
+ }
+ for (int ix = 0; 2 * ix + 1 < bid_len; ++ix) {
+ if (!isxdigit(build_id[2 * ix]) ||
+ !isxdigit(build_id[2 * ix + 1])) {
+ strlist__remove(known_build_ids, pos);
+ break;
+ }
+ }
+ }
+ return known_build_ids;
+}
+
+static bool perf_inject__lookup_known_build_id(struct perf_inject *inject,
+ struct dso *dso)
+{
+ struct str_node *pos;
+ int bid_len;
+
+ strlist__for_each_entry(pos, inject->known_build_ids) {
+ const char *build_id, *dso_name;
+
+ build_id = skip_spaces(pos->s);
+ dso_name = strchr(build_id, ' ');
+ bid_len = dso_name - pos->s;
+ dso_name = skip_spaces(dso_name);
+ if (strcmp(dso->long_name, dso_name))
+ continue;
+ for (int ix = 0; 2 * ix + 1 < bid_len; ++ix) {
+ dso->bid.data[ix] = (hex(build_id[2 * ix]) << 4 |
+ hex(build_id[2 * ix + 1]));
+ }
+ dso->bid.size = bid_len / 2;
+ dso->has_build_id = 1;
+ return true;
+ }
+ return false;
+}
+
static int dso__inject_build_id(struct dso *dso, struct perf_tool *tool,
struct machine *machine, u8 cpumode, u32 flags)
{
+ struct perf_inject *inject = container_of(tool, struct perf_inject,
+ tool);
int err;
if (is_anon_memory(dso->long_name) || flags & MAP_HUGETLB)
@@ -644,6 +715,10 @@ static int dso__inject_build_id(struct dso *dso, struct perf_tool *tool,
if (is_no_dso_memory(dso->long_name))
return 0;
+ if (inject->known_build_ids != NULL &&
+ perf_inject__lookup_known_build_id(inject, dso))
+ return 1;
+
if (dso__read_build_id(dso) < 0) {
pr_debug("no build_id found for %s\n", dso->long_name);
return -1;
@@ -2112,12 +2187,16 @@ int cmd_inject(int argc, const char **argv)
};
int ret;
bool repipe = true;
+ const char *known_build_ids = NULL;
struct option options[] = {
OPT_BOOLEAN('b', "build-ids", &inject.build_ids,
"Inject build-ids into the output stream"),
OPT_BOOLEAN(0, "buildid-all", &inject.build_id_all,
"Inject build-ids of all DSOs into the output stream"),
+ OPT_STRING(0, "known-build-ids", &known_build_ids,
+ "buildid path [,buildid path...]",
+ "build-ids to use for given paths"),
OPT_STRING('i', "input", &inject.input_name, "file",
"input file name"),
OPT_STRING('o', "output", &inject.output.path, "file",
@@ -2257,6 +2336,15 @@ int cmd_inject(int argc, const char **argv)
*/
inject.tool.ordered_events = true;
inject.tool.ordering_requires_timestamps = true;
+ if (known_build_ids != NULL) {
+ inject.known_build_ids =
+ perf_inject__parse_known_build_ids(known_build_ids);
+
+ if (inject.known_build_ids == NULL) {
+ pr_err("Couldn't parse known build ids.\n");
+ goto out_delete;
+ }
+ }
}
if (inject.sched_stat) {
@@ -2285,6 +2373,7 @@ int cmd_inject(int argc, const char **argv)
guest_session__exit(&inject.guest_session);
out_delete:
+ strlist__delete(inject.known_build_ids);
zstd_fini(&(inject.session->zstd_data));
perf_session__delete(inject.session);
out_close_output:
diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c
index 744dd3520584..58e1ec1654ef 100644
--- a/tools/perf/builtin-list.c
+++ b/tools/perf/builtin-list.c
@@ -60,7 +60,7 @@ int cmd_list(int argc, const char **argv)
setup_pager();
if (!raw_dump && pager_in_use())
- printf("\nList of pre-defined events (to be used in -e):\n\n");
+ printf("\nList of pre-defined events (to be used in -e or -M):\n\n");
if (hybrid_type) {
pmu_name = perf_pmu__hybrid_type_to_pmu(hybrid_type);
diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index ea40ae52cd2c..9722d4ab2e55 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -28,7 +28,6 @@
#include <sys/types.h>
#include <sys/prctl.h>
#include <semaphore.h>
-#include <pthread.h>
#include <math.h>
#include <limits.h>
@@ -57,6 +56,9 @@ static bool combine_locks;
static bool show_thread_stats;
static bool use_bpf;
static unsigned long bpf_map_entries = 10240;
+static int max_stack_depth = CONTENTION_STACK_DEPTH;
+static int stack_skip = CONTENTION_STACK_SKIP;
+static int print_nr_entries = INT_MAX / 2;
static enum {
LOCK_AGGR_ADDR,
@@ -561,29 +563,50 @@ enum acquire_flags {
READ_LOCK = 2,
};
-static int report_lock_acquire_event(struct evsel *evsel,
- struct perf_sample *sample)
+static int get_key_by_aggr_mode_simple(u64 *key, u64 addr, u32 tid)
{
- struct lock_stat *ls;
- struct thread_stat *ts;
- struct lock_seq_stat *seq;
- const char *name = evsel__strval(evsel, sample, "name");
- u64 addr = evsel__intval(evsel, sample, "lockdep_addr");
- int flag = evsel__intval(evsel, sample, "flags");
- u64 key;
-
switch (aggr_mode) {
case LOCK_AGGR_ADDR:
- key = addr;
+ *key = addr;
break;
case LOCK_AGGR_TASK:
- key = sample->tid;
+ *key = tid;
break;
case LOCK_AGGR_CALLER:
default:
pr_err("Invalid aggregation mode: %d\n", aggr_mode);
return -EINVAL;
}
+ return 0;
+}
+
+static u64 callchain_id(struct evsel *evsel, struct perf_sample *sample);
+
+static int get_key_by_aggr_mode(u64 *key, u64 addr, struct evsel *evsel,
+ struct perf_sample *sample)
+{
+ if (aggr_mode == LOCK_AGGR_CALLER) {
+ *key = callchain_id(evsel, sample);
+ return 0;
+ }
+ return get_key_by_aggr_mode_simple(key, addr, sample->tid);
+}
+
+static int report_lock_acquire_event(struct evsel *evsel,
+ struct perf_sample *sample)
+{
+ struct lock_stat *ls;
+ struct thread_stat *ts;
+ struct lock_seq_stat *seq;
+ const char *name = evsel__strval(evsel, sample, "name");
+ u64 addr = evsel__intval(evsel, sample, "lockdep_addr");
+ int flag = evsel__intval(evsel, sample, "flags");
+ u64 key;
+ int ret;
+
+ ret = get_key_by_aggr_mode_simple(&key, addr, sample->tid);
+ if (ret < 0)
+ return ret;
ls = lock_stat_findnew(key, name, 0);
if (!ls)
@@ -654,19 +677,11 @@ static int report_lock_acquired_event(struct evsel *evsel,
const char *name = evsel__strval(evsel, sample, "name");
u64 addr = evsel__intval(evsel, sample, "lockdep_addr");
u64 key;
+ int ret;
- switch (aggr_mode) {
- case LOCK_AGGR_ADDR:
- key = addr;
- break;
- case LOCK_AGGR_TASK:
- key = sample->tid;
- break;
- case LOCK_AGGR_CALLER:
- default:
- pr_err("Invalid aggregation mode: %d\n", aggr_mode);
- return -EINVAL;
- }
+ ret = get_key_by_aggr_mode_simple(&key, addr, sample->tid);
+ if (ret < 0)
+ return ret;
ls = lock_stat_findnew(key, name, 0);
if (!ls)
@@ -727,19 +742,11 @@ static int report_lock_contended_event(struct evsel *evsel,
const char *name = evsel__strval(evsel, sample, "name");
u64 addr = evsel__intval(evsel, sample, "lockdep_addr");
u64 key;
+ int ret;
- switch (aggr_mode) {
- case LOCK_AGGR_ADDR:
- key = addr;
- break;
- case LOCK_AGGR_TASK:
- key = sample->tid;
- break;
- case LOCK_AGGR_CALLER:
- default:
- pr_err("Invalid aggregation mode: %d\n", aggr_mode);
- return -EINVAL;
- }
+ ret = get_key_by_aggr_mode_simple(&key, addr, sample->tid);
+ if (ret < 0)
+ return ret;
ls = lock_stat_findnew(key, name, 0);
if (!ls)
@@ -793,19 +800,11 @@ static int report_lock_release_event(struct evsel *evsel,
const char *name = evsel__strval(evsel, sample, "name");
u64 addr = evsel__intval(evsel, sample, "lockdep_addr");
u64 key;
+ int ret;
- switch (aggr_mode) {
- case LOCK_AGGR_ADDR:
- key = addr;
- break;
- case LOCK_AGGR_TASK:
- key = sample->tid;
- break;
- case LOCK_AGGR_CALLER:
- default:
- pr_err("Invalid aggregation mode: %d\n", aggr_mode);
- return -EINVAL;
- }
+ ret = get_key_by_aggr_mode_simple(&key, addr, sample->tid);
+ if (ret < 0)
+ return ret;
ls = lock_stat_findnew(key, name, 0);
if (!ls)
@@ -903,6 +902,23 @@ bool is_lock_function(struct machine *machine, u64 addr)
return false;
}
+static int get_symbol_name_offset(struct map *map, struct symbol *sym, u64 ip,
+ char *buf, int size)
+{
+ u64 offset;
+
+ if (map == NULL || sym == NULL) {
+ buf[0] = '\0';
+ return 0;
+ }
+
+ offset = map->map_ip(map, ip) - sym->start;
+
+ if (offset)
+ return scnprintf(buf, size, "%s+%#lx", sym->name, offset);
+ else
+ return strlcpy(buf, sym->name, size);
+}
static int lock_contention_caller(struct evsel *evsel, struct perf_sample *sample,
char *buf, int size)
{
@@ -923,7 +939,7 @@ static int lock_contention_caller(struct evsel *evsel, struct perf_sample *sampl
/* use caller function name from the callchain */
ret = thread__resolve_callchain(thread, cursor, evsel, sample,
- NULL, NULL, CONTENTION_STACK_DEPTH);
+ NULL, NULL, max_stack_depth);
if (ret != 0) {
thread__put(thread);
return -1;
@@ -940,20 +956,13 @@ static int lock_contention_caller(struct evsel *evsel, struct perf_sample *sampl
break;
/* skip first few entries - for lock functions */
- if (++skip <= CONTENTION_STACK_SKIP)
+ if (++skip <= stack_skip)
goto next;
sym = node->ms.sym;
if (sym && !is_lock_function(machine, node->ip)) {
- struct map *map = node->ms.map;
- u64 offset;
-
- offset = map->map_ip(map, node->ip) - sym->start;
-
- if (offset)
- scnprintf(buf, size, "%s+%#lx", sym->name, offset);
- else
- strlcpy(buf, sym->name, size);
+ get_symbol_name_offset(node->ms.map, sym, node->ip,
+ buf, size);
return 0;
}
@@ -978,7 +987,7 @@ static u64 callchain_id(struct evsel *evsel, struct perf_sample *sample)
/* use caller function name from the callchain */
ret = thread__resolve_callchain(thread, cursor, evsel, sample,
- NULL, NULL, CONTENTION_STACK_DEPTH);
+ NULL, NULL, max_stack_depth);
thread__put(thread);
if (ret != 0)
@@ -994,7 +1003,7 @@ static u64 callchain_id(struct evsel *evsel, struct perf_sample *sample)
break;
/* skip first few entries - for lock functions */
- if (++skip <= CONTENTION_STACK_SKIP)
+ if (++skip <= stack_skip)
goto next;
if (node->ms.sym && is_lock_function(machine, node->ip))
@@ -1008,6 +1017,27 @@ next:
return hash;
}
+static u64 *get_callstack(struct perf_sample *sample, int max_stack)
+{
+ u64 *callstack;
+ u64 i;
+ int c;
+
+ callstack = calloc(max_stack, sizeof(*callstack));
+ if (callstack == NULL)
+ return NULL;
+
+ for (i = 0, c = 0; i < sample->callchain->nr && c < max_stack; i++) {
+ u64 ip = sample->callchain->ips[i];
+
+ if (ip >= PERF_CONTEXT_MAX)
+ continue;
+
+ callstack[c++] = ip;
+ }
+ return callstack;
+}
+
static int report_lock_contention_begin_event(struct evsel *evsel,
struct perf_sample *sample)
{
@@ -1016,21 +1046,11 @@ static int report_lock_contention_begin_event(struct evsel *evsel,
struct lock_seq_stat *seq;
u64 addr = evsel__intval(evsel, sample, "lock_addr");
u64 key;
+ int ret;
- switch (aggr_mode) {
- case LOCK_AGGR_ADDR:
- key = addr;
- break;
- case LOCK_AGGR_TASK:
- key = sample->tid;
- break;
- case LOCK_AGGR_CALLER:
- key = callchain_id(evsel, sample);
- break;
- default:
- pr_err("Invalid aggregation mode: %d\n", aggr_mode);
- return -EINVAL;
- }
+ ret = get_key_by_aggr_mode(&key, addr, evsel, sample);
+ if (ret < 0)
+ return ret;
ls = lock_stat_find(key);
if (!ls) {
@@ -1044,6 +1064,12 @@ static int report_lock_contention_begin_event(struct evsel *evsel,
ls = lock_stat_findnew(key, caller, flags);
if (!ls)
return -ENOMEM;
+
+ if (aggr_mode == LOCK_AGGR_CALLER && verbose) {
+ ls->callstack = get_callstack(sample, max_stack_depth);
+ if (ls->callstack == NULL)
+ return -ENOMEM;
+ }
}
ts = thread_stat_findnew(sample->tid);
@@ -1099,21 +1125,11 @@ static int report_lock_contention_end_event(struct evsel *evsel,
u64 contended_term;
u64 addr = evsel__intval(evsel, sample, "lock_addr");
u64 key;
+ int ret;
- switch (aggr_mode) {
- case LOCK_AGGR_ADDR:
- key = addr;
- break;
- case LOCK_AGGR_TASK:
- key = sample->tid;
- break;
- case LOCK_AGGR_CALLER:
- key = callchain_id(evsel, sample);
- break;
- default:
- pr_err("Invalid aggregation mode: %d\n", aggr_mode);
- return -EINVAL;
- }
+ ret = get_key_by_aggr_mode(&key, addr, evsel, sample);
+ if (ret < 0)
+ return ret;
ls = lock_stat_find(key);
if (!ls)
@@ -1234,7 +1250,7 @@ static void print_bad_events(int bad, int total)
for (i = 0; i < BROKEN_MAX; i++)
broken += bad_hist[i];
- if (broken == 0 && !verbose)
+ if (quiet || (broken == 0 && !verbose))
return;
pr_info("\n=== output for debug===\n\n");
@@ -1251,14 +1267,16 @@ static void print_result(void)
struct lock_stat *st;
struct lock_key *key;
char cut_name[20];
- int bad, total;
+ int bad, total, printed;
- pr_info("%20s ", "Name");
- list_for_each_entry(key, &lock_keys, list)
- pr_info("%*s ", key->len, key->header);
- pr_info("\n\n");
+ if (!quiet) {
+ pr_info("%20s ", "Name");
+ list_for_each_entry(key, &lock_keys, list)
+ pr_info("%*s ", key->len, key->header);
+ pr_info("\n\n");
+ }
- bad = total = 0;
+ bad = total = printed = 0;
while ((st = pop_from_result())) {
total++;
if (st->broken)
@@ -1296,6 +1314,9 @@ static void print_result(void)
pr_info(" ");
}
pr_info("\n");
+
+ if (++printed >= print_nr_entries)
+ break;
}
print_bad_events(bad, total);
@@ -1457,21 +1478,23 @@ static void sort_contention_result(void)
sort_result();
}
-static void print_contention_result(void)
+static void print_contention_result(struct lock_contention *con)
{
struct lock_stat *st;
struct lock_key *key;
- int bad, total;
+ int bad, total, printed;
- list_for_each_entry(key, &lock_keys, list)
- pr_info("%*s ", key->len, key->header);
+ if (!quiet) {
+ list_for_each_entry(key, &lock_keys, list)
+ pr_info("%*s ", key->len, key->header);
- if (show_thread_stats)
- pr_info(" %10s %s\n\n", "pid", "comm");
- else
- pr_info(" %10s %s\n\n", "type", "caller");
+ if (show_thread_stats)
+ pr_info(" %10s %s\n\n", "pid", "comm");
+ else
+ pr_info(" %10s %s\n\n", "type", "caller");
+ }
- bad = total = 0;
+ bad = total = printed = 0;
if (use_bpf)
bad = bad_hist[BROKEN_CONTENDED];
@@ -1492,10 +1515,30 @@ static void print_contention_result(void)
/* st->addr contains tid of thread */
t = perf_session__findnew(session, pid);
pr_info(" %10d %s\n", pid, thread__comm_str(t));
- continue;
+ goto next;
}
pr_info(" %10s %s\n", get_type_str(st), st->name);
+ if (verbose) {
+ struct map *kmap;
+ struct symbol *sym;
+ char buf[128];
+ u64 ip;
+
+ for (int i = 0; i < max_stack_depth; i++) {
+ if (!st->callstack || !st->callstack[i])
+ break;
+
+ ip = st->callstack[i];
+ sym = machine__find_kernel_symbol(con->machine, ip, &kmap);
+ get_symbol_name_offset(kmap, sym, ip, buf, sizeof(buf));
+ pr_info("\t\t\t%#lx %s\n", (unsigned long)ip, buf);
+ }
+ }
+
+next:
+ if (++printed >= print_nr_entries)
+ break;
}
print_bad_events(bad, total);
@@ -1603,6 +1646,8 @@ static int __cmd_contention(int argc, const char **argv)
.target = &target,
.result = &lockhash_table[0],
.map_nr_entries = bpf_map_entries,
+ .max_stack = max_stack_depth,
+ .stack_skip = stack_skip,
};
session = perf_session__new(use_bpf ? NULL : &data, &eops);
@@ -1611,6 +1656,8 @@ static int __cmd_contention(int argc, const char **argv)
return PTR_ERR(session);
}
+ con.machine = &session->machines.host;
+
/* for lock function check */
symbol_conf.sort_by_name = true;
symbol__init(&session->header.env);
@@ -1629,8 +1676,6 @@ static int __cmd_contention(int argc, const char **argv)
signal(SIGCHLD, sighandler);
signal(SIGTERM, sighandler);
- con.machine = &session->machines.host;
-
con.evlist = evlist__new();
if (con.evlist == NULL) {
err = -ENOMEM;
@@ -1702,7 +1747,7 @@ static int __cmd_contention(int argc, const char **argv)
setup_pager();
sort_contention_result();
- print_contention_result();
+ print_contention_result(&con);
out_delete:
evlist__delete(con.evlist);
@@ -1824,6 +1869,7 @@ int cmd_lock(int argc, const char **argv)
"file", "vmlinux pathname"),
OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name,
"file", "kallsyms pathname"),
+ OPT_BOOLEAN('q', "quiet", &quiet, "Do not show any message"),
OPT_END()
};
@@ -1845,6 +1891,7 @@ int cmd_lock(int argc, const char **argv)
"combine locks in the same class"),
OPT_BOOLEAN('t', "threads", &show_thread_stats,
"show per-thread lock stats"),
+ OPT_INTEGER('E', "entries", &print_nr_entries, "display this many functions"),
OPT_PARENT(lock_options)
};
@@ -1866,6 +1913,13 @@ int cmd_lock(int argc, const char **argv)
"Trace on existing thread id (exclusive to --pid)"),
OPT_CALLBACK(0, "map-nr-entries", &bpf_map_entries, "num",
"Max number of BPF map entries", parse_map_entry),
+ OPT_INTEGER(0, "max-stack", &max_stack_depth,
+ "Set the maximum stack depth when collecting lock contention, "
+ "Default: " __stringify(CONTENTION_STACK_DEPTH)),
+ OPT_INTEGER(0, "stack-skip", &stack_skip,
+ "Set the number of stack depth to skip when finding a lock caller, "
+ "Default: " __stringify(CONTENTION_STACK_SKIP)),
+ OPT_INTEGER('E', "entries", &print_nr_entries, "display this many functions"),
OPT_PARENT(lock_options)
};
diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c
index 9e435fd23503..923fb8316fda 100644
--- a/tools/perf/builtin-mem.c
+++ b/tools/perf/builtin-mem.c
@@ -97,6 +97,9 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem)
else
rec_argc = argc + 9 * perf_pmu__hybrid_pmu_num();
+ if (mem->cpu_list)
+ rec_argc += 2;
+
rec_argv = calloc(rec_argc + 1, sizeof(char *));
if (!rec_argv)
return -1;
@@ -122,6 +125,7 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem)
(mem->operation & MEM_OPERATION_LOAD) &&
(mem->operation & MEM_OPERATION_STORE)) {
e->record = true;
+ rec_argv[i++] = "-W";
} else {
if (mem->operation & MEM_OPERATION_LOAD) {
e = perf_mem_events__ptr(PERF_MEM_EVENTS__LOAD);
@@ -158,6 +162,11 @@ static int __cmd_record(int argc, const char **argv, struct perf_mem *mem)
if (all_kernel)
rec_argv[i++] = "--all-kernel";
+ if (mem->cpu_list) {
+ rec_argv[i++] = "-C";
+ rec_argv[i++] = mem->cpu_list;
+ }
+
for (j = 0; j < argc; j++, i++)
rec_argv[i] = argv[j];
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index f87ef43eb820..e128b855ddde 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -10,6 +10,7 @@
#include "util/build-id.h"
#include <subcmd/parse-options.h>
+#include <internal/xyarray.h>
#include "util/parse-events.h"
#include "util/config.h"
@@ -21,6 +22,7 @@
#include "util/evsel.h"
#include "util/debug.h"
#include "util/mmap.h"
+#include "util/mutex.h"
#include "util/target.h"
#include "util/session.h"
#include "util/tool.h"
@@ -143,6 +145,11 @@ static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
"undefined", "cpu", "core", "package", "numa", "user"
};
+struct pollfd_index_map {
+ int evlist_pollfd_index;
+ int thread_pollfd_index;
+};
+
struct record {
struct perf_tool tool;
struct record_opts opts;
@@ -171,6 +178,9 @@ struct record {
int nr_threads;
struct thread_mask *thread_masks;
struct record_thread *thread_data;
+ struct pollfd_index_map *index_map;
+ size_t index_map_sz;
+ size_t index_map_cnt;
};
static volatile int done;
@@ -608,17 +618,18 @@ static int process_synthesized_event(struct perf_tool *tool,
return record__write(rec, NULL, event, event->header.size);
}
+static struct mutex synth_lock;
+
static int process_locked_synthesized_event(struct perf_tool *tool,
union perf_event *event,
struct perf_sample *sample __maybe_unused,
struct machine *machine __maybe_unused)
{
- static pthread_mutex_t synth_lock = PTHREAD_MUTEX_INITIALIZER;
int ret;
- pthread_mutex_lock(&synth_lock);
+ mutex_lock(&synth_lock);
ret = process_synthesized_event(tool, event, sample, machine);
- pthread_mutex_unlock(&synth_lock);
+ mutex_unlock(&synth_lock);
return ret;
}
@@ -638,7 +649,7 @@ static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
static volatile int signr = -1;
static volatile int child_finished;
#ifdef HAVE_EVENTFD_SUPPORT
-static int done_fd = -1;
+static volatile int done_fd = -1;
#endif
static void sig_handler(int sig)
@@ -650,19 +661,24 @@ static void sig_handler(int sig)
done = 1;
#ifdef HAVE_EVENTFD_SUPPORT
-{
- u64 tmp = 1;
- /*
- * It is possible for this signal handler to run after done is checked
- * in the main loop, but before the perf counter fds are polled. If this
- * happens, the poll() will continue to wait even though done is set,
- * and will only break out if either another signal is received, or the
- * counters are ready for read. To ensure the poll() doesn't sleep when
- * done is set, use an eventfd (done_fd) to wake up the poll().
- */
- if (write(done_fd, &tmp, sizeof(tmp)) < 0)
- pr_err("failed to signal wakeup fd, error: %m\n");
-}
+ if (done_fd >= 0) {
+ u64 tmp = 1;
+ int orig_errno = errno;
+
+ /*
+ * It is possible for this signal handler to run after done is
+ * checked in the main loop, but before the perf counter fds are
+ * polled. If this happens, the poll() will continue to wait
+ * even though done is set, and will only break out if either
+ * another signal is received, or the counters are ready for
+ * read. To ensure the poll() doesn't sleep when done is set,
+ * use an eventfd (done_fd) to wake up the poll().
+ */
+ if (write(done_fd, &tmp, sizeof(tmp)) < 0)
+ pr_err("failed to signal wakeup fd, error: %m\n");
+
+ errno = orig_errno;
+ }
#endif // HAVE_EVENTFD_SUPPORT
}
@@ -1074,6 +1090,70 @@ static void record__free_thread_data(struct record *rec)
zfree(&rec->thread_data);
}
+static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
+ int evlist_pollfd_index,
+ int thread_pollfd_index)
+{
+ size_t x = rec->index_map_cnt;
+
+ if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
+ return -ENOMEM;
+ rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
+ rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
+ rec->index_map_cnt += 1;
+ return 0;
+}
+
+static int record__update_evlist_pollfd_from_thread(struct record *rec,
+ struct evlist *evlist,
+ struct record_thread *thread_data)
+{
+ struct pollfd *e_entries = evlist->core.pollfd.entries;
+ struct pollfd *t_entries = thread_data->pollfd.entries;
+ int err = 0;
+ size_t i;
+
+ for (i = 0; i < rec->index_map_cnt; i++) {
+ int e_pos = rec->index_map[i].evlist_pollfd_index;
+ int t_pos = rec->index_map[i].thread_pollfd_index;
+
+ if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
+ e_entries[e_pos].events != t_entries[t_pos].events) {
+ pr_err("Thread and evlist pollfd index mismatch\n");
+ err = -EINVAL;
+ continue;
+ }
+ e_entries[e_pos].revents = t_entries[t_pos].revents;
+ }
+ return err;
+}
+
+static int record__dup_non_perf_events(struct record *rec,
+ struct evlist *evlist,
+ struct record_thread *thread_data)
+{
+ struct fdarray *fda = &evlist->core.pollfd;
+ int i, ret;
+
+ for (i = 0; i < fda->nr; i++) {
+ if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
+ continue;
+ ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
+ if (ret < 0) {
+ pr_err("Failed to duplicate descriptor in main thread pollfd\n");
+ return ret;
+ }
+ pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
+ thread_data, ret, fda->entries[i].fd);
+ ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
+ if (ret < 0) {
+ pr_err("Failed to map thread and evlist pollfd indexes\n");
+ return ret;
+ }
+ }
+ return 0;
+}
+
static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
{
int t, ret;
@@ -1121,18 +1201,12 @@ static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
thread_data[t].pipes.msg[0]);
} else {
thread_data[t].tid = gettid();
- if (evlist->ctl_fd.pos == -1)
- continue;
- ret = fdarray__dup_entry_from(&thread_data[t].pollfd, evlist->ctl_fd.pos,
- &evlist->core.pollfd);
- if (ret < 0) {
- pr_err("Failed to duplicate descriptor in main thread pollfd\n");
+
+ ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
+ if (ret < 0)
goto out_free;
- }
- thread_data[t].ctlfd_pos = ret;
- pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
- thread_data, thread_data[t].ctlfd_pos,
- evlist->core.pollfd.entries[evlist->ctl_fd.pos].fd);
+
+ thread_data[t].ctlfd_pos = -1; /* Not used */
}
}
@@ -1784,6 +1858,74 @@ record__switch_output(struct record *rec, bool at_exit)
return fd;
}
+static void __record__read_lost_samples(struct record *rec, struct evsel *evsel,
+ struct perf_record_lost_samples *lost,
+ int cpu_idx, int thread_idx)
+{
+ struct perf_counts_values count;
+ struct perf_sample_id *sid;
+ struct perf_sample sample = {};
+ int id_hdr_size;
+
+ if (perf_evsel__read(&evsel->core, cpu_idx, thread_idx, &count) < 0) {
+ pr_err("read LOST count failed\n");
+ return;
+ }
+
+ if (count.lost == 0)
+ return;
+
+ lost->lost = count.lost;
+ if (evsel->core.ids) {
+ sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
+ sample.id = sid->id;
+ }
+
+ id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
+ evsel->core.attr.sample_type, &sample);
+ lost->header.size = sizeof(*lost) + id_hdr_size;
+ record__write(rec, NULL, lost, lost->header.size);
+}
+
+static void record__read_lost_samples(struct record *rec)
+{
+ struct perf_session *session = rec->session;
+ struct perf_record_lost_samples *lost;
+ struct evsel *evsel;
+
+ /* there was an error during record__open */
+ if (session->evlist == NULL)
+ return;
+
+ lost = zalloc(PERF_SAMPLE_MAX_SIZE);
+ if (lost == NULL) {
+ pr_debug("Memory allocation failed\n");
+ return;
+ }
+
+ lost->header.type = PERF_RECORD_LOST_SAMPLES;
+
+ evlist__for_each_entry(session->evlist, evsel) {
+ struct xyarray *xy = evsel->core.sample_id;
+
+ if (xy == NULL || evsel->core.fd == NULL)
+ continue;
+ if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
+ xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
+ pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
+ continue;
+ }
+
+ for (int x = 0; x < xyarray__max_x(xy); x++) {
+ for (int y = 0; y < xyarray__max_y(xy); y++) {
+ __record__read_lost_samples(rec, evsel, lost, x, y);
+ }
+ }
+ }
+ free(lost);
+
+}
+
static volatile int workload_exec_errno;
/*
@@ -1921,6 +2063,7 @@ static int record__synthesize(struct record *rec, bool tail)
}
if (rec->opts.nr_threads_synthesize > 1) {
+ mutex_init(&synth_lock);
perf_set_multithreaded();
f = process_locked_synthesized_event;
}
@@ -1934,8 +2077,10 @@ static int record__synthesize(struct record *rec, bool tail)
rec->opts.nr_threads_synthesize);
}
- if (rec->opts.nr_threads_synthesize > 1)
+ if (rec->opts.nr_threads_synthesize > 1) {
perf_set_singlethreaded();
+ mutex_destroy(&synth_lock);
+ }
out:
return err;
@@ -2294,10 +2439,14 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
record__uniquify_name(rec);
+ /* Debug message used by test scripts */
+ pr_debug3("perf record opening and mmapping events\n");
if (record__open(rec) != 0) {
err = -1;
goto out_free_threads;
}
+ /* Debug message used by test scripts */
+ pr_debug3("perf record done opening and mmapping events\n");
session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
if (rec->opts.kcore) {
@@ -2436,6 +2585,14 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
}
}
+ err = event_enable_timer__start(rec->evlist->eet);
+ if (err)
+ goto out_child;
+
+ /* Debug message used by test scripts */
+ pr_debug3("perf record has started\n");
+ fflush(stderr);
+
trigger_ready(&auxtrace_snapshot_trigger);
trigger_ready(&switch_output_trigger);
perf_hooks__invoke_record_start();
@@ -2534,8 +2691,9 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
record__thread_munmap_filtered, NULL) == 0)
draining = true;
- evlist__ctlfd_update(rec->evlist,
- &thread->pollfd.entries[thread->ctlfd_pos]);
+ err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
+ if (err)
+ goto out_child;
}
if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
@@ -2558,6 +2716,14 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
}
}
+ err = event_enable_timer__process(rec->evlist->eet);
+ if (err < 0)
+ goto out_child;
+ if (err) {
+ err = 0;
+ done = 1;
+ }
+
/*
* When perf is starting the traced process, at the end events
* die with the process and we wait for that. Thus no need to
@@ -2630,6 +2796,7 @@ out_free_threads:
if (rec->off_cpu)
rec->bytes_written += off_cpu_write(rec->session);
+ record__read_lost_samples(rec);
record__synthesize(rec, true);
/* this will be recalculated during process_buildids() */
rec->samples = 0;
@@ -2672,8 +2839,12 @@ out_free_threads:
out_delete_session:
#ifdef HAVE_EVENTFD_SUPPORT
- if (done_fd >= 0)
- close(done_fd);
+ if (done_fd >= 0) {
+ fd = done_fd;
+ done_fd = -1;
+
+ close(fd);
+ }
#endif
zstd_fini(&session->zstd_data);
perf_session__delete(session);
@@ -2779,6 +2950,12 @@ static int perf_record_config(const char *var, const char *value, void *cb)
return 0;
}
+static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
+{
+ struct record *rec = (struct record *)opt->value;
+
+ return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
+}
static int record__parse_affinity(const struct option *opt, const char *str, int unset)
{
@@ -3240,8 +3417,10 @@ static struct option __record_options[] = {
OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
"monitor event in cgroup name only",
parse_cgroups),
- OPT_INTEGER('D', "delay", &record.opts.initial_delay,
- "ms to wait before starting measurement after program start (-1: start with events disabled)"),
+ OPT_CALLBACK('D', "delay", &record, "ms",
+ "ms to wait before starting measurement after program start (-1: start with events disabled), "
+ "or ranges of time to enable events e.g. '-D 10-20,30-40'",
+ record__parse_event_enable_time),
OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
"user to profile"),
@@ -3371,6 +3550,8 @@ static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cp
return 0;
perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
+ if (cpu.cpu == -1)
+ continue;
/* Return ENODEV is input cpu is greater than max cpu */
if ((unsigned long)cpu.cpu > mask->nbits)
return -ENODEV;
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 91ed41cc7d88..8361890176c2 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -752,6 +752,22 @@ static int count_sample_event(struct perf_tool *tool __maybe_unused,
return 0;
}
+static int count_lost_samples_event(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct machine *machine __maybe_unused)
+{
+ struct report *rep = container_of(tool, struct report, tool);
+ struct evsel *evsel;
+
+ evsel = evlist__id2evsel(rep->session->evlist, sample->id);
+ if (evsel) {
+ hists__inc_nr_lost_samples(evsel__hists(evsel),
+ event->lost_samples.lost);
+ }
+ return 0;
+}
+
static int process_attr(struct perf_tool *tool __maybe_unused,
union perf_event *event,
struct evlist **pevlist);
@@ -761,6 +777,7 @@ static void stats_setup(struct report *rep)
memset(&rep->tool, 0, sizeof(rep->tool));
rep->tool.attr = process_attr;
rep->tool.sample = count_sample_event;
+ rep->tool.lost_samples = count_lost_samples_event;
rep->tool.no_warn = true;
}
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index a5cf243c337f..f93737eef07b 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -7,6 +7,7 @@
#include "util/evlist.h"
#include "util/evsel.h"
#include "util/evsel_fprintf.h"
+#include "util/mutex.h"
#include "util/symbol.h"
#include "util/thread.h"
#include "util/header.h"
@@ -184,8 +185,8 @@ struct perf_sched {
struct task_desc **pid_to_task;
struct task_desc **tasks;
const struct trace_sched_handler *tp_handler;
- pthread_mutex_t start_work_mutex;
- pthread_mutex_t work_done_wait_mutex;
+ struct mutex start_work_mutex;
+ struct mutex work_done_wait_mutex;
int profile_cpu;
/*
* Track the current task - that way we can know whether there's any
@@ -245,6 +246,7 @@ struct perf_sched {
const char *time_str;
struct perf_time_interval ptime;
struct perf_time_interval hist_time;
+ volatile bool thread_funcs_exit;
};
/* per thread run time data */
@@ -632,35 +634,34 @@ static void *thread_func(void *ctx)
prctl(PR_SET_NAME, comm2);
if (fd < 0)
return NULL;
-again:
- ret = sem_post(&this_task->ready_for_work);
- BUG_ON(ret);
- ret = pthread_mutex_lock(&sched->start_work_mutex);
- BUG_ON(ret);
- ret = pthread_mutex_unlock(&sched->start_work_mutex);
- BUG_ON(ret);
- cpu_usage_0 = get_cpu_usage_nsec_self(fd);
+ while (!sched->thread_funcs_exit) {
+ ret = sem_post(&this_task->ready_for_work);
+ BUG_ON(ret);
+ mutex_lock(&sched->start_work_mutex);
+ mutex_unlock(&sched->start_work_mutex);
- for (i = 0; i < this_task->nr_events; i++) {
- this_task->curr_event = i;
- perf_sched__process_event(sched, this_task->atoms[i]);
- }
+ cpu_usage_0 = get_cpu_usage_nsec_self(fd);
- cpu_usage_1 = get_cpu_usage_nsec_self(fd);
- this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
- ret = sem_post(&this_task->work_done_sem);
- BUG_ON(ret);
+ for (i = 0; i < this_task->nr_events; i++) {
+ this_task->curr_event = i;
+ perf_sched__process_event(sched, this_task->atoms[i]);
+ }
- ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
- BUG_ON(ret);
- ret = pthread_mutex_unlock(&sched->work_done_wait_mutex);
- BUG_ON(ret);
+ cpu_usage_1 = get_cpu_usage_nsec_self(fd);
+ this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
+ ret = sem_post(&this_task->work_done_sem);
+ BUG_ON(ret);
- goto again;
+ mutex_lock(&sched->work_done_wait_mutex);
+ mutex_unlock(&sched->work_done_wait_mutex);
+ }
+ return NULL;
}
static void create_tasks(struct perf_sched *sched)
+ EXCLUSIVE_LOCK_FUNCTION(sched->start_work_mutex)
+ EXCLUSIVE_LOCK_FUNCTION(sched->work_done_wait_mutex)
{
struct task_desc *task;
pthread_attr_t attr;
@@ -672,10 +673,8 @@ static void create_tasks(struct perf_sched *sched)
err = pthread_attr_setstacksize(&attr,
(size_t) max(16 * 1024, (int)PTHREAD_STACK_MIN));
BUG_ON(err);
- err = pthread_mutex_lock(&sched->start_work_mutex);
- BUG_ON(err);
- err = pthread_mutex_lock(&sched->work_done_wait_mutex);
- BUG_ON(err);
+ mutex_lock(&sched->start_work_mutex);
+ mutex_lock(&sched->work_done_wait_mutex);
for (i = 0; i < sched->nr_tasks; i++) {
struct sched_thread_parms *parms = malloc(sizeof(*parms));
BUG_ON(parms == NULL);
@@ -691,7 +690,30 @@ static void create_tasks(struct perf_sched *sched)
}
}
+static void destroy_tasks(struct perf_sched *sched)
+ UNLOCK_FUNCTION(sched->start_work_mutex)
+ UNLOCK_FUNCTION(sched->work_done_wait_mutex)
+{
+ struct task_desc *task;
+ unsigned long i;
+ int err;
+
+ mutex_unlock(&sched->start_work_mutex);
+ mutex_unlock(&sched->work_done_wait_mutex);
+ /* Get rid of threads so they won't be upset by mutex destrunction */
+ for (i = 0; i < sched->nr_tasks; i++) {
+ task = sched->tasks[i];
+ err = pthread_join(task->thread, NULL);
+ BUG_ON(err);
+ sem_destroy(&task->sleep_sem);
+ sem_destroy(&task->ready_for_work);
+ sem_destroy(&task->work_done_sem);
+ }
+}
+
static void wait_for_tasks(struct perf_sched *sched)
+ EXCLUSIVE_LOCKS_REQUIRED(sched->work_done_wait_mutex)
+ EXCLUSIVE_LOCKS_REQUIRED(sched->start_work_mutex)
{
u64 cpu_usage_0, cpu_usage_1;
struct task_desc *task;
@@ -699,7 +721,7 @@ static void wait_for_tasks(struct perf_sched *sched)
sched->start_time = get_nsecs();
sched->cpu_usage = 0;
- pthread_mutex_unlock(&sched->work_done_wait_mutex);
+ mutex_unlock(&sched->work_done_wait_mutex);
for (i = 0; i < sched->nr_tasks; i++) {
task = sched->tasks[i];
@@ -707,12 +729,11 @@ static void wait_for_tasks(struct perf_sched *sched)
BUG_ON(ret);
sem_init(&task->ready_for_work, 0, 0);
}
- ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
- BUG_ON(ret);
+ mutex_lock(&sched->work_done_wait_mutex);
cpu_usage_0 = get_cpu_usage_nsec_parent();
- pthread_mutex_unlock(&sched->start_work_mutex);
+ mutex_unlock(&sched->start_work_mutex);
for (i = 0; i < sched->nr_tasks; i++) {
task = sched->tasks[i];
@@ -734,8 +755,7 @@ static void wait_for_tasks(struct perf_sched *sched)
sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * (sched->replay_repeat - 1) +
sched->parent_cpu_usage)/sched->replay_repeat;
- ret = pthread_mutex_lock(&sched->start_work_mutex);
- BUG_ON(ret);
+ mutex_lock(&sched->start_work_mutex);
for (i = 0; i < sched->nr_tasks; i++) {
task = sched->tasks[i];
@@ -745,6 +765,8 @@ static void wait_for_tasks(struct perf_sched *sched)
}
static void run_one_test(struct perf_sched *sched)
+ EXCLUSIVE_LOCKS_REQUIRED(sched->work_done_wait_mutex)
+ EXCLUSIVE_LOCKS_REQUIRED(sched->start_work_mutex)
{
u64 T0, T1, delta, avg_delta, fluct;
@@ -3316,11 +3338,14 @@ static int perf_sched__replay(struct perf_sched *sched)
print_task_traces(sched);
add_cross_task_wakeups(sched);
+ sched->thread_funcs_exit = false;
create_tasks(sched);
printf("------------------------------------------------------------\n");
for (i = 0; i < sched->replay_repeat; i++)
run_one_test(sched);
+ sched->thread_funcs_exit = true;
+ destroy_tasks(sched);
return 0;
}
@@ -3444,8 +3469,6 @@ int cmd_sched(int argc, const char **argv)
},
.cmp_pid = LIST_HEAD_INIT(sched.cmp_pid),
.sort_list = LIST_HEAD_INIT(sched.sort_list),
- .start_work_mutex = PTHREAD_MUTEX_INITIALIZER,
- .work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER,
.sort_order = default_sort_order,
.replay_repeat = 10,
.profile_cpu = -1,
@@ -3559,8 +3582,10 @@ int cmd_sched(int argc, const char **argv)
.fork_event = replay_fork_event,
};
unsigned int i;
- int ret;
+ int ret = 0;
+ mutex_init(&sched.start_work_mutex);
+ mutex_init(&sched.work_done_wait_mutex);
for (i = 0; i < ARRAY_SIZE(sched.curr_pid); i++)
sched.curr_pid[i] = -1;
@@ -3572,11 +3597,10 @@ int cmd_sched(int argc, const char **argv)
/*
* Aliased to 'perf script' for now:
*/
- if (!strcmp(argv[0], "script"))
- return cmd_script(argc, argv);
-
- if (strlen(argv[0]) > 2 && strstarts("record", argv[0])) {
- return __cmd_record(argc, argv);
+ if (!strcmp(argv[0], "script")) {
+ ret = cmd_script(argc, argv);
+ } else if (strlen(argv[0]) > 2 && strstarts("record", argv[0])) {
+ ret = __cmd_record(argc, argv);
} else if (strlen(argv[0]) > 2 && strstarts("latency", argv[0])) {
sched.tp_handler = &lat_ops;
if (argc > 1) {
@@ -3585,7 +3609,7 @@ int cmd_sched(int argc, const char **argv)
usage_with_options(latency_usage, latency_options);
}
setup_sorting(&sched, latency_options, latency_usage);
- return perf_sched__lat(&sched);
+ ret = perf_sched__lat(&sched);
} else if (!strcmp(argv[0], "map")) {
if (argc) {
argc = parse_options(argc, argv, map_options, map_usage, 0);
@@ -3594,7 +3618,7 @@ int cmd_sched(int argc, const char **argv)
}
sched.tp_handler = &map_ops;
setup_sorting(&sched, latency_options, latency_usage);
- return perf_sched__map(&sched);
+ ret = perf_sched__map(&sched);
} else if (strlen(argv[0]) > 2 && strstarts("replay", argv[0])) {
sched.tp_handler = &replay_ops;
if (argc) {
@@ -3602,7 +3626,7 @@ int cmd_sched(int argc, const char **argv)
if (argc)
usage_with_options(replay_usage, replay_options);
}
- return perf_sched__replay(&sched);
+ ret = perf_sched__replay(&sched);
} else if (!strcmp(argv[0], "timehist")) {
if (argc) {
argc = parse_options(argc, argv, timehist_options,
@@ -3618,16 +3642,21 @@ int cmd_sched(int argc, const char **argv)
parse_options_usage(NULL, timehist_options, "w", true);
if (sched.show_next)
parse_options_usage(NULL, timehist_options, "n", true);
- return -EINVAL;
+ ret = -EINVAL;
+ goto out;
}
ret = symbol__validate_sym_arguments();
if (ret)
- return ret;
+ goto out;
- return perf_sched__timehist(&sched);
+ ret = perf_sched__timehist(&sched);
} else {
usage_with_options(sched_usage, sched_options);
}
- return 0;
+out:
+ mutex_destroy(&sched.start_work_mutex);
+ mutex_destroy(&sched.work_done_wait_mutex);
+
+ return ret;
}
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 029b4330e59b..7ca238277d83 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -882,7 +882,7 @@ static int print_bstack_flags(FILE *fp, struct branch_entry *br)
br->flags.in_tx ? 'X' : '-',
br->flags.abort ? 'A' : '-',
br->flags.cycles,
- br->flags.type ? branch_type_name(br->flags.type) : "-");
+ get_branch_type(br));
}
static int perf_sample__fprintf_brstack(struct perf_sample *sample,
@@ -2243,9 +2243,6 @@ static void __process_stat(struct evsel *counter, u64 tstamp)
struct perf_cpu cpu;
static int header_printed;
- if (counter->core.system_wide)
- nthreads = 1;
-
if (!header_printed) {
printf("%3s %8s %15s %15s %15s %15s %s\n",
"CPU", "THREAD", "VAL", "ENA", "RUN", "TIME", "EVENT");
@@ -3849,9 +3846,10 @@ int cmd_script(int argc, const char **argv)
"Valid types: hw,sw,trace,raw,synth. "
"Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso,"
"addr,symoff,srcline,period,iregs,uregs,brstack,"
- "brstacksym,flags,bpf-output,brstackinsn,brstackinsnlen,brstackoff,"
- "callindent,insn,insnlen,synth,phys_addr,metric,misc,ipc,tod,"
- "data_page_size,code_page_size,ins_lat",
+ "brstacksym,flags,data_src,weight,bpf-output,brstackinsn,"
+ "brstackinsnlen,brstackoff,callindent,insn,insnlen,synth,"
+ "phys_addr,metric,misc,srccode,ipc,tod,data_page_size,"
+ "code_page_size,ins_lat",
parse_output_fields),
OPT_BOOLEAN('a', "all-cpus", &system_wide,
"system-wide collection from all CPUs"),
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 0b4a62e4ff67..265b05157972 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -191,6 +191,7 @@ static bool append_file;
static bool interval_count;
static const char *output_name;
static int output_fd;
+static char *metrics;
struct perf_stat {
bool record;
@@ -291,13 +292,8 @@ static inline void diff_timespec(struct timespec *r, struct timespec *a,
static void perf_stat__reset_stats(void)
{
- int i;
-
evlist__reset_stats(evsel_list);
perf_stat__reset_shadow_stats();
-
- for (i = 0; i < stat_config.stats_num; i++)
- perf_stat__reset_shadow_per_stat(&stat_config.stats[i]);
}
static int process_synthesized_event(struct perf_tool *tool __maybe_unused,
@@ -488,46 +484,6 @@ static void read_counters(struct timespec *rs)
}
}
-static int runtime_stat_new(struct perf_stat_config *config, int nthreads)
-{
- int i;
-
- config->stats = calloc(nthreads, sizeof(struct runtime_stat));
- if (!config->stats)
- return -1;
-
- config->stats_num = nthreads;
-
- for (i = 0; i < nthreads; i++)
- runtime_stat__init(&config->stats[i]);
-
- return 0;
-}
-
-static void runtime_stat_delete(struct perf_stat_config *config)
-{
- int i;
-
- if (!config->stats)
- return;
-
- for (i = 0; i < config->stats_num; i++)
- runtime_stat__exit(&config->stats[i]);
-
- zfree(&config->stats);
-}
-
-static void runtime_stat_reset(struct perf_stat_config *config)
-{
- int i;
-
- if (!config->stats)
- return;
-
- for (i = 0; i < config->stats_num; i++)
- perf_stat__reset_shadow_per_stat(&config->stats[i]);
-}
-
static void process_interval(void)
{
struct timespec ts, rs;
@@ -536,7 +492,6 @@ static void process_interval(void)
diff_timespec(&rs, &ts, &ref_time);
perf_stat__reset_shadow_per_stat(&rt_stat);
- runtime_stat_reset(&stat_config);
read_counters(&rs);
if (STAT_RECORD) {
@@ -661,9 +616,7 @@ static void process_evlist(struct evlist *evlist, unsigned int interval)
if (evlist__ctlfd_process(evlist, &cmd) > 0) {
switch (cmd) {
case EVLIST_CTL_CMD_ENABLE:
- if (interval)
- process_interval();
- break;
+ __fallthrough;
case EVLIST_CTL_CMD_DISABLE:
if (interval)
process_interval();
@@ -901,8 +854,6 @@ try_again:
evlist__for_each_cpu(evlist_cpu_itr, evsel_list, affinity) {
counter = evlist_cpu_itr.evsel;
- if (!counter->reset_group && !counter->errored)
- continue;
if (!counter->reset_group)
continue;
try_again_reset:
@@ -1017,7 +968,6 @@ try_again_reset:
evlist__copy_prev_raw_counts(evsel_list);
evlist__reset_prev_raw_counts(evsel_list);
- runtime_stat_reset(&stat_config);
perf_stat__reset_shadow_per_stat(&rt_stat);
} else {
update_stats(&walltime_nsecs_stats, t1 - t0);
@@ -1148,14 +1098,23 @@ static int enable_metric_only(const struct option *opt __maybe_unused,
return 0;
}
-static int parse_metric_groups(const struct option *opt,
+static int append_metric_groups(const struct option *opt __maybe_unused,
const char *str,
int unset __maybe_unused)
{
- return metricgroup__parse_groups(opt, str,
- stat_config.metric_no_group,
- stat_config.metric_no_merge,
- &stat_config.metric_events);
+ if (metrics) {
+ char *tmp;
+
+ if (asprintf(&tmp, "%s,%s", metrics, str) < 0)
+ return -ENOMEM;
+ free(metrics);
+ metrics = tmp;
+ } else {
+ metrics = strdup(str);
+ if (!metrics)
+ return -ENOMEM;
+ }
+ return 0;
}
static int parse_control_option(const struct option *opt,
@@ -1299,7 +1258,7 @@ static struct option stat_options[] = {
"measure SMI cost"),
OPT_CALLBACK('M', "metrics", &evsel_list, "metric/metric group list",
"monitor specified metrics or metric groups (separated by ,)",
- parse_metric_groups),
+ append_metric_groups),
OPT_BOOLEAN_FLAG(0, "all-kernel", &stat_config.all_kernel,
"Configure all used events to run in kernel space.",
PARSE_OPT_EXCLUSIVE),
@@ -1792,11 +1751,11 @@ static int add_default_attributes(void)
* on an architecture test for such a metric name.
*/
if (metricgroup__has_metric("transaction")) {
- struct option opt = { .value = &evsel_list };
-
- return metricgroup__parse_groups(&opt, "transaction",
+ return metricgroup__parse_groups(evsel_list, "transaction",
stat_config.metric_no_group,
- stat_config.metric_no_merge,
+ stat_config.metric_no_merge,
+ stat_config.user_requested_cpu_list,
+ stat_config.system_wide,
&stat_config.metric_events);
}
@@ -2183,6 +2142,8 @@ static int __cmd_report(int argc, const char **argv)
input_name = "perf.data";
}
+ perf_stat__init_shadow_stats();
+
perf_stat.data.path = input_name;
perf_stat.data.mode = PERF_DATA_MODE_READ;
@@ -2262,8 +2223,6 @@ int cmd_stat(int argc, const char **argv)
argc = parse_options_subcommand(argc, argv, stat_options, stat_subcommands,
(const char **) stat_usage,
PARSE_OPT_STOP_AT_NON_OPTION);
- perf_stat__collect_metric_expr(evsel_list);
- perf_stat__init_shadow_stats();
if (stat_config.csv_sep) {
stat_config.csv_output = true;
@@ -2430,6 +2389,34 @@ int cmd_stat(int argc, const char **argv)
target.system_wide = true;
}
+ if ((stat_config.aggr_mode == AGGR_THREAD) && (target.system_wide))
+ target.per_thread = true;
+
+ stat_config.system_wide = target.system_wide;
+ if (target.cpu_list) {
+ stat_config.user_requested_cpu_list = strdup(target.cpu_list);
+ if (!stat_config.user_requested_cpu_list) {
+ status = -ENOMEM;
+ goto out;
+ }
+ }
+
+ /*
+ * Metric parsing needs to be delayed as metrics may optimize events
+ * knowing the target is system-wide.
+ */
+ if (metrics) {
+ metricgroup__parse_groups(evsel_list, metrics,
+ stat_config.metric_no_group,
+ stat_config.metric_no_merge,
+ stat_config.user_requested_cpu_list,
+ stat_config.system_wide,
+ &stat_config.metric_events);
+ zfree(&metrics);
+ }
+ perf_stat__collect_metric_expr(evsel_list);
+ perf_stat__init_shadow_stats();
+
if (add_default_attributes())
goto out;
@@ -2449,9 +2436,6 @@ int cmd_stat(int argc, const char **argv)
}
}
- if ((stat_config.aggr_mode == AGGR_THREAD) && (target.system_wide))
- target.per_thread = true;
-
if (evlist__fix_hybrid_cpus(evsel_list, target.cpu_list)) {
pr_err("failed to use cpu list %s\n", target.cpu_list);
goto out;
@@ -2479,12 +2463,6 @@ int cmd_stat(int argc, const char **argv)
*/
if (stat_config.aggr_mode == AGGR_THREAD) {
thread_map__read_comms(evsel_list->core.threads);
- if (target.system_wide) {
- if (runtime_stat_new(&stat_config,
- perf_thread_map__nr(evsel_list->core.threads))) {
- goto out;
- }
- }
}
if (stat_config.aggr_mode == AGGR_NODE)
@@ -2617,6 +2595,7 @@ out:
iostat_release(evsel_list);
zfree(&stat_config.walltime_run);
+ zfree(&stat_config.user_requested_cpu_list);
if (smi_cost && smi_reset)
sysfs__write_int(FREEZE_ON_SMI_PATH, 0);
@@ -2624,7 +2603,6 @@ out:
evlist__delete(evsel_list);
metricgroup__rblist_exit(&stat_config.metric_events);
- runtime_stat_delete(&stat_config);
evlist__close_control(stat_config.ctl_fd, stat_config.ctl_fd_ack, &stat_config.ctl_fd_close);
return status;
diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c
index e2e9ad929baf..c36296bb7637 100644
--- a/tools/perf/builtin-timechart.c
+++ b/tools/perf/builtin-timechart.c
@@ -215,6 +215,19 @@ static struct per_pid *find_create_pid(struct timechart *tchart, int pid)
return cursor;
}
+static struct per_pidcomm *create_pidcomm(struct per_pid *p)
+{
+ struct per_pidcomm *c;
+
+ c = zalloc(sizeof(*c));
+ if (!c)
+ return NULL;
+ p->current = c;
+ c->next = p->all;
+ p->all = c;
+ return c;
+}
+
static void pid_set_comm(struct timechart *tchart, int pid, char *comm)
{
struct per_pid *p;
@@ -233,12 +246,9 @@ static void pid_set_comm(struct timechart *tchart, int pid, char *comm)
}
c = c->next;
}
- c = zalloc(sizeof(*c));
+ c = create_pidcomm(p);
assert(c != NULL);
c->comm = strdup(comm);
- p->current = c;
- c->next = p->all;
- p->all = c;
}
static void pid_fork(struct timechart *tchart, int pid, int ppid, u64 timestamp)
@@ -277,11 +287,8 @@ static void pid_put_sample(struct timechart *tchart, int pid, int type,
p = find_create_pid(tchart, pid);
c = p->current;
if (!c) {
- c = zalloc(sizeof(*c));
+ c = create_pidcomm(p);
assert(c != NULL);
- p->current = c;
- c->next = p->all;
- p->all = c;
}
sample = zalloc(sizeof(*sample));
@@ -369,16 +376,13 @@ static void c_state_end(struct timechart *tchart, int cpu, u64 timestamp)
tchart->power_events = pwr;
}
-static void p_state_change(struct timechart *tchart, int cpu, u64 timestamp, u64 new_freq)
+static struct power_event *p_state_end(struct timechart *tchart, int cpu,
+ u64 timestamp)
{
- struct power_event *pwr;
-
- if (new_freq > 8000000) /* detect invalid data */
- return;
+ struct power_event *pwr = zalloc(sizeof(*pwr));
- pwr = zalloc(sizeof(*pwr));
if (!pwr)
- return;
+ return NULL;
pwr->state = cpus_pstate_state[cpu];
pwr->start_time = cpus_pstate_start_times[cpu];
@@ -386,11 +390,23 @@ static void p_state_change(struct timechart *tchart, int cpu, u64 timestamp, u64
pwr->cpu = cpu;
pwr->type = PSTATE;
pwr->next = tchart->power_events;
-
if (!pwr->start_time)
pwr->start_time = tchart->first_time;
tchart->power_events = pwr;
+ return pwr;
+}
+
+static void p_state_change(struct timechart *tchart, int cpu, u64 timestamp, u64 new_freq)
+{
+ struct power_event *pwr;
+
+ if (new_freq > 8000000) /* detect invalid data */
+ return;
+
+ pwr = p_state_end(tchart, cpu, timestamp);
+ if (!pwr)
+ return;
cpus_pstate_state[cpu] = new_freq;
cpus_pstate_start_times[cpu] = timestamp;
@@ -698,22 +714,12 @@ static void end_sample_processing(struct timechart *tchart)
#endif
/* P state */
- pwr = zalloc(sizeof(*pwr));
+ pwr = p_state_end(tchart, cpu, tchart->last_time);
if (!pwr)
return;
- pwr->state = cpus_pstate_state[cpu];
- pwr->start_time = cpus_pstate_start_times[cpu];
- pwr->end_time = tchart->last_time;
- pwr->cpu = cpu;
- pwr->type = PSTATE;
- pwr->next = tchart->power_events;
-
- if (!pwr->start_time)
- pwr->start_time = tchart->first_time;
if (!pwr->state)
pwr->state = tchart->min_freq;
- tchart->power_events = pwr;
}
}
@@ -726,12 +732,9 @@ static int pid_begin_io_sample(struct timechart *tchart, int pid, int type,
struct io_sample *prev;
if (!c) {
- c = zalloc(sizeof(*c));
+ c = create_pidcomm(p);
if (!c)
return -ENOMEM;
- p->current = c;
- c->next = p->all;
- p->all = c;
}
prev = c->io_samples;
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index fd8fd913c533..4b3ff7687236 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -136,10 +136,10 @@ static int perf_top__parse_source(struct perf_top *top, struct hist_entry *he)
}
notes = symbol__annotation(sym);
- pthread_mutex_lock(&notes->lock);
+ mutex_lock(&notes->lock);
if (!symbol__hists(sym, top->evlist->core.nr_entries)) {
- pthread_mutex_unlock(&notes->lock);
+ mutex_unlock(&notes->lock);
pr_err("Not enough memory for annotating '%s' symbol!\n",
sym->name);
sleep(1);
@@ -155,7 +155,7 @@ static int perf_top__parse_source(struct perf_top *top, struct hist_entry *he)
pr_err("Couldn't annotate %s: %s\n", sym->name, msg);
}
- pthread_mutex_unlock(&notes->lock);
+ mutex_unlock(&notes->lock);
return err;
}
@@ -196,6 +196,7 @@ static void perf_top__record_precise_ip(struct perf_top *top,
struct hist_entry *he,
struct perf_sample *sample,
struct evsel *evsel, u64 ip)
+ EXCLUSIVE_LOCKS_REQUIRED(he->hists->lock)
{
struct annotation *notes;
struct symbol *sym = he->ms.sym;
@@ -208,19 +209,19 @@ static void perf_top__record_precise_ip(struct perf_top *top,
notes = symbol__annotation(sym);
- if (pthread_mutex_trylock(&notes->lock))
+ if (!mutex_trylock(&notes->lock))
return;
err = hist_entry__inc_addr_samples(he, sample, evsel, ip);
- pthread_mutex_unlock(&notes->lock);
+ mutex_unlock(&notes->lock);
if (unlikely(err)) {
/*
* This function is now called with he->hists->lock held.
* Release it before going to sleep.
*/
- pthread_mutex_unlock(&he->hists->lock);
+ mutex_unlock(&he->hists->lock);
if (err == -ERANGE && !he->ms.map->erange_warned)
ui__warn_map_erange(he->ms.map, sym, ip);
@@ -230,7 +231,7 @@ static void perf_top__record_precise_ip(struct perf_top *top,
sleep(1);
}
- pthread_mutex_lock(&he->hists->lock);
+ mutex_lock(&he->hists->lock);
}
}
@@ -250,7 +251,7 @@ static void perf_top__show_details(struct perf_top *top)
symbol = he->ms.sym;
notes = symbol__annotation(symbol);
- pthread_mutex_lock(&notes->lock);
+ mutex_lock(&notes->lock);
symbol__calc_percent(symbol, evsel);
@@ -271,7 +272,7 @@ static void perf_top__show_details(struct perf_top *top)
if (more != 0)
printf("%d lines not displayed, maybe increase display entries [e]\n", more);
out_unlock:
- pthread_mutex_unlock(&notes->lock);
+ mutex_unlock(&notes->lock);
}
static void perf_top__resort_hists(struct perf_top *t)
@@ -724,13 +725,13 @@ repeat:
static int hist_iter__top_callback(struct hist_entry_iter *iter,
struct addr_location *al, bool single,
void *arg)
+ EXCLUSIVE_LOCKS_REQUIRED(iter->he->hists->lock)
{
struct perf_top *top = arg;
- struct hist_entry *he = iter->he;
struct evsel *evsel = iter->evsel;
if (perf_hpp_list.sym && single)
- perf_top__record_precise_ip(top, he, iter->sample, evsel, al->addr);
+ perf_top__record_precise_ip(top, iter->he, iter->sample, evsel, al->addr);
hist__account_cycles(iter->sample->branch_stack, al, iter->sample,
!(top->record_opts.branch_stack & PERF_SAMPLE_BRANCH_ANY),
@@ -836,12 +837,12 @@ static void perf_event__process_sample(struct perf_tool *tool,
else
iter.ops = &hist_iter_normal;
- pthread_mutex_lock(&hists->lock);
+ mutex_lock(&hists->lock);
if (hist_entry_iter__add(&iter, &al, top->max_stack, top) < 0)
pr_err("Problem incrementing symbol period, skipping event\n");
- pthread_mutex_unlock(&hists->lock);
+ mutex_unlock(&hists->lock);
}
addr_location__put(&al);
@@ -893,10 +894,10 @@ static void perf_top__mmap_read_idx(struct perf_top *top, int idx)
perf_mmap__consume(&md->core);
if (top->qe.rotate) {
- pthread_mutex_lock(&top->qe.mutex);
+ mutex_lock(&top->qe.mutex);
top->qe.rotate = false;
- pthread_cond_signal(&top->qe.cond);
- pthread_mutex_unlock(&top->qe.mutex);
+ cond_signal(&top->qe.cond);
+ mutex_unlock(&top->qe.mutex);
}
}
@@ -1100,10 +1101,10 @@ static void *process_thread(void *arg)
out = rotate_queues(top);
- pthread_mutex_lock(&top->qe.mutex);
+ mutex_lock(&top->qe.mutex);
top->qe.rotate = true;
- pthread_cond_wait(&top->qe.cond, &top->qe.mutex);
- pthread_mutex_unlock(&top->qe.mutex);
+ cond_wait(&top->qe.cond, &top->qe.mutex);
+ mutex_unlock(&top->qe.mutex);
if (ordered_events__flush(out, OE_FLUSH__TOP))
pr_err("failed to process events\n");
@@ -1217,8 +1218,8 @@ static void init_process_thread(struct perf_top *top)
ordered_events__set_copy_on_queue(&top->qe.data[0], true);
ordered_events__set_copy_on_queue(&top->qe.data[1], true);
top->qe.in = &top->qe.data[0];
- pthread_mutex_init(&top->qe.mutex, NULL);
- pthread_cond_init(&top->qe.cond, NULL);
+ mutex_init(&top->qe.mutex);
+ cond_init(&top->qe.cond);
}
static int __cmd_top(struct perf_top *top)
@@ -1349,7 +1350,7 @@ static int __cmd_top(struct perf_top *top)
out_join:
pthread_join(thread, NULL);
out_join_thread:
- pthread_cond_signal(&top->qe.cond);
+ cond_signal(&top->qe.cond);
pthread_join(thread_process, NULL);
return ret;
}
@@ -1706,6 +1707,7 @@ int cmd_top(int argc, const char **argv)
if (evlist__create_maps(top.evlist, target) < 0) {
ui__error("Couldn't create thread/CPU maps: %s\n",
errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
+ status = -errno;
goto out_delete_evlist;
}
@@ -1758,11 +1760,13 @@ int cmd_top(int argc, const char **argv)
if (top.sb_evlist == NULL) {
pr_err("Couldn't create side band evlist.\n.");
+ status = -EINVAL;
goto out_delete_evlist;
}
if (evlist__add_bpf_sb_event(top.sb_evlist, &perf_env)) {
pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n.");
+ status = -EINVAL;
goto out_delete_evlist;
}
}
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 0bd9d01c0df9..d3c757769b96 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -615,11 +615,8 @@ bool strarray__strtoul_flags(struct strarray *sa, char *bf, size_t size, u64 *re
if (isalpha(*tok) || *tok == '_') {
if (!strarray__strtoul(sa, tok, toklen, &val))
return false;
- } else {
- bool is_hexa = tok[0] == 0 && (tok[1] = 'x' || tok[1] == 'X');
-
- val = strtoul(tok, NULL, is_hexa ? 16 : 0);
- }
+ } else
+ val = strtoul(tok, NULL, 0);
*ret |= (1 << (val - 1));
@@ -2173,13 +2170,10 @@ static void thread__update_stats(struct thread *thread, struct thread_trace *ttr
stats = inode->priv;
if (stats == NULL) {
- stats = malloc(sizeof(*stats));
+ stats = zalloc(sizeof(*stats));
if (stats == NULL)
return;
- stats->nr_failures = 0;
- stats->max_errno = 0;
- stats->errnos = NULL;
init_stats(&stats->stats);
inode->priv = stats;
}
@@ -2762,11 +2756,7 @@ static size_t trace__fprintf_tp_fields(struct trace *trace, struct evsel *evsel,
printed += scnprintf(bf + printed, size - printed, "%s", printed ? ", " : "");
- /*
- * XXX Perhaps we should have a show_tp_arg_names,
- * leaving show_arg_names just for syscalls?
- */
- if (1 || trace->show_arg_names)
+ if (trace->show_arg_names)
printed += scnprintf(bf + printed, size - printed, "%s: ", field->name);
printed += syscall_arg_fmt__scnprintf_val(arg, bf + printed, size - printed, &syscall_arg, val);
diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh
index 6ee44b18c6b5..eacca9a874e2 100755
--- a/tools/perf/check-headers.sh
+++ b/tools/perf/check-headers.sh
@@ -143,7 +143,7 @@ for i in $SYNC_CHECK_FILES; do
done
# diff with extra ignore lines
-check arch/x86/lib/memcpy_64.S '-I "^EXPORT_SYMBOL" -I "^#include <asm/export.h>" -I"^SYM_FUNC_START\(_LOCAL\)*(memcpy_\(erms\|orig\))"'
+check arch/x86/lib/memcpy_64.S '-I "^EXPORT_SYMBOL" -I "^#include <asm/export.h>" -I"^SYM_FUNC_START\(_LOCAL\)*(memcpy_\(erms\|orig\))" -I"^#include <linux/cfi_types.h>"'
check arch/x86/lib/memset_64.S '-I "^EXPORT_SYMBOL" -I "^#include <asm/export.h>" -I"^SYM_FUNC_START\(_LOCAL\)*(memset_\(erms\|orig\))"'
check arch/x86/include/asm/amd-ibs.h '-I "^#include [<\"]\(asm/\)*msr-index.h"'
check arch/arm64/include/asm/cputype.h '-I "^#include [<\"]\(asm/\)*sysreg.h"'
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index c21b3973641a..7af135dea1cd 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -99,10 +99,16 @@ struct pager_config {
int val;
};
+static bool same_cmd_with_prefix(const char *var, struct pager_config *c,
+ const char *header)
+{
+ return (strstarts(var, header) && !strcmp(var + strlen(header), c->cmd));
+}
+
static int pager_command_config(const char *var, const char *value, void *data)
{
struct pager_config *c = data;
- if (strstarts(var, "pager.") && !strcmp(var + 6, c->cmd))
+ if (same_cmd_with_prefix(var, c, "pager."))
c->val = perf_config_bool(var, value);
return 0;
}
@@ -121,9 +127,9 @@ static int check_pager_config(const char *cmd)
static int browser_command_config(const char *var, const char *value, void *data)
{
struct pager_config *c = data;
- if (strstarts(var, "tui.") && !strcmp(var + 4, c->cmd))
+ if (same_cmd_with_prefix(var, c, "tui."))
c->val = perf_config_bool(var, value);
- if (strstarts(var, "gtk.") && !strcmp(var + 4, c->cmd))
+ if (same_cmd_with_prefix(var, c, "gtk."))
c->val = perf_config_bool(var, value) ? 2 : 0;
return 0;
}
diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a65/branch.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/branch.json
index 2f2d137f5f55..2f2d137f5f55 100644
--- a/tools/perf/pmu-events/arch/arm64/arm/cortex-a65/branch.json
+++ b/tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/branch.json
diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a65/bus.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/bus.json
index 75d850b781ac..75d850b781ac 100644
--- a/tools/perf/pmu-events/arch/arm64/arm/cortex-a65/bus.json
+++ b/tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/bus.json
diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a65/cache.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/cache.json
index 118c5cb0674b..118c5cb0674b 100644
--- a/tools/perf/pmu-events/arch/arm64/arm/cortex-a65/cache.json
+++ b/tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/cache.json
diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a65/dpu.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/dpu.json
index b8e402a91bdd..b8e402a91bdd 100644
--- a/tools/perf/pmu-events/arch/arm64/arm/cortex-a65/dpu.json
+++ b/tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/dpu.json
diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a65/exception.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/exception.json
index 27c3fe9c831a..27c3fe9c831a 100644
--- a/tools/perf/pmu-events/arch/arm64/arm/cortex-a65/exception.json
+++ b/tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/exception.json
diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a65/ifu.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/ifu.json
index 13178c5dca14..13178c5dca14 100644
--- a/tools/perf/pmu-events/arch/arm64/arm/cortex-a65/ifu.json
+++ b/tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/ifu.json
diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a65/instruction.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/instruction.json
index 2e0d60779dce..2e0d60779dce 100644
--- a/tools/perf/pmu-events/arch/arm64/arm/cortex-a65/instruction.json
+++ b/tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/instruction.json
diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a65/memory.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/memory.json
index 18d527f7fad4..18d527f7fad4 100644
--- a/tools/perf/pmu-events/arch/arm64/arm/cortex-a65/memory.json
+++ b/tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/memory.json
diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a65/pipeline.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/pipeline.json
index eeac798d403a..eeac798d403a 100644
--- a/tools/perf/pmu-events/arch/arm64/arm/cortex-a65/pipeline.json
+++ b/tools/perf/pmu-events/arch/arm64/arm/cortex-a65-e1/pipeline.json
diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/memory.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/memory.json
index 20a929e7728d..5bed2514b245 100644
--- a/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/memory.json
+++ b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/memory.json
@@ -4,6 +4,9 @@
"ArchStdEvent": "MEM_ACCESS"
},
{
+ "ArchStdEvent": "REMOTE_ACCESS"
+ },
+ {
"ArchStdEvent": "MEM_ACCESS_RD"
},
{
diff --git a/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/other.json b/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/other.json
deleted file mode 100644
index 20d8365756c5..000000000000
--- a/tools/perf/pmu-events/arch/arm64/arm/cortex-a76-n1/other.json
+++ /dev/null
@@ -1,5 +0,0 @@
-[
- {
- "ArchStdEvent": "REMOTE_ACCESS"
- }
-]
diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/branch.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/branch.json
deleted file mode 100644
index 2f2d137f5f55..000000000000
--- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/branch.json
+++ /dev/null
@@ -1,17 +0,0 @@
-[
- {
- "ArchStdEvent": "BR_MIS_PRED"
- },
- {
- "ArchStdEvent": "BR_PRED"
- },
- {
- "ArchStdEvent": "BR_IMMED_SPEC"
- },
- {
- "ArchStdEvent": "BR_RETURN_SPEC"
- },
- {
- "ArchStdEvent": "BR_INDIRECT_SPEC"
- }
-]
diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/bus.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/bus.json
deleted file mode 100644
index 75d850b781ac..000000000000
--- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/bus.json
+++ /dev/null
@@ -1,17 +0,0 @@
-[
- {
- "ArchStdEvent": "CPU_CYCLES"
- },
- {
- "ArchStdEvent": "BUS_ACCESS"
- },
- {
- "ArchStdEvent": "BUS_CYCLES"
- },
- {
- "ArchStdEvent": "BUS_ACCESS_RD"
- },
- {
- "ArchStdEvent": "BUS_ACCESS_WR"
- }
-]
diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/cache.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/cache.json
deleted file mode 100644
index 3ad15e3a93a9..000000000000
--- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/cache.json
+++ /dev/null
@@ -1,107 +0,0 @@
-[
- {
- "ArchStdEvent": "L1I_CACHE_REFILL"
- },
- {
- "ArchStdEvent": "L1I_TLB_REFILL"
- },
- {
- "ArchStdEvent": "L1D_CACHE_REFILL"
- },
- {
- "ArchStdEvent": "L1D_CACHE"
- },
- {
- "ArchStdEvent": "L1D_TLB_REFILL"
- },
- {
- "ArchStdEvent": "L1I_CACHE"
- },
- {
- "ArchStdEvent": "L1D_CACHE_WB"
- },
- {
- "ArchStdEvent": "L2D_CACHE"
- },
- {
- "ArchStdEvent": "L2D_CACHE_REFILL"
- },
- {
- "ArchStdEvent": "L2D_CACHE_WB"
- },
- {
- "ArchStdEvent": "L1D_CACHE_ALLOCATE"
- },
- {
- "ArchStdEvent": "L2D_CACHE_ALLOCATE"
- },
- {
- "ArchStdEvent": "L1D_TLB"
- },
- {
- "ArchStdEvent": "L1I_TLB"
- },
- {
- "ArchStdEvent": "L3D_CACHE_ALLOCATE"
- },
- {
- "ArchStdEvent": "L3D_CACHE_REFILL"
- },
- {
- "ArchStdEvent": "L3D_CACHE"
- },
- {
- "ArchStdEvent": "L2D_TLB_REFILL"
- },
- {
- "ArchStdEvent": "L2D_TLB"
- },
- {
- "ArchStdEvent": "DTLB_WALK"
- },
- {
- "ArchStdEvent": "ITLB_WALK"
- },
- {
- "ArchStdEvent": "LL_CACHE_RD"
- },
- {
- "ArchStdEvent": "LL_CACHE_MISS_RD"
- },
- {
- "ArchStdEvent": "L1D_CACHE_RD"
- },
- {
- "ArchStdEvent": "L1D_CACHE_WR"
- },
- {
- "ArchStdEvent": "L1D_CACHE_REFILL_RD"
- },
- {
- "ArchStdEvent": "L1D_CACHE_REFILL_WR"
- },
- {
- "ArchStdEvent": "L1D_CACHE_REFILL_INNER"
- },
- {
- "ArchStdEvent": "L1D_CACHE_REFILL_OUTER"
- },
- {
- "ArchStdEvent": "L2D_CACHE_RD"
- },
- {
- "ArchStdEvent": "L2D_CACHE_WR"
- },
- {
- "ArchStdEvent": "L2D_CACHE_REFILL_RD"
- },
- {
- "ArchStdEvent": "L2D_CACHE_REFILL_WR"
- },
- {
- "ArchStdEvent": "L3D_CACHE_RD"
- },
- {
- "ArchStdEvent": "L3D_CACHE_REFILL_RD"
- }
-]
diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/exception.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/exception.json
deleted file mode 100644
index 27c3fe9c831a..000000000000
--- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/exception.json
+++ /dev/null
@@ -1,14 +0,0 @@
-[
- {
- "ArchStdEvent": "EXC_TAKEN"
- },
- {
- "ArchStdEvent": "MEMORY_ERROR"
- },
- {
- "ArchStdEvent": "EXC_IRQ"
- },
- {
- "ArchStdEvent": "EXC_FIQ"
- }
-]
diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/instruction.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/instruction.json
deleted file mode 100644
index 6c3b8f772e7f..000000000000
--- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/instruction.json
+++ /dev/null
@@ -1,65 +0,0 @@
-[
- {
- "ArchStdEvent": "SW_INCR"
- },
- {
- "ArchStdEvent": "LD_RETIRED"
- },
- {
- "ArchStdEvent": "ST_RETIRED"
- },
- {
- "ArchStdEvent": "INST_RETIRED"
- },
- {
- "ArchStdEvent": "EXC_RETURN"
- },
- {
- "ArchStdEvent": "CID_WRITE_RETIRED"
- },
- {
- "ArchStdEvent": "PC_WRITE_RETIRED"
- },
- {
- "ArchStdEvent": "BR_IMMED_RETIRED"
- },
- {
- "ArchStdEvent": "BR_RETURN_RETIRED"
- },
- {
- "ArchStdEvent": "INST_SPEC"
- },
- {
- "ArchStdEvent": "TTBR_WRITE_RETIRED"
- },
- {
- "ArchStdEvent": "BR_RETIRED"
- },
- {
- "ArchStdEvent": "BR_MIS_PRED_RETIRED"
- },
- {
- "ArchStdEvent": "LD_SPEC"
- },
- {
- "ArchStdEvent": "ST_SPEC"
- },
- {
- "ArchStdEvent": "LDST_SPEC"
- },
- {
- "ArchStdEvent": "DP_SPEC"
- },
- {
- "ArchStdEvent": "ASE_SPEC"
- },
- {
- "ArchStdEvent": "VFP_SPEC"
- },
- {
- "ArchStdEvent": "CRYPTO_SPEC"
- },
- {
- "ArchStdEvent": "ISB_SPEC"
- }
-]
diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/memory.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/memory.json
deleted file mode 100644
index 78ed6dfcedc1..000000000000
--- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/memory.json
+++ /dev/null
@@ -1,23 +0,0 @@
-[
- {
- "ArchStdEvent": "MEM_ACCESS"
- },
- {
- "ArchStdEvent": "REMOTE_ACCESS_RD"
- },
- {
- "ArchStdEvent": "MEM_ACCESS_RD"
- },
- {
- "ArchStdEvent": "MEM_ACCESS_WR"
- },
- {
- "ArchStdEvent": "UNALIGNED_LD_SPEC"
- },
- {
- "ArchStdEvent": "UNALIGNED_ST_SPEC"
- },
- {
- "ArchStdEvent": "UNALIGNED_LDST_SPEC"
- }
-]
diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/pipeline.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/pipeline.json
deleted file mode 100644
index eeac798d403a..000000000000
--- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/pipeline.json
+++ /dev/null
@@ -1,8 +0,0 @@
-[
- {
- "ArchStdEvent": "STALL_FRONTEND"
- },
- {
- "ArchStdEvent": "STALL_BACKEND"
- }
-]
diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/spe.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/spe.json
deleted file mode 100644
index 20f2165c85fe..000000000000
--- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-e1/spe.json
+++ /dev/null
@@ -1,14 +0,0 @@
-[
- {
- "ArchStdEvent": "SAMPLE_POP"
- },
- {
- "ArchStdEvent": "SAMPLE_FEED"
- },
- {
- "ArchStdEvent": "SAMPLE_FILTRATE"
- },
- {
- "ArchStdEvent": "SAMPLE_COLLISION"
- }
-]
diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/memory.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/memory.json
index e522113aeb96..7b2b21ac150f 100644
--- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/memory.json
+++ b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/memory.json
@@ -3,6 +3,9 @@
"ArchStdEvent": "MEM_ACCESS"
},
{
+ "ArchStdEvent": "REMOTE_ACCESS"
+ },
+ {
"ArchStdEvent": "MEM_ACCESS_RD"
},
{
diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/other.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/other.json
deleted file mode 100644
index 20d8365756c5..000000000000
--- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-n2/other.json
+++ /dev/null
@@ -1,5 +0,0 @@
-[
- {
- "ArchStdEvent": "REMOTE_ACCESS"
- }
-]
diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-v1/instruction.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-v1/instruction.json
index 25825e14c535..e29b88fb7f24 100644
--- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-v1/instruction.json
+++ b/tools/perf/pmu-events/arch/arm64/arm/neoverse-v1/instruction.json
@@ -85,5 +85,35 @@
},
{
"ArchStdEvent": "RC_ST_SPEC"
+ },
+ {
+ "ArchStdEvent": "ASE_INST_SPEC"
+ },
+ {
+ "ArchStdEvent": "SVE_INST_SPEC"
+ },
+ {
+ "ArchStdEvent": "SVE_PRED_SPEC"
+ },
+ {
+ "ArchStdEvent": "SVE_PRED_EMPTY_SPEC"
+ },
+ {
+ "ArchStdEvent": "SVE_PRED_FULL_SPEC"
+ },
+ {
+ "ArchStdEvent": "SVE_PRED_PARTIAL_SPEC"
+ },
+ {
+ "ArchStdEvent": "SVE_LDFF_SPEC"
+ },
+ {
+ "ArchStdEvent": "SVE_LDFF_FAULT_SPEC"
+ },
+ {
+ "ArchStdEvent": "FP_SCALE_OPS_SPEC"
+ },
+ {
+ "ArchStdEvent": "FP_FIXED_OPS_SPEC"
}
]
diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-v1/memory.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-v1/memory.json
index e3d08f1f7c92..5aff6e93c1ad 100644
--- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-v1/memory.json
+++ b/tools/perf/pmu-events/arch/arm64/arm/neoverse-v1/memory.json
@@ -3,6 +3,9 @@
"ArchStdEvent": "MEM_ACCESS"
},
{
+ "ArchStdEvent": "REMOTE_ACCESS"
+ },
+ {
"ArchStdEvent": "MEM_ACCESS_RD"
},
{
diff --git a/tools/perf/pmu-events/arch/arm64/arm/neoverse-v1/other.json b/tools/perf/pmu-events/arch/arm64/arm/neoverse-v1/other.json
deleted file mode 100644
index 20d8365756c5..000000000000
--- a/tools/perf/pmu-events/arch/arm64/arm/neoverse-v1/other.json
+++ /dev/null
@@ -1,5 +0,0 @@
-[
- {
- "ArchStdEvent": "REMOTE_ACCESS"
- }
-]
diff --git a/tools/perf/pmu-events/arch/arm64/hisilicon/hip08/metrics.json b/tools/perf/pmu-events/arch/arm64/hisilicon/hip08/metrics.json
index 6970203cb247..6443a061e22a 100644
--- a/tools/perf/pmu-events/arch/arm64/hisilicon/hip08/metrics.json
+++ b/tools/perf/pmu-events/arch/arm64/hisilicon/hip08/metrics.json
@@ -112,21 +112,21 @@
"MetricName": "indirect_branch"
},
{
- "MetricExpr": "(armv8_pmuv3_0@event\\=0x1014@ + armv8_pmuv3_0@event\\=0x1018@) / BR_MIS_PRED",
+ "MetricExpr": "(armv8_pmuv3_0@event\\=0x1013@ + armv8_pmuv3_0@event\\=0x1016@) / BR_MIS_PRED",
"PublicDescription": "Push branch L3 topdown metric",
"BriefDescription": "Push branch L3 topdown metric",
"MetricGroup": "TopDownL3",
"MetricName": "push_branch"
},
{
- "MetricExpr": "armv8_pmuv3_0@event\\=0x100c@ / BR_MIS_PRED",
+ "MetricExpr": "armv8_pmuv3_0@event\\=0x100d@ / BR_MIS_PRED",
"PublicDescription": "Pop branch L3 topdown metric",
"BriefDescription": "Pop branch L3 topdown metric",
"MetricGroup": "TopDownL3",
"MetricName": "pop_branch"
},
{
- "MetricExpr": "(BR_MIS_PRED - armv8_pmuv3_0@event\\=0x1010@ - armv8_pmuv3_0@event\\=0x1014@ - armv8_pmuv3_0@event\\=0x1018@ - armv8_pmuv3_0@event\\=0x100c@) / BR_MIS_PRED",
+ "MetricExpr": "(BR_MIS_PRED - armv8_pmuv3_0@event\\=0x1010@ - armv8_pmuv3_0@event\\=0x1013@ - armv8_pmuv3_0@event\\=0x1016@ - armv8_pmuv3_0@event\\=0x100d@) / BR_MIS_PRED",
"PublicDescription": "Other branch L3 topdown metric",
"BriefDescription": "Other branch L3 topdown metric",
"MetricGroup": "TopDownL3",
diff --git a/tools/perf/pmu-events/arch/arm64/mapfile.csv b/tools/perf/pmu-events/arch/arm64/mapfile.csv
index 406f6edd4e12..ad502d00f460 100644
--- a/tools/perf/pmu-events/arch/arm64/mapfile.csv
+++ b/tools/perf/pmu-events/arch/arm64/mapfile.csv
@@ -17,7 +17,8 @@
0x00000000420f1000,v1,arm/cortex-a53,core
0x00000000410fd040,v1,arm/cortex-a35,core
0x00000000410fd050,v1,arm/cortex-a55,core
-0x00000000410fd060,v1,arm/cortex-a65,core
+0x00000000410fd060,v1,arm/cortex-a65-e1,core
+0x00000000410fd4a0,v1,arm/cortex-a65-e1,core
0x00000000410fd070,v1,arm/cortex-a57-a72,core
0x00000000410fd080,v1,arm/cortex-a57-a72,core
0x00000000410fd090,v1,arm/cortex-a73,core
@@ -34,7 +35,6 @@
0x00000000410fd470,v1,arm/cortex-a710,core
0x00000000410fd480,v1,arm/cortex-x2,core
0x00000000410fd490,v1,arm/neoverse-n2,core
-0x00000000410fd4a0,v1,arm/neoverse-e1,core
0x00000000420f5160,v1,cavium/thunderx2,core
0x00000000430f0af0,v1,cavium/thunderx2,core
0x00000000460f0010,v1,fujitsu/a64fx,core
diff --git a/tools/perf/pmu-events/arch/powerpc/power10/nest_metrics.json b/tools/perf/pmu-events/arch/powerpc/power10/nest_metrics.json
index 8ba3e81c9808..fe050d44374b 100644
--- a/tools/perf/pmu-events/arch/powerpc/power10/nest_metrics.json
+++ b/tools/perf/pmu-events/arch/powerpc/power10/nest_metrics.json
@@ -1,13 +1,13 @@
[
{
"MetricName": "VEC_GROUP_PUMP_RETRY_RATIO_P01",
- "MetricExpr": "(hv_24x7@PM_PB_RTY_VG_PUMP01\\,chip\\=?@ / hv_24x7@PM_PB_VG_PUMP01\\,chip\\=?@) * 100",
+ "MetricExpr": "(hv_24x7@PM_PB_RTY_VG_PUMP01\\,chip\\=?@ / (1 + hv_24x7@PM_PB_VG_PUMP01\\,chip\\=?@)) * 100",
"ScaleUnit": "1%",
"AggregationMode": "PerChip"
},
{
"MetricName": "VEC_GROUP_PUMP_RETRY_RATIO_P23",
- "MetricExpr": "(hv_24x7@PM_PB_RTY_VG_PUMP23\\,chip\\=?@ / hv_24x7@PM_PB_VG_PUMP23\\,chip\\=?@) * 100",
+ "MetricExpr": "(hv_24x7@PM_PB_RTY_VG_PUMP23\\,chip\\=?@ / (1 + hv_24x7@PM_PB_VG_PUMP23\\,chip\\=?@)) * 100",
"ScaleUnit": "1%",
"AggregationMode": "PerChip"
},
@@ -61,13 +61,13 @@
},
{
"MetricName": "REMOTE_NODE_PUMPS_RETRIES_RATIO_P01",
- "MetricExpr": "(hv_24x7@PM_PB_RTY_RNS_PUMP01\\,chip\\=?@ / hv_24x7@PM_PB_RNS_PUMP01\\,chip\\=?@) * 100",
+ "MetricExpr": "(hv_24x7@PM_PB_RTY_RNS_PUMP01\\,chip\\=?@ / (1 + hv_24x7@PM_PB_RNS_PUMP01\\,chip\\=?@)) * 100",
"ScaleUnit": "1%",
"AggregationMode": "PerChip"
},
{
"MetricName": "REMOTE_NODE_PUMPS_RETRIES_RATIO_P23",
- "MetricExpr": "(hv_24x7@PM_PB_RTY_RNS_PUMP23\\,chip\\=?@ / hv_24x7@PM_PB_RNS_PUMP23\\,chip\\=?@) * 100",
+ "MetricExpr": "(hv_24x7@PM_PB_RTY_RNS_PUMP23\\,chip\\=?@ / (1 + hv_24x7@PM_PB_RNS_PUMP23\\,chip\\=?@)) * 100",
"ScaleUnit": "1%",
"AggregationMode": "PerChip"
},
@@ -151,193 +151,193 @@
},
{
"MetricName": "XLINK0_OUT_TOTAL_UTILIZATION",
- "MetricExpr": "((hv_24x7@PM_XLINK0_OUT_ODD_TOTAL_UTIL\\,chip\\=?@ + hv_24x7@PM_XLINK0_OUT_EVEN_TOTAL_UTIL\\,chip\\=?@) / (hv_24x7@PM_XLINK0_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_XLINK0_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
+ "MetricExpr": "((hv_24x7@PM_XLINK0_OUT_ODD_TOTAL_UTIL\\,chip\\=?@ + hv_24x7@PM_XLINK0_OUT_EVEN_TOTAL_UTIL\\,chip\\=?@) / (1 + hv_24x7@PM_XLINK0_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_XLINK0_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
"ScaleUnit": "1%",
"AggregationMode": "PerChip"
},
{
"MetricName": "XLINK1_OUT_TOTAL_UTILIZATION",
- "MetricExpr": "((hv_24x7@PM_XLINK1_OUT_ODD_TOTAL_UTIL\\,chip\\=?@ + hv_24x7@PM_XLINK1_OUT_EVEN_TOTAL_UTIL\\,chip\\=?@) / (hv_24x7@PM_XLINK1_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_XLINK1_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
+ "MetricExpr": "((hv_24x7@PM_XLINK1_OUT_ODD_TOTAL_UTIL\\,chip\\=?@ + hv_24x7@PM_XLINK1_OUT_EVEN_TOTAL_UTIL\\,chip\\=?@) / (1 + hv_24x7@PM_XLINK1_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_XLINK1_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
"ScaleUnit": "1%",
"AggregationMode": "PerChip"
},
{
"MetricName": "XLINK2_OUT_TOTAL_UTILIZATION",
- "MetricExpr": "((hv_24x7@PM_XLINK2_OUT_ODD_TOTAL_UTIL\\,chip\\=?@ + hv_24x7@PM_XLINK2_OUT_EVEN_TOTAL_UTIL\\,chip\\=?@) / (hv_24x7@PM_XLINK2_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_XLINK2_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
+ "MetricExpr": "((hv_24x7@PM_XLINK2_OUT_ODD_TOTAL_UTIL\\,chip\\=?@ + hv_24x7@PM_XLINK2_OUT_EVEN_TOTAL_UTIL\\,chip\\=?@) / (1 + hv_24x7@PM_XLINK2_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_XLINK2_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
"ScaleUnit": "1%",
"AggregationMode": "PerChip"
},
{
"MetricName": "XLINK3_OUT_TOTAL_UTILIZATION",
- "MetricExpr": "((hv_24x7@PM_XLINK3_OUT_ODD_TOTAL_UTIL\\,chip\\=?@ + hv_24x7@PM_XLINK3_OUT_EVEN_TOTAL_UTIL\\,chip\\=?@) / (hv_24x7@PM_XLINK3_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_XLINK3_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
+ "MetricExpr": "((hv_24x7@PM_XLINK3_OUT_ODD_TOTAL_UTIL\\,chip\\=?@ + hv_24x7@PM_XLINK3_OUT_EVEN_TOTAL_UTIL\\,chip\\=?@) / (1 + hv_24x7@PM_XLINK3_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_XLINK3_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
"ScaleUnit": "1%",
"AggregationMode": "PerChip"
},
{
"MetricName": "XLINK4_OUT_TOTAL_UTILIZATION",
- "MetricExpr": "((hv_24x7@PM_XLINK4_OUT_ODD_TOTAL_UTIL\\,chip\\=?@ + hv_24x7@PM_XLINK4_OUT_EVEN_TOTAL_UTIL\\,chip\\=?@) / (hv_24x7@PM_XLINK4_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_XLINK4_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
+ "MetricExpr": "((hv_24x7@PM_XLINK4_OUT_ODD_TOTAL_UTIL\\,chip\\=?@ + hv_24x7@PM_XLINK4_OUT_EVEN_TOTAL_UTIL\\,chip\\=?@) / (1 + hv_24x7@PM_XLINK4_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_XLINK4_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
"ScaleUnit": "1%",
"AggregationMode": "PerChip"
},
{
"MetricName": "XLINK5_OUT_TOTAL_UTILIZATION",
- "MetricExpr": "((hv_24x7@PM_XLINK5_OUT_ODD_TOTAL_UTIL\\,chip\\=?@ + hv_24x7@PM_XLINK5_OUT_EVEN_TOTAL_UTIL\\,chip\\=?@) / (hv_24x7@PM_XLINK5_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_XLINK5_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
+ "MetricExpr": "((hv_24x7@PM_XLINK5_OUT_ODD_TOTAL_UTIL\\,chip\\=?@ + hv_24x7@PM_XLINK5_OUT_EVEN_TOTAL_UTIL\\,chip\\=?@) / (1 + hv_24x7@PM_XLINK5_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_XLINK5_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
"ScaleUnit": "1%",
"AggregationMode": "PerChip"
},
{
"MetricName": "XLINK6_OUT_TOTAL_UTILIZATION",
- "MetricExpr": "((hv_24x7@PM_XLINK6_OUT_ODD_TOTAL_UTIL\\,chip\\=?@ + hv_24x7@PM_XLINK6_OUT_EVEN_TOTAL_UTIL\\,chip\\=?@) / (hv_24x7@PM_XLINK6_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_XLINK6_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
+ "MetricExpr": "((hv_24x7@PM_XLINK6_OUT_ODD_TOTAL_UTIL\\,chip\\=?@ + hv_24x7@PM_XLINK6_OUT_EVEN_TOTAL_UTIL\\,chip\\=?@) / (1 + hv_24x7@PM_XLINK6_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_XLINK6_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
"ScaleUnit": "1%",
"AggregationMode": "PerChip"
},
{
"MetricName": "XLINK7_OUT_TOTAL_UTILIZATION",
- "MetricExpr": "((hv_24x7@PM_XLINK7_OUT_ODD_TOTAL_UTIL\\,chip\\=?@ + hv_24x7@PM_XLINK7_OUT_EVEN_TOTAL_UTIL\\,chip\\=?@) / (hv_24x7@PM_XLINK7_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_XLINK7_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
+ "MetricExpr": "((hv_24x7@PM_XLINK7_OUT_ODD_TOTAL_UTIL\\,chip\\=?@ + hv_24x7@PM_XLINK7_OUT_EVEN_TOTAL_UTIL\\,chip\\=?@) / (1 + hv_24x7@PM_XLINK7_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_XLINK7_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
"ScaleUnit": "1%",
"AggregationMode": "PerChip"
},
{
"MetricName": "XLINK0_OUT_DATA_UTILIZATION",
- "MetricExpr": "((hv_24x7@PM_XLINK0_OUT_ODD_DATA\\,chip\\=?@ + hv_24x7@PM_XLINK0_OUT_EVEN_DATA\\,chip\\=?@) / (hv_24x7@PM_XLINK0_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_XLINK0_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
+ "MetricExpr": "((hv_24x7@PM_XLINK0_OUT_ODD_DATA\\,chip\\=?@ + hv_24x7@PM_XLINK0_OUT_EVEN_DATA\\,chip\\=?@) / (1 + hv_24x7@PM_XLINK0_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_XLINK0_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
"ScaleUnit": "1.063%",
"AggregationMode": "PerChip"
},
{
"MetricName": "XLINK1_OUT_DATA_UTILIZATION",
- "MetricExpr": "((hv_24x7@PM_XLINK1_OUT_ODD_DATA\\,chip\\=?@ + hv_24x7@PM_XLINK1_OUT_EVEN_DATA\\,chip\\=?@) / (hv_24x7@PM_XLINK1_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_XLINK1_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
+ "MetricExpr": "((hv_24x7@PM_XLINK1_OUT_ODD_DATA\\,chip\\=?@ + hv_24x7@PM_XLINK1_OUT_EVEN_DATA\\,chip\\=?@) / (1 + hv_24x7@PM_XLINK1_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_XLINK1_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
"ScaleUnit": "1.063%",
"AggregationMode": "PerChip"
},
{
"MetricName": "XLINK2_OUT_DATA_UTILIZATION",
- "MetricExpr": "((hv_24x7@PM_XLINK2_OUT_ODD_DATA\\,chip\\=?@ + hv_24x7@PM_XLINK2_OUT_EVEN_DATA\\,chip\\=?@) / (hv_24x7@PM_XLINK2_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_XLINK2_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
+ "MetricExpr": "((hv_24x7@PM_XLINK2_OUT_ODD_DATA\\,chip\\=?@ + hv_24x7@PM_XLINK2_OUT_EVEN_DATA\\,chip\\=?@) / (1 + hv_24x7@PM_XLINK2_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_XLINK2_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
"ScaleUnit": "1.063%",
"AggregationMode": "PerChip"
},
{
"MetricName": "XLINK3_OUT_DATA_UTILIZATION",
- "MetricExpr": "((hv_24x7@PM_XLINK3_OUT_ODD_DATA\\,chip\\=?@ + hv_24x7@PM_XLINK3_OUT_EVEN_DATA\\,chip\\=?@) / (hv_24x7@PM_XLINK3_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_XLINK3_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
+ "MetricExpr": "((hv_24x7@PM_XLINK3_OUT_ODD_DATA\\,chip\\=?@ + hv_24x7@PM_XLINK3_OUT_EVEN_DATA\\,chip\\=?@) / (1 + hv_24x7@PM_XLINK3_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_XLINK3_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
"ScaleUnit": "1.063%",
"AggregationMode": "PerChip"
},
{
"MetricName": "XLINK4_OUT_DATA_UTILIZATION",
- "MetricExpr": "((hv_24x7@PM_XLINK4_OUT_ODD_DATA\\,chip\\=?@ + hv_24x7@PM_XLINK4_OUT_EVEN_DATA\\,chip\\=?@) / (hv_24x7@PM_XLINK4_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_XLINK4_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
+ "MetricExpr": "((hv_24x7@PM_XLINK4_OUT_ODD_DATA\\,chip\\=?@ + hv_24x7@PM_XLINK4_OUT_EVEN_DATA\\,chip\\=?@) / (1 + hv_24x7@PM_XLINK4_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_XLINK4_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
"ScaleUnit": "1.063%",
"AggregationMode": "PerChip"
},
{
"MetricName": "XLINK5_OUT_DATA_UTILIZATION",
- "MetricExpr": "((hv_24x7@PM_XLINK5_OUT_ODD_DATA\\,chip\\=?@ + hv_24x7@PM_XLINK5_OUT_EVEN_DATA\\,chip\\=?@) / (hv_24x7@PM_XLINK5_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_XLINK5_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
+ "MetricExpr": "((hv_24x7@PM_XLINK5_OUT_ODD_DATA\\,chip\\=?@ + hv_24x7@PM_XLINK5_OUT_EVEN_DATA\\,chip\\=?@) / (1 + hv_24x7@PM_XLINK5_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_XLINK5_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
"ScaleUnit": "1.063%",
"AggregationMode": "PerChip"
},
{
"MetricName": "XLINK6_OUT_DATA_UTILIZATION",
- "MetricExpr": "((hv_24x7@PM_XLINK6_OUT_ODD_DATA\\,chip\\=?@ + hv_24x7@PM_XLINK6_OUT_EVEN_DATA\\,chip\\=?@) / (hv_24x7@PM_XLINK6_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_XLINK6_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
+ "MetricExpr": "((hv_24x7@PM_XLINK6_OUT_ODD_DATA\\,chip\\=?@ + hv_24x7@PM_XLINK6_OUT_EVEN_DATA\\,chip\\=?@) / (1 + hv_24x7@PM_XLINK6_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_XLINK6_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
"ScaleUnit": "1.063%",
"AggregationMode": "PerChip"
},
{
"MetricName": "XLINK7_OUT_DATA_UTILIZATION",
- "MetricExpr": "((hv_24x7@PM_XLINK7_OUT_ODD_DATA\\,chip\\=?@ + hv_24x7@PM_XLINK7_OUT_EVEN_DATA\\,chip\\=?@) / (hv_24x7@PM_XLINK7_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_XLINK7_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
+ "MetricExpr": "((hv_24x7@PM_XLINK7_OUT_ODD_DATA\\,chip\\=?@ + hv_24x7@PM_XLINK7_OUT_EVEN_DATA\\,chip\\=?@) / (1 + hv_24x7@PM_XLINK7_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_XLINK7_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
"ScaleUnit": "1.063%",
"AggregationMode": "PerChip"
},
{
"MetricName": "ALINK0_OUT_TOTAL_UTILIZATION",
- "MetricExpr": "((hv_24x7@PM_ALINK0_OUT_ODD_TOTAL_UTIL\\,chip\\=?@ + hv_24x7@PM_ALINK0_OUT_EVEN_TOTAL_UTIL\\,chip\\=?@) / (hv_24x7@PM_ALINK0_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_ALINK0_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
+ "MetricExpr": "((hv_24x7@PM_ALINK0_OUT_ODD_TOTAL_UTIL\\,chip\\=?@ + hv_24x7@PM_ALINK0_OUT_EVEN_TOTAL_UTIL\\,chip\\=?@) / (1 + hv_24x7@PM_ALINK0_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_ALINK0_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
"ScaleUnit": "1%",
"AggregationMode": "PerChip"
},
{
"MetricName": "ALINK1_OUT_TOTAL_UTILIZATION",
- "MetricExpr": "((hv_24x7@PM_ALINK1_OUT_ODD_TOTAL_UTIL\\,chip\\=?@ + hv_24x7@PM_ALINK1_OUT_EVEN_TOTAL_UTIL\\,chip\\=?@) / (hv_24x7@PM_ALINK1_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_ALINK1_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
+ "MetricExpr": "((hv_24x7@PM_ALINK1_OUT_ODD_TOTAL_UTIL\\,chip\\=?@ + hv_24x7@PM_ALINK1_OUT_EVEN_TOTAL_UTIL\\,chip\\=?@) / (1 + hv_24x7@PM_ALINK1_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_ALINK1_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
"ScaleUnit": "1%",
"AggregationMode": "PerChip"
},
{
"MetricName": "ALINK2_OUT_TOTAL_UTILIZATION",
- "MetricExpr": "((hv_24x7@PM_ALINK2_OUT_ODD_TOTAL_UTIL\\,chip\\=?@ + hv_24x7@PM_ALINK2_OUT_EVEN_TOTAL_UTIL\\,chip\\=?@) / (hv_24x7@PM_ALINK2_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_ALINK2_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
+ "MetricExpr": "((hv_24x7@PM_ALINK2_OUT_ODD_TOTAL_UTIL\\,chip\\=?@ + hv_24x7@PM_ALINK2_OUT_EVEN_TOTAL_UTIL\\,chip\\=?@) / (1 + hv_24x7@PM_ALINK2_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_ALINK2_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
"ScaleUnit": "1%",
"AggregationMode": "PerChip"
},
{
"MetricName": "ALINK3_OUT_TOTAL_UTILIZATION",
- "MetricExpr": "((hv_24x7@PM_ALINK3_OUT_ODD_TOTAL_UTIL\\,chip\\=?@ + hv_24x7@PM_ALINK3_OUT_EVEN_TOTAL_UTIL\\,chip\\=?@) / (hv_24x7@PM_ALINK3_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_ALINK3_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
+ "MetricExpr": "((hv_24x7@PM_ALINK3_OUT_ODD_TOTAL_UTIL\\,chip\\=?@ + hv_24x7@PM_ALINK3_OUT_EVEN_TOTAL_UTIL\\,chip\\=?@) / (1 + hv_24x7@PM_ALINK3_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_ALINK3_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
"ScaleUnit": "1%",
"AggregationMode": "PerChip"
},
{
"MetricName": "ALINK4_OUT_TOTAL_UTILIZATION",
- "MetricExpr": "((hv_24x7@PM_ALINK4_OUT_ODD_TOTAL_UTIL\\,chip\\=?@ + hv_24x7@PM_ALINK4_OUT_EVEN_TOTAL_UTIL\\,chip\\=?@) / (hv_24x7@PM_ALINK4_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_ALINK4_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
+ "MetricExpr": "((hv_24x7@PM_ALINK4_OUT_ODD_TOTAL_UTIL\\,chip\\=?@ + hv_24x7@PM_ALINK4_OUT_EVEN_TOTAL_UTIL\\,chip\\=?@) / (1 + hv_24x7@PM_ALINK4_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_ALINK4_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
"ScaleUnit": "1%",
"AggregationMode": "PerChip"
},
{
"MetricName": "ALINK5_OUT_TOTAL_UTILIZATION",
- "MetricExpr": "((hv_24x7@PM_ALINK5_OUT_ODD_TOTAL_UTIL\\,chip\\=?@ + hv_24x7@PM_ALINK5_OUT_EVEN_TOTAL_UTIL\\,chip\\=?@) / (hv_24x7@PM_ALINK5_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_ALINK5_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
+ "MetricExpr": "((hv_24x7@PM_ALINK5_OUT_ODD_TOTAL_UTIL\\,chip\\=?@ + hv_24x7@PM_ALINK5_OUT_EVEN_TOTAL_UTIL\\,chip\\=?@) / (1 + hv_24x7@PM_ALINK5_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_ALINK5_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
"ScaleUnit": "1%",
"AggregationMode": "PerChip"
},
{
"MetricName": "ALINK6_OUT_TOTAL_UTILIZATION",
- "MetricExpr": "((hv_24x7@PM_ALINK6_OUT_ODD_TOTAL_UTIL\\,chip\\=?@ + hv_24x7@PM_ALINK6_OUT_EVEN_TOTAL_UTIL\\,chip\\=?@) / (hv_24x7@PM_ALINK6_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_ALINK6_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
+ "MetricExpr": "((hv_24x7@PM_ALINK6_OUT_ODD_TOTAL_UTIL\\,chip\\=?@ + hv_24x7@PM_ALINK6_OUT_EVEN_TOTAL_UTIL\\,chip\\=?@) / (1 + hv_24x7@PM_ALINK6_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_ALINK6_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
"ScaleUnit": "1%",
"AggregationMode": "PerChip"
},
{
"MetricName": "ALINK7_OUT_TOTAL_UTILIZATION",
- "MetricExpr": "((hv_24x7@PM_ALINK7_OUT_ODD_TOTAL_UTIL\\,chip\\=?@ + hv_24x7@PM_ALINK7_OUT_EVEN_TOTAL_UTIL\\,chip\\=?@) / (hv_24x7@PM_ALINK7_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_ALINK7_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
+ "MetricExpr": "((hv_24x7@PM_ALINK7_OUT_ODD_TOTAL_UTIL\\,chip\\=?@ + hv_24x7@PM_ALINK7_OUT_EVEN_TOTAL_UTIL\\,chip\\=?@) / (1 + hv_24x7@PM_ALINK7_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_ALINK7_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
"ScaleUnit": "1%",
"AggregationMode": "PerChip"
},
{
"MetricName": "ALINK0_OUT_DATA_UTILIZATION",
- "MetricExpr": "((hv_24x7@PM_ALINK0_OUT_ODD_DATA\\,chip\\=?@ + hv_24x7@PM_ALINK0_OUT_EVEN_DATA\\,chip\\=?@) / (hv_24x7@PM_ALINK0_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_ALINK0_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
+ "MetricExpr": "((hv_24x7@PM_ALINK0_OUT_ODD_DATA\\,chip\\=?@ + hv_24x7@PM_ALINK0_OUT_EVEN_DATA\\,chip\\=?@) / (1 + hv_24x7@PM_ALINK0_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_ALINK0_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
"ScaleUnit": "1.063%",
"AggregationMode": "PerChip"
},
{
"MetricName": "ALINK1_OUT_DATA_UTILIZATION",
- "MetricExpr": "((hv_24x7@PM_ALINK1_OUT_ODD_DATA\\,chip\\=?@ + hv_24x7@PM_ALINK1_OUT_EVEN_DATA\\,chip\\=?@) / (hv_24x7@PM_ALINK1_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_ALINK1_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
+ "MetricExpr": "((hv_24x7@PM_ALINK1_OUT_ODD_DATA\\,chip\\=?@ + hv_24x7@PM_ALINK1_OUT_EVEN_DATA\\,chip\\=?@) / (1 + hv_24x7@PM_ALINK1_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_ALINK1_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
"ScaleUnit": "1.063%",
"AggregationMode": "PerChip"
},
{
"MetricName": "ALINK2_OUT_DATA_UTILIZATION",
- "MetricExpr": "((hv_24x7@PM_ALINK2_OUT_ODD_DATA\\,chip\\=?@ + hv_24x7@PM_ALINK2_OUT_EVEN_DATA\\,chip\\=?@) / (hv_24x7@PM_ALINK2_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_ALINK2_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
+ "MetricExpr": "((hv_24x7@PM_ALINK2_OUT_ODD_DATA\\,chip\\=?@ + hv_24x7@PM_ALINK2_OUT_EVEN_DATA\\,chip\\=?@) / (1 + hv_24x7@PM_ALINK2_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_ALINK2_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
"ScaleUnit": "1.063%",
"AggregationMode": "PerChip"
},
{
"MetricName": "ALINK3_OUT_DATA_UTILIZATION",
- "MetricExpr": "((hv_24x7@PM_ALINK3_OUT_ODD_DATA\\,chip\\=?@ + hv_24x7@PM_ALINK3_OUT_EVEN_DATA\\,chip\\=?@) / (hv_24x7@PM_ALINK3_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_ALINK3_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
+ "MetricExpr": "((hv_24x7@PM_ALINK3_OUT_ODD_DATA\\,chip\\=?@ + hv_24x7@PM_ALINK3_OUT_EVEN_DATA\\,chip\\=?@) / (1 + hv_24x7@PM_ALINK3_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_ALINK3_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
"ScaleUnit": "1.063%",
"AggregationMode": "PerChip"
},
{
"MetricName": "ALINK4_OUT_DATA_UTILIZATION",
- "MetricExpr": "((hv_24x7@PM_ALINK4_OUT_ODD_DATA\\,chip\\=?@ + hv_24x7@PM_ALINK4_OUT_EVEN_DATA\\,chip\\=?@) / (hv_24x7@PM_ALINK4_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_ALINK4_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
+ "MetricExpr": "((hv_24x7@PM_ALINK4_OUT_ODD_DATA\\,chip\\=?@ + hv_24x7@PM_ALINK4_OUT_EVEN_DATA\\,chip\\=?@) / (1 + hv_24x7@PM_ALINK4_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_ALINK4_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
"ScaleUnit": "1.063%",
"AggregationMode": "PerChip"
},
{
"MetricName": "ALINK5_OUT_DATA_UTILIZATION",
- "MetricExpr": "((hv_24x7@PM_ALINK5_OUT_ODD_DATA\\,chip\\=?@ + hv_24x7@PM_ALINK5_OUT_EVEN_DATA\\,chip\\=?@) / (hv_24x7@PM_ALINK5_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_ALINK5_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
+ "MetricExpr": "((hv_24x7@PM_ALINK5_OUT_ODD_DATA\\,chip\\=?@ + hv_24x7@PM_ALINK5_OUT_EVEN_DATA\\,chip\\=?@) / (1 + hv_24x7@PM_ALINK5_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_ALINK5_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
"ScaleUnit": "1.063%",
"AggregationMode": "PerChip"
},
{
"MetricName": "ALINK6_OUT_DATA_UTILIZATION",
- "MetricExpr": "((hv_24x7@PM_ALINK6_OUT_ODD_DATA\\,chip\\=?@ + hv_24x7@PM_ALINK6_OUT_EVEN_DATA\\,chip\\=?@) / (hv_24x7@PM_ALINK6_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_ALINK6_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
+ "MetricExpr": "((hv_24x7@PM_ALINK6_OUT_ODD_DATA\\,chip\\=?@ + hv_24x7@PM_ALINK6_OUT_EVEN_DATA\\,chip\\=?@) / (1 + hv_24x7@PM_ALINK6_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_ALINK6_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
"ScaleUnit": "1.063%",
"AggregationMode": "PerChip"
},
{
"MetricName": "ALINK7_OUT_DATA_UTILIZATION",
- "MetricExpr": "((hv_24x7@PM_ALINK7_OUT_ODD_DATA\\,chip\\=?@ + hv_24x7@PM_ALINK7_OUT_EVEN_DATA\\,chip\\=?@) / (hv_24x7@PM_ALINK7_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_ALINK7_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
+ "MetricExpr": "((hv_24x7@PM_ALINK7_OUT_ODD_DATA\\,chip\\=?@ + hv_24x7@PM_ALINK7_OUT_EVEN_DATA\\,chip\\=?@) / (1 + hv_24x7@PM_ALINK7_OUT_ODD_AVLBL_CYCLES\\,chip\\=?@ + hv_24x7@PM_ALINK7_OUT_EVEN_AVLBL_CYCLES\\,chip\\=?@)) * 100",
"ScaleUnit": "1.063%",
"AggregationMode": "PerChip"
},
diff --git a/tools/perf/pmu-events/arch/s390/cf_z16/pai.json b/tools/perf/pmu-events/arch/s390/cf_z16/pai_crypto.json
index cf8563d059b9..cf8563d059b9 100644
--- a/tools/perf/pmu-events/arch/s390/cf_z16/pai.json
+++ b/tools/perf/pmu-events/arch/s390/cf_z16/pai_crypto.json
diff --git a/tools/perf/pmu-events/arch/test/test_soc/cpu/metrics.json b/tools/perf/pmu-events/arch/test/test_soc/cpu/metrics.json
index 42d9b5242fd7..70ec8caaaf6f 100644
--- a/tools/perf/pmu-events/arch/test/test_soc/cpu/metrics.json
+++ b/tools/perf/pmu-events/arch/test/test_soc/cpu/metrics.json
@@ -34,15 +34,15 @@
"MetricName": "DCache_L2_All_Miss"
},
{
- "MetricExpr": "dcache_l2_all_hits + dcache_l2_all_miss",
+ "MetricExpr": "DCache_L2_All_Hits + DCache_L2_All_Miss",
"MetricName": "DCache_L2_All"
},
{
- "MetricExpr": "d_ratio(dcache_l2_all_hits, dcache_l2_all)",
+ "MetricExpr": "d_ratio(DCache_L2_All_Hits, DCache_L2_All)",
"MetricName": "DCache_L2_Hits"
},
{
- "MetricExpr": "d_ratio(dcache_l2_all_miss, dcache_l2_all)",
+ "MetricExpr": "d_ratio(DCache_L2_All_Miss, DCache_L2_All)",
"MetricName": "DCache_L2_Misses"
},
{
diff --git a/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json b/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json
index 095dd8c7f161..e06d26ad5138 100644
--- a/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json
@@ -1,22 +1,852 @@
[
{
+ "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend",
+ "MetricExpr": "topdown\\-fe\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / SLOTS",
+ "MetricGroup": "PGO;TopdownL1;tma_L1_group",
+ "MetricName": "tma_frontend_bound",
+ "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues",
+ "MetricExpr": "(topdown\\-fetch\\-lat / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) - INT_MISC.UOP_DROPPING / SLOTS)",
+ "MetricGroup": "Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group",
+ "MetricName": "tma_fetch_latency",
+ "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: FRONTEND_RETIRED.LATENCY_GE_16_PS;FRONTEND_RETIRED.LATENCY_GE_8_PS",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses",
+ "MetricExpr": "ICACHE_DATA.STALLS / CLKS",
+ "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_fetch_latency_group",
+ "MetricName": "tma_icache_misses",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses",
+ "MetricExpr": "ICACHE_TAG.STALLS / CLKS",
+ "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_fetch_latency_group",
+ "MetricName": "tma_itlb_misses",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers",
+ "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / CLKS + tma_unknown_branches",
+ "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group",
+ "MetricName": "tma_branch_resteers",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. Branch Resteers estimates the Frontend delay in fetching operations from corrected path; following all sorts of miss-predicted branches. For example; branchy code with lots of miss-predictions might get categorized under Branch Resteers. Note the value of this node may overlap with its siblings. Sample with: BR_MISP_RETIRED.ALL_BRANCHES",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage",
+ "MetricExpr": "(tma_branch_mispredicts / tma_bad_speculation) * INT_MISC.CLEAR_RESTEER_CYCLES / CLKS",
+ "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_branch_resteers_group",
+ "MetricName": "tma_mispredicts_resteers",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears",
+ "MetricExpr": "(1 - (tma_branch_mispredicts / tma_bad_speculation)) * INT_MISC.CLEAR_RESTEER_CYCLES / CLKS",
+ "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_branch_resteers_group",
+ "MetricName": "tma_clears_resteers",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears",
+ "MetricExpr": "INT_MISC.UNKNOWN_BRANCH_CYCLES / CLKS",
+ "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_branch_resteers_group",
+ "MetricName": "tma_unknown_branches",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (First fetch or hitting BPU capacity limit). Sample with: FRONTEND_RETIRED.UNKNOWN_BRANCH",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines",
+ "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / CLKS",
+ "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_fetch_latency_group",
+ "MetricName": "tma_dsb_switches",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)",
+ "MetricExpr": "DECODE.LCP / CLKS",
+ "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group",
+ "MetricName": "tma_lcp",
+ "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs.",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)",
+ "MetricExpr": "3 * IDQ.MS_SWITCHES / CLKS",
+ "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_fetch_latency_group",
+ "MetricName": "tma_ms_switches",
+ "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: FRONTEND_RETIRED.MS_FLOWS",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues",
+ "MetricExpr": "max(0, tma_frontend_bound - tma_fetch_latency)",
+ "MetricGroup": "FetchBW;Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group",
+ "MetricName": "tma_fetch_bandwidth",
+ "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)",
+ "MetricExpr": "(IDQ.MITE_CYCLES_ANY - IDQ.MITE_CYCLES_OK) / CORE_CLKS / 2",
+ "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_fetch_bandwidth_group",
+ "MetricName": "tma_mite",
+ "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder",
+ "MetricExpr": "(cpu_core@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu_core@INST_DECODED.DECODERS\\,cmask\\=2@) / CORE_CLKS",
+ "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_mite_group",
+ "MetricName": "tma_decoder0_alone",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline",
+ "MetricExpr": "(IDQ.DSB_CYCLES_ANY - IDQ.DSB_CYCLES_OK) / CORE_CLKS / 2",
+ "MetricGroup": "DSB;FetchBW;TopdownL3;tma_fetch_bandwidth_group",
+ "MetricName": "tma_dsb",
+ "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit",
+ "MetricExpr": "(LSD.CYCLES_ACTIVE - LSD.CYCLES_OK) / CORE_CLKS / 2",
+ "MetricGroup": "FetchBW;LSD;TopdownL3;tma_fetch_bandwidth_group",
+ "MetricName": "tma_lsd",
+ "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit. LSD typically does well sustaining Uop supply. However; in some rare cases; optimal uop-delivery could not be reached for small loops whose size (in terms of number of uops) does not suit well the LSD structure.",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations",
+ "MetricExpr": "max(1 - (tma_frontend_bound + tma_backend_bound + tma_retiring), 0)",
+ "MetricGroup": "TopdownL1;tma_L1_group",
+ "MetricName": "tma_bad_speculation",
+ "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction",
+ "MetricExpr": "topdown\\-br\\-mispredict / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0*SLOTS",
+ "MetricGroup": "BadSpec;BrMispredicts;TopdownL2;tma_L2_group;tma_bad_speculation_group",
+ "MetricName": "tma_branch_mispredicts",
+ "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: TOPDOWN.BR_MISPREDICT_SLOTS",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears",
+ "MetricExpr": "max(0, tma_bad_speculation - tma_branch_mispredicts)",
+ "MetricGroup": "BadSpec;MachineClears;TopdownL2;tma_L2_group;tma_bad_speculation_group",
+ "MetricName": "tma_machine_clears",
+ "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend",
+ "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0*SLOTS",
+ "MetricGroup": "TopdownL1;tma_L1_group",
+ "MetricName": "tma_backend_bound",
+ "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. Sample with: TOPDOWN.BACKEND_BOUND_SLOTS",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck",
+ "MetricExpr": "topdown\\-mem\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0*SLOTS",
+ "MetricGroup": "Backend;TopdownL2;tma_L2_group;tma_backend_bound_group",
+ "MetricName": "tma_memory_bound",
+ "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache",
+ "MetricExpr": "max((EXE_ACTIVITY.BOUND_ON_LOADS - MEMORY_ACTIVITY.STALLS_L1D_MISS) / CLKS, 0)",
+ "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group",
+ "MetricName": "tma_l1_bound",
+ "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses",
+ "MetricExpr": "min(7 * cpu_core@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - MEMORY_ACTIVITY.CYCLES_L1D_MISS, 0)) / CLKS",
+ "MetricGroup": "MemoryTLB;TopdownL4;tma_l1_bound_group",
+ "MetricName": "tma_dtlb_load",
+ "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric roughly estimates the fraction of cycles where the (first level) DTLB was missed by load accesses, that later on hit in second-level TLB (STLB)",
+ "MetricExpr": "tma_dtlb_load - tma_load_stlb_miss",
+ "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_load_group",
+ "MetricName": "tma_load_stlb_hit",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric estimates the fraction of cycles where the Second-level TLB (STLB) was missed by load accesses, performing a hardware page walk",
+ "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / CLKS",
+ "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_load_group",
+ "MetricName": "tma_load_stlb_miss",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores",
+ "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / CLKS",
+ "MetricGroup": "TopdownL4;tma_l1_bound_group",
+ "MetricName": "tma_store_fwd_blk",
+ "PublicDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores. To streamline memory operations in the pipeline; a load can avoid waiting for memory if a prior in-flight store is writing the data that the load wants to read (store forwarding process). However; in some cases the load may be blocked for a significant time pending the store forward. For example; when the prior store is writing a smaller region than the load is reading.",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations",
+ "MetricExpr": "(16 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + (MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * (10 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / CLKS",
+ "MetricGroup": "Offcore;TopdownL4;tma_l1_bound_group",
+ "MetricName": "tma_lock_latency",
+ "PublicDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations. Due to the microarchitecture handling of locks; they are classified as L1_Bound regardless of what memory source satisfied them. Sample with: MEM_INST_RETIRED.LOCK_LOADS_PS",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary",
+ "MetricExpr": "Load_Miss_Real_Latency * LD_BLOCKS.NO_SR / CLKS",
+ "MetricGroup": "TopdownL4;tma_l1_bound_group",
+ "MetricName": "tma_split_loads",
+ "PublicDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary. Sample with: MEM_INST_RETIRED.SPLIT_LOADS_PS",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed",
+ "MetricExpr": "L1D_PEND_MISS.FB_FULL / CLKS",
+ "MetricGroup": "MemoryBW;TopdownL4;tma_l1_bound_group",
+ "MetricName": "tma_fb_full",
+ "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory).",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads",
+ "MetricExpr": "(MEMORY_ACTIVITY.STALLS_L1D_MISS - MEMORY_ACTIVITY.STALLS_L2_MISS) / CLKS",
+ "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group",
+ "MetricName": "tma_l2_bound",
+ "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L2_HIT_PS",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core",
+ "MetricExpr": "(MEMORY_ACTIVITY.STALLS_L2_MISS - MEMORY_ACTIVITY.STALLS_L3_MISS) / CLKS",
+ "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group",
+ "MetricName": "tma_l3_bound",
+ "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses",
+ "MetricExpr": "((25 * Average_Frequency) * (MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + (24 * Average_Frequency) * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS",
+ "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_l3_bound_group",
+ "MetricName": "tma_contested_accesses",
+ "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses. Contested accesses occur when data written by one Logical Processor are read by another Logical Processor on a different Physical Core. Examples of contested accesses include synchronizations such as locks; true data sharing such as modified locked variables; and false sharing. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD;MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses",
+ "MetricExpr": "(24 * Average_Frequency) * (MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD + MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (1 - (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD)))) * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS",
+ "MetricGroup": "Offcore;Snoop;TopdownL4;tma_l3_bound_group",
+ "MetricName": "tma_data_sharing",
+ "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses. Data shared by multiple Logical Processors (even just read shared) may cause increased access latency due to cache coherency. Excessive data sharing can drastically harm multithreaded performance. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
+ "MetricExpr": "(9 * Average_Frequency) * MEM_LOAD_RETIRED.L3_HIT * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS",
+ "MetricGroup": "MemoryLat;TopdownL4;tma_l3_bound_group",
+ "MetricName": "tma_l3_hit_latency",
+ "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)",
+ "MetricExpr": "(XQ.FULL_CYCLES + L1D_PEND_MISS.L2_STALLS) / CLKS",
+ "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_l3_bound_group",
+ "MetricName": "tma_sq_full",
+ "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). The Super Queue is used for requests to access the L2 cache or to go out to the Uncore.",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads",
+ "MetricExpr": "(MEMORY_ACTIVITY.STALLS_L3_MISS / CLKS)",
+ "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group",
+ "MetricName": "tma_dram_bound",
+ "PublicDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_MISS_PS",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)",
+ "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu_core@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / CLKS",
+ "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_dram_bound_group",
+ "MetricName": "tma_mem_bandwidth",
+ "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that).",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)",
+ "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / CLKS - tma_mem_bandwidth",
+ "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_dram_bound_group",
+ "MetricName": "tma_mem_latency",
+ "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that).",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write",
+ "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / CLKS",
+ "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group",
+ "MetricName": "tma_store_bound",
+ "PublicDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck. Sample with: MEM_INST_RETIRED.ALL_STORES_PS",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses",
+ "MetricExpr": "((MEM_STORE_RETIRED.L2_HIT * 10 * (1 - (MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES))) + (1 - (MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES)) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / CLKS",
+ "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_store_bound_group",
+ "MetricName": "tma_store_latency",
+ "PublicDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses. Store accesses usually less impact out-of-order core performance; however; holding resources for longer time can lead into undesired implications (e.g. contention on L1D fill-buffer entries - see FB_Full)",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing",
+ "MetricExpr": "(28 * Average_Frequency) * OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM / CLKS",
+ "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_store_bound_group",
+ "MetricName": "tma_false_sharing",
+ "PublicDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing. False Sharing is a multithreading hiccup; where multiple Logical Processors contend on different data-elements mapped into the same cache line. Sample with: OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents rate of split store accesses",
+ "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / CORE_CLKS",
+ "MetricGroup": "TopdownL4;tma_store_bound_group",
+ "MetricName": "tma_split_stores",
+ "PublicDescription": "This metric represents rate of split store accesses. Consider aligning your data to the 64-byte cache line granularity. Sample with: MEM_INST_RETIRED.SPLIT_STORES_PS",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric estimates how often CPU was stalled due to Streaming store memory accesses; Streaming store optimize out a read request required by RFO stores",
+ "MetricExpr": "9 * OCR.STREAMING_WR.ANY_RESPONSE / CLKS",
+ "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_store_bound_group",
+ "MetricName": "tma_streaming_stores",
+ "PublicDescription": "This metric estimates how often CPU was stalled due to Streaming store memory accesses; Streaming store optimize out a read request required by RFO stores. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should Streaming stores be a bottleneck. Sample with: OCR.STREAMING_WR.ANY_RESPONSE",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses",
+ "MetricExpr": "(7 * cpu_core@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / CORE_CLKS",
+ "MetricGroup": "MemoryTLB;TopdownL4;tma_store_bound_group",
+ "MetricName": "tma_dtlb_store",
+ "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric roughly estimates the fraction of cycles where the TLB was missed by store accesses, hitting in the second-level TLB (STLB)",
+ "MetricExpr": "tma_dtlb_store - tma_store_stlb_miss",
+ "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_store_group",
+ "MetricName": "tma_store_stlb_hit",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric estimates the fraction of cycles where the STLB was missed by store accesses, performing a hardware page walk",
+ "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / CORE_CLKS",
+ "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_store_group",
+ "MetricName": "tma_store_stlb_miss",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck",
+ "MetricExpr": "max(0, tma_backend_bound - tma_memory_bound)",
+ "MetricGroup": "Backend;Compute;TopdownL2;tma_L2_group;tma_backend_bound_group",
+ "MetricName": "tma_core_bound",
+ "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active",
+ "MetricExpr": "ARITH.DIVIDER_ACTIVE / CLKS",
+ "MetricGroup": "TopdownL3;tma_core_bound_group",
+ "MetricName": "tma_divider",
+ "PublicDescription": "This metric represents fraction of cycles where the Divider unit was active. Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication. Sample with: ARITH.DIVIDER_ACTIVE",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)",
+ "MetricExpr": "(cpu_core@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * cpu_core@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@)) / CLKS if (ARITH.DIVIDER_ACTIVE < (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS)) else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * cpu_core@EXE_ACTIVITY.2_PORTS_UTIL\\,umask\\=0xc@) / CLKS",
+ "MetricGroup": "PortsUtil;TopdownL3;tma_core_bound_group",
+ "MetricName": "tma_ports_utilization",
+ "PublicDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related). Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). (2) Contention on some hardware execution unit other than Divider. For example; when there are too many multiply operations.",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
+ "MetricExpr": "cpu_core@EXE_ACTIVITY.3_PORTS_UTIL\\,umask\\=0x80@ / CLKS + tma_serializing_operation * (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS) / CLKS",
+ "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group",
+ "MetricName": "tma_ports_utilized_0",
+ "PublicDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise). Long-latency instructions like divides may contribute to this metric.",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations",
+ "MetricExpr": "RESOURCE_STALLS.SCOREBOARD / CLKS",
+ "MetricGroup": "TopdownL5;tma_ports_utilized_0_group",
+ "MetricName": "tma_serializing_operation",
+ "PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. Sample with: RESOURCE_STALLS.SCOREBOARD",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions",
+ "MetricExpr": "CPU_CLK_UNHALTED.PAUSE / CLKS",
+ "MetricGroup": "TopdownL6;tma_serializing_operation_group",
+ "MetricName": "tma_slow_pause",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions. Sample with: CPU_CLK_UNHALTED.PAUSE_INST",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to LFENCE Instructions.",
+ "MetricExpr": "13 * MISC2_RETIRED.LFENCE / CLKS",
+ "MetricGroup": "TopdownL6;tma_serializing_operation_group",
+ "MetricName": "tma_memory_fence",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued",
+ "MetricExpr": "160 * ASSISTS.SSE_AVX_MIX / CLKS",
+ "MetricGroup": "TopdownL5;tma_ports_utilized_0_group",
+ "MetricName": "tma_mixing_vectors",
+ "PublicDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued. Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic.",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
+ "MetricExpr": "EXE_ACTIVITY.1_PORTS_UTIL / CLKS",
+ "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group",
+ "MetricName": "tma_ports_utilized_1",
+ "PublicDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). This can be due to heavy data-dependency among software instructions; or over oversubscribing a particular hardware resource. In some other cases with high 1_Port_Utilized and L1_Bound; this metric can point to L1 data-cache latency bottleneck that may not necessarily manifest with complete execution starvation (due to the short L1 latency e.g. walking a linked list) - looking at the assembly can be helpful. Sample with: EXE_ACTIVITY.1_PORTS_UTIL",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
+ "MetricExpr": "EXE_ACTIVITY.2_PORTS_UTIL / CLKS",
+ "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group",
+ "MetricName": "tma_ports_utilized_2",
+ "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop. Sample with: EXE_ACTIVITY.2_PORTS_UTIL",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
+ "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / CLKS",
+ "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group",
+ "MetricName": "tma_ports_utilized_3m",
+ "PublicDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Sample with: UOPS_EXECUTED.CYCLES_GE_3",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.",
+ "MetricExpr": "(UOPS_DISPATCHED.PORT_0 + UOPS_DISPATCHED.PORT_1 + UOPS_DISPATCHED.PORT_5_11 + UOPS_DISPATCHED.PORT_6) / (5 * CORE_CLKS)",
+ "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group",
+ "MetricName": "tma_alu_op_utilization",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch) Sample with: UOPS_DISPATCHED.PORT_0",
+ "MetricExpr": "UOPS_DISPATCHED.PORT_0 / CORE_CLKS",
+ "MetricGroup": "Compute;TopdownL6;tma_alu_op_utilization_group",
+ "MetricName": "tma_port_0",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU) Sample with: UOPS_DISPATCHED.PORT_1",
+ "MetricExpr": "UOPS_DISPATCHED.PORT_1 / CORE_CLKS",
+ "MetricGroup": "TopdownL6;tma_alu_op_utilization_group",
+ "MetricName": "tma_port_1",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU) Sample with: UOPS_DISPATCHED.PORT_6",
+ "MetricExpr": "UOPS_DISPATCHED.PORT_6 / CORE_CLKS",
+ "MetricGroup": "TopdownL6;tma_alu_op_utilization_group",
+ "MetricName": "tma_port_6",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations Sample with: UOPS_DISPATCHED.PORT_2_3_10",
+ "MetricExpr": "UOPS_DISPATCHED.PORT_2_3_10 / (3 * CORE_CLKS)",
+ "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group",
+ "MetricName": "tma_load_op_utilization",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations Sample with: UOPS_DISPATCHED.PORT_7_8",
+ "MetricExpr": "(UOPS_DISPATCHED.PORT_4_9 + UOPS_DISPATCHED.PORT_7_8) / (4 * CORE_CLKS)",
+ "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group",
+ "MetricName": "tma_store_op_utilization",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired",
+ "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0*SLOTS",
+ "MetricGroup": "TopdownL1;tma_L1_group",
+ "MetricName": "tma_retiring",
+ "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.SLOTS",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation)",
+ "MetricExpr": "max(0, tma_retiring - tma_heavy_operations)",
+ "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group",
+ "MetricName": "tma_light_operations",
+ "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired)",
+ "MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector",
+ "MetricGroup": "HPC;TopdownL3;tma_light_operations_group",
+ "MetricName": "tma_fp_arith",
+ "PublicDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired). Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain and FMA double-counting.",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric serves as an approximation of legacy x87 usage",
+ "MetricExpr": "tma_retiring * UOPS_EXECUTED.X87 / UOPS_EXECUTED.THREAD",
+ "MetricGroup": "Compute;TopdownL4;tma_fp_arith_group",
+ "MetricName": "tma_x87_use",
+ "PublicDescription": "This metric serves as an approximation of legacy x87 usage. It accounts for instructions beyond X87 FP arithmetic operations; hence may be used as a thermometer to avoid X87 high usage and preferably upgrade to modern ISA. See Tip under Tuning Hint.",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired",
+ "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) / (tma_retiring * SLOTS)",
+ "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group",
+ "MetricName": "tma_fp_scalar",
+ "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting.",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths",
+ "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / (tma_retiring * SLOTS)",
+ "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group",
+ "MetricName": "tma_fp_vector",
+ "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting.",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors",
+ "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE) / (tma_retiring * SLOTS)",
+ "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group",
+ "MetricName": "tma_fp_vector_128b",
+ "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors. May overcount due to FMA double counting.",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors",
+ "MetricExpr": "(FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / (tma_retiring * SLOTS)",
+ "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group",
+ "MetricName": "tma_fp_vector_256b",
+ "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors. May overcount due to FMA double counting.",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents overall Integer (Int) select operations fraction the CPU has executed (retired)",
+ "MetricExpr": "tma_int_vector_128b + tma_int_vector_256b + tma_shuffles",
+ "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group",
+ "MetricName": "tma_int_operations",
+ "PublicDescription": "This metric represents overall Integer (Int) select operations fraction the CPU has executed (retired). Vector/Matrix Int operations and shuffles are counted. Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain.",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents 128-bit vector Integer ADD/SUB/SAD or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired.",
+ "MetricExpr": "(INT_VEC_RETIRED.ADD_128 + INT_VEC_RETIRED.VNNI_128) / (tma_retiring * SLOTS)",
+ "MetricGroup": "Compute;IntVector;Pipeline;TopdownL4;tma_int_operations_group",
+ "MetricName": "tma_int_vector_128b",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents 256-bit vector Integer ADD/SUB/SAD or VNNI (Vector Neural Network Instructions) uops fraction the CPU has retired.",
+ "MetricExpr": "(INT_VEC_RETIRED.ADD_256 + INT_VEC_RETIRED.MUL_256 + INT_VEC_RETIRED.VNNI_256) / (tma_retiring * SLOTS)",
+ "MetricGroup": "Compute;IntVector;Pipeline;TopdownL4;tma_int_operations_group",
+ "MetricName": "tma_int_vector_256b",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents Shuffle (cross \"vector lane\" data transfers) uops fraction the CPU has retired.",
+ "MetricExpr": "INT_VEC_RETIRED.SHUFFLES / (tma_retiring * SLOTS)",
+ "MetricGroup": "HPC;Pipeline;TopdownL4;tma_int_operations_group",
+ "MetricName": "tma_shuffles",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots where the CPU was retiring memory operations -- uops for memory load or store accesses.",
+ "MetricExpr": "tma_light_operations * MEM_UOP_RETIRED.ANY / (tma_retiring * SLOTS)",
+ "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group",
+ "MetricName": "tma_memory_operations",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions",
+ "MetricExpr": "tma_light_operations * INST_RETIRED.MACRO_FUSED / (tma_retiring * SLOTS)",
+ "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group",
+ "MetricName": "tma_fused_instructions",
+ "PublicDescription": "This metric represents fraction of slots where the CPU was retiring fused instructions -- where one uop can represent multiple contiguous instructions. The instruction pairs of CMP+JCC or DEC+JCC are commonly used examples.",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused",
+ "MetricExpr": "tma_light_operations * (BR_INST_RETIRED.ALL_BRANCHES - INST_RETIRED.MACRO_FUSED) / (tma_retiring * SLOTS)",
+ "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group",
+ "MetricName": "tma_non_fused_branches",
+ "PublicDescription": "This metric represents fraction of slots where the CPU was retiring branch instructions that were not fused. Non-conditional branches like direct JMP or CALL would count here. Can be used to examine fusible conditional jumps that were not fused.",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions",
+ "MetricExpr": "tma_light_operations * INST_RETIRED.NOP / (tma_retiring * SLOTS)",
+ "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group",
+ "MetricName": "tma_nop_instructions",
+ "PublicDescription": "This metric represents fraction of slots where the CPU was retiring NOP (no op) instructions. Compilers often use NOPs for certain address alignments - e.g. start address of a function or loop body. Sample with: INST_RETIRED.NOP",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes. May undercount due to FMA double counting",
+ "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_int_operations + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches + tma_nop_instructions))",
+ "MetricGroup": "Pipeline;TopdownL3;tma_light_operations_group",
+ "MetricName": "tma_other_light_ops",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences",
+ "MetricExpr": "topdown\\-heavy\\-ops / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0*SLOTS",
+ "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group",
+ "MetricName": "tma_heavy_operations",
+ "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences. This highly-correlates with the uop length of these instructions/sequences. Sample with: UOPS_RETIRED.HEAVY",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots where the CPU was retiring instructions that that are decoder into two or up to ([SNB+] four; [ADL+] five) uops",
+ "MetricExpr": "tma_heavy_operations - tma_microcode_sequencer",
+ "MetricGroup": "TopdownL3;tma_heavy_operations_group",
+ "MetricName": "tma_few_uops_instructions",
+ "PublicDescription": "This metric represents fraction of slots where the CPU was retiring instructions that that are decoder into two or up to ([SNB+] four; [ADL+] five) uops. This highly-correlates with the number of uops in such instructions.",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit",
+ "MetricExpr": "UOPS_RETIRED.MS / SLOTS",
+ "MetricGroup": "MicroSeq;TopdownL3;tma_heavy_operations_group",
+ "MetricName": "tma_microcode_sequencer",
+ "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: UOPS_RETIRED.MS",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists",
+ "MetricExpr": "100 * cpu_core@ASSISTS.ANY\\,umask\\=0x1B@ / SLOTS",
+ "MetricGroup": "TopdownL4;tma_microcode_sequencer_group",
+ "MetricName": "tma_assists",
+ "PublicDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists. Assists are long sequences of uops that are required in certain corner-cases for operations that cannot be handled natively by the execution pipeline. For example; when working with very small floating point values (so-called Denormals); the FP units are not set up to perform these operations natively. Instead; a sequence of instructions to perform the computation on the Denormals is injected into the pipeline. Since these microcode sequences might be dozens of uops long; Assists can be extremely deleterious to performance and they can be avoided in many cases. Sample with: ASSISTS.ANY",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Page Faults",
+ "MetricExpr": "99 * ASSISTS.PAGE_FAULT / SLOTS",
+ "MetricGroup": "TopdownL5;tma_assists_group",
+ "MetricName": "tma_page_faults",
+ "PublicDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Page Faults. A Page Fault may apply on first application access to a memory page. Note operating system handling of page faults accounts for the majority of its cost.",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Floating Point (FP) Assists",
+ "MetricExpr": "30 * ASSISTS.FP / SLOTS",
+ "MetricGroup": "HPC;TopdownL5;tma_assists_group",
+ "MetricName": "tma_fp_assists",
+ "PublicDescription": "This metric roughly estimates fraction of slots the CPU retired uops as a result of handing Floating Point (FP) Assists. FP Assist may apply when working with very small floating point values (so-called denormals).",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of slots the CPU retired uops as a result of handing SSE to AVX* or AVX* to SSE transition Assists. ",
+ "MetricExpr": "63 * ASSISTS.SSE_AVX_MIX / SLOTS",
+ "MetricGroup": "HPC;TopdownL5;tma_assists_group",
+ "MetricName": "tma_avx_assists",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction",
+ "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)",
+ "MetricGroup": "TopdownL4;tma_microcode_sequencer_group",
+ "MetricName": "tma_cisc",
+ "PublicDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction. A CISC instruction has multiple uops that are required to perform the instruction's functionality as in the case of read-modify-write as an example. Since these instructions require multiple uops they may or may not imply sub-optimal use of machine resources. Sample with: FRONTEND_RETIRED.MS_FLOWS",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks",
+ "MetricExpr": "100 * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))",
+ "MetricGroup": "Bad;BadSpec;BrMispredicts",
+ "MetricName": "Mispredictions",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "Total pipeline cost of (external) Memory Bandwidth related bottlenecks",
+ "MetricExpr": "100 * tma_memory_bound * ((tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full))) + (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) ",
+ "MetricGroup": "Mem;MemoryBW;Offcore",
+ "MetricName": "Memory_Bandwidth",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "Total pipeline cost of Memory Latency related bottlenecks (external memory and off-core caches)",
+ "MetricExpr": "100 * tma_memory_bound * ((tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + (tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)))",
+ "MetricGroup": "Mem;MemoryLat;Offcore",
+ "MetricName": "Memory_Latency",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)",
+ "MetricExpr": "100 * tma_memory_bound * ((tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_dtlb_load + tma_fb_full + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores))) ",
+ "MetricGroup": "Mem;MemoryTLB;Offcore",
+ "MetricName": "Memory_Data_TLBs",
+ "Unit": "cpu_core"
+ },
+ {
"BriefDescription": "Total pipeline cost of branch related instructions (used for program control-flow including function calls)",
- "MetricExpr": "100 * (( BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL) ) / TOPDOWN.SLOTS)",
+ "MetricExpr": "100 * ((BR_INST_RETIRED.COND + 3 * BR_INST_RETIRED.NEAR_CALL + (BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL)) / SLOTS)",
"MetricGroup": "Ret",
"MetricName": "Branching_Overhead",
"Unit": "cpu_core"
},
{
+ "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)",
+ "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)",
+ "MetricGroup": "BigFoot;Fed;Frontend;IcMiss;MemoryTLB",
+ "MetricName": "Big_Code",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks",
+ "MetricExpr": "100 * (tma_frontend_bound - tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - Big_Code",
+ "MetricGroup": "Fed;FetchBW;Frontend",
+ "MetricName": "Instruction_Fetch_BW",
+ "Unit": "cpu_core"
+ },
+ {
"BriefDescription": "Instructions Per Cycle (per Logical Processor)",
- "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD",
+ "MetricExpr": "INST_RETIRED.ANY / CLKS",
"MetricGroup": "Ret;Summary",
"MetricName": "IPC",
"Unit": "cpu_core"
},
{
+ "BriefDescription": "Uops Per Instruction",
+ "MetricExpr": "(tma_retiring * SLOTS) / INST_RETIRED.ANY",
+ "MetricGroup": "Pipeline;Ret;Retire",
+ "MetricName": "UPI",
+ "Unit": "cpu_core"
+ },
+ {
+ "BriefDescription": "Instruction per taken branch",
+ "MetricExpr": "(tma_retiring * SLOTS) / BR_INST_RETIRED.NEAR_TAKEN",
+ "MetricGroup": "Branches;Fed;FetchBW",
+ "MetricName": "UpTB",
+ "Unit": "cpu_core"
+ },
+ {
"BriefDescription": "Cycles Per Instruction (per Logical Processor)",
- "MetricExpr": "1 / (INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD)",
- "MetricGroup": "Pipeline;Mem",
+ "MetricExpr": "1 / IPC",
+ "MetricGroup": "Mem;Pipeline",
"MetricName": "CPI",
"Unit": "cpu_core"
},
@@ -30,14 +860,14 @@
{
"BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)",
"MetricExpr": "TOPDOWN.SLOTS",
- "MetricGroup": "TmaL1",
+ "MetricGroup": "tma_L1_group",
"MetricName": "SLOTS",
"Unit": "cpu_core"
},
{
"BriefDescription": "Fraction of Physical Core issue-slots utilized by this Logical Processor",
- "MetricExpr": "TOPDOWN.SLOTS / ( TOPDOWN.SLOTS / 2 ) if #SMT_on else 1",
- "MetricGroup": "SMT;TmaL1",
+ "MetricExpr": "SLOTS / (TOPDOWN.SLOTS / 2) if #SMT_on else 1",
+ "MetricGroup": "SMT;tma_L1_group",
"MetricName": "Slots_Utilization",
"Unit": "cpu_core"
},
@@ -51,21 +881,21 @@
},
{
"BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)",
- "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.DISTRIBUTED",
- "MetricGroup": "Ret;SMT;TmaL1",
+ "MetricExpr": "INST_RETIRED.ANY / CORE_CLKS",
+ "MetricGroup": "Ret;SMT;tma_L1_group",
"MetricName": "CoreIPC",
"Unit": "cpu_core"
},
{
"BriefDescription": "Floating Point Operations Per Cycle",
- "MetricExpr": "( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / CPU_CLK_UNHALTED.DISTRIBUTED",
- "MetricGroup": "Ret;Flops",
+ "MetricExpr": "(1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / CORE_CLKS",
+ "MetricGroup": "Flops;Ret",
"MetricName": "FLOPc",
"Unit": "cpu_core"
},
{
"BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)",
- "MetricExpr": "( FP_ARITH_DISPATCHED.PORT_0 + FP_ARITH_DISPATCHED.PORT_1 + FP_ARITH_DISPATCHED.PORT_5 ) / ( 2 * CPU_CLK_UNHALTED.DISTRIBUTED )",
+ "MetricExpr": "(FP_ARITH_DISPATCHED.PORT_0 + FP_ARITH_DISPATCHED.PORT_1 + FP_ARITH_DISPATCHED.PORT_5) / (2 * CORE_CLKS)",
"MetricGroup": "Cor;Flops;HPC",
"MetricName": "FP_Arith_Utilization",
"PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common).",
@@ -73,12 +903,19 @@
},
{
"BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core",
- "MetricExpr": "UOPS_EXECUTED.THREAD / (( UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 ) if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)",
+ "MetricExpr": "UOPS_EXECUTED.THREAD / ((UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2) if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_1)",
"MetricGroup": "Backend;Cor;Pipeline;PortsUtil",
"MetricName": "ILP",
"Unit": "cpu_core"
},
{
+ "BriefDescription": "Probability of Core Bound bottleneck hidden by SMT-profiling artifacts",
+ "MetricExpr": "(1 - tma_core_bound / tma_ports_utilization if tma_core_bound < tma_ports_utilization else 1) if SMT_2T_Utilization > 0.5 else 0",
+ "MetricGroup": "Cor;SMT",
+ "MetricName": "Core_Bound_Likely",
+ "Unit": "cpu_core"
+ },
+ {
"BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core",
"MetricExpr": "CPU_CLK_UNHALTED.DISTRIBUTED",
"MetricGroup": "SMT",
@@ -129,14 +966,14 @@
},
{
"BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)",
- "MetricExpr": "INST_RETIRED.ANY / ( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )",
+ "MetricExpr": "INST_RETIRED.ANY / (1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)",
"MetricGroup": "Flops;InsType",
"MetricName": "IpFLOP",
"Unit": "cpu_core"
},
{
"BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)",
- "MetricExpr": "INST_RETIRED.ANY / ( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) )",
+ "MetricExpr": "INST_RETIRED.ANY / ((FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE))",
"MetricGroup": "Flops;InsType",
"MetricName": "IpArith",
"PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW.",
@@ -160,7 +997,7 @@
},
{
"BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)",
- "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE )",
+ "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE)",
"MetricGroup": "Flops;FpVector;InsType",
"MetricName": "IpArith_AVX128",
"PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting.",
@@ -168,7 +1005,7 @@
},
{
"BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)",
- "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )",
+ "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)",
"MetricGroup": "Flops;FpVector;InsType",
"MetricName": "IpArith_AVX256",
"PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting.",
@@ -182,13 +1019,20 @@
"Unit": "cpu_core"
},
{
- "BriefDescription": "Total number of retired Instructions, Sample with: INST_RETIRED.PREC_DIST",
+ "BriefDescription": "Total number of retired Instructions Sample with: INST_RETIRED.PREC_DIST",
"MetricExpr": "INST_RETIRED.ANY",
- "MetricGroup": "Summary;TmaL1",
+ "MetricGroup": "Summary;tma_L1_group",
"MetricName": "Instructions",
"Unit": "cpu_core"
},
{
+ "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.",
+ "MetricExpr": "(tma_retiring * SLOTS) / cpu_core@UOPS_RETIRED.SLOTS\\,cmask\\=1@",
+ "MetricGroup": "Pipeline;Ret",
+ "MetricName": "Retire",
+ "Unit": "cpu_core"
+ },
+ {
"BriefDescription": "Estimated fraction of retirement-cycles dealing with repeat instructions",
"MetricExpr": "INST_RETIRED.REP_ITERATION / cpu_core@UOPS_RETIRED.SLOTS\\,cmask\\=1@",
"MetricGroup": "Pipeline;Ret",
@@ -238,6 +1082,13 @@
"Unit": "cpu_core"
},
{
+ "BriefDescription": "Total penalty related to DSB (uop cache) misses - subset of the Instruction_Fetch_BW Bottleneck.",
+ "MetricExpr": "100 * (tma_fetch_latency * tma_dsb_switches / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_mite / (tma_dsb + tma_lsd + tma_mite))",
+ "MetricGroup": "DSBmiss;Fed",
+ "MetricName": "DSB_Misses",
+ "Unit": "cpu_core"
+ },
+ {
"BriefDescription": "Number of Instructions per non-speculative DSB miss (lower number means higher occurrence rate)",
"MetricExpr": "INST_RETIRED.ANY / FRONTEND_RETIRED.ANY_DSB_MISS",
"MetricGroup": "DSBmiss;Fed",
@@ -252,6 +1103,13 @@
"Unit": "cpu_core"
},
{
+ "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)",
+ "MetricExpr": " (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * SLOTS / BR_MISP_RETIRED.ALL_BRANCHES",
+ "MetricGroup": "Bad;BrMispredicts",
+ "MetricName": "Branch_Misprediction_Cost",
+ "Unit": "cpu_core"
+ },
+ {
"BriefDescription": "Fraction of branches that are non-taken conditionals",
"MetricExpr": "BR_INST_RETIRED.COND_NTAKEN / BR_INST_RETIRED.ALL_BRANCHES",
"MetricGroup": "Bad;Branches;CodeGen;PGO",
@@ -267,7 +1125,7 @@
},
{
"BriefDescription": "Fraction of branches that are CALL or RET",
- "MetricExpr": "( BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN ) / BR_INST_RETIRED.ALL_BRANCHES",
+ "MetricExpr": "(BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN) / BR_INST_RETIRED.ALL_BRANCHES",
"MetricGroup": "Bad;Branches",
"MetricName": "CallRet",
"Unit": "cpu_core"
@@ -281,7 +1139,7 @@
},
{
"BriefDescription": "Fraction of branches of other types (not individually covered by other metrics in Info.Branches group)",
- "MetricExpr": "1 - ( (BR_INST_RETIRED.COND_NTAKEN / BR_INST_RETIRED.ALL_BRANCHES) + (BR_INST_RETIRED.COND_TAKEN / BR_INST_RETIRED.ALL_BRANCHES) + (( BR_INST_RETIRED.NEAR_CALL + BR_INST_RETIRED.NEAR_RETURN ) / BR_INST_RETIRED.ALL_BRANCHES) + ((BR_INST_RETIRED.NEAR_TAKEN - BR_INST_RETIRED.COND_TAKEN - 2 * BR_INST_RETIRED.NEAR_CALL) / BR_INST_RETIRED.ALL_BRANCHES) )",
+ "MetricExpr": "1 - (Cond_NT + Cond_TK + CallRet + Jump)",
"MetricGroup": "Bad;Branches",
"MetricName": "Other_Branches",
"Unit": "cpu_core"
@@ -296,77 +1154,77 @@
{
"BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)",
"MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
- "MetricGroup": "Mem;MemoryBound;MemoryBW",
+ "MetricGroup": "Mem;MemoryBW;MemoryBound",
"MetricName": "MLP",
"Unit": "cpu_core"
},
{
"BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
"MetricExpr": "1000 * MEM_LOAD_RETIRED.L1_MISS / INST_RETIRED.ANY",
- "MetricGroup": "Mem;CacheMisses",
+ "MetricGroup": "CacheMisses;Mem",
"MetricName": "L1MPKI",
"Unit": "cpu_core"
},
{
"BriefDescription": "L1 cache true misses per kilo instruction for all demand loads (including speculative)",
"MetricExpr": "1000 * L2_RQSTS.ALL_DEMAND_DATA_RD / INST_RETIRED.ANY",
- "MetricGroup": "Mem;CacheMisses",
+ "MetricGroup": "CacheMisses;Mem",
"MetricName": "L1MPKI_Load",
"Unit": "cpu_core"
},
{
"BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads",
"MetricExpr": "1000 * MEM_LOAD_RETIRED.L2_MISS / INST_RETIRED.ANY",
- "MetricGroup": "Mem;Backend;CacheMisses",
+ "MetricGroup": "Backend;CacheMisses;Mem",
"MetricName": "L2MPKI",
"Unit": "cpu_core"
},
{
"BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)",
"MetricExpr": "1000 * L2_RQSTS.MISS / INST_RETIRED.ANY",
- "MetricGroup": "Mem;CacheMisses;Offcore",
+ "MetricGroup": "CacheMisses;Mem;Offcore",
"MetricName": "L2MPKI_All",
"Unit": "cpu_core"
},
{
"BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)",
"MetricExpr": "1000 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY",
- "MetricGroup": "Mem;CacheMisses",
+ "MetricGroup": "CacheMisses;Mem",
"MetricName": "L2MPKI_Load",
"Unit": "cpu_core"
},
{
"BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)",
- "MetricExpr": "1000 * ( L2_RQSTS.REFERENCES - L2_RQSTS.MISS ) / INST_RETIRED.ANY",
- "MetricGroup": "Mem;CacheMisses",
+ "MetricExpr": "1000 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY",
+ "MetricGroup": "CacheMisses;Mem",
"MetricName": "L2HPKI_All",
"Unit": "cpu_core"
},
{
"BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)",
"MetricExpr": "1000 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY",
- "MetricGroup": "Mem;CacheMisses",
+ "MetricGroup": "CacheMisses;Mem",
"MetricName": "L2HPKI_Load",
"Unit": "cpu_core"
},
{
"BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
"MetricExpr": "1000 * MEM_LOAD_RETIRED.L3_MISS / INST_RETIRED.ANY",
- "MetricGroup": "Mem;CacheMisses",
+ "MetricGroup": "CacheMisses;Mem",
"MetricName": "L3MPKI",
"Unit": "cpu_core"
},
{
"BriefDescription": "Fill Buffer (FB) hits per kilo instructions for retired demand loads (L1D misses that merge into ongoing miss-handling entries)",
"MetricExpr": "1000 * MEM_LOAD_RETIRED.FB_HIT / INST_RETIRED.ANY",
- "MetricGroup": "Mem;CacheMisses",
+ "MetricGroup": "CacheMisses;Mem",
"MetricName": "FB_HPKI",
"Unit": "cpu_core"
},
{
"BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
"MetricConstraint": "NO_NMI_WATCHDOG",
- "MetricExpr": "( ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING ) / ( 4 * CPU_CLK_UNHALTED.DISTRIBUTED )",
+ "MetricExpr": "(ITLB_MISSES.WALK_PENDING + DTLB_LOAD_MISSES.WALK_PENDING + DTLB_STORE_MISSES.WALK_PENDING) / (4 * CORE_CLKS)",
"MetricGroup": "Mem;MemoryTLB",
"MetricName": "Page_Walks_Utilization",
"Unit": "cpu_core"
@@ -401,28 +1259,28 @@
},
{
"BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]",
- "MetricExpr": "(64 * L1D.REPLACEMENT / 1000000000 / duration_time)",
+ "MetricExpr": "L1D_Cache_Fill_BW",
"MetricGroup": "Mem;MemoryBW",
"MetricName": "L1D_Cache_Fill_BW_1T",
"Unit": "cpu_core"
},
{
"BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]",
- "MetricExpr": "(64 * L2_LINES_IN.ALL / 1000000000 / duration_time)",
+ "MetricExpr": "L2_Cache_Fill_BW",
"MetricGroup": "Mem;MemoryBW",
"MetricName": "L2_Cache_Fill_BW_1T",
"Unit": "cpu_core"
},
{
"BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]",
- "MetricExpr": "(64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time)",
+ "MetricExpr": "L3_Cache_Fill_BW",
"MetricGroup": "Mem;MemoryBW",
"MetricName": "L3_Cache_Fill_BW_1T",
"Unit": "cpu_core"
},
{
"BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]",
- "MetricExpr": "(64 * OFFCORE_REQUESTS.ALL_REQUESTS / 1000000000 / duration_time)",
+ "MetricExpr": "L3_Cache_Access_BW",
"MetricGroup": "Mem;MemoryBW;Offcore",
"MetricName": "L3_Cache_Access_BW_1T",
"Unit": "cpu_core"
@@ -436,14 +1294,14 @@
},
{
"BriefDescription": "Measured Average Frequency for unhalted processors [GHz]",
- "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC) * msr@tsc@ / 1000000000 / duration_time",
- "MetricGroup": "Summary;Power",
+ "MetricExpr": "Turbo_Utilization * msr@tsc@ / 1000000000 / duration_time",
+ "MetricGroup": "Power;Summary",
"MetricName": "Average_Frequency",
"Unit": "cpu_core"
},
{
"BriefDescription": "Giga Floating Point Operations Per Second",
- "MetricExpr": "( ( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / 1000000000 ) / duration_time",
+ "MetricExpr": "((1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1000000000) / duration_time",
"MetricGroup": "Cor;Flops;HPC",
"MetricName": "GFLOPs",
"PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine.",
@@ -451,7 +1309,7 @@
},
{
"BriefDescription": "Average Frequency Utilization relative nominal frequency",
- "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC",
+ "MetricExpr": "CLKS / CPU_CLK_UNHALTED.REF_TSC",
"MetricGroup": "Power",
"MetricName": "Turbo_Utilization",
"Unit": "cpu_core"
@@ -479,7 +1337,7 @@
},
{
"BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
- "MetricExpr": "64 * ( arb@event\\=0x81\\,umask\\=0x1@ + arb@event\\=0x84\\,umask\\=0x1@ ) / 1000000 / duration_time / 1000",
+ "MetricExpr": "64 * (arb@event\\=0x81\\,umask\\=0x1@ + arb@event\\=0x84\\,umask\\=0x1@) / 1000000 / duration_time / 1000",
"MetricGroup": "HPC;Mem;MemoryBW;SoC",
"MetricName": "DRAM_BW_Use",
"Unit": "cpu_core"
@@ -500,41 +1358,408 @@
},
{
"BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to frontend stalls.",
- "MetricExpr": "TOPDOWN_FE_BOUND.ALL / (5 * CPU_CLK_UNHALTED.CORE)",
+ "MetricExpr": "TOPDOWN_FE_BOUND.ALL / SLOTS",
"MetricGroup": "TopdownL1",
- "MetricName": "Frontend_Bound",
+ "MetricName": "tma_frontend_bound",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to frontend bandwidth restrictions due to decode, predecode, cisc, and other limitations.",
+ "MetricExpr": "TOPDOWN_FE_BOUND.FRONTEND_LATENCY / SLOTS",
+ "MetricGroup": "TopdownL2;tma_frontend_bound_group",
+ "MetricName": "tma_frontend_latency",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to instruction cache misses.",
+ "MetricExpr": "TOPDOWN_FE_BOUND.ICACHE / SLOTS",
+ "MetricGroup": "TopdownL3;tma_frontend_latency_group",
+ "MetricName": "tma_icache",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to Instruction Table Lookaside Buffer (ITLB) misses.",
+ "MetricExpr": "TOPDOWN_FE_BOUND.ITLB / SLOTS",
+ "MetricGroup": "TopdownL3;tma_frontend_latency_group",
+ "MetricName": "tma_itlb",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to BACLEARS, which occurs when the Branch Target Buffer (BTB) prediction or lack thereof, was corrected by a later branch predictor in the frontend",
+ "MetricExpr": "TOPDOWN_FE_BOUND.BRANCH_DETECT / SLOTS",
+ "MetricGroup": "TopdownL3;tma_frontend_latency_group",
+ "MetricName": "tma_branch_detect",
+ "PublicDescription": "Counts the number of issue slots that were not delivered by the frontend due to BACLEARS, which occurs when the Branch Target Buffer (BTB) prediction or lack thereof, was corrected by a later branch predictor in the frontend. Includes BACLEARS due to all branch types including conditional and unconditional jumps, returns, and indirect branches.",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to BTCLEARS, which occurs when the Branch Target Buffer (BTB) predicts a taken branch.",
+ "MetricExpr": "TOPDOWN_FE_BOUND.BRANCH_RESTEER / SLOTS",
+ "MetricGroup": "TopdownL3;tma_frontend_latency_group",
+ "MetricName": "tma_branch_resteer",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to frontend bandwidth restrictions due to decode, predecode, cisc, and other limitations.",
+ "MetricExpr": "TOPDOWN_FE_BOUND.FRONTEND_BANDWIDTH / SLOTS",
+ "MetricGroup": "TopdownL2;tma_frontend_bound_group",
+ "MetricName": "tma_frontend_bandwidth",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to the microcode sequencer (MS).",
+ "MetricExpr": "TOPDOWN_FE_BOUND.CISC / SLOTS",
+ "MetricGroup": "TopdownL3;tma_frontend_bandwidth_group",
+ "MetricName": "tma_cisc",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to decode stalls.",
+ "MetricExpr": "TOPDOWN_FE_BOUND.DECODE / SLOTS",
+ "MetricGroup": "TopdownL3;tma_frontend_bandwidth_group",
+ "MetricName": "tma_decode",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to wrong predecodes.",
+ "MetricExpr": "TOPDOWN_FE_BOUND.PREDECODE / SLOTS",
+ "MetricGroup": "TopdownL3;tma_frontend_bandwidth_group",
+ "MetricName": "tma_predecode",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of issue slots that were not delivered by the frontend due to other common frontend stalls not categorized.",
+ "MetricExpr": "TOPDOWN_FE_BOUND.OTHER / SLOTS",
+ "MetricGroup": "TopdownL3;tma_frontend_bandwidth_group",
+ "MetricName": "tma_other_fb",
+ "ScaleUnit": "100%",
"Unit": "cpu_atom"
},
{
"BriefDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear",
- "MetricExpr": "TOPDOWN_BAD_SPECULATION.ALL / (5 * CPU_CLK_UNHALTED.CORE)",
+ "MetricExpr": "(SLOTS - (TOPDOWN_FE_BOUND.ALL + TOPDOWN_BE_BOUND.ALL + TOPDOWN_RETIRING.ALL)) / SLOTS",
"MetricGroup": "TopdownL1",
- "MetricName": "Bad_Speculation",
+ "MetricName": "tma_bad_speculation",
"PublicDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Only issue slots wasted due to fast nukes such as memory ordering nukes are counted. Other nukes are not accounted for. Counts all issue slots blocked during this recovery window including relevant microcode flows and while uops are not yet available in the instruction queue (IQ). Also includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear.",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to branch mispredicts.",
+ "MetricExpr": "TOPDOWN_BAD_SPECULATION.MISPREDICT / SLOTS",
+ "MetricGroup": "TopdownL2;tma_bad_speculation_group",
+ "MetricName": "tma_branch_mispredicts",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a machine clear (nuke) of any kind including memory ordering and memory disambiguation.",
+ "MetricExpr": "TOPDOWN_BAD_SPECULATION.MACHINE_CLEARS / SLOTS",
+ "MetricGroup": "TopdownL2;tma_bad_speculation_group",
+ "MetricName": "tma_machine_clears",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to a machine clear (slow nuke).",
+ "MetricExpr": "TOPDOWN_BAD_SPECULATION.NUKE / SLOTS",
+ "MetricGroup": "TopdownL3;tma_machine_clears_group",
+ "MetricName": "tma_nuke",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of machine clears relative to the number of nuke slots due to SMC. ",
+ "MetricExpr": "tma_nuke * (MACHINE_CLEARS.SMC / MACHINE_CLEARS.SLOW)",
+ "MetricGroup": "TopdownL4;tma_nuke_group",
+ "MetricName": "tma_smc",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of machine clears relative to the number of nuke slots due to memory ordering. ",
+ "MetricExpr": "tma_nuke * (MACHINE_CLEARS.MEMORY_ORDERING / MACHINE_CLEARS.SLOW)",
+ "MetricGroup": "TopdownL4;tma_nuke_group",
+ "MetricName": "tma_memory_ordering",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of machine clears relative to the number of nuke slots due to FP assists. ",
+ "MetricExpr": "tma_nuke * (MACHINE_CLEARS.FP_ASSIST / MACHINE_CLEARS.SLOW)",
+ "MetricGroup": "TopdownL4;tma_nuke_group",
+ "MetricName": "tma_fp_assist",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of machine clears relative to the number of nuke slots due to memory disambiguation. ",
+ "MetricExpr": "tma_nuke * (MACHINE_CLEARS.DISAMBIGUATION / MACHINE_CLEARS.SLOW)",
+ "MetricGroup": "TopdownL4;tma_nuke_group",
+ "MetricName": "tma_disambiguation",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of machine clears relative to the number of nuke slots due to page faults. ",
+ "MetricExpr": "tma_nuke * (MACHINE_CLEARS.PAGE_FAULT / MACHINE_CLEARS.SLOW)",
+ "MetricGroup": "TopdownL4;tma_nuke_group",
+ "MetricName": "tma_page_fault",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to a machine clear classified as a fast nuke due to memory ordering, memory disambiguation and memory renaming.",
+ "MetricExpr": "TOPDOWN_BAD_SPECULATION.FASTNUKE / SLOTS",
+ "MetricGroup": "TopdownL3;tma_machine_clears_group",
+ "MetricName": "tma_fast_nuke",
+ "ScaleUnit": "100%",
"Unit": "cpu_atom"
},
{
"BriefDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls",
- "MetricConstraint": "NO_NMI_WATCHDOG",
- "MetricExpr": "TOPDOWN_BE_BOUND.ALL / (5 * CPU_CLK_UNHALTED.CORE)",
+ "MetricExpr": "TOPDOWN_BE_BOUND.ALL / SLOTS",
"MetricGroup": "TopdownL1",
- "MetricName": "Backend_Bound",
+ "MetricName": "tma_backend_bound",
"PublicDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls. Note that uops must be available for consumption in order for this event to count. If a uop is not available (IQ is empty), this event will not count. The rest of these subevents count backend stalls, in cycles, due to an outstanding request which is memory bound vs core bound. The subevents are not slot based events and therefore can not be precisely added or subtracted from the Backend_Bound_Aux subevents which are slot based.",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of cycles due to backend bound stalls that are core execution bound and not attributed to outstanding demand load or store stalls. ",
+ "MetricExpr": "max(0, tma_backend_bound - tma_load_store_bound)",
+ "MetricGroup": "TopdownL2;tma_backend_bound_group",
+ "MetricName": "tma_core_bound",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of cycles the core is stalled due to stores or loads. ",
+ "MetricExpr": "min((TOPDOWN_BE_BOUND.ALL / SLOTS), (LD_HEAD.ANY_AT_RET / CLKS) + tma_store_bound)",
+ "MetricGroup": "TopdownL2;tma_backend_bound_group",
+ "MetricName": "tma_load_store_bound",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of cycles the core is stalled due to store buffer full.",
+ "MetricExpr": "tma_mem_scheduler * (MEM_SCHEDULER_BLOCK.ST_BUF / MEM_SCHEDULER_BLOCK.ALL)",
+ "MetricGroup": "TopdownL3;tma_load_store_bound_group",
+ "MetricName": "tma_store_bound",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement due to a load block.",
+ "MetricExpr": "LD_HEAD.L1_BOUND_AT_RET / CLKS",
+ "MetricGroup": "TopdownL3;tma_load_store_bound_group",
+ "MetricName": "tma_l1_bound",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement due to a store forward block.",
+ "MetricExpr": "LD_HEAD.ST_ADDR_AT_RET / CLKS",
+ "MetricGroup": "TopdownL4;tma_l1_bound_group",
+ "MetricName": "tma_store_fwd",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement due to a first level TLB miss.",
+ "MetricExpr": "LD_HEAD.DTLB_MISS_AT_RET / CLKS",
+ "MetricGroup": "TopdownL4;tma_l1_bound_group",
+ "MetricName": "tma_stlb_hit",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement due to a second level TLB miss requiring a page walk.",
+ "MetricExpr": "LD_HEAD.PGWALK_AT_RET / CLKS",
+ "MetricGroup": "TopdownL4;tma_l1_bound_group",
+ "MetricName": "tma_stlb_miss",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of cycles that the oldest load of the load buffer is stalled at retirement due to a number of other load blocks.",
+ "MetricExpr": "LD_HEAD.OTHER_AT_RET / CLKS",
+ "MetricGroup": "TopdownL4;tma_l1_bound_group",
+ "MetricName": "tma_other_l1",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of cycles a core is stalled due to a demand load which hit in the L2 Cache.",
+ "MetricExpr": "(MEM_BOUND_STALLS.LOAD_L2_HIT / CLKS) - (MEM_BOUND_STALLS_AT_RET_CORRECTION * MEM_BOUND_STALLS.LOAD_L2_HIT / MEM_BOUND_STALLS.LOAD)",
+ "MetricGroup": "TopdownL3;tma_load_store_bound_group",
+ "MetricName": "tma_l2_bound",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of cycles a core is stalled due to a demand load which hit in the Last Level Cache (LLC) or other core with HITE/F/M.",
+ "MetricExpr": "(MEM_BOUND_STALLS.LOAD_LLC_HIT / CLKS) - (MEM_BOUND_STALLS_AT_RET_CORRECTION * MEM_BOUND_STALLS.LOAD_LLC_HIT / MEM_BOUND_STALLS.LOAD)",
+ "MetricGroup": "TopdownL3;tma_load_store_bound_group",
+ "MetricName": "tma_l3_bound",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of cycles the core is stalled due to a demand load miss which hit in DRAM or MMIO (Non-DRAM).",
+ "MetricExpr": "(MEM_BOUND_STALLS.LOAD_DRAM_HIT / CLKS) - (MEM_BOUND_STALLS_AT_RET_CORRECTION * MEM_BOUND_STALLS.LOAD_DRAM_HIT / MEM_BOUND_STALLS.LOAD)",
+ "MetricGroup": "TopdownL3;tma_load_store_bound_group",
+ "MetricName": "tma_dram_bound",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of cycles the core is stalled due to a demand load miss which hits in the L2, LLC, DRAM or MMIO (Non-DRAM) but could not be correctly attributed or cycles in which the load miss is waiting on a request buffer.",
+ "MetricExpr": "max(0, tma_load_store_bound - (tma_store_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_dram_bound))",
+ "MetricGroup": "TopdownL3;tma_load_store_bound_group",
+ "MetricName": "tma_other_load_store",
+ "ScaleUnit": "100%",
"Unit": "cpu_atom"
},
{
"BriefDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls",
- "MetricExpr": "(TOPDOWN_BE_BOUND.ALL / (5 * CPU_CLK_UNHALTED.CORE))",
+ "MetricExpr": "tma_backend_bound",
"MetricGroup": "TopdownL1",
- "MetricName": "Backend_Bound_Aux",
+ "MetricName": "tma_backend_bound_aux",
"PublicDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls. Note that UOPS must be available for consumption in order for this event to count. If a uop is not available (IQ is empty), this event will not count. All of these subevents count backend stalls, in slots, due to a resource limitation. These are not cycle based events and therefore can not be precisely added or subtracted from the Backend_Bound subevents which are cycle based. These subevents are supplementary to Backend_Bound and can be used to analyze results from a resource perspective at allocation. ",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls",
+ "MetricExpr": "tma_backend_bound",
+ "MetricGroup": "TopdownL2;tma_backend_bound_aux_group",
+ "MetricName": "tma_resource_bound",
+ "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls. Note that uops must be available for consumption in order for this event to count. If a uop is not available (IQ is empty), this event will not count. ",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to memory reservation stalls in which a scheduler is not able to accept uops.",
+ "MetricExpr": "TOPDOWN_BE_BOUND.MEM_SCHEDULER / SLOTS",
+ "MetricGroup": "TopdownL3;tma_resource_bound_group",
+ "MetricName": "tma_mem_scheduler",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of cycles, relative to the number of mem_scheduler slots, in which uops are blocked due to store buffer full",
+ "MetricExpr": "tma_mem_scheduler * (MEM_SCHEDULER_BLOCK.ST_BUF / MEM_SCHEDULER_BLOCK.ALL)",
+ "MetricGroup": "TopdownL4;tma_mem_scheduler_group",
+ "MetricName": "tma_st_buffer",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of cycles, relative to the number of mem_scheduler slots, in which uops are blocked due to load buffer full",
+ "MetricExpr": "tma_mem_scheduler * MEM_SCHEDULER_BLOCK.LD_BUF / MEM_SCHEDULER_BLOCK.ALL",
+ "MetricGroup": "TopdownL4;tma_mem_scheduler_group",
+ "MetricName": "tma_ld_buffer",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of cycles, relative to the number of mem_scheduler slots, in which uops are blocked due to RSV full relative ",
+ "MetricExpr": "tma_mem_scheduler * MEM_SCHEDULER_BLOCK.RSV / MEM_SCHEDULER_BLOCK.ALL",
+ "MetricGroup": "TopdownL4;tma_mem_scheduler_group",
+ "MetricName": "tma_rsv",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to IEC or FPC RAT stalls, which can be due to FIQ or IEC reservation stalls in which the integer, floating point or SIMD scheduler is not able to accept uops.",
+ "MetricExpr": "TOPDOWN_BE_BOUND.NON_MEM_SCHEDULER / SLOTS",
+ "MetricGroup": "TopdownL3;tma_resource_bound_group",
+ "MetricName": "tma_non_mem_scheduler",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to the physical register file unable to accept an entry (marble stalls).",
+ "MetricExpr": "TOPDOWN_BE_BOUND.REGISTER / SLOTS",
+ "MetricGroup": "TopdownL3;tma_resource_bound_group",
+ "MetricName": "tma_register",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to the reorder buffer being full (ROB stalls).",
+ "MetricExpr": "TOPDOWN_BE_BOUND.REORDER_BUFFER / SLOTS",
+ "MetricGroup": "TopdownL3;tma_resource_bound_group",
+ "MetricName": "tma_reorder_buffer",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to certain allocation restrictions.",
+ "MetricExpr": "TOPDOWN_BE_BOUND.ALLOC_RESTRICTIONS / SLOTS",
+ "MetricGroup": "TopdownL3;tma_resource_bound_group",
+ "MetricName": "tma_alloc_restriction",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of issue slots that were not consumed by the backend due to scoreboards from the instruction queue (IQ), jump execution unit (JEU), or microcode sequencer (MS).",
+ "MetricExpr": "TOPDOWN_BE_BOUND.SERIALIZATION / SLOTS",
+ "MetricGroup": "TopdownL3;tma_resource_bound_group",
+ "MetricName": "tma_serialization",
+ "ScaleUnit": "100%",
"Unit": "cpu_atom"
},
{
"BriefDescription": "Counts the numer of issue slots that result in retirement slots. ",
- "MetricExpr": "TOPDOWN_RETIRING.ALL / (5 * CPU_CLK_UNHALTED.CORE)",
+ "MetricExpr": "TOPDOWN_RETIRING.ALL / SLOTS",
"MetricGroup": "TopdownL1",
- "MetricName": "Retiring",
+ "MetricName": "tma_retiring",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of uops that are not from the microsequencer. ",
+ "MetricExpr": "(TOPDOWN_RETIRING.ALL - UOPS_RETIRED.MS) / SLOTS",
+ "MetricGroup": "TopdownL2;tma_retiring_group",
+ "MetricName": "tma_base",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of floating point operations per uop with all default weighting.",
+ "MetricExpr": "UOPS_RETIRED.FPDIV / SLOTS",
+ "MetricGroup": "TopdownL3;tma_base_group",
+ "MetricName": "tma_fp_uops",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of uops retired excluding ms and fp div uops.",
+ "MetricExpr": "(TOPDOWN_RETIRING.ALL - UOPS_RETIRED.MS - UOPS_RETIRED.FPDIV) / SLOTS",
+ "MetricGroup": "TopdownL3;tma_base_group",
+ "MetricName": "tma_other_ret",
+ "ScaleUnit": "100%",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of uops that are from the complex flows issued by the micro-sequencer (MS)",
+ "MetricExpr": "UOPS_RETIRED.MS / SLOTS",
+ "MetricGroup": "TopdownL2;tma_retiring_group",
+ "MetricName": "tma_ms_uops",
+ "PublicDescription": "Counts the number of uops that are from the complex flows issued by the micro-sequencer (MS). This includes uops from flows due to complex instructions, faults, assists, and inserted flows.",
+ "ScaleUnit": "100%",
"Unit": "cpu_atom"
},
{
@@ -551,19 +1776,19 @@
},
{
"BriefDescription": "",
- "MetricExpr": "5 * CPU_CLK_UNHALTED.CORE",
+ "MetricExpr": "5 * CLKS",
"MetricName": "SLOTS",
"Unit": "cpu_atom"
},
{
"BriefDescription": "Instructions Per Cycle",
- "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.CORE",
+ "MetricExpr": "INST_RETIRED.ANY / CLKS",
"MetricName": "IPC",
"Unit": "cpu_atom"
},
{
"BriefDescription": "Cycles Per Instruction",
- "MetricExpr": "CPU_CLK_UNHALTED.CORE / INST_RETIRED.ANY",
+ "MetricExpr": "CLKS / INST_RETIRED.ANY",
"MetricName": "CPI",
"Unit": "cpu_atom"
},
@@ -623,7 +1848,7 @@
},
{
"BriefDescription": "Instructions per Far Branch",
- "MetricExpr": "INST_RETIRED.ANY / ( BR_INST_RETIRED.FAR_BRANCH / 2 )",
+ "MetricExpr": "INST_RETIRED.ANY / (BR_INST_RETIRED.FAR_BRANCH / 2)",
"MetricName": "IpFarBranch",
"Unit": "cpu_atom"
},
@@ -665,7 +1890,7 @@
},
{
"BriefDescription": "Average Frequency Utilization relative nominal frequency",
- "MetricExpr": "CPU_CLK_UNHALTED.CORE / CPU_CLK_UNHALTED.REF_TSC",
+ "MetricExpr": "CLKS / CPU_CLK_UNHALTED.REF_TSC",
"MetricName": "Turbo_Utilization",
"Unit": "cpu_atom"
},
@@ -682,12 +1907,6 @@
"Unit": "cpu_atom"
},
{
- "BriefDescription": "Estimated Pause cost. In percent",
- "MetricExpr": "100 * SERIALIZATION.NON_C01_MS_SCB / (5 * CPU_CLK_UNHALTED.CORE)",
- "MetricName": "Estimated_Pause_Cost",
- "Unit": "cpu_atom"
- },
- {
"BriefDescription": "Cycle cost per L2 hit",
"MetricExpr": "MEM_BOUND_STALLS.LOAD_L2_HIT / MEM_LOAD_UOPS_RETIRED.L2_HIT",
"MetricName": "Cycles_per_Demand_Load_L2_Hit",
@@ -707,19 +1926,19 @@
},
{
"BriefDescription": "Percent of instruction miss cost that hit in the L2",
- "MetricExpr": "100 * MEM_BOUND_STALLS.IFETCH_L2_HIT / ( MEM_BOUND_STALLS.IFETCH )",
+ "MetricExpr": "100 * MEM_BOUND_STALLS.IFETCH_L2_HIT / (MEM_BOUND_STALLS.IFETCH)",
"MetricName": "Inst_Miss_Cost_L2Hit_Percent",
"Unit": "cpu_atom"
},
{
"BriefDescription": "Percent of instruction miss cost that hit in the L3",
- "MetricExpr": "100 * MEM_BOUND_STALLS.IFETCH_LLC_HIT / ( MEM_BOUND_STALLS.IFETCH )",
+ "MetricExpr": "100 * MEM_BOUND_STALLS.IFETCH_LLC_HIT / (MEM_BOUND_STALLS.IFETCH)",
"MetricName": "Inst_Miss_Cost_L3Hit_Percent",
"Unit": "cpu_atom"
},
{
"BriefDescription": "Percent of instruction miss cost that hit in DRAM",
- "MetricExpr": "100 * MEM_BOUND_STALLS.IFETCH_DRAM_HIT / ( MEM_BOUND_STALLS.IFETCH )",
+ "MetricExpr": "100 * MEM_BOUND_STALLS.IFETCH_DRAM_HIT / (MEM_BOUND_STALLS.IFETCH)",
"MetricName": "Inst_Miss_Cost_DRAMHit_Percent",
"Unit": "cpu_atom"
},
diff --git a/tools/perf/pmu-events/arch/x86/alderlake/cache.json b/tools/perf/pmu-events/arch/x86/alderlake/cache.json
index 887dce4dfeba..2cc62d2779d2 100644
--- a/tools/perf/pmu-events/arch/x86/alderlake/cache.json
+++ b/tools/perf/pmu-events/arch/x86/alderlake/cache.json
@@ -1,5 +1,29 @@
[
{
+ "BriefDescription": "Counts the number of cacheable memory requests that miss in the LLC. Counts on a per core basis.",
+ "CollectPEBSRecord": "2",
+ "Counter": "0,1,2,3,4,5",
+ "EventCode": "0x2e",
+ "EventName": "LONGEST_LAT_CACHE.MISS",
+ "PEBScounters": "0,1,2,3,4,5",
+ "SampleAfterValue": "200003",
+ "Speculative": "1",
+ "UMask": "0x41",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts the number of cacheable memory requests that access the LLC. Counts on a per core basis.",
+ "CollectPEBSRecord": "2",
+ "Counter": "0,1,2,3,4,5",
+ "EventCode": "0x2e",
+ "EventName": "LONGEST_LAT_CACHE.REFERENCE",
+ "PEBScounters": "0,1,2,3,4,5",
+ "SampleAfterValue": "200003",
+ "Speculative": "1",
+ "UMask": "0x4f",
+ "Unit": "cpu_atom"
+ },
+ {
"BriefDescription": "Counts the number of cycles the core is stalled due to an instruction cache or TLB miss which hit in the L2, LLC, DRAM or MMIO (Non-DRAM).",
"CollectPEBSRecord": "2",
"Counter": "0,1,2,3,4,5",
@@ -210,8 +234,8 @@
},
{
"BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 128 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
- "CollectPEBSRecord": "3",
- "Counter": "0,1,2,3,4,5",
+ "CollectPEBSRecord": "2",
+ "Counter": "0,1",
"Data_LA": "1",
"EventCode": "0xd0",
"EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_128",
@@ -219,7 +243,7 @@
"MSRIndex": "0x3F6",
"MSRValue": "0x80",
"PEBS": "2",
- "PEBScounters": "0,1,2,3,4,5",
+ "PEBScounters": "0,1",
"SampleAfterValue": "1000003",
"TakenAlone": "1",
"UMask": "0x5",
@@ -227,8 +251,8 @@
},
{
"BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 16 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
- "CollectPEBSRecord": "3",
- "Counter": "0,1,2,3,4,5",
+ "CollectPEBSRecord": "2",
+ "Counter": "0,1",
"Data_LA": "1",
"EventCode": "0xd0",
"EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_16",
@@ -236,7 +260,7 @@
"MSRIndex": "0x3F6",
"MSRValue": "0x10",
"PEBS": "2",
- "PEBScounters": "0,1,2,3,4,5",
+ "PEBScounters": "0,1",
"SampleAfterValue": "1000003",
"TakenAlone": "1",
"UMask": "0x5",
@@ -244,8 +268,8 @@
},
{
"BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 256 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
- "CollectPEBSRecord": "3",
- "Counter": "0,1,2,3,4,5",
+ "CollectPEBSRecord": "2",
+ "Counter": "0,1",
"Data_LA": "1",
"EventCode": "0xd0",
"EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_256",
@@ -253,7 +277,7 @@
"MSRIndex": "0x3F6",
"MSRValue": "0x100",
"PEBS": "2",
- "PEBScounters": "0,1,2,3,4,5",
+ "PEBScounters": "0,1",
"SampleAfterValue": "1000003",
"TakenAlone": "1",
"UMask": "0x5",
@@ -261,8 +285,8 @@
},
{
"BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 32 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
- "CollectPEBSRecord": "3",
- "Counter": "0,1,2,3,4,5",
+ "CollectPEBSRecord": "2",
+ "Counter": "0,1",
"Data_LA": "1",
"EventCode": "0xd0",
"EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_32",
@@ -270,7 +294,7 @@
"MSRIndex": "0x3F6",
"MSRValue": "0x20",
"PEBS": "2",
- "PEBScounters": "0,1,2,3,4,5",
+ "PEBScounters": "0,1",
"SampleAfterValue": "1000003",
"TakenAlone": "1",
"UMask": "0x5",
@@ -278,8 +302,8 @@
},
{
"BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 4 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
- "CollectPEBSRecord": "3",
- "Counter": "0,1,2,3,4,5",
+ "CollectPEBSRecord": "2",
+ "Counter": "0,1",
"Data_LA": "1",
"EventCode": "0xd0",
"EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_4",
@@ -287,7 +311,7 @@
"MSRIndex": "0x3F6",
"MSRValue": "0x4",
"PEBS": "2",
- "PEBScounters": "0,1,2,3,4,5",
+ "PEBScounters": "0,1",
"SampleAfterValue": "1000003",
"TakenAlone": "1",
"UMask": "0x5",
@@ -295,8 +319,8 @@
},
{
"BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 512 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
- "CollectPEBSRecord": "3",
- "Counter": "0,1,2,3,4,5",
+ "CollectPEBSRecord": "2",
+ "Counter": "0,1",
"Data_LA": "1",
"EventCode": "0xd0",
"EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_512",
@@ -304,7 +328,7 @@
"MSRIndex": "0x3F6",
"MSRValue": "0x200",
"PEBS": "2",
- "PEBScounters": "0,1,2,3,4,5",
+ "PEBScounters": "0,1",
"SampleAfterValue": "1000003",
"TakenAlone": "1",
"UMask": "0x5",
@@ -312,8 +336,8 @@
},
{
"BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 64 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
- "CollectPEBSRecord": "3",
- "Counter": "0,1,2,3,4,5",
+ "CollectPEBSRecord": "2",
+ "Counter": "0,1",
"Data_LA": "1",
"EventCode": "0xd0",
"EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_64",
@@ -321,7 +345,7 @@
"MSRIndex": "0x3F6",
"MSRValue": "0x40",
"PEBS": "2",
- "PEBScounters": "0,1,2,3,4,5",
+ "PEBScounters": "0,1",
"SampleAfterValue": "1000003",
"TakenAlone": "1",
"UMask": "0x5",
@@ -329,8 +353,8 @@
},
{
"BriefDescription": "Counts the number of tagged loads with an instruction latency that exceeds or equals the threshold of 8 cycles as defined in MEC_CR_PEBS_LD_LAT_THRESHOLD (3F6H). Only counts with PEBS enabled.",
- "CollectPEBSRecord": "3",
- "Counter": "0,1,2,3,4,5",
+ "CollectPEBSRecord": "2",
+ "Counter": "0,1",
"Data_LA": "1",
"EventCode": "0xd0",
"EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_8",
@@ -338,7 +362,7 @@
"MSRIndex": "0x3F6",
"MSRValue": "0x8",
"PEBS": "2",
- "PEBScounters": "0,1,2,3,4,5",
+ "PEBScounters": "0,1",
"SampleAfterValue": "1000003",
"TakenAlone": "1",
"UMask": "0x5",
@@ -359,7 +383,7 @@
},
{
"BriefDescription": "Counts the number of stores uops retired. Counts with or without PEBS enabled.",
- "CollectPEBSRecord": "3",
+ "CollectPEBSRecord": "2",
"Counter": "0,1,2,3,4,5",
"Data_LA": "1",
"EventCode": "0xd0",
@@ -372,6 +396,61 @@
"Unit": "cpu_atom"
},
{
+ "BriefDescription": "Counts demand data reads that were supplied by the L3 cache.",
+ "Counter": "0,1,2,3,4,5",
+ "EventCode": "0xB7",
+ "EventName": "OCR.DEMAND_DATA_RD.L3_HIT",
+ "MSRIndex": "0x1a6,0x1a7",
+ "MSRValue": "0x3F803C0001",
+ "SampleAfterValue": "100003",
+ "UMask": "0x1",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts demand data reads that were supplied by the L3 cache where a snoop was sent, the snoop hit, and modified data was forwarded.",
+ "Counter": "0,1,2,3,4,5",
+ "EventCode": "0xB7",
+ "EventName": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM",
+ "MSRIndex": "0x1a6,0x1a7",
+ "MSRValue": "0x10003C0001",
+ "SampleAfterValue": "100003",
+ "UMask": "0x1",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts demand data reads that were supplied by the L3 cache where a snoop was sent, the snoop hit, but no data was forwarded.",
+ "Counter": "0,1,2,3,4,5",
+ "EventCode": "0xB7",
+ "EventName": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_NO_FWD",
+ "MSRIndex": "0x1a6,0x1a7",
+ "MSRValue": "0x4003C0001",
+ "SampleAfterValue": "100003",
+ "UMask": "0x1",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts demand data reads that were supplied by the L3 cache where a snoop was sent, the snoop hit, and non-modified data was forwarded.",
+ "Counter": "0,1,2,3,4,5",
+ "EventCode": "0xB7",
+ "EventName": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD",
+ "MSRIndex": "0x1a6,0x1a7",
+ "MSRValue": "0x8003C0001",
+ "SampleAfterValue": "100003",
+ "UMask": "0x1",
+ "Unit": "cpu_atom"
+ },
+ {
+ "BriefDescription": "Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that were supplied by the L3 cache.",
+ "Counter": "0,1,2,3,4,5",
+ "EventCode": "0xB7",
+ "EventName": "OCR.DEMAND_RFO.L3_HIT",
+ "MSRIndex": "0x1a6,0x1a7",
+ "MSRValue": "0x3F803C0002",
+ "SampleAfterValue": "100003",
+ "UMask": "0x1",
+ "Unit": "cpu_atom"
+ },
+ {
"BriefDescription": "Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that were supplied by the L3 cache where a snoop was sent, the snoop hit, and modified data was forwarded.",
"Counter": "0,1,2,3,4,5",
"EventCode": "0xB7",
diff --git a/tools/perf/pmu-events/arch/x86/alderlake/frontend.json b/tools/perf/pmu-events/arch/x86/alderlake/frontend.json
index 2cfa70b2d5e1..da1a7ba0e568 100644
--- a/tools/perf/pmu-events/arch/x86/alderlake/frontend.json
+++ b/tools/perf/pmu-events/arch/x86/alderlake/frontend.json
@@ -48,6 +48,18 @@
"Unit": "cpu_core"
},
{
+ "BriefDescription": "Cycles the Microcode Sequencer is busy.",
+ "CollectPEBSRecord": "2",
+ "Counter": "0,1,2,3",
+ "EventCode": "0x87",
+ "EventName": "DECODE.MS_BUSY",
+ "PEBScounters": "0,1,2,3",
+ "SampleAfterValue": "500009",
+ "Speculative": "1",
+ "UMask": "0x2",
+ "Unit": "cpu_core"
+ },
+ {
"BriefDescription": "DSB-to-MITE switch true penalty cycles.",
"CollectPEBSRecord": "2",
"Counter": "0,1,2,3",
diff --git a/tools/perf/pmu-events/arch/x86/alderlake/memory.json b/tools/perf/pmu-events/arch/x86/alderlake/memory.json
index 586fb961e46d..f894e4a0212b 100644
--- a/tools/perf/pmu-events/arch/x86/alderlake/memory.json
+++ b/tools/perf/pmu-events/arch/x86/alderlake/memory.json
@@ -83,6 +83,17 @@
"Unit": "cpu_atom"
},
{
+ "BriefDescription": "Counts demand data reads that were not supplied by the L3 cache.",
+ "Counter": "0,1,2,3,4,5",
+ "EventCode": "0xB7",
+ "EventName": "OCR.DEMAND_DATA_RD.L3_MISS_LOCAL",
+ "MSRIndex": "0x1a6,0x1a7",
+ "MSRValue": "0x3F84400001",
+ "SampleAfterValue": "100003",
+ "UMask": "0x1",
+ "Unit": "cpu_atom"
+ },
+ {
"BriefDescription": "Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that were not supplied by the L3 cache.",
"Counter": "0,1,2,3,4,5",
"EventCode": "0xB7",
@@ -94,6 +105,17 @@
"Unit": "cpu_atom"
},
{
+ "BriefDescription": "Counts demand reads for ownership (RFO) and software prefetches for exclusive ownership (PREFETCHW) that were not supplied by the L3 cache.",
+ "Counter": "0,1,2,3,4,5",
+ "EventCode": "0xB7",
+ "EventName": "OCR.DEMAND_RFO.L3_MISS_LOCAL",
+ "MSRIndex": "0x1a6,0x1a7",
+ "MSRValue": "0x3F84400002",
+ "SampleAfterValue": "100003",
+ "UMask": "0x1",
+ "Unit": "cpu_atom"
+ },
+ {
"BriefDescription": "Execution stalls while L3 cache miss demand load is outstanding.",
"CollectPEBSRecord": "2",
"Counter": "0,1,2,3",
diff --git a/tools/perf/pmu-events/arch/x86/alderlake/other.json b/tools/perf/pmu-events/arch/x86/alderlake/other.json
index 67a9c13cc71d..c49d8ce27310 100644
--- a/tools/perf/pmu-events/arch/x86/alderlake/other.json
+++ b/tools/perf/pmu-events/arch/x86/alderlake/other.json
@@ -1,5 +1,16 @@
[
{
+ "BriefDescription": "Counts modified writebacks from L1 cache and L2 cache that have any type of response.",
+ "Counter": "0,1,2,3,4,5",
+ "EventCode": "0xB7",
+ "EventName": "OCR.COREWB_M.ANY_RESPONSE",
+ "MSRIndex": "0x1a6,0x1a7",
+ "MSRValue": "0x10008",
+ "SampleAfterValue": "100003",
+ "UMask": "0x1",
+ "Unit": "cpu_atom"
+ },
+ {
"BriefDescription": "Counts demand data reads that have any type of response.",
"Counter": "0,1,2,3,4,5",
"EventCode": "0xB7",
@@ -104,6 +115,17 @@
"Unit": "cpu_core"
},
{
+ "BriefDescription": "Counts demand data reads that were supplied by DRAM.",
+ "Counter": "0,1,2,3,4,5,6,7",
+ "EventCode": "0x2A,0x2B",
+ "EventName": "OCR.DEMAND_DATA_RD.DRAM",
+ "MSRIndex": "0x1a6,0x1a7",
+ "MSRValue": "0x184000001",
+ "SampleAfterValue": "100003",
+ "UMask": "0x1",
+ "Unit": "cpu_core"
+ },
+ {
"BriefDescription": "Counts demand read for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that have any type of response.",
"Counter": "0,1,2,3,4,5,6,7",
"EventCode": "0x2A,0x2B",
diff --git a/tools/perf/pmu-events/arch/x86/alderlake/pipeline.json b/tools/perf/pmu-events/arch/x86/alderlake/pipeline.json
index d02e078a90c9..1a137f7f8b7e 100644
--- a/tools/perf/pmu-events/arch/x86/alderlake/pipeline.json
+++ b/tools/perf/pmu-events/arch/x86/alderlake/pipeline.json
@@ -331,6 +331,18 @@
"Unit": "cpu_atom"
},
{
+ "BriefDescription": "Counts the number of unhalted reference clock cycles at TSC frequency.",
+ "CollectPEBSRecord": "2",
+ "Counter": "0,1,2,3,4,5",
+ "EventCode": "0x3c",
+ "EventName": "CPU_CLK_UNHALTED.REF_TSC_P",
+ "PEBScounters": "0,1,2,3,4,5",
+ "SampleAfterValue": "2000003",
+ "Speculative": "1",
+ "UMask": "0x1",
+ "Unit": "cpu_atom"
+ },
+ {
"BriefDescription": "Counts the number of unhalted core clock cycles. (Fixed event)",
"CollectPEBSRecord": "2",
"Counter": "Fixed counter 1",
@@ -874,7 +886,7 @@
"PEBScounters": "0,1,2,3,4,5,6,7",
"SampleAfterValue": "100003",
"Speculative": "1",
- "UMask": "0x1f",
+ "UMask": "0x1b",
"Unit": "cpu_core"
},
{
diff --git a/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json b/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json
index d65afe3d0b06..c220b1cf1740 100644
--- a/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json
@@ -1,64 +1,552 @@
[
{
"BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend",
- "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)",
- "MetricGroup": "TopdownL1",
- "MetricName": "Frontend_Bound",
- "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound."
+ "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / SLOTS",
+ "MetricGroup": "PGO;TopdownL1;tma_L1_group",
+ "MetricName": "tma_frontend_bound",
+ "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.",
+ "ScaleUnit": "100%"
},
{
- "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. SMT version; use when SMT is enabled and measuring per logical CPU.",
- "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
- "MetricGroup": "TopdownL1_SMT",
- "MetricName": "Frontend_Bound_SMT",
- "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. SMT version; use when SMT is enabled and measuring per logical CPU."
+ "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues",
+ "MetricExpr": "4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / SLOTS",
+ "MetricGroup": "Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group",
+ "MetricName": "tma_fetch_latency",
+ "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: RS_EVENTS.EMPTY_END",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.",
+ "MetricExpr": "ICACHE.IFDATA_STALL / CLKS",
+ "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_fetch_latency_group",
+ "MetricName": "tma_icache_misses",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses",
+ "MetricExpr": "(14 * ITLB_MISSES.STLB_HIT + cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * ITLB_MISSES.WALK_COMPLETED) / CLKS",
+ "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_fetch_latency_group",
+ "MetricName": "tma_itlb_misses",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: ITLB_MISSES.WALK_COMPLETED",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers",
+ "MetricExpr": "12 * (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY) / CLKS",
+ "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group",
+ "MetricName": "tma_branch_resteers",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. Branch Resteers estimates the Frontend delay in fetching operations from corrected path; following all sorts of miss-predicted branches. For example; branchy code with lots of miss-predictions might get categorized under Branch Resteers. Note the value of this node may overlap with its siblings. Sample with: BR_MISP_RETIRED.ALL_BRANCHES",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. ",
+ "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES * tma_branch_resteers / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY)",
+ "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_branch_resteers_group",
+ "MetricName": "tma_mispredicts_resteers",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears. ",
+ "MetricExpr": "MACHINE_CLEARS.COUNT * tma_branch_resteers / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY)",
+ "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_branch_resteers_group",
+ "MetricName": "tma_clears_resteers",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears",
+ "MetricExpr": "tma_branch_resteers - tma_mispredicts_resteers - tma_clears_resteers",
+ "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_branch_resteers_group",
+ "MetricName": "tma_unknown_branches",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (First fetch or hitting BPU capacity limit). Sample with: BACLEARS.ANY",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines",
+ "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / CLKS",
+ "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_fetch_latency_group",
+ "MetricName": "tma_dsb_switches",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)",
+ "MetricExpr": "ILD_STALL.LCP / CLKS",
+ "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group",
+ "MetricName": "tma_lcp",
+ "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)",
+ "MetricExpr": "2 * IDQ.MS_SWITCHES / CLKS",
+ "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_fetch_latency_group",
+ "MetricName": "tma_ms_switches",
+ "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues",
+ "MetricExpr": "tma_frontend_bound - tma_fetch_latency",
+ "MetricGroup": "FetchBW;Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group",
+ "MetricName": "tma_fetch_bandwidth",
+ "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)",
+ "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / CORE_CLKS / 2",
+ "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_fetch_bandwidth_group",
+ "MetricName": "tma_mite",
+ "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline",
+ "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / CORE_CLKS / 2",
+ "MetricGroup": "DSB;FetchBW;TopdownL3;tma_fetch_bandwidth_group",
+ "MetricName": "tma_dsb",
+ "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.",
+ "ScaleUnit": "100%"
},
{
"BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations",
- "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)",
- "MetricGroup": "TopdownL1",
- "MetricName": "Bad_Speculation",
- "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example."
+ "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ((INT_MISC.RECOVERY_CYCLES_ANY / 2) if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / SLOTS",
+ "MetricGroup": "TopdownL1;tma_L1_group",
+ "MetricName": "tma_bad_speculation",
+ "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction",
+ "MetricExpr": "(BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * tma_bad_speculation",
+ "MetricGroup": "BadSpec;BrMispredicts;TopdownL2;tma_L2_group;tma_bad_speculation_group",
+ "MetricName": "tma_branch_mispredicts",
+ "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES",
+ "ScaleUnit": "100%"
},
{
- "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations. SMT version; use when SMT is enabled and measuring per logical CPU.",
- "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
- "MetricGroup": "TopdownL1_SMT",
- "MetricName": "Bad_Speculation_SMT",
- "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example. SMT version; use when SMT is enabled and measuring per logical CPU."
+ "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears",
+ "MetricExpr": "tma_bad_speculation - tma_branch_mispredicts",
+ "MetricGroup": "BadSpec;MachineClears;TopdownL2;tma_L2_group;tma_bad_speculation_group",
+ "MetricName": "tma_machine_clears",
+ "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT",
+ "ScaleUnit": "100%"
},
{
"BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend",
- "MetricConstraint": "NO_NMI_WATCHDOG",
- "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) )",
- "MetricGroup": "TopdownL1",
- "MetricName": "Backend_Bound",
- "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound."
+ "MetricExpr": "1 - (tma_frontend_bound + tma_bad_speculation + tma_retiring)",
+ "MetricGroup": "TopdownL1;tma_L1_group",
+ "MetricName": "tma_backend_bound",
+ "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck",
+ "MetricExpr": "((CYCLE_ACTIVITY.STALLS_MEM_ANY + RESOURCE_STALLS.SB) / (CYCLE_ACTIVITY.STALLS_TOTAL + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if (IPC > 1.8) else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else RESOURCE_STALLS.SB)) * tma_backend_bound",
+ "MetricGroup": "Backend;TopdownL2;tma_L2_group;tma_backend_bound_group",
+ "MetricName": "tma_memory_bound",
+ "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache",
+ "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / CLKS, 0)",
+ "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group",
+ "MetricName": "tma_l1_bound",
+ "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_UOPS_RETIRED.L1_HIT_PS;MEM_LOAD_UOPS_RETIRED.HIT_LFB_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses",
+ "MetricExpr": "(8 * DTLB_LOAD_MISSES.STLB_HIT + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * DTLB_LOAD_MISSES.WALK_COMPLETED) / CLKS",
+ "MetricGroup": "MemoryTLB;TopdownL4;tma_l1_bound_group",
+ "MetricName": "tma_dtlb_load",
+ "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_UOPS_RETIRED.STLB_MISS_LOADS_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores",
+ "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / CLKS",
+ "MetricGroup": "TopdownL4;tma_l1_bound_group",
+ "MetricName": "tma_store_fwd_blk",
+ "PublicDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores. To streamline memory operations in the pipeline; a load can avoid waiting for memory if a prior in-flight store is writing the data that the load wants to read (store forwarding process). However; in some cases the load may be blocked for a significant time pending the store forward. For example; when the prior store is writing a smaller region than the load is reading.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations",
+ "MetricExpr": "(MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO) / CLKS",
+ "MetricGroup": "Offcore;TopdownL4;tma_l1_bound_group",
+ "MetricName": "tma_lock_latency",
+ "PublicDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations. Due to the microarchitecture handling of locks; they are classified as L1_Bound regardless of what memory source satisfied them. Sample with: MEM_UOPS_RETIRED.LOCK_LOADS_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary",
+ "MetricExpr": "Load_Miss_Real_Latency * LD_BLOCKS.NO_SR / CLKS",
+ "MetricGroup": "TopdownL4;tma_l1_bound_group",
+ "MetricName": "tma_split_loads",
+ "PublicDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary. Sample with: MEM_UOPS_RETIRED.SPLIT_LOADS_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset",
+ "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / CLKS",
+ "MetricGroup": "TopdownL4;tma_l1_bound_group",
+ "MetricName": "tma_4k_aliasing",
+ "PublicDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset. False match is possible; which incur a few cycles load re-issue. However; the short re-issue duration is often hidden by the out-of-order core and HW optimizations; hence a user may safely ignore a high value of this metric unless it manages to propagate up into parent nodes of the hierarchy (e.g. to L1_Bound).",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed",
+ "MetricExpr": "Load_Miss_Real_Latency * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / CLKS",
+ "MetricGroup": "MemoryBW;TopdownL4;tma_l1_bound_group",
+ "MetricName": "tma_fb_full",
+ "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory).",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads",
+ "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / CLKS",
+ "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group",
+ "MetricName": "tma_l2_bound",
+ "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L2_HIT_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core",
+ "MetricExpr": "(MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS)) * CYCLE_ACTIVITY.STALLS_L2_MISS / CLKS",
+ "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group",
+ "MetricName": "tma_l3_bound",
+ "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses",
+ "MetricExpr": "(60 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_RETIRED.L3_MISS))) + 43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_RETIRED.L3_MISS)))) / CLKS",
+ "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_l3_bound_group",
+ "MetricName": "tma_contested_accesses",
+ "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses. Contested accesses occur when data written by one Logical Processor are read by another Logical Processor on a different Physical Core. Examples of contested accesses include synchronizations such as locks; true data sharing such as modified locked variables; and false sharing. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM_PS;MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses",
+ "MetricExpr": "43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_RETIRED.L3_MISS))) / CLKS",
+ "MetricGroup": "Offcore;Snoop;TopdownL4;tma_l3_bound_group",
+ "MetricName": "tma_data_sharing",
+ "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses. Data shared by multiple Logical Processors (even just read shared) may cause increased access latency due to cache coherency. Excessive data sharing can drastically harm multithreaded performance. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
+ "MetricExpr": "29 * (MEM_LOAD_UOPS_RETIRED.L3_HIT * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_RETIRED.L3_MISS))) / CLKS",
+ "MetricGroup": "MemoryLat;TopdownL4;tma_l3_bound_group",
+ "MetricName": "tma_l3_hit_latency",
+ "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)",
+ "MetricExpr": "((OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2) if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / CORE_CLKS",
+ "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_l3_bound_group",
+ "MetricName": "tma_sq_full",
+ "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). The Super Queue is used for requests to access the L2 cache or to go out to the Uncore.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads",
+ "MetricExpr": "(1 - (MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS))) * CYCLE_ACTIVITY.STALLS_L2_MISS / CLKS",
+ "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group",
+ "MetricName": "tma_dram_bound",
+ "PublicDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L3_MISS_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)",
+ "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / CLKS",
+ "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_dram_bound_group",
+ "MetricName": "tma_mem_bandwidth",
+ "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that).",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)",
+ "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / CLKS - tma_mem_bandwidth",
+ "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_dram_bound_group",
+ "MetricName": "tma_mem_latency",
+ "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that).",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write",
+ "MetricExpr": "RESOURCE_STALLS.SB / CLKS",
+ "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group",
+ "MetricName": "tma_store_bound",
+ "PublicDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck. Sample with: MEM_UOPS_RETIRED.ALL_STORES_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses",
+ "MetricExpr": "((L2_RQSTS.RFO_HIT * 9 * (1 - (MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES))) + (1 - (MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES)) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / CLKS",
+ "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_store_bound_group",
+ "MetricName": "tma_store_latency",
+ "PublicDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses. Store accesses usually less impact out-of-order core performance; however; holding resources for longer time can lead into undesired implications (e.g. contention on L1D fill-buffer entries - see FB_Full)",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing",
+ "MetricExpr": "60 * OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.SNOOP_HITM / CLKS",
+ "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_store_bound_group",
+ "MetricName": "tma_false_sharing",
+ "PublicDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing. False Sharing is a multithreading hiccup; where multiple Logical Processors contend on different data-elements mapped into the same cache line. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM_PS;OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.SNOOP_HITM",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents rate of split store accesses",
+ "MetricExpr": "2 * MEM_UOPS_RETIRED.SPLIT_STORES / CORE_CLKS",
+ "MetricGroup": "TopdownL4;tma_store_bound_group",
+ "MetricName": "tma_split_stores",
+ "PublicDescription": "This metric represents rate of split store accesses. Consider aligning your data to the 64-byte cache line granularity. Sample with: MEM_UOPS_RETIRED.SPLIT_STORES_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses",
+ "MetricExpr": "(8 * DTLB_STORE_MISSES.STLB_HIT + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * DTLB_STORE_MISSES.WALK_COMPLETED) / CLKS",
+ "MetricGroup": "MemoryTLB;TopdownL4;tma_store_bound_group",
+ "MetricName": "tma_dtlb_store",
+ "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_UOPS_RETIRED.STLB_MISS_STORES_PS",
+ "ScaleUnit": "100%"
},
{
- "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. SMT version; use when SMT is enabled and measuring per logical CPU.",
- "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) )",
- "MetricGroup": "TopdownL1_SMT",
- "MetricName": "Backend_Bound_SMT",
- "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. SMT version; use when SMT is enabled and measuring per logical CPU."
+ "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck",
+ "MetricExpr": "tma_backend_bound - tma_memory_bound",
+ "MetricGroup": "Backend;Compute;TopdownL2;tma_L2_group;tma_backend_bound_group",
+ "MetricName": "tma_core_bound",
+ "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active",
+ "MetricExpr": "ARITH.FPU_DIV_ACTIVE / CORE_CLKS",
+ "MetricGroup": "TopdownL3;tma_core_bound_group",
+ "MetricName": "tma_divider",
+ "PublicDescription": "This metric represents fraction of cycles where the Divider unit was active. Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication. Sample with: ARITH.DIVIDER_UOPS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)",
+ "MetricExpr": "((CYCLE_ACTIVITY.STALLS_TOTAL + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if (IPC > 1.8) else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else RESOURCE_STALLS.SB) - RESOURCE_STALLS.SB - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CLKS",
+ "MetricGroup": "PortsUtil;TopdownL3;tma_core_bound_group",
+ "MetricName": "tma_ports_utilization",
+ "PublicDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related). Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). (2) Contention on some hardware execution unit other than Divider. For example; when there are too many multiply operations.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
+ "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@) / 2 if #SMT_on else (CYCLE_ACTIVITY.STALLS_TOTAL - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else 0) / CORE_CLKS",
+ "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group",
+ "MetricName": "tma_ports_utilized_0",
+ "PublicDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise). Long-latency instructions like divides may contribute to this metric.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
+ "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) / CORE_CLKS",
+ "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group",
+ "MetricName": "tma_ports_utilized_1",
+ "PublicDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). This can be due to heavy data-dependency among software instructions; or over oversubscribing a particular hardware resource. In some other cases with high 1_Port_Utilized and L1_Bound; this metric can point to L1 data-cache latency bottleneck that may not necessarily manifest with complete execution starvation (due to the short L1 latency e.g. walking a linked list) - looking at the assembly can be helpful.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
+ "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / CORE_CLKS",
+ "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group",
+ "MetricName": "tma_ports_utilized_2",
+ "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).",
+ "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / CORE_CLKS",
+ "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group",
+ "MetricName": "tma_ports_utilized_3m",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.",
+ "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / (4 * CORE_CLKS)",
+ "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group",
+ "MetricName": "tma_alu_op_utilization",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch) Sample with: UOPS_DISPATCHED_PORT.PORT_0",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / CORE_CLKS",
+ "MetricGroup": "Compute;TopdownL6;tma_alu_op_utilization_group",
+ "MetricName": "tma_port_0",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU) Sample with: UOPS_DISPATCHED_PORT.PORT_1",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / CORE_CLKS",
+ "MetricGroup": "TopdownL6;tma_alu_op_utilization_group",
+ "MetricName": "tma_port_1",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU) Sample with: UOPS_DISPATCHED.PORT_5",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / CORE_CLKS",
+ "MetricGroup": "TopdownL6;tma_alu_op_utilization_group",
+ "MetricName": "tma_port_5",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU) Sample with: UOPS_DISPATCHED_PORT.PORT_6",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / CORE_CLKS",
+ "MetricGroup": "TopdownL6;tma_alu_op_utilization_group",
+ "MetricName": "tma_port_6",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations Sample with: UOPS_DISPATCHED.PORT_2_3",
+ "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * CORE_CLKS)",
+ "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group",
+ "MetricName": "tma_load_op_utilization",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 2 ([SNB+]Loads and Store-address; [ICL+] Loads) Sample with: UOPS_DISPATCHED_PORT.PORT_2",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / CORE_CLKS",
+ "MetricGroup": "TopdownL6;tma_load_op_utilization_group",
+ "MetricName": "tma_port_2",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 3 ([SNB+]Loads and Store-address; [ICL+] Loads) Sample with: UOPS_DISPATCHED_PORT.PORT_3",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / CORE_CLKS",
+ "MetricGroup": "TopdownL6;tma_load_op_utilization_group",
+ "MetricName": "tma_port_3",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / CORE_CLKS",
+ "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group",
+ "MetricName": "tma_store_op_utilization",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 4 (Store-data) Sample with: UOPS_DISPATCHED_PORT.PORT_4",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / CORE_CLKS",
+ "MetricGroup": "TopdownL6;tma_store_op_utilization_group",
+ "MetricName": "tma_port_4",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 7 ([HSW+]simple Store-address) Sample with: UOPS_DISPATCHED_PORT.PORT_7",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / CORE_CLKS",
+ "MetricGroup": "TopdownL6;tma_store_op_utilization_group",
+ "MetricName": "tma_port_7",
+ "ScaleUnit": "100%"
},
{
"BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired",
- "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)",
- "MetricGroup": "TopdownL1",
- "MetricName": "Retiring",
- "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. "
+ "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / SLOTS",
+ "MetricGroup": "TopdownL1;tma_L1_group",
+ "MetricName": "tma_retiring",
+ "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation)",
+ "MetricExpr": "tma_retiring - tma_heavy_operations",
+ "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group",
+ "MetricName": "tma_light_operations",
+ "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
+ "ScaleUnit": "100%"
},
{
- "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. SMT version; use when SMT is enabled and measuring per logical CPU.",
- "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
- "MetricGroup": "TopdownL1_SMT",
- "MetricName": "Retiring_SMT",
- "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. SMT version; use when SMT is enabled and measuring per logical CPU."
+ "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired)",
+ "MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector",
+ "MetricGroup": "HPC;TopdownL3;tma_light_operations_group",
+ "MetricName": "tma_fp_arith",
+ "PublicDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired). Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain and FMA double-counting.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric serves as an approximation of legacy x87 usage",
+ "MetricExpr": "INST_RETIRED.X87 * UPI / UOPS_RETIRED.RETIRE_SLOTS",
+ "MetricGroup": "Compute;TopdownL4;tma_fp_arith_group",
+ "MetricName": "tma_x87_use",
+ "PublicDescription": "This metric serves as an approximation of legacy x87 usage. It accounts for instructions beyond X87 FP arithmetic operations; hence may be used as a thermometer to avoid X87 high usage and preferably upgrade to modern ISA. See Tip under Tuning Hint.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired",
+ "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) / UOPS_RETIRED.RETIRE_SLOTS",
+ "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group",
+ "MetricName": "tma_fp_scalar",
+ "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths",
+ "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / UOPS_RETIRED.RETIRE_SLOTS",
+ "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group",
+ "MetricName": "tma_fp_vector",
+ "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors",
+ "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE) / UOPS_RETIRED.RETIRE_SLOTS",
+ "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group",
+ "MetricName": "tma_fp_vector_128b",
+ "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors. May overcount due to FMA double counting.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors",
+ "MetricExpr": "(FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / UOPS_RETIRED.RETIRE_SLOTS",
+ "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group",
+ "MetricName": "tma_fp_vector_256b",
+ "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors. May overcount due to FMA double counting.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences",
+ "MetricExpr": "tma_microcode_sequencer",
+ "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group",
+ "MetricName": "tma_heavy_operations",
+ "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences. This highly-correlates with the uop length of these instructions/sequences.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit",
+ "MetricExpr": "(UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY) * IDQ.MS_UOPS / SLOTS",
+ "MetricGroup": "MicroSeq;TopdownL3;tma_heavy_operations_group",
+ "MetricName": "tma_microcode_sequencer",
+ "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists",
+ "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / SLOTS",
+ "MetricGroup": "TopdownL4;tma_microcode_sequencer_group",
+ "MetricName": "tma_assists",
+ "PublicDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists. Assists are long sequences of uops that are required in certain corner-cases for operations that cannot be handled natively by the execution pipeline. For example; when working with very small floating point values (so-called Denormals); the FP units are not set up to perform these operations natively. Instead; a sequence of instructions to perform the computation on the Denormals is injected into the pipeline. Since these microcode sequences might be dozens of uops long; Assists can be extremely deleterious to performance and they can be avoided in many cases. Sample with: OTHER_ASSISTS.ANY",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction",
+ "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)",
+ "MetricGroup": "TopdownL4;tma_microcode_sequencer_group",
+ "MetricName": "tma_cisc",
+ "PublicDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction. A CISC instruction has multiple uops that are required to perform the instruction's functionality as in the case of read-modify-write as an example. Since these instructions require multiple uops they may or may not imply sub-optimal use of machine resources.",
+ "ScaleUnit": "100%"
},
{
"BriefDescription": "Instructions Per Cycle (per Logical Processor)",
- "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD",
+ "MetricExpr": "INST_RETIRED.ANY / CLKS",
"MetricGroup": "Ret;Summary",
"MetricName": "IPC"
},
@@ -76,8 +564,8 @@
},
{
"BriefDescription": "Cycles Per Instruction (per Logical Processor)",
- "MetricExpr": "1 / (INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD)",
- "MetricGroup": "Pipeline;Mem",
+ "MetricExpr": "1 / IPC",
+ "MetricGroup": "Mem;Pipeline",
"MetricName": "CPI"
},
{
@@ -88,17 +576,11 @@
},
{
"BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)",
- "MetricExpr": "4 * CPU_CLK_UNHALTED.THREAD",
- "MetricGroup": "TmaL1",
+ "MetricExpr": "4 * CORE_CLKS",
+ "MetricGroup": "tma_L1_group",
"MetricName": "SLOTS"
},
{
- "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)",
- "MetricExpr": "4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )",
- "MetricGroup": "TmaL1_SMT",
- "MetricName": "SLOTS_SMT"
- },
- {
"BriefDescription": "The ratio of Executed- by Issued-Uops",
"MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY",
"MetricGroup": "Cor;Pipeline",
@@ -107,51 +589,32 @@
},
{
"BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)",
- "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD",
- "MetricGroup": "Ret;SMT;TmaL1",
+ "MetricExpr": "INST_RETIRED.ANY / CORE_CLKS",
+ "MetricGroup": "Ret;SMT;tma_L1_group",
"MetricName": "CoreIPC"
},
{
- "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)",
- "MetricExpr": "INST_RETIRED.ANY / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )",
- "MetricGroup": "Ret;SMT;TmaL1_SMT",
- "MetricName": "CoreIPC_SMT"
- },
- {
"BriefDescription": "Floating Point Operations Per Cycle",
- "MetricExpr": "( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / CPU_CLK_UNHALTED.THREAD",
- "MetricGroup": "Ret;Flops",
+ "MetricExpr": "(1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / CORE_CLKS",
+ "MetricGroup": "Flops;Ret",
"MetricName": "FLOPc"
},
{
- "BriefDescription": "Floating Point Operations Per Cycle",
- "MetricExpr": "( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )",
- "MetricGroup": "Ret;Flops_SMT",
- "MetricName": "FLOPc_SMT"
- },
- {
"BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)",
- "MetricExpr": "( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) ) / ( 2 * CPU_CLK_UNHALTED.THREAD )",
+ "MetricExpr": "((FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)) / (2 * CORE_CLKS)",
"MetricGroup": "Cor;Flops;HPC",
"MetricName": "FP_Arith_Utilization",
"PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)."
},
{
- "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). SMT version; use when SMT is enabled and measuring per logical CPU.",
- "MetricExpr": "( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) ) / ( 2 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ) )",
- "MetricGroup": "Cor;Flops;HPC_SMT",
- "MetricName": "FP_Arith_Utilization_SMT",
- "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common). SMT version; use when SMT is enabled and measuring per logical CPU."
- },
- {
"BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core",
- "MetricExpr": "UOPS_EXECUTED.THREAD / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 ) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)",
+ "MetricExpr": "UOPS_EXECUTED.THREAD / ((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)",
"MetricGroup": "Backend;Cor;Pipeline;PortsUtil",
"MetricName": "ILP"
},
{
"BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core",
- "MetricExpr": "( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )",
+ "MetricExpr": "((CPU_CLK_UNHALTED.THREAD / 2) * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK)) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2) if #SMT_on else CLKS",
"MetricGroup": "SMT",
"MetricName": "CORE_CLKS"
},
@@ -193,13 +656,13 @@
},
{
"BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)",
- "MetricExpr": "INST_RETIRED.ANY / ( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )",
+ "MetricExpr": "INST_RETIRED.ANY / (1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)",
"MetricGroup": "Flops;InsType",
"MetricName": "IpFLOP"
},
{
"BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)",
- "MetricExpr": "INST_RETIRED.ANY / ( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) )",
+ "MetricExpr": "INST_RETIRED.ANY / ((FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE))",
"MetricGroup": "Flops;InsType",
"MetricName": "IpArith",
"PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW."
@@ -220,22 +683,22 @@
},
{
"BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)",
- "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE )",
+ "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE)",
"MetricGroup": "Flops;FpVector;InsType",
"MetricName": "IpArith_AVX128",
"PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
},
{
"BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)",
- "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )",
+ "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)",
"MetricGroup": "Flops;FpVector;InsType",
"MetricName": "IpArith_AVX256",
"PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
},
{
- "BriefDescription": "Total number of retired Instructions, Sample with: INST_RETIRED.PREC_DIST",
+ "BriefDescription": "Total number of retired Instructions Sample with: INST_RETIRED.PREC_DIST",
"MetricExpr": "INST_RETIRED.ANY",
- "MetricGroup": "Summary;TmaL1",
+ "MetricGroup": "Summary;tma_L1_group",
"MetricName": "Instructions"
},
{
@@ -252,7 +715,7 @@
},
{
"BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)",
- "MetricExpr": "IDQ.DSB_UOPS / (( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS ) )",
+ "MetricExpr": "IDQ.DSB_UOPS / ((IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS))",
"MetricGroup": "DSB;Fed;FetchBW",
"MetricName": "DSB_Coverage"
},
@@ -264,84 +727,72 @@
},
{
"BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)",
- "MetricExpr": " ( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) * (BR_MISP_RETIRED.ALL_BRANCHES * (12 * ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY ) / CPU_CLK_UNHALTED.THREAD) / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY )) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) ) * (4 * CPU_CLK_UNHALTED.THREAD) / BR_MISP_RETIRED.ALL_BRANCHES",
+ "MetricExpr": " (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * SLOTS / BR_MISP_RETIRED.ALL_BRANCHES",
"MetricGroup": "Bad;BrMispredicts",
"MetricName": "Branch_Misprediction_Cost"
},
{
- "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)",
- "MetricExpr": " ( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * (BR_MISP_RETIRED.ALL_BRANCHES * (12 * ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY ) / CPU_CLK_UNHALTED.THREAD) / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY )) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) ) * (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )) / BR_MISP_RETIRED.ALL_BRANCHES",
- "MetricGroup": "Bad;BrMispredicts_SMT",
- "MetricName": "Branch_Misprediction_Cost_SMT"
- },
- {
"BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
- "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb )",
+ "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb)",
"MetricGroup": "Mem;MemoryBound;MemoryLat",
"MetricName": "Load_Miss_Real_Latency"
},
{
"BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)",
"MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
- "MetricGroup": "Mem;MemoryBound;MemoryBW",
+ "MetricGroup": "Mem;MemoryBW;MemoryBound",
"MetricName": "MLP"
},
{
"BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
"MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY",
- "MetricGroup": "Mem;CacheMisses",
+ "MetricGroup": "CacheMisses;Mem",
"MetricName": "L1MPKI"
},
{
"BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads",
"MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY",
- "MetricGroup": "Mem;Backend;CacheMisses",
+ "MetricGroup": "Backend;CacheMisses;Mem",
"MetricName": "L2MPKI"
},
{
"BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)",
"MetricExpr": "1000 * L2_RQSTS.MISS / INST_RETIRED.ANY",
- "MetricGroup": "Mem;CacheMisses;Offcore",
+ "MetricGroup": "CacheMisses;Mem;Offcore",
"MetricName": "L2MPKI_All"
},
{
"BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)",
"MetricExpr": "1000 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY",
- "MetricGroup": "Mem;CacheMisses",
+ "MetricGroup": "CacheMisses;Mem",
"MetricName": "L2MPKI_Load"
},
{
"BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)",
- "MetricExpr": "1000 * ( L2_RQSTS.REFERENCES - L2_RQSTS.MISS ) / INST_RETIRED.ANY",
- "MetricGroup": "Mem;CacheMisses",
+ "MetricExpr": "1000 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY",
+ "MetricGroup": "CacheMisses;Mem",
"MetricName": "L2HPKI_All"
},
{
"BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)",
"MetricExpr": "1000 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY",
- "MetricGroup": "Mem;CacheMisses",
+ "MetricGroup": "CacheMisses;Mem",
"MetricName": "L2HPKI_Load"
},
{
"BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
"MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY",
- "MetricGroup": "Mem;CacheMisses",
+ "MetricGroup": "CacheMisses;Mem",
"MetricName": "L3MPKI"
},
{
"BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
"MetricConstraint": "NO_NMI_WATCHDOG",
- "MetricExpr": "( cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * ( DTLB_STORE_MISSES.WALK_COMPLETED + DTLB_LOAD_MISSES.WALK_COMPLETED + ITLB_MISSES.WALK_COMPLETED ) ) / CPU_CLK_UNHALTED.THREAD",
+ "MetricExpr": "(cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * (DTLB_STORE_MISSES.WALK_COMPLETED + DTLB_LOAD_MISSES.WALK_COMPLETED + ITLB_MISSES.WALK_COMPLETED)) / CORE_CLKS",
"MetricGroup": "Mem;MemoryTLB",
"MetricName": "Page_Walks_Utilization"
},
{
- "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
- "MetricExpr": "( cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * ( DTLB_STORE_MISSES.WALK_COMPLETED + DTLB_LOAD_MISSES.WALK_COMPLETED + ITLB_MISSES.WALK_COMPLETED ) ) / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )",
- "MetricGroup": "Mem;MemoryTLB_SMT",
- "MetricName": "Page_Walks_Utilization_SMT"
- },
- {
"BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]",
"MetricExpr": "64 * L1D.REPLACEMENT / 1000000000 / duration_time",
"MetricGroup": "Mem;MemoryBW",
@@ -361,19 +812,19 @@
},
{
"BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]",
- "MetricExpr": "(64 * L1D.REPLACEMENT / 1000000000 / duration_time)",
+ "MetricExpr": "L1D_Cache_Fill_BW",
"MetricGroup": "Mem;MemoryBW",
"MetricName": "L1D_Cache_Fill_BW_1T"
},
{
"BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]",
- "MetricExpr": "(64 * L2_LINES_IN.ALL / 1000000000 / duration_time)",
+ "MetricExpr": "L2_Cache_Fill_BW",
"MetricGroup": "Mem;MemoryBW",
"MetricName": "L2_Cache_Fill_BW_1T"
},
{
"BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]",
- "MetricExpr": "(64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time)",
+ "MetricExpr": "L3_Cache_Fill_BW",
"MetricGroup": "Mem;MemoryBW",
"MetricName": "L3_Cache_Fill_BW_1T"
},
@@ -391,26 +842,26 @@
},
{
"BriefDescription": "Measured Average Frequency for unhalted processors [GHz]",
- "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC) * msr@tsc@ / 1000000000 / duration_time",
- "MetricGroup": "Summary;Power",
+ "MetricExpr": "Turbo_Utilization * msr@tsc@ / 1000000000 / duration_time",
+ "MetricGroup": "Power;Summary",
"MetricName": "Average_Frequency"
},
{
"BriefDescription": "Giga Floating Point Operations Per Second",
- "MetricExpr": "( ( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / 1000000000 ) / duration_time",
+ "MetricExpr": "((1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1000000000) / duration_time",
"MetricGroup": "Cor;Flops;HPC",
"MetricName": "GFLOPs",
"PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine."
},
{
"BriefDescription": "Average Frequency Utilization relative nominal frequency",
- "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC",
+ "MetricExpr": "CLKS / CPU_CLK_UNHALTED.REF_TSC",
"MetricGroup": "Power",
"MetricName": "Turbo_Utilization"
},
{
"BriefDescription": "Fraction of cycles where both hardware Logical Processors were active",
- "MetricExpr": "1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / ( CPU_CLK_UNHALTED.REF_XCLK_ANY / 2 ) if #SMT_on else 0",
+ "MetricExpr": "1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0",
"MetricGroup": "SMT",
"MetricName": "SMT_2T_Utilization"
},
@@ -428,7 +879,7 @@
},
{
"BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
- "MetricExpr": "64 * ( arb@event\\=0x81\\,umask\\=0x1@ + arb@event\\=0x84\\,umask\\=0x1@ ) / 1000000 / duration_time / 1000",
+ "MetricExpr": "64 * (arb@event\\=0x81\\,umask\\=0x1@ + arb@event\\=0x84\\,umask\\=0x1@) / 1000000 / duration_time / 1000",
"MetricGroup": "HPC;Mem;MemoryBW;SoC",
"MetricName": "DRAM_BW_Use"
},
diff --git a/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json b/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json
index b6fdf5ba2c9a..5a074cf7c77d 100644
--- a/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json
@@ -1,64 +1,556 @@
[
{
"BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend",
- "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)",
- "MetricGroup": "TopdownL1",
- "MetricName": "Frontend_Bound",
- "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound."
+ "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / SLOTS",
+ "MetricGroup": "PGO;TopdownL1;tma_L1_group",
+ "MetricName": "tma_frontend_bound",
+ "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS",
+ "ScaleUnit": "100%"
},
{
- "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. SMT version; use when SMT is enabled and measuring per logical CPU.",
- "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
- "MetricGroup": "TopdownL1_SMT",
- "MetricName": "Frontend_Bound_SMT",
- "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. SMT version; use when SMT is enabled and measuring per logical CPU."
+ "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues",
+ "MetricExpr": "4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / SLOTS",
+ "MetricGroup": "Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group",
+ "MetricName": "tma_fetch_latency",
+ "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: FRONTEND_RETIRED.LATENCY_GE_16_PS;FRONTEND_RETIRED.LATENCY_GE_8_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses",
+ "MetricExpr": "ICACHE.IFDATA_STALL / CLKS",
+ "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_fetch_latency_group",
+ "MetricName": "tma_icache_misses",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses",
+ "MetricExpr": "(14 * ITLB_MISSES.STLB_HIT + cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * ITLB_MISSES.WALK_COMPLETED) / CLKS",
+ "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_fetch_latency_group",
+ "MetricName": "tma_itlb_misses",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers",
+ "MetricExpr": "12 * (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY) / CLKS",
+ "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group",
+ "MetricName": "tma_branch_resteers",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. Branch Resteers estimates the Frontend delay in fetching operations from corrected path; following all sorts of miss-predicted branches. For example; branchy code with lots of miss-predictions might get categorized under Branch Resteers. Note the value of this node may overlap with its siblings. Sample with: BR_MISP_RETIRED.ALL_BRANCHES",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage",
+ "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES * tma_branch_resteers / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY)",
+ "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_branch_resteers_group",
+ "MetricName": "tma_mispredicts_resteers",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears",
+ "MetricExpr": "MACHINE_CLEARS.COUNT * tma_branch_resteers / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY)",
+ "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_branch_resteers_group",
+ "MetricName": "tma_clears_resteers",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears",
+ "MetricExpr": "tma_branch_resteers - tma_mispredicts_resteers - tma_clears_resteers",
+ "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_branch_resteers_group",
+ "MetricName": "tma_unknown_branches",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (First fetch or hitting BPU capacity limit). Sample with: FRONTEND_RETIRED.UNKNOWN_BRANCH",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines",
+ "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / CLKS",
+ "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_fetch_latency_group",
+ "MetricName": "tma_dsb_switches",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)",
+ "MetricExpr": "ILD_STALL.LCP / CLKS",
+ "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group",
+ "MetricName": "tma_lcp",
+ "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)",
+ "MetricExpr": "2 * IDQ.MS_SWITCHES / CLKS",
+ "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_fetch_latency_group",
+ "MetricName": "tma_ms_switches",
+ "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues",
+ "MetricExpr": "tma_frontend_bound - tma_fetch_latency",
+ "MetricGroup": "FetchBW;Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group",
+ "MetricName": "tma_fetch_bandwidth",
+ "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)",
+ "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / CORE_CLKS / 2",
+ "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_fetch_bandwidth_group",
+ "MetricName": "tma_mite",
+ "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline",
+ "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / CORE_CLKS / 2",
+ "MetricGroup": "DSB;FetchBW;TopdownL3;tma_fetch_bandwidth_group",
+ "MetricName": "tma_dsb",
+ "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.",
+ "ScaleUnit": "100%"
},
{
"BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations",
- "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)",
- "MetricGroup": "TopdownL1",
- "MetricName": "Bad_Speculation",
- "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example."
+ "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ((INT_MISC.RECOVERY_CYCLES_ANY / 2) if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / SLOTS",
+ "MetricGroup": "TopdownL1;tma_L1_group",
+ "MetricName": "tma_bad_speculation",
+ "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
+ "ScaleUnit": "100%"
},
{
- "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations. SMT version; use when SMT is enabled and measuring per logical CPU.",
- "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
- "MetricGroup": "TopdownL1_SMT",
- "MetricName": "Bad_Speculation_SMT",
- "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example. SMT version; use when SMT is enabled and measuring per logical CPU."
+ "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction",
+ "MetricExpr": "(BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * tma_bad_speculation",
+ "MetricGroup": "BadSpec;BrMispredicts;TopdownL2;tma_L2_group;tma_bad_speculation_group",
+ "MetricName": "tma_branch_mispredicts",
+ "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: TOPDOWN.BR_MISPREDICT_SLOTS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears",
+ "MetricExpr": "tma_bad_speculation - tma_branch_mispredicts",
+ "MetricGroup": "BadSpec;MachineClears;TopdownL2;tma_L2_group;tma_bad_speculation_group",
+ "MetricName": "tma_machine_clears",
+ "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT",
+ "ScaleUnit": "100%"
},
{
"BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend",
- "MetricConstraint": "NO_NMI_WATCHDOG",
- "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) )",
- "MetricGroup": "TopdownL1",
- "MetricName": "Backend_Bound",
- "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound."
+ "MetricExpr": "1 - (tma_frontend_bound + tma_bad_speculation + tma_retiring)",
+ "MetricGroup": "TopdownL1;tma_L1_group",
+ "MetricName": "tma_backend_bound",
+ "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. Sample with: TOPDOWN.BACKEND_BOUND_SLOTS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck",
+ "MetricExpr": "((CYCLE_ACTIVITY.STALLS_MEM_ANY + RESOURCE_STALLS.SB) / (CYCLE_ACTIVITY.STALLS_TOTAL + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if (IPC > 1.8) else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else RESOURCE_STALLS.SB)) * tma_backend_bound",
+ "MetricGroup": "Backend;TopdownL2;tma_L2_group;tma_backend_bound_group",
+ "MetricName": "tma_memory_bound",
+ "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache",
+ "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / CLKS, 0)",
+ "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group",
+ "MetricName": "tma_l1_bound",
+ "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses",
+ "MetricExpr": "(8 * DTLB_LOAD_MISSES.STLB_HIT + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * DTLB_LOAD_MISSES.WALK_COMPLETED) / CLKS",
+ "MetricGroup": "MemoryTLB;TopdownL4;tma_l1_bound_group",
+ "MetricName": "tma_dtlb_load",
+ "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores",
+ "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / CLKS",
+ "MetricGroup": "TopdownL4;tma_l1_bound_group",
+ "MetricName": "tma_store_fwd_blk",
+ "PublicDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores. To streamline memory operations in the pipeline; a load can avoid waiting for memory if a prior in-flight store is writing the data that the load wants to read (store forwarding process). However; in some cases the load may be blocked for a significant time pending the store forward. For example; when the prior store is writing a smaller region than the load is reading.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations",
+ "MetricExpr": "(MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO) / CLKS",
+ "MetricGroup": "Offcore;TopdownL4;tma_l1_bound_group",
+ "MetricName": "tma_lock_latency",
+ "PublicDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations. Due to the microarchitecture handling of locks; they are classified as L1_Bound regardless of what memory source satisfied them. Sample with: MEM_INST_RETIRED.LOCK_LOADS_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary",
+ "MetricExpr": "Load_Miss_Real_Latency * LD_BLOCKS.NO_SR / CLKS",
+ "MetricGroup": "TopdownL4;tma_l1_bound_group",
+ "MetricName": "tma_split_loads",
+ "PublicDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary. Sample with: MEM_INST_RETIRED.SPLIT_LOADS_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset",
+ "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / CLKS",
+ "MetricGroup": "TopdownL4;tma_l1_bound_group",
+ "MetricName": "tma_4k_aliasing",
+ "PublicDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset. False match is possible; which incur a few cycles load re-issue. However; the short re-issue duration is often hidden by the out-of-order core and HW optimizations; hence a user may safely ignore a high value of this metric unless it manages to propagate up into parent nodes of the hierarchy (e.g. to L1_Bound).",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed",
+ "MetricExpr": "Load_Miss_Real_Latency * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / CLKS",
+ "MetricGroup": "MemoryBW;TopdownL4;tma_l1_bound_group",
+ "MetricName": "tma_fb_full",
+ "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory).",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads",
+ "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / CLKS",
+ "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group",
+ "MetricName": "tma_l2_bound",
+ "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L2_HIT_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core",
+ "MetricExpr": "(MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS)) * CYCLE_ACTIVITY.STALLS_L2_MISS / CLKS",
+ "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group",
+ "MetricName": "tma_l3_bound",
+ "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses",
+ "MetricExpr": "(60 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_RETIRED.L3_MISS))) + 43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_RETIRED.L3_MISS)))) / CLKS",
+ "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_l3_bound_group",
+ "MetricName": "tma_contested_accesses",
+ "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses. Contested accesses occur when data written by one Logical Processor are read by another Logical Processor on a different Physical Core. Examples of contested accesses include synchronizations such as locks; true data sharing such as modified locked variables; and false sharing. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD;MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses",
+ "MetricExpr": "43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_RETIRED.L3_MISS))) / CLKS",
+ "MetricGroup": "Offcore;Snoop;TopdownL4;tma_l3_bound_group",
+ "MetricName": "tma_data_sharing",
+ "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses. Data shared by multiple Logical Processors (even just read shared) may cause increased access latency due to cache coherency. Excessive data sharing can drastically harm multithreaded performance. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
+ "MetricExpr": "29 * (MEM_LOAD_UOPS_RETIRED.L3_HIT * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_RETIRED.L3_MISS))) / CLKS",
+ "MetricGroup": "MemoryLat;TopdownL4;tma_l3_bound_group",
+ "MetricName": "tma_l3_hit_latency",
+ "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)",
+ "MetricExpr": "((OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2) if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / CORE_CLKS",
+ "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_l3_bound_group",
+ "MetricName": "tma_sq_full",
+ "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). The Super Queue is used for requests to access the L2 cache or to go out to the Uncore.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads",
+ "MetricExpr": "(1 - (MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS))) * CYCLE_ACTIVITY.STALLS_L2_MISS / CLKS",
+ "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group",
+ "MetricName": "tma_dram_bound",
+ "PublicDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_MISS_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)",
+ "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / CLKS",
+ "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_dram_bound_group",
+ "MetricName": "tma_mem_bandwidth",
+ "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that).",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)",
+ "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / CLKS - tma_mem_bandwidth",
+ "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_dram_bound_group",
+ "MetricName": "tma_mem_latency",
+ "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that).",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write",
+ "MetricExpr": "RESOURCE_STALLS.SB / CLKS",
+ "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group",
+ "MetricName": "tma_store_bound",
+ "PublicDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck. Sample with: MEM_INST_RETIRED.ALL_STORES_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses",
+ "MetricExpr": "((L2_RQSTS.RFO_HIT * 9 * (1 - (MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES))) + (1 - (MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES)) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / CLKS",
+ "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_store_bound_group",
+ "MetricName": "tma_store_latency",
+ "PublicDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses. Store accesses usually less impact out-of-order core performance; however; holding resources for longer time can lead into undesired implications (e.g. contention on L1D fill-buffer entries - see FB_Full)",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing",
+ "MetricExpr": "60 * OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.SNOOP_HITM / CLKS",
+ "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_store_bound_group",
+ "MetricName": "tma_false_sharing",
+ "PublicDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing. False Sharing is a multithreading hiccup; where multiple Logical Processors contend on different data-elements mapped into the same cache line. Sample with: OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents rate of split store accesses",
+ "MetricExpr": "2 * MEM_UOPS_RETIRED.SPLIT_STORES / CORE_CLKS",
+ "MetricGroup": "TopdownL4;tma_store_bound_group",
+ "MetricName": "tma_split_stores",
+ "PublicDescription": "This metric represents rate of split store accesses. Consider aligning your data to the 64-byte cache line granularity. Sample with: MEM_INST_RETIRED.SPLIT_STORES_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses",
+ "MetricExpr": "(8 * DTLB_STORE_MISSES.STLB_HIT + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * DTLB_STORE_MISSES.WALK_COMPLETED) / CLKS",
+ "MetricGroup": "MemoryTLB;TopdownL4;tma_store_bound_group",
+ "MetricName": "tma_dtlb_store",
+ "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck",
+ "MetricExpr": "tma_backend_bound - tma_memory_bound",
+ "MetricGroup": "Backend;Compute;TopdownL2;tma_L2_group;tma_backend_bound_group",
+ "MetricName": "tma_core_bound",
+ "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
+ "ScaleUnit": "100%"
},
{
- "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. SMT version; use when SMT is enabled and measuring per logical CPU.",
- "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) )",
- "MetricGroup": "TopdownL1_SMT",
- "MetricName": "Backend_Bound_SMT",
- "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. SMT version; use when SMT is enabled and measuring per logical CPU."
+ "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active",
+ "MetricExpr": "ARITH.FPU_DIV_ACTIVE / CORE_CLKS",
+ "MetricGroup": "TopdownL3;tma_core_bound_group",
+ "MetricName": "tma_divider",
+ "PublicDescription": "This metric represents fraction of cycles where the Divider unit was active. Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication. Sample with: ARITH.DIVIDER_ACTIVE",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)",
+ "MetricExpr": "((CYCLE_ACTIVITY.STALLS_TOTAL + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if (IPC > 1.8) else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else RESOURCE_STALLS.SB) - RESOURCE_STALLS.SB - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CLKS",
+ "MetricGroup": "PortsUtil;TopdownL3;tma_core_bound_group",
+ "MetricName": "tma_ports_utilization",
+ "PublicDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related). Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). (2) Contention on some hardware execution unit other than Divider. For example; when there are too many multiply operations.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
+ "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@) / 2 if #SMT_on else (CYCLE_ACTIVITY.STALLS_TOTAL - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else 0) / CORE_CLKS",
+ "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group",
+ "MetricName": "tma_ports_utilized_0",
+ "PublicDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise). Long-latency instructions like divides may contribute to this metric.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
+ "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) / CORE_CLKS",
+ "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group",
+ "MetricName": "tma_ports_utilized_1",
+ "PublicDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). This can be due to heavy data-dependency among software instructions; or over oversubscribing a particular hardware resource. In some other cases with high 1_Port_Utilized and L1_Bound; this metric can point to L1 data-cache latency bottleneck that may not necessarily manifest with complete execution starvation (due to the short L1 latency e.g. walking a linked list) - looking at the assembly can be helpful. Sample with: EXE_ACTIVITY.1_PORTS_UTIL",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
+ "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / CORE_CLKS",
+ "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group",
+ "MetricName": "tma_ports_utilized_2",
+ "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop. Sample with: EXE_ACTIVITY.2_PORTS_UTIL",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
+ "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / CORE_CLKS",
+ "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group",
+ "MetricName": "tma_ports_utilized_3m",
+ "PublicDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Sample with: UOPS_EXECUTED.CYCLES_GE_3",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.",
+ "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / (4 * CORE_CLKS)",
+ "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group",
+ "MetricName": "tma_alu_op_utilization",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch) Sample with: UOPS_DISPATCHED.PORT_0",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / CORE_CLKS",
+ "MetricGroup": "Compute;TopdownL6;tma_alu_op_utilization_group",
+ "MetricName": "tma_port_0",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU) Sample with: UOPS_DISPATCHED.PORT_1",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / CORE_CLKS",
+ "MetricGroup": "TopdownL6;tma_alu_op_utilization_group",
+ "MetricName": "tma_port_1",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU)",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / CORE_CLKS",
+ "MetricGroup": "TopdownL6;tma_alu_op_utilization_group",
+ "MetricName": "tma_port_5",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU) Sample with: UOPS_DISPATCHED.PORT_6",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / CORE_CLKS",
+ "MetricGroup": "TopdownL6;tma_alu_op_utilization_group",
+ "MetricName": "tma_port_6",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations Sample with: UOPS_DISPATCHED.PORT_2_3_10",
+ "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * CORE_CLKS)",
+ "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group",
+ "MetricName": "tma_load_op_utilization",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 2 ([SNB+]Loads and Store-address; [ICL+] Loads)",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / CORE_CLKS",
+ "MetricGroup": "TopdownL6;tma_load_op_utilization_group",
+ "MetricName": "tma_port_2",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 3 ([SNB+]Loads and Store-address; [ICL+] Loads)",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / CORE_CLKS",
+ "MetricGroup": "TopdownL6;tma_load_op_utilization_group",
+ "MetricName": "tma_port_3",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations Sample with: UOPS_DISPATCHED.PORT_7_8",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / CORE_CLKS",
+ "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group",
+ "MetricName": "tma_store_op_utilization",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 4 (Store-data)",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / CORE_CLKS",
+ "MetricGroup": "TopdownL6;tma_store_op_utilization_group",
+ "MetricName": "tma_port_4",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 7 ([HSW+]simple Store-address)",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / CORE_CLKS",
+ "MetricGroup": "TopdownL6;tma_store_op_utilization_group",
+ "MetricName": "tma_port_7",
+ "ScaleUnit": "100%"
},
{
"BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired",
- "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)",
- "MetricGroup": "TopdownL1",
- "MetricName": "Retiring",
- "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. "
+ "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / SLOTS",
+ "MetricGroup": "TopdownL1;tma_L1_group",
+ "MetricName": "tma_retiring",
+ "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.SLOTS",
+ "ScaleUnit": "100%"
},
{
- "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. SMT version; use when SMT is enabled and measuring per logical CPU.",
- "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
- "MetricGroup": "TopdownL1_SMT",
- "MetricName": "Retiring_SMT",
- "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. SMT version; use when SMT is enabled and measuring per logical CPU."
+ "BriefDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation)",
+ "MetricExpr": "tma_retiring - tma_heavy_operations",
+ "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group",
+ "MetricName": "tma_light_operations",
+ "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired)",
+ "MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector",
+ "MetricGroup": "HPC;TopdownL3;tma_light_operations_group",
+ "MetricName": "tma_fp_arith",
+ "PublicDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired). Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain and FMA double-counting.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric serves as an approximation of legacy x87 usage",
+ "MetricExpr": "INST_RETIRED.X87 * UPI / UOPS_RETIRED.RETIRE_SLOTS",
+ "MetricGroup": "Compute;TopdownL4;tma_fp_arith_group",
+ "MetricName": "tma_x87_use",
+ "PublicDescription": "This metric serves as an approximation of legacy x87 usage. It accounts for instructions beyond X87 FP arithmetic operations; hence may be used as a thermometer to avoid X87 high usage and preferably upgrade to modern ISA. See Tip under Tuning Hint.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired",
+ "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) / UOPS_RETIRED.RETIRE_SLOTS",
+ "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group",
+ "MetricName": "tma_fp_scalar",
+ "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths",
+ "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / UOPS_RETIRED.RETIRE_SLOTS",
+ "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group",
+ "MetricName": "tma_fp_vector",
+ "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors",
+ "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE) / UOPS_RETIRED.RETIRE_SLOTS",
+ "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group",
+ "MetricName": "tma_fp_vector_128b",
+ "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors. May overcount due to FMA double counting.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors",
+ "MetricExpr": "(FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / UOPS_RETIRED.RETIRE_SLOTS",
+ "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group",
+ "MetricName": "tma_fp_vector_256b",
+ "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors. May overcount due to FMA double counting.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences",
+ "MetricExpr": "tma_microcode_sequencer",
+ "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group",
+ "MetricName": "tma_heavy_operations",
+ "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences. This highly-correlates with the uop length of these instructions/sequences.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit",
+ "MetricExpr": "(UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY) * IDQ.MS_UOPS / SLOTS",
+ "MetricGroup": "MicroSeq;TopdownL3;tma_heavy_operations_group",
+ "MetricName": "tma_microcode_sequencer",
+ "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: UOPS_RETIRED.MS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists",
+ "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / SLOTS",
+ "MetricGroup": "TopdownL4;tma_microcode_sequencer_group",
+ "MetricName": "tma_assists",
+ "PublicDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists. Assists are long sequences of uops that are required in certain corner-cases for operations that cannot be handled natively by the execution pipeline. For example; when working with very small floating point values (so-called Denormals); the FP units are not set up to perform these operations natively. Instead; a sequence of instructions to perform the computation on the Denormals is injected into the pipeline. Since these microcode sequences might be dozens of uops long; Assists can be extremely deleterious to performance and they can be avoided in many cases. Sample with: ASSISTS.ANY",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction",
+ "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)",
+ "MetricGroup": "TopdownL4;tma_microcode_sequencer_group",
+ "MetricName": "tma_cisc",
+ "PublicDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction. A CISC instruction has multiple uops that are required to perform the instruction's functionality as in the case of read-modify-write as an example. Since these instructions require multiple uops they may or may not imply sub-optimal use of machine resources.",
+ "ScaleUnit": "100%"
},
{
"BriefDescription": "Instructions Per Cycle (per Logical Processor)",
- "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD",
+ "MetricExpr": "INST_RETIRED.ANY / CLKS",
"MetricGroup": "Ret;Summary",
"MetricName": "IPC"
},
@@ -76,8 +568,8 @@
},
{
"BriefDescription": "Cycles Per Instruction (per Logical Processor)",
- "MetricExpr": "1 / (INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD)",
- "MetricGroup": "Pipeline;Mem",
+ "MetricExpr": "1 / IPC",
+ "MetricGroup": "Mem;Pipeline",
"MetricName": "CPI"
},
{
@@ -88,17 +580,11 @@
},
{
"BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)",
- "MetricExpr": "4 * CPU_CLK_UNHALTED.THREAD",
- "MetricGroup": "TmaL1",
+ "MetricExpr": "4 * CORE_CLKS",
+ "MetricGroup": "tma_L1_group",
"MetricName": "SLOTS"
},
{
- "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)",
- "MetricExpr": "4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )",
- "MetricGroup": "TmaL1_SMT",
- "MetricName": "SLOTS_SMT"
- },
- {
"BriefDescription": "The ratio of Executed- by Issued-Uops",
"MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY",
"MetricGroup": "Cor;Pipeline",
@@ -107,51 +593,32 @@
},
{
"BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)",
- "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD",
- "MetricGroup": "Ret;SMT;TmaL1",
+ "MetricExpr": "INST_RETIRED.ANY / CORE_CLKS",
+ "MetricGroup": "Ret;SMT;tma_L1_group",
"MetricName": "CoreIPC"
},
{
- "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)",
- "MetricExpr": "INST_RETIRED.ANY / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )",
- "MetricGroup": "Ret;SMT;TmaL1_SMT",
- "MetricName": "CoreIPC_SMT"
- },
- {
"BriefDescription": "Floating Point Operations Per Cycle",
- "MetricExpr": "( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / CPU_CLK_UNHALTED.THREAD",
- "MetricGroup": "Ret;Flops",
+ "MetricExpr": "(1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / CORE_CLKS",
+ "MetricGroup": "Flops;Ret",
"MetricName": "FLOPc"
},
{
- "BriefDescription": "Floating Point Operations Per Cycle",
- "MetricExpr": "( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )",
- "MetricGroup": "Ret;Flops_SMT",
- "MetricName": "FLOPc_SMT"
- },
- {
"BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)",
- "MetricExpr": "( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) ) / ( 2 * CPU_CLK_UNHALTED.THREAD )",
+ "MetricExpr": "((FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)) / (2 * CORE_CLKS)",
"MetricGroup": "Cor;Flops;HPC",
"MetricName": "FP_Arith_Utilization",
"PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)."
},
{
- "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). SMT version; use when SMT is enabled and measuring per logical CPU.",
- "MetricExpr": "( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) ) / ( 2 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ) )",
- "MetricGroup": "Cor;Flops;HPC_SMT",
- "MetricName": "FP_Arith_Utilization_SMT",
- "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common). SMT version; use when SMT is enabled and measuring per logical CPU."
- },
- {
"BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core",
- "MetricExpr": "UOPS_EXECUTED.THREAD / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 ) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)",
+ "MetricExpr": "UOPS_EXECUTED.THREAD / ((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)",
"MetricGroup": "Backend;Cor;Pipeline;PortsUtil",
"MetricName": "ILP"
},
{
"BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core",
- "MetricExpr": "( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )",
+ "MetricExpr": "((CPU_CLK_UNHALTED.THREAD / 2) * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK)) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2) if #SMT_on else CLKS",
"MetricGroup": "SMT",
"MetricName": "CORE_CLKS"
},
@@ -193,13 +660,13 @@
},
{
"BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)",
- "MetricExpr": "INST_RETIRED.ANY / ( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )",
+ "MetricExpr": "INST_RETIRED.ANY / (1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)",
"MetricGroup": "Flops;InsType",
"MetricName": "IpFLOP"
},
{
"BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)",
- "MetricExpr": "INST_RETIRED.ANY / ( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) )",
+ "MetricExpr": "INST_RETIRED.ANY / ((FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE))",
"MetricGroup": "Flops;InsType",
"MetricName": "IpArith",
"PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW."
@@ -220,22 +687,22 @@
},
{
"BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)",
- "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE )",
+ "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE)",
"MetricGroup": "Flops;FpVector;InsType",
"MetricName": "IpArith_AVX128",
"PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
},
{
"BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)",
- "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )",
+ "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)",
"MetricGroup": "Flops;FpVector;InsType",
"MetricName": "IpArith_AVX256",
"PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
},
{
- "BriefDescription": "Total number of retired Instructions, Sample with: INST_RETIRED.PREC_DIST",
+ "BriefDescription": "Total number of retired Instructions Sample with: INST_RETIRED.PREC_DIST",
"MetricExpr": "INST_RETIRED.ANY",
- "MetricGroup": "Summary;TmaL1",
+ "MetricGroup": "Summary;tma_L1_group",
"MetricName": "Instructions"
},
{
@@ -252,7 +719,7 @@
},
{
"BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)",
- "MetricExpr": "IDQ.DSB_UOPS / (( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS ) )",
+ "MetricExpr": "IDQ.DSB_UOPS / ((IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS))",
"MetricGroup": "DSB;Fed;FetchBW",
"MetricName": "DSB_Coverage"
},
@@ -264,84 +731,72 @@
},
{
"BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)",
- "MetricExpr": " ( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) * (BR_MISP_RETIRED.ALL_BRANCHES * (12 * ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY ) / CPU_CLK_UNHALTED.THREAD) / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY )) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) ) * (4 * CPU_CLK_UNHALTED.THREAD) / BR_MISP_RETIRED.ALL_BRANCHES",
+ "MetricExpr": " (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * SLOTS / BR_MISP_RETIRED.ALL_BRANCHES",
"MetricGroup": "Bad;BrMispredicts",
"MetricName": "Branch_Misprediction_Cost"
},
{
- "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)",
- "MetricExpr": " ( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * (BR_MISP_RETIRED.ALL_BRANCHES * (12 * ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY ) / CPU_CLK_UNHALTED.THREAD) / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY )) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) ) * (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )) / BR_MISP_RETIRED.ALL_BRANCHES",
- "MetricGroup": "Bad;BrMispredicts_SMT",
- "MetricName": "Branch_Misprediction_Cost_SMT"
- },
- {
"BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
- "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb )",
+ "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb)",
"MetricGroup": "Mem;MemoryBound;MemoryLat",
"MetricName": "Load_Miss_Real_Latency"
},
{
"BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)",
"MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
- "MetricGroup": "Mem;MemoryBound;MemoryBW",
+ "MetricGroup": "Mem;MemoryBW;MemoryBound",
"MetricName": "MLP"
},
{
"BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
"MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY",
- "MetricGroup": "Mem;CacheMisses",
+ "MetricGroup": "CacheMisses;Mem",
"MetricName": "L1MPKI"
},
{
"BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads",
"MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY",
- "MetricGroup": "Mem;Backend;CacheMisses",
+ "MetricGroup": "Backend;CacheMisses;Mem",
"MetricName": "L2MPKI"
},
{
"BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)",
"MetricExpr": "1000 * L2_RQSTS.MISS / INST_RETIRED.ANY",
- "MetricGroup": "Mem;CacheMisses;Offcore",
+ "MetricGroup": "CacheMisses;Mem;Offcore",
"MetricName": "L2MPKI_All"
},
{
"BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)",
"MetricExpr": "1000 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY",
- "MetricGroup": "Mem;CacheMisses",
+ "MetricGroup": "CacheMisses;Mem",
"MetricName": "L2MPKI_Load"
},
{
"BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)",
- "MetricExpr": "1000 * ( L2_RQSTS.REFERENCES - L2_RQSTS.MISS ) / INST_RETIRED.ANY",
- "MetricGroup": "Mem;CacheMisses",
+ "MetricExpr": "1000 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY",
+ "MetricGroup": "CacheMisses;Mem",
"MetricName": "L2HPKI_All"
},
{
"BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)",
"MetricExpr": "1000 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY",
- "MetricGroup": "Mem;CacheMisses",
+ "MetricGroup": "CacheMisses;Mem",
"MetricName": "L2HPKI_Load"
},
{
"BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
"MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY",
- "MetricGroup": "Mem;CacheMisses",
+ "MetricGroup": "CacheMisses;Mem",
"MetricName": "L3MPKI"
},
{
"BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
"MetricConstraint": "NO_NMI_WATCHDOG",
- "MetricExpr": "( cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * ( DTLB_STORE_MISSES.WALK_COMPLETED + DTLB_LOAD_MISSES.WALK_COMPLETED + ITLB_MISSES.WALK_COMPLETED ) ) / CPU_CLK_UNHALTED.THREAD",
+ "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION + 7 * ( DTLB_STORE_MISSES.WALK_COMPLETED + DTLB_LOAD_MISSES.WALK_COMPLETED + ITLB_MISSES.WALK_COMPLETED ) ) / ( 2 * (( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ) if #core_wide < 1 else ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else CPU_CLK_UNHALTED.THREAD) )",
"MetricGroup": "Mem;MemoryTLB",
"MetricName": "Page_Walks_Utilization"
},
{
- "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
- "MetricExpr": "( cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * ( DTLB_STORE_MISSES.WALK_COMPLETED + DTLB_LOAD_MISSES.WALK_COMPLETED + ITLB_MISSES.WALK_COMPLETED ) ) / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )",
- "MetricGroup": "Mem;MemoryTLB_SMT",
- "MetricName": "Page_Walks_Utilization_SMT"
- },
- {
"BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]",
"MetricExpr": "64 * L1D.REPLACEMENT / 1000000000 / duration_time",
"MetricGroup": "Mem;MemoryBW",
@@ -361,19 +816,19 @@
},
{
"BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]",
- "MetricExpr": "(64 * L1D.REPLACEMENT / 1000000000 / duration_time)",
+ "MetricExpr": "L1D_Cache_Fill_BW",
"MetricGroup": "Mem;MemoryBW",
"MetricName": "L1D_Cache_Fill_BW_1T"
},
{
"BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]",
- "MetricExpr": "(64 * L2_LINES_IN.ALL / 1000000000 / duration_time)",
+ "MetricExpr": "L2_Cache_Fill_BW",
"MetricGroup": "Mem;MemoryBW",
"MetricName": "L2_Cache_Fill_BW_1T"
},
{
"BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]",
- "MetricExpr": "(64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time)",
+ "MetricExpr": "L3_Cache_Fill_BW",
"MetricGroup": "Mem;MemoryBW",
"MetricName": "L3_Cache_Fill_BW_1T"
},
@@ -391,26 +846,26 @@
},
{
"BriefDescription": "Measured Average Frequency for unhalted processors [GHz]",
- "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC) * msr@tsc@ / 1000000000 / duration_time",
- "MetricGroup": "Summary;Power",
+ "MetricExpr": "Turbo_Utilization * msr@tsc@ / 1000000000 / duration_time",
+ "MetricGroup": "Power;Summary",
"MetricName": "Average_Frequency"
},
{
"BriefDescription": "Giga Floating Point Operations Per Second",
- "MetricExpr": "( ( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / 1000000000 ) / duration_time",
+ "MetricExpr": "((1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1000000000) / duration_time",
"MetricGroup": "Cor;Flops;HPC",
"MetricName": "GFLOPs",
"PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine."
},
{
"BriefDescription": "Average Frequency Utilization relative nominal frequency",
- "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC",
+ "MetricExpr": "CLKS / CPU_CLK_UNHALTED.REF_TSC",
"MetricGroup": "Power",
"MetricName": "Turbo_Utilization"
},
{
"BriefDescription": "Fraction of cycles where both hardware Logical Processors were active",
- "MetricExpr": "1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / ( CPU_CLK_UNHALTED.REF_XCLK_ANY / 2 ) if #SMT_on else 0",
+ "MetricExpr": "1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0",
"MetricGroup": "SMT",
"MetricName": "SMT_2T_Utilization"
},
@@ -428,33 +883,21 @@
},
{
"BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
- "MetricExpr": "( 64 * ( uncore_imc@cas_count_read@ + uncore_imc@cas_count_write@ ) / 1000000000 ) / duration_time",
+ "MetricExpr": "64 * (arb@event\\=0x81\\,umask\\=0x1@ + arb@event\\=0x84\\,umask\\=0x1@) / 1000000 / duration_time / 1000",
"MetricGroup": "HPC;Mem;MemoryBW;SoC",
"MetricName": "DRAM_BW_Use"
},
{
- "BriefDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches",
- "MetricExpr": "1000000000 * ( cbox@event\\=0x36\\,umask\\=0x3\\,filter_opc\\=0x182@ / cbox@event\\=0x35\\,umask\\=0x3\\,filter_opc\\=0x182@ ) / ( cbox_0@event\\=0x0@ / duration_time )",
- "MetricGroup": "Mem;MemoryLat;SoC",
- "MetricName": "MEM_Read_Latency"
- },
- {
- "BriefDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches",
- "MetricExpr": "cbox@event\\=0x36\\,umask\\=0x3\\,filter_opc\\=0x182@ / cbox@event\\=0x36\\,umask\\=0x3\\,filter_opc\\=0x182\\,thresh\\=1@",
- "MetricGroup": "Mem;MemoryBW;SoC",
- "MetricName": "MEM_Parallel_Reads"
- },
- {
- "BriefDescription": "Socket actual clocks when any core is active on that socket",
- "MetricExpr": "cbox_0@event\\=0x0@",
- "MetricGroup": "SoC",
- "MetricName": "Socket_CLKS"
+ "BriefDescription": "Average latency of all requests to external memory (in Uncore cycles)",
+ "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / arb@event\\=0x81\\,umask\\=0x1@",
+ "MetricGroup": "Mem;SoC",
+ "MetricName": "MEM_Request_Latency"
},
{
- "BriefDescription": "Uncore frequency per die [GHZ]",
- "MetricExpr": "cbox_0@event\\=0x0@ / #num_dies / duration_time / 1000000000",
- "MetricGroup": "SoC",
- "MetricName": "UNCORE_FREQ"
+ "BriefDescription": "Average number of parallel requests to external memory. Accounts for all requests",
+ "MetricExpr": "UNC_ARB_TRK_OCCUPANCY.ALL / arb@event\\=0x81\\,umask\\=0x1@",
+ "MetricGroup": "Mem;SoC",
+ "MetricName": "MEM_Parallel_Requests"
},
{
"BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]",
diff --git a/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json b/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json
index a3a15ee52841..e89fa536ca03 100644
--- a/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json
@@ -1,64 +1,576 @@
[
{
"BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend",
- "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)",
- "MetricGroup": "TopdownL1",
- "MetricName": "Frontend_Bound",
- "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound."
+ "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / SLOTS",
+ "MetricGroup": "PGO;TopdownL1;tma_L1_group",
+ "MetricName": "tma_frontend_bound",
+ "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.",
+ "ScaleUnit": "100%"
},
{
- "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. SMT version; use when SMT is enabled and measuring per logical CPU.",
- "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
- "MetricGroup": "TopdownL1_SMT",
- "MetricName": "Frontend_Bound_SMT",
- "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. SMT version; use when SMT is enabled and measuring per logical CPU."
+ "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues",
+ "MetricExpr": "4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / SLOTS",
+ "MetricGroup": "Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group",
+ "MetricName": "tma_fetch_latency",
+ "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: RS_EVENTS.EMPTY_END",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.",
+ "MetricExpr": "ICACHE.IFDATA_STALL / CLKS",
+ "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_fetch_latency_group",
+ "MetricName": "tma_icache_misses",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses",
+ "MetricExpr": "(14 * ITLB_MISSES.STLB_HIT + cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * ITLB_MISSES.WALK_COMPLETED) / CLKS",
+ "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_fetch_latency_group",
+ "MetricName": "tma_itlb_misses",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: ITLB_MISSES.WALK_COMPLETED",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers",
+ "MetricExpr": "12 * (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY) / CLKS",
+ "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group",
+ "MetricName": "tma_branch_resteers",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. Branch Resteers estimates the Frontend delay in fetching operations from corrected path; following all sorts of miss-predicted branches. For example; branchy code with lots of miss-predictions might get categorized under Branch Resteers. Note the value of this node may overlap with its siblings. Sample with: BR_MISP_RETIRED.ALL_BRANCHES",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. ",
+ "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES * tma_branch_resteers / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY)",
+ "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_branch_resteers_group",
+ "MetricName": "tma_mispredicts_resteers",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears. ",
+ "MetricExpr": "MACHINE_CLEARS.COUNT * tma_branch_resteers / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY)",
+ "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_branch_resteers_group",
+ "MetricName": "tma_clears_resteers",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears",
+ "MetricExpr": "tma_branch_resteers - tma_mispredicts_resteers - tma_clears_resteers",
+ "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_branch_resteers_group",
+ "MetricName": "tma_unknown_branches",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (First fetch or hitting BPU capacity limit). Sample with: BACLEARS.ANY",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines",
+ "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / CLKS",
+ "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_fetch_latency_group",
+ "MetricName": "tma_dsb_switches",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)",
+ "MetricExpr": "ILD_STALL.LCP / CLKS",
+ "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group",
+ "MetricName": "tma_lcp",
+ "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)",
+ "MetricExpr": "2 * IDQ.MS_SWITCHES / CLKS",
+ "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_fetch_latency_group",
+ "MetricName": "tma_ms_switches",
+ "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues",
+ "MetricExpr": "tma_frontend_bound - tma_fetch_latency",
+ "MetricGroup": "FetchBW;Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group",
+ "MetricName": "tma_fetch_bandwidth",
+ "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)",
+ "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / CORE_CLKS / 2",
+ "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_fetch_bandwidth_group",
+ "MetricName": "tma_mite",
+ "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline",
+ "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / CORE_CLKS / 2",
+ "MetricGroup": "DSB;FetchBW;TopdownL3;tma_fetch_bandwidth_group",
+ "MetricName": "tma_dsb",
+ "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.",
+ "ScaleUnit": "100%"
},
{
"BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations",
- "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)",
- "MetricGroup": "TopdownL1",
- "MetricName": "Bad_Speculation",
- "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example."
+ "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ((INT_MISC.RECOVERY_CYCLES_ANY / 2) if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / SLOTS",
+ "MetricGroup": "TopdownL1;tma_L1_group",
+ "MetricName": "tma_bad_speculation",
+ "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction",
+ "MetricExpr": "(BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * tma_bad_speculation",
+ "MetricGroup": "BadSpec;BrMispredicts;TopdownL2;tma_L2_group;tma_bad_speculation_group",
+ "MetricName": "tma_branch_mispredicts",
+ "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES",
+ "ScaleUnit": "100%"
},
{
- "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations. SMT version; use when SMT is enabled and measuring per logical CPU.",
- "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
- "MetricGroup": "TopdownL1_SMT",
- "MetricName": "Bad_Speculation_SMT",
- "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example. SMT version; use when SMT is enabled and measuring per logical CPU."
+ "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears",
+ "MetricExpr": "tma_bad_speculation - tma_branch_mispredicts",
+ "MetricGroup": "BadSpec;MachineClears;TopdownL2;tma_L2_group;tma_bad_speculation_group",
+ "MetricName": "tma_machine_clears",
+ "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT",
+ "ScaleUnit": "100%"
},
{
"BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend",
- "MetricConstraint": "NO_NMI_WATCHDOG",
- "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)) )",
- "MetricGroup": "TopdownL1",
- "MetricName": "Backend_Bound",
- "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound."
+ "MetricExpr": "1 - (tma_frontend_bound + tma_bad_speculation + tma_retiring)",
+ "MetricGroup": "TopdownL1;tma_L1_group",
+ "MetricName": "tma_backend_bound",
+ "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck",
+ "MetricExpr": "((CYCLE_ACTIVITY.STALLS_MEM_ANY + RESOURCE_STALLS.SB) / (CYCLE_ACTIVITY.STALLS_TOTAL + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if (IPC > 1.8) else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else RESOURCE_STALLS.SB)) * tma_backend_bound",
+ "MetricGroup": "Backend;TopdownL2;tma_L2_group;tma_backend_bound_group",
+ "MetricName": "tma_memory_bound",
+ "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache",
+ "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / CLKS, 0)",
+ "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group",
+ "MetricName": "tma_l1_bound",
+ "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_UOPS_RETIRED.L1_HIT_PS;MEM_LOAD_UOPS_RETIRED.HIT_LFB_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses",
+ "MetricExpr": "(8 * DTLB_LOAD_MISSES.STLB_HIT + cpu@DTLB_LOAD_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * DTLB_LOAD_MISSES.WALK_COMPLETED) / CLKS",
+ "MetricGroup": "MemoryTLB;TopdownL4;tma_l1_bound_group",
+ "MetricName": "tma_dtlb_load",
+ "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_UOPS_RETIRED.STLB_MISS_LOADS_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores",
+ "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / CLKS",
+ "MetricGroup": "TopdownL4;tma_l1_bound_group",
+ "MetricName": "tma_store_fwd_blk",
+ "PublicDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores. To streamline memory operations in the pipeline; a load can avoid waiting for memory if a prior in-flight store is writing the data that the load wants to read (store forwarding process). However; in some cases the load may be blocked for a significant time pending the store forward. For example; when the prior store is writing a smaller region than the load is reading.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations",
+ "MetricExpr": "(MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO) / CLKS",
+ "MetricGroup": "Offcore;TopdownL4;tma_l1_bound_group",
+ "MetricName": "tma_lock_latency",
+ "PublicDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations. Due to the microarchitecture handling of locks; they are classified as L1_Bound regardless of what memory source satisfied them. Sample with: MEM_UOPS_RETIRED.LOCK_LOADS_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary",
+ "MetricExpr": "Load_Miss_Real_Latency * LD_BLOCKS.NO_SR / CLKS",
+ "MetricGroup": "TopdownL4;tma_l1_bound_group",
+ "MetricName": "tma_split_loads",
+ "PublicDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary. Sample with: MEM_UOPS_RETIRED.SPLIT_LOADS_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset",
+ "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / CLKS",
+ "MetricGroup": "TopdownL4;tma_l1_bound_group",
+ "MetricName": "tma_4k_aliasing",
+ "PublicDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset. False match is possible; which incur a few cycles load re-issue. However; the short re-issue duration is often hidden by the out-of-order core and HW optimizations; hence a user may safely ignore a high value of this metric unless it manages to propagate up into parent nodes of the hierarchy (e.g. to L1_Bound).",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed",
+ "MetricExpr": "Load_Miss_Real_Latency * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / CLKS",
+ "MetricGroup": "MemoryBW;TopdownL4;tma_l1_bound_group",
+ "MetricName": "tma_fb_full",
+ "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory).",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads",
+ "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / CLKS",
+ "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group",
+ "MetricName": "tma_l2_bound",
+ "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L2_HIT_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core",
+ "MetricExpr": "(MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS)) * CYCLE_ACTIVITY.STALLS_L2_MISS / CLKS",
+ "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group",
+ "MetricName": "tma_l3_bound",
+ "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses",
+ "MetricExpr": "(60 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) + 43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD)))) / CLKS",
+ "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_l3_bound_group",
+ "MetricName": "tma_contested_accesses",
+ "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses. Contested accesses occur when data written by one Logical Processor are read by another Logical Processor on a different Physical Core. Examples of contested accesses include synchronizations such as locks; true data sharing such as modified locked variables; and false sharing. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM_PS;MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses",
+ "MetricExpr": "43 * (MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / CLKS",
+ "MetricGroup": "Offcore;Snoop;TopdownL4;tma_l3_bound_group",
+ "MetricName": "tma_data_sharing",
+ "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses. Data shared by multiple Logical Processors (even just read shared) may cause increased access latency due to cache coherency. Excessive data sharing can drastically harm multithreaded performance. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
+ "MetricExpr": "41 * (MEM_LOAD_UOPS_RETIRED.L3_HIT * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / CLKS",
+ "MetricGroup": "MemoryLat;TopdownL4;tma_l3_bound_group",
+ "MetricName": "tma_l3_hit_latency",
+ "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_UOPS_RETIRED.L3_HIT_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)",
+ "MetricExpr": "((OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2) if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / CORE_CLKS",
+ "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_l3_bound_group",
+ "MetricName": "tma_sq_full",
+ "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). The Super Queue is used for requests to access the L2 cache or to go out to the Uncore.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads",
+ "MetricExpr": "(1 - (MEM_LOAD_UOPS_RETIRED.L3_HIT / (MEM_LOAD_UOPS_RETIRED.L3_HIT + 7 * MEM_LOAD_UOPS_RETIRED.L3_MISS))) * CYCLE_ACTIVITY.STALLS_L2_MISS / CLKS",
+ "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group",
+ "MetricName": "tma_dram_bound",
+ "PublicDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_RETIRED.L3_MISS_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)",
+ "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / CLKS",
+ "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_dram_bound_group",
+ "MetricName": "tma_mem_bandwidth",
+ "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that).",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)",
+ "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / CLKS - tma_mem_bandwidth",
+ "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_dram_bound_group",
+ "MetricName": "tma_mem_latency",
+ "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that).",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory",
+ "MetricExpr": "200 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / CLKS",
+ "MetricGroup": "Server;TopdownL5;tma_mem_latency_group",
+ "MetricName": "tma_local_dram",
+ "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory. Caching will improve the latency and increase performance. Sample with: MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM_PS",
+ "ScaleUnit": "100%"
},
{
- "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. SMT version; use when SMT is enabled and measuring per logical CPU.",
- "MetricExpr": "1 - ( (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) + (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) + (UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) )",
- "MetricGroup": "TopdownL1_SMT",
- "MetricName": "Backend_Bound_SMT",
- "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. SMT version; use when SMT is enabled and measuring per logical CPU."
+ "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory",
+ "MetricExpr": "310 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / CLKS",
+ "MetricGroup": "Server;Snoop;TopdownL5;tma_mem_latency_group",
+ "MetricName": "tma_remote_dram",
+ "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory. This is caused often due to non-optimal NUMA allocations. #link to NUMA article Sample with: MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues",
+ "MetricExpr": "(200 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) + 180 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD * (1 + mem_load_uops_retired.hit_lfb / ((MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS) + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD)))) / CLKS",
+ "MetricGroup": "Offcore;Server;Snoop;TopdownL5;tma_mem_latency_group",
+ "MetricName": "tma_remote_cache",
+ "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues. This is caused often due to non-optimal NUMA allocations. #link to NUMA article Sample with: MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM_PS;MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write",
+ "MetricExpr": "RESOURCE_STALLS.SB / CLKS",
+ "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group",
+ "MetricName": "tma_store_bound",
+ "PublicDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck. Sample with: MEM_UOPS_RETIRED.ALL_STORES_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses",
+ "MetricExpr": "((L2_RQSTS.RFO_HIT * 9 * (1 - (MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES))) + (1 - (MEM_UOPS_RETIRED.LOCK_LOADS / MEM_UOPS_RETIRED.ALL_STORES)) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / CLKS",
+ "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_store_bound_group",
+ "MetricName": "tma_store_latency",
+ "PublicDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses. Store accesses usually less impact out-of-order core performance; however; holding resources for longer time can lead into undesired implications (e.g. contention on L1D fill-buffer entries - see FB_Full)",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing",
+ "MetricExpr": "(200 * OFFCORE_RESPONSE.DEMAND_RFO.LLC_MISS.REMOTE_HITM + 60 * OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_OTHER_CORE) / CLKS",
+ "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_store_bound_group",
+ "MetricName": "tma_false_sharing",
+ "PublicDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing. False Sharing is a multithreading hiccup; where multiple Logical Processors contend on different data-elements mapped into the same cache line. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM_PS;OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.SNOOP_HITM",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents rate of split store accesses",
+ "MetricExpr": "2 * MEM_UOPS_RETIRED.SPLIT_STORES / CORE_CLKS",
+ "MetricGroup": "TopdownL4;tma_store_bound_group",
+ "MetricName": "tma_split_stores",
+ "PublicDescription": "This metric represents rate of split store accesses. Consider aligning your data to the 64-byte cache line granularity. Sample with: MEM_UOPS_RETIRED.SPLIT_STORES_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses",
+ "MetricExpr": "(8 * DTLB_STORE_MISSES.STLB_HIT + cpu@DTLB_STORE_MISSES.WALK_DURATION\\,cmask\\=1@ + 7 * DTLB_STORE_MISSES.WALK_COMPLETED) / CLKS",
+ "MetricGroup": "MemoryTLB;TopdownL4;tma_store_bound_group",
+ "MetricName": "tma_dtlb_store",
+ "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_UOPS_RETIRED.STLB_MISS_STORES_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck",
+ "MetricExpr": "tma_backend_bound - tma_memory_bound",
+ "MetricGroup": "Backend;Compute;TopdownL2;tma_L2_group;tma_backend_bound_group",
+ "MetricName": "tma_core_bound",
+ "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active",
+ "MetricExpr": "ARITH.FPU_DIV_ACTIVE / CORE_CLKS",
+ "MetricGroup": "TopdownL3;tma_core_bound_group",
+ "MetricName": "tma_divider",
+ "PublicDescription": "This metric represents fraction of cycles where the Divider unit was active. Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication. Sample with: ARITH.DIVIDER_UOPS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)",
+ "MetricExpr": "((CYCLE_ACTIVITY.STALLS_TOTAL + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if (IPC > 1.8) else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else RESOURCE_STALLS.SB) - RESOURCE_STALLS.SB - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CLKS",
+ "MetricGroup": "PortsUtil;TopdownL3;tma_core_bound_group",
+ "MetricName": "tma_ports_utilization",
+ "PublicDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related). Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). (2) Contention on some hardware execution unit other than Divider. For example; when there are too many multiply operations.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
+ "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@) / 2 if #SMT_on else (CYCLE_ACTIVITY.STALLS_TOTAL - RS_EVENTS.EMPTY_CYCLES if (tma_fetch_latency > 0.1) else 0) / CORE_CLKS",
+ "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group",
+ "MetricName": "tma_ports_utilized_0",
+ "PublicDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise). Long-latency instructions like divides may contribute to this metric.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
+ "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) / CORE_CLKS",
+ "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group",
+ "MetricName": "tma_ports_utilized_1",
+ "PublicDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). This can be due to heavy data-dependency among software instructions; or over oversubscribing a particular hardware resource. In some other cases with high 1_Port_Utilized and L1_Bound; this metric can point to L1 data-cache latency bottleneck that may not necessarily manifest with complete execution starvation (due to the short L1 latency e.g. walking a linked list) - looking at the assembly can be helpful.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
+ "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / CORE_CLKS",
+ "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group",
+ "MetricName": "tma_ports_utilized_2",
+ "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).",
+ "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / CORE_CLKS",
+ "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group",
+ "MetricName": "tma_ports_utilized_3m",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.",
+ "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / (4 * CORE_CLKS)",
+ "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group",
+ "MetricName": "tma_alu_op_utilization",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch) Sample with: UOPS_DISPATCHED_PORT.PORT_0",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / CORE_CLKS",
+ "MetricGroup": "Compute;TopdownL6;tma_alu_op_utilization_group",
+ "MetricName": "tma_port_0",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU) Sample with: UOPS_DISPATCHED_PORT.PORT_1",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / CORE_CLKS",
+ "MetricGroup": "TopdownL6;tma_alu_op_utilization_group",
+ "MetricName": "tma_port_1",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU) Sample with: UOPS_DISPATCHED.PORT_5",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / CORE_CLKS",
+ "MetricGroup": "TopdownL6;tma_alu_op_utilization_group",
+ "MetricName": "tma_port_5",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU) Sample with: UOPS_DISPATCHED_PORT.PORT_6",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / CORE_CLKS",
+ "MetricGroup": "TopdownL6;tma_alu_op_utilization_group",
+ "MetricName": "tma_port_6",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations Sample with: UOPS_DISPATCHED.PORT_2_3",
+ "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * CORE_CLKS)",
+ "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group",
+ "MetricName": "tma_load_op_utilization",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 2 ([SNB+]Loads and Store-address; [ICL+] Loads) Sample with: UOPS_DISPATCHED_PORT.PORT_2",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / CORE_CLKS",
+ "MetricGroup": "TopdownL6;tma_load_op_utilization_group",
+ "MetricName": "tma_port_2",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 3 ([SNB+]Loads and Store-address; [ICL+] Loads) Sample with: UOPS_DISPATCHED_PORT.PORT_3",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / CORE_CLKS",
+ "MetricGroup": "TopdownL6;tma_load_op_utilization_group",
+ "MetricName": "tma_port_3",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / CORE_CLKS",
+ "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group",
+ "MetricName": "tma_store_op_utilization",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 4 (Store-data) Sample with: UOPS_DISPATCHED_PORT.PORT_4",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / CORE_CLKS",
+ "MetricGroup": "TopdownL6;tma_store_op_utilization_group",
+ "MetricName": "tma_port_4",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 7 ([HSW+]simple Store-address) Sample with: UOPS_DISPATCHED_PORT.PORT_7",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / CORE_CLKS",
+ "MetricGroup": "TopdownL6;tma_store_op_utilization_group",
+ "MetricName": "tma_port_7",
+ "ScaleUnit": "100%"
},
{
"BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired",
- "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)",
- "MetricGroup": "TopdownL1",
- "MetricName": "Retiring",
- "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. "
+ "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / SLOTS",
+ "MetricGroup": "TopdownL1;tma_L1_group",
+ "MetricName": "tma_retiring",
+ "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS",
+ "ScaleUnit": "100%"
},
{
- "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. SMT version; use when SMT is enabled and measuring per logical CPU.",
- "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
- "MetricGroup": "TopdownL1_SMT",
- "MetricName": "Retiring_SMT",
- "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. SMT version; use when SMT is enabled and measuring per logical CPU."
+ "BriefDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation)",
+ "MetricExpr": "tma_retiring - tma_heavy_operations",
+ "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group",
+ "MetricName": "tma_light_operations",
+ "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired)",
+ "MetricExpr": "tma_x87_use + tma_fp_scalar + tma_fp_vector",
+ "MetricGroup": "HPC;TopdownL3;tma_light_operations_group",
+ "MetricName": "tma_fp_arith",
+ "PublicDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired). Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain and FMA double-counting.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric serves as an approximation of legacy x87 usage",
+ "MetricExpr": "INST_RETIRED.X87 * UPI / UOPS_RETIRED.RETIRE_SLOTS",
+ "MetricGroup": "Compute;TopdownL4;tma_fp_arith_group",
+ "MetricName": "tma_x87_use",
+ "PublicDescription": "This metric serves as an approximation of legacy x87 usage. It accounts for instructions beyond X87 FP arithmetic operations; hence may be used as a thermometer to avoid X87 high usage and preferably upgrade to modern ISA. See Tip under Tuning Hint.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired",
+ "MetricExpr": "(FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) / UOPS_RETIRED.RETIRE_SLOTS",
+ "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group",
+ "MetricName": "tma_fp_scalar",
+ "PublicDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired. May overcount due to FMA double counting.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths",
+ "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / UOPS_RETIRED.RETIRE_SLOTS",
+ "MetricGroup": "Compute;Flops;TopdownL4;tma_fp_arith_group",
+ "MetricName": "tma_fp_vector",
+ "PublicDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths. May overcount due to FMA double counting.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors",
+ "MetricExpr": "(FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE) / UOPS_RETIRED.RETIRE_SLOTS",
+ "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group",
+ "MetricName": "tma_fp_vector_128b",
+ "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors. May overcount due to FMA double counting.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors",
+ "MetricExpr": "(FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / UOPS_RETIRED.RETIRE_SLOTS",
+ "MetricGroup": "Compute;Flops;TopdownL5;tma_fp_vector_group",
+ "MetricName": "tma_fp_vector_256b",
+ "PublicDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors. May overcount due to FMA double counting.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences",
+ "MetricExpr": "tma_microcode_sequencer",
+ "MetricGroup": "Retire;TopdownL2;tma_L2_group;tma_retiring_group",
+ "MetricName": "tma_heavy_operations",
+ "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences. This highly-correlates with the uop length of these instructions/sequences.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit",
+ "MetricExpr": "(UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY) * IDQ.MS_UOPS / SLOTS",
+ "MetricGroup": "MicroSeq;TopdownL3;tma_heavy_operations_group",
+ "MetricName": "tma_microcode_sequencer",
+ "PublicDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided. Sample with: IDQ.MS_UOPS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists",
+ "MetricExpr": "100 * OTHER_ASSISTS.ANY_WB_ASSIST / SLOTS",
+ "MetricGroup": "TopdownL4;tma_microcode_sequencer_group",
+ "MetricName": "tma_assists",
+ "PublicDescription": "This metric estimates fraction of slots the CPU retired uops delivered by the Microcode_Sequencer as a result of Assists. Assists are long sequences of uops that are required in certain corner-cases for operations that cannot be handled natively by the execution pipeline. For example; when working with very small floating point values (so-called Denormals); the FP units are not set up to perform these operations natively. Instead; a sequence of instructions to perform the computation on the Denormals is injected into the pipeline. Since these microcode sequences might be dozens of uops long; Assists can be extremely deleterious to performance and they can be avoided in many cases. Sample with: OTHER_ASSISTS.ANY",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction",
+ "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)",
+ "MetricGroup": "TopdownL4;tma_microcode_sequencer_group",
+ "MetricName": "tma_cisc",
+ "PublicDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction. A CISC instruction has multiple uops that are required to perform the instruction's functionality as in the case of read-modify-write as an example. Since these instructions require multiple uops they may or may not imply sub-optimal use of machine resources.",
+ "ScaleUnit": "100%"
},
{
"BriefDescription": "Instructions Per Cycle (per Logical Processor)",
- "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD",
+ "MetricExpr": "INST_RETIRED.ANY / CLKS",
"MetricGroup": "Ret;Summary",
"MetricName": "IPC"
},
@@ -75,6 +587,12 @@
"MetricName": "UpTB"
},
{
+ "BriefDescription": "Cycles Per Instruction (per Logical Processor)",
+ "MetricExpr": "1 / IPC",
+ "MetricGroup": "Mem;Pipeline",
+ "MetricName": "CPI"
+ },
+ {
"BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.",
"MetricExpr": "CPU_CLK_UNHALTED.THREAD",
"MetricGroup": "Pipeline",
@@ -82,17 +600,11 @@
},
{
"BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)",
- "MetricExpr": "4 * CPU_CLK_UNHALTED.THREAD",
- "MetricGroup": "TmaL1",
+ "MetricExpr": "4 * CORE_CLKS",
+ "MetricGroup": "tma_L1_group",
"MetricName": "SLOTS"
},
{
- "BriefDescription": "Total issue-pipeline slots (per-Physical Core till ICL; per-Logical Processor ICL onward)",
- "MetricExpr": "4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )",
- "MetricGroup": "TmaL1_SMT",
- "MetricName": "SLOTS_SMT"
- },
- {
"BriefDescription": "The ratio of Executed- by Issued-Uops",
"MetricExpr": "UOPS_EXECUTED.THREAD / UOPS_ISSUED.ANY",
"MetricGroup": "Cor;Pipeline",
@@ -101,51 +613,32 @@
},
{
"BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)",
- "MetricExpr": "INST_RETIRED.ANY / CPU_CLK_UNHALTED.THREAD",
- "MetricGroup": "Ret;SMT;TmaL1",
+ "MetricExpr": "INST_RETIRED.ANY / CORE_CLKS",
+ "MetricGroup": "Ret;SMT;tma_L1_group",
"MetricName": "CoreIPC"
},
{
- "BriefDescription": "Instructions Per Cycle across hyper-threads (per physical core)",
- "MetricExpr": "INST_RETIRED.ANY / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )",
- "MetricGroup": "Ret;SMT;TmaL1_SMT",
- "MetricName": "CoreIPC_SMT"
- },
- {
"BriefDescription": "Floating Point Operations Per Cycle",
- "MetricExpr": "( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / CPU_CLK_UNHALTED.THREAD",
- "MetricGroup": "Ret;Flops",
+ "MetricExpr": "(1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / CORE_CLKS",
+ "MetricGroup": "Flops;Ret",
"MetricName": "FLOPc"
},
{
- "BriefDescription": "Floating Point Operations Per Cycle",
- "MetricExpr": "( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )",
- "MetricGroup": "Ret;Flops_SMT",
- "MetricName": "FLOPc_SMT"
- },
- {
"BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)",
- "MetricExpr": "( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) ) / ( 2 * CPU_CLK_UNHALTED.THREAD )",
+ "MetricExpr": "((FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)) / (2 * CORE_CLKS)",
"MetricGroup": "Cor;Flops;HPC",
"MetricName": "FP_Arith_Utilization",
"PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)."
},
{
- "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). SMT version; use when SMT is enabled and measuring per logical CPU.",
- "MetricExpr": "( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) ) / ( 2 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ) )",
- "MetricGroup": "Cor;Flops;HPC_SMT",
- "MetricName": "FP_Arith_Utilization_SMT",
- "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common). SMT version; use when SMT is enabled and measuring per logical CPU."
- },
- {
"BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core",
- "MetricExpr": "UOPS_EXECUTED.THREAD / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 ) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)",
+ "MetricExpr": "UOPS_EXECUTED.THREAD / ((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)",
"MetricGroup": "Backend;Cor;Pipeline;PortsUtil",
"MetricName": "ILP"
},
{
"BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core",
- "MetricExpr": "( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )",
+ "MetricExpr": "((CPU_CLK_UNHALTED.THREAD / 2) * (1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK)) if #core_wide < 1 else (CPU_CLK_UNHALTED.THREAD_ANY / 2) if #SMT_on else CLKS",
"MetricGroup": "SMT",
"MetricName": "CORE_CLKS"
},
@@ -187,13 +680,13 @@
},
{
"BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)",
- "MetricExpr": "INST_RETIRED.ANY / ( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )",
+ "MetricExpr": "INST_RETIRED.ANY / (1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)",
"MetricGroup": "Flops;InsType",
"MetricName": "IpFLOP"
},
{
"BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)",
- "MetricExpr": "INST_RETIRED.ANY / ( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) )",
+ "MetricExpr": "INST_RETIRED.ANY / ((FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE))",
"MetricGroup": "Flops;InsType",
"MetricName": "IpArith",
"PublicDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate). May undercount due to FMA double counting. Approximated prior to BDW."
@@ -214,22 +707,22 @@
},
{
"BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)",
- "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE )",
+ "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE)",
"MetricGroup": "Flops;FpVector;InsType",
"MetricName": "IpArith_AVX128",
"PublicDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
},
{
"BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)",
- "MetricExpr": "INST_RETIRED.ANY / ( FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE )",
+ "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE)",
"MetricGroup": "Flops;FpVector;InsType",
"MetricName": "IpArith_AVX256",
"PublicDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate). May undercount due to FMA double counting."
},
{
- "BriefDescription": "Total number of retired Instructions, Sample with: INST_RETIRED.PREC_DIST",
+ "BriefDescription": "Total number of retired Instructions Sample with: INST_RETIRED.PREC_DIST",
"MetricExpr": "INST_RETIRED.ANY",
- "MetricGroup": "Summary;TmaL1",
+ "MetricGroup": "Summary;tma_L1_group",
"MetricName": "Instructions"
},
{
@@ -246,7 +739,7 @@
},
{
"BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)",
- "MetricExpr": "IDQ.DSB_UOPS / (( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS ) )",
+ "MetricExpr": "IDQ.DSB_UOPS / ((IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS))",
"MetricGroup": "DSB;Fed;FetchBW",
"MetricName": "DSB_Coverage"
},
@@ -258,84 +751,72 @@
},
{
"BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)",
- "MetricExpr": " ( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) * (BR_MISP_RETIRED.ALL_BRANCHES * (12 * ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY ) / CPU_CLK_UNHALTED.THREAD) / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY )) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) ) * (4 * CPU_CLK_UNHALTED.THREAD) / BR_MISP_RETIRED.ALL_BRANCHES",
+ "MetricExpr": " (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) * SLOTS / BR_MISP_RETIRED.ALL_BRANCHES",
"MetricGroup": "Bad;BrMispredicts",
"MetricName": "Branch_Misprediction_Cost"
},
{
- "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)",
- "MetricExpr": " ( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * (BR_MISP_RETIRED.ALL_BRANCHES * (12 * ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY ) / CPU_CLK_UNHALTED.THREAD) / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY )) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) ) * (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )) / BR_MISP_RETIRED.ALL_BRANCHES",
- "MetricGroup": "Bad;BrMispredicts_SMT",
- "MetricName": "Branch_Misprediction_Cost_SMT"
- },
- {
"BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
- "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb )",
+ "MetricExpr": "L1D_PEND_MISS.PENDING / (MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb)",
"MetricGroup": "Mem;MemoryBound;MemoryLat",
"MetricName": "Load_Miss_Real_Latency"
},
{
"BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)",
"MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
- "MetricGroup": "Mem;MemoryBound;MemoryBW",
+ "MetricGroup": "Mem;MemoryBW;MemoryBound",
"MetricName": "MLP"
},
{
"BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
"MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L1_MISS / INST_RETIRED.ANY",
- "MetricGroup": "Mem;CacheMisses",
+ "MetricGroup": "CacheMisses;Mem",
"MetricName": "L1MPKI"
},
{
"BriefDescription": "L2 cache true misses per kilo instruction for retired demand loads",
"MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L2_MISS / INST_RETIRED.ANY",
- "MetricGroup": "Mem;Backend;CacheMisses",
+ "MetricGroup": "Backend;CacheMisses;Mem",
"MetricName": "L2MPKI"
},
{
"BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)",
"MetricExpr": "1000 * L2_RQSTS.MISS / INST_RETIRED.ANY",
- "MetricGroup": "Mem;CacheMisses;Offcore",
+ "MetricGroup": "CacheMisses;Mem;Offcore",
"MetricName": "L2MPKI_All"
},
{
"BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)",
"MetricExpr": "1000 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY",
- "MetricGroup": "Mem;CacheMisses",
+ "MetricGroup": "CacheMisses;Mem",
"MetricName": "L2MPKI_Load"
},
{
"BriefDescription": "L2 cache hits per kilo instruction for all request types (including speculative)",
- "MetricExpr": "1000 * ( L2_RQSTS.REFERENCES - L2_RQSTS.MISS ) / INST_RETIRED.ANY",
- "MetricGroup": "Mem;CacheMisses",
+ "MetricExpr": "1000 * (L2_RQSTS.REFERENCES - L2_RQSTS.MISS) / INST_RETIRED.ANY",
+ "MetricGroup": "CacheMisses;Mem",
"MetricName": "L2HPKI_All"
},
{
"BriefDescription": "L2 cache hits per kilo instruction for all demand loads (including speculative)",
"MetricExpr": "1000 * L2_RQSTS.DEMAND_DATA_RD_HIT / INST_RETIRED.ANY",
- "MetricGroup": "Mem;CacheMisses",
+ "MetricGroup": "CacheMisses;Mem",
"MetricName": "L2HPKI_Load"
},
{
"BriefDescription": "L3 cache true misses per kilo instruction for retired demand loads",
"MetricExpr": "1000 * MEM_LOAD_UOPS_RETIRED.L3_MISS / INST_RETIRED.ANY",
- "MetricGroup": "Mem;CacheMisses",
+ "MetricGroup": "CacheMisses;Mem",
"MetricName": "L3MPKI"
},
{
"BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
"MetricConstraint": "NO_NMI_WATCHDOG",
- "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION + 7 * ( DTLB_STORE_MISSES.WALK_COMPLETED + DTLB_LOAD_MISSES.WALK_COMPLETED + ITLB_MISSES.WALK_COMPLETED ) ) / ( 2 * CPU_CLK_UNHALTED.THREAD )",
+ "MetricExpr": "(ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION + 7 * (DTLB_STORE_MISSES.WALK_COMPLETED + DTLB_LOAD_MISSES.WALK_COMPLETED + ITLB_MISSES.WALK_COMPLETED)) / (2 * CORE_CLKS)",
"MetricGroup": "Mem;MemoryTLB",
"MetricName": "Page_Walks_Utilization"
},
{
- "BriefDescription": "Utilization of the core's Page Walker(s) serving STLB misses triggered by instruction/Load/Store accesses",
- "MetricExpr": "( ITLB_MISSES.WALK_DURATION + DTLB_LOAD_MISSES.WALK_DURATION + DTLB_STORE_MISSES.WALK_DURATION + 7 * ( DTLB_STORE_MISSES.WALK_COMPLETED + DTLB_LOAD_MISSES.WALK_COMPLETED + ITLB_MISSES.WALK_COMPLETED ) ) / ( 2 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ) )",
- "MetricGroup": "Mem;MemoryTLB_SMT",
- "MetricName": "Page_Walks_Utilization_SMT"
- },
- {
"BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]",
"MetricExpr": "64 * L1D.REPLACEMENT / 1000000000 / duration_time",
"MetricGroup": "Mem;MemoryBW",
@@ -355,19 +836,19 @@
},
{
"BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]",
- "MetricExpr": "(64 * L1D.REPLACEMENT / 1000000000 / duration_time)",
+ "MetricExpr": "L1D_Cache_Fill_BW",
"MetricGroup": "Mem;MemoryBW",
"MetricName": "L1D_Cache_Fill_BW_1T"
},
{
"BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]",
- "MetricExpr": "(64 * L2_LINES_IN.ALL / 1000000000 / duration_time)",
+ "MetricExpr": "L2_Cache_Fill_BW",
"MetricGroup": "Mem;MemoryBW",
"MetricName": "L2_Cache_Fill_BW_1T"
},
{
"BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]",
- "MetricExpr": "(64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time)",
+ "MetricExpr": "L3_Cache_Fill_BW",
"MetricGroup": "Mem;MemoryBW",
"MetricName": "L3_Cache_Fill_BW_1T"
},
@@ -385,26 +866,26 @@
},
{
"BriefDescription": "Measured Average Frequency for unhalted processors [GHz]",
- "MetricExpr": "(CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC) * msr@tsc@ / 1000000000 / duration_time",
- "MetricGroup": "Summary;Power",
+ "MetricExpr": "Turbo_Utilization * msr@tsc@ / 1000000000 / duration_time",
+ "MetricGroup": "Power;Summary",
"MetricName": "Average_Frequency"
},
{
"BriefDescription": "Giga Floating Point Operations Per Second",
- "MetricExpr": "( ( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / 1000000000 ) / duration_time",
+ "MetricExpr": "((1 * (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * (FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) / 1000000000) / duration_time",
"MetricGroup": "Cor;Flops;HPC",
"MetricName": "GFLOPs",
"PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine."
},
{
"BriefDescription": "Average Frequency Utilization relative nominal frequency",
- "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC",
+ "MetricExpr": "CLKS / CPU_CLK_UNHALTED.REF_TSC",
"MetricGroup": "Power",
"MetricName": "Turbo_Utilization"
},
{
"BriefDescription": "Fraction of cycles where both hardware Logical Processors were active",
- "MetricExpr": "1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / ( CPU_CLK_UNHALTED.REF_XCLK_ANY / 2 ) if #SMT_on else 0",
+ "MetricExpr": "1 - CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / (CPU_CLK_UNHALTED.REF_XCLK_ANY / 2) if #SMT_on else 0",
"MetricGroup": "SMT",
"MetricName": "SMT_2T_Utilization"
},
@@ -422,13 +903,13 @@
},
{
"BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]",
- "MetricExpr": "( 64 * ( uncore_imc@cas_count_read@ + uncore_imc@cas_count_write@ ) / 1000000000 ) / duration_time",
+ "MetricExpr": "(64 * (uncore_imc@cas_count_read@ + uncore_imc@cas_count_write@) / 1000000000) / duration_time",
"MetricGroup": "HPC;Mem;MemoryBW;SoC",
"MetricName": "DRAM_BW_Use"
},
{
"BriefDescription": "Average latency of data read request to external memory (in nanoseconds). Accounts for demand loads and L1/L2 prefetches",
- "MetricExpr": "1000000000 * ( cbox@event\\=0x36\\,umask\\=0x3\\,filter_opc\\=0x182@ / cbox@event\\=0x35\\,umask\\=0x3\\,filter_opc\\=0x182@ ) / ( cbox_0@event\\=0x0@ / duration_time )",
+ "MetricExpr": "1000000000 * (cbox@event\\=0x36\\,umask\\=0x3\\,filter_opc\\=0x182@ / cbox@event\\=0x35\\,umask\\=0x3\\,filter_opc\\=0x182@) / (Socket_CLKS / duration_time)",
"MetricGroup": "Mem;MemoryLat;SoC",
"MetricName": "MEM_Read_Latency"
},
@@ -445,12 +926,6 @@
"MetricName": "Socket_CLKS"
},
{
- "BriefDescription": "Uncore frequency per die [GHZ]",
- "MetricExpr": "cbox_0@event\\=0x0@ / #num_dies / duration_time / 1000000000",
- "MetricGroup": "SoC",
- "MetricName": "UNCORE_FREQ"
- },
- {
"BriefDescription": "Instructions per Far Branch ( Far Branches apply upon transition from application to operating system, handling interrupts, exceptions) [lower number means higher occurrence rate]",
"MetricExpr": "INST_RETIRED.ANY / BR_INST_RETIRED.FAR_BRANCH:u",
"MetricGroup": "Branches;OS",
@@ -499,20 +974,19 @@
"MetricName": "C7_Pkg_Residency"
},
{
+ "BriefDescription": "Uncore frequency per die [GHZ]",
+ "MetricExpr": "Socket_CLKS / #num_dies / duration_time / 1000000000",
+ "MetricGroup": "SoC",
+ "MetricName": "UNCORE_FREQ"
+ },
+ {
"BriefDescription": "CPU operating frequency (in GHz)",
- "MetricExpr": "( CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC * #SYSTEM_TSC_FREQ ) / 1000000000",
+ "MetricExpr": "(( CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC * #SYSTEM_TSC_FREQ ) / 1000000000) / duration_time",
"MetricGroup": "",
"MetricName": "cpu_operating_frequency",
"ScaleUnit": "1GHz"
},
{
- "BriefDescription": "Cycles per instruction retired; indicating how much time each executed instruction took; in units of cycles.",
- "MetricExpr": "CPU_CLK_UNHALTED.THREAD / INST_RETIRED.ANY",
- "MetricGroup": "",
- "MetricName": "cpi",
- "ScaleUnit": "1per_instr"
- },
- {
"BriefDescription": "The ratio of number of completed memory load instructions to the total number completed instructions",
"MetricExpr": "MEM_UOPS_RETIRED.ALL_LOADS / INST_RETIRED.ANY",
"MetricGroup": "",
@@ -530,7 +1004,7 @@
"BriefDescription": "Ratio of number of requests missing L1 data cache (includes data+rfo w/ prefetches) to the total number of completed instructions",
"MetricExpr": "L1D.REPLACEMENT / INST_RETIRED.ANY",
"MetricGroup": "",
- "MetricName": "l1d_mpi_includes_data_plus_rfo_with_prefetches",
+ "MetricName": "l1d_mpi",
"ScaleUnit": "1per_instr"
},
{
@@ -558,7 +1032,7 @@
"BriefDescription": "Ratio of number of requests missing L2 cache (includes code+data+rfo w/ prefetches) to the total number of completed instructions",
"MetricExpr": "L2_LINES_IN.ALL / INST_RETIRED.ANY",
"MetricGroup": "",
- "MetricName": "l2_mpi_includes_code_plus_data_plus_rfo_with_prefetches",
+ "MetricName": "l2_mpi",
"ScaleUnit": "1per_instr"
},
{
@@ -591,21 +1065,21 @@
},
{
"BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) in nano seconds",
- "MetricExpr": "( 1000000000 * ( cbox@UNC_C_TOR_OCCUPANCY.MISS_OPCODE\\,filter_opc\\=0x182@ / cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x182@ ) / ( UNC_C_CLOCKTICKS / ( source_count(UNC_C_CLOCKTICKS) * #num_packages ) ) ) * duration_time",
+ "MetricExpr": "( 1000000000 * ( cbox@UNC_C_TOR_OCCUPANCY.MISS_OPCODE\\,filter_opc\\=0x182@ / cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x182@ ) / ( UNC_C_CLOCKTICKS / ( #num_cores / #num_packages * #num_packages ) ) ) * duration_time",
"MetricGroup": "",
"MetricName": "llc_data_read_demand_plus_prefetch_miss_latency",
"ScaleUnit": "1ns"
},
{
"BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) addressed to local memory in nano seconds",
- "MetricExpr": "( 1000000000 * ( cbox@UNC_C_TOR_OCCUPANCY.MISS_LOCAL_OPCODE\\,filter_opc\\=0x182@ / cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x182@ ) / ( UNC_C_CLOCKTICKS / ( source_count(UNC_C_CLOCKTICKS) * #num_packages ) ) ) * duration_time",
+ "MetricExpr": "( 1000000000 * ( cbox@UNC_C_TOR_OCCUPANCY.MISS_LOCAL_OPCODE\\,filter_opc\\=0x182@ / cbox@UNC_C_TOR_INSERTS.MISS_LOCAL_OPCODE\\,filter_opc\\=0x182@ ) / ( UNC_C_CLOCKTICKS / ( #num_cores / #num_packages * #num_packages ) ) ) * duration_time",
"MetricGroup": "",
"MetricName": "llc_data_read_demand_plus_prefetch_miss_latency_for_local_requests",
"ScaleUnit": "1ns"
},
{
"BriefDescription": "Average latency of a last level cache (LLC) demand and prefetch data read miss (read memory access) addressed to remote memory in nano seconds",
- "MetricExpr": "( 1000000000 * ( cbox@UNC_C_TOR_OCCUPANCY.MISS_REMOTE_OPCODE\\,filter_opc\\=0x182@ / cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x182@ ) / ( UNC_C_CLOCKTICKS / ( source_count(UNC_C_CLOCKTICKS) * #num_packages ) ) ) * duration_time",
+ "MetricExpr": "( 1000000000 * ( cbox@UNC_C_TOR_OCCUPANCY.MISS_REMOTE_OPCODE\\,filter_opc\\=0x182@ / cbox@UNC_C_TOR_INSERTS.MISS_REMOTE_OPCODE\\,filter_opc\\=0x182@ ) / ( UNC_C_CLOCKTICKS / ( #num_cores / #num_packages * #num_packages ) ) ) * duration_time",
"MetricGroup": "",
"MetricName": "llc_data_read_demand_plus_prefetch_miss_latency_for_remote_requests",
"ScaleUnit": "1ns"
@@ -640,21 +1114,21 @@
},
{
"BriefDescription": "Memory read that miss the last level cache (LLC) addressed to local DRAM as a percentage of total memory read accesses, does not include LLC prefetches.",
- "MetricExpr": "100 * cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x182@ / ( cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x182@ + cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x182@ )",
+ "MetricExpr": "100 * cbox@UNC_C_TOR_INSERTS.MISS_LOCAL_OPCODE\\,filter_opc\\=0x182@ / ( cbox@UNC_C_TOR_INSERTS.MISS_LOCAL_OPCODE\\,filter_opc\\=0x182@ + cbox@UNC_C_TOR_INSERTS.MISS_REMOTE_OPCODE\\,filter_opc\\=0x182@ )",
"MetricGroup": "",
- "MetricName": "numa_percent_reads_addressed_to_local_dram",
+ "MetricName": "numa_reads_addressed_to_local_dram",
"ScaleUnit": "1%"
},
{
"BriefDescription": "Memory reads that miss the last level cache (LLC) addressed to remote DRAM as a percentage of total memory read accesses, does not include LLC prefetches.",
- "MetricExpr": "100 * cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x182@ / ( cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x182@ + cbox@UNC_C_TOR_INSERTS.MISS_OPCODE\\,filter_opc\\=0x182@ )",
+ "MetricExpr": "100 * cbox@UNC_C_TOR_INSERTS.MISS_REMOTE_OPCODE\\,filter_opc\\=0x182@ / ( cbox@UNC_C_TOR_INSERTS.MISS_LOCAL_OPCODE\\,filter_opc\\=0x182@ + cbox@UNC_C_TOR_INSERTS.MISS_REMOTE_OPCODE\\,filter_opc\\=0x182@ )",
"MetricGroup": "",
- "MetricName": "numa_percent_reads_addressed_to_remote_dram",
+ "MetricName": "numa_reads_addressed_to_remote_dram",
"ScaleUnit": "1%"
},
{
"BriefDescription": "Uncore operating frequency in GHz",
- "MetricExpr": "UNC_C_CLOCKTICKS / ( source_count(UNC_C_CLOCKTICKS) * #num_packages ) / 1000000000",
+ "MetricExpr": "( UNC_C_CLOCKTICKS / ( #num_cores / #num_packages * #num_packages ) / 1000000000) / duration_time",
"MetricGroup": "",
"MetricName": "uncore_frequency",
"ScaleUnit": "1GHz"
@@ -663,7 +1137,7 @@
"BriefDescription": "Intel(R) Quick Path Interconnect (QPI) data transmit bandwidth (MB/sec)",
"MetricExpr": "( UNC_Q_TxL_FLITS_G0.DATA * 8 / 1000000) / duration_time",
"MetricGroup": "",
- "MetricName": "qpi_data_transmit_bw_only_data",
+ "MetricName": "qpi_data_transmit_bw",
"ScaleUnit": "1MB/s"
},
{
@@ -691,245 +1165,42 @@
"BriefDescription": "Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the CPU.",
"MetricExpr": "( cbox@UNC_C_TOR_INSERTS.OPCODE\\,filter_opc\\=0x19e@ * 64 / 1000000) / duration_time",
"MetricGroup": "",
- "MetricName": "io_bandwidth_read",
+ "MetricName": "io_bandwidth_disk_or_network_writes",
"ScaleUnit": "1MB/s"
},
{
"BriefDescription": "Bandwidth of IO writes that are initiated by end device controllers that are writing memory to the CPU.",
"MetricExpr": "(( cbox@UNC_C_TOR_INSERTS.OPCODE\\,filter_opc\\=0x1c8\\,filter_tid\\=0x3e@ + cbox@UNC_C_TOR_INSERTS.OPCODE\\,filter_opc\\=0x180\\,filter_tid\\=0x3e@ ) * 64 / 1000000) / duration_time",
"MetricGroup": "",
- "MetricName": "io_bandwidth_write",
+ "MetricName": "io_bandwidth_disk_or_network_reads",
"ScaleUnit": "1MB/s"
},
{
"BriefDescription": "Uops delivered from decoded instruction cache (decoded stream buffer or DSB) as a percent of total uops delivered to Instruction Decode Queue",
"MetricExpr": "100 * ( IDQ.DSB_UOPS / UOPS_ISSUED.ANY )",
"MetricGroup": "",
- "MetricName": "percent_uops_delivered_from_decoded_icache_dsb",
+ "MetricName": "percent_uops_delivered_from_decoded_icache",
"ScaleUnit": "1%"
},
{
"BriefDescription": "Uops delivered from legacy decode pipeline (Micro-instruction Translation Engine or MITE) as a percent of total uops delivered to Instruction Decode Queue",
"MetricExpr": "100 * ( IDQ.MITE_UOPS / UOPS_ISSUED.ANY )",
"MetricGroup": "",
- "MetricName": "percent_uops_delivered_from_legacy_decode_pipeline_mite",
+ "MetricName": "percent_uops_delivered_from_legacy_decode_pipeline",
"ScaleUnit": "1%"
},
{
"BriefDescription": "Uops delivered from microcode sequencer (MS) as a percent of total uops delivered to Instruction Decode Queue",
"MetricExpr": "100 * ( IDQ.MS_UOPS / UOPS_ISSUED.ANY )",
"MetricGroup": "",
- "MetricName": "percent_uops_delivered_from_microcode_sequencer_ms",
+ "MetricName": "percent_uops_delivered_from_microcode_sequencer",
"ScaleUnit": "1%"
},
{
"BriefDescription": "Uops delivered from loop stream detector(LSD) as a percent of total uops delivered to Instruction Decode Queue",
"MetricExpr": "100 * ( LSD.UOPS / UOPS_ISSUED.ANY )",
"MetricGroup": "",
- "MetricName": "percent_uops_delivered_from_loop_stream_detector_lsd",
- "ScaleUnit": "1%"
- },
- {
- "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.",
- "MetricExpr": "100 * ( IDQ_UOPS_NOT_DELIVERED.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) )",
- "MetricGroup": "TmaL1;PGO",
- "MetricName": "tma_frontend_bound_percent",
- "ScaleUnit": "1%"
- },
- {
- "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period.",
- "MetricExpr": "100 * ( ( 4 ) * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) )",
- "MetricGroup": "Frontend;TmaL2;m_tma_frontend_bound_percent",
- "MetricName": "tma_fetch_latency_percent",
- "ScaleUnit": "1%"
- },
- {
- "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses.",
- "MetricExpr": "100 * ( ICACHE.IFDATA_STALL / ( CPU_CLK_UNHALTED.THREAD ) )",
- "MetricGroup": "BigFoot;FetchLat;IcMiss;TmaL3;m_tma_fetch_latency_percent",
- "MetricName": "tma_icache_misses_percent",
- "ScaleUnit": "1%"
- },
- {
- "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses.",
- "MetricExpr": "100 * ( ( 14 * ITLB_MISSES.STLB_HIT + cpu@ITLB_MISSES.WALK_DURATION\\,cmask\\=0x1@ + 7 * ITLB_MISSES.WALK_COMPLETED ) / ( CPU_CLK_UNHALTED.THREAD ) )",
- "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TmaL3;m_tma_fetch_latency_percent",
- "MetricName": "tma_itlb_misses_percent",
- "ScaleUnit": "1%"
- },
- {
- "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. Branch Resteers estimates the Frontend delay in fetching operations from corrected path; following all sorts of miss-predicted branches. For example; branchy code with lots of miss-predictions might get categorized under Branch Resteers. Note the value of this node may overlap with its siblings.",
- "MetricExpr": "100 * ( ( 12 ) * ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY ) / ( CPU_CLK_UNHALTED.THREAD ) )",
- "MetricGroup": "FetchLat;TmaL3;m_tma_fetch_latency_percent",
- "MetricName": "tma_branch_resteers_percent",
- "ScaleUnit": "1%"
- },
- {
- "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty.",
- "MetricExpr": "100 * ( DSB2MITE_SWITCHES.PENALTY_CYCLES / ( CPU_CLK_UNHALTED.THREAD ) )",
- "MetricGroup": "DSBmiss;FetchLat;TmaL3;m_tma_fetch_latency_percent",
- "MetricName": "tma_dsb_switches_percent",
- "ScaleUnit": "1%"
- },
- {
- "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs.",
- "MetricExpr": "100 * ( ILD_STALL.LCP / ( CPU_CLK_UNHALTED.THREAD ) )",
- "MetricGroup": "FetchLat;TmaL3;m_tma_fetch_latency_percent",
- "MetricName": "tma_lcp_percent",
- "ScaleUnit": "1%"
- },
- {
- "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals.",
- "MetricExpr": "100 * ( ( 2 ) * IDQ.MS_SWITCHES / ( CPU_CLK_UNHALTED.THREAD ) )",
- "MetricGroup": "FetchLat;MicroSeq;TmaL3;m_tma_fetch_latency_percent",
- "MetricName": "tma_ms_switches_percent",
- "ScaleUnit": "1%"
- },
- {
- "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend.",
- "MetricExpr": "100 * ( ( IDQ_UOPS_NOT_DELIVERED.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( 4 ) * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) )",
- "MetricGroup": "FetchBW;Frontend;TmaL2;m_tma_frontend_bound_percent",
- "MetricName": "tma_fetch_bandwidth_percent",
- "ScaleUnit": "1%"
- },
- {
- "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck.",
- "MetricExpr": "100 * ( ( IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS ) / ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) / 2 )",
- "MetricGroup": "DSBmiss;FetchBW;TmaL3;m_tma_fetch_bandwidth_percent",
- "MetricName": "tma_mite_percent",
- "ScaleUnit": "1%"
- },
- {
- "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.",
- "MetricExpr": "100 * ( ( IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS ) / ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) / 2 )",
- "MetricGroup": "DSB;FetchBW;TmaL3;m_tma_fetch_bandwidth_percent",
- "MetricName": "tma_dsb_percent",
- "ScaleUnit": "1%"
- },
- {
- "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
- "MetricExpr": "100 * ( ( UOPS_ISSUED.ANY - ( UOPS_RETIRED.RETIRE_SLOTS ) + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) )",
- "MetricGroup": "TmaL1",
- "MetricName": "tma_bad_speculation_percent",
- "ScaleUnit": "1%"
- },
- {
- "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path.",
- "MetricExpr": "100 * ( ( BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT ) ) * ( ( UOPS_ISSUED.ANY - ( UOPS_RETIRED.RETIRE_SLOTS ) + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) )",
- "MetricGroup": "BadSpec;BrMispredicts;TmaL2;m_tma_bad_speculation_percent",
- "MetricName": "tma_branch_mispredicts_percent",
- "ScaleUnit": "1%"
- },
- {
- "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes.",
- "MetricExpr": "100 * ( ( ( UOPS_ISSUED.ANY - ( UOPS_RETIRED.RETIRE_SLOTS ) + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT ) ) * ( ( UOPS_ISSUED.ANY - ( UOPS_RETIRED.RETIRE_SLOTS ) + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) )",
- "MetricGroup": "BadSpec;MachineClears;TmaL2;m_tma_bad_speculation_percent",
- "MetricName": "tma_machine_clears_percent",
- "ScaleUnit": "1%"
- },
- {
- "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
- "MetricExpr": "100 * ( 1 - ( ( IDQ_UOPS_NOT_DELIVERED.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) + ( ( UOPS_ISSUED.ANY - ( UOPS_RETIRED.RETIRE_SLOTS ) + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) + ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) )",
- "MetricGroup": "TmaL1",
- "MetricName": "tma_backend_bound_percent",
- "ScaleUnit": "1%"
- },
- {
- "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
- "MetricExpr": "100 * ( ( ( CYCLE_ACTIVITY.STALLS_MEM_ANY + RESOURCE_STALLS.SB ) / ( ( CYCLE_ACTIVITY.STALLS_TOTAL + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - ( UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if ( ( INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD ) ) > 1.8 ) else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC ) - ( RS_EVENTS.EMPTY_CYCLES if ( ( ( 4 ) * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) > 0.1 ) else 0 ) + RESOURCE_STALLS.SB ) ) ) * ( 1 - ( ( IDQ_UOPS_NOT_DELIVERED.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) + ( ( UOPS_ISSUED.ANY - ( UOPS_RETIRED.RETIRE_SLOTS ) + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) + ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) ) )",
- "MetricGroup": "Backend;TmaL2;m_tma_backend_bound_percent",
- "MetricName": "tma_memory_bound_percent",
- "ScaleUnit": "1%"
- },
- {
- "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache.",
- "MetricExpr": "100 * ( max( ( CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS ) / ( CPU_CLK_UNHALTED.THREAD ) , 0 ) )",
- "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TmaL3;m_tma_memory_bound_percent",
- "MetricName": "tma_l1_bound_percent",
- "ScaleUnit": "1%"
- },
- {
- "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance.",
- "MetricExpr": "100 * ( ( CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS ) / ( CPU_CLK_UNHALTED.THREAD ) )",
- "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TmaL3;m_tma_memory_bound_percent",
- "MetricName": "tma_l2_bound_percent",
- "ScaleUnit": "1%"
- },
- {
- "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance.",
- "MetricExpr": "100 * ( ( MEM_LOAD_UOPS_RETIRED.L3_HIT / ( MEM_LOAD_UOPS_RETIRED.L3_HIT + ( 7 ) * MEM_LOAD_UOPS_RETIRED.L3_MISS ) ) * CYCLE_ACTIVITY.STALLS_L2_MISS / ( CPU_CLK_UNHALTED.THREAD ) )",
- "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TmaL3;m_tma_memory_bound_percent",
- "MetricName": "tma_l3_bound_percent",
- "ScaleUnit": "1%"
- },
- {
- "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance.",
- "MetricExpr": "100 * ( min( ( ( 1 - ( MEM_LOAD_UOPS_RETIRED.L3_HIT / ( MEM_LOAD_UOPS_RETIRED.L3_HIT + ( 7 ) * MEM_LOAD_UOPS_RETIRED.L3_MISS ) ) ) * CYCLE_ACTIVITY.STALLS_L2_MISS / ( CPU_CLK_UNHALTED.THREAD ) ) , ( 1 ) ) )",
- "MetricGroup": "MemoryBound;TmaL3mem;TmaL3;m_tma_memory_bound_percent",
- "MetricName": "tma_dram_bound_percent",
- "ScaleUnit": "1%"
- },
- {
- "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck.",
- "MetricExpr": "100 * ( RESOURCE_STALLS.SB / ( CPU_CLK_UNHALTED.THREAD ) )",
- "MetricGroup": "MemoryBound;TmaL3mem;TmaL3;m_tma_memory_bound_percent",
- "MetricName": "tma_store_bound_percent",
- "ScaleUnit": "1%"
- },
- {
- "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
- "MetricExpr": "100 * ( ( 1 - ( ( IDQ_UOPS_NOT_DELIVERED.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) + ( ( UOPS_ISSUED.ANY - ( UOPS_RETIRED.RETIRE_SLOTS ) + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) + ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) ) - ( ( ( CYCLE_ACTIVITY.STALLS_MEM_ANY + RESOURCE_STALLS.SB ) / ( ( CYCLE_ACTIVITY.STALLS_TOTAL + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - ( UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if ( ( INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD ) ) > 1.8 ) else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC ) - ( RS_EVENTS.EMPTY_CYCLES if ( ( ( 4 ) * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) > 0.1 ) else 0 ) + RESOURCE_STALLS.SB ) ) ) * ( 1 - ( ( IDQ_UOPS_NOT_DELIVERED.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) + ( ( UOPS_ISSUED.ANY - ( UOPS_RETIRED.RETIRE_SLOTS ) + ( 4 ) * ( ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) if #SMT_on else INT_MISC.RECOVERY_CYCLES ) ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) + ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) ) ) )",
- "MetricGroup": "Backend;TmaL2;Compute;m_tma_backend_bound_percent",
- "MetricName": "tma_core_bound_percent",
- "ScaleUnit": "1%"
- },
- {
- "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active. Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication.",
- "MetricExpr": "100 * ( ARITH.FPU_DIV_ACTIVE / ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) )",
- "MetricGroup": "TmaL3;m_tma_core_bound_percent",
- "MetricName": "tma_divider_percent",
- "ScaleUnit": "1%"
- },
- {
- "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related). Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). (2) Contention on some hardware execution unit other than Divider. For example; when there are too many multiply operations.",
- "MetricExpr": "100 * ( ( ( ( CYCLE_ACTIVITY.STALLS_TOTAL + UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - ( UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC if ( ( INST_RETIRED.ANY / ( CPU_CLK_UNHALTED.THREAD ) ) > 1.8 ) else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC ) - ( RS_EVENTS.EMPTY_CYCLES if ( ( ( 4 ) * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) > 0.1 ) else 0 ) + RESOURCE_STALLS.SB ) ) - RESOURCE_STALLS.SB - CYCLE_ACTIVITY.STALLS_MEM_ANY ) / ( CPU_CLK_UNHALTED.THREAD ) )",
- "MetricGroup": "PortsUtil;TmaL3;m_tma_core_bound_percent",
- "MetricName": "tma_ports_utilization_percent",
- "ScaleUnit": "1%"
- },
- {
- "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. ",
- "MetricExpr": "100 * ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) )",
- "MetricGroup": "TmaL1",
- "MetricName": "tma_retiring_percent",
- "ScaleUnit": "1%"
- },
- {
- "BriefDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved.",
- "MetricExpr": "100 * ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) - ( ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / UOPS_ISSUED.ANY ) * IDQ.MS_UOPS / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) ) )",
- "MetricGroup": "Retire;TmaL2;m_tma_retiring_percent",
- "MetricName": "tma_light_operations_percent",
- "ScaleUnit": "1%"
- },
- {
- "BriefDescription": "This metric represents overall arithmetic floating-point (FP) operations fraction the CPU has executed (retired). Note this metric's value may exceed its parent due to use of \"Uops\" CountDomain and FMA double-counting.",
- "MetricExpr": "100 * ( ( INST_RETIRED.X87 * ( ( UOPS_RETIRED.RETIRE_SLOTS ) / INST_RETIRED.ANY ) / ( UOPS_RETIRED.RETIRE_SLOTS ) ) + ( ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) / ( UOPS_RETIRED.RETIRE_SLOTS ) ) + ( min( ( ( FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / ( UOPS_RETIRED.RETIRE_SLOTS ) ) , ( 1 ) ) ) )",
- "MetricGroup": "HPC;TmaL3;m_tma_light_operations_percent",
- "MetricName": "tma_fp_arith_percent",
- "ScaleUnit": "1%"
- },
- {
- "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or microcoded sequences. This highly-correlates with the uop length of these instructions/sequences.",
- "MetricExpr": "100 * ( ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / UOPS_ISSUED.ANY ) * IDQ.MS_UOPS / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) ) )",
- "MetricGroup": "Retire;TmaL2;m_tma_retiring_percent",
- "MetricName": "tma_heavy_operations_percent",
- "ScaleUnit": "1%"
- },
- {
- "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit. The MS is used for CISC instructions not supported by the default decoders (like repeat move strings; or CPUID); or by microcode assists used to address some operation modes (like in Floating Point assists). These cases can often be avoided.",
- "MetricExpr": "100 * ( ( ( UOPS_RETIRED.RETIRE_SLOTS ) / UOPS_ISSUED.ANY ) * IDQ.MS_UOPS / ( ( 4 ) * ( ( CPU_CLK_UNHALTED.THREAD_ANY / 2 ) if #SMT_on else ( CPU_CLK_UNHALTED.THREAD ) ) ) )",
- "MetricGroup": "MicroSeq;TmaL3;m_tma_heavy_operations_percent",
- "MetricName": "tma_microcode_sequencer_percent",
+ "MetricName": "percent_uops_delivered_from_loop_stream_detector",
"ScaleUnit": "1%"
}
]
diff --git a/tools/perf/pmu-events/arch/x86/broadwellx/uncore-cache.json b/tools/perf/pmu-events/arch/x86/broadwellx/uncore-cache.json
index abee6f773c1f..449fa723d0aa 100644
--- a/tools/perf/pmu-events/arch/x86/broadwellx/uncore-cache.json
+++ b/tools/perf/pmu-events/arch/x86/broadwellx/uncore-cache.json
@@ -947,21 +947,19 @@
"Unit": "CBO"
},
{
- "BriefDescription": "LLC misses - demand and prefetch data reads - excludes LLC prefetches. Derived from unc_c_tor_inserts.miss_opcode",
+ "BriefDescription": "TOR Inserts; Miss Opcode Match",
"Counter": "0,1,2,3",
"EventCode": "0x35",
- "EventName": "LLC_MISSES.DATA_READ",
- "Filter": "filter_opc=0x182",
+ "EventName": "UNC_C_TOR_INSERTS.MISS_OPCODE",
"PerPkg": "1",
- "ScaleUnit": "64Bytes",
"UMask": "0x3",
"Unit": "CBO"
},
{
- "BriefDescription": "LLC misses - demand and prefetch data reads - excludes LLC prefetches",
+ "BriefDescription": "LLC misses - demand and prefetch data reads - excludes LLC prefetches. Derived from unc_c_tor_inserts.miss_opcode",
"Counter": "0,1,2,3",
"EventCode": "0x35",
- "EventName": "UNC_C_TOR_INSERTS.MISS_OPCODE",
+ "EventName": "LLC_MISSES.DATA_READ",
"Filter": "filter_opc=0x182",
"PerPkg": "1",
"ScaleUnit": "64Bytes",
diff --git a/tools/perf/pmu-events/arch/x86/broadwellx/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/broadwellx/uncore-interconnect.json
index 071ce45620d2..cb1916f52607 100644
--- a/tools/perf/pmu-events/arch/x86/broadwellx/uncore-interconnect.json
+++ b/tools/perf/pmu-events/arch/x86/broadwellx/uncore-interconnect.json
@@ -685,36 +685,34 @@
"Unit": "QPI LL"
},
{
- "BriefDescription": "Number of data flits transmitted . Derived from unc_q_txl_flits_g0.data",
+ "BriefDescription": "Flits Transferred - Group 0; Data Tx Flits",
"Counter": "0,1,2,3",
- "EventName": "QPI_DATA_BANDWIDTH_TX",
+ "EventName": "UNC_Q_TxL_FLITS_G0.DATA",
"PerPkg": "1",
- "ScaleUnit": "8Bytes",
"UMask": "0x2",
"Unit": "QPI LL"
},
{
- "BriefDescription": "Number of data flits transmitted ",
+ "BriefDescription": "Number of data flits transmitted . Derived from unc_q_txl_flits_g0.data",
"Counter": "0,1,2,3",
- "EventName": "UNC_Q_TxL_FLITS_G0.DATA",
+ "EventName": "QPI_DATA_BANDWIDTH_TX",
"PerPkg": "1",
"ScaleUnit": "8Bytes",
"UMask": "0x2",
"Unit": "QPI LL"
},
{
- "BriefDescription": "Number of non data (control) flits transmitted . Derived from unc_q_txl_flits_g0.non_data",
+ "BriefDescription": "Flits Transferred - Group 0; Non-Data protocol Tx Flits",
"Counter": "0,1,2,3",
- "EventName": "QPI_CTL_BANDWIDTH_TX",
+ "EventName": "UNC_Q_TxL_FLITS_G0.NON_DATA",
"PerPkg": "1",
- "ScaleUnit": "8Bytes",
"UMask": "0x4",
"Unit": "QPI LL"
},
{
- "BriefDescription": "Number of non data (control) flits transmitted ",
+ "BriefDescription": "Number of non data (control) flits transmitted . Derived from unc_q_txl_flits_g0.non_data",
"Counter": "0,1,2,3",
- "EventName": "UNC_Q_TxL_FLITS_G0.NON_DATA",
+ "EventName": "QPI_CTL_BANDWIDTH_TX",
"PerPkg": "1",
"ScaleUnit": "8Bytes",
"UMask": "0x4",
diff --git a/tools/perf/pmu-events/arch/x86/broadwellx/uncore-memory.json b/tools/perf/pmu-events/arch/x86/broadwellx/uncore-memory.json
index 302e956a82ed..05fab7d2723e 100644
--- a/tools/perf/pmu-events/arch/x86/broadwellx/uncore-memory.json
+++ b/tools/perf/pmu-events/arch/x86/broadwellx/uncore-memory.json
@@ -72,20 +72,19 @@
"Unit": "iMC"
},
{
- "BriefDescription": "read requests to memory controller. Derived from unc_m_cas_count.rd",
+ "BriefDescription": "DRAM RD_CAS and WR_CAS Commands.; All DRAM Reads (RD_CAS + Underfills)",
"Counter": "0,1,2,3",
"EventCode": "0x4",
- "EventName": "LLC_MISSES.MEM_READ",
+ "EventName": "UNC_M_CAS_COUNT.RD",
"PerPkg": "1",
- "ScaleUnit": "64Bytes",
"UMask": "0x3",
"Unit": "iMC"
},
{
- "BriefDescription": "read requests to memory controller",
+ "BriefDescription": "read requests to memory controller. Derived from unc_m_cas_count.rd",
"Counter": "0,1,2,3",
"EventCode": "0x4",
- "EventName": "UNC_M_CAS_COUNT.RD",
+ "EventName": "LLC_MISSES.MEM_READ",
"PerPkg": "1",
"ScaleUnit": "64Bytes",
"UMask": "0x3",
@@ -110,20 +109,19 @@
"Unit": "iMC"
},
{
- "BriefDescription": "write requests to memory controller. Derived from unc_m_cas_count.wr",
+ "BriefDescription": "DRAM RD_CAS and WR_CAS Commands.; All DRAM WR_CAS (both Modes)",
"Counter": "0,1,2,3",
"EventCode": "0x4",
- "EventName": "LLC_MISSES.MEM_WRITE",
+ "EventName": "UNC_M_CAS_COUNT.WR",
"PerPkg": "1",
- "ScaleUnit": "64Bytes",
"UMask": "0xC",
"Unit": "iMC"
},
{
- "BriefDescription": "write requests to memory controller",
+ "BriefDescription": "write requests to memory controller. Derived from unc_m_cas_count.wr",
"Counter": "0,1,2,3",
"EventCode": "0x4",
- "EventName": "UNC_M_CAS_COUNT.WR",
+ "EventName": "LLC_MISSES.MEM_WRITE",
"PerPkg": "1",
"ScaleUnit": "64Bytes",
"UMask": "0xC",
diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json b/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json
index 46613504b816..81de1149297d 100644
--- a/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json
@@ -1,148 +1,742 @@
[
{
"BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend",
- "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)",
- "MetricGroup": "TopdownL1",
- "MetricName": "Frontend_Bound",
- "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound."
+ "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / SLOTS",
+ "MetricGroup": "PGO;TopdownL1;tma_L1_group",
+ "MetricName": "tma_frontend_bound",
+ "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. Sample with: FRONTEND_RETIRED.LATENCY_GE_4_PS",
+ "ScaleUnit": "100%"
},
{
- "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. SMT version; use when SMT is enabled and measuring per logical CPU.",
- "MetricExpr": "IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
- "MetricGroup": "TopdownL1_SMT",
- "MetricName": "Frontend_Bound_SMT",
- "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Machine_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound. SMT version; use when SMT is enabled and measuring per logical CPU."
+ "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues",
+ "MetricExpr": "4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / SLOTS",
+ "MetricGroup": "Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group",
+ "MetricName": "tma_fetch_latency",
+ "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: FRONTEND_RETIRED.LATENCY_GE_16_PS;FRONTEND_RETIRED.LATENCY_GE_8_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses",
+ "MetricExpr": "(ICACHE_16B.IFDATA_STALL + 2 * cpu@ICACHE_16B.IFDATA_STALL\\,cmask\\=1\\,edge@) / CLKS",
+ "MetricGroup": "BigFoot;FetchLat;IcMiss;TopdownL3;tma_fetch_latency_group",
+ "MetricName": "tma_icache_misses",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to instruction cache misses. Sample with: FRONTEND_RETIRED.L2_MISS_PS;FRONTEND_RETIRED.L1I_MISS_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses",
+ "MetricExpr": "ICACHE_64B.IFTAG_STALL / CLKS",
+ "MetricGroup": "BigFoot;FetchLat;MemoryTLB;TopdownL3;tma_fetch_latency_group",
+ "MetricName": "tma_itlb_misses",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Instruction TLB (ITLB) misses. Sample with: FRONTEND_RETIRED.STLB_MISS_PS;FRONTEND_RETIRED.ITLB_MISS_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers",
+ "MetricExpr": "INT_MISC.CLEAR_RESTEER_CYCLES / CLKS + tma_unknown_branches",
+ "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group",
+ "MetricName": "tma_branch_resteers",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers. Branch Resteers estimates the Frontend delay in fetching operations from corrected path; following all sorts of miss-predicted branches. For example; branchy code with lots of miss-predictions might get categorized under Branch Resteers. Note the value of this node may overlap with its siblings. Sample with: BR_MISP_RETIRED.ALL_BRANCHES",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage",
+ "MetricExpr": "(BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * INT_MISC.CLEAR_RESTEER_CYCLES / CLKS",
+ "MetricGroup": "BadSpec;BrMispredicts;TopdownL4;tma_branch_resteers_group",
+ "MetricName": "tma_mispredicts_resteers",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears",
+ "MetricExpr": "(1 - (BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT))) * INT_MISC.CLEAR_RESTEER_CYCLES / CLKS",
+ "MetricGroup": "BadSpec;MachineClears;TopdownL4;tma_branch_resteers_group",
+ "MetricName": "tma_clears_resteers",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Machine Clears. Sample with: INT_MISC.CLEAR_RESTEER_CYCLES",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears",
+ "MetricExpr": "9 * BACLEARS.ANY / CLKS",
+ "MetricGroup": "BigFoot;FetchLat;TopdownL4;tma_branch_resteers_group",
+ "MetricName": "tma_unknown_branches",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to new branch address clears. These are fetched branches the Branch Prediction Unit was unable to recognize (First fetch or hitting BPU capacity limit). Sample with: BACLEARS.ANY",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines",
+ "MetricExpr": "DSB2MITE_SWITCHES.PENALTY_CYCLES / CLKS",
+ "MetricGroup": "DSBmiss;FetchLat;TopdownL3;tma_fetch_latency_group",
+ "MetricName": "tma_dsb_switches",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to switches from DSB to MITE pipelines. The DSB (decoded i-cache) is a Uop Cache where the front-end directly delivers Uops (micro operations) avoiding heavy x86 decoding. The DSB pipeline has shorter latency and delivered higher bandwidth than the MITE (legacy instruction decode pipeline). Switching between the two pipelines can cause penalties hence this metric measures the exposed penalty. Sample with: FRONTEND_RETIRED.DSB_MISS_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs)",
+ "MetricExpr": "ILD_STALL.LCP / CLKS",
+ "MetricGroup": "FetchLat;TopdownL3;tma_fetch_latency_group",
+ "MetricName": "tma_lcp",
+ "PublicDescription": "This metric represents fraction of cycles CPU was stalled due to Length Changing Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this. #Link: Optimization Guide about LCP BKMs.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS)",
+ "MetricExpr": "2 * IDQ.MS_SWITCHES / CLKS",
+ "MetricGroup": "FetchLat;MicroSeq;TopdownL3;tma_fetch_latency_group",
+ "MetricName": "tma_ms_switches",
+ "PublicDescription": "This metric estimates the fraction of cycles when the CPU was stalled due to switches of uop delivery to the Microcode Sequencer (MS). Commonly used instructions are optimized for delivery by the DSB (decoded i-cache) or MITE (legacy instruction decode) pipelines. Certain operations cannot be handled natively by the execution pipeline; and must be performed by microcode (small programs injected into the execution stream). Switching to the MS too often can negatively impact performance. The MS is designated to deliver long uop flows required by CISC instructions like CPUID; or uncommon conditions like Floating Point Assists when dealing with Denormals. Sample with: IDQ.MS_SWITCHES",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues",
+ "MetricExpr": "tma_frontend_bound - tma_fetch_latency",
+ "MetricGroup": "FetchBW;Frontend;TopdownL2;tma_L2_group;tma_frontend_bound_group",
+ "MetricName": "tma_fetch_bandwidth",
+ "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Sample with: FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_1_PS;FRONTEND_RETIRED.LATENCY_GE_2_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)",
+ "MetricExpr": "(IDQ.ALL_MITE_CYCLES_ANY_UOPS - IDQ.ALL_MITE_CYCLES_4_UOPS) / CORE_CLKS / 2",
+ "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_fetch_bandwidth_group",
+ "MetricName": "tma_mite",
+ "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline). This pipeline is used for code that was not pre-cached in the DSB or LSD. For example; inefficiencies due to asymmetric decoders; use of long immediate or LCP can manifest as MITE fetch bandwidth bottleneck. Sample with: FRONTEND_RETIRED.ANY_DSB_MISS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles where decoder-0 was the only active decoder",
+ "MetricExpr": "(cpu@INST_DECODED.DECODERS\\,cmask\\=1@ - cpu@INST_DECODED.DECODERS\\,cmask\\=2@) / CORE_CLKS",
+ "MetricGroup": "DSBmiss;FetchBW;TopdownL4;tma_mite_group",
+ "MetricName": "tma_decoder0_alone",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline",
+ "MetricExpr": "(IDQ.ALL_DSB_CYCLES_ANY_UOPS - IDQ.ALL_DSB_CYCLES_4_UOPS) / CORE_CLKS / 2",
+ "MetricGroup": "DSB;FetchBW;TopdownL3;tma_fetch_bandwidth_group",
+ "MetricName": "tma_dsb",
+ "PublicDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline. For example; inefficient utilization of the DSB cache structure or bank conflict when reading from it; are categorized here.",
+ "ScaleUnit": "100%"
},
{
"BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations",
- "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)",
- "MetricGroup": "TopdownL1",
- "MetricName": "Bad_Speculation",
- "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example."
+ "MetricExpr": "(UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ((INT_MISC.RECOVERY_CYCLES_ANY / 2) if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / SLOTS",
+ "MetricGroup": "TopdownL1;tma_L1_group",
+ "MetricName": "tma_bad_speculation",
+ "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction",
+ "MetricExpr": "(BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT)) * tma_bad_speculation",
+ "MetricGroup": "BadSpec;BrMispredicts;TopdownL2;tma_L2_group;tma_bad_speculation_group",
+ "MetricName": "tma_branch_mispredicts",
+ "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES",
+ "ScaleUnit": "100%"
},
{
- "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations. SMT version; use when SMT is enabled and measuring per logical CPU.",
- "MetricExpr": "( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
- "MetricGroup": "TopdownL1_SMT",
- "MetricName": "Bad_Speculation_SMT",
- "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example. SMT version; use when SMT is enabled and measuring per logical CPU."
+ "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears",
+ "MetricExpr": "tma_bad_speculation - tma_branch_mispredicts",
+ "MetricGroup": "BadSpec;MachineClears;TopdownL2;tma_L2_group;tma_bad_speculation_group",
+ "MetricName": "tma_machine_clears",
+ "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT",
+ "ScaleUnit": "100%"
},
{
"BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend",
- "MetricConstraint": "NO_NMI_WATCHDOG",
- "MetricExpr": "1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) - ( UOPS_ISSUED.ANY + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD)",
- "MetricGroup": "TopdownL1",
- "MetricName": "Backend_Bound",
- "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound."
+ "MetricExpr": "1 - tma_frontend_bound - (UOPS_ISSUED.ANY + 4 * ((INT_MISC.RECOVERY_CYCLES_ANY / 2) if #SMT_on else INT_MISC.RECOVERY_CYCLES)) / SLOTS",
+ "MetricGroup": "TopdownL1;tma_L1_group",
+ "MetricName": "tma_backend_bound",
+ "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck",
+ "MetricExpr": "((CYCLE_ACTIVITY.STALLS_MEM_ANY + EXE_ACTIVITY.BOUND_ON_STORES) / (CYCLE_ACTIVITY.STALLS_TOTAL + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) + EXE_ACTIVITY.BOUND_ON_STORES)) * tma_backend_bound",
+ "MetricGroup": "Backend;TopdownL2;tma_L2_group;tma_backend_bound_group",
+ "MetricName": "tma_memory_bound",
+ "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache",
+ "MetricExpr": "max((CYCLE_ACTIVITY.STALLS_MEM_ANY - CYCLE_ACTIVITY.STALLS_L1D_MISS) / CLKS, 0)",
+ "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group",
+ "MetricName": "tma_l1_bound",
+ "PublicDescription": "This metric estimates how often the CPU was stalled without loads missing the L1 data cache. The L1 data cache typically has the shortest latency. However; in certain cases like loads blocked on older stores; a load might suffer due to high latency even though it is being satisfied by the L1. Another example is loads who miss in the TLB. These cases are characterized by execution unit stalls; while some non-completed demand load lives in the machine without having that demand load missing the L1 cache. Sample with: MEM_LOAD_RETIRED.L1_HIT_PS;MEM_LOAD_RETIRED.FB_HIT_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses",
+ "MetricExpr": "min(9 * cpu@DTLB_LOAD_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_LOAD_MISSES.WALK_ACTIVE, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / CLKS",
+ "MetricGroup": "MemoryTLB;TopdownL4;tma_l1_bound_group",
+ "MetricName": "tma_dtlb_load",
+ "PublicDescription": "This metric roughly estimates the fraction of cycles where the Data TLB (DTLB) was missed by load accesses. TLBs (Translation Look-aside Buffers) are processor caches for recently used entries out of the Page Tables that are used to map virtual- to physical-addresses by the operating system. This metric approximates the potential delay of demand loads missing the first-level data TLB (assuming worst case scenario with back to back misses to different pages). This includes hitting in the second-level TLB (STLB) as well as performing a hardware page walk on an STLB miss. Sample with: MEM_INST_RETIRED.STLB_MISS_LOADS_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric roughly estimates the fraction of cycles where the (first level) DTLB was missed by load accesses, that later on hit in second-level TLB (STLB)",
+ "MetricExpr": "tma_dtlb_load - tma_load_stlb_miss",
+ "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_load_group",
+ "MetricName": "tma_load_stlb_hit",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates the fraction of cycles where the Second-level TLB (STLB) was missed by load accesses, performing a hardware page walk",
+ "MetricExpr": "DTLB_LOAD_MISSES.WALK_ACTIVE / CLKS",
+ "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_load_group",
+ "MetricName": "tma_load_stlb_miss",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores",
+ "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / CLKS",
+ "MetricGroup": "TopdownL4;tma_l1_bound_group",
+ "MetricName": "tma_store_fwd_blk",
+ "PublicDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores. To streamline memory operations in the pipeline; a load can avoid waiting for memory if a prior in-flight store is writing the data that the load wants to read (store forwarding process). However; in some cases the load may be blocked for a significant time pending the store forward. For example; when the prior store is writing a smaller region than the load is reading.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations",
+ "MetricExpr": "(12 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + (MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES) * (11 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / CLKS",
+ "MetricGroup": "Offcore;TopdownL4;tma_l1_bound_group",
+ "MetricName": "tma_lock_latency",
+ "PublicDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations. Due to the microarchitecture handling of locks; they are classified as L1_Bound regardless of what memory source satisfied them. Sample with: MEM_INST_RETIRED.LOCK_LOADS_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary",
+ "MetricExpr": "Load_Miss_Real_Latency * LD_BLOCKS.NO_SR / CLKS",
+ "MetricGroup": "TopdownL4;tma_l1_bound_group",
+ "MetricName": "tma_split_loads",
+ "PublicDescription": "This metric estimates fraction of cycles handling memory load split accesses - load that cross 64-byte cache line boundary. Sample with: MEM_INST_RETIRED.SPLIT_LOADS_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset",
+ "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / CLKS",
+ "MetricGroup": "TopdownL4;tma_l1_bound_group",
+ "MetricName": "tma_4k_aliasing",
+ "PublicDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset. False match is possible; which incur a few cycles load re-issue. However; the short re-issue duration is often hidden by the out-of-order core and HW optimizations; hence a user may safely ignore a high value of this metric unless it manages to propagate up into parent nodes of the hierarchy (e.g. to L1_Bound).",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed",
+ "MetricExpr": "Load_Miss_Real_Latency * cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@ / CLKS",
+ "MetricGroup": "MemoryBW;TopdownL4;tma_l1_bound_group",
+ "MetricName": "tma_fb_full",
+ "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory).",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads",
+ "MetricExpr": "((MEM_LOAD_RETIRED.L2_HIT * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) / ((MEM_LOAD_RETIRED.L2_HIT * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) + cpu@L1D_PEND_MISS.FB_FULL\\,cmask\\=1@)) * ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / CLKS)",
+ "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group",
+ "MetricName": "tma_l2_bound",
+ "PublicDescription": "This metric estimates how often the CPU was stalled due to L2 cache accesses by loads. Avoiding cache misses (i.e. L1 misses/L2 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L2_HIT_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core",
+ "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / CLKS",
+ "MetricGroup": "CacheMisses;MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group",
+ "MetricName": "tma_l3_bound",
+ "PublicDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses",
+ "MetricExpr": "((44 * Average_Frequency) * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (OCR.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OCR.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OCR.DEMAND_DATA_RD.L3_HIT.HIT_OTHER_CORE_FWD))) + (44 * Average_Frequency) * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS",
+ "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_l3_bound_group",
+ "MetricName": "tma_contested_accesses",
+ "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses. Contested accesses occur when data written by one Logical Processor are read by another Logical Processor on a different Physical Core. Examples of contested accesses include synchronizations such as locks; true data sharing such as modified locked variables; and false sharing. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM_PS;MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses",
+ "MetricExpr": "(44 * Average_Frequency) * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (1 - (OCR.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OCR.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OCR.DEMAND_DATA_RD.L3_HIT.HIT_OTHER_CORE_FWD)))) * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS",
+ "MetricGroup": "Offcore;Snoop;TopdownL4;tma_l3_bound_group",
+ "MetricName": "tma_data_sharing",
+ "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses. Data shared by multiple Logical Processors (even just read shared) may cause increased access latency due to cache coherency. Excessive data sharing can drastically harm multithreaded performance. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)",
+ "MetricExpr": "(17 * Average_Frequency) * MEM_LOAD_RETIRED.L3_HIT * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS",
+ "MetricGroup": "MemoryLat;TopdownL4;tma_l3_bound_group",
+ "MetricName": "tma_l3_hit_latency",
+ "PublicDescription": "This metric represents fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors)",
+ "MetricExpr": "((OFFCORE_REQUESTS_BUFFER.SQ_FULL / 2) if #SMT_on else OFFCORE_REQUESTS_BUFFER.SQ_FULL) / CORE_CLKS",
+ "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_l3_bound_group",
+ "MetricName": "tma_sq_full",
+ "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). The Super Queue is used for requests to access the L2 cache or to go out to the Uncore.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads",
+ "MetricExpr": "((CYCLE_ACTIVITY.STALLS_L3_MISS / CLKS + ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / CLKS) - tma_l2_bound) - tma_pmm_bound)",
+ "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group",
+ "MetricName": "tma_dram_bound",
+ "PublicDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads. Better caching can improve the latency and increase performance. Sample with: MEM_LOAD_RETIRED.L3_MISS_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM)",
+ "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, cpu@OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD\\,cmask\\=4@) / CLKS",
+ "MetricGroup": "MemoryBW;Offcore;TopdownL4;tma_dram_bound_group",
+ "MetricName": "tma_mem_bandwidth",
+ "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory (DRAM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that).",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM)",
+ "MetricExpr": "min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD) / CLKS - tma_mem_bandwidth",
+ "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_dram_bound_group",
+ "MetricName": "tma_mem_latency",
+ "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory (DRAM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that).",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory",
+ "MetricExpr": "(59.5 * Average_Frequency) * MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS",
+ "MetricGroup": "Server;TopdownL5;tma_mem_latency_group",
+ "MetricName": "tma_local_dram",
+ "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory. Caching will improve the latency and increase performance. Sample with: MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory",
+ "MetricExpr": "(127 * Average_Frequency) * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS",
+ "MetricGroup": "Server;Snoop;TopdownL5;tma_mem_latency_group",
+ "MetricName": "tma_remote_dram",
+ "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory. This is caused often due to non-optimal NUMA allocations. #link to NUMA article Sample with: MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues",
+ "MetricExpr": "((89.5 * Average_Frequency) * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM + (89.5 * Average_Frequency) * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD) * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) / 2) / CLKS",
+ "MetricGroup": "Offcore;Server;Snoop;TopdownL5;tma_mem_latency_group",
+ "MetricName": "tma_remote_cache",
+ "PublicDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues. This is caused often due to non-optimal NUMA allocations. #link to NUMA article Sample with: MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM_PS;MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric roughly estimates (based on idle latencies) how often the CPU was stalled on accesses to external 3D-Xpoint (Crystal Ridge, a.k.a",
+ "MetricExpr": "(((1 - ((19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) + 10 * ((MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))))) / ((19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) + 10 * ((MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) + (MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))))) + (25 * (MEM_LOAD_RETIRED.LOCAL_PMM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) + 33 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * (1 + (MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))))))) * (CYCLE_ACTIVITY.STALLS_L3_MISS / CLKS + ((CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / CLKS) - tma_l2_bound)) if (1000000 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM) > MEM_LOAD_RETIRED.L1_MISS) else 0)",
+ "MetricGroup": "MemoryBound;Server;TmaL3mem;TopdownL3;tma_memory_bound_group",
+ "MetricName": "tma_pmm_bound",
+ "PublicDescription": "This metric roughly estimates (based on idle latencies) how often the CPU was stalled on accesses to external 3D-Xpoint (Crystal Ridge, a.k.a. IXP) memory by loads, PMM stands for Persistent Memory Module. ",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write",
+ "MetricExpr": "EXE_ACTIVITY.BOUND_ON_STORES / CLKS",
+ "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_memory_bound_group",
+ "MetricName": "tma_store_bound",
+ "PublicDescription": "This metric estimates how often CPU was stalled due to RFO store memory accesses; RFO store issue a read-for-ownership request before the write. Even though store accesses do not typically stall out-of-order CPUs; there are few cases where stores can lead to actual stalls. This metric will be flagged should RFO stores be a bottleneck. Sample with: MEM_INST_RETIRED.ALL_STORES_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses",
+ "MetricExpr": "((L2_RQSTS.RFO_HIT * 11 * (1 - (MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES))) + (1 - (MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES)) * min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO)) / CLKS",
+ "MetricGroup": "MemoryLat;Offcore;TopdownL4;tma_store_bound_group",
+ "MetricName": "tma_store_latency",
+ "PublicDescription": "This metric estimates fraction of cycles the CPU spent handling L1D store misses. Store accesses usually less impact out-of-order core performance; however; holding resources for longer time can lead into undesired implications (e.g. contention on L1D fill-buffer entries - see FB_Full)",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing",
+ "MetricExpr": "((110 * Average_Frequency) * (OCR.DEMAND_RFO.L3_MISS.REMOTE_HITM + OCR.PF_L2_RFO.L3_MISS.REMOTE_HITM) + (47.5 * Average_Frequency) * (OCR.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE + OCR.PF_L2_RFO.L3_HIT.HITM_OTHER_CORE)) / CLKS",
+ "MetricGroup": "DataSharing;Offcore;Snoop;TopdownL4;tma_store_bound_group",
+ "MetricName": "tma_false_sharing",
+ "PublicDescription": "This metric roughly estimates how often CPU was handling synchronizations due to False Sharing. False Sharing is a multithreading hiccup; where multiple Logical Processors contend on different data-elements mapped into the same cache line. Sample with: MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM_PS;OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.SNOOP_HITM",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents rate of split store accesses",
+ "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / CORE_CLKS",
+ "MetricGroup": "TopdownL4;tma_store_bound_group",
+ "MetricName": "tma_split_stores",
+ "PublicDescription": "This metric represents rate of split store accesses. Consider aligning your data to the 64-byte cache line granularity. Sample with: MEM_INST_RETIRED.SPLIT_STORES_PS",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses",
+ "MetricExpr": "(9 * cpu@DTLB_STORE_MISSES.STLB_HIT\\,cmask\\=1@ + DTLB_STORE_MISSES.WALK_ACTIVE) / CORE_CLKS",
+ "MetricGroup": "MemoryTLB;TopdownL4;tma_store_bound_group",
+ "MetricName": "tma_dtlb_store",
+ "PublicDescription": "This metric roughly estimates the fraction of cycles spent handling first-level data TLB store misses. As with ordinary data caching; focus on improving data locality and reducing working-set size to reduce DTLB overhead. Additionally; consider using profile-guided optimization (PGO) to collocate frequently-used data on the same page. Try using larger page sizes for large amounts of frequently-used data. Sample with: MEM_INST_RETIRED.STLB_MISS_STORES_PS",
+ "ScaleUnit": "100%"
},
{
- "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. SMT version; use when SMT is enabled and measuring per logical CPU.",
- "MetricExpr": "1 - (IDQ_UOPS_NOT_DELIVERED.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) - ( UOPS_ISSUED.ANY + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))",
- "MetricGroup": "TopdownL1_SMT",
- "MetricName": "Backend_Bound_SMT",
- "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound. SMT version; use when SMT is enabled and measuring per logical CPU."
+ "BriefDescription": "This metric roughly estimates the fraction of cycles where the TLB was missed by store accesses, hitting in the second-level TLB (STLB)",
+ "MetricExpr": "tma_dtlb_store - tma_store_stlb_miss",
+ "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_store_group",
+ "MetricName": "tma_store_stlb_hit",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates the fraction of cycles where the STLB was missed by store accesses, performing a hardware page walk",
+ "MetricExpr": "DTLB_STORE_MISSES.WALK_ACTIVE / CORE_CLKS",
+ "MetricGroup": "MemoryTLB;TopdownL5;tma_dtlb_store_group",
+ "MetricName": "tma_store_stlb_miss",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck",
+ "MetricExpr": "tma_backend_bound - tma_memory_bound",
+ "MetricGroup": "Backend;Compute;TopdownL2;tma_L2_group;tma_backend_bound_group",
+ "MetricName": "tma_core_bound",
+ "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles where the Divider unit was active",
+ "MetricExpr": "ARITH.DIVIDER_ACTIVE / CLKS",
+ "MetricGroup": "TopdownL3;tma_core_bound_group",
+ "MetricName": "tma_divider",
+ "PublicDescription": "This metric represents fraction of cycles where the Divider unit was active. Divide and square root instructions are performed by the Divider unit and can take considerably longer latency than integer or Floating Point addition; subtraction; or multiplication. Sample with: ARITH.DIVIDER_ACTIVE",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)",
+ "MetricExpr": "(EXE_ACTIVITY.EXE_BOUND_0_PORTS + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / CLKS if (ARITH.DIVIDER_ACTIVE < (CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY)) else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / CLKS",
+ "MetricGroup": "PortsUtil;TopdownL3;tma_core_bound_group",
+ "MetricName": "tma_ports_utilization",
+ "PublicDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related). Two distinct categories can be attributed into this metric: (1) heavy data-dependency among contiguous instructions would manifest in this metric - such cases are often referred to as low Instruction Level Parallelism (ILP). (2) Contention on some hardware execution unit other than Divider. For example; when there are too many multiply operations.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
+ "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_NONE / 2 if #SMT_on else CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY) / CORE_CLKS",
+ "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group",
+ "MetricName": "tma_ports_utilized_0",
+ "PublicDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise). Long-latency instructions like divides may contribute to this metric.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations",
+ "MetricExpr": "PARTIAL_RAT_STALLS.SCOREBOARD / CLKS",
+ "MetricGroup": "TopdownL5;tma_ports_utilized_0_group",
+ "MetricName": "tma_serializing_operation",
+ "PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. Sample with: PARTIAL_RAT_STALLS.SCOREBOARD",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions",
+ "MetricExpr": "40 * ROB_MISC_EVENTS.PAUSE_INST / CLKS",
+ "MetricGroup": "TopdownL6;tma_serializing_operation_group",
+ "MetricName": "tma_slow_pause",
+ "PublicDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions. Sample with: MISC_RETIRED.PAUSE_INST",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued",
+ "MetricExpr": "CLKS * UOPS_ISSUED.VECTOR_WIDTH_MISMATCH / UOPS_ISSUED.ANY",
+ "MetricGroup": "TopdownL5;tma_ports_utilized_0_group",
+ "MetricName": "tma_mixing_vectors",
+ "PublicDescription": "The Mixing_Vectors metric gives the percentage of injected blend uops out of all uops issued. Usually a Mixing_Vectors over 5% is worth investigating. Read more in Appendix B1 of the Optimizations Guide for this topic.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
+ "MetricExpr": "((UOPS_EXECUTED.CORE_CYCLES_GE_1 - UOPS_EXECUTED.CORE_CYCLES_GE_2) / 2 if #SMT_on else EXE_ACTIVITY.1_PORTS_UTIL) / CORE_CLKS",
+ "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group",
+ "MetricName": "tma_ports_utilized_1",
+ "PublicDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). This can be due to heavy data-dependency among software instructions; or over oversubscribing a particular hardware resource. In some other cases with high 1_Port_Utilized and L1_Bound; this metric can point to L1 data-cache latency bottleneck that may not necessarily manifest with complete execution starvation (due to the short L1 latency e.g. walking a linked list) - looking at the assembly can be helpful.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)",
+ "MetricExpr": "((UOPS_EXECUTED.CORE_CYCLES_GE_2 - UOPS_EXECUTED.CORE_CYCLES_GE_3) / 2 if #SMT_on else EXE_ACTIVITY.2_PORTS_UTIL) / CORE_CLKS",
+ "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group",
+ "MetricName": "tma_ports_utilized_2",
+ "PublicDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise). Loop Vectorization -most compilers feature auto-Vectorization options today- reduces pressure on the execution ports as multiple elements are calculated with same uop.",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise).",
+ "MetricExpr": "(UOPS_EXECUTED.CORE_CYCLES_GE_3 / 2 if #SMT_on else UOPS_EXECUTED.CORE_CYCLES_GE_3) / CORE_CLKS",
+ "MetricGroup": "PortsUtil;TopdownL4;tma_ports_utilization_group",
+ "MetricName": "tma_ports_utilized_3m",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.",
+ "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / (4 * CORE_CLKS)",
+ "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group",
+ "MetricName": "tma_alu_op_utilization",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 0 ([SNB+] ALU; [HSW+] ALU and 2nd branch) Sample with: UOPS_DISPATCHED_PORT.PORT_0",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_0 / CORE_CLKS",
+ "MetricGroup": "Compute;TopdownL6;tma_alu_op_utilization_group",
+ "MetricName": "tma_port_0",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 1 (ALU) Sample with: UOPS_DISPATCHED_PORT.PORT_1",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_1 / CORE_CLKS",
+ "MetricGroup": "TopdownL6;tma_alu_op_utilization_group",
+ "MetricName": "tma_port_1",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 5 ([SNB+] Branches and ALU; [HSW+] ALU) Sample with: UOPS_DISPATCHED.PORT_5",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_5 / CORE_CLKS",
+ "MetricGroup": "TopdownL6;tma_alu_op_utilization_group",
+ "MetricName": "tma_port_5",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 6 ([HSW+]Primary Branch and simple ALU) Sample with: UOPS_DISPATCHED_PORT.PORT_6",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_6 / CORE_CLKS",
+ "MetricGroup": "TopdownL6;tma_alu_op_utilization_group",
+ "MetricName": "tma_port_6",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations Sample with: UOPS_DISPATCHED.PORT_2_3",
+ "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * CORE_CLKS)",
+ "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group",
+ "MetricName": "tma_load_op_utilization",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 2 ([SNB+]Loads and Store-address; [ICL+] Loads) Sample with: UOPS_DISPATCHED_PORT.PORT_2",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_2 / CORE_CLKS",
+ "MetricGroup": "TopdownL6;tma_load_op_utilization_group",
+ "MetricName": "tma_port_2",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 3 ([SNB+]Loads and Store-address; [ICL+] Loads) Sample with: UOPS_DISPATCHED_PORT.PORT_3",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_3 / CORE_CLKS",
+ "MetricGroup": "TopdownL6;tma_load_op_utilization_group",
+ "MetricName": "tma_port_3",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Store operations",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / CORE_CLKS",
+ "MetricGroup": "TopdownL5;tma_ports_utilized_3m_group",
+ "MetricName": "tma_store_op_utilization",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 4 (Store-data) Sample with: UOPS_DISPATCHED_PORT.PORT_4",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_4 / CORE_CLKS",
+ "MetricGroup": "TopdownL6;tma_store_op_utilization_group",
+ "MetricName": "tma_port_4",
+ "ScaleUnit": "100%"
+ },
+ {
+ "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port 7 ([HSW+]simple Store-address) Sample with: UOPS_DISPATCHED_PORT.PORT_7",
+ "MetricExpr": "UOPS_DISPATCHED_PORT.PORT_7 / CORE_CLKS",
+ "MetricGroup": "TopdownL6;tma_store_op_utilization_group",
+ "MetricName": "tma_port_7",
+ "ScaleUnit": "100%"
},
{
"BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired",
- "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)",
- "MetricGroup": "TopdownL1",
- "MetricName": "Retiring",
- "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. "
+ "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / SLOTS",
+ "MetricGroup": "TopdownL1;tma_L1_group",
+ "MetricName": "tma_retiring",