aboutsummaryrefslogtreecommitdiffstats
path: root/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json
diff options
context:
space:
mode:
Diffstat (limited to 'tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json')
-rw-r--r--tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json136
1 files changed, 86 insertions, 50 deletions
diff --git a/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json b/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json
index 73b6865a769d..6789285555f0 100644
--- a/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json
+++ b/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json
@@ -47,7 +47,7 @@
"MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / (4 * CPU_CLK_UNHALTED.THREAD)",
"MetricGroup": "TopdownL1",
"MetricName": "Retiring",
- "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided."
+ "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. "
},
{
"BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. SMT version; use when SMT is enabled and measuring per logical CPU.",
@@ -130,44 +130,26 @@
"MetricName": "FLOPc_SMT"
},
{
- "BriefDescription": "Actual per-core usage of the Floating Point execution units (regardless of the vector width)",
+ "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width)",
"MetricExpr": "( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) ) / ( 2 * CPU_CLK_UNHALTED.THREAD )",
"MetricGroup": "Cor;Flops;HPC",
"MetricName": "FP_Arith_Utilization",
- "PublicDescription": "Actual per-core usage of the Floating Point execution units (regardless of the vector width). Values > 1 are possible due to Fused-Multiply Add (FMA) counting."
+ "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common)."
},
{
- "BriefDescription": "Actual per-core usage of the Floating Point execution units (regardless of the vector width). SMT version; use when SMT is enabled and measuring per logical CPU.",
+ "BriefDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). SMT version; use when SMT is enabled and measuring per logical CPU.",
"MetricExpr": "( (FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE) + (FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE + FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE) ) / ( 2 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ) )",
"MetricGroup": "Cor;Flops;HPC_SMT",
"MetricName": "FP_Arith_Utilization_SMT",
- "PublicDescription": "Actual per-core usage of the Floating Point execution units (regardless of the vector width). Values > 1 are possible due to Fused-Multiply Add (FMA) counting. SMT version; use when SMT is enabled and measuring per logical CPU."
+ "PublicDescription": "Actual per-core usage of the Floating Point non-X87 execution units (regardless of precision or vector-width). Values > 1 are possible due to ([BDW+] Fused-Multiply Add (FMA) counting - common; [ADL+] use all of ADD/MUL/FMA in Scalar or 128/256-bit vectors - less common). SMT version; use when SMT is enabled and measuring per logical CPU."
},
{
- "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)",
+ "BriefDescription": "Instruction-Level-Parallelism (average number of uops executed when there is execution) per-core",
"MetricExpr": "UOPS_EXECUTED.THREAD / (( cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 ) if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)",
"MetricGroup": "Backend;Cor;Pipeline;PortsUtil",
"MetricName": "ILP"
},
{
- "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)",
- "MetricExpr": "( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) * (BR_MISP_RETIRED.ALL_BRANCHES * (12 * ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY ) / CPU_CLK_UNHALTED.THREAD) / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY )) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) ) * (4 * CPU_CLK_UNHALTED.THREAD) / BR_MISP_RETIRED.ALL_BRANCHES",
- "MetricGroup": "Bad;BrMispredicts",
- "MetricName": "Branch_Misprediction_Cost"
- },
- {
- "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)",
- "MetricExpr": "( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * (BR_MISP_RETIRED.ALL_BRANCHES * (12 * ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY ) / CPU_CLK_UNHALTED.THREAD) / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY )) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) ) * (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )) / BR_MISP_RETIRED.ALL_BRANCHES",
- "MetricGroup": "Bad;BrMispredicts_SMT",
- "MetricName": "Branch_Misprediction_Cost_SMT"
- },
- {
- "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear)",
- "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES",
- "MetricGroup": "Bad;BadSpec;BrMispredicts",
- "MetricName": "IpMispredict"
- },
- {
"BriefDescription": "Core actual clocks when any Logical Processor is active on the Physical Core",
"MetricExpr": "( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )",
"MetricGroup": "SMT",
@@ -204,7 +186,7 @@
"MetricName": "IpTB"
},
{
- "BriefDescription": "Branch instructions per taken branch.",
+ "BriefDescription": "Branch instructions per taken branch. ",
"MetricExpr": "BR_INST_RETIRED.ALL_BRANCHES / BR_INST_RETIRED.NEAR_TAKEN",
"MetricGroup": "Branches;Fed;PGO",
"MetricName": "BpTkBranch"
@@ -257,41 +239,52 @@
"MetricName": "Instructions"
},
{
+ "BriefDescription": "Average number of Uops retired in cycles where at least one uop has retired.",
+ "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / cpu@UOPS_RETIRED.RETIRE_SLOTS\\,cmask\\=1@",
+ "MetricGroup": "Pipeline;Ret",
+ "MetricName": "Retire"
+ },
+ {
+ "BriefDescription": "",
+ "MetricExpr": "UOPS_EXECUTED.THREAD / cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@",
+ "MetricGroup": "Cor;Pipeline;PortsUtil;SMT",
+ "MetricName": "Execute"
+ },
+ {
"BriefDescription": "Fraction of Uops delivered by the DSB (aka Decoded ICache; or Uop Cache)",
- "MetricExpr": "IDQ.DSB_UOPS / ( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS )",
+ "MetricExpr": "IDQ.DSB_UOPS / (( IDQ.DSB_UOPS + LSD.UOPS + IDQ.MITE_UOPS + IDQ.MS_UOPS ) )",
"MetricGroup": "DSB;Fed;FetchBW",
"MetricName": "DSB_Coverage"
},
{
- "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load instructions (in core cycles)",
- "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb )",
- "MetricGroup": "Mem;MemoryBound;MemoryLat",
- "MetricName": "Load_Miss_Real_Latency",
- "PublicDescription": "Actual Average Latency for L1 data-cache miss demand load instructions (in core cycles). Latency may be overestimated for multi-load instructions - e.g. repeat strings."
+ "BriefDescription": "Number of Instructions per non-speculative Branch Misprediction (JEClear) (lower number means higher occurrence rate)",
+ "MetricExpr": "INST_RETIRED.ANY / BR_MISP_RETIRED.ALL_BRANCHES",
+ "MetricGroup": "Bad;BadSpec;BrMispredicts",
+ "MetricName": "IpMispredict"
},
{
- "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)",
- "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
- "MetricGroup": "Mem;MemoryBound;MemoryBW",
- "MetricName": "MLP"
+ "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)",
+ "MetricExpr": " ( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * INT_MISC.RECOVERY_CYCLES ) / (4 * CPU_CLK_UNHALTED.THREAD))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) * (BR_MISP_RETIRED.ALL_BRANCHES * (12 * ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY ) / CPU_CLK_UNHALTED.THREAD) / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY )) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * CPU_CLK_UNHALTED.THREAD)) ) * (4 * CPU_CLK_UNHALTED.THREAD) / BR_MISP_RETIRED.ALL_BRANCHES",
+ "MetricGroup": "Bad;BrMispredicts",
+ "MetricName": "Branch_Misprediction_Cost"
},
{
- "BriefDescription": "Average data fill bandwidth to the L1 data cache [GB / sec]",
- "MetricExpr": "64 * L1D.REPLACEMENT / 1000000000 / duration_time",
- "MetricGroup": "Mem;MemoryBW",
- "MetricName": "L1D_Cache_Fill_BW"
+ "BriefDescription": "Branch Misprediction Cost: Fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)",
+ "MetricExpr": " ( ((BR_MISP_RETIRED.ALL_BRANCHES / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT )) * (( UOPS_ISSUED.ANY - UOPS_RETIRED.RETIRE_SLOTS + 4 * ( INT_MISC.RECOVERY_CYCLES_ANY / 2 ) ) / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )))) + (4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) * (BR_MISP_RETIRED.ALL_BRANCHES * (12 * ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY ) / CPU_CLK_UNHALTED.THREAD) / ( BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT + BACLEARS.ANY )) / #(4 * IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE / (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) ))) ) * (4 * ( ( CPU_CLK_UNHALTED.THREAD / 2 ) * ( 1 + CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE / CPU_CLK_UNHALTED.REF_XCLK ) )) / BR_MISP_RETIRED.ALL_BRANCHES",
+ "MetricGroup": "Bad;BrMispredicts_SMT",
+ "MetricName": "Branch_Misprediction_Cost_SMT"
},
{
- "BriefDescription": "Average data fill bandwidth to the L2 cache [GB / sec]",
- "MetricExpr": "64 * L2_LINES_IN.ALL / 1000000000 / duration_time",
- "MetricGroup": "Mem;MemoryBW",
- "MetricName": "L2_Cache_Fill_BW"
+ "BriefDescription": "Actual Average Latency for L1 data-cache miss demand load operations (in core cycles)",
+ "MetricExpr": "L1D_PEND_MISS.PENDING / ( MEM_LOAD_UOPS_RETIRED.L1_MISS + mem_load_uops_retired.hit_lfb )",
+ "MetricGroup": "Mem;MemoryBound;MemoryLat",
+ "MetricName": "Load_Miss_Real_Latency"
},
{
- "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
- "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time",
- "MetricGroup": "Mem;MemoryBW",
- "MetricName": "L3_Cache_Fill_BW"
+ "BriefDescription": "Memory-Level-Parallelism (average number of L1 miss demand load when there is at least one such miss. Per-Logical Processor)",
+ "MetricExpr": "L1D_PEND_MISS.PENDING / L1D_PEND_MISS.PENDING_CYCLES",
+ "MetricGroup": "Mem;MemoryBound;MemoryBW",
+ "MetricName": "MLP"
},
{
"BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads",
@@ -306,13 +299,13 @@
"MetricName": "L2MPKI"
},
{
- "BriefDescription": "L2 cache misses per kilo instruction for all request types (including speculative)",
+ "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all request types (including speculative)",
"MetricExpr": "1000 * L2_RQSTS.MISS / INST_RETIRED.ANY",
"MetricGroup": "Mem;CacheMisses;Offcore",
"MetricName": "L2MPKI_All"
},
{
- "BriefDescription": "L2 cache misses per kilo instruction for all demand loads (including speculative)",
+ "BriefDescription": "L2 cache ([RKL+] true) misses per kilo instruction for all demand loads (including speculative)",
"MetricExpr": "1000 * L2_RQSTS.DEMAND_DATA_RD_MISS / INST_RETIRED.ANY",
"MetricGroup": "Mem;CacheMisses",
"MetricName": "L2MPKI_Load"
@@ -349,6 +342,48 @@
"MetricName": "Page_Walks_Utilization_SMT"
},
{
+ "BriefDescription": "Average per-core data fill bandwidth to the L1 data cache [GB / sec]",
+ "MetricExpr": "64 * L1D.REPLACEMENT / 1000000000 / duration_time",
+ "MetricGroup": "Mem;MemoryBW",
+ "MetricName": "L1D_Cache_Fill_BW"
+ },
+ {
+ "BriefDescription": "Average per-core data fill bandwidth to the L2 cache [GB / sec]",
+ "MetricExpr": "64 * L2_LINES_IN.ALL / 1000000000 / duration_time",
+ "MetricGroup": "Mem;MemoryBW",
+ "MetricName": "L2_Cache_Fill_BW"
+ },
+ {
+ "BriefDescription": "Average per-core data fill bandwidth to the L3 cache [GB / sec]",
+ "MetricExpr": "64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time",
+ "MetricGroup": "Mem;MemoryBW",
+ "MetricName": "L3_Cache_Fill_BW"
+ },
+ {
+ "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]",
+ "MetricExpr": "(64 * L1D.REPLACEMENT / 1000000000 / duration_time)",
+ "MetricGroup": "Mem;MemoryBW",
+ "MetricName": "L1D_Cache_Fill_BW_1T"
+ },
+ {
+ "BriefDescription": "Average per-thread data fill bandwidth to the L2 cache [GB / sec]",
+ "MetricExpr": "(64 * L2_LINES_IN.ALL / 1000000000 / duration_time)",
+ "MetricGroup": "Mem;MemoryBW",
+ "MetricName": "L2_Cache_Fill_BW_1T"
+ },
+ {
+ "BriefDescription": "Average per-thread data fill bandwidth to the L3 cache [GB / sec]",
+ "MetricExpr": "(64 * LONGEST_LAT_CACHE.MISS / 1000000000 / duration_time)",
+ "MetricGroup": "Mem;MemoryBW",
+ "MetricName": "L3_Cache_Fill_BW_1T"
+ },
+ {
+ "BriefDescription": "Average per-thread data access bandwidth to the L3 cache [GB / sec]",
+ "MetricExpr": "0",
+ "MetricGroup": "Mem;MemoryBW;Offcore",
+ "MetricName": "L3_Cache_Access_BW_1T"
+ },
+ {
"BriefDescription": "Average CPU Utilization",
"MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@",
"MetricGroup": "HPC;Summary",
@@ -364,7 +399,8 @@
"BriefDescription": "Giga Floating Point Operations Per Second",
"MetricExpr": "( ( 1 * ( FP_ARITH_INST_RETIRED.SCALAR_SINGLE + FP_ARITH_INST_RETIRED.SCALAR_DOUBLE ) + 2 * FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE + 4 * ( FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE + FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE ) + 8 * FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE ) / 1000000000 ) / duration_time",
"MetricGroup": "Cor;Flops;HPC",
- "MetricName": "GFLOPs"
+ "MetricName": "GFLOPs",
+ "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width and AMX engine."
},
{
"BriefDescription": "Average Frequency Utilization relative nominal frequency",