diff options
Diffstat (limited to 'tools')
551 files changed, 41414 insertions, 20524 deletions
diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst index d07ccf8a23f7..9bb9ace54ba8 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst @@ -17,8 +17,8 @@ SYNOPSIS *COMMANDS* := { **show** | **list** | **tree** | **attach** | **detach** | **help** } -MAP COMMANDS -============= +CGROUP COMMANDS +=============== | **bpftool** **cgroup { show | list }** *CGROUP* | **bpftool** **cgroup tree** [*CGROUP_ROOT*] @@ -142,5 +142,6 @@ SEE ALSO **bpftool**\ (8), **bpftool-prog**\ (8), **bpftool-map**\ (8), + **bpftool-feature**\ (8), **bpftool-net**\ (8), **bpftool-perf**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst new file mode 100644 index 000000000000..82de03dd8f52 --- /dev/null +++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst @@ -0,0 +1,85 @@ +=============== +bpftool-feature +=============== +------------------------------------------------------------------------------- +tool for inspection of eBPF-related parameters for Linux kernel or net device +------------------------------------------------------------------------------- + +:Manual section: 8 + +SYNOPSIS +======== + + **bpftool** [*OPTIONS*] **feature** *COMMAND* + + *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] } + + *COMMANDS* := { **probe** | **help** } + +FEATURE COMMANDS +================ + +| **bpftool** **feature probe** [*COMPONENT*] [**macros** [**prefix** *PREFIX*]] +| **bpftool** **feature help** +| +| *COMPONENT* := { **kernel** | **dev** *NAME* } + +DESCRIPTION +=========== + **bpftool feature probe** [**kernel**] [**macros** [**prefix** *PREFIX*]] + Probe the running kernel and dump a number of eBPF-related + parameters, such as availability of the **bpf()** system call, + JIT status, eBPF program types availability, eBPF helper + functions availability, and more. + + If the **macros** keyword (but not the **-j** option) is + passed, a subset of the output is dumped as a list of + **#define** macros that are ready to be included in a C + header file, for example. If, additionally, **prefix** is + used to define a *PREFIX*, the provided string will be used + as a prefix to the names of the macros: this can be used to + avoid conflicts on macro names when including the output of + this command as a header file. + + Keyword **kernel** can be omitted. If no probe target is + specified, probing the kernel is the default behaviour. + + Note that when probed, some eBPF helpers (e.g. + **bpf_trace_printk**\ () or **bpf_probe_write_user**\ ()) may + print warnings to kernel logs. + + **bpftool feature probe dev** *NAME* [**macros** [**prefix** *PREFIX*]] + Probe network device for supported eBPF features and dump + results to the console. + + The two keywords **macros** and **prefix** have the same + role as when probing the kernel. + + **bpftool feature help** + Print short help message. + +OPTIONS +======= + -h, --help + Print short generic help message (similar to **bpftool help**). + + -v, --version + Print version number (similar to **bpftool version**). + + -j, --json + Generate JSON output. For commands that cannot produce JSON, this + option has no effect. + + -p, --pretty + Generate human-readable JSON output. Implies **-j**. + +SEE ALSO +======== + **bpf**\ (2), + **bpf-helpers**\ (7), + **bpftool**\ (8), + **bpftool-prog**\ (8), + **bpftool-map**\ (8), + **bpftool-cgroup**\ (8), + **bpftool-net**\ (8), + **bpftool-perf**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst index 64b001b4f777..5c984ffc9f01 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-map.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst @@ -25,12 +25,17 @@ MAP COMMANDS | **bpftool** **map create** *FILE* **type** *TYPE* **key** *KEY_SIZE* **value** *VALUE_SIZE* \ | **entries** *MAX_ENTRIES* **name** *NAME* [**flags** *FLAGS*] [**dev** *NAME*] | **bpftool** **map dump** *MAP* -| **bpftool** **map update** *MAP* **key** *DATA* **value** *VALUE* [*UPDATE_FLAGS*] -| **bpftool** **map lookup** *MAP* **key** *DATA* +| **bpftool** **map update** *MAP* [**key** *DATA*] [**value** *VALUE*] [*UPDATE_FLAGS*] +| **bpftool** **map lookup** *MAP* [**key** *DATA*] | **bpftool** **map getnext** *MAP* [**key** *DATA*] | **bpftool** **map delete** *MAP* **key** *DATA* | **bpftool** **map pin** *MAP* *FILE* | **bpftool** **map event_pipe** *MAP* [**cpu** *N* **index** *M*] +| **bpftool** **map peek** *MAP* +| **bpftool** **map push** *MAP* **value** *VALUE* +| **bpftool** **map pop** *MAP* +| **bpftool** **map enqueue** *MAP* **value** *VALUE* +| **bpftool** **map dequeue** *MAP* | **bpftool** **map help** | | *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } @@ -62,7 +67,7 @@ DESCRIPTION **bpftool map dump** *MAP* Dump all entries in a given *MAP*. - **bpftool map update** *MAP* **key** *DATA* **value** *VALUE* [*UPDATE_FLAGS*] + **bpftool map update** *MAP* [**key** *DATA*] [**value** *VALUE*] [*UPDATE_FLAGS*] Update map entry for a given *KEY*. *UPDATE_FLAGS* can be one of: **any** update existing entry @@ -75,7 +80,7 @@ DESCRIPTION the bytes are parsed as decimal values, unless a "0x" prefix (for hexadecimal) or a "0" prefix (for octal) is provided. - **bpftool map lookup** *MAP* **key** *DATA* + **bpftool map lookup** *MAP* [**key** *DATA*] Lookup **key** in the map. **bpftool map getnext** *MAP* [**key** *DATA*] @@ -107,6 +112,21 @@ DESCRIPTION replace any existing ring. Any other application will stop receiving events if it installed its rings earlier. + **bpftool map peek** *MAP* + Peek next **value** in the queue or stack. + + **bpftool map push** *MAP* **value** *VALUE* + Push **value** onto the stack. + + **bpftool map pop** *MAP* + Pop and print **value** from the stack. + + **bpftool map enqueue** *MAP* **value** *VALUE* + Enqueue **value** into the queue. + + **bpftool map dequeue** *MAP* + Dequeue and print **value** from the queue. + **bpftool map help** Print short help message. @@ -236,5 +256,6 @@ SEE ALSO **bpftool**\ (8), **bpftool-prog**\ (8), **bpftool-cgroup**\ (8), + **bpftool-feature**\ (8), **bpftool-net**\ (8), **bpftool-perf**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst index ed87c9b619ad..779dab3650ee 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-net.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst @@ -142,4 +142,5 @@ SEE ALSO **bpftool-prog**\ (8), **bpftool-map**\ (8), **bpftool-cgroup**\ (8), + **bpftool-feature**\ (8), **bpftool-perf**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst b/tools/bpf/bpftool/Documentation/bpftool-perf.rst index f4c5e5538bb8..bca5590a80d0 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-perf.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-perf.rst @@ -84,4 +84,5 @@ SEE ALSO **bpftool-prog**\ (8), **bpftool-map**\ (8), **bpftool-cgroup**\ (8), + **bpftool-feature**\ (8), **bpftool-net**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 58c8369b77dd..9386bd6e0396 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -18,7 +18,7 @@ SYNOPSIS { **show** | **list** | **dump xlated** | **dump jited** | **pin** | **load** | **loadall** | **help** } -MAP COMMANDS +PROG COMMANDS ============= | **bpftool** **prog { show | list }** [*PROG*] @@ -42,7 +42,7 @@ MAP COMMANDS | **cgroup/connect4** | **cgroup/connect6** | **cgroup/sendmsg4** | **cgroup/sendmsg6** | } | *ATTACH_TYPE* := { -| **msg_verdict** | **skb_verdict** | **skb_parse** | **flow_dissector** +| **msg_verdict** | **stream_verdict** | **stream_parser** | **flow_dissector** | } @@ -171,7 +171,7 @@ EXAMPLES :: - 10: xdp name some_prog tag 005a3d2123620c8b gpl + 10: xdp name some_prog tag 005a3d2123620c8b gpl run_time_ns 81632 run_cnt 10 loaded_at 2017-09-29T20:11:00+0000 uid 0 xlated 528B jited 370B memlock 4096B map_ids 10 @@ -184,6 +184,8 @@ EXAMPLES "type": "xdp", "tag": "005a3d2123620c8b", "gpl_compatible": true, + "run_time_ns": 81632, + "run_cnt": 10, "loaded_at": 1506715860, "uid": 0, "bytes_xlated": 528, @@ -258,5 +260,6 @@ SEE ALSO **bpftool**\ (8), **bpftool-map**\ (8), **bpftool-cgroup**\ (8), + **bpftool-feature**\ (8), **bpftool-net**\ (8), **bpftool-perf**\ (8) diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst index e1677e81ed59..4f2188845dd8 100644 --- a/tools/bpf/bpftool/Documentation/bpftool.rst +++ b/tools/bpf/bpftool/Documentation/bpftool.rst @@ -16,7 +16,7 @@ SYNOPSIS **bpftool** **version** - *OBJECT* := { **map** | **program** | **cgroup** | **perf** | **net** } + *OBJECT* := { **map** | **program** | **cgroup** | **perf** | **net** | **feature** } *OPTIONS* := { { **-V** | **--version** } | { **-h** | **--help** } | { **-j** | **--json** } [{ **-p** | **--pretty** }] } @@ -34,6 +34,8 @@ SYNOPSIS *NET-COMMANDS* := { **show** | **list** | **help** } + *FEATURE-COMMANDS* := { **probe** | **help** } + DESCRIPTION =========== *bpftool* allows for inspection and simple modification of BPF objects @@ -72,5 +74,6 @@ SEE ALSO **bpftool-prog**\ (8), **bpftool-map**\ (8), **bpftool-cgroup**\ (8), + **bpftool-feature**\ (8), **bpftool-net**\ (8), **bpftool-perf**\ (8) diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index e4e4fab1b8c7..b803827d01e8 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -50,14 +50,15 @@ _bpftool_get_map_ids() command sed -n 's/.*"id": \(.*\),$/\1/p' )" -- "$cur" ) ) } -_bpftool_get_perf_map_ids() +# Takes map type and adds matching map ids to the list of suggestions. +_bpftool_get_map_ids_for_type() { + local type="$1" COMPREPLY+=( $( compgen -W "$( bpftool -jp map 2>&1 | \ - command grep -C2 perf_event_array | \ + command grep -C2 "$type" | \ command sed -n 's/.*"id": \(.*\),$/\1/p' )" -- "$cur" ) ) } - _bpftool_get_prog_ids() { COMPREPLY+=( $( compgen -W "$( bpftool -jp prog 2>&1 | \ @@ -99,15 +100,25 @@ _sysfs_get_netdevs() "$cur" ) ) } -# For bpftool map update: retrieve type of the map to update. -_bpftool_map_update_map_type() +# Retrieve type of the map that we are operating on. +_bpftool_map_guess_map_type() { local keyword ref for (( idx=3; idx < ${#words[@]}-1; idx++ )); do - if [[ ${words[$((idx-2))]} == "update" ]]; then - keyword=${words[$((idx-1))]} - ref=${words[$((idx))]} - fi + case "${words[$((idx-2))]}" in + lookup|update) + keyword=${words[$((idx-1))]} + ref=${words[$((idx))]} + ;; + push) + printf "stack" + return 0 + ;; + enqueue) + printf "queue" + return 0 + ;; + esac done [[ -z $ref ]] && return 0 @@ -119,6 +130,8 @@ _bpftool_map_update_map_type() _bpftool_map_update_get_id() { + local command="$1" + # Is it the map to update, or a map to insert into the map to update? # Search for "value" keyword. local idx value @@ -128,11 +141,24 @@ _bpftool_map_update_get_id() break fi done - [[ $value -eq 0 ]] && _bpftool_get_map_ids && return 0 + if [[ $value -eq 0 ]]; then + case "$command" in + push) + _bpftool_get_map_ids_for_type stack + ;; + enqueue) + _bpftool_get_map_ids_for_type queue + ;; + *) + _bpftool_get_map_ids + ;; + esac + return 0 + fi # Id to complete is for a value. It can be either prog id or map id. This # depends on the type of the map to update. - local type=$(_bpftool_map_update_map_type) + local type=$(_bpftool_map_guess_map_type) case $type in array_of_maps|hash_of_maps) _bpftool_get_map_ids @@ -285,8 +311,8 @@ _bpftool() return 0 ;; 5) - COMPREPLY=( $( compgen -W 'msg_verdict skb_verdict \ - skb_parse flow_dissector' -- "$cur" ) ) + COMPREPLY=( $( compgen -W 'msg_verdict stream_verdict \ + stream_parser flow_dissector' -- "$cur" ) ) return 0 ;; 6) @@ -382,14 +408,28 @@ _bpftool() map) local MAP_TYPE='id pinned' case $command in - show|list|dump) + show|list|dump|peek|pop|dequeue) case $prev in $command) COMPREPLY=( $( compgen -W "$MAP_TYPE" -- "$cur" ) ) return 0 ;; id) - _bpftool_get_map_ids + case "$command" in + peek) + _bpftool_get_map_ids_for_type stack + _bpftool_get_map_ids_for_type queue + ;; + pop) + _bpftool_get_map_ids_for_type stack + ;; + dequeue) + _bpftool_get_map_ids_for_type queue + ;; + *) + _bpftool_get_map_ids + ;; + esac return 0 ;; *) @@ -447,19 +487,25 @@ _bpftool() COMPREPLY+=( $( compgen -W 'hex' -- "$cur" ) ) ;; *) + case $(_bpftool_map_guess_map_type) in + queue|stack) + return 0 + ;; + esac + _bpftool_once_attr 'key' return 0 ;; esac ;; - update) + update|push|enqueue) case $prev in $command) COMPREPLY=( $( compgen -W "$MAP_TYPE" -- "$cur" ) ) return 0 ;; id) - _bpftool_map_update_get_id + _bpftool_map_update_get_id $command return 0 ;; key) @@ -468,7 +514,7 @@ _bpftool() value) # We can have bytes, or references to a prog or a # map, depending on the type of the map to update. - case $(_bpftool_map_update_map_type) in + case "$(_bpftool_map_guess_map_type)" in array_of_maps|hash_of_maps) local MAP_TYPE='id pinned' COMPREPLY+=( $( compgen -W "$MAP_TYPE" \ @@ -490,6 +536,13 @@ _bpftool() return 0 ;; *) + case $(_bpftool_map_guess_map_type) in + queue|stack) + _bpftool_once_attr 'value' + return 0; + ;; + esac + _bpftool_once_attr 'key' local UPDATE_FLAGS='any exist noexist' for (( idx=3; idx < ${#words[@]}-1; idx++ )); do @@ -508,6 +561,7 @@ _bpftool() return 0 fi done + return 0 ;; esac @@ -527,7 +581,7 @@ _bpftool() return 0 ;; id) - _bpftool_get_perf_map_ids + _bpftool_get_map_ids_for_type perf_event_array return 0 ;; cpu) @@ -546,7 +600,8 @@ _bpftool() *) [[ $prev == $object ]] && \ COMPREPLY=( $( compgen -W 'delete dump getnext help \ - lookup pin event_pipe show list update create' -- \ + lookup pin event_pipe show list update create \ + peek push enqueue pop dequeue' -- \ "$cur" ) ) ;; esac @@ -624,6 +679,25 @@ _bpftool() ;; esac ;; + feature) + case $command in + probe) + [[ $prev == "dev" ]] && _sysfs_get_netdevs && return 0 + [[ $prev == "prefix" ]] && return 0 + if _bpftool_search_list 'macros'; then + COMPREPLY+=( $( compgen -W 'prefix' -- "$cur" ) ) + else + COMPREPLY+=( $( compgen -W 'macros' -- "$cur" ) ) + fi + _bpftool_one_of_list 'kernel dev' + return 0 + ;; + *) + [[ $prev == $object ]] && \ + COMPREPLY=( $( compgen -W 'help probe' -- "$cur" ) ) + ;; + esac + ;; esac } && complete -F _bpftool bpftool diff --git a/tools/bpf/bpftool/btf_dumper.c b/tools/bpf/bpftool/btf_dumper.c index 6ba5f567a9d8..e63bce0755eb 100644 --- a/tools/bpf/bpftool/btf_dumper.c +++ b/tools/bpf/bpftool/btf_dumper.c @@ -73,35 +73,104 @@ static int btf_dumper_array(const struct btf_dumper *d, __u32 type_id, return ret; } +static void btf_int128_print(json_writer_t *jw, const void *data, + bool is_plain_text) +{ + /* data points to a __int128 number. + * Suppose + * int128_num = *(__int128 *)data; + * The below formulas shows what upper_num and lower_num represents: + * upper_num = int128_num >> 64; + * lower_num = int128_num & 0xffffffffFFFFFFFFULL; + */ + __u64 upper_num, lower_num; + +#ifdef __BIG_ENDIAN_BITFIELD + upper_num = *(__u64 *)data; + lower_num = *(__u64 *)(data + 8); +#else + upper_num = *(__u64 *)(data + 8); + lower_num = *(__u64 *)data; +#endif + + if (is_plain_text) { + if (upper_num == 0) + jsonw_printf(jw, "0x%llx", lower_num); + else + jsonw_printf(jw, "0x%llx%016llx", upper_num, lower_num); + } else { + if (upper_num == 0) + jsonw_printf(jw, "\"0x%llx\"", lower_num); + else + jsonw_printf(jw, "\"0x%llx%016llx\"", upper_num, lower_num); + } +} + +static void btf_int128_shift(__u64 *print_num, u16 left_shift_bits, + u16 right_shift_bits) +{ + __u64 upper_num, lower_num; + +#ifdef __BIG_ENDIAN_BITFIELD + upper_num = print_num[0]; + lower_num = print_num[1]; +#else + upper_num = print_num[1]; + lower_num = print_num[0]; +#endif + + /* shake out un-needed bits by shift/or operations */ + if (left_shift_bits >= 64) { + upper_num = lower_num << (left_shift_bits - 64); + lower_num = 0; + } else { + upper_num = (upper_num << left_shift_bits) | + (lower_num >> (64 - left_shift_bits)); + lower_num = lower_num << left_shift_bits; + } + + if (right_shift_bits >= 64) { + lower_num = upper_num >> (right_shift_bits - 64); + upper_num = 0; + } else { + lower_num = (lower_num >> right_shift_bits) | + (upper_num << (64 - right_shift_bits)); + upper_num = upper_num >> right_shift_bits; + } + +#ifdef __BIG_ENDIAN_BITFIELD + print_num[0] = upper_num; + print_num[1] = lower_num; +#else + print_num[0] = lower_num; + print_num[1] = upper_num; +#endif +} + static void btf_dumper_bitfield(__u32 nr_bits, __u8 bit_offset, const void *data, json_writer_t *jw, bool is_plain_text) { int left_shift_bits, right_shift_bits; + __u64 print_num[2] = {}; int bytes_to_copy; int bits_to_copy; - __u64 print_num; bits_to_copy = bit_offset + nr_bits; bytes_to_copy = BITS_ROUNDUP_BYTES(bits_to_copy); - print_num = 0; - memcpy(&print_num, data, bytes_to_copy); + memcpy(print_num, data, bytes_to_copy); #if defined(__BIG_ENDIAN_BITFIELD) left_shift_bits = bit_offset; #elif defined(__LITTLE_ENDIAN_BITFIELD) - left_shift_bits = 64 - bits_to_copy; + left_shift_bits = 128 - bits_to_copy; #else #error neither big nor little endian #endif - right_shift_bits = 64 - nr_bits; + right_shift_bits = 128 - nr_bits; - print_num <<= left_shift_bits; - print_num >>= right_shift_bits; - if (is_plain_text) - jsonw_printf(jw, "0x%llx", print_num); - else - jsonw_printf(jw, "%llu", print_num); + btf_int128_shift(print_num, left_shift_bits, right_shift_bits); + btf_int128_print(jw, print_num, is_plain_text); } @@ -113,7 +182,7 @@ static void btf_dumper_int_bits(__u32 int_type, __u8 bit_offset, int total_bits_offset; /* bits_offset is at most 7. - * BTF_INT_OFFSET() cannot exceed 64 bits. + * BTF_INT_OFFSET() cannot exceed 128 bits. */ total_bits_offset = bit_offset + BTF_INT_OFFSET(int_type); data += BITS_ROUNDDOWN_BYTES(total_bits_offset); @@ -139,6 +208,11 @@ static int btf_dumper_int(const struct btf_type *t, __u8 bit_offset, return 0; } + if (nr_bits == 128) { + btf_int128_print(jw, data, is_plain_text); + return 0; + } + switch (BTF_INT_ENCODING(*int_type)) { case 0: if (BTF_INT_BITS(*int_type) == 64) diff --git a/tools/bpf/bpftool/cfg.c b/tools/bpf/bpftool/cfg.c index 31f0db41513f..3e21f994f262 100644 --- a/tools/bpf/bpftool/cfg.c +++ b/tools/bpf/bpftool/cfg.c @@ -157,6 +157,11 @@ static bool cfg_partition_funcs(struct cfg *cfg, struct bpf_insn *cur, return false; } +static bool is_jmp_insn(u8 code) +{ + return BPF_CLASS(code) == BPF_JMP || BPF_CLASS(code) == BPF_JMP32; +} + static bool func_partition_bb_head(struct func_node *func) { struct bpf_insn *cur, *end; @@ -170,7 +175,7 @@ static bool func_partition_bb_head(struct func_node *func) return true; for (; cur <= end; cur++) { - if (BPF_CLASS(cur->code) == BPF_JMP) { + if (is_jmp_insn(cur->code)) { u8 opcode = BPF_OP(cur->code); if (opcode == BPF_EXIT || opcode == BPF_CALL) @@ -296,7 +301,7 @@ static bool func_add_bb_edges(struct func_node *func) e->src = bb; insn = bb->tail; - if (BPF_CLASS(insn->code) != BPF_JMP || + if (!is_jmp_insn(insn->code) || BPF_OP(insn->code) == BPF_EXIT) { e->dst = bb_next(bb); e->flags |= EDGE_FLAG_FALLTHROUGH; diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c new file mode 100644 index 000000000000..d672d9086fff --- /dev/null +++ b/tools/bpf/bpftool/feature.c @@ -0,0 +1,764 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* Copyright (c) 2019 Netronome Systems, Inc. */ + +#include <ctype.h> +#include <errno.h> +#include <string.h> +#include <unistd.h> +#include <net/if.h> +#include <sys/utsname.h> +#include <sys/vfs.h> + +#include <linux/filter.h> +#include <linux/limits.h> + +#include <bpf.h> +#include <libbpf.h> + +#include "main.h" + +#ifndef PROC_SUPER_MAGIC +# define PROC_SUPER_MAGIC 0x9fa0 +#endif + +enum probe_component { + COMPONENT_UNSPEC, + COMPONENT_KERNEL, + COMPONENT_DEVICE, +}; + +#define BPF_HELPER_MAKE_ENTRY(name) [BPF_FUNC_ ## name] = "bpf_" # name +static const char * const helper_name[] = { + __BPF_FUNC_MAPPER(BPF_HELPER_MAKE_ENTRY) +}; + +#undef BPF_HELPER_MAKE_ENTRY + +/* Miscellaneous utility functions */ + +static bool check_procfs(void) +{ + struct statfs st_fs; + + if (statfs("/proc", &st_fs) < 0) + return false; + if ((unsigned long)st_fs.f_type != PROC_SUPER_MAGIC) + return false; + + return true; +} + +static void uppercase(char *str, size_t len) +{ + size_t i; + + for (i = 0; i < len && str[i] != '\0'; i++) + str[i] = toupper(str[i]); +} + +/* Printing utility functions */ + +static void +print_bool_feature(const char *feat_name, const char *plain_name, + const char *define_name, bool res, const char *define_prefix) +{ + if (json_output) + jsonw_bool_field(json_wtr, feat_name, res); + else if (define_prefix) + printf("#define %s%sHAVE_%s\n", define_prefix, + res ? "" : "NO_", define_name); + else + printf("%s is %savailable\n", plain_name, res ? "" : "NOT "); +} + +static void print_kernel_option(const char *name, const char *value) +{ + char *endptr; + int res; + + /* No support for C-style ouptut */ + + if (json_output) { + if (!value) { + jsonw_null_field(json_wtr, name); + return; + } + errno = 0; + res = strtol(value, &endptr, 0); + if (!errno && *endptr == '\n') + jsonw_int_field(json_wtr, name, res); + else + jsonw_string_field(json_wtr, name, value); + } else { + if (value) + printf("%s is set to %s\n", name, value); + else + printf("%s is not set\n", name); + } +} + +static void +print_start_section(const char *json_title, const char *plain_title, + const char *define_comment, const char *define_prefix) +{ + if (json_output) { + jsonw_name(json_wtr, json_title); + jsonw_start_object(json_wtr); + } else if (define_prefix) { + printf("%s\n", define_comment); + } else { + printf("%s\n", plain_title); + } +} + +static void +print_end_then_start_section(const char *json_title, const char *plain_title, + const char *define_comment, + const char *define_prefix) +{ + if (json_output) + jsonw_end_object(json_wtr); + else + printf("\n"); + + print_start_section(json_title, plain_title, define_comment, + define_prefix); +} + +/* Probing functions */ + +static int read_procfs(const char *path) +{ + char *endptr, *line = NULL; + size_t len = 0; + FILE *fd; + int res; + + fd = fopen(path, "r"); + if (!fd) + return -1; + + res = getline(&line, &len, fd); + fclose(fd); + if (res < 0) + return -1; + + errno = 0; + res = strtol(line, &endptr, 10); + if (errno || *line == '\0' || *endptr != '\n') + res = -1; + free(line); + + return res; +} + +static void probe_unprivileged_disabled(void) +{ + int res; + + /* No support for C-style ouptut */ + + res = read_procfs("/proc/sys/kernel/unprivileged_bpf_disabled"); + if (json_output) { + jsonw_int_field(json_wtr, "unprivileged_bpf_disabled", res); + } else { + switch (res) { + case 0: + printf("bpf() syscall for unprivileged users is enabled\n"); + break; + case 1: + printf("bpf() syscall restricted to privileged users\n"); + break; + case -1: + printf("Unable to retrieve required privileges for bpf() syscall\n"); + break; + default: + printf("bpf() syscall restriction has unknown value %d\n", res); + } + } +} + +static void probe_jit_enable(void) +{ + int res; + + /* No support for C-style ouptut */ + + res = read_procfs("/proc/sys/net/core/bpf_jit_enable"); + if (json_output) { + jsonw_int_field(json_wtr, "bpf_jit_enable", res); + } else { + switch (res) { + case 0: + printf("JIT compiler is disabled\n"); + break; + case 1: + printf("JIT compiler is enabled\n"); + break; + case 2: + printf("JIT compiler is enabled with debugging traces in kernel logs\n"); + break; + case -1: + printf("Unable to retrieve JIT-compiler status\n"); + break; + default: + printf("JIT-compiler status has unknown value %d\n", + res); + } + } +} + +static void probe_jit_harden(void) +{ + int res; + + /* No support for C-style ouptut */ + + res = read_procfs("/proc/sys/net/core/bpf_jit_harden"); + if (json_output) { + jsonw_int_field(json_wtr, "bpf_jit_harden", res); + } else { + switch (res) { + case 0: + printf("JIT compiler hardening is disabled\n"); + break; + case 1: + printf("JIT compiler hardening is enabled for unprivileged users\n"); + break; + case 2: + printf("JIT compiler hardening is enabled for all users\n"); + break; + case -1: + printf("Unable to retrieve JIT hardening status\n"); + break; + default: + printf("JIT hardening status has unknown value %d\n", + res); + } + } +} + +static void probe_jit_kallsyms(void) +{ + int res; + + /* No support for C-style ouptut */ + + res = read_procfs("/proc/sys/net/core/bpf_jit_kallsyms"); + if (json_output) { + jsonw_int_field(json_wtr, "bpf_jit_kallsyms", res); + } else { + switch (res) { + case 0: + printf("JIT compiler kallsyms exports are disabled\n"); + break; + case 1: + printf("JIT compiler kallsyms exports are enabled for root\n"); + break; + case -1: + printf("Unable to retrieve JIT kallsyms export status\n"); + break; + default: + printf("JIT kallsyms exports status has unknown value %d\n", res); + } + } +} + +static void probe_jit_limit(void) +{ + int res; + + /* No support for C-style ouptut */ + + res = read_procfs("/proc/sys/net/core/bpf_jit_limit"); + if (json_output) { + jsonw_int_field(json_wtr, "bpf_jit_limit", res); + } else { + switch (res) { + case -1: + printf("Unable to retrieve global memory limit for JIT compiler for unprivileged users\n"); + break; + default: + printf("Global memory limit for JIT compiler for unprivileged users is %d bytes\n", res); + } + } +} + +static char *get_kernel_config_option(FILE *fd, const char *option) +{ + size_t line_n = 0, optlen = strlen(option); + char *res, *strval, *line = NULL; + ssize_t n; + + rewind(fd); + while ((n = getline(&line, &line_n, fd)) > 0) { + if (strncmp(line, option, optlen)) + continue; + /* Check we have at least '=', value, and '\n' */ + if (strlen(line) < optlen + 3) + continue; + if (*(line + optlen) != '=') + continue; + + /* Trim ending '\n' */ + line[strlen(line) - 1] = '\0'; + + /* Copy and return config option value */ + strval = line + optlen + 1; + res = strdup(strval); + free(line); + return res; + } + free(line); + + return NULL; +} + +static void probe_kernel_image_config(void) +{ + static const char * const options[] = { + /* Enable BPF */ + "CONFIG_BPF", + /* Enable bpf() syscall */ + "CONFIG_BPF_SYSCALL", + /* Does selected architecture support eBPF JIT compiler */ + "CONFIG_HAVE_EBPF_JIT", + /* Compile eBPF JIT compiler */ + "CONFIG_BPF_JIT", + /* Avoid compiling eBPF interpreter (use JIT only) */ + "CONFIG_BPF_JIT_ALWAYS_ON", + + /* cgroups */ + "CONFIG_CGROUPS", + /* BPF programs attached to cgroups */ + "CONFIG_CGROUP_BPF", + /* bpf_get_cgroup_classid() helper */ + "CONFIG_CGROUP_NET_CLASSID", + /* bpf_skb_{,ancestor_}cgroup_id() helpers */ + "CONFIG_SOCK_CGROUP_DATA", + + /* Tracing: attach BPF to kprobes, tracepoints, etc. */ + "CONFIG_BPF_EVENTS", + /* Kprobes */ + "CONFIG_KPROBE_EVENTS", + /* Uprobes */ + "CONFIG_UPROBE_EVENTS", + /* Tracepoints */ + "CONFIG_TRACING", + /* Syscall tracepoints */ + "CONFIG_FTRACE_SYSCALLS", + /* bpf_override_return() helper support for selected arch */ + "CONFIG_FUNCTION_ERROR_INJECTION", + /* bpf_override_return() helper */ + "CONFIG_BPF_KPROBE_OVERRIDE", + + /* Network */ + "CONFIG_NET", + /* AF_XDP sockets */ + "CONFIG_XDP_SOCKETS", + /* BPF_PROG_TYPE_LWT_* and related helpers */ + "CONFIG_LWTUNNEL_BPF", + /* BPF_PROG_TYPE_SCHED_ACT, TC (traffic control) actions */ + "CONFIG_NET_ACT_BPF", + /* BPF_PROG_TYPE_SCHED_CLS, TC filters */ + "CONFIG_NET_CLS_BPF", + /* TC clsact qdisc */ + "CONFIG_NET_CLS_ACT", + /* Ingress filtering with TC */ + "CONFIG_NET_SCH_INGRESS", + /* bpf_skb_get_xfrm_state() helper */ + "CONFIG_XFRM", + /* bpf_get_route_realm() helper */ + "CONFIG_IP_ROUTE_CLASSID", + /* BPF_PROG_TYPE_LWT_SEG6_LOCAL and related helpers */ + "CONFIG_IPV6_SEG6_BPF", + /* BPF_PROG_TYPE_LIRC_MODE2 and related helpers */ + "CONFIG_BPF_LIRC_MODE2", + /* BPF stream parser and BPF socket maps */ + "CONFIG_BPF_STREAM_PARSER", + /* xt_bpf module for passing BPF programs to netfilter */ + "CONFIG_NETFILTER_XT_MATCH_BPF", + /* bpfilter back-end for iptables */ + "CONFIG_BPFILTER", + /* bpftilter module with "user mode helper" */ + "CONFIG_BPFILTER_UMH", + + /* test_bpf module for BPF tests */ + "CONFIG_TEST_BPF", + }; + char *value, *buf = NULL; + struct utsname utsn; + char path[PATH_MAX]; + size_t i, n; + ssize_t ret; + FILE *fd; + + if (uname(&utsn)) + goto no_config; + + snprintf(path, sizeof(path), "/boot/config-%s", utsn.release); + + fd = fopen(path, "r"); + if (!fd && errno == ENOENT) { + /* Some distributions put the config file at /proc/config, give + * it a try. + * Sometimes it is also at /proc/config.gz but we do not try + * this one for now, it would require linking against libz. + */ + fd = fopen("/proc/config", "r"); + } + if (!fd) { + p_info("skipping kernel config, can't open file: %s", + strerror(errno)); + goto no_config; + } + /* Sanity checks */ + ret = getline(&buf, &n, fd); + ret = getline(&buf, &n, fd); + if (!buf || !ret) { + p_info("skipping kernel config, can't read from file: %s", + strerror(errno)); + free(buf); + goto no_config; + } + if (strcmp(buf, "# Automatically generated file; DO NOT EDIT.\n")) { + p_info("skipping kernel config, can't find correct file"); + free(buf); + goto no_config; + } + free(buf); + + for (i = 0; i < ARRAY_SIZE(options); i++) { + value = get_kernel_config_option(fd, options[i]); + print_kernel_option(options[i], value); + free(value); + } + fclose(fd); + return; + +no_config: + for (i = 0; i < ARRAY_SIZE(options); i++) + print_kernel_option(options[i], NULL); +} + +static bool probe_bpf_syscall(const char *define_prefix) +{ + bool res; + + bpf_load_program(BPF_PROG_TYPE_UNSPEC, NULL, 0, NULL, 0, NULL, 0); + res = (errno != ENOSYS); + + print_bool_feature("have_bpf_syscall", + "bpf() syscall", + "BPF_SYSCALL", + res, define_prefix); + + return res; +} + +static void +probe_prog_type(enum bpf_prog_type prog_type, bool *supported_types, + const char *define_prefix, __u32 ifindex) +{ + char feat_name[128], plain_desc[128], define_name[128]; + const char *plain_comment = "eBPF program_type "; + size_t maxlen; + bool res; + + if (ifindex) + /* Only test offload-able program types */ + switch (prog_type) { + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_XDP: + break; + default: + return; + } + + res = bpf_probe_prog_type(prog_type, ifindex); + + supported_types[prog_type] |= res; + + maxlen = sizeof(plain_desc) - strlen(plain_comment) - 1; + if (strlen(prog_type_name[prog_type]) > maxlen) { + p_info("program type name too long"); + return; + } + + sprintf(feat_name, "have_%s_prog_type", prog_type_name[prog_type]); + sprintf(define_name, "%s_prog_type", prog_type_name[prog_type]); + uppercase(define_name, sizeof(define_name)); + sprintf(plain_desc, "%s%s", plain_comment, prog_type_name[prog_type]); + print_bool_feature(feat_name, plain_desc, define_name, res, + define_prefix); +} + +static void +probe_map_type(enum bpf_map_type map_type, const char *define_prefix, + __u32 ifindex) +{ + char feat_name[128], plain_desc[128], define_name[128]; + const char *plain_comment = "eBPF map_type "; + size_t maxlen; + bool res; + + res = bpf_probe_map_type(map_type, ifindex); + + maxlen = sizeof(plain_desc) - strlen(plain_comment) - 1; + if (strlen(map_type_name[map_type]) > maxlen) { + p_info("map type name too long"); + return; + } + + sprintf(feat_name, "have_%s_map_type", map_type_name[map_type]); + sprintf(define_name, "%s_map_type", map_type_name[map_type]); + uppercase(define_name, sizeof(define_name)); + sprintf(plain_desc, "%s%s", plain_comment, map_type_name[map_type]); + print_bool_feature(feat_name, plain_desc, define_name, res, + define_prefix); +} + +static void +probe_helpers_for_progtype(enum bpf_prog_type prog_type, bool supported_type, + const char *define_prefix, __u32 ifindex) +{ + const char *ptype_name = prog_type_name[prog_type]; + char feat_name[128]; + unsigned int id; + bool res; + + if (ifindex) + /* Only test helpers for offload-able program types */ + switch (prog_type) { + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_XDP: + break; + default: + return; + } + + if (json_output) { + sprintf(feat_name, "%s_available_helpers", ptype_name); + jsonw_name(json_wtr, feat_name); + jsonw_start_array(json_wtr); + } else if (!define_prefix) { + printf("eBPF helpers supported for program type %s:", + ptype_name); + } + + for (id = 1; id < ARRAY_SIZE(helper_name); id++) { + if (!supported_type) + res = false; + else + res = bpf_probe_helper(id, prog_type, ifindex); + + if (json_output) { + if (res) + jsonw_string(json_wtr, helper_name[id]); + } else if (define_prefix) { + printf("#define %sBPF__PROG_TYPE_%s__HELPER_%s %s\n", + define_prefix, ptype_name, helper_name[id], + res ? "1" : "0"); + } else { + if (res) + printf("\n\t- %s", helper_name[id]); + } + } + + if (json_output) + jsonw_end_array(json_wtr); + else if (!define_prefix) + printf("\n"); +} + +static int do_probe(int argc, char **argv) +{ + enum probe_component target = COMPONENT_UNSPEC; + const char *define_prefix = NULL; + bool supported_types[128] = {}; + __u32 ifindex = 0; + unsigned int i; + char *ifname; + + /* Detection assumes user has sufficient privileges (CAP_SYS_ADMIN). + * Let's approximate, and restrict usage to root user only. + */ + if (geteuid()) { + p_err("please run this command as root user"); + return -1; + } + + set_max_rlimit(); + + while (argc) { + if (is_prefix(*argv, "kernel")) { + if (target != COMPONENT_UNSPEC) { + p_err("component to probe already specified"); + return -1; + } + target = COMPONENT_KERNEL; + NEXT_ARG(); + } else if (is_prefix(*argv, "dev")) { + NEXT_ARG(); + + if (target != COMPONENT_UNSPEC || ifindex) { + p_err("component to probe already specified"); + return -1; + } + if (!REQ_ARGS(1)) + return -1; + + target = COMPONENT_DEVICE; + ifname = GET_ARG(); + ifindex = if_nametoindex(ifname); + if (!ifindex) { + p_err("unrecognized netdevice '%s': %s", ifname, + strerror(errno)); + return -1; + } + } else if (is_prefix(*argv, "macros") && !define_prefix) { + define_prefix = ""; + NEXT_ARG(); + } else if (is_prefix(*argv, "prefix")) { + if (!define_prefix) { + p_err("'prefix' argument can only be use after 'macros'"); + return -1; + } + if (strcmp(define_prefix, "")) { + p_err("'prefix' already defined"); + return -1; + } + NEXT_ARG(); + + if (!REQ_ARGS(1)) + return -1; + define_prefix = GET_ARG(); + } else { + p_err("expected no more arguments, 'kernel', 'dev', 'macros' or 'prefix', got: '%s'?", + *argv); + return -1; + } + } + + if (json_output) { + define_prefix = NULL; + jsonw_start_object(json_wtr); + } + + switch (target) { + case COMPONENT_KERNEL: + case COMPONENT_UNSPEC: + if (define_prefix) + break; + + print_start_section("system_config", + "Scanning system configuration...", + NULL, /* define_comment never used here */ + NULL); /* define_prefix always NULL here */ + if (check_procfs()) { + probe_unprivileged_disabled(); + probe_jit_enable(); + probe_jit_harden(); + probe_jit_kallsyms(); + probe_jit_limit(); + } else { + p_info("/* procfs not mounted, skipping related probes */"); + } + probe_kernel_image_config(); + if (json_output) + jsonw_end_object(json_wtr); + else + printf("\n"); + break; + default: + break; + } + + print_start_section("syscall_config", + "Scanning system call availability...", + "/*** System call availability ***/", + define_prefix); + + if (!probe_bpf_syscall(define_prefix)) + /* bpf() syscall unavailable, don't probe other BPF features */ + goto exit_close_json; + + print_end_then_start_section("program_types", + "Scanning eBPF program types...", + "/*** eBPF program types ***/", + define_prefix); + + for (i = BPF_PROG_TYPE_UNSPEC + 1; i < ARRAY_SIZE(prog_type_name); i++) + probe_prog_type(i, supported_types, define_prefix, ifindex); + + print_end_then_start_section("map_types", + "Scanning eBPF map types...", + "/*** eBPF map types ***/", + define_prefix); + + for (i = BPF_MAP_TYPE_UNSPEC + 1; i < map_type_name_size; i++) + probe_map_type(i, define_prefix, ifindex); + + print_end_then_start_section("helpers", + "Scanning eBPF helper functions...", + "/*** eBPF helper functions ***/", + define_prefix); + + if (define_prefix) + printf("/*\n" + " * Use %sHAVE_PROG_TYPE_HELPER(prog_type_name, helper_name)\n" + " * to determine if <helper_name> is available for <prog_type_name>,\n" + " * e.g.\n" + " * #if %sHAVE_PROG_TYPE_HELPER(xdp, bpf_redirect)\n" + " * // do stuff with this helper\n" + " * #elif\n" + " * // use a workaround\n" + " * #endif\n" + " */\n" + "#define %sHAVE_PROG_TYPE_HELPER(prog_type, helper) \\\n" + " %sBPF__PROG_TYPE_ ## prog_type ## __HELPER_ ## helper\n", + define_prefix, define_prefix, define_prefix, + define_prefix); + for (i = BPF_PROG_TYPE_UNSPEC + 1; i < ARRAY_SIZE(prog_type_name); i++) + probe_helpers_for_progtype(i, supported_types[i], + define_prefix, ifindex); + +exit_close_json: + if (json_output) { + /* End current "section" of probes */ + jsonw_end_object(json_wtr); + /* End root object */ + jsonw_end_object(json_wtr); + } + + return 0; +} + +static int do_help(int argc, char **argv) +{ + if (json_output) { + jsonw_null(json_wtr); + return 0; + } + + fprintf(stderr, + "Usage: %s %s probe [COMPONENT] [macros [prefix PREFIX]]\n" + " %s %s help\n" + "\n" + " COMPONENT := { kernel | dev NAME }\n" + "", + bin_name, argv[-2], bin_name, argv[-2]); + + return 0; +} + +static const struct cmd cmds[] = { + { "probe", do_probe }, + { "help", do_help }, + { 0 } +}; + +int do_feature(int argc, char **argv) +{ + return cmd_select(cmds, argc, argv, do_help); +} diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c index f44a1c2c4ea0..a9d5e9e6a732 100644 --- a/tools/bpf/bpftool/main.c +++ b/tools/bpf/bpftool/main.c @@ -56,7 +56,7 @@ static int do_help(int argc, char **argv) " %s batch file FILE\n" " %s version\n" "\n" - " OBJECT := { prog | map | cgroup | perf | net }\n" + " OBJECT := { prog | map | cgroup | perf | net | feature }\n" " " HELP_SPEC_OPTIONS "\n" "", bin_name, bin_name, bin_name); @@ -187,6 +187,7 @@ static const struct cmd cmds[] = { { "cgroup", do_cgroup }, { "perf", do_perf }, { "net", do_net }, + { "feature", do_feature }, { "version", do_version }, { 0 } }; diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 052c91d4dc55..d7dd84d3c660 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -75,6 +75,9 @@ static const char * const prog_type_name[] = { [BPF_PROG_TYPE_FLOW_DISSECTOR] = "flow_dissector", }; +extern const char * const map_type_name[]; +extern const size_t map_type_name_size; + enum bpf_obj_type { BPF_OBJ_UNKNOWN, BPF_OBJ_PROG, @@ -145,6 +148,7 @@ int do_cgroup(int argc, char **arg); int do_perf(int argc, char **arg); int do_net(int argc, char **arg); int do_tracelog(int argc, char **arg); +int do_feature(int argc, char **argv); int parse_u32_arg(int *argc, char ***argv, __u32 *val, const char *what); int prog_parse_fd(int *argc, char ***argv); diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index 1ef1ee2280a2..e0c650d91784 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -21,7 +21,7 @@ #include "json_writer.h" #include "main.h" -static const char * const map_type_name[] = { +const char * const map_type_name[] = { [BPF_MAP_TYPE_UNSPEC] = "unspec", [BPF_MAP_TYPE_HASH] = "hash", [BPF_MAP_TYPE_ARRAY] = "array", @@ -48,6 +48,8 @@ static const char * const map_type_name[] = { [BPF_MAP_TYPE_STACK] = "stack", }; +const size_t map_type_name_size = ARRAY_SIZE(map_type_name); + static bool map_is_per_cpu(__u32 type) { return type == BPF_MAP_TYPE_PERCPU_HASH || @@ -285,16 +287,21 @@ static void print_entry_plain(struct bpf_map_info *info, unsigned char *key, single_line = info->key_size + info->value_size <= 24 && !break_names; - printf("key:%c", break_names ? '\n' : ' '); - fprint_hex(stdout, key, info->key_size, " "); + if (info->key_size) { + printf("key:%c", break_names ? '\n' : ' '); + fprint_hex(stdout, key, info->key_size, " "); - printf(single_line ? " " : "\n"); + printf(single_line ? " " : "\n"); + } - printf("value:%c", break_names ? '\n' : ' '); - if (value) - fprint_hex(stdout, value, info->value_size, " "); - else - printf("<no entry>"); + if (info->value_size) { + printf("value:%c", break_names ? '\n' : ' '); + if (value) + fprint_hex(stdout, value, info->value_size, + " "); + else + printf("<no entry>"); + } printf("\n"); } else { @@ -303,19 +310,23 @@ static void print_entry_plain(struct bpf_map_info *info, unsigned char *key, n = get_possible_cpus(); step = round_up(info->value_size, 8); - printf("key:\n"); - fprint_hex(stdout, key, info->key_size, " "); - printf("\n"); - for (i = 0; i < n; i++) { - printf("value (CPU %02d):%c", - i, info->value_size > 16 ? '\n' : ' '); - if (value) - fprint_hex(stdout, value + i * step, - info->value_size, " "); - else - printf("<no entry>"); + if (info->key_size) { + printf("key:\n"); + fprint_hex(stdout, key, info->key_size, " "); printf("\n"); } + if (info->value_size) { + for (i = 0; i < n; i++) { + printf("value (CPU %02d):%c", + i, info->value_size > 16 ? '\n' : ' '); + if (value) + fprint_hex(stdout, value + i * step, + info->value_size, " "); + else + printf("<no entry>"); + printf("\n"); + } + } } } @@ -429,6 +440,9 @@ static int parse_elem(char **argv, struct bpf_map_info *info, p_err("not enough value arguments for map of progs"); return -1; } + if (is_prefix(*argv, "id")) + p_info("Warning: updating program array via MAP_ID, make sure this map is kept open\n" + " by some process or pinned otherwise update will be lost"); fd = prog_parse_fd(&argc, &argv); if (fd < 0) @@ -794,6 +808,32 @@ exit_free: return err; } +static int alloc_key_value(struct bpf_map_info *info, void **key, void **value) +{ + *key = NULL; + *value = NULL; + + if (info->key_size) { + *key = malloc(info->key_size); + if (!*key) { + p_err("key mem alloc failed"); + return -1; + } + } + + if (info->value_size) { + *value = alloc_value(info); + if (!*value) { + p_err("value mem alloc failed"); + free(*key); + *key = NULL; + return -1; + } + } + + return 0; +} + static int do_update(int argc, char **argv) { struct bpf_map_info info = {}; @@ -810,13 +850,9 @@ static int do_update(int argc, char **argv) if (fd < 0) return -1; - key = malloc(info.key_size); - value = alloc_value(&info); - if (!key || !value) { - p_err("mem alloc failed"); - err = -1; + err = alloc_key_value(&info, &key, &value); + if (err) goto exit_free; - } err = parse_elem(argv, &info, key, value, info.key_size, info.value_size, &flags, &value_fd); @@ -841,12 +877,51 @@ exit_free: return err; } +static void print_key_value(struct bpf_map_info *info, void *key, + void *value) +{ + json_writer_t *btf_wtr; + struct btf *btf = NULL; + int err; + + err = btf__get_from_id(info->btf_id, &btf); + if (err) { + p_err("failed to get btf"); + return; + } + + if (json_output) { + print_entry_json(info, key, value, btf); + } else if (btf) { + /* if here json_wtr wouldn't have been initialised, + * so let's create separate writer for btf + */ + btf_wtr = get_btf_writer(); + if (!btf_wtr) { + p_info("failed to create json writer for btf. falling back to plain output"); + btf__free(btf); + btf = NULL; + print_entry_plain(info, key, value); + } else { + struct btf_dumper d = { + .btf = btf, + .jw = btf_wtr, + .is_plain_text = true, + }; + + do_dump_btf(&d, info, key, value); + jsonw_destroy(&btf_wtr); + } + } else { + print_entry_plain(info, key, value); + } + btf__free(btf); +} + static int do_lookup(int argc, char **argv) { struct bpf_map_info info = {}; __u32 len = sizeof(info); - json_writer_t *btf_wtr; - struct btf *btf = NULL; void *key, *value; int err; int fd; @@ -858,13 +933,9 @@ static int do_lookup(int argc, char **argv) if (fd < 0) return -1; - key = malloc(info.key_size); - value = alloc_value(&info); - if (!key || !value) { - p_err("mem alloc failed"); - err = -1; + err = alloc_key_value(&info, &key, &value); + if (err) goto exit_free; - } err = parse_elem(argv, &info, key, NULL, info.key_size, 0, NULL, NULL); if (err) @@ -888,43 +959,12 @@ static int do_lookup(int argc, char **argv) } /* here means bpf_map_lookup_elem() succeeded */ - err = btf__get_from_id(info.btf_id, &btf); - if (err) { - p_err("failed to get btf"); - goto exit_free; - } - - if (json_output) { - print_entry_json(&info, key, value, btf); - } else if (btf) { - /* if here json_wtr wouldn't have been initialised, - * so let's create separate writer for btf - */ - btf_wtr = get_btf_writer(); - if (!btf_wtr) { - p_info("failed to create json writer for btf. falling back to plain output"); - btf__free(btf); - btf = NULL; - print_entry_plain(&info, key, value); - } else { - struct btf_dumper d = { - .btf = btf, - .jw = btf_wtr, - .is_plain_text = true, - }; - - do_dump_btf(&d, &info, key, value); - jsonw_destroy(&btf_wtr); - } - } else { - print_entry_plain(&info, key, value); - } + print_key_value(&info, key, value); exit_free: free(key); free(value); close(fd); - btf__free(btf); return err; } @@ -1137,6 +1177,49 @@ static int do_create(int argc, char **argv) return 0; } +static int do_pop_dequeue(int argc, char **argv) +{ + struct bpf_map_info info = {}; + __u32 len = sizeof(info); + void *key, *value; + int err; + int fd; + + if (argc < 2) + usage(); + + fd = map_parse_fd_and_info(&argc, &argv, &info, &len); + if (fd < 0) + return -1; + + err = alloc_key_value(&info, &key, &value); + if (err) + goto exit_free; + + err = bpf_map_lookup_and_delete_elem(fd, key, value); + if (err) { + if (errno == ENOENT) { + if (json_output) + jsonw_null(json_wtr); + else + printf("Error: empty map\n"); + } else { + p_err("pop failed: %s", strerror(errno)); + } + + goto exit_free; + } + + print_key_value(&info, key, value); + +exit_free: + free(key); + free(value); + close(fd); + + return err; +} + static int do_help(int argc, char **argv) { if (json_output) { @@ -1150,12 +1233,17 @@ static int do_help(int argc, char **argv) " entries MAX_ENTRIES name NAME [flags FLAGS] \\\n" " [dev NAME]\n" " %s %s dump MAP\n" - " %s %s update MAP key DATA value VALUE [UPDATE_FLAGS]\n" - " %s %s lookup MAP key DATA\n" + " %s %s update MAP [key DATA] [value VALUE] [UPDATE_FLAGS]\n" + " %s %s lookup MAP [key DATA]\n" " %s %s getnext MAP [key DATA]\n" " %s %s delete MAP key DATA\n" " %s %s pin MAP FILE\n" " %s %s event_pipe MAP [cpu N index M]\n" + " %s %s peek MAP\n" + " %s %s push MAP value VALUE\n" + " %s %s pop MAP\n" + " %s %s enqueue MAP value VALUE\n" + " %s %s dequeue MAP\n" " %s %s help\n" "\n" " " HELP_SPEC_MAP "\n" @@ -1173,7 +1261,8 @@ static int do_help(int argc, char **argv) bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], - bin_name, argv[-2]); + bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2], + bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2]); return 0; } @@ -1190,6 +1279,11 @@ static const struct cmd cmds[] = { { "pin", do_pin }, { "event_pipe", do_event_pipe }, { "create", do_create }, + { "peek", do_lookup }, + { "push", do_update }, + { "enqueue", do_update }, + { "pop", do_pop_dequeue }, + { "dequeue", do_pop_dequeue }, { 0 } }; diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index b54ed82b9589..8ef80d65a474 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -214,6 +214,10 @@ static void print_prog_json(struct bpf_prog_info *info, int fd) info->tag[4], info->tag[5], info->tag[6], info->tag[7]); jsonw_bool_field(json_wtr, "gpl_compatible", info->gpl_compatible); + if (info->run_time_ns) { + jsonw_uint_field(json_wtr, "run_time_ns", info->run_time_ns); + jsonw_uint_field(json_wtr, "run_cnt", info->run_cnt); + } print_dev_json(info->ifindex, info->netns_dev, info->netns_ino); @@ -277,6 +281,9 @@ static void print_prog_plain(struct bpf_prog_info *info, int fd) fprint_hex(stdout, info->tag, BPF_TAG_SIZE, ""); print_dev_plain(info->ifindex, info->netns_dev, info->netns_ino); printf("%s", info->gpl_compatible ? " gpl" : ""); + if (info->run_time_ns) + printf(" run_time_ns %lld run_cnt %lld", + info->run_time_ns, info->run_cnt); printf("\n"); if (info->load_time) { @@ -931,10 +938,9 @@ static int load_with_options(int argc, char **argv, bool first_prog_only) err = libbpf_prog_type_by_name(type, &attr.prog_type, &expected_attach_type); free(type); - if (err < 0) { - p_err("unknown program type '%s'", *argv); + if (err < 0) goto err_free_reuse_maps; - } + NEXT_ARG(); } else if (is_prefix(*argv, "map")) { void *new_map_replace; @@ -1029,11 +1035,8 @@ static int load_with_options(int argc, char **argv, bool first_prog_only) err = libbpf_prog_type_by_name(sec_name, &prog_type, &expected_attach_type); - if (err < 0) { - p_err("failed to guess program type based on section name %s\n", - sec_name); + if (err < 0) goto err_close_obj; - } } bpf_program__set_ifindex(pos, ifindex); @@ -1050,7 +1053,7 @@ static int load_with_options(int argc, char **argv, bool first_prog_only) j = 0; while (j < old_map_fds && map_replace[j].name) { i = 0; - bpf_map__for_each(map, obj) { + bpf_object__for_each_map(map, obj) { if (!strcmp(bpf_map__name(map), map_replace[j].name)) { map_replace[j].idx = i; break; @@ -1071,7 +1074,7 @@ static int load_with_options(int argc, char **argv, bool first_prog_only) /* Set ifindex and name reuse */ j = 0; idx = 0; - bpf_map__for_each(map, obj) { + bpf_object__for_each_map(map, obj) { if (!bpf_map__is_offload_neutral(map)) bpf_map__set_ifindex(map, ifindex); @@ -1203,7 +1206,7 @@ static int do_help(int argc, char **argv) " cgroup/bind4 | cgroup/bind6 | cgroup/post_bind4 |\n" " cgroup/post_bind6 | cgroup/connect4 | cgroup/connect6 |\n" " cgroup/sendmsg4 | cgroup/sendmsg6 }\n" - " ATTACH_TYPE := { msg_verdict | skb_verdict | skb_parse |\n" + " ATTACH_TYPE := { msg_verdict | stream_verdict | stream_parser |\n" " flow_dissector }\n" " " HELP_SPEC_OPTIONS "\n" "", diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature index 5467c6bf9ceb..61e46d54a67c 100644 --- a/tools/build/Makefile.feature +++ b/tools/build/Makefile.feature @@ -53,10 +53,6 @@ FEATURE_TESTS_BASIC := \ libslang \ libcrypto \ libunwind \ - libunwind-x86 \ - libunwind-x86_64 \ - libunwind-arm \ - libunwind-aarch64 \ pthread-attr-setaffinity-np \ pthread-barrier \ reallocarray \ @@ -70,7 +66,6 @@ FEATURE_TESTS_BASIC := \ sched_getcpu \ sdt \ setns \ - libopencsd \ libaio # FEATURE_TESTS_BASIC + FEATURE_TESTS_EXTRA is the complete list @@ -84,6 +79,11 @@ FEATURE_TESTS_EXTRA := \ libbabeltrace \ libbfd-liberty \ libbfd-liberty-z \ + libopencsd \ + libunwind-x86 \ + libunwind-x86_64 \ + libunwind-arm \ + libunwind-aarch64 \ libunwind-debug-frame \ libunwind-debug-frame-arm \ libunwind-debug-frame-aarch64 \ diff --git a/tools/build/feature/test-all.c b/tools/build/feature/test-all.c index 20cdaa4fc112..e903b86b742f 100644 --- a/tools/build/feature/test-all.c +++ b/tools/build/feature/test-all.c @@ -170,14 +170,14 @@ # include "test-setns.c" #undef main -#define main main_test_libopencsd -# include "test-libopencsd.c" -#undef main - #define main main_test_libaio # include "test-libaio.c" #undef main +#define main main_test_reallocarray +# include "test-reallocarray.c" +#undef main + int main(int argc, char *argv[]) { main_test_libpython(); @@ -217,8 +217,8 @@ int main(int argc, char *argv[]) main_test_sched_getcpu(); main_test_sdt(); main_test_setns(); - main_test_libopencsd(); main_test_libaio(); + main_test_reallocarray(); return 0; } diff --git a/tools/build/feature/test-get_current_dir_name.c b/tools/build/feature/test-get_current_dir_name.c index 573000f93212..c3c201691b4f 100644 --- a/tools/build/feature/test-get_current_dir_name.c +++ b/tools/build/feature/test-get_current_dir_name.c @@ -8,3 +8,4 @@ int main(void) free(get_current_dir_name()); return 0; } +#undef _GNU_SOURCE diff --git a/tools/build/feature/test-libpython.c b/tools/build/feature/test-libpython.c index 0c1641b0d9a7..371c9113e49d 100644 --- a/tools/build/feature/test-libpython.c +++ b/tools/build/feature/test-libpython.c @@ -7,3 +7,4 @@ int main(void) return 0; } +#undef _GNU_SOURCE diff --git a/tools/build/feature/test-reallocarray.c b/tools/build/feature/test-reallocarray.c index 8170de35150d..8f6743e31da7 100644 --- a/tools/build/feature/test-reallocarray.c +++ b/tools/build/feature/test-reallocarray.c @@ -6,3 +6,5 @@ int main(void) { return !!reallocarray(NULL, 1, 1); } + +#undef _GNU_SOURCE diff --git a/tools/build/feature/test-sched_getcpu.c b/tools/build/feature/test-sched_getcpu.c index e448deb4124c..48995ac7911e 100644 --- a/tools/build/feature/test-sched_getcpu.c +++ b/tools/build/feature/test-sched_getcpu.c @@ -8,3 +8,5 @@ int main(void) { return sched_getcpu(); } + +#undef _GNU_SOURCE diff --git a/tools/build/feature/test-setns.c b/tools/build/feature/test-setns.c index 1f714d2a658b..4a1581ae7a55 100644 --- a/tools/build/feature/test-setns.c +++ b/tools/build/feature/test-setns.c @@ -5,3 +5,4 @@ int main(void) { return setns(0, 0); } +#undef _GNU_SOURCE diff --git a/tools/firmware/ihex2fw.c b/tools/firmware/ihex2fw.c index b58dd061e978..8925b60e51f5 100644 --- a/tools/firmware/ihex2fw.c +++ b/tools/firmware/ihex2fw.c @@ -24,6 +24,10 @@ #include <getopt.h> +#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) +#define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1) +#define ALIGN(x, a) __ALIGN_KERNEL((x), (a)) + struct ihex_binrec { struct ihex_binrec *next; /* not part of the real data structure */ uint32_t addr; @@ -131,6 +135,7 @@ int main(int argc, char **argv) static int process_ihex(uint8_t *data, ssize_t size) { struct ihex_binrec *record; + size_t record_size; uint32_t offset = 0; uint32_t data32; uint8_t type, crc = 0, crcbyte = 0; @@ -157,12 +162,13 @@ next_record: len <<= 8; len += hex(data + i, &crc); i += 2; } - record = malloc((sizeof (*record) + len + 3) & ~3); + record_size = ALIGN(sizeof(*record) + len, 4); + record = malloc(record_size); if (!record) { fprintf(stderr, "out of memory for records\n"); return -ENOMEM; } - memset(record, 0, (sizeof(*record) + len + 3) & ~3); + memset(record, 0, record_size); record->len = len; /* now check if we have enough data to read everything */ @@ -259,13 +265,18 @@ static void file_record(struct ihex_binrec *record) *p = record; } +static uint16_t ihex_binrec_size(struct ihex_binrec *p) +{ + return p->len + sizeof(p->addr) + sizeof(p->len); +} + static int output_records(int outfd) { unsigned char zeroes[6] = {0, 0, 0, 0, 0, 0}; struct ihex_binrec *p = records; while (p) { - uint16_t writelen = (p->len + 9) & ~3; + uint16_t writelen = ALIGN(ihex_binrec_size(p), 4); p->addr = htonl(p->addr); p->len = htons(p->len); diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h index af55acf73e75..cce0b02c0e28 100644 --- a/tools/include/linux/filter.h +++ b/tools/include/linux/filter.h @@ -199,6 +199,16 @@ .off = OFF, \ .imm = 0 }) +/* Like BPF_JMP_REG, but with 32-bit wide operands for comparison. */ + +#define BPF_JMP32_REG(OP, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP32 | BPF_OP(OP) | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + /* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */ #define BPF_JMP_IMM(OP, DST, IMM, OFF) \ @@ -209,6 +219,16 @@ .off = OFF, \ .imm = IMM }) +/* Like BPF_JMP_IMM, but with 32-bit wide operands for comparison. */ + +#define BPF_JMP32_IMM(OP, DST, IMM, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP32 | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + /* Unconditional jumps, goto pc + off16 */ #define BPF_JMP_A(OFF) \ diff --git a/tools/include/linux/numa.h b/tools/include/linux/numa.h new file mode 100644 index 000000000000..110b0e5d0fb0 --- /dev/null +++ b/tools/include/linux/numa.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_NUMA_H +#define _LINUX_NUMA_H + + +#ifdef CONFIG_NODES_SHIFT +#define NODES_SHIFT CONFIG_NODES_SHIFT +#else +#define NODES_SHIFT 0 +#endif + +#define MAX_NUMNODES (1 << NODES_SHIFT) + +#define NUMA_NO_NODE (-1) + +#endif /* _LINUX_NUMA_H */ diff --git a/tools/include/linux/rbtree.h b/tools/include/linux/rbtree.h index 112582253dd0..8e9ed4786269 100644 --- a/tools/include/linux/rbtree.h +++ b/tools/include/linux/rbtree.h @@ -43,13 +43,28 @@ struct rb_root { struct rb_node *rb_node; }; +/* + * Leftmost-cached rbtrees. + * + * We do not cache the rightmost node based on footprint + * size vs number of potential users that could benefit + * from O(1) rb_last(). Just not worth it, users that want + * this feature can always implement the logic explicitly. + * Furthermore, users that want to cache both pointers may + * find it a bit asymmetric, but that's ok. + */ +struct rb_root_cached { + struct rb_root rb_root; + struct rb_node *rb_leftmost; +}; #define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3)) #define RB_ROOT (struct rb_root) { NULL, } +#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL } #define rb_entry(ptr, type, member) container_of(ptr, type, member) -#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) +#define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL) /* 'empty' nodes are nodes that are known not to be inserted in an rbtree */ #define RB_EMPTY_NODE(node) \ @@ -68,6 +83,12 @@ extern struct rb_node *rb_prev(const struct rb_node *); extern struct rb_node *rb_first(const struct rb_root *); extern struct rb_node *rb_last(const struct rb_root *); +extern void rb_insert_color_cached(struct rb_node *, + struct rb_root_cached *, bool); +extern void rb_erase_cached(struct rb_node *node, struct rb_root_cached *); +/* Same as rb_first(), but O(1) */ +#define rb_first_cached(root) (root)->rb_leftmost + /* Postorder iteration - always visit the parent after its children */ extern struct rb_node *rb_first_postorder(const struct rb_root *); extern struct rb_node *rb_next_postorder(const struct rb_node *); @@ -75,6 +96,8 @@ extern struct rb_node *rb_next_postorder(const struct rb_node *); /* Fast replacement of a single node without remove/rebalance/add/rebalance */ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); +extern void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new, + struct rb_root_cached *root); static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) @@ -90,12 +113,29 @@ static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, ____ptr ? rb_entry(____ptr, type, member) : NULL; \ }) - -/* - * Handy for checking that we are not deleting an entry that is - * already in a list, found in block/{blk-throttle,cfq-iosched}.c, - * probably should be moved to lib/rbtree.c... +/** + * rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of + * given type allowing the backing memory of @pos to be invalidated + * + * @pos: the 'type *' to use as a loop cursor. + * @n: another 'type *' to use as temporary storage + * @root: 'rb_root *' of the rbtree. + * @field: the name of the rb_node field within 'type'. + * + * rbtree_postorder_for_each_entry_safe() provides a similar guarantee as + * list_for_each_entry_safe() and allows the iteration to continue independent + * of changes to @pos by the body of the loop. + * + * Note, however, that it cannot handle other modifications that re-order the + * rbtree it is iterating over. This includes calling rb_erase() on @pos, as + * rb_erase() may rebalance the tree, causing us to miss some nodes. */ +#define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \ + for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \ + pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \ + typeof(*pos), field); 1; }); \ + pos = n) + static inline void rb_erase_init(struct rb_node *n, struct rb_root *root) { rb_erase(n, root); diff --git a/tools/include/linux/rbtree_augmented.h b/tools/include/linux/rbtree_augmented.h index 43be941db695..d008e1404580 100644 --- a/tools/include/linux/rbtree_augmented.h +++ b/tools/include/linux/rbtree_augmented.h @@ -44,7 +44,9 @@ struct rb_augment_callbacks { void (*rotate)(struct rb_node *old, struct rb_node *new); }; -extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, +extern void __rb_insert_augmented(struct rb_node *node, + struct rb_root *root, + bool newleft, struct rb_node **leftmost, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); /* * Fixup the rbtree and update the augmented information when rebalancing. @@ -60,7 +62,16 @@ static inline void rb_insert_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { - __rb_insert_augmented(node, root, augment->rotate); + __rb_insert_augmented(node, root, false, NULL, augment->rotate); +} + +static inline void +rb_insert_augmented_cached(struct rb_node *node, + struct rb_root_cached *root, bool newleft, + const struct rb_augment_callbacks *augment) +{ + __rb_insert_augmented(node, &root->rb_root, + newleft, &root->rb_leftmost, augment->rotate); } #define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \ @@ -93,7 +104,9 @@ rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ old->rbaugmented = rbcompute(old); \ } \ rbstatic const struct rb_augment_callbacks rbname = { \ - rbname ## _propagate, rbname ## _copy, rbname ## _rotate \ + .propagate = rbname ## _propagate, \ + .copy = rbname ## _copy, \ + .rotate = rbname ## _rotate \ }; @@ -126,11 +139,11 @@ __rb_change_child(struct rb_node *old, struct rb_node *new, { if (parent) { if (parent->rb_left == old) - parent->rb_left = new; + WRITE_ONCE(parent->rb_left, new); else - parent->rb_right = new; + WRITE_ONCE(parent->rb_right, new); } else - root->rb_node = new; + WRITE_ONCE(root->rb_node, new); } extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root, @@ -138,12 +151,17 @@ extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root, static __always_inline struct rb_node * __rb_erase_augmented(struct rb_node *node, struct rb_root *root, + struct rb_node **leftmost, const struct rb_augment_callbacks *augment) { - struct rb_node *child = node->rb_right, *tmp = node->rb_left; + struct rb_node *child = node->rb_right; + struct rb_node *tmp = node->rb_left; struct rb_node *parent, *rebalance; unsigned long pc; + if (leftmost && node == *leftmost) + *leftmost = rb_next(node); + if (!tmp) { /* * Case 1: node to erase has no more than 1 child (easy!) @@ -170,6 +188,7 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root, tmp = parent; } else { struct rb_node *successor = child, *child2; + tmp = child->rb_left; if (!tmp) { /* @@ -183,6 +202,7 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root, */ parent = successor; child2 = successor->rb_right; + augment->copy(node, successor); } else { /* @@ -204,19 +224,23 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root, successor = tmp; tmp = tmp->rb_left; } while (tmp); - parent->rb_left = child2 = successor->rb_right; - successor->rb_right = child; + child2 = successor->rb_right; + WRITE_ONCE(parent->rb_left, child2); + WRITE_ONCE(successor->rb_right, child); rb_set_parent(child, successor); + augment->copy(node, successor); augment->propagate(parent, successor); } - successor->rb_left = tmp = node->rb_left; + tmp = node->rb_left; + WRITE_ONCE(successor->rb_left, tmp); rb_set_parent(tmp, successor); pc = node->__rb_parent_color; tmp = __rb_parent(pc); __rb_change_child(node, successor, tmp, root); + if (child2) { successor->__rb_parent_color = pc; rb_set_parent_color(child2, parent, RB_BLACK); @@ -237,9 +261,21 @@ static __always_inline void rb_erase_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { - struct rb_node *rebalance = __rb_erase_augmented(node, root, augment); + struct rb_node *rebalance = __rb_erase_augmented(node, root, + NULL, augment); if (rebalance) __rb_erase_color(rebalance, root, augment->rotate); } -#endif /* _TOOLS_LINUX_RBTREE_AUGMENTED_H */ +static __always_inline void +rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root, + const struct rb_augment_callbacks *augment) +{ + struct rb_node *rebalance = __rb_erase_augmented(node, &root->rb_root, + &root->rb_leftmost, + augment); + if (rebalance) + __rb_erase_color(rebalance, &root->rb_root, augment->rotate); +} + +#endif /* _TOOLS_LINUX_RBTREE_AUGMENTED_H */ diff --git a/tools/testing/selftests/rcutorture/bin/nolibc.h b/tools/include/nolibc/nolibc.h index f98f5b92d3eb..1708e9f9f8aa 100644 --- a/tools/testing/selftests/rcutorture/bin/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -3,7 +3,85 @@ * Copyright (C) 2017-2018 Willy Tarreau <w@1wt.eu> */ -/* some archs (at least aarch64) don't expose the regular syscalls anymore by +/* + * This file is designed to be used as a libc alternative for minimal programs + * with very limited requirements. It consists of a small number of syscall and + * type definitions, and the minimal startup code needed to call main(). + * All syscalls are declared as static functions so that they can be optimized + * away by the compiler when not used. + * + * Syscalls are split into 3 levels: + * - The lower level is the arch-specific syscall() definition, consisting in + * assembly code in compound expressions. These are called my_syscall0() to + * my_syscall6() depending on the number of arguments. The MIPS + * implementation is limited to 5 arguments. All input arguments are cast + * to a long stored in a register. These expressions always return the + * syscall's return value as a signed long value which is often either a + * pointer or the negated errno value. + * + * - The second level is mostly architecture-independent. It is made of + * static functions called sys_<name>() which rely on my_syscallN() + * depending on the syscall definition. These functions are responsible + * for exposing the appropriate types for the syscall arguments (int, + * pointers, etc) and for setting the appropriate return type (often int). + * A few of them are architecture-specific because the syscalls are not all + * mapped exactly the same among architectures. For example, some archs do + * not implement select() and need pselect6() instead, so the sys_select() + * function will have to abstract this. + * + * - The third level is the libc call definition. It exposes the lower raw + * sys_<name>() calls in a way that looks like what a libc usually does, + * takes care of specific input values, and of setting errno upon error. + * There can be minor variations compared to standard libc calls. For + * example the open() call always takes 3 args here. + * + * The errno variable is declared static and unused. This way it can be + * optimized away if not used. However this means that a program made of + * multiple C files may observe different errno values (one per C file). For + * the type of programs this project targets it usually is not a problem. The + * resulting program may even be reduced by defining the NOLIBC_IGNORE_ERRNO + * macro, in which case the errno value will never be assigned. + * + * Some stdint-like integer types are defined. These are valid on all currently + * supported architectures, because signs are enforced, ints are assumed to be + * 32 bits, longs the size of a pointer and long long 64 bits. If more + * architectures have to be supported, this may need to be adapted. + * + * Some macro definitions like the O_* values passed to open(), and some + * structures like the sys_stat struct depend on the architecture. + * + * The definitions start with the architecture-specific parts, which are picked + * based on what the compiler knows about the target architecture, and are + * completed with the generic code. Since it is the compiler which sets the + * target architecture, cross-compiling normally works out of the box without + * having to specify anything. + * + * Finally some very common libc-level functions are provided. It is the case + * for a few functions usually found in string.h, ctype.h, or stdlib.h. Nothing + * is currently provided regarding stdio emulation. + * + * The macro NOLIBC is always defined, so that it is possible for a program to + * check this macro to know if it is being built against and decide to disable + * some features or simply not to include some standard libc files. + * + * Ideally this file should be split in multiple files for easier long term + * maintenance, but provided as a single file as it is now, it's quite + * convenient to use. Maybe some variations involving a set of includes at the + * top could work. + * + * A simple static executable may be built this way : + * $ gcc -fno-asynchronous-unwind-tables -fno-ident -s -Os -nostdlib \ + * -static -include nolibc.h -lgcc -o hello hello.c + * + * A very useful calling convention table may be found here : + * http://man7.org/linux/man-pages/man2/syscall.2.html + * + * This doc is quite convenient though not necessarily up to date : + * https://w3challs.com/syscalls/ + * + */ + +/* Some archs (at least aarch64) don't expose the regular syscalls anymore by * default, either because they have an "_at" replacement, or because there are * more modern alternatives. For now we'd rather still use them. */ @@ -19,18 +97,6 @@ #define NOLIBC -/* Build a static executable this way : - * $ gcc -fno-asynchronous-unwind-tables -fno-ident -s -Os -nostdlib \ - * -static -include nolibc.h -lgcc -o hello hello.c - * - * Useful calling convention table found here : - * http://man7.org/linux/man-pages/man2/syscall.2.html - * - * This doc is even better : - * https://w3challs.com/syscalls/ - */ - - /* this way it will be removed if unused */ static int errno; @@ -81,9 +147,9 @@ typedef signed long time_t; /* for poll() */ struct pollfd { - int fd; - short int events; - short int revents; + int fd; + short int events; + short int revents; }; /* for select() */ @@ -239,7 +305,7 @@ struct stat { "syscall\n" \ : "=a" (_ret) \ : "0"(_num) \ - : "rcx", "r8", "r9", "r10", "r11", "memory", "cc" \ + : "rcx", "r8", "r9", "r10", "r11", "memory", "cc" \ ); \ _ret; \ }) @@ -255,7 +321,7 @@ struct stat { : "=a" (_ret) \ : "r"(_arg1), \ "0"(_num) \ - : "rcx", "r8", "r9", "r10", "r11", "memory", "cc" \ + : "rcx", "r8", "r9", "r10", "r11", "memory", "cc" \ ); \ _ret; \ }) @@ -272,7 +338,7 @@ struct stat { : "=a" (_ret) \ : "r"(_arg1), "r"(_arg2), \ "0"(_num) \ - : "rcx", "r8", "r9", "r10", "r11", "memory", "cc" \ + : "rcx", "r8", "r9", "r10", "r11", "memory", "cc" \ ); \ _ret; \ }) @@ -290,7 +356,7 @@ struct stat { : "=a" (_ret) \ : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ "0"(_num) \ - : "rcx", "r8", "r9", "r10", "r11", "memory", "cc" \ + : "rcx", "r8", "r9", "r10", "r11", "memory", "cc" \ ); \ _ret; \ }) @@ -1006,7 +1072,7 @@ struct sys_stat_struct { : "=r"(_num), "=r"(_arg4) \ : "r"(_num) \ : "memory", "cc", "at", "v1", "hi", "lo", \ - \ + "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ ); \ _arg4 ? -_num : _num; \ }) @@ -1025,7 +1091,7 @@ struct sys_stat_struct { : "0"(_num), \ "r"(_arg1) \ : "memory", "cc", "at", "v1", "hi", "lo", \ - \ + "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ ); \ _arg4 ? -_num : _num; \ }) @@ -1045,7 +1111,7 @@ struct sys_stat_struct { : "0"(_num), \ "r"(_arg1), "r"(_arg2) \ : "memory", "cc", "at", "v1", "hi", "lo", \ - \ + "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ ); \ _arg4 ? -_num : _num; \ }) @@ -1066,7 +1132,7 @@ struct sys_stat_struct { : "0"(_num), \ "r"(_arg1), "r"(_arg2), "r"(_arg3) \ : "memory", "cc", "at", "v1", "hi", "lo", \ - \ + "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ ); \ _arg4 ? -_num : _num; \ }) @@ -1087,7 +1153,7 @@ struct sys_stat_struct { : "0"(_num), \ "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4) \ : "memory", "cc", "at", "v1", "hi", "lo", \ - \ + "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ ); \ _arg4 ? -_num : _num; \ }) @@ -1110,7 +1176,7 @@ struct sys_stat_struct { : "0"(_num), \ "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5) \ : "memory", "cc", "at", "v1", "hi", "lo", \ - \ + "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ ); \ _arg4 ? -_num : _num; \ }) diff --git a/tools/include/uapi/asm/bitsperlong.h b/tools/include/uapi/asm/bitsperlong.h index fd92ce8388fc..57aaeaf8e192 100644 --- a/tools/include/uapi/asm/bitsperlong.h +++ b/tools/include/uapi/asm/bitsperlong.h @@ -15,6 +15,8 @@ #include "../../arch/ia64/include/uapi/asm/bitsperlong.h" #elif defined(__riscv) #include "../../arch/riscv/include/uapi/asm/bitsperlong.h" +#elif defined(__alpha__) +#include "../../arch/alpha/include/uapi/asm/bitsperlong.h" #else #include <asm-generic/bitsperlong.h> #endif diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 91c43884f295..3c38ac9a92a7 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -14,6 +14,7 @@ /* Extended instruction set based on top of classic BPF */ /* instruction classes */ +#define BPF_JMP32 0x06 /* jmp mode in word width */ #define BPF_ALU64 0x07 /* alu mode in double word width */ /* ld/ldx fields */ @@ -266,6 +267,7 @@ enum bpf_attach_type { #define BPF_ANY 0 /* create new element or update existing */ #define BPF_NOEXIST 1 /* create new element if it didn't exist */ #define BPF_EXIST 2 /* update existing element */ +#define BPF_F_LOCK 4 /* spin_lock-ed map_lookup/map_update */ /* flags for BPF_MAP_CREATE command */ #define BPF_F_NO_PREALLOC (1U << 0) @@ -2014,6 +2016,19 @@ union bpf_attr { * Only works if *skb* contains an IPv6 packet. Insert a * Segment Routing Header (**struct ipv6_sr_hdr**) inside * the IPv6 header. + * **BPF_LWT_ENCAP_IP** + * IP encapsulation (GRE/GUE/IPIP/etc). The outer header + * must be IPv4 or IPv6, followed by zero or more + * additional headers, up to LWT_BPF_MAX_HEADROOM total + * bytes in all prepended headers. Please note that + * if skb_is_gso(skb) is true, no more than two headers + * can be prepended, and the inner header, if present, + * should be either GRE or UDP/GUE. + * + * BPF_LWT_ENCAP_SEG6*** types can be called by bpf programs of + * type BPF_PROG_TYPE_LWT_IN; BPF_LWT_ENCAP_IP type can be called + * by bpf programs of types BPF_PROG_TYPE_LWT_IN and + * BPF_PROG_TYPE_LWT_XMIT. * * A call to this helper is susceptible to change the underlaying * packet buffer. Therefore, at load time, all checks on pointers @@ -2327,6 +2342,30 @@ union bpf_attr { * "**y**". * Return * 0 + * + * struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk) + * Description + * This helper gets a **struct bpf_sock** pointer such + * that all the fields in bpf_sock can be accessed. + * Return + * A **struct bpf_sock** pointer on success, or NULL in + * case of failure. + * + * struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk) + * Description + * This helper gets a **struct bpf_tcp_sock** pointer from a + * **struct bpf_sock** pointer. + * + * Return + * A **struct bpf_tcp_sock** pointer on success, or NULL in + * case of failure. + * + * int bpf_skb_ecn_set_ce(struct sk_buf *skb) + * Description + * Sets ECN of IP header to ce (congestion encountered) if + * current value is ect (ECN capable). Works with IPv6 and IPv4. + * Return + * 1 if set, 0 if not set. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2421,7 +2460,12 @@ union bpf_attr { FN(map_peek_elem), \ FN(msg_push_data), \ FN(msg_pop_data), \ - FN(rc_pointer_rel), + FN(rc_pointer_rel), \ + FN(spin_lock), \ + FN(spin_unlock), \ + FN(sk_fullsock), \ + FN(tcp_sock), \ + FN(skb_ecn_set_ce), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2494,7 +2538,8 @@ enum bpf_hdr_start_off { /* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */ enum bpf_lwt_encap_mode { BPF_LWT_ENCAP_SEG6, - BPF_LWT_ENCAP_SEG6_INLINE + BPF_LWT_ENCAP_SEG6_INLINE, + BPF_LWT_ENCAP_IP, }; #define __bpf_md_ptr(type, name) \ @@ -2540,6 +2585,8 @@ struct __sk_buff { __bpf_md_ptr(struct bpf_flow_keys *, flow_keys); __u64 tstamp; __u32 wire_len; + __u32 gso_segs; + __bpf_md_ptr(struct bpf_sock *, sk); }; struct bpf_tunnel_key { @@ -2581,7 +2628,15 @@ enum bpf_ret_code { BPF_DROP = 2, /* 3-6 reserved */ BPF_REDIRECT = 7, - /* >127 are reserved for prog type specific return codes */ + /* >127 are reserved for prog type specific return codes. + * + * BPF_LWT_REROUTE: used by BPF_PROG_TYPE_LWT_IN and + * BPF_PROG_TYPE_LWT_XMIT to indicate that skb had been + * changed and should be routed based on its new L3 header. + * (This is an L3 redirect, as opposed to L2 redirect + * represented by BPF_REDIRECT above). + */ + BPF_LWT_REROUTE = 128, }; struct bpf_sock { @@ -2591,14 +2646,52 @@ struct bpf_sock { __u32 protocol; __u32 mark; __u32 priority; - __u32 src_ip4; /* Allows 1,2,4-byte read. - * Stored in network byte order. + /* IP address also allows 1 and 2 bytes access */ + __u32 src_ip4; + __u32 src_ip6[4]; + __u32 src_port; /* host byte order */ + __u32 dst_port; /* network byte order */ + __u32 dst_ip4; + __u32 dst_ip6[4]; + __u32 state; +}; + +struct bpf_tcp_sock { + __u32 snd_cwnd; /* Sending congestion window */ + __u32 srtt_us; /* smoothed round trip time << 3 in usecs */ + __u32 rtt_min; + __u32 snd_ssthresh; /* Slow start size threshold */ + __u32 rcv_nxt; /* What we want to receive next */ + __u32 snd_nxt; /* Next sequence we send */ + __u32 snd_una; /* First byte we want an ack for */ + __u32 mss_cache; /* Cached effective mss, not including SACKS */ + __u32 ecn_flags; /* ECN status bits. */ + __u32 rate_delivered; /* saved rate sample: packets delivered */ + __u32 rate_interval_us; /* saved rate sample: time elapsed */ + __u32 packets_out; /* Packets which are "in flight" */ + __u32 retrans_out; /* Retransmitted packets out */ + __u32 total_retrans; /* Total retransmits for entire connection */ + __u32 segs_in; /* RFC4898 tcpEStatsPerfSegsIn + * total number of segments in. */ - __u32 src_ip6[4]; /* Allows 1,2,4-byte read. - * Stored in network byte order. + __u32 data_segs_in; /* RFC4898 tcpEStatsPerfDataSegsIn + * total number of data segments in. */ - __u32 src_port; /* Allows 4-byte read. - * Stored in host byte order + __u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut + * The total number of segments sent. + */ + __u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut + * total number of data segments sent. + */ + __u32 lost_out; /* Lost packets */ + __u32 sacked_out; /* SACK'd packets */ + __u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived + * sum(delta(rcv_nxt)), or how many bytes + * were acked. + */ + __u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked + * sum(delta(snd_una)), or how many bytes + * were acked. */ }; @@ -2728,6 +2821,8 @@ struct bpf_prog_info { __u32 jited_line_info_rec_size; __u32 nr_prog_tags; __aligned_u64 prog_tags; + __u64 run_time_ns; + __u64 run_cnt; } __attribute__((aligned(8))); struct bpf_map_info { @@ -3054,4 +3149,7 @@ struct bpf_line_info { __u32 line_col; }; +struct bpf_spin_lock { + __u32 val; +}; #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/tools/include/uapi/linux/ethtool.h b/tools/include/uapi/linux/ethtool.h new file mode 100644 index 000000000000..c86c3e942df9 --- /dev/null +++ b/tools/include/uapi/linux/ethtool.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * ethtool.h: Defines for Linux ethtool. + * + * Copyright (C) 1998 David S. Miller (davem@redhat.com) + * Copyright 2001 Jeff Garzik <jgarzik@pobox.com> + * Portions Copyright 2001 Sun Microsystems (thockin@sun.com) + * Portions Copyright 2002 Intel (eli.kupermann@intel.com, + * christopher.leech@intel.com, + * scott.feldman@intel.com) + * Portions Copyright (C) Sun Microsystems 2008 + */ + +#ifndef _UAPI_LINUX_ETHTOOL_H +#define _UAPI_LINUX_ETHTOOL_H + +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/if_ether.h> + +#define ETHTOOL_GCHANNELS 0x0000003c /* Get no of channels */ + +/** + * struct ethtool_channels - configuring number of network channel + * @cmd: ETHTOOL_{G,S}CHANNELS + * @max_rx: Read only. Maximum number of receive channel the driver support. + * @max_tx: Read only. Maximum number of transmit channel the driver support. + * @max_other: Read only. Maximum number of other channel the driver support. + * @max_combined: Read only. Maximum number of combined channel the driver + * support. Set of queues RX, TX or other. + * @rx_count: Valid values are in the range 1 to the max_rx. + * @tx_count: Valid values are in the range 1 to the max_tx. + * @other_count: Valid values are in the range 1 to the max_other. + * @combined_count: Valid values are in the range 1 to the max_combined. + * + * This can be used to configure RX, TX and other channels. + */ + +struct ethtool_channels { + __u32 cmd; + __u32 max_rx; + __u32 max_tx; + __u32 max_other; + __u32 max_combined; + __u32 rx_count; + __u32 tx_count; + __u32 other_count; + __u32 combined_count; +}; + +#endif /* _UAPI_LINUX_ETHTOOL_H */ diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index d6533828123a..5b225ff63b48 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -925,6 +925,7 @@ enum { enum { LINK_XSTATS_TYPE_UNSPEC, LINK_XSTATS_TYPE_BRIDGE, + LINK_XSTATS_TYPE_BOND, __LINK_XSTATS_TYPE_MAX }; #define LINK_XSTATS_TYPE_MAX (__LINK_XSTATS_TYPE_MAX - 1) diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h new file mode 100644 index 000000000000..caed8b1614ff --- /dev/null +++ b/tools/include/uapi/linux/if_xdp.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * if_xdp: XDP socket user-space interface + * Copyright(c) 2018 Intel Corporation. + * + * Author(s): Björn Töpel <bjorn.topel@intel.com> + * Magnus Karlsson <magnus.karlsson@intel.com> + */ + +#ifndef _LINUX_IF_XDP_H +#define _LINUX_IF_XDP_H + +#include <linux/types.h> + +/* Options for the sxdp_flags field */ +#define XDP_SHARED_UMEM (1 << 0) +#define XDP_COPY (1 << 1) /* Force copy-mode */ +#define XDP_ZEROCOPY (1 << 2) /* Force zero-copy mode */ + +struct sockaddr_xdp { + __u16 sxdp_family; + __u16 sxdp_flags; + __u32 sxdp_ifindex; + __u32 sxdp_queue_id; + __u32 sxdp_shared_umem_fd; +}; + +struct xdp_ring_offset { + __u64 producer; + __u64 consumer; + __u64 desc; +}; + +struct xdp_mmap_offsets { + struct xdp_ring_offset rx; + struct xdp_ring_offset tx; + struct xdp_ring_offset fr; /* Fill */ + struct xdp_ring_offset cr; /* Completion */ +}; + +/* XDP socket options */ +#define XDP_MMAP_OFFSETS 1 +#define XDP_RX_RING 2 +#define XDP_TX_RING 3 +#define XDP_UMEM_REG 4 +#define XDP_UMEM_FILL_RING 5 +#define XDP_UMEM_COMPLETION_RING 6 +#define XDP_STATISTICS 7 + +struct xdp_umem_reg { + __u64 addr; /* Start of packet data area */ + __u64 len; /* Length of packet data area */ + __u32 chunk_size; + __u32 headroom; +}; + +struct xdp_statistics { + __u64 rx_dropped; /* Dropped for reasons other than invalid desc */ + __u64 rx_invalid_descs; /* Dropped due to invalid descriptor */ + __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */ +}; + +/* Pgoff for mmaping the rings */ +#define XDP_PGOFF_RX_RING 0 +#define XDP_PGOFF_TX_RING 0x80000000 +#define XDP_UMEM_PGOFF_FILL_RING 0x100000000ULL +#define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000ULL + +/* Rx/Tx descriptor */ +struct xdp_desc { + __u64 addr; + __u32 len; + __u32 options; +}; + +/* UMEM descriptor is __u64 */ + +#endif /* _LINUX_IF_XDP_H */ diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 9de8780ac8d9..7198ddd0c6b1 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -372,7 +372,9 @@ struct perf_event_attr { context_switch : 1, /* context switch data */ write_backward : 1, /* Write ring buffer from end to beginning */ namespaces : 1, /* include namespaces data */ - __reserved_1 : 35; + ksymbol : 1, /* include ksymbol events */ + bpf_event : 1, /* include bpf events */ + __reserved_1 : 33; union { __u32 wakeup_events; /* wakeup every n events */ @@ -445,8 +447,6 @@ struct perf_event_query_bpf { __u32 ids[0]; }; -#define perf_flags(attr) (*(&(attr)->read_format + 1)) - /* * Ioctls that can be done on a perf event fd: */ @@ -965,9 +965,58 @@ enum perf_event_type { */ PERF_RECORD_NAMESPACES = 16, + /* + * Record ksymbol register/unregister events: + * + * struct { + * struct perf_event_header header; + * u64 addr; + * u32 len; + * u16 ksym_type; + * u16 flags; + * char name[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_KSYMBOL = 17, + + /* + * Record bpf events: + * enum perf_bpf_event_type { + * PERF_BPF_EVENT_UNKNOWN = 0, + * PERF_BPF_EVENT_PROG_LOAD = 1, + * PERF_BPF_EVENT_PROG_UNLOAD = 2, + * }; + * + * struct { + * struct perf_event_header header; + * u16 type; + * u16 flags; + * u32 id; + * u8 tag[BPF_TAG_SIZE]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_BPF_EVENT = 18, + PERF_RECORD_MAX, /* non-ABI */ }; +enum perf_record_ksymbol_type { + PERF_RECORD_KSYMBOL_TYPE_UNKNOWN = 0, + PERF_RECORD_KSYMBOL_TYPE_BPF = 1, + PERF_RECORD_KSYMBOL_TYPE_MAX /* non-ABI */ +}; + +#define PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER (1 << 0) + +enum perf_bpf_event_type { + PERF_BPF_EVENT_UNKNOWN = 0, + PERF_BPF_EVENT_PROG_LOAD = 1, + PERF_BPF_EVENT_PROG_UNLOAD = 2, + PERF_BPF_EVENT_MAX, /* non-ABI */ +}; + #define PERF_MAX_STACK_DEPTH 127 #define PERF_MAX_CONTEXTS_PER_STACK 8 diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h index b4875a93363a..094bb03b9cc2 100644 --- a/tools/include/uapi/linux/prctl.h +++ b/tools/include/uapi/linux/prctl.h @@ -219,6 +219,7 @@ struct prctl_mm_map { # define PR_SPEC_ENABLE (1UL << 1) # define PR_SPEC_DISABLE (1UL << 2) # define PR_SPEC_FORCE_DISABLE (1UL << 3) +# define PR_SPEC_DISABLE_NOEXEC (1UL << 4) /* Reset arm64 pointer authentication keys */ #define PR_PAC_RESET_KEYS 54 diff --git a/tools/include/uapi/linux/tc_act/tc_bpf.h b/tools/include/uapi/linux/tc_act/tc_bpf.h index 6e89a5df49a4..653c4f94f76e 100644 --- a/tools/include/uapi/linux/tc_act/tc_bpf.h +++ b/tools/include/uapi/linux/tc_act/tc_bpf.h @@ -13,8 +13,6 @@ #include <linux/pkt_cls.h> -#define TCA_ACT_BPF 13 - struct tc_act_bpf { tc_gen; }; diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build index 197b40f5b5c6..ee9d5362f35b 100644 --- a/tools/lib/bpf/Build +++ b/tools/lib/bpf/Build @@ -1 +1 @@ -libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o netlink.o bpf_prog_linfo.o +libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o netlink.o bpf_prog_linfo.o libbpf_probes.o xsk.o diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 34d9c3619c96..a05c43468bd0 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -14,21 +14,6 @@ srctree := $(patsubst %/,%,$(dir $(srctree))) #$(info Determined 'srctree' to be $(srctree)) endif -# Makefiles suck: This macro sets a default value of $(2) for the -# variable named by $(1), unless the variable has been set by -# environment or command line. This is necessary for CC and AR -# because make sets default values, so the simpler ?= approach -# won't work as expected. -define allow-override - $(if $(or $(findstring environment,$(origin $(1))),\ - $(findstring command line,$(origin $(1)))),,\ - $(eval $(1) = $(2))) -endef - -# Allow setting CC and AR, or setting CROSS_COMPILE as a prefix. -$(call allow-override,CC,$(CROSS_COMPILE)gcc) -$(call allow-override,AR,$(CROSS_COMPILE)ar) - INSTALL = install # Use DESTDIR for installing into a different root directory. @@ -54,7 +39,7 @@ man_dir_SQ = '$(subst ','\'',$(man_dir))' export man_dir man_dir_SQ INSTALL export DESTDIR DESTDIR_SQ -include ../../scripts/Makefile.include +include $(srctree)/tools/scripts/Makefile.include # copy a bit from Linux kbuild @@ -147,9 +132,9 @@ BPF_IN := $(OUTPUT)libbpf-in.o LIB_FILE := $(addprefix $(OUTPUT),$(LIB_FILE)) VERSION_SCRIPT := libbpf.map -GLOBAL_SYM_COUNT = $(shell readelf -s $(BPF_IN) | \ +GLOBAL_SYM_COUNT = $(shell readelf -s --wide $(BPF_IN) | \ awk '/GLOBAL/ && /DEFAULT/ && !/UND/ {s++} END{print s}') -VERSIONED_SYM_COUNT = $(shell readelf -s $(OUTPUT)libbpf.so | \ +VERSIONED_SYM_COUNT = $(shell readelf -s --wide $(OUTPUT)libbpf.so | \ grep -Eo '[^ ]+@LIBBPF_' | cut -d@ -f1 | sort -u | wc -l) CMD_TARGETS = $(LIB_FILE) @@ -179,6 +164,9 @@ $(BPF_IN): force elfdep bpfdep @(test -f ../../include/uapi/linux/if_link.h -a -f ../../../include/uapi/linux/if_link.h && ( \ (diff -B ../../include/uapi/linux/if_link.h ../../../include/uapi/linux/if_link.h >/dev/null) || \ echo "Warning: Kernel ABI header at 'tools/include/uapi/linux/if_link.h' differs from latest version at 'include/uapi/linux/if_link.h'" >&2 )) || true + @(test -f ../../include/uapi/linux/if_xdp.h -a -f ../../../include/uapi/linux/if_xdp.h && ( \ + (diff -B ../../include/uapi/linux/if_xdp.h ../../../include/uapi/linux/if_xdp.h >/dev/null) || \ + echo "Warning: Kernel ABI header at 'tools/include/uapi/linux/if_xdp.h' differs from latest version at 'include/uapi/linux/if_xdp.h'" >&2 )) || true $(Q)$(MAKE) $(build)=libbpf $(OUTPUT)libbpf.so: $(BPF_IN) @@ -189,7 +177,7 @@ $(OUTPUT)libbpf.a: $(BPF_IN) $(QUIET_LINK)$(RM) $@; $(AR) rcs $@ $^ $(OUTPUT)test_libbpf: test_libbpf.cpp $(OUTPUT)libbpf.a - $(QUIET_LINK)$(CXX) $^ -lelf -o $@ + $(QUIET_LINK)$(CXX) $(INCLUDES) $^ -lelf -o $@ check: check_abi diff --git a/tools/lib/bpf/README.rst b/tools/lib/bpf/README.rst index 607aae40f4ed..5788479384ca 100644 --- a/tools/lib/bpf/README.rst +++ b/tools/lib/bpf/README.rst @@ -9,7 +9,7 @@ described here. It's recommended to follow these conventions whenever a new function or type is added to keep libbpf API clean and consistent. All types and functions provided by libbpf API should have one of the -following prefixes: ``bpf_``, ``btf_``, ``libbpf_``. +following prefixes: ``bpf_``, ``btf_``, ``libbpf_``, ``xsk_``. System call wrappers -------------------- @@ -62,6 +62,19 @@ Auxiliary functions and types that don't fit well in any of categories described above should have ``libbpf_`` prefix, e.g. ``libbpf_get_error`` or ``libbpf_prog_type_by_name``. +AF_XDP functions +------------------- + +AF_XDP functions should have an ``xsk_`` prefix, e.g. +``xsk_umem__get_data`` or ``xsk_umem__create``. The interface consists +of both low-level ring access functions and high-level configuration +functions. These can be mixed and matched. Note that these functions +are not reentrant for performance reasons. + +Please take a look at Documentation/networking/af_xdp.rst in the Linux +kernel source tree on how to use XDP sockets and for some common +mistakes in case you do not get any traffic up to user space. + libbpf ABI ========== diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 88cbd110ae58..9cd015574e83 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -22,6 +22,7 @@ */ #include <stdlib.h> +#include <string.h> #include <memory.h> #include <unistd.h> #include <asm/unistd.h> @@ -214,23 +215,35 @@ int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, { void *finfo = NULL, *linfo = NULL; union bpf_attr attr; + __u32 log_level; __u32 name_len; int fd; - if (!load_attr) + if (!load_attr || !log_buf != !log_buf_sz) + return -EINVAL; + + log_level = load_attr->log_level; + if (log_level > 2 || (log_level && !log_buf)) return -EINVAL; name_len = load_attr->name ? strlen(load_attr->name) : 0; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.prog_type = load_attr->prog_type; attr.expected_attach_type = load_attr->expected_attach_type; attr.insn_cnt = (__u32)load_attr->insns_cnt; attr.insns = ptr_to_u64(load_attr->insns); attr.license = ptr_to_u64(load_attr->license); - attr.log_buf = ptr_to_u64(NULL); - attr.log_size = 0; - attr.log_level = 0; + + attr.log_level = log_level; + if (log_level) { + attr.log_buf = ptr_to_u64(log_buf); + attr.log_size = log_buf_sz; + } else { + attr.log_buf = ptr_to_u64(NULL); + attr.log_size = 0; + } + attr.kern_version = load_attr->kern_version; attr.prog_ifindex = load_attr->prog_ifindex; attr.prog_btf_fd = load_attr->prog_btf_fd; @@ -286,7 +299,7 @@ int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, goto done; } - if (!log_buf || !log_buf_sz) + if (log_level || !log_buf) goto done; /* Try again with log */ @@ -327,7 +340,7 @@ int bpf_verify_program(enum bpf_prog_type type, const struct bpf_insn *insns, { union bpf_attr attr; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.prog_type = type; attr.insn_cnt = (__u32)insns_cnt; attr.insns = ptr_to_u64(insns); @@ -347,7 +360,7 @@ int bpf_map_update_elem(int fd, const void *key, const void *value, { union bpf_attr attr; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.map_fd = fd; attr.key = ptr_to_u64(key); attr.value = ptr_to_u64(value); @@ -360,7 +373,7 @@ int bpf_map_lookup_elem(int fd, const void *key, void *value) { union bpf_attr attr; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.map_fd = fd; attr.key = ptr_to_u64(key); attr.value = ptr_to_u64(value); @@ -368,11 +381,24 @@ int bpf_map_lookup_elem(int fd, const void *key, void *value) return sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)); } +int bpf_map_lookup_elem_flags(int fd, const void *key, void *value, __u64 flags) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + attr.map_fd = fd; + attr.key = ptr_to_u64(key); + attr.value = ptr_to_u64(value); + attr.flags = flags; + + return sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)); +} + int bpf_map_lookup_and_delete_elem(int fd, const void *key, void *value) { union bpf_attr attr; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.map_fd = fd; attr.key = ptr_to_u64(key); attr.value = ptr_to_u64(value); @@ -384,7 +410,7 @@ int bpf_map_delete_elem(int fd, const void *key) { union bpf_attr attr; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.map_fd = fd; attr.key = ptr_to_u64(key); @@ -395,7 +421,7 @@ int bpf_map_get_next_key(int fd, const void *key, void *next_key) { union bpf_attr attr; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.map_fd = fd; attr.key = ptr_to_u64(key); attr.next_key = ptr_to_u64(next_key); @@ -407,7 +433,7 @@ int bpf_obj_pin(int fd, const char *pathname) { union bpf_attr attr; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.pathname = ptr_to_u64((void *)pathname); attr.bpf_fd = fd; @@ -418,7 +444,7 @@ int bpf_obj_get(const char *pathname) { union bpf_attr attr; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.pathname = ptr_to_u64((void *)pathname); return sys_bpf(BPF_OBJ_GET, &attr, sizeof(attr)); @@ -429,7 +455,7 @@ int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type, { union bpf_attr attr; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.target_fd = target_fd; attr.attach_bpf_fd = prog_fd; attr.attach_type = type; @@ -442,7 +468,7 @@ int bpf_prog_detach(int target_fd, enum bpf_attach_type type) { union bpf_attr attr; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.target_fd = target_fd; attr.attach_type = type; @@ -453,7 +479,7 @@ int bpf_prog_detach2(int prog_fd, int target_fd, enum bpf_attach_type type) { union bpf_attr attr; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.target_fd = target_fd; attr.attach_bpf_fd = prog_fd; attr.attach_type = type; @@ -467,7 +493,7 @@ int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, union bpf_attr attr; int ret; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.query.target_fd = target_fd; attr.query.attach_type = type; attr.query.query_flags = query_flags; @@ -488,7 +514,7 @@ int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size, union bpf_attr attr; int ret; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.test.prog_fd = prog_fd; attr.test.data_in = ptr_to_u64(data); attr.test.data_out = ptr_to_u64(data_out); @@ -513,7 +539,7 @@ int bpf_prog_test_run_xattr(struct bpf_prog_test_run_attr *test_attr) if (!test_attr->data_out && test_attr->data_size_out > 0) return -EINVAL; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.test.prog_fd = test_attr->prog_fd; attr.test.data_in = ptr_to_u64(test_attr->data_in); attr.test.data_out = ptr_to_u64(test_attr->data_out); @@ -533,7 +559,7 @@ int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id) union bpf_attr attr; int err; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.start_id = start_id; err = sys_bpf(BPF_PROG_GET_NEXT_ID, &attr, sizeof(attr)); @@ -548,7 +574,7 @@ int bpf_map_get_next_id(__u32 start_id, __u32 *next_id) union bpf_attr attr; int err; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.start_id = start_id; err = sys_bpf(BPF_MAP_GET_NEXT_ID, &attr, sizeof(attr)); @@ -562,7 +588,7 @@ int bpf_prog_get_fd_by_id(__u32 id) { union bpf_attr attr; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.prog_id = id; return sys_bpf(BPF_PROG_GET_FD_BY_ID, &attr, sizeof(attr)); @@ -572,7 +598,7 @@ int bpf_map_get_fd_by_id(__u32 id) { union bpf_attr attr; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.map_id = id; return sys_bpf(BPF_MAP_GET_FD_BY_ID, &attr, sizeof(attr)); @@ -582,7 +608,7 @@ int bpf_btf_get_fd_by_id(__u32 id) { union bpf_attr attr; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.btf_id = id; return sys_bpf(BPF_BTF_GET_FD_BY_ID, &attr, sizeof(attr)); @@ -593,7 +619,7 @@ int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 *info_len) union bpf_attr attr; int err; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.info.bpf_fd = prog_fd; attr.info.info_len = *info_len; attr.info.info = ptr_to_u64(info); @@ -609,7 +635,7 @@ int bpf_raw_tracepoint_open(const char *name, int prog_fd) { union bpf_attr attr; - bzero(&attr, sizeof(attr)); + memset(&attr, 0, sizeof(attr)); attr.raw_tracepoint.name = ptr_to_u64(name); attr.raw_tracepoint.prog_fd = prog_fd; diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 8f09de482839..6ffdd79bea89 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -85,6 +85,7 @@ struct bpf_load_program_attr { __u32 line_info_rec_size; const void *line_info; __u32 line_info_cnt; + __u32 log_level; }; /* Flags to direct loading requirements */ @@ -110,6 +111,8 @@ LIBBPF_API int bpf_map_update_elem(int fd, const void *key, const void *value, __u64 flags); LIBBPF_API int bpf_map_lookup_elem(int fd, const void *key, void *value); +LIBBPF_API int bpf_map_lookup_elem_flags(int fd, const void *key, void *value, + __u64 flags); LIBBPF_API int bpf_map_lookup_and_delete_elem(int fd, const void *key, void *value); LIBBPF_API int bpf_map_delete_elem(int fd, const void *key); diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index d682d3b8f7b9..1b8d8cdd3575 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) /* Copyright (c) 2018 Facebook */ +#include <stdio.h> #include <stdlib.h> #include <string.h> #include <unistd.h> @@ -9,12 +10,14 @@ #include <linux/btf.h> #include "btf.h" #include "bpf.h" +#include "libbpf.h" +#include "libbpf_util.h" -#define elog(fmt, ...) { if (err_log) err_log(fmt, ##__VA_ARGS__); } #define max(a, b) ((a) > (b) ? (a) : (b)) #define min(a, b) ((a) < (b) ? (a) : (b)) -#define BTF_MAX_NR_TYPES 65535 +#define BTF_MAX_NR_TYPES 0x7fffffff +#define BTF_MAX_STR_OFFSET 0x7fffffff #define IS_MODIFIER(k) (((k) == BTF_KIND_TYPEDEF) || \ ((k) == BTF_KIND_VOLATILE) || \ @@ -39,9 +42,8 @@ struct btf { struct btf_ext_info { /* - * info points to a deep copy of the individual info section - * (e.g. func_info and line_info) from the .BTF.ext. - * It does not include the __u32 rec_size. + * info points to the individual info section (e.g. func_info and + * line_info) from the .BTF.ext. It does not include the __u32 rec_size. */ void *info; __u32 rec_size; @@ -49,8 +51,13 @@ struct btf_ext_info { }; struct btf_ext { + union { + struct btf_ext_header *hdr; + void *data; + }; struct btf_ext_info func_info; struct btf_ext_info line_info; + __u32 data_size; }; struct btf_ext_info_sec { @@ -107,54 +114,54 @@ static int btf_add_type(struct btf *btf, struct btf_type *t) return 0; } -static int btf_parse_hdr(struct btf *btf, btf_print_fn_t err_log) +static int btf_parse_hdr(struct btf *btf) { const struct btf_header *hdr = btf->hdr; __u32 meta_left; if (btf->data_size < sizeof(struct btf_header)) { - elog("BTF header not found\n"); + pr_debug("BTF header not found\n"); return -EINVAL; } if (hdr->magic != BTF_MAGIC) { - elog("Invalid BTF magic:%x\n", hdr->magic); + pr_debug("Invalid BTF magic:%x\n", hdr->magic); return -EINVAL; } if (hdr->version != BTF_VERSION) { - elog("Unsupported BTF version:%u\n", hdr->version); + pr_debug("Unsupported BTF version:%u\n", hdr->version); return -ENOTSUP; } if (hdr->flags) { - elog("Unsupported BTF flags:%x\n", hdr->flags); + pr_debug("Unsupported BTF flags:%x\n", hdr->flags); return -ENOTSUP; } meta_left = btf->data_size - sizeof(*hdr); if (!meta_left) { - elog("BTF has no data\n"); + pr_debug("BTF has no data\n"); return -EINVAL; } if (meta_left < hdr->type_off) { - elog("Invalid BTF type section offset:%u\n", hdr->type_off); + pr_debug("Invalid BTF type section offset:%u\n", hdr->type_off); return -EINVAL; } if (meta_left < hdr->str_off) { - elog("Invalid BTF string section offset:%u\n", hdr->str_off); + pr_debug("Invalid BTF string section offset:%u\n", hdr->str_off); return -EINVAL; } if (hdr->type_off >= hdr->str_off) { - elog("BTF type section offset >= string section offset. No type?\n"); + pr_debug("BTF type section offset >= string section offset. No type?\n"); return -EINVAL; } if (hdr->type_off & 0x02) { - elog("BTF type section is not aligned to 4 bytes\n"); + pr_debug("BTF type section is not aligned to 4 bytes\n"); return -EINVAL; } @@ -163,15 +170,15 @@ static int btf_parse_hdr(struct btf *btf, btf_print_fn_t err_log) return 0; } -static int btf_parse_str_sec(struct btf *btf, btf_print_fn_t err_log) +static int btf_parse_str_sec(struct btf *btf) { const struct btf_header *hdr = btf->hdr; const char *start = btf->nohdr_data + hdr->str_off; const char *end = start + btf->hdr->str_len; - if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET || + if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_STR_OFFSET || start[0] || end[-1]) { - elog("Invalid BTF string section\n"); + pr_debug("Invalid BTF string section\n"); return -EINVAL; } @@ -180,7 +187,38 @@ static int btf_parse_str_sec(struct btf *btf, btf_print_fn_t err_log) return 0; } -static int btf_parse_type_sec(struct btf *btf, btf_print_fn_t err_log) +static int btf_type_size(struct btf_type *t) +{ + int base_size = sizeof(struct btf_type); + __u16 vlen = BTF_INFO_VLEN(t->info); + + switch (BTF_INFO_KIND(t->info)) { + case BTF_KIND_FWD: + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FUNC: + return base_size; + case BTF_KIND_INT: + return base_size + sizeof(__u32); + case BTF_KIND_ENUM: + return base_size + vlen * sizeof(struct btf_enum); + case BTF_KIND_ARRAY: + return base_size + sizeof(struct btf_array); + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + return base_size + vlen * sizeof(struct btf_member); + case BTF_KIND_FUNC_PROTO: + return base_size + vlen * sizeof(struct btf_param); + default: + pr_debug("Unsupported BTF_KIND:%u\n", BTF_INFO_KIND(t->info)); + return -EINVAL; + } +} + +static int btf_parse_type_sec(struct btf *btf) { struct btf_header *hdr = btf->hdr; void *nohdr_data = btf->nohdr_data; @@ -189,41 +227,13 @@ static int btf_parse_type_sec(struct btf *btf, btf_print_fn_t err_log) while (next_type < end_type) { struct btf_type *t = next_type; - __u16 vlen = BTF_INFO_VLEN(t->info); + int type_size; int err; - next_type += sizeof(*t); - switch (BTF_INFO_KIND(t->info)) { - case BTF_KIND_INT: - next_type += sizeof(int); - break; - case BTF_KIND_ARRAY: - next_type += sizeof(struct btf_array); - break; - case BTF_KIND_STRUCT: - case BTF_KIND_UNION: - next_type += vlen * sizeof(struct btf_member); - break; - case BTF_KIND_ENUM: - next_type += vlen * sizeof(struct btf_enum); - break; - case BTF_KIND_FUNC_PROTO: - next_type += vlen * sizeof(struct btf_param); - break; - case BTF_KIND_FUNC: - case BTF_KIND_TYPEDEF: - case BTF_KIND_PTR: - case BTF_KIND_FWD: - case BTF_KIND_VOLATILE: - case BTF_KIND_CONST: - case BTF_KIND_RESTRICT: - break; - default: - elog("Unsupported BTF_KIND:%u\n", - BTF_INFO_KIND(t->info)); - return -EINVAL; - } - + type_size = btf_type_size(t); + if (type_size < 0) + return type_size; + next_type += type_size; err = btf_add_type(btf, t); if (err) return err; @@ -232,6 +242,11 @@ static int btf_parse_type_sec(struct btf *btf, btf_print_fn_t err_log) return 0; } +__u32 btf__get_nr_types(const struct btf *btf) +{ + return btf->nr_types; +} + const struct btf_type *btf__type_by_id(const struct btf *btf, __u32 type_id) { if (type_id > btf->nr_types) @@ -250,21 +265,6 @@ static bool btf_type_is_void_or_null(const struct btf_type *t) return !t || btf_type_is_void(t); } -static __s64 btf_type_size(const struct btf_type *t) -{ - switch (BTF_INFO_KIND(t->info)) { - case BTF_KIND_INT: - case BTF_KIND_STRUCT: - case BTF_KIND_UNION: - case BTF_KIND_ENUM: - return t->size; - case BTF_KIND_PTR: - return sizeof(void *); - default: - return -EINVAL; - } -} - #define MAX_RESOLVE_DEPTH 32 __s64 btf__resolve_size(const struct btf *btf, __u32 type_id) @@ -278,11 +278,16 @@ __s64 btf__resolve_size(const struct btf *btf, __u32 type_id) t = btf__type_by_id(btf, type_id); for (i = 0; i < MAX_RESOLVE_DEPTH && !btf_type_is_void_or_null(t); i++) { - size = btf_type_size(t); - if (size >= 0) - break; - switch (BTF_INFO_KIND(t->info)) { + case BTF_KIND_INT: + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_ENUM: + size = t->size; + goto done; + case BTF_KIND_PTR: + size = sizeof(void *); + goto done; case BTF_KIND_TYPEDEF: case BTF_KIND_VOLATILE: case BTF_KIND_CONST: @@ -306,6 +311,7 @@ __s64 btf__resolve_size(const struct btf *btf, __u32 type_id) if (size < 0) return -EINVAL; +done: if (nelems && size > UINT32_MAX / nelems) return -E2BIG; @@ -363,10 +369,8 @@ void btf__free(struct btf *btf) free(btf); } -struct btf *btf__new(__u8 *data, __u32 size, btf_print_fn_t err_log) +struct btf *btf__new(__u8 *data, __u32 size) { - __u32 log_buf_size = 0; - char *log_buf = NULL; struct btf *btf; int err; @@ -376,16 +380,6 @@ struct btf *btf__new(__u8 *data, __u32 size, btf_print_fn_t err_log) btf->fd = -1; - if (err_log) { - log_buf = malloc(BPF_LOG_BUF_SIZE); - if (!log_buf) { - err = -ENOMEM; - goto done; - } - *log_buf = 0; - log_buf_size = BPF_LOG_BUF_SIZE; - } - btf->data = malloc(size); if (!btf->data) { err = -ENOMEM; @@ -395,30 +389,17 @@ struct btf *btf__new(__u8 *data, __u32 size, btf_print_fn_t err_log) memcpy(btf->data, data, size); btf->data_size = size; - btf->fd = bpf_load_btf(btf->data, btf->data_size, - log_buf, log_buf_size, false); - - if (btf->fd == -1) { - err = -errno; - elog("Error loading BTF: %s(%d)\n", strerror(errno), errno); - if (log_buf && *log_buf) - elog("%s\n", log_buf); - goto done; - } - - err = btf_parse_hdr(btf, err_log); + err = btf_parse_hdr(btf); if (err) goto done; - err = btf_parse_str_sec(btf, err_log); + err = btf_parse_str_sec(btf); if (err) goto done; - err = btf_parse_type_sec(btf, err_log); + err = btf_parse_type_sec(btf); done: - free(log_buf); - if (err) { btf__free(btf); return ERR_PTR(err); @@ -427,11 +408,47 @@ done: return btf; } +int btf__load(struct btf *btf) +{ + __u32 log_buf_size = BPF_LOG_BUF_SIZE; + char *log_buf = NULL; + int err = 0; + + if (btf->fd >= 0) + return -EEXIST; + + log_buf = malloc(log_buf_size); + if (!log_buf) + return -ENOMEM; + + *log_buf = 0; + + btf->fd = bpf_load_btf(btf->data, btf->data_size, + log_buf, log_buf_size, false); + if (btf->fd < 0) { + err = -errno; + pr_warning("Error loading BTF: %s(%d)\n", strerror(errno), errno); + if (*log_buf) + pr_warning("%s\n", log_buf); + goto done; + } + +done: + free(log_buf); + return err; +} + int btf__fd(const struct btf *btf) { return btf->fd; } +const void *btf__get_raw_data(const struct btf *btf, __u32 *size) +{ + *size = btf->data_size; + return btf->data; +} + const char *btf__name_by_offset(const struct btf *btf, __u32 offset) { if (offset < btf->hdr->str_len) @@ -467,7 +484,7 @@ int btf__get_from_id(__u32 id, struct btf **btf) goto exit_free; } - bzero(ptr, last_size); + memset(ptr, 0, last_size); btf_info.btf = ptr_to_u64(ptr); err = bpf_obj_get_info_by_fd(btf_fd, &btf_info, &len); @@ -481,7 +498,7 @@ int btf__get_from_id(__u32 id, struct btf **btf) goto exit_free; } ptr = temp_ptr; - bzero(ptr, last_size); + memset(ptr, 0, last_size); btf_info.btf = ptr_to_u64(ptr); err = bpf_obj_get_info_by_fd(btf_fd, &btf_info, &len); } @@ -491,7 +508,7 @@ int btf__get_from_id(__u32 id, struct btf **btf) goto exit_free; } - *btf = btf__new((__u8 *)(long)btf_info.btf, btf_info.btf_size, NULL); + *btf = btf__new((__u8 *)(long)btf_info.btf, btf_info.btf_size); if (IS_ERR(*btf)) { err = PTR_ERR(*btf); *btf = NULL; @@ -504,7 +521,79 @@ exit_free: return err; } -struct btf_ext_sec_copy_param { +int btf__get_map_kv_tids(const struct btf *btf, const char *map_name, + __u32 expected_key_size, __u32 expected_value_size, + __u32 *key_type_id, __u32 *value_type_id) +{ + const struct btf_type *container_type; + const struct btf_member *key, *value; + const size_t max_name = 256; + char container_name[max_name]; + __s64 key_size, value_size; + __s32 container_id; + + if (snprintf(container_name, max_name, "____btf_map_%s", map_name) == + max_name) { + pr_warning("map:%s length of '____btf_map_%s' is too long\n", + map_name, map_name); + return -EINVAL; + } + + container_id = btf__find_by_name(btf, container_name); + if (container_id < 0) { + pr_debug("map:%s container_name:%s cannot be found in BTF. Missing BPF_ANNOTATE_KV_PAIR?\n", + map_name, container_name); + return container_id; + } + + container_type = btf__type_by_id(btf, container_id); + if (!container_type) { + pr_warning("map:%s cannot find BTF type for container_id:%u\n", + map_name, container_id); + return -EINVAL; + } + + if (BTF_INFO_KIND(container_type->info) != BTF_KIND_STRUCT || + BTF_INFO_VLEN(container_type->info) < 2) { + pr_warning("map:%s container_name:%s is an invalid container struct\n", + map_name, container_name); + return -EINVAL; + } + + key = (struct btf_member *)(container_type + 1); + value = key + 1; + + key_size = btf__resolve_size(btf, key->type); + if (key_size < 0) { + pr_warning("map:%s invalid BTF key_type_size\n", map_name); + return key_size; + } + + if (expected_key_size != key_size) { + pr_warning("map:%s btf_key_type_size:%u != map_def_key_size:%u\n", + map_name, (__u32)key_size, expected_key_size); + return -EINVAL; + } + + value_size = btf__resolve_size(btf, value->type); + if (value_size < 0) { + pr_warning("map:%s invalid BTF value_type_size\n", map_name); + return value_size; + } + + if (expected_value_size != value_size) { + pr_warning("map:%s btf_value_type_size:%u != map_def_value_size:%u\n", + map_name, (__u32)value_size, expected_value_size); + return -EINVAL; + } + + *key_type_id = key->type; + *value_type_id = value->type; + + return 0; +} + +struct btf_ext_sec_setup_param { __u32 off; __u32 len; __u32 min_rec_size; @@ -512,41 +601,33 @@ struct btf_ext_sec_copy_param { const char *desc; }; -static int btf_ext_copy_info(struct btf_ext *btf_ext, - __u8 *data, __u32 data_size, - struct btf_ext_sec_copy_param *ext_sec, - btf_print_fn_t err_log) +static int btf_ext_setup_info(struct btf_ext *btf_ext, + struct btf_ext_sec_setup_param *ext_sec) { - const struct btf_ext_header *hdr = (struct btf_ext_header *)data; const struct btf_ext_info_sec *sinfo; struct btf_ext_info *ext_info; __u32 info_left, record_size; /* The start of the info sec (including the __u32 record_size). */ - const void *info; - - /* data and data_size do not include btf_ext_header from now on */ - data = data + hdr->hdr_len; - data_size -= hdr->hdr_len; + void *info; if (ext_sec->off & 0x03) { - elog(".BTF.ext %s section is not aligned to 4 bytes\n", + pr_debug(".BTF.ext %s section is not aligned to 4 bytes\n", ext_sec->desc); return -EINVAL; } - if (data_size < ext_sec->off || - ext_sec->len > data_size - ext_sec->off) { - elog("%s section (off:%u len:%u) is beyond the end of the ELF section .BTF.ext\n", - ext_sec->desc, ext_sec->off, ext_sec->len); + info = btf_ext->data + btf_ext->hdr->hdr_len + ext_sec->off; + info_left = ext_sec->len; + + if (btf_ext->data + btf_ext->data_size < info + ext_sec->len) { + pr_debug("%s section (off:%u len:%u) is beyond the end of the ELF section .BTF.ext\n", + ext_sec->desc, ext_sec->off, ext_sec->len); return -EINVAL; } - info = data + ext_sec->off; - info_left = ext_sec->len; - /* At least a record size */ if (info_left < sizeof(__u32)) { - elog(".BTF.ext %s record size not found\n", ext_sec->desc); + pr_debug(".BTF.ext %s record size not found\n", ext_sec->desc); return -EINVAL; } @@ -554,8 +635,8 @@ static int btf_ext_copy_info(struct btf_ext *btf_ext, record_size = *(__u32 *)info; if (record_size < ext_sec->min_rec_size || record_size & 0x03) { - elog("%s section in .BTF.ext has invalid record size %u\n", - ext_sec->desc, record_size); + pr_debug("%s section in .BTF.ext has invalid record size %u\n", + ext_sec->desc, record_size); return -EINVAL; } @@ -564,7 +645,7 @@ static int btf_ext_copy_info(struct btf_ext *btf_ext, /* If no records, return failure now so .BTF.ext won't be used. */ if (!info_left) { - elog("%s section in .BTF.ext has no records", ext_sec->desc); + pr_debug("%s section in .BTF.ext has no records", ext_sec->desc); return -EINVAL; } @@ -574,14 +655,14 @@ static int btf_ext_copy_info(struct btf_ext *btf_ext, __u32 num_records; if (info_left < sec_hdrlen) { - elog("%s section header is not found in .BTF.ext\n", + pr_debug("%s section header is not found in .BTF.ext\n", ext_sec->desc); return -EINVAL; } num_records = sinfo->num_info; if (num_records == 0) { - elog("%s section has incorrect num_records in .BTF.ext\n", + pr_debug("%s section has incorrect num_records in .BTF.ext\n", ext_sec->desc); return -EINVAL; } @@ -589,7 +670,7 @@ static int btf_ext_copy_info(struct btf_ext *btf_ext, total_record_size = sec_hdrlen + (__u64)num_records * record_size; if (info_left < total_record_size) { - elog("%s section has incorrect num_records in .BTF.ext\n", + pr_debug("%s section has incorrect num_records in .BTF.ext\n", ext_sec->desc); return -EINVAL; } @@ -601,74 +682,64 @@ static int btf_ext_copy_info(struct btf_ext *btf_ext, ext_info = ext_sec->ext_info; ext_info->len = ext_sec->len - sizeof(__u32); ext_info->rec_size = record_size; - ext_info->info = malloc(ext_info->len); - if (!ext_info->info) - return -ENOMEM; - memcpy(ext_info->info, info + sizeof(__u32), ext_info->len); + ext_info->info = info + sizeof(__u32); return 0; } -static int btf_ext_copy_func_info(struct btf_ext *btf_ext, - __u8 *data, __u32 data_size, - btf_print_fn_t err_log) +static int btf_ext_setup_func_info(struct btf_ext *btf_ext) { - const struct btf_ext_header *hdr = (struct btf_ext_header *)data; - struct btf_ext_sec_copy_param param = { - .off = hdr->func_info_off, - .len = hdr->func_info_len, + struct btf_ext_sec_setup_param param = { + .off = btf_ext->hdr->func_info_off, + .len = btf_ext->hdr->func_info_len, .min_rec_size = sizeof(struct bpf_func_info_min), .ext_info = &btf_ext->func_info, .desc = "func_info" }; - return btf_ext_copy_info(btf_ext, data, data_size, ¶m, err_log); + return btf_ext_setup_info(btf_ext, ¶m); } -static int btf_ext_copy_line_info(struct btf_ext *btf_ext, - __u8 *data, __u32 data_size, - btf_print_fn_t err_log) +static int btf_ext_setup_line_info(struct btf_ext *btf_ext) { - const struct btf_ext_header *hdr = (struct btf_ext_header *)data; - struct btf_ext_sec_copy_param param = { - .off = hdr->line_info_off, - .len = hdr->line_info_len, + struct btf_ext_sec_setup_param param = { + .off = btf_ext->hdr->line_info_off, + .len = btf_ext->hdr->line_info_len, .min_rec_size = sizeof(struct bpf_line_info_min), .ext_info = &btf_ext->line_info, .desc = "line_info", }; - return btf_ext_copy_info(btf_ext, data, data_size, ¶m, err_log); + return btf_ext_setup_info(btf_ext, ¶m); } -static int btf_ext_parse_hdr(__u8 *data, __u32 data_size, - btf_print_fn_t err_log) +static int btf_ext_parse_hdr(__u8 *data, __u32 data_size) { const struct btf_ext_header *hdr = (struct btf_ext_header *)data; if (data_size < offsetof(struct btf_ext_header, func_info_off) || data_size < hdr->hdr_len) { - elog("BTF.ext header not found"); + pr_debug("BTF.ext header not found"); return -EINVAL; } if (hdr->magic != BTF_MAGIC) { - elog("Invalid BTF.ext magic:%x\n", hdr->magic); + pr_debug("Invalid BTF.ext magic:%x\n", hdr->magic); return -EINVAL; } if (hdr->version != BTF_VERSION) { - elog("Unsupported BTF.ext version:%u\n", hdr->version); + pr_debug("Unsupported BTF.ext version:%u\n", hdr->version); return -ENOTSUP; } if (hdr->flags) { - elog("Unsupported BTF.ext flags:%x\n", hdr->flags); + pr_debug("Unsupported BTF.ext flags:%x\n", hdr->flags); return -ENOTSUP; } if (data_size == hdr->hdr_len) { - elog("BTF.ext has no data\n"); + pr_debug("BTF.ext has no data\n"); return -EINVAL; } @@ -679,18 +750,16 @@ void btf_ext__free(struct btf_ext *btf_ext) { if (!btf_ext) return; - - free(btf_ext->func_info.info); - free(btf_ext->line_info.info); + free(btf_ext->data); free(btf_ext); } -struct btf_ext *btf_ext__new(__u8 *data, __u32 size, btf_print_fn_t err_log) +struct btf_ext *btf_ext__new(__u8 *data, __u32 size) { struct btf_ext *btf_ext; int err; - err = btf_ext_parse_hdr(data, size, err_log); + err = btf_ext_parse_hdr(data, size); if (err) return ERR_PTR(err); @@ -698,13 +767,23 @@ struct btf_ext *btf_ext__new(__u8 *data, __u32 size, btf_print_fn_t err_log) if (!btf_ext) return ERR_PTR(-ENOMEM); - err = btf_ext_copy_func_info(btf_ext, data, size, err_log); - if (err) { - btf_ext__free(btf_ext); - return ERR_PTR(err); + btf_ext->data_size = size; + btf_ext->data = malloc(size); + if (!btf_ext->data) { + err = -ENOMEM; + goto done; } + memcpy(btf_ext->data, data, size); + + err = btf_ext_setup_func_info(btf_ext); + if (err) + goto done; + + err = btf_ext_setup_line_info(btf_ext); + if (err) + goto done; - err = btf_ext_copy_line_info(btf_ext, data, size, err_log); +done: if (err) { btf_ext__free(btf_ext); return ERR_PTR(err); @@ -713,6 +792,12 @@ struct btf_ext *btf_ext__new(__u8 *data, __u32 size, btf_print_fn_t err_log) return btf_ext; } +const void *btf_ext__get_raw_data(const struct btf_ext *btf_ext, __u32 *size) +{ + *size = btf_ext->data_size; + return btf_ext->data; +} + static int btf_ext_reloc_info(const struct btf *btf, const struct btf_ext_info *ext_info, const char *sec_name, __u32 insns_cnt, @@ -761,7 +846,8 @@ static int btf_ext_reloc_info(const struct btf *btf, return -ENOENT; } -int btf_ext__reloc_func_info(const struct btf *btf, const struct btf_ext *btf_ext, +int btf_ext__reloc_func_info(const struct btf *btf, + const struct btf_ext *btf_ext, const char *sec_name, __u32 insns_cnt, void **func_info, __u32 *cnt) { @@ -769,7 +855,8 @@ int btf_ext__reloc_func_info(const struct btf *btf, const struct btf_ext *btf_ex insns_cnt, func_info, cnt); } -int btf_ext__reloc_line_info(const struct btf *btf, const struct btf_ext *btf_ext, +int btf_ext__reloc_line_info(const struct btf *btf, + const struct btf_ext *btf_ext, const char *sec_name, __u32 insns_cnt, void **line_info, __u32 *cnt) { @@ -786,3 +873,1778 @@ __u32 btf_ext__line_info_rec_size(const struct btf_ext *btf_ext) { return btf_ext->line_info.rec_size; } + +struct btf_dedup; + +static struct btf_dedup *btf_dedup_new(struct btf *btf, struct btf_ext *btf_ext, + const struct btf_dedup_opts *opts); +static void btf_dedup_free(struct btf_dedup *d); +static int btf_dedup_strings(struct btf_dedup *d); +static int btf_dedup_prim_types(struct btf_dedup *d); +static int btf_dedup_struct_types(struct btf_dedup *d); +static int btf_dedup_ref_types(struct btf_dedup *d); +static int btf_dedup_compact_types(struct btf_dedup *d); +static int btf_dedup_remap_types(struct btf_dedup *d); + +/* + * Deduplicate BTF types and strings. + * + * BTF dedup algorithm takes as an input `struct btf` representing `.BTF` ELF + * section with all BTF type descriptors and string data. It overwrites that + * memory in-place with deduplicated types and strings without any loss of + * information. If optional `struct btf_ext` representing '.BTF.ext' ELF section + * is provided, all the strings referenced from .BTF.ext section are honored + * and updated to point to the right offsets after deduplication. + * + * If function returns with error, type/string data might be garbled and should + * be discarded. + * + * More verbose and detailed description of both problem btf_dedup is solving, + * as well as solution could be found at: + * https://facebookmicrosites.github.io/bpf/blog/2018/11/14/btf-enhancement.html + * + * Problem description and justification + * ===================================== + * + * BTF type information is typically emitted either as a result of conversion + * from DWARF to BTF or directly by compiler. In both cases, each compilation + * unit contains information about a subset of all the types that are used + * in an application. These subsets are frequently overlapping and contain a lot + * of duplicated information when later concatenated together into a single + * binary. This algorithm ensures that each unique type is represented by single + * BTF type descriptor, greatly reducing resulting size of BTF data. + * + * Compilation unit isolation and subsequent duplication of data is not the only + * problem. The same type hierarchy (e.g., struct and all the type that struct + * references) in different compilation units can be represented in BTF to + * various degrees of completeness (or, rather, incompleteness) due to + * struct/union forward declarations. + * + * Let's take a look at an example, that we'll use to better understand the + * problem (and solution). Suppose we have two compilation units, each using + * same `struct S`, but each of them having incomplete type information about + * struct's fields: + * + * // CU #1: + * struct S; + * struct A { + * int a; + * struct A* self; + * struct S* parent; + * }; + * struct B; + * struct S { + * struct A* a_ptr; + * struct B* b_ptr; + * }; + * + * // CU #2: + * struct S; + * struct A; + * struct B { + * int b; + * struct B* self; + * struct S* parent; + * }; + * struct S { + * struct A* a_ptr; + * struct B* b_ptr; + * }; + * + * In case of CU #1, BTF data will know only that `struct B` exist (but no + * more), but will know the complete type information about `struct A`. While + * for CU #2, it will know full type information about `struct B`, but will + * only know about forward declaration of `struct A` (in BTF terms, it will + * have `BTF_KIND_FWD` type descriptor with name `B`). + * + * This compilation unit isolation means that it's possible that there is no + * single CU with complete type information describing structs `S`, `A`, and + * `B`. Also, we might get tons of duplicated and redundant type information. + * + * Additional complication we need to keep in mind comes from the fact that + * types, in general, can form graphs containing cycles, not just DAGs. + * + * While algorithm does deduplication, it also merges and resolves type + * information (unless disabled throught `struct btf_opts`), whenever possible. + * E.g., in the example above with two compilation units having partial type + * information for structs `A` and `B`, the output of algorithm will emit + * a single copy of each BTF type that describes structs `A`, `B`, and `S` + * (as well as type information for `int` and pointers), as if they were defined + * in a single compilation unit as: + * + * struct A { + * int a; + * struct A* self; + * struct S* parent; + * }; + * struct B { + * int b; + * struct B* self; + * struct S* parent; + * }; + * struct S { + * struct A* a_ptr; + * struct B* b_ptr; + * }; + * + * Algorithm summary + * ================= + * + * Algorithm completes its work in 6 separate passes: + * + * 1. Strings deduplication. + * 2. Primitive types deduplication (int, enum, fwd). + * 3. Struct/union types deduplication. + * 4. Reference types deduplication (pointers, typedefs, arrays, funcs, func + * protos, and const/volatile/restrict modifiers). + * 5. Types compaction. + * 6. Types remapping. + * + * Algorithm determines canonical type descriptor, which is a single + * representative type for each truly unique type. This canonical type is the + * one that will go into final deduplicated BTF type information. For + * struct/unions, it is also the type that algorithm will merge additional type + * information into (while resolving FWDs), as it discovers it from data in + * other CUs. Each input BTF type eventually gets either mapped to itself, if + * that type is canonical, or to some other type, if that type is equivalent + * and was chosen as canonical representative. This mapping is stored in + * `btf_dedup->map` array. This map is also used to record STRUCT/UNION that + * FWD type got resolved to. + * + * To facilitate fast discovery of canonical types, we also maintain canonical + * index (`btf_dedup->dedup_table`), which maps type descriptor's signature hash + * (i.e., hashed kind, name, size, fields, etc) into a list of canonical types + * that match that signature. With sufficiently good choice of type signature + * hashing function, we can limit number of canonical types for each unique type + * signature to a very small number, allowing to find canonical type for any + * duplicated type very quickly. + * + * Struct/union deduplication is the most critical part and algorithm for + * deduplicating structs/unions is described in greater details in comments for + * `btf_dedup_is_equiv` function. + */ +int btf__dedup(struct btf *btf, struct btf_ext *btf_ext, + const struct btf_dedup_opts *opts) +{ + struct btf_dedup *d = btf_dedup_new(btf, btf_ext, opts); + int err; + + if (IS_ERR(d)) { + pr_debug("btf_dedup_new failed: %ld", PTR_ERR(d)); + return -EINVAL; + } + + err = btf_dedup_strings(d); + if (err < 0) { + pr_debug("btf_dedup_strings failed:%d\n", err); + goto done; + } + err = btf_dedup_prim_types(d); + if (err < 0) { + pr_debug("btf_dedup_prim_types failed:%d\n", err); + goto done; + } + err = btf_dedup_struct_types(d); + if (err < 0) { + pr_debug("btf_dedup_struct_types failed:%d\n", err); + goto done; + } + err = btf_dedup_ref_types(d); + if (err < 0) { + pr_debug("btf_dedup_ref_types failed:%d\n", err); + goto done; + } + err = btf_dedup_compact_types(d); + if (err < 0) { + pr_debug("btf_dedup_compact_types failed:%d\n", err); + goto done; + } + err = btf_dedup_remap_types(d); + if (err < 0) { + pr_debug("btf_dedup_remap_types failed:%d\n", err); + goto done; + } + +done: + btf_dedup_free(d); + return err; +} + +#define BTF_DEDUP_TABLE_DEFAULT_SIZE (1 << 14) +#define BTF_DEDUP_TABLE_MAX_SIZE_LOG 31 +#define BTF_UNPROCESSED_ID ((__u32)-1) +#define BTF_IN_PROGRESS_ID ((__u32)-2) + +struct btf_dedup_node { + struct btf_dedup_node *next; + __u32 type_id; +}; + +struct btf_dedup { + /* .BTF section to be deduped in-place */ + struct btf *btf; + /* + * Optional .BTF.ext section. When provided, any strings referenced + * from it will be taken into account when deduping strings + */ + struct btf_ext *btf_ext; + /* + * This is a map from any type's signature hash to a list of possible + * canonical representative type candidates. Hash collisions are + * ignored, so even types of various kinds can share same list of + * candidates, which is fine because we rely on subsequent + * btf_xxx_equal() checks to authoritatively verify type equality. + */ + struct btf_dedup_node **dedup_table; + /* Canonical types map */ + __u32 *map; + /* Hypothetical mapping, used during type graph equivalence checks */ + __u32 *hypot_map; + __u32 *hypot_list; + size_t hypot_cnt; + size_t hypot_cap; + /* Various option modifying behavior of algorithm */ + struct btf_dedup_opts opts; +}; + +struct btf_str_ptr { + const char *str; + __u32 new_off; + bool used; +}; + +struct btf_str_ptrs { + struct btf_str_ptr *ptrs; + const char *data; + __u32 cnt; + __u32 cap; +}; + +static inline __u32 hash_combine(__u32 h, __u32 value) +{ +/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ +#define GOLDEN_RATIO_PRIME 0x9e370001UL + return h * 37 + value * GOLDEN_RATIO_PRIME; +#undef GOLDEN_RATIO_PRIME +} + +#define for_each_dedup_cand(d, hash, node) \ + for (node = d->dedup_table[hash & (d->opts.dedup_table_size - 1)]; \ + node; \ + node = node->next) + +static int btf_dedup_table_add(struct btf_dedup *d, __u32 hash, __u32 type_id) +{ + struct btf_dedup_node *node = malloc(sizeof(struct btf_dedup_node)); + int bucket = hash & (d->opts.dedup_table_size - 1); + + if (!node) + return -ENOMEM; + node->type_id = type_id; + node->next = d->dedup_table[bucket]; + d->dedup_table[bucket] = node; + return 0; +} + +static int btf_dedup_hypot_map_add(struct btf_dedup *d, + __u32 from_id, __u32 to_id) +{ + if (d->hypot_cnt == d->hypot_cap) { + __u32 *new_list; + + d->hypot_cap += max(16, d->hypot_cap / 2); + new_list = realloc(d->hypot_list, sizeof(__u32) * d->hypot_cap); + if (!new_list) + return -ENOMEM; + d->hypot_list = new_list; + } + d->hypot_list[d->hypot_cnt++] = from_id; + d->hypot_map[from_id] = to_id; + return 0; +} + +static void btf_dedup_clear_hypot_map(struct btf_dedup *d) +{ + int i; + + for (i = 0; i < d->hypot_cnt; i++) + d->hypot_map[d->hypot_list[i]] = BTF_UNPROCESSED_ID; + d->hypot_cnt = 0; +} + +static void btf_dedup_table_free(struct btf_dedup *d) +{ + struct btf_dedup_node *head, *tmp; + int i; + + if (!d->dedup_table) + return; + + for (i = 0; i < d->opts.dedup_table_size; i++) { + while (d->dedup_table[i]) { + tmp = d->dedup_table[i]; + d->dedup_table[i] = tmp->next; + free(tmp); + } + + head = d->dedup_table[i]; + while (head) { + tmp = head; + head = head->next; + free(tmp); + } + } + + free(d->dedup_table); + d->dedup_table = NULL; +} + +static void btf_dedup_free(struct btf_dedup *d) +{ + btf_dedup_table_free(d); + + free(d->map); + d->map = NULL; + + free(d->hypot_map); + d->hypot_map = NULL; + + free(d->hypot_list); + d->hypot_list = NULL; + + free(d); +} + +/* Find closest power of two >= to size, capped at 2^max_size_log */ +static __u32 roundup_pow2_max(__u32 size, int max_size_log) +{ + int i; + + for (i = 0; i < max_size_log && (1U << i) < size; i++) + ; + return 1U << i; +} + + +static struct btf_dedup *btf_dedup_new(struct btf *btf, struct btf_ext *btf_ext, + const struct btf_dedup_opts *opts) +{ + struct btf_dedup *d = calloc(1, sizeof(struct btf_dedup)); + int i, err = 0; + __u32 sz; + + if (!d) + return ERR_PTR(-ENOMEM); + + d->opts.dont_resolve_fwds = opts && opts->dont_resolve_fwds; + sz = opts && opts->dedup_table_size ? opts->dedup_table_size + : BTF_DEDUP_TABLE_DEFAULT_SIZE; + sz = roundup_pow2_max(sz, BTF_DEDUP_TABLE_MAX_SIZE_LOG); + d->opts.dedup_table_size = sz; + + d->btf = btf; + d->btf_ext = btf_ext; + + d->dedup_table = calloc(d->opts.dedup_table_size, + sizeof(struct btf_dedup_node *)); + if (!d->dedup_table) { + err = -ENOMEM; + goto done; + } + + d->map = malloc(sizeof(__u32) * (1 + btf->nr_types)); + if (!d->map) { + err = -ENOMEM; + goto done; + } + /* special BTF "void" type is made canonical immediately */ + d->map[0] = 0; + for (i = 1; i <= btf->nr_types; i++) + d->map[i] = BTF_UNPROCESSED_ID; + + d->hypot_map = malloc(sizeof(__u32) * (1 + btf->nr_types)); + if (!d->hypot_map) { + err = -ENOMEM; + goto done; + } + for (i = 0; i <= btf->nr_types; i++) + d->hypot_map[i] = BTF_UNPROCESSED_ID; + +done: + if (err) { + btf_dedup_free(d); + return ERR_PTR(err); + } + + return d; +} + +typedef int (*str_off_fn_t)(__u32 *str_off_ptr, void *ctx); + +/* + * Iterate over all possible places in .BTF and .BTF.ext that can reference + * string and pass pointer to it to a provided callback `fn`. + */ +static int btf_for_each_str_off(struct btf_dedup *d, str_off_fn_t fn, void *ctx) +{ + void *line_data_cur, *line_data_end; + int i, j, r, rec_size; + struct btf_type *t; + + for (i = 1; i <= d->btf->nr_types; i++) { + t = d->btf->types[i]; + r = fn(&t->name_off, ctx); + if (r) + return r; + + switch (BTF_INFO_KIND(t->info)) { + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + struct btf_member *m = (struct btf_member *)(t + 1); + __u16 vlen = BTF_INFO_VLEN(t->info); + + for (j = 0; j < vlen; j++) { + r = fn(&m->name_off, ctx); + if (r) + return r; + m++; + } + break; + } + case BTF_KIND_ENUM: { + struct btf_enum *m = (struct btf_enum *)(t + 1); + __u16 vlen = BTF_INFO_VLEN(t->info); + + for (j = 0; j < vlen; j++) { + r = fn(&m->name_off, ctx); + if (r) + return r; + m++; + } + break; + } + case BTF_KIND_FUNC_PROTO: { + struct btf_param *m = (struct btf_param *)(t + 1); + __u16 vlen = BTF_INFO_VLEN(t->info); + + for (j = 0; j < vlen; j++) { + r = fn(&m->name_off, ctx); + if (r) + return r; + m++; + } + break; + } + default: + break; + } + } + + if (!d->btf_ext) + return 0; + + line_data_cur = d->btf_ext->line_info.info; + line_data_end = d->btf_ext->line_info.info + d->btf_ext->line_info.len; + rec_size = d->btf_ext->line_info.rec_size; + + while (line_data_cur < line_data_end) { + struct btf_ext_info_sec *sec = line_data_cur; + struct bpf_line_info_min *line_info; + __u32 num_info = sec->num_info; + + r = fn(&sec->sec_name_off, ctx); + if (r) + return r; + + line_data_cur += sizeof(struct btf_ext_info_sec); + for (i = 0; i < num_info; i++) { + line_info = line_data_cur; + r = fn(&line_info->file_name_off, ctx); + if (r) + return r; + r = fn(&line_info->line_off, ctx); + if (r) + return r; + line_data_cur += rec_size; + } + } + + return 0; +} + +static int str_sort_by_content(const void *a1, const void *a2) +{ + const struct btf_str_ptr *p1 = a1; + const struct btf_str_ptr *p2 = a2; + + return strcmp(p1->str, p2->str); +} + +static int str_sort_by_offset(const void *a1, const void *a2) +{ + const struct btf_str_ptr *p1 = a1; + const struct btf_str_ptr *p2 = a2; + + if (p1->str != p2->str) + return p1->str < p2->str ? -1 : 1; + return 0; +} + +static int btf_dedup_str_ptr_cmp(const void *str_ptr, const void *pelem) +{ + const struct btf_str_ptr *p = pelem; + + if (str_ptr != p->str) + return (const char *)str_ptr < p->str ? -1 : 1; + return 0; +} + +static int btf_str_mark_as_used(__u32 *str_off_ptr, void *ctx) +{ + struct btf_str_ptrs *strs; + struct btf_str_ptr *s; + + if (*str_off_ptr == 0) + return 0; + + strs = ctx; + s = bsearch(strs->data + *str_off_ptr, strs->ptrs, strs->cnt, + sizeof(struct btf_str_ptr), btf_dedup_str_ptr_cmp); + if (!s) + return -EINVAL; + s->used = true; + return 0; +} + +static int btf_str_remap_offset(__u32 *str_off_ptr, void *ctx) +{ + struct btf_str_ptrs *strs; + struct btf_str_ptr *s; + + if (*str_off_ptr == 0) + return 0; + + strs = ctx; + s = bsearch(strs->data + *str_off_ptr, strs->ptrs, strs->cnt, + sizeof(struct btf_str_ptr), btf_dedup_str_ptr_cmp); + if (!s) + return -EINVAL; + *str_off_ptr = s->new_off; + return 0; +} + +/* + * Dedup string and filter out those that are not referenced from either .BTF + * or .BTF.ext (if provided) sections. + * + * This is done by building index of all strings in BTF's string section, + * then iterating over all entities that can reference strings (e.g., type + * names, struct field names, .BTF.ext line info, etc) and marking corresponding + * strings as used. After that all used strings are deduped and compacted into + * sequential blob of memory and new offsets are calculated. Then all the string + * references are iterated again and rewritten using new offsets. + */ +static int btf_dedup_strings(struct btf_dedup *d) +{ + const struct btf_header *hdr = d->btf->hdr; + char *start = (char *)d->btf->nohdr_data + hdr->str_off; + char *end = start + d->btf->hdr->str_len; + char *p = start, *tmp_strs = NULL; + struct btf_str_ptrs strs = { + .cnt = 0, + .cap = 0, + .ptrs = NULL, + .data = start, + }; + int i, j, err = 0, grp_idx; + bool grp_used; + + /* build index of all strings */ + while (p < end) { + if (strs.cnt + 1 > strs.cap) { + struct btf_str_ptr *new_ptrs; + + strs.cap += max(strs.cnt / 2, 16); + new_ptrs = realloc(strs.ptrs, + sizeof(strs.ptrs[0]) * strs.cap); + if (!new_ptrs) { + err = -ENOMEM; + goto done; + } + strs.ptrs = new_ptrs; + } + + strs.ptrs[strs.cnt].str = p; + strs.ptrs[strs.cnt].used = false; + + p += strlen(p) + 1; + strs.cnt++; + } + + /* temporary storage for deduplicated strings */ + tmp_strs = malloc(d->btf->hdr->str_len); + if (!tmp_strs) { + err = -ENOMEM; + goto done; + } + + /* mark all used strings */ + strs.ptrs[0].used = true; + err = btf_for_each_str_off(d, btf_str_mark_as_used, &strs); + if (err) + goto done; + + /* sort strings by context, so that we can identify duplicates */ + qsort(strs.ptrs, strs.cnt, sizeof(strs.ptrs[0]), str_sort_by_content); + + /* + * iterate groups of equal strings and if any instance in a group was + * referenced, emit single instance and remember new offset + */ + p = tmp_strs; + grp_idx = 0; + grp_used = strs.ptrs[0].used; + /* iterate past end to avoid code duplication after loop */ + for (i = 1; i <= strs.cnt; i++) { + /* + * when i == strs.cnt, we want to skip string comparison and go + * straight to handling last group of strings (otherwise we'd + * need to handle last group after the loop w/ duplicated code) + */ + if (i < strs.cnt && + !strcmp(strs.ptrs[i].str, strs.ptrs[grp_idx].str)) { + grp_used = grp_used || strs.ptrs[i].used; + continue; + } + + /* + * this check would have been required after the loop to handle + * last group of strings, but due to <= condition in a loop + * we avoid that duplication + */ + if (grp_used) { + int new_off = p - tmp_strs; + __u32 len = strlen(strs.ptrs[grp_idx].str); + + memmove(p, strs.ptrs[grp_idx].str, len + 1); + for (j = grp_idx; j < i; j++) + strs.ptrs[j].new_off = new_off; + p += len + 1; + } + + if (i < strs.cnt) { + grp_idx = i; + grp_used = strs.ptrs[i].used; + } + } + + /* replace original strings with deduped ones */ + d->btf->hdr->str_len = p - tmp_strs; + memmove(start, tmp_strs, d->btf->hdr->str_len); + end = start + d->btf->hdr->str_len; + + /* restore original order for further binary search lookups */ + qsort(strs.ptrs, strs.cnt, sizeof(strs.ptrs[0]), str_sort_by_offset); + + /* remap string offsets */ + err = btf_for_each_str_off(d, btf_str_remap_offset, &strs); + if (err) + goto done; + + d->btf->hdr->str_len = end - start; + +done: + free(tmp_strs); + free(strs.ptrs); + return err; +} + +static __u32 btf_hash_common(struct btf_type *t) +{ + __u32 h; + + h = hash_combine(0, t->name_off); + h = hash_combine(h, t->info); + h = hash_combine(h, t->size); + return h; +} + +static bool btf_equal_common(struct btf_type *t1, struct btf_type *t2) +{ + return t1->name_off == t2->name_off && + t1->info == t2->info && + t1->size == t2->size; +} + +/* Calculate type signature hash of INT. */ +static __u32 btf_hash_int(struct btf_type *t) +{ + __u32 info = *(__u32 *)(t + 1); + __u32 h; + + h = btf_hash_common(t); + h = hash_combine(h, info); + return h; +} + +/* Check structural equality of two INTs. */ +static bool btf_equal_int(struct btf_type *t1, struct btf_type *t2) +{ + __u32 info1, info2; + + if (!btf_equal_common(t1, t2)) + return false; + info1 = *(__u32 *)(t1 + 1); + info2 = *(__u32 *)(t2 + 1); + return info1 == info2; +} + +/* Calculate type signature hash of ENUM. */ +static __u32 btf_hash_enum(struct btf_type *t) +{ + struct btf_enum *member = (struct btf_enum *)(t + 1); + __u32 vlen = BTF_INFO_VLEN(t->info); + __u32 h = btf_hash_common(t); + int i; + + for (i = 0; i < vlen; i++) { + h = hash_combine(h, member->name_off); + h = hash_combine(h, member->val); + member++; + } + return h; +} + +/* Check structural equality of two ENUMs. */ +static bool btf_equal_enum(struct btf_type *t1, struct btf_type *t2) +{ + struct btf_enum *m1, *m2; + __u16 vlen; + int i; + + if (!btf_equal_common(t1, t2)) + return false; + + vlen = BTF_INFO_VLEN(t1->info); + m1 = (struct btf_enum *)(t1 + 1); + m2 = (struct btf_enum *)(t2 + 1); + for (i = 0; i < vlen; i++) { + if (m1->name_off != m2->name_off || m1->val != m2->val) + return false; + m1++; + m2++; + } + return true; +} + +/* + * Calculate type signature hash of STRUCT/UNION, ignoring referenced type IDs, + * as referenced type IDs equivalence is established separately during type + * graph equivalence check algorithm. + */ +static __u32 btf_hash_struct(struct btf_type *t) +{ + struct btf_member *member = (struct btf_member *)(t + 1); + __u32 vlen = BTF_INFO_VLEN(t->info); + __u32 h = btf_hash_common(t); + int i; + + for (i = 0; i < vlen; i++) { + h = hash_combine(h, member->name_off); + h = hash_combine(h, member->offset); + /* no hashing of referenced type ID, it can be unresolved yet */ + member++; + } + return h; +} + +/* + * Check structural compatibility of two FUNC_PROTOs, ignoring referenced type + * IDs. This check is performed during type graph equivalence check and + * referenced types equivalence is checked separately. + */ +static bool btf_shallow_equal_struct(struct btf_type *t1, struct btf_type *t2) +{ + struct btf_member *m1, *m2; + __u16 vlen; + int i; + + if (!btf_equal_common(t1, t2)) + return false; + + vlen = BTF_INFO_VLEN(t1->info); + m1 = (struct btf_member *)(t1 + 1); + m2 = (struct btf_member *)(t2 + 1); + for (i = 0; i < vlen; i++) { + if (m1->name_off != m2->name_off || m1->offset != m2->offset) + return false; + m1++; + m2++; + } + return true; +} + +/* + * Calculate type signature hash of ARRAY, including referenced type IDs, + * under assumption that they were already resolved to canonical type IDs and + * are not going to change. + */ +static __u32 btf_hash_array(struct btf_type *t) +{ + struct btf_array *info = (struct btf_array *)(t + 1); + __u32 h = btf_hash_common(t); + + h = hash_combine(h, info->type); + h = hash_combine(h, info->index_type); + h = hash_combine(h, info->nelems); + return h; +} + +/* + * Check exact equality of two ARRAYs, taking into account referenced + * type IDs, under assumption that they were already resolved to canonical + * type IDs and are not going to change. + * This function is called during reference types deduplication to compare + * ARRAY to potential canonical representative. + */ +static bool btf_equal_array(struct btf_type *t1, struct btf_type *t2) +{ + struct btf_array *info1, *info2; + + if (!btf_equal_common(t1, t2)) + return false; + + info1 = (struct btf_array *)(t1 + 1); + info2 = (struct btf_array *)(t2 + 1); + return info1->type == info2->type && + info1->index_type == info2->index_type && + info1->nelems == info2->nelems; +} + +/* + * Check structural compatibility of two ARRAYs, ignoring referenced type + * IDs. This check is performed during type graph equivalence check and + * referenced types equivalence is checked separately. + */ +static bool btf_compat_array(struct btf_type *t1, struct btf_type *t2) +{ + struct btf_array *info1, *info2; + + if (!btf_equal_common(t1, t2)) + return false; + + info1 = (struct btf_array *)(t1 + 1); + info2 = (struct btf_array *)(t2 + 1); + return info1->nelems == info2->nelems; +} + +/* + * Calculate type signature hash of FUNC_PROTO, including referenced type IDs, + * under assumption that they were already resolved to canonical type IDs and + * are not going to change. + */ +static inline __u32 btf_hash_fnproto(struct btf_type *t) +{ + struct btf_param *member = (struct btf_param *)(t + 1); + __u16 vlen = BTF_INFO_VLEN(t->info); + __u32 h = btf_hash_common(t); + int i; + + for (i = 0; i < vlen; i++) { + h = hash_combine(h, member->name_off); + h = hash_combine(h, member->type); + member++; + } + return h; +} + +/* + * Check exact equality of two FUNC_PROTOs, taking into account referenced + * type IDs, under assumption that they were already resolved to canonical + * type IDs and are not going to change. + * This function is called during reference types deduplication to compare + * FUNC_PROTO to potential canonical representative. + */ +static inline bool btf_equal_fnproto(struct btf_type *t1, struct btf_type *t2) +{ + struct btf_param *m1, *m2; + __u16 vlen; + int i; + + if (!btf_equal_common(t1, t2)) + return false; + + vlen = BTF_INFO_VLEN(t1->info); + m1 = (struct btf_param *)(t1 + 1); + m2 = (struct btf_param *)(t2 + 1); + for (i = 0; i < vlen; i++) { + if (m1->name_off != m2->name_off || m1->type != m2->type) + return false; + m1++; + m2++; + } + return true; +} + +/* + * Check structural compatibility of two FUNC_PROTOs, ignoring referenced type + * IDs. This check is performed during type graph equivalence check and + * referenced types equivalence is checked separately. + */ +static inline bool btf_compat_fnproto(struct btf_type *t1, struct btf_type *t2) +{ + struct btf_param *m1, *m2; + __u16 vlen; + int i; + + /* skip return type ID */ + if (t1->name_off != t2->name_off || t1->info != t2->info) + return false; + + vlen = BTF_INFO_VLEN(t1->info); + m1 = (struct btf_param *)(t1 + 1); + m2 = (struct btf_param *)(t2 + 1); + for (i = 0; i < vlen; i++) { + if (m1->name_off != m2->name_off) + return false; + m1++; + m2++; + } + return true; +} + +/* + * Deduplicate primitive types, that can't reference other types, by calculating + * their type signature hash and comparing them with any possible canonical + * candidate. If no canonical candidate matches, type itself is marked as + * canonical and is added into `btf_dedup->dedup_table` as another candidate. + */ +static int btf_dedup_prim_type(struct btf_dedup *d, __u32 type_id) +{ + struct btf_type *t = d->btf->types[type_id]; + struct btf_type *cand; + struct btf_dedup_node *cand_node; + /* if we don't find equivalent type, then we are canonical */ + __u32 new_id = type_id; + __u32 h; + + switch (BTF_INFO_KIND(t->info)) { + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_ARRAY: + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_FUNC: + case BTF_KIND_FUNC_PROTO: + return 0; + + case BTF_KIND_INT: + h = btf_hash_int(t); + for_each_dedup_cand(d, h, cand_node) { + cand = d->btf->types[cand_node->type_id]; + if (btf_equal_int(t, cand)) { + new_id = cand_node->type_id; + break; + } + } + break; + + case BTF_KIND_ENUM: + h = btf_hash_enum(t); + for_each_dedup_cand(d, h, cand_node) { + cand = d->btf->types[cand_node->type_id]; + if (btf_equal_enum(t, cand)) { + new_id = cand_node->type_id; + break; + } + } + break; + + case BTF_KIND_FWD: + h = btf_hash_common(t); + for_each_dedup_cand(d, h, cand_node) { + cand = d->btf->types[cand_node->type_id]; + if (btf_equal_common(t, cand)) { + new_id = cand_node->type_id; + break; + } + } + break; + + default: + return -EINVAL; + } + + d->map[type_id] = new_id; + if (type_id == new_id && btf_dedup_table_add(d, h, type_id)) + return -ENOMEM; + + return 0; +} + +static int btf_dedup_prim_types(struct btf_dedup *d) +{ + int i, err; + + for (i = 1; i <= d->btf->nr_types; i++) { + err = btf_dedup_prim_type(d, i); + if (err) + return err; + } + return 0; +} + +/* + * Check whether type is already mapped into canonical one (could be to itself). + */ +static inline bool is_type_mapped(struct btf_dedup *d, uint32_t type_id) +{ + return d->map[type_id] <= BTF_MAX_NR_TYPES; +} + +/* + * Resolve type ID into its canonical type ID, if any; otherwise return original + * type ID. If type is FWD and is resolved into STRUCT/UNION already, follow + * STRUCT/UNION link and resolve it into canonical type ID as well. + */ +static inline __u32 resolve_type_id(struct btf_dedup *d, __u32 type_id) +{ + while (is_type_mapped(d, type_id) && d->map[type_id] != type_id) + type_id = d->map[type_id]; + return type_id; +} + +/* + * Resolve FWD to underlying STRUCT/UNION, if any; otherwise return original + * type ID. + */ +static uint32_t resolve_fwd_id(struct btf_dedup *d, uint32_t type_id) +{ + __u32 orig_type_id = type_id; + + if (BTF_INFO_KIND(d->btf->types[type_id]->info) != BTF_KIND_FWD) + return type_id; + + while (is_type_mapped(d, type_id) && d->map[type_id] != type_id) + type_id = d->map[type_id]; + + if (BTF_INFO_KIND(d->btf->types[type_id]->info) != BTF_KIND_FWD) + return type_id; + + return orig_type_id; +} + + +static inline __u16 btf_fwd_kind(struct btf_type *t) +{ + return BTF_INFO_KFLAG(t->info) ? BTF_KIND_UNION : BTF_KIND_STRUCT; +} + +/* + * Check equivalence of BTF type graph formed by candidate struct/union (we'll + * call it "candidate graph" in this description for brevity) to a type graph + * formed by (potential) canonical struct/union ("canonical graph" for brevity + * here, though keep in mind that not all types in canonical graph are + * necessarily canonical representatives themselves, some of them might be + * duplicates or its uniqueness might not have been established yet). + * Returns: + * - >0, if type graphs are equivalent; + * - 0, if not equivalent; + * - <0, on error. + * + * Algorithm performs side-by-side DFS traversal of both type graphs and checks + * equivalence of BTF types at each step. If at any point BTF types in candidate + * and canonical graphs are not compatible structurally, whole graphs are + * incompatible. If types are structurally equivalent (i.e., all information + * except referenced type IDs is exactly the same), a mapping from `canon_id` to + * a `cand_id` is recored in hypothetical mapping (`btf_dedup->hypot_map`). + * If a type references other types, then those referenced types are checked + * for equivalence recursively. + * + * During DFS traversal, if we find that for current `canon_id` type we + * already have some mapping in hypothetical map, we check for two possible + * situations: + * - `canon_id` is mapped to exactly the same type as `cand_id`. This will + * happen when type graphs have cycles. In this case we assume those two + * types are equivalent. + * - `canon_id` is mapped to different type. This is contradiction in our + * hypothetical mapping, because same graph in canonical graph corresponds + * to two different types in candidate graph, which for equivalent type + * graphs shouldn't happen. This condition terminates equivalence check + * with negative result. + * + * If type graphs traversal exhausts types to check and find no contradiction, + * then type graphs are equivalent. + * + * When checking types for equivalence, there is one special case: FWD types. + * If FWD type resolution is allowed and one of the types (either from canonical + * or candidate graph) is FWD and other is STRUCT/UNION (depending on FWD's kind + * flag) and their names match, hypothetical mapping is updated to point from + * FWD to STRUCT/UNION. If graphs will be determined as equivalent successfully, + * this mapping will be used to record FWD -> STRUCT/UNION mapping permanently. + * + * Technically, this could lead to incorrect FWD to STRUCT/UNION resolution, + * if there are two exactly named (or anonymous) structs/unions that are + * compatible structurally, one of which has FWD field, while other is concrete + * STRUCT/UNION, but according to C sources they are different structs/unions + * that are referencing different types with the same name. This is extremely + * unlikely to happen, but btf_dedup API allows to disable FWD resolution if + * this logic is causing problems. + * + * Doing FWD resolution means that both candidate and/or canonical graphs can + * consists of portions of the graph that come from multiple compilation units. + * This is due to the fact that types within single compilation unit are always + * deduplicated and FWDs are already resolved, if referenced struct/union + * definiton is available. So, if we had unresolved FWD and found corresponding + * STRUCT/UNION, they will be from different compilation units. This + * consequently means that when we "link" FWD to corresponding STRUCT/UNION, + * type graph will likely have at least two different BTF types that describe + * same type (e.g., most probably there will be two different BTF types for the + * same 'int' primitive type) and could even have "overlapping" parts of type + * graph that describe same subset of types. + * + * This in turn means that our assumption that each type in canonical graph + * must correspond to exactly one type in candidate graph might not hold + * anymore and will make it harder to detect contradictions using hypothetical + * map. To handle this problem, we allow to follow FWD -> STRUCT/UNION + * resolution only in canonical graph. FWDs in candidate graphs are never + * resolved. To see why it's OK, let's check all possible situations w.r.t. FWDs + * that can occur: + * - Both types in canonical and candidate graphs are FWDs. If they are + * structurally equivalent, then they can either be both resolved to the + * same STRUCT/UNION or not resolved at all. In both cases they are + * equivalent and there is no need to resolve FWD on candidate side. + * - Both types in canonical and candidate graphs are concrete STRUCT/UNION, + * so nothing to resolve as well, algorithm will check equivalence anyway. + * - Type in canonical graph is FWD, while type in candidate is concrete + * STRUCT/UNION. In this case candidate graph comes from single compilation + * unit, so there is exactly one BTF type for each unique C type. After + * resolving FWD into STRUCT/UNION, there might be more than one BTF type + * in canonical graph mapping to single BTF type in candidate graph, but + * because hypothetical mapping maps from canonical to candidate types, it's + * alright, and we still maintain the property of having single `canon_id` + * mapping to single `cand_id` (there could be two different `canon_id` + * mapped to the same `cand_id`, but it's not contradictory). + * - Type in canonical graph is concrete STRUCT/UNION, while type in candidate + * graph is FWD. In this case we are just going to check compatibility of + * STRUCT/UNION and corresponding FWD, and if they are compatible, we'll + * assume that whatever STRUCT/UNION FWD resolves to must be equivalent to + * a concrete STRUCT/UNION from canonical graph. If the rest of type graphs + * turn out equivalent, we'll re-resolve FWD to concrete STRUCT/UNION from + * canonical graph. + */ +static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id, + __u32 canon_id) +{ + struct btf_type *cand_type; + struct btf_type *canon_type; + __u32 hypot_type_id; + __u16 cand_kind; + __u16 canon_kind; + int i, eq; + + /* if both resolve to the same canonical, they must be equivalent */ + if (resolve_type_id(d, cand_id) == resolve_type_id(d, canon_id)) + return 1; + + canon_id = resolve_fwd_id(d, canon_id); + + hypot_type_id = d->hypot_map[canon_id]; + if (hypot_type_id <= BTF_MAX_NR_TYPES) + return hypot_type_id == cand_id; + + if (btf_dedup_hypot_map_add(d, canon_id, cand_id)) + return -ENOMEM; + + cand_type = d->btf->types[cand_id]; + canon_type = d->btf->types[canon_id]; + cand_kind = BTF_INFO_KIND(cand_type->info); + canon_kind = BTF_INFO_KIND(canon_type->info); + + if (cand_type->name_off != canon_type->name_off) + return 0; + + /* FWD <--> STRUCT/UNION equivalence check, if enabled */ + if (!d->opts.dont_resolve_fwds + && (cand_kind == BTF_KIND_FWD || canon_kind == BTF_KIND_FWD) + && cand_kind != canon_kind) { + __u16 real_kind; + __u16 fwd_kind; + + if (cand_kind == BTF_KIND_FWD) { + real_kind = canon_kind; + fwd_kind = btf_fwd_kind(cand_type); + } else { + real_kind = cand_kind; + fwd_kind = btf_fwd_kind(canon_type); + } + return fwd_kind == real_kind; + } + + if (cand_type->info != canon_type->info) + return 0; + + switch (cand_kind) { + case BTF_KIND_INT: + return btf_equal_int(cand_type, canon_type); + + case BTF_KIND_ENUM: + return btf_equal_enum(cand_type, canon_type); + + case BTF_KIND_FWD: + return btf_equal_common(cand_type, canon_type); + + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FUNC: + return btf_dedup_is_equiv(d, cand_type->type, canon_type->type); + + case BTF_KIND_ARRAY: { + struct btf_array *cand_arr, *canon_arr; + + if (!btf_compat_array(cand_type, canon_type)) + return 0; + cand_arr = (struct btf_array *)(cand_type + 1); + canon_arr = (struct btf_array *)(canon_type + 1); + eq = btf_dedup_is_equiv(d, + cand_arr->index_type, canon_arr->index_type); + if (eq <= 0) + return eq; + return btf_dedup_is_equiv(d, cand_arr->type, canon_arr->type); + } + + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + struct btf_member *cand_m, *canon_m; + __u16 vlen; + + if (!btf_shallow_equal_struct(cand_type, canon_type)) + return 0; + vlen = BTF_INFO_VLEN(cand_type->info); + cand_m = (struct btf_member *)(cand_type + 1); + canon_m = (struct btf_member *)(canon_type + 1); + for (i = 0; i < vlen; i++) { + eq = btf_dedup_is_equiv(d, cand_m->type, canon_m->type); + if (eq <= 0) + return eq; + cand_m++; + canon_m++; + } + + return 1; + } + + case BTF_KIND_FUNC_PROTO: { + struct btf_param *cand_p, *canon_p; + __u16 vlen; + + if (!btf_compat_fnproto(cand_type, canon_type)) + return 0; + eq = btf_dedup_is_equiv(d, cand_type->type, canon_type->type); + if (eq <= 0) + return eq; + vlen = BTF_INFO_VLEN(cand_type->info); + cand_p = (struct btf_param *)(cand_type + 1); + canon_p = (struct btf_param *)(canon_type + 1); + for (i = 0; i < vlen; i++) { + eq = btf_dedup_is_equiv(d, cand_p->type, canon_p->type); + if (eq <= 0) + return eq; + cand_p++; + canon_p++; + } + return 1; + } + + default: + return -EINVAL; + } + return 0; +} + +/* + * Use hypothetical mapping, produced by successful type graph equivalence + * check, to augment existing struct/union canonical mapping, where possible. + * + * If BTF_KIND_FWD resolution is allowed, this mapping is also used to record + * FWD -> STRUCT/UNION correspondence as well. FWD resolution is bidirectional: + * it doesn't matter if FWD type was part of canonical graph or candidate one, + * we are recording the mapping anyway. As opposed to carefulness required + * for struct/union correspondence mapping (described below), for FWD resolution + * it's not important, as by the time that FWD type (reference type) will be + * deduplicated all structs/unions will be deduped already anyway. + * + * Recording STRUCT/UNION mapping is purely a performance optimization and is + * not required for correctness. It needs to be done carefully to ensure that + * struct/union from candidate's type graph is not mapped into corresponding + * struct/union from canonical type graph that itself hasn't been resolved into + * canonical representative. The only guarantee we have is that canonical + * struct/union was determined as canonical and that won't change. But any + * types referenced through that struct/union fields could have been not yet + * resolved, so in case like that it's too early to establish any kind of + * correspondence between structs/unions. + * + * No canonical correspondence is derived for primitive types (they are already + * deduplicated completely already anyway) or reference types (they rely on + * stability of struct/union canonical relationship for equivalence checks). + */ +static void btf_dedup_merge_hypot_map(struct btf_dedup *d) +{ + __u32 cand_type_id, targ_type_id; + __u16 t_kind, c_kind; + __u32 t_id, c_id; + int i; + + for (i = 0; i < d->hypot_cnt; i++) { + cand_type_id = d->hypot_list[i]; + targ_type_id = d->hypot_map[cand_type_id]; + t_id = resolve_type_id(d, targ_type_id); + c_id = resolve_type_id(d, cand_type_id); + t_kind = BTF_INFO_KIND(d->btf->types[t_id]->info); + c_kind = BTF_INFO_KIND(d->btf->types[c_id]->info); + /* + * Resolve FWD into STRUCT/UNION. + * It's ok to resolve FWD into STRUCT/UNION that's not yet + * mapped to canonical representative (as opposed to + * STRUCT/UNION <--> STRUCT/UNION mapping logic below), because + * eventually that struct is going to be mapped and all resolved + * FWDs will automatically resolve to correct canonical + * representative. This will happen before ref type deduping, + * which critically depends on stability of these mapping. This + * stability is not a requirement for STRUCT/UNION equivalence + * checks, though. + */ + if (t_kind != BTF_KIND_FWD && c_kind == BTF_KIND_FWD) + d->map[c_id] = t_id; + else if (t_kind == BTF_KIND_FWD && c_kind != BTF_KIND_FWD) + d->map[t_id] = c_id; + + if ((t_kind == BTF_KIND_STRUCT || t_kind == BTF_KIND_UNION) && + c_kind != BTF_KIND_FWD && + is_type_mapped(d, c_id) && + !is_type_mapped(d, t_id)) { + /* + * as a perf optimization, we can map struct/union + * that's part of type graph we just verified for + * equivalence. We can do that for struct/union that has + * canonical representative only, though. + */ + d->map[t_id] = c_id; + } + } +} + +/* + * Deduplicate struct/union types. + * + * For each struct/union type its type signature hash is calculated, taking + * into account type's name, size, number, order and names of fields, but + * ignoring type ID's referenced from fields, because they might not be deduped + * completely until after reference types deduplication phase. This type hash + * is used to iterate over all potential canonical types, sharing same hash. + * For each canonical candidate we check whether type graphs that they form + * (through referenced types in fields and so on) are equivalent using algorithm + * implemented in `btf_dedup_is_equiv`. If such equivalence is found and + * BTF_KIND_FWD resolution is allowed, then hypothetical mapping + * (btf_dedup->hypot_map) produced by aforementioned type graph equivalence + * algorithm is used to record FWD -> STRUCT/UNION mapping. It's also used to + * potentially map other structs/unions to their canonical representatives, + * if such relationship hasn't yet been established. This speeds up algorithm + * by eliminating some of the duplicate work. + * + * If no matching canonical representative was found, struct/union is marked + * as canonical for itself and is added into btf_dedup->dedup_table hash map + * for further look ups. + */ +static int btf_dedup_struct_type(struct btf_dedup *d, __u32 type_id) +{ + struct btf_dedup_node *cand_node; + struct btf_type *cand_type, *t; + /* if we don't find equivalent type, then we are canonical */ + __u32 new_id = type_id; + __u16 kind; + __u32 h; + + /* already deduped or is in process of deduping (loop detected) */ + if (d->map[type_id] <= BTF_MAX_NR_TYPES) + return 0; + + t = d->btf->types[type_id]; + kind = BTF_INFO_KIND(t->info); + + if (kind != BTF_KIND_STRUCT && kind != BTF_KIND_UNION) + return 0; + + h = btf_hash_struct(t); + for_each_dedup_cand(d, h, cand_node) { + int eq; + + /* + * Even though btf_dedup_is_equiv() checks for + * btf_shallow_equal_struct() internally when checking two + * structs (unions) for equivalence, we need to guard here + * from picking matching FWD type as a dedup candidate. + * This can happen due to hash collision. In such case just + * relying on btf_dedup_is_equiv() would lead to potentially + * creating a loop (FWD -> STRUCT and STRUCT -> FWD), because + * FWD and compatible STRUCT/UNION are considered equivalent. + */ + cand_type = d->btf->types[cand_node->type_id]; + if (!btf_shallow_equal_struct(t, cand_type)) + continue; + + btf_dedup_clear_hypot_map(d); + eq = btf_dedup_is_equiv(d, type_id, cand_node->type_id); + if (eq < 0) + return eq; + if (!eq) + continue; + new_id = cand_node->type_id; + btf_dedup_merge_hypot_map(d); + break; + } + + d->map[type_id] = new_id; + if (type_id == new_id && btf_dedup_table_add(d, h, type_id)) + return -ENOMEM; + + return 0; +} + +static int btf_dedup_struct_types(struct btf_dedup *d) +{ + int i, err; + + for (i = 1; i <= d->btf->nr_types; i++) { + err = btf_dedup_struct_type(d, i); + if (err) + return err; + } + return 0; +} + +/* + * Deduplicate reference type. + * + * Once all primitive and struct/union types got deduplicated, we can easily + * deduplicate all other (reference) BTF types. This is done in two steps: + * + * 1. Resolve all referenced type IDs into their canonical type IDs. This + * resolution can be done either immediately for primitive or struct/union types + * (because they were deduped in previous two phases) or recursively for + * reference types. Recursion will always terminate at either primitive or + * struct/union type, at which point we can "unwind" chain of reference types + * one by one. There is no danger of encountering cycles because in C type + * system the only way to form type cycle is through struct/union, so any chain + * of reference types, even those taking part in a type cycle, will inevitably + * reach struct/union at some point. + * + * 2. Once all referenced type IDs are resolved into canonical ones, BTF type + * becomes "stable", in the sense that no further deduplication will cause + * any changes to it. With that, it's now possible to calculate type's signature + * hash (this time taking into account referenced type IDs) and loop over all + * potential canonical representatives. If no match was found, current type + * will become canonical representative of itself and will be added into + * btf_dedup->dedup_table as another possible canonical representative. + */ +static int btf_dedup_ref_type(struct btf_dedup *d, __u32 type_id) +{ + struct btf_dedup_node *cand_node; + struct btf_type *t, *cand; + /* if we don't find equivalent type, then we are representative type */ + __u32 new_id = type_id; + int ref_type_id; + __u32 h; + + if (d->map[type_id] == BTF_IN_PROGRESS_ID) + return -ELOOP; + if (d->map[type_id] <= BTF_MAX_NR_TYPES) + return resolve_type_id(d, type_id); + + t = d->btf->types[type_id]; + d->map[type_id] = BTF_IN_PROGRESS_ID; + + switch (BTF_INFO_KIND(t->info)) { + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FUNC: + ref_type_id = btf_dedup_ref_type(d, t->type); + if (ref_type_id < 0) + return ref_type_id; + t->type = ref_type_id; + + h = btf_hash_common(t); + for_each_dedup_cand(d, h, cand_node) { + cand = d->btf->types[cand_node->type_id]; + if (btf_equal_common(t, cand)) { + new_id = cand_node->type_id; + break; + } + } + break; + + case BTF_KIND_ARRAY: { + struct btf_array *info = (struct btf_array *)(t + 1); + + ref_type_id = btf_dedup_ref_type(d, info->type); + if (ref_type_id < 0) + return ref_type_id; + info->type = ref_type_id; + + ref_type_id = btf_dedup_ref_type(d, info->index_type); + if (ref_type_id < 0) + return ref_type_id; + info->index_type = ref_type_id; + + h = btf_hash_array(t); + for_each_dedup_cand(d, h, cand_node) { + cand = d->btf->types[cand_node->type_id]; + if (btf_equal_array(t, cand)) { + new_id = cand_node->type_id; + break; + } + } + break; + } + + case BTF_KIND_FUNC_PROTO: { + struct btf_param *param; + __u16 vlen; + int i; + + ref_type_id = btf_dedup_ref_type(d, t->type); + if (ref_type_id < 0) + return ref_type_id; + t->type = ref_type_id; + + vlen = BTF_INFO_VLEN(t->info); + param = (struct btf_param *)(t + 1); + for (i = 0; i < vlen; i++) { + ref_type_id = btf_dedup_ref_type(d, param->type); + if (ref_type_id < 0) + return ref_type_id; + param->type = ref_type_id; + param++; + } + + h = btf_hash_fnproto(t); + for_each_dedup_cand(d, h, cand_node) { + cand = d->btf->types[cand_node->type_id]; + if (btf_equal_fnproto(t, cand)) { + new_id = cand_node->type_id; + break; + } + } + break; + } + + default: + return -EINVAL; + } + + d->map[type_id] = new_id; + if (type_id == new_id && btf_dedup_table_add(d, h, type_id)) + return -ENOMEM; + + return new_id; +} + +static int btf_dedup_ref_types(struct btf_dedup *d) +{ + int i, err; + + for (i = 1; i <= d->btf->nr_types; i++) { + err = btf_dedup_ref_type(d, i); + if (err < 0) + return err; + } + btf_dedup_table_free(d); + return 0; +} + +/* + * Compact types. + * + * After we established for each type its corresponding canonical representative + * type, we now can eliminate types that are not canonical and leave only + * canonical ones layed out sequentially in memory by copying them over + * duplicates. During compaction btf_dedup->hypot_map array is reused to store + * a map from original type ID to a new compacted type ID, which will be used + * during next phase to "fix up" type IDs, referenced from struct/union and + * reference types. + */ +static int btf_dedup_compact_types(struct btf_dedup *d) +{ + struct btf_type **new_types; + __u32 next_type_id = 1; + char *types_start, *p; + int i, len; + + /* we are going to reuse hypot_map to store compaction remapping */ + d->hypot_map[0] = 0; + for (i = 1; i <= d->btf->nr_types; i++) + d->hypot_map[i] = BTF_UNPROCESSED_ID; + + types_start = d->btf->nohdr_data + d->btf->hdr->type_off; + p = types_start; + + for (i = 1; i <= d->btf->nr_types; i++) { + if (d->map[i] != i) + continue; + + len = btf_type_size(d->btf->types[i]); + if (len < 0) + return len; + + memmove(p, d->btf->types[i], len); + d->hypot_map[i] = next_type_id; + d->btf->types[next_type_id] = (struct btf_type *)p; + p += len; + next_type_id++; + } + + /* shrink struct btf's internal types index and update btf_header */ + d->btf->nr_types = next_type_id - 1; + d->btf->types_size = d->btf->nr_types; + d->btf->hdr->type_len = p - types_start; + new_types = realloc(d->btf->types, + (1 + d->btf->nr_types) * sizeof(struct btf_type *)); + if (!new_types) + return -ENOMEM; + d->btf->types = new_types; + + /* make sure string section follows type information without gaps */ + d->btf->hdr->str_off = p - (char *)d->btf->nohdr_data; + memmove(p, d->btf->strings, d->btf->hdr->str_len); + d->btf->strings = p; + p += d->btf->hdr->str_len; + + d->btf->data_size = p - (char *)d->btf->data; + return 0; +} + +/* + * Figure out final (deduplicated and compacted) type ID for provided original + * `type_id` by first resolving it into corresponding canonical type ID and + * then mapping it to a deduplicated type ID, stored in btf_dedup->hypot_map, + * which is populated during compaction phase. + */ +static int btf_dedup_remap_type_id(struct btf_dedup *d, __u32 type_id) +{ + __u32 resolved_type_id, new_type_id; + + resolved_type_id = resolve_type_id(d, type_id); + new_type_id = d->hypot_map[resolved_type_id]; + if (new_type_id > BTF_MAX_NR_TYPES) + return -EINVAL; + return new_type_id; +} + +/* + * Remap referenced type IDs into deduped type IDs. + * + * After BTF types are deduplicated and compacted, their final type IDs may + * differ from original ones. The map from original to a corresponding + * deduped type ID is stored in btf_dedup->hypot_map and is populated during + * compaction phase. During remapping phase we are rewriting all type IDs + * referenced from any BTF type (e.g., struct fields, func proto args, etc) to + * their final deduped type IDs. + */ +static int btf_dedup_remap_type(struct btf_dedup *d, __u32 type_id) +{ + struct btf_type *t = d->btf->types[type_id]; + int i, r; + + switch (BTF_INFO_KIND(t->info)) { + case BTF_KIND_INT: + case BTF_KIND_ENUM: + break; + + case BTF_KIND_FWD: + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FUNC: + r = btf_dedup_remap_type_id(d, t->type); + if (r < 0) + return r; + t->type = r; + break; + + case BTF_KIND_ARRAY: { + struct btf_array *arr_info = (struct btf_array *)(t + 1); + + r = btf_dedup_remap_type_id(d, arr_info->type); + if (r < 0) + return r; + arr_info->type = r; + r = btf_dedup_remap_type_id(d, arr_info->index_type); + if (r < 0) + return r; + arr_info->index_type = r; + break; + } + + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + struct btf_member *member = (struct btf_member *)(t + 1); + __u16 vlen = BTF_INFO_VLEN(t->info); + + for (i = 0; i < vlen; i++) { + r = btf_dedup_remap_type_id(d, member->type); + if (r < 0) + return r; + member->type = r; + member++; + } + break; + } + + case BTF_KIND_FUNC_PROTO: { + struct btf_param *param = (struct btf_param *)(t + 1); + __u16 vlen = BTF_INFO_VLEN(t->info); + + r = btf_dedup_remap_type_id(d, t->type); + if (r < 0) + return r; + t->type = r; + + for (i = 0; i < vlen; i++) { + r = btf_dedup_remap_type_id(d, param->type); + if (r < 0) + return r; + param->type = r; + param++; + } + break; + } + + default: + return -EINVAL; + } + + return 0; +} + +static int btf_dedup_remap_types(struct btf_dedup *d) +{ + int i, r; + + for (i = 1; i <= d->btf->nr_types; i++) { + r = btf_dedup_remap_type(d, i); + if (r < 0) + return r; + } + return 0; +} diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index b0610dcdae6b..28a1e1e59861 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -55,33 +55,47 @@ struct btf_ext_header { __u32 line_info_len; }; -typedef int (*btf_print_fn_t)(const char *, ...) - __attribute__((format(printf, 1, 2))); - LIBBPF_API void btf__free(struct btf *btf); -LIBBPF_API struct btf *btf__new(__u8 *data, __u32 size, btf_print_fn_t err_log); +LIBBPF_API struct btf *btf__new(__u8 *data, __u32 size); +LIBBPF_API int btf__load(struct btf *btf); LIBBPF_API __s32 btf__find_by_name(const struct btf *btf, const char *type_name); +LIBBPF_API __u32 btf__get_nr_types(const struct btf *btf); LIBBPF_API const struct btf_type *btf__type_by_id(const struct btf *btf, __u32 id); LIBBPF_API __s64 btf__resolve_size(const struct btf *btf, __u32 type_id); LIBBPF_API int btf__resolve_type(const struct btf *btf, __u32 type_id); LIBBPF_API int btf__fd(const struct btf *btf); +LIBBPF_API const void *btf__get_raw_data(const struct btf *btf, __u32 *size); LIBBPF_API const char *btf__name_by_offset(const struct btf *btf, __u32 offset); LIBBPF_API int btf__get_from_id(__u32 id, struct btf **btf); +LIBBPF_API int btf__get_map_kv_tids(const struct btf *btf, const char *map_name, + __u32 expected_key_size, + __u32 expected_value_size, + __u32 *key_type_id, __u32 *value_type_id); + +LIBBPF_API struct btf_ext *btf_ext__new(__u8 *data, __u32 size); +LIBBPF_API void btf_ext__free(struct btf_ext *btf_ext); +LIBBPF_API const void *btf_ext__get_raw_data(const struct btf_ext *btf_ext, + __u32 *size); +LIBBPF_API int btf_ext__reloc_func_info(const struct btf *btf, + const struct btf_ext *btf_ext, + const char *sec_name, __u32 insns_cnt, + void **func_info, __u32 *cnt); +LIBBPF_API int btf_ext__reloc_line_info(const struct btf *btf, + const struct btf_ext *btf_ext, + const char *sec_name, __u32 insns_cnt, + void **line_info, __u32 *cnt); +LIBBPF_API __u32 btf_ext__func_info_rec_size(const struct btf_ext *btf_ext); +LIBBPF_API __u32 btf_ext__line_info_rec_size(const struct btf_ext *btf_ext); + +struct btf_dedup_opts { + unsigned int dedup_table_size; + bool dont_resolve_fwds; +}; -struct btf_ext *btf_ext__new(__u8 *data, __u32 size, btf_print_fn_t err_log); -void btf_ext__free(struct btf_ext *btf_ext); -int btf_ext__reloc_func_info(const struct btf *btf, - const struct btf_ext *btf_ext, - const char *sec_name, __u32 insns_cnt, - void **func_info, __u32 *func_info_len); -int btf_ext__reloc_line_info(const struct btf *btf, - const struct btf_ext *btf_ext, - const char *sec_name, __u32 insns_cnt, - void **line_info, __u32 *cnt); -__u32 btf_ext__func_info_rec_size(const struct btf_ext *btf_ext); -__u32 btf_ext__line_info_rec_size(const struct btf_ext *btf_ext); +LIBBPF_API int btf__dedup(struct btf *btf, struct btf_ext *btf_ext, + const struct btf_dedup_opts *opts); #ifdef __cplusplus } /* extern "C" */ diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 169e347c76f6..f5eb60379c8d 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -42,6 +42,7 @@ #include "bpf.h" #include "btf.h" #include "str_error.h" +#include "libbpf_util.h" #ifndef EM_BPF #define EM_BPF 247 @@ -53,39 +54,33 @@ #define __printf(a, b) __attribute__((format(printf, a, b))) -__printf(1, 2) -static int __base_pr(const char *format, ...) +static int __base_pr(enum libbpf_print_level level, const char *format, + va_list args) { - va_list args; - int err; + if (level == LIBBPF_DEBUG) + return 0; - va_start(args, format); - err = vfprintf(stderr, format, args); - va_end(args); - return err; + return vfprintf(stderr, format, args); } -static __printf(1, 2) libbpf_print_fn_t __pr_warning = __base_pr; -static __printf(1, 2) libbpf_print_fn_t __pr_info = __base_pr; -static __printf(1, 2) libbpf_print_fn_t __pr_debug; - -#define __pr(func, fmt, ...) \ -do { \ - if ((func)) \ - (func)("libbpf: " fmt, ##__VA_ARGS__); \ -} while (0) +static libbpf_print_fn_t __libbpf_pr = __base_pr; -#define pr_warning(fmt, ...) __pr(__pr_warning, fmt, ##__VA_ARGS__) -#define pr_info(fmt, ...) __pr(__pr_info, fmt, ##__VA_ARGS__) -#define pr_debug(fmt, ...) __pr(__pr_debug, fmt, ##__VA_ARGS__) +void libbpf_set_print(libbpf_print_fn_t fn) +{ + __libbpf_pr = fn; +} -void libbpf_set_print(libbpf_print_fn_t warn, - libbpf_print_fn_t info, - libbpf_print_fn_t debug) +__printf(2, 3) +void libbpf_print(enum libbpf_print_level level, const char *format, ...) { - __pr_warning = warn; - __pr_info = info; - __pr_debug = debug; + va_list args; + + if (!__libbpf_pr) + return; + + va_start(args, format); + __libbpf_pr(level, format, args); + va_end(args); } #define STRERR_BUFSIZE 128 @@ -312,7 +307,7 @@ bpf_program__init(void *data, size_t size, char *section_name, int idx, return -EINVAL; } - bzero(prog, sizeof(*prog)); + memset(prog, 0, sizeof(*prog)); prog->section_name = strdup(section_name); if (!prog->section_name) { @@ -839,9 +834,8 @@ static int bpf_object__elf_collect(struct bpf_object *obj, int flags) else if (strcmp(name, "maps") == 0) obj->efile.maps_shndx = idx; else if (strcmp(name, BTF_ELF_SEC) == 0) { - obj->btf = btf__new(data->d_buf, data->d_size, - __pr_debug); - if (IS_ERR(obj->btf)) { + obj->btf = btf__new(data->d_buf, data->d_size); + if (IS_ERR(obj->btf) || btf__load(obj->btf)) { pr_warning("Error loading ELF section %s: %ld. Ignored and continue.\n", BTF_ELF_SEC, PTR_ERR(obj->btf)); obj->btf = NULL; @@ -915,8 +909,7 @@ static int bpf_object__elf_collect(struct bpf_object *obj, int flags) BTF_EXT_ELF_SEC, BTF_ELF_SEC); } else { obj->btf_ext = btf_ext__new(btf_ext_data->d_buf, - btf_ext_data->d_size, - __pr_debug); + btf_ext_data->d_size); if (IS_ERR(obj->btf_ext)) { pr_warning("Error loading ELF section %s: %ld. Ignored and continue.\n", BTF_EXT_ELF_SEC, @@ -1057,72 +1050,18 @@ bpf_program__collect_reloc(struct bpf_program *prog, GElf_Shdr *shdr, static int bpf_map_find_btf_info(struct bpf_map *map, const struct btf *btf) { - const struct btf_type *container_type; - const struct btf_member *key, *value; struct bpf_map_def *def = &map->def; - const size_t max_name = 256; - char container_name[max_name]; - __s64 key_size, value_size; - __s32 container_id; - - if (snprintf(container_name, max_name, "____btf_map_%s", map->name) == - max_name) { - pr_warning("map:%s length of '____btf_map_%s' is too long\n", - map->name, map->name); - return -EINVAL; - } - - container_id = btf__find_by_name(btf, container_name); - if (container_id < 0) { - pr_debug("map:%s container_name:%s cannot be found in BTF. Missing BPF_ANNOTATE_KV_PAIR?\n", - map->name, container_name); - return container_id; - } - - container_type = btf__type_by_id(btf, container_id); - if (!container_type) { - pr_warning("map:%s cannot find BTF type for container_id:%u\n", - map->name, container_id); - return -EINVAL; - } - - if (BTF_INFO_KIND(container_type->info) != BTF_KIND_STRUCT || - BTF_INFO_VLEN(container_type->info) < 2) { - pr_warning("map:%s container_name:%s is an invalid container struct\n", - map->name, container_name); - return -EINVAL; - } - - key = (struct btf_member *)(container_type + 1); - value = key + 1; - - key_size = btf__resolve_size(btf, key->type); - if (key_size < 0) { - pr_warning("map:%s invalid BTF key_type_size\n", - map->name); - return key_size; - } - - if (def->key_size != key_size) { - pr_warning("map:%s btf_key_type_size:%u != map_def_key_size:%u\n", - map->name, (__u32)key_size, def->key_size); - return -EINVAL; - } - - value_size = btf__resolve_size(btf, value->type); - if (value_size < 0) { - pr_warning("map:%s invalid BTF value_type_size\n", map->name); - return value_size; - } + __u32 key_type_id, value_type_id; + int ret; - if (def->value_size != value_size) { - pr_warning("map:%s btf_value_type_size:%u != map_def_value_size:%u\n", - map->name, (__u32)value_size, def->value_size); - return -EINVAL; - } + ret = btf__get_map_kv_tids(btf, map->name, def->key_size, + def->value_size, &key_type_id, + &value_type_id); + if (ret) + return ret; - map->btf_key_type_id = key->type; - map->btf_value_type_id = value->type; + map->btf_key_type_id = key_type_id; + map->btf_value_type_id = value_type_id; return 0; } @@ -1174,6 +1113,20 @@ err_free_new_name: return -errno; } +int bpf_map__resize(struct bpf_map *map, __u32 max_entries) +{ + if (!map || !max_entries) + return -EINVAL; + + /* If map already created, its attributes can't be changed. */ + if (map->fd >= 0) + return -EBUSY; + + map->def.max_entries = max_entries; + + return 0; +} + static int bpf_object__probe_name(struct bpf_object *obj) { @@ -1637,7 +1590,7 @@ bpf_program__load(struct bpf_program *prog, struct bpf_prog_prep_result result; bpf_program_prep_t preprocessor = prog->preprocessor; - bzero(&result, sizeof(result)); + memset(&result, 0, sizeof(result)); err = preprocessor(prog, i, prog->insns, prog->insns_cnt, &result); if (err) { @@ -2147,7 +2100,7 @@ int bpf_object__pin_maps(struct bpf_object *obj, const char *path) if (err) return err; - bpf_map__for_each(map, obj) { + bpf_object__for_each_map(map, obj) { char buf[PATH_MAX]; int len; @@ -2194,7 +2147,7 @@ int bpf_object__unpin_maps(struct bpf_object *obj, const char *path) if (!obj) return -ENOENT; - bpf_map__for_each(map, obj) { + bpf_object__for_each_map(map, obj) { char buf[PATH_MAX]; int len; @@ -2378,6 +2331,11 @@ unsigned int bpf_object__kversion(struct bpf_object *obj) return obj ? obj->kern_version : 0; } +struct btf *bpf_object__btf(struct bpf_object *obj) +{ + return obj ? obj->btf : NULL; +} + int bpf_object__btf_fd(const struct bpf_object *obj) { return obj->btf ? btf__fd(obj->btf) : -1; @@ -2667,9 +2625,38 @@ static const struct { #undef BPF_EAPROG_SEC #undef BPF_APROG_COMPAT +#define MAX_TYPE_NAME_SIZE 32 + +static char *libbpf_get_type_names(bool attach_type) +{ + int i, len = ARRAY_SIZE(section_names) * MAX_TYPE_NAME_SIZE; + char *buf; + + buf = malloc(len); + if (!buf) + return NULL; + + buf[0] = '\0'; + /* Forge string buf with all available names */ + for (i = 0; i < ARRAY_SIZE(section_names); i++) { + if (attach_type && !section_names[i].is_attachable) + continue; + + if (strlen(buf) + strlen(section_names[i].sec) + 2 > len) { + free(buf); + return NULL; + } + strcat(buf, " "); + strcat(buf, section_names[i].sec); + } + + return buf; +} + int libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type, enum bpf_attach_type *expected_attach_type) { + char *type_names; int i; if (!name) @@ -2682,12 +2669,20 @@ int libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type, *expected_attach_type = section_names[i].expected_attach_type; return 0; } + pr_warning("failed to guess program type based on ELF section name '%s'\n", name); + type_names = libbpf_get_type_names(false); + if (type_names != NULL) { + pr_info("supported section(type) names are:%s\n", type_names); + free(type_names); + } + return -EINVAL; } int libbpf_attach_type_by_name(const char *name, enum bpf_attach_type *attach_type) { + char *type_names; int i; if (!name) @@ -2701,6 +2696,13 @@ int libbpf_attach_type_by_name(const char *name, *attach_type = section_names[i].attach_type; return 0; } + pr_warning("failed to guess attach type based on ELF section name '%s'\n", name); + type_names = libbpf_get_type_names(true); + if (type_names != NULL) { + pr_info("attachable section(type) names are:%s\n", type_names); + free(type_names); + } + return -EINVAL; } @@ -2833,13 +2835,19 @@ bpf_object__find_map_by_name(struct bpf_object *obj, const char *name) { struct bpf_map *pos; - bpf_map__for_each(pos, obj) { + bpf_object__for_each_map(pos, obj) { if (pos->name && !strcmp(pos->name, name)) return pos; } return NULL; } +int +bpf_object__find_map_fd_by_name(struct bpf_object *obj, const char *name) +{ + return bpf_map__fd(bpf_object__find_map_by_name(obj, name)); +} + struct bpf_map * bpf_object__find_map_by_offset(struct bpf_object *obj, size_t offset) { @@ -2907,8 +2915,6 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr, err = bpf_program__identify_section(prog, &prog_type, &expected_attach_type); if (err < 0) { - pr_warning("failed to guess program type based on section name %s\n", - prog->section_name); bpf_object__close(obj); return -EINVAL; } @@ -2922,7 +2928,7 @@ int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr, first_prog = prog; } - bpf_map__for_each(map, obj) { + bpf_object__for_each_map(map, obj) { if (!bpf_map__is_offload_neutral(map)) map->map_ifindex = attr->ifindex; } diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 5f68d7b75215..b4652aa1a58a 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -47,17 +47,16 @@ enum libbpf_errno { LIBBPF_API int libbpf_strerror(int err, char *buf, size_t size); -/* - * __printf is defined in include/linux/compiler-gcc.h. However, - * it would be better if libbpf.h didn't depend on Linux header files. - * So instead of __printf, here we use gcc attribute directly. - */ -typedef int (*libbpf_print_fn_t)(const char *, ...) - __attribute__((format(printf, 1, 2))); +enum libbpf_print_level { + LIBBPF_WARN, + LIBBPF_INFO, + LIBBPF_DEBUG, +}; -LIBBPF_API void libbpf_set_print(libbpf_print_fn_t warn, - libbpf_print_fn_t info, - libbpf_print_fn_t debug); +typedef int (*libbpf_print_fn_t)(enum libbpf_print_level level, + const char *, va_list ap); + +LIBBPF_API void libbpf_set_print(libbpf_print_fn_t fn); /* Hide internal to user */ struct bpf_object; @@ -90,6 +89,9 @@ LIBBPF_API int bpf_object__load(struct bpf_object *obj); LIBBPF_API int bpf_object__unload(struct bpf_object *obj); LIBBPF_API const char *bpf_object__name(struct bpf_object *obj); LIBBPF_API unsigned int bpf_object__kversion(struct bpf_object *obj); + +struct btf; +LIBBPF_API struct btf *bpf_object__btf(struct bpf_object *obj); LIBBPF_API int bpf_object__btf_fd(const struct bpf_object *obj); LIBBPF_API struct bpf_program * @@ -264,6 +266,9 @@ struct bpf_map; LIBBPF_API struct bpf_map * bpf_object__find_map_by_name(struct bpf_object *obj, const char *name); +LIBBPF_API int +bpf_object__find_map_fd_by_name(struct bpf_object *obj, const char *name); + /* * Get bpf_map through the offset of corresponding struct bpf_map_def * in the BPF object file. @@ -273,10 +278,11 @@ bpf_object__find_map_by_offset(struct bpf_object *obj, size_t offset); LIBBPF_API struct bpf_map * bpf_map__next(struct bpf_map *map, struct bpf_object *obj); -#define bpf_map__for_each(pos, obj) \ +#define bpf_object__for_each_map(pos, obj) \ for ((pos) = bpf_map__next(NULL, (obj)); \ (pos) != NULL; \ (pos) = bpf_map__next((pos), (obj))) +#define bpf_map__for_each bpf_object__for_each_map LIBBPF_API struct bpf_map * bpf_map__prev(struct bpf_map *map, struct bpf_object *obj); @@ -292,6 +298,7 @@ LIBBPF_API int bpf_map__set_priv(struct bpf_map *map, void *priv, bpf_map_clear_priv_t clear_priv); LIBBPF_API void *bpf_map__priv(struct bpf_map *map); LIBBPF_API int bpf_map__reuse_fd(struct bpf_map *map, int fd); +LIBBPF_API int bpf_map__resize(struct bpf_map *map, __u32 max_entries); LIBBPF_API bool bpf_map__is_offload_neutral(struct bpf_map *map); LIBBPF_API void bpf_map__set_ifindex(struct bpf_map *map, __u32 ifindex); LIBBPF_API int bpf_map__pin(struct bpf_map *map, const char *path); @@ -314,6 +321,7 @@ LIBBPF_API int bpf_prog_load(const char *file, enum bpf_prog_type type, struct bpf_object **pobj, int *prog_fd); LIBBPF_API int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags); +LIBBPF_API int bpf_get_link_xdp_id(int ifindex, __u32 *prog_id, __u32 flags); enum bpf_perf_event_ret { LIBBPF_PERF_EVENT_DONE = 0, @@ -355,6 +363,20 @@ LIBBPF_API const struct bpf_line_info * bpf_prog_linfo__lfind(const struct bpf_prog_linfo *prog_linfo, __u32 insn_off, __u32 nr_skip); +/* + * Probe for supported system features + * + * Note that running many of these probes in a short amount of time can cause + * the kernel to reach the maximal size of lockable memory allowed for the + * user, causing subsequent probes to fail. In this case, the caller may want + * to adjust that limit with setrlimit(). + */ +LIBBPF_API bool bpf_probe_prog_type(enum bpf_prog_type prog_type, + __u32 ifindex); +LIBBPF_API bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex); +LIBBPF_API bool bpf_probe_helper(enum bpf_func_id id, + enum bpf_prog_type prog_type, __u32 ifindex); + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index cd02cd4e2cc3..778a26702a70 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -124,3 +124,33 @@ LIBBPF_0.0.1 { local: *; }; + +LIBBPF_0.0.2 { + global: + bpf_probe_helper; + bpf_probe_map_type; + bpf_probe_prog_type; + bpf_map__resize; + bpf_map_lookup_elem_flags; + bpf_object__btf; + bpf_object__find_map_fd_by_name; + bpf_get_link_xdp_id; + btf__dedup; + btf__get_map_kv_tids; + btf__get_nr_types; + btf__get_raw_data; + btf__load; + btf_ext__free; + btf_ext__func_info_rec_size; + btf_ext__get_raw_data; + btf_ext__line_info_rec_size; + btf_ext__new; + btf_ext__reloc_func_info; + btf_ext__reloc_line_info; + xsk_umem__create; + xsk_socket__create; + xsk_umem__delete; + xsk_socket__delete; + xsk_umem__fd; + xsk_socket__fd; +} LIBBPF_0.0.1; diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c new file mode 100644 index 000000000000..8c3a1c04dcb2 --- /dev/null +++ b/tools/lib/bpf/libbpf_probes.c @@ -0,0 +1,242 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2019 Netronome Systems, Inc. */ + +#include <errno.h> +#include <fcntl.h> +#include <string.h> +#include <stdlib.h> +#include <unistd.h> +#include <net/if.h> +#include <sys/utsname.h> + +#include <linux/filter.h> +#include <linux/kernel.h> + +#include "bpf.h" +#include "libbpf.h" + +static bool grep(const char *buffer, const char *pattern) +{ + return !!strstr(buffer, pattern); +} + +static int get_vendor_id(int ifindex) +{ + char ifname[IF_NAMESIZE], path[64], buf[8]; + ssize_t len; + int fd; + + if (!if_indextoname(ifindex, ifname)) + return -1; + + snprintf(path, sizeof(path), "/sys/class/net/%s/device/vendor", ifname); + + fd = open(path, O_RDONLY); + if (fd < 0) + return -1; + + len = read(fd, buf, sizeof(buf)); + close(fd); + if (len < 0) + return -1; + if (len >= (ssize_t)sizeof(buf)) + return -1; + buf[len] = '\0'; + + return strtol(buf, NULL, 0); +} + +static int get_kernel_version(void) +{ + int version, subversion, patchlevel; + struct utsname utsn; + + /* Return 0 on failure, and attempt to probe with empty kversion */ + if (uname(&utsn)) + return 0; + + if (sscanf(utsn.release, "%d.%d.%d", + &version, &subversion, &patchlevel) != 3) + return 0; + + return (version << 16) + (subversion << 8) + patchlevel; +} + +static void +probe_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, + size_t insns_cnt, char *buf, size_t buf_len, __u32 ifindex) +{ + struct bpf_load_program_attr xattr = {}; + int fd; + + switch (prog_type) { + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + xattr.expected_attach_type = BPF_CGROUP_INET4_CONNECT; + break; + case BPF_PROG_TYPE_KPROBE: + xattr.kern_version = get_kernel_version(); + break; + case BPF_PROG_TYPE_UNSPEC: + case BPF_PROG_TYPE_SOCKET_FILTER: + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: + case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_XDP: + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_CGROUP_SKB: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_LWT_IN: + case BPF_PROG_TYPE_LWT_OUT: + case BPF_PROG_TYPE_LWT_XMIT: + case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_SK_SKB: + case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_SK_MSG: + case BPF_PROG_TYPE_RAW_TRACEPOINT: + case BPF_PROG_TYPE_LWT_SEG6LOCAL: + case BPF_PROG_TYPE_LIRC_MODE2: + case BPF_PROG_TYPE_SK_REUSEPORT: + case BPF_PROG_TYPE_FLOW_DISSECTOR: + default: + break; + } + + xattr.prog_type = prog_type; + xattr.insns = insns; + xattr.insns_cnt = insns_cnt; + xattr.license = "GPL"; + xattr.prog_ifindex = ifindex; + + fd = bpf_load_program_xattr(&xattr, buf, buf_len); + if (fd >= 0) + close(fd); +} + +bool bpf_probe_prog_type(enum bpf_prog_type prog_type, __u32 ifindex) +{ + struct bpf_insn insns[2] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN() + }; + + if (ifindex && prog_type == BPF_PROG_TYPE_SCHED_CLS) + /* nfp returns -EINVAL on exit(0) with TC offload */ + insns[0].imm = 2; + + errno = 0; + probe_load(prog_type, insns, ARRAY_SIZE(insns), NULL, 0, ifindex); + + return errno != EINVAL && errno != EOPNOTSUPP; +} + +bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex) +{ + int key_size, value_size, max_entries, map_flags; + struct bpf_create_map_attr attr = {}; + int fd = -1, fd_inner; + + key_size = sizeof(__u32); + value_size = sizeof(__u32); + max_entries = 1; + map_flags = 0; + + switch (map_type) { + case BPF_MAP_TYPE_STACK_TRACE: + value_size = sizeof(__u64); + break; + case BPF_MAP_TYPE_LPM_TRIE: + key_size = sizeof(__u64); + value_size = sizeof(__u64); + map_flags = BPF_F_NO_PREALLOC; + break; + case BPF_MAP_TYPE_CGROUP_STORAGE: + case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE: + key_size = sizeof(struct bpf_cgroup_storage_key); + value_size = sizeof(__u64); + max_entries = 0; + break; + case BPF_MAP_TYPE_QUEUE: + case BPF_MAP_TYPE_STACK: + key_size = 0; + break; + case BPF_MAP_TYPE_UNSPEC: + case BPF_MAP_TYPE_HASH: + case BPF_MAP_TYPE_ARRAY: + case BPF_MAP_TYPE_PROG_ARRAY: + case BPF_MAP_TYPE_PERF_EVENT_ARRAY: + case BPF_MAP_TYPE_PERCPU_HASH: + case BPF_MAP_TYPE_PERCPU_ARRAY: + case BPF_MAP_TYPE_CGROUP_ARRAY: + case BPF_MAP_TYPE_LRU_HASH: + case BPF_MAP_TYPE_LRU_PERCPU_HASH: + case BPF_MAP_TYPE_ARRAY_OF_MAPS: + case BPF_MAP_TYPE_HASH_OF_MAPS: + case BPF_MAP_TYPE_DEVMAP: + case BPF_MAP_TYPE_SOCKMAP: + case BPF_MAP_TYPE_CPUMAP: + case BPF_MAP_TYPE_XSKMAP: + case BPF_MAP_TYPE_SOCKHASH: + case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: + default: + break; + } + + if (map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS || + map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { + /* TODO: probe for device, once libbpf has a function to create + * map-in-map for offload + */ + if (ifindex) + return false; + + fd_inner = bpf_create_map(BPF_MAP_TYPE_HASH, + sizeof(__u32), sizeof(__u32), 1, 0); + if (fd_inner < 0) + return false; + fd = bpf_create_map_in_map(map_type, NULL, sizeof(__u32), + fd_inner, 1, 0); + close(fd_inner); + } else { + /* Note: No other restriction on map type probes for offload */ + attr.map_type = map_type; + attr.key_size = key_size; + attr.value_size = value_size; + attr.max_entries = max_entries; + attr.map_flags = map_flags; + attr.map_ifindex = ifindex; + + fd = bpf_create_map_xattr(&attr); + } + if (fd >= 0) + close(fd); + + return fd >= 0; +} + +bool bpf_probe_helper(enum bpf_func_id id, enum bpf_prog_type prog_type, + __u32 ifindex) +{ + struct bpf_insn insns[2] = { + BPF_EMIT_CALL(id), + BPF_EXIT_INSN() + }; + char buf[4096] = {}; + bool res; + + probe_load(prog_type, insns, ARRAY_SIZE(insns), buf, sizeof(buf), + ifindex); + res = !grep(buf, "invalid func ") && !grep(buf, "unknown func "); + + if (ifindex) { + switch (get_vendor_id(ifindex)) { + case 0x19ee: /* Netronome specific */ + res = res && !grep(buf, "not supported by FW") && + !grep(buf, "unsupported function id"); + break; + default: + break; + } + } + + return res; +} diff --git a/tools/lib/bpf/libbpf_util.h b/tools/lib/bpf/libbpf_util.h new file mode 100644 index 000000000000..81ecda0cb9c9 --- /dev/null +++ b/tools/lib/bpf/libbpf_util.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (c) 2019 Facebook */ + +#ifndef __LIBBPF_LIBBPF_UTIL_H +#define __LIBBPF_LIBBPF_UTIL_H + +#include <stdbool.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern void libbpf_print(enum libbpf_print_level level, + const char *format, ...) + __attribute__((format(printf, 2, 3))); + +#define __pr(level, fmt, ...) \ +do { \ + libbpf_print(level, "libbpf: " fmt, ##__VA_ARGS__); \ +} while (0) + +#define pr_warning(fmt, ...) __pr(LIBBPF_WARN, fmt, ##__VA_ARGS__) +#define pr_info(fmt, ...) __pr(LIBBPF_INFO, fmt, ##__VA_ARGS__) +#define pr_debug(fmt, ...) __pr(LIBBPF_DEBUG, fmt, ##__VA_ARGS__) + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c index 0ce67aea8f3b..ce3ec81b71c0 100644 --- a/tools/lib/bpf/netlink.c +++ b/tools/lib/bpf/netlink.c @@ -21,6 +21,12 @@ typedef int (*__dump_nlmsg_t)(struct nlmsghdr *nlmsg, libbpf_dump_nlmsg_t, void *cookie); +struct xdp_id_md { + int ifindex; + __u32 flags; + __u32 id; +}; + int libbpf_netlink_open(__u32 *nl_pid) { struct sockaddr_nl sa; @@ -196,6 +202,85 @@ static int __dump_link_nlmsg(struct nlmsghdr *nlh, return dump_link_nlmsg(cookie, ifi, tb); } +static unsigned char get_xdp_id_attr(unsigned char mode, __u32 flags) +{ + if (mode != XDP_ATTACHED_MULTI) + return IFLA_XDP_PROG_ID; + if (flags & XDP_FLAGS_DRV_MODE) + return IFLA_XDP_DRV_PROG_ID; + if (flags & XDP_FLAGS_HW_MODE) + return IFLA_XDP_HW_PROG_ID; + if (flags & XDP_FLAGS_SKB_MODE) + return IFLA_XDP_SKB_PROG_ID; + + return IFLA_XDP_UNSPEC; +} + +static int get_xdp_id(void *cookie, void *msg, struct nlattr **tb) +{ + struct nlattr *xdp_tb[IFLA_XDP_MAX + 1]; + struct xdp_id_md *xdp_id = cookie; + struct ifinfomsg *ifinfo = msg; + unsigned char mode, xdp_attr; + int ret; + + if (xdp_id->ifindex && xdp_id->ifindex != ifinfo->ifi_index) + return 0; + + if (!tb[IFLA_XDP]) + return 0; + + ret = libbpf_nla_parse_nested(xdp_tb, IFLA_XDP_MAX, tb[IFLA_XDP], NULL); + if (ret) + return ret; + + if (!xdp_tb[IFLA_XDP_ATTACHED]) + return 0; + + mode = libbpf_nla_getattr_u8(xdp_tb[IFLA_XDP_ATTACHED]); + if (mode == XDP_ATTACHED_NONE) + return 0; + + xdp_attr = get_xdp_id_attr(mode, xdp_id->flags); + if (!xdp_attr || !xdp_tb[xdp_attr]) + return 0; + + xdp_id->id = libbpf_nla_getattr_u32(xdp_tb[xdp_attr]); + + return 0; +} + +int bpf_get_link_xdp_id(int ifindex, __u32 *prog_id, __u32 flags) +{ + struct xdp_id_md xdp_id = {}; + int sock, ret; + __u32 nl_pid; + __u32 mask; + + if (flags & ~XDP_FLAGS_MASK) + return -EINVAL; + + /* Check whether the single {HW,DRV,SKB} mode is set */ + flags &= (XDP_FLAGS_SKB_MODE | XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE); + mask = flags - 1; + if (flags && flags & mask) + return -EINVAL; + + sock = libbpf_netlink_open(&nl_pid); + if (sock < 0) + return sock; + + xdp_id.ifindex = ifindex; + xdp_id.flags = flags; + + ret = libbpf_nl_get_link(sock, nl_pid, get_xdp_id, &xdp_id); + if (!ret) + *prog_id = xdp_id.id; + + close(sock); + return ret; +} + int libbpf_nl_get_link(int sock, unsigned int nl_pid, libbpf_dump_nlmsg_t dump_link_nlmsg, void *cookie) { diff --git a/tools/lib/bpf/test_libbpf.cpp b/tools/lib/bpf/test_libbpf.cpp index abf3fc25c9fa..fc134873bb6d 100644 --- a/tools/lib/bpf/test_libbpf.cpp +++ b/tools/lib/bpf/test_libbpf.cpp @@ -8,11 +8,11 @@ int main(int argc, char *argv[]) { /* libbpf.h */ - libbpf_set_print(NULL, NULL, NULL); + libbpf_set_print(NULL); /* bpf.h */ bpf_prog_get_fd_by_id(0); /* btf.h */ - btf__new(NULL, 0, NULL); + btf__new(NULL, 0); } diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c new file mode 100644 index 000000000000..f98ac82c9aea --- /dev/null +++ b/tools/lib/bpf/xsk.c @@ -0,0 +1,723 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +/* + * AF_XDP user-space access library. + * + * Copyright(c) 2018 - 2019 Intel Corporation. + * + * Author(s): Magnus Karlsson <magnus.karlsson@intel.com> + */ + +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <arpa/inet.h> +#include <asm/barrier.h> +#include <linux/compiler.h> +#include <linux/ethtool.h> +#include <linux/filter.h> +#include <linux/if_ether.h> +#include <linux/if_packet.h> +#include <linux/if_xdp.h> +#include <linux/sockios.h> +#include <net/if.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/socket.h> +#include <sys/types.h> + +#include "bpf.h" +#include "libbpf.h" +#include "libbpf_util.h" +#include "xsk.h" + +#ifndef SOL_XDP + #define SOL_XDP 283 +#endif + +#ifndef AF_XDP + #define AF_XDP 44 +#endif + +#ifndef PF_XDP + #define PF_XDP AF_XDP +#endif + +struct xsk_umem { + struct xsk_ring_prod *fill; + struct xsk_ring_cons *comp; + char *umem_area; + struct xsk_umem_config config; + int fd; + int refcount; +}; + +struct xsk_socket { + struct xsk_ring_cons *rx; + struct xsk_ring_prod *tx; + __u64 outstanding_tx; + struct xsk_umem *umem; + struct xsk_socket_config config; + int fd; + int xsks_map; + int ifindex; + int prog_fd; + int qidconf_map_fd; + int xsks_map_fd; + __u32 queue_id; + char ifname[IFNAMSIZ]; +}; + +struct xsk_nl_info { + bool xdp_prog_attached; + int ifindex; + int fd; +}; + +/* For 32-bit systems, we need to use mmap2 as the offsets are 64-bit. + * Unfortunately, it is not part of glibc. + */ +static inline void *xsk_mmap(void *addr, size_t length, int prot, int flags, + int fd, __u64 offset) +{ +#ifdef __NR_mmap2 + unsigned int page_shift = __builtin_ffs(getpagesize()) - 1; + long ret = syscall(__NR_mmap2, addr, length, prot, flags, fd, + (off_t)(offset >> page_shift)); + + return (void *)ret; +#else + return mmap(addr, length, prot, flags, fd, offset); +#endif +} + +int xsk_umem__fd(const struct xsk_umem *umem) +{ + return umem ? umem->fd : -EINVAL; +} + +int xsk_socket__fd(const struct xsk_socket *xsk) +{ + return xsk ? xsk->fd : -EINVAL; +} + +static bool xsk_page_aligned(void *buffer) +{ + unsigned long addr = (unsigned long)buffer; + + return !(addr & (getpagesize() - 1)); +} + +static void xsk_set_umem_config(struct xsk_umem_config *cfg, + const struct xsk_umem_config *usr_cfg) +{ + if (!usr_cfg) { + cfg->fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; + cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; + cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE; + cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM; + return; + } + + cfg->fill_size = usr_cfg->fill_size; + cfg->comp_size = usr_cfg->comp_size; + cfg->frame_size = usr_cfg->frame_size; + cfg->frame_headroom = usr_cfg->frame_headroom; +} + +static void xsk_set_xdp_socket_config(struct xsk_socket_config *cfg, + const struct xsk_socket_config *usr_cfg) +{ + if (!usr_cfg) { + cfg->rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; + cfg->tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; + cfg->libbpf_flags = 0; + cfg->xdp_flags = 0; + cfg->bind_flags = 0; + return; + } + + cfg->rx_size = usr_cfg->rx_size; + cfg->tx_size = usr_cfg->tx_size; + cfg->libbpf_flags = usr_cfg->libbpf_flags; + cfg->xdp_flags = usr_cfg->xdp_flags; + cfg->bind_flags = usr_cfg->bind_flags; +} + +int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area, __u64 size, + struct xsk_ring_prod *fill, struct xsk_ring_cons *comp, + const struct xsk_umem_config *usr_config) +{ + struct xdp_mmap_offsets off; + struct xdp_umem_reg mr; + struct xsk_umem *umem; + socklen_t optlen; + void *map; + int err; + + if (!umem_area || !umem_ptr || !fill || !comp) + return -EFAULT; + if (!size && !xsk_page_aligned(umem_area)) + return -EINVAL; + + umem = calloc(1, sizeof(*umem)); + if (!umem) + return -ENOMEM; + + umem->fd = socket(AF_XDP, SOCK_RAW, 0); + if (umem->fd < 0) { + err = -errno; + goto out_umem_alloc; + } + + umem->umem_area = umem_area; + xsk_set_umem_config(&umem->config, usr_config); + + mr.addr = (uintptr_t)umem_area; + mr.len = size; + mr.chunk_size = umem->config.frame_size; + mr.headroom = umem->config.frame_headroom; + + err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr)); + if (err) { + err = -errno; + goto out_socket; + } + err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_FILL_RING, + &umem->config.fill_size, + sizeof(umem->config.fill_size)); + if (err) { + err = -errno; + goto out_socket; + } + err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, + &umem->config.comp_size, + sizeof(umem->config.comp_size)); + if (err) { + err = -errno; + goto out_socket; + } + + optlen = sizeof(off); + err = getsockopt(umem->fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen); + if (err) { + err = -errno; + goto out_socket; + } + + map = xsk_mmap(NULL, off.fr.desc + + umem->config.fill_size * sizeof(__u64), + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, + umem->fd, XDP_UMEM_PGOFF_FILL_RING); + if (map == MAP_FAILED) { + err = -errno; + goto out_socket; + } + + umem->fill = fill; + fill->mask = umem->config.fill_size - 1; + fill->size = umem->config.fill_size; + fill->producer = map + off.fr.producer; + fill->consumer = map + off.fr.consumer; + fill->ring = map + off.fr.desc; + fill->cached_cons = umem->config.fill_size; + + map = xsk_mmap(NULL, + off.cr.desc + umem->config.comp_size * sizeof(__u64), + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, + umem->fd, XDP_UMEM_PGOFF_COMPLETION_RING); + if (map == MAP_FAILED) { + err = -errno; + goto out_mmap; + } + + umem->comp = comp; + comp->mask = umem->config.comp_size - 1; + comp->size = umem->config.comp_size; + comp->producer = map + off.cr.producer; + comp->consumer = map + off.cr.consumer; + comp->ring = map + off.cr.desc; + + *umem_ptr = umem; + return 0; + +out_mmap: + munmap(umem->fill, + off.fr.desc + umem->config.fill_size * sizeof(__u64)); +out_socket: + close(umem->fd); +out_umem_alloc: + free(umem); + return err; +} + +static int xsk_load_xdp_prog(struct xsk_socket *xsk) +{ + char bpf_log_buf[BPF_LOG_BUF_SIZE]; + int err, prog_fd; + + /* This is the C-program: + * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx) + * { + * int *qidconf, index = ctx->rx_queue_index; + * + * // A set entry here means that the correspnding queue_id + * // has an active AF_XDP socket bound to it. + * qidconf = bpf_map_lookup_elem(&qidconf_map, &index); + * if (!qidconf) + * return XDP_ABORTED; + * + * if (*qidconf) + * return bpf_redirect_map(&xsks_map, index, 0); + * + * return XDP_PASS; + * } + */ + struct bpf_insn prog[] = { + /* r1 = *(u32 *)(r1 + 16) */ + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 16), + /* *(u32 *)(r10 - 4) = r1 */ + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_1, -4), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), + BPF_LD_MAP_FD(BPF_REG_1, xsk->qidconf_map_fd), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), + BPF_MOV32_IMM(BPF_REG_0, 0), + /* if r1 == 0 goto +8 */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 8), + BPF_MOV32_IMM(BPF_REG_0, 2), + /* r1 = *(u32 *)(r1 + 0) */ + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 0), + /* if r1 == 0 goto +5 */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5), + /* r2 = *(u32 *)(r10 - 4) */ + BPF_LD_MAP_FD(BPF_REG_1, xsk->xsks_map_fd), + BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_10, -4), + BPF_MOV32_IMM(BPF_REG_3, 0), + BPF_EMIT_CALL(BPF_FUNC_redirect_map), + /* The jumps are to this instruction */ + BPF_EXIT_INSN(), + }; + size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn); + + prog_fd = bpf_load_program(BPF_PROG_TYPE_XDP, prog, insns_cnt, + "LGPL-2.1 or BSD-2-Clause", 0, bpf_log_buf, + BPF_LOG_BUF_SIZE); + if (prog_fd < 0) { + pr_warning("BPF log buffer:\n%s", bpf_log_buf); + return prog_fd; + } + + err = bpf_set_link_xdp_fd(xsk->ifindex, prog_fd, xsk->config.xdp_flags); + if (err) { + close(prog_fd); + return err; + } + + xsk->prog_fd = prog_fd; + return 0; +} + +static int xsk_get_max_queues(struct xsk_socket *xsk) +{ + struct ethtool_channels channels; + struct ifreq ifr; + int fd, err, ret; + + fd = socket(AF_INET, SOCK_DGRAM, 0); + if (fd < 0) + return -errno; + + channels.cmd = ETHTOOL_GCHANNELS; + ifr.ifr_data = (void *)&channels; + strncpy(ifr.ifr_name, xsk->ifname, IFNAMSIZ); + err = ioctl(fd, SIOCETHTOOL, &ifr); + if (err && errno != EOPNOTSUPP) { + ret = -errno; + goto out; + } + + if (channels.max_combined == 0 || errno == EOPNOTSUPP) + /* If the device says it has no channels, then all traffic + * is sent to a single stream, so max queues = 1. + */ + ret = 1; + else + ret = channels.max_combined; + +out: + close(fd); + return ret; +} + +static int xsk_create_bpf_maps(struct xsk_socket *xsk) +{ + int max_queues; + int fd; + + max_queues = xsk_get_max_queues(xsk); + if (max_queues < 0) + return max_queues; + + fd = bpf_create_map_name(BPF_MAP_TYPE_ARRAY, "qidconf_map", + sizeof(int), sizeof(int), max_queues, 0); + if (fd < 0) + return fd; + xsk->qidconf_map_fd = fd; + + fd = bpf_create_map_name(BPF_MAP_TYPE_XSKMAP, "xsks_map", + sizeof(int), sizeof(int), max_queues, 0); + if (fd < 0) { + close(xsk->qidconf_map_fd); + return fd; + } + xsk->xsks_map_fd = fd; + + return 0; +} + +static void xsk_delete_bpf_maps(struct xsk_socket *xsk) +{ + close(xsk->qidconf_map_fd); + close(xsk->xsks_map_fd); +} + +static int xsk_update_bpf_maps(struct xsk_socket *xsk, int qidconf_value, + int xsks_value) +{ + bool qidconf_map_updated = false, xsks_map_updated = false; + struct bpf_prog_info prog_info = {}; + __u32 prog_len = sizeof(prog_info); + struct bpf_map_info map_info; + __u32 map_len = sizeof(map_info); + __u32 *map_ids; + int reset_value = 0; + __u32 num_maps; + unsigned int i; + int err; + + err = bpf_obj_get_info_by_fd(xsk->prog_fd, &prog_info, &prog_len); + if (err) + return err; + + num_maps = prog_info.nr_map_ids; + + map_ids = calloc(prog_info.nr_map_ids, sizeof(*map_ids)); + if (!map_ids) + return -ENOMEM; + + memset(&prog_info, 0, prog_len); + prog_info.nr_map_ids = num_maps; + prog_info.map_ids = (__u64)(unsigned long)map_ids; + + err = bpf_obj_get_info_by_fd(xsk->prog_fd, &prog_info, &prog_len); + if (err) + goto out_map_ids; + + for (i = 0; i < prog_info.nr_map_ids; i++) { + int fd; + + fd = bpf_map_get_fd_by_id(map_ids[i]); + if (fd < 0) { + err = -errno; + goto out_maps; + } + + err = bpf_obj_get_info_by_fd(fd, &map_info, &map_len); + if (err) + goto out_maps; + + if (!strcmp(map_info.name, "qidconf_map")) { + err = bpf_map_update_elem(fd, &xsk->queue_id, + &qidconf_value, 0); + if (err) + goto out_maps; + qidconf_map_updated = true; + xsk->qidconf_map_fd = fd; + } else if (!strcmp(map_info.name, "xsks_map")) { + err = bpf_map_update_elem(fd, &xsk->queue_id, + &xsks_value, 0); + if (err) + goto out_maps; + xsks_map_updated = true; + xsk->xsks_map_fd = fd; + } + + if (qidconf_map_updated && xsks_map_updated) + break; + } + + if (!(qidconf_map_updated && xsks_map_updated)) { + err = -ENOENT; + goto out_maps; + } + + err = 0; + goto out_success; + +out_maps: + if (qidconf_map_updated) + (void)bpf_map_update_elem(xsk->qidconf_map_fd, &xsk->queue_id, + &reset_value, 0); + if (xsks_map_updated) + (void)bpf_map_update_elem(xsk->xsks_map_fd, &xsk->queue_id, + &reset_value, 0); +out_success: + if (qidconf_map_updated) + close(xsk->qidconf_map_fd); + if (xsks_map_updated) + close(xsk->xsks_map_fd); +out_map_ids: + free(map_ids); + return err; +} + +static int xsk_setup_xdp_prog(struct xsk_socket *xsk) +{ + bool prog_attached = false; + __u32 prog_id = 0; + int err; + + err = bpf_get_link_xdp_id(xsk->ifindex, &prog_id, + xsk->config.xdp_flags); + if (err) + return err; + + if (!prog_id) { + prog_attached = true; + err = xsk_create_bpf_maps(xsk); + if (err) + return err; + + err = xsk_load_xdp_prog(xsk); + if (err) + goto out_maps; + } else { + xsk->prog_fd = bpf_prog_get_fd_by_id(prog_id); + } + + err = xsk_update_bpf_maps(xsk, true, xsk->fd); + if (err) + goto out_load; + + return 0; + +out_load: + if (prog_attached) + close(xsk->prog_fd); +out_maps: + if (prog_attached) + xsk_delete_bpf_maps(xsk); + return err; +} + +int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname, + __u32 queue_id, struct xsk_umem *umem, + struct xsk_ring_cons *rx, struct xsk_ring_prod *tx, + const struct xsk_socket_config *usr_config) +{ + struct sockaddr_xdp sxdp = {}; + struct xdp_mmap_offsets off; + struct xsk_socket *xsk; + socklen_t optlen; + void *map; + int err; + + if (!umem || !xsk_ptr || !rx || !tx) + return -EFAULT; + + if (umem->refcount) { + pr_warning("Error: shared umems not supported by libbpf.\n"); + return -EBUSY; + } + + xsk = calloc(1, sizeof(*xsk)); + if (!xsk) + return -ENOMEM; + + if (umem->refcount++ > 0) { + xsk->fd = socket(AF_XDP, SOCK_RAW, 0); + if (xsk->fd < 0) { + err = -errno; + goto out_xsk_alloc; + } + } else { + xsk->fd = umem->fd; + } + + xsk->outstanding_tx = 0; + xsk->queue_id = queue_id; + xsk->umem = umem; + xsk->ifindex = if_nametoindex(ifname); + if (!xsk->ifindex) { + err = -errno; + goto out_socket; + } + strncpy(xsk->ifname, ifname, IFNAMSIZ); + + xsk_set_xdp_socket_config(&xsk->config, usr_config); + + if (rx) { + err = setsockopt(xsk->fd, SOL_XDP, XDP_RX_RING, + &xsk->config.rx_size, + sizeof(xsk->config.rx_size)); + if (err) { + err = -errno; + goto out_socket; + } + } + if (tx) { + err = setsockopt(xsk->fd, SOL_XDP, XDP_TX_RING, + &xsk->config.tx_size, + sizeof(xsk->config.tx_size)); + if (err) { + err = -errno; + goto out_socket; + } + } + + optlen = sizeof(off); + err = getsockopt(xsk->fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen); + if (err) { + err = -errno; + goto out_socket; + } + + if (rx) { + map = xsk_mmap(NULL, off.rx.desc + + xsk->config.rx_size * sizeof(struct xdp_desc), + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, + xsk->fd, XDP_PGOFF_RX_RING); + if (map == MAP_FAILED) { + err = -errno; + goto out_socket; + } + + rx->mask = xsk->config.rx_size - 1; + rx->size = xsk->config.rx_size; + rx->producer = map + off.rx.producer; + rx->consumer = map + off.rx.consumer; + rx->ring = map + off.rx.desc; + } + xsk->rx = rx; + + if (tx) { + map = xsk_mmap(NULL, off.tx.desc + + xsk->config.tx_size * sizeof(struct xdp_desc), + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, + xsk->fd, XDP_PGOFF_TX_RING); + if (map == MAP_FAILED) { + err = -errno; + goto out_mmap_rx; + } + + tx->mask = xsk->config.tx_size - 1; + tx->size = xsk->config.tx_size; + tx->producer = map + off.tx.producer; + tx->consumer = map + off.tx.consumer; + tx->ring = map + off.tx.desc; + tx->cached_cons = xsk->config.tx_size; + } + xsk->tx = tx; + + sxdp.sxdp_family = PF_XDP; + sxdp.sxdp_ifindex = xsk->ifindex; + sxdp.sxdp_queue_id = xsk->queue_id; + sxdp.sxdp_flags = xsk->config.bind_flags; + + err = bind(xsk->fd, (struct sockaddr *)&sxdp, sizeof(sxdp)); + if (err) { + err = -errno; + goto out_mmap_tx; + } + + if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) { + err = xsk_setup_xdp_prog(xsk); + if (err) + goto out_mmap_tx; + } + + *xsk_ptr = xsk; + return 0; + +out_mmap_tx: + if (tx) + munmap(xsk->tx, + off.tx.desc + + xsk->config.tx_size * sizeof(struct xdp_desc)); +out_mmap_rx: + if (rx) + munmap(xsk->rx, + off.rx.desc + + xsk->config.rx_size * sizeof(struct xdp_desc)); +out_socket: + if (--umem->refcount) + close(xsk->fd); +out_xsk_alloc: + free(xsk); + return err; +} + +int xsk_umem__delete(struct xsk_umem *umem) +{ + struct xdp_mmap_offsets off; + socklen_t optlen; + int err; + + if (!umem) + return 0; + + if (umem->refcount) + return -EBUSY; + + optlen = sizeof(off); + err = getsockopt(umem->fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen); + if (!err) { + munmap(umem->fill->ring, + off.fr.desc + umem->config.fill_size * sizeof(__u64)); + munmap(umem->comp->ring, + off.cr.desc + umem->config.comp_size * sizeof(__u64)); + } + + close(umem->fd); + free(umem); + + return 0; +} + +void xsk_socket__delete(struct xsk_socket *xsk) +{ + struct xdp_mmap_offsets off; + socklen_t optlen; + int err; + + if (!xsk) + return; + + (void)xsk_update_bpf_maps(xsk, 0, 0); + + optlen = sizeof(off); + err = getsockopt(xsk->fd, SOL_XDP, XDP_MMAP_OFFSETS, &off, &optlen); + if (!err) { + if (xsk->rx) + munmap(xsk->rx->ring, + off.rx.desc + + xsk->config.rx_size * sizeof(struct xdp_desc)); + if (xsk->tx) + munmap(xsk->tx->ring, + off.tx.desc + + xsk->config.tx_size * sizeof(struct xdp_desc)); + } + + xsk->umem->refcount--; + /* Do not close an fd that also has an associated umem connected + * to it. + */ + if (xsk->fd != xsk->umem->fd) + close(xsk->fd); + free(xsk); +} diff --git a/tools/lib/bpf/xsk.h b/tools/lib/bpf/xsk.h new file mode 100644 index 000000000000..a497f00e2962 --- /dev/null +++ b/tools/lib/bpf/xsk.h @@ -0,0 +1,203 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* + * AF_XDP user-space access library. + * + * Copyright(c) 2018 - 2019 Intel Corporation. + * + * Author(s): Magnus Karlsson <magnus.karlsson@intel.com> + */ + +#ifndef __LIBBPF_XSK_H +#define __LIBBPF_XSK_H + +#include <stdio.h> +#include <stdint.h> +#include <linux/if_xdp.h> + +#include "libbpf.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* Do not access these members directly. Use the functions below. */ +#define DEFINE_XSK_RING(name) \ +struct name { \ + __u32 cached_prod; \ + __u32 cached_cons; \ + __u32 mask; \ + __u32 size; \ + __u32 *producer; \ + __u32 *consumer; \ + void *ring; \ +} + +DEFINE_XSK_RING(xsk_ring_prod); +DEFINE_XSK_RING(xsk_ring_cons); + +struct xsk_umem; +struct xsk_socket; + +static inline __u64 *xsk_ring_prod__fill_addr(struct xsk_ring_prod *fill, + __u32 idx) +{ + __u64 *addrs = (__u64 *)fill->ring; + + return &addrs[idx & fill->mask]; +} + +static inline const __u64 * +xsk_ring_cons__comp_addr(const struct xsk_ring_cons *comp, __u32 idx) +{ + const __u64 *addrs = (const __u64 *)comp->ring; + + return &addrs[idx & comp->mask]; +} + +static inline struct xdp_desc *xsk_ring_prod__tx_desc(struct xsk_ring_prod *tx, + __u32 idx) +{ + struct xdp_desc *descs = (struct xdp_desc *)tx->ring; + + return &descs[idx & tx->mask]; +} + +static inline const struct xdp_desc * +xsk_ring_cons__rx_desc(const struct xsk_ring_cons *rx, __u32 idx) +{ + const struct xdp_desc *descs = (const struct xdp_desc *)rx->ring; + + return &descs[idx & rx->mask]; +} + +static inline __u32 xsk_prod_nb_free(struct xsk_ring_prod *r, __u32 nb) +{ + __u32 free_entries = r->cached_cons - r->cached_prod; + + if (free_entries >= nb) + return free_entries; + + /* Refresh the local tail pointer. + * cached_cons is r->size bigger than the real consumer pointer so + * that this addition can be avoided in the more frequently + * executed code that computs free_entries in the beginning of + * this function. Without this optimization it whould have been + * free_entries = r->cached_prod - r->cached_cons + r->size. + */ + r->cached_cons = *r->consumer + r->size; + + return r->cached_cons - r->cached_prod; +} + +static inline __u32 xsk_cons_nb_avail(struct xsk_ring_cons *r, __u32 nb) +{ + __u32 entries = r->cached_prod - r->cached_cons; + + if (entries == 0) { + r->cached_prod = *r->producer; + entries = r->cached_prod - r->cached_cons; + } + + return (entries > nb) ? nb : entries; +} + +static inline size_t xsk_ring_prod__reserve(struct xsk_ring_prod *prod, + size_t nb, __u32 *idx) +{ + if (unlikely(xsk_prod_nb_free(prod, nb) < nb)) + return 0; + + *idx = prod->cached_prod; + prod->cached_prod += nb; + + return nb; +} + +static inline void xsk_ring_prod__submit(struct xsk_ring_prod *prod, size_t nb) +{ + /* Make sure everything has been written to the ring before signalling + * this to the kernel. + */ + smp_wmb(); + + *prod->producer += nb; +} + +static inline size_t xsk_ring_cons__peek(struct xsk_ring_cons *cons, + size_t nb, __u32 *idx) +{ + size_t entries = xsk_cons_nb_avail(cons, nb); + + if (likely(entries > 0)) { + /* Make sure we do not speculatively read the data before + * we have received the packet buffers from the ring. + */ + smp_rmb(); + + *idx = cons->cached_cons; + cons->cached_cons += entries; + } + + return entries; +} + +static inline void xsk_ring_cons__release(struct xsk_ring_cons *cons, size_t nb) +{ + *cons->consumer += nb; +} + +static inline void *xsk_umem__get_data(void *umem_area, __u64 addr) +{ + return &((char *)umem_area)[addr]; +} + +LIBBPF_API int xsk_umem__fd(const struct xsk_umem *umem); +LIBBPF_API int xsk_socket__fd(const struct xsk_socket *xsk); + +#define XSK_RING_CONS__DEFAULT_NUM_DESCS 2048 +#define XSK_RING_PROD__DEFAULT_NUM_DESCS 2048 +#define XSK_UMEM__DEFAULT_FRAME_SHIFT 11 /* 2048 bytes */ +#define XSK_UMEM__DEFAULT_FRAME_SIZE (1 << XSK_UMEM__DEFAULT_FRAME_SHIFT) +#define XSK_UMEM__DEFAULT_FRAME_HEADROOM 0 + +struct xsk_umem_config { + __u32 fill_size; + __u32 comp_size; + __u32 frame_size; + __u32 frame_headroom; +}; + +/* Flags for the libbpf_flags field. */ +#define XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD (1 << 0) + +struct xsk_socket_config { + __u32 rx_size; + __u32 tx_size; + __u32 libbpf_flags; + __u32 xdp_flags; + __u16 bind_flags; +}; + +/* Set config to NULL to get the default configuration. */ +LIBBPF_API int xsk_umem__create(struct xsk_umem **umem, + void *umem_area, __u64 size, + struct xsk_ring_prod *fill, + struct xsk_ring_cons *comp, + const struct xsk_umem_config *config); +LIBBPF_API int xsk_socket__create(struct xsk_socket **xsk, + const char *ifname, __u32 queue_id, + struct xsk_umem *umem, + struct xsk_ring_cons *rx, + struct xsk_ring_prod *tx, + const struct xsk_socket_config *config); + +/* Returns 0 for success and -EBUSY if the umem is still in use. */ +LIBBPF_API int xsk_umem__delete(struct xsk_umem *umem); +LIBBPF_API void xsk_socket__delete(struct xsk_socket *xsk); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* __LIBBPF_XSK_H */ diff --git a/tools/lib/lockdep/include/liblockdep/common.h b/tools/lib/lockdep/include/liblockdep/common.h index d640a9761f09..a81d91d4fc78 100644 --- a/tools/lib/lockdep/include/liblockdep/common.h +++ b/tools/lib/lockdep/include/liblockdep/common.h @@ -45,6 +45,8 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, void lock_release(struct lockdep_map *lock, int nested, unsigned long ip); void lockdep_reset_lock(struct lockdep_map *lock); +void lockdep_register_key(struct lock_class_key *key); +void lockdep_unregister_key(struct lock_class_key *key); extern void debug_check_no_locks_freed(const void *from, unsigned long len); #define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ diff --git a/tools/lib/lockdep/include/liblockdep/mutex.h b/tools/lib/lockdep/include/liblockdep/mutex.h index 2073d4e1f2f0..783dd0df06f9 100644 --- a/tools/lib/lockdep/include/liblockdep/mutex.h +++ b/tools/lib/lockdep/include/liblockdep/mutex.h @@ -7,6 +7,7 @@ struct liblockdep_pthread_mutex { pthread_mutex_t mutex; + struct lock_class_key key; struct lockdep_map dep_map; }; @@ -27,11 +28,10 @@ static inline int __mutex_init(liblockdep_pthread_mutex_t *lock, return pthread_mutex_init(&lock->mutex, __mutexattr); } -#define liblockdep_pthread_mutex_init(mutex, mutexattr) \ -({ \ - static struct lock_class_key __key; \ - \ - __mutex_init((mutex), #mutex, &__key, (mutexattr)); \ +#define liblockdep_pthread_mutex_init(mutex, mutexattr) \ +({ \ + lockdep_register_key(&(mutex)->key); \ + __mutex_init((mutex), #mutex, &(mutex)->key, (mutexattr)); \ }) static inline int liblockdep_pthread_mutex_lock(liblockdep_pthread_mutex_t *lock) @@ -55,6 +55,7 @@ static inline int liblockdep_pthread_mutex_trylock(liblockdep_pthread_mutex_t *l static inline int liblockdep_pthread_mutex_destroy(liblockdep_pthread_mutex_t *lock) { lockdep_reset_lock(&lock->dep_map); + lockdep_unregister_key(&lock->key); return pthread_mutex_destroy(&lock->mutex); } diff --git a/tools/lib/lockdep/run_tests.sh b/tools/lib/lockdep/run_tests.sh index c8fbd0306960..11f425662b43 100755 --- a/tools/lib/lockdep/run_tests.sh +++ b/tools/lib/lockdep/run_tests.sh @@ -11,7 +11,7 @@ find tests -name '*.c' | sort | while read -r i; do testname=$(basename "$i" .c) echo -ne "$testname... " if gcc -o "tests/$testname" -pthread "$i" liblockdep.a -Iinclude -D__USE_LIBLOCKDEP && - timeout 1 "tests/$testname" 2>&1 | "tests/${testname}.sh"; then + timeout 1 "tests/$testname" 2>&1 | /bin/bash "tests/${testname}.sh"; then echo "PASSED!" else echo "FAILED!" @@ -24,7 +24,7 @@ find tests -name '*.c' | sort | while read -r i; do echo -ne "(PRELOAD) $testname... " if gcc -o "tests/$testname" -pthread -Iinclude "$i" && timeout 1 ./lockdep "tests/$testname" 2>&1 | - "tests/${testname}.sh"; then + /bin/bash "tests/${testname}.sh"; then echo "PASSED!" else echo "FAILED!" @@ -37,7 +37,7 @@ find tests -name '*.c' | sort | while read -r i; do echo -ne "(PRELOAD + Valgrind) $testname... " if gcc -o "tests/$testname" -pthread -Iinclude "$i" && { timeout 10 valgrind --read-var-info=yes ./lockdep "./tests/$testname" >& "tests/${testname}.vg.out"; true; } && - "tests/${testname}.sh" < "tests/${testname}.vg.out" && + /bin/bash "tests/${testname}.sh" < "tests/${testname}.vg.out" && ! grep -Eq '(^==[0-9]*== (Invalid |Uninitialised ))|Mismatched free|Source and destination overlap| UME ' "tests/${testname}.vg.out"; then echo "PASSED!" else diff --git a/tools/lib/lockdep/tests/ABBA.c b/tools/lib/lockdep/tests/ABBA.c index 623313f54720..543789bc3e37 100644 --- a/tools/lib/lockdep/tests/ABBA.c +++ b/tools/lib/lockdep/tests/ABBA.c @@ -14,4 +14,13 @@ void main(void) pthread_mutex_destroy(&b); pthread_mutex_destroy(&a); + + pthread_mutex_init(&a, NULL); + pthread_mutex_init(&b, NULL); + + LOCK_UNLOCK_2(a, b); + LOCK_UNLOCK_2(b, a); + + pthread_mutex_destroy(&b); + pthread_mutex_destroy(&a); } diff --git a/tools/lib/rbtree.c b/tools/lib/rbtree.c index 17c2b596f043..904adb70a4f0 100644 --- a/tools/lib/rbtree.c +++ b/tools/lib/rbtree.c @@ -22,6 +22,7 @@ */ #include <linux/rbtree_augmented.h> +#include <linux/export.h> /* * red-black trees properties: http://en.wikipedia.org/wiki/Rbtree @@ -43,6 +44,30 @@ * parentheses and have some accompanying text comment. */ +/* + * Notes on lockless lookups: + * + * All stores to the tree structure (rb_left and rb_right) must be done using + * WRITE_ONCE(). And we must not inadvertently cause (temporary) loops in the + * tree structure as seen in program order. + * + * These two requirements will allow lockless iteration of the tree -- not + * correct iteration mind you, tree rotations are not atomic so a lookup might + * miss entire subtrees. + * + * But they do guarantee that any such traversal will only see valid elements + * and that it will indeed complete -- does not get stuck in a loop. + * + * It also guarantees that if the lookup returns an element it is the 'correct' + * one. But not returning an element does _NOT_ mean it's not present. + * + * NOTE: + * + * Stores to __rb_parent_color are not important for simple lookups so those + * are left undone as of now. Nor did I check for loops involving parent + * pointers. + */ + static inline void rb_set_black(struct rb_node *rb) { rb->__rb_parent_color |= RB_BLACK; @@ -70,22 +95,35 @@ __rb_rotate_set_parents(struct rb_node *old, struct rb_node *new, static __always_inline void __rb_insert(struct rb_node *node, struct rb_root *root, + bool newleft, struct rb_node **leftmost, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { struct rb_node *parent = rb_red_parent(node), *gparent, *tmp; + if (newleft) + *leftmost = node; + while (true) { /* - * Loop invariant: node is red - * - * If there is a black parent, we are done. - * Otherwise, take some corrective action as we don't - * want a red root or two consecutive red nodes. + * Loop invariant: node is red. */ - if (!parent) { + if (unlikely(!parent)) { + /* + * The inserted node is root. Either this is the + * first node, or we recursed at Case 1 below and + * are no longer violating 4). + */ rb_set_parent_color(node, NULL, RB_BLACK); break; - } else if (rb_is_black(parent)) + } + + /* + * If there is a black parent, we are done. + * Otherwise, take some corrective action as, + * per 4), we don't want a red root or two + * consecutive red nodes. + */ + if(rb_is_black(parent)) break; gparent = rb_red_parent(parent); @@ -94,7 +132,7 @@ __rb_insert(struct rb_node *node, struct rb_root *root, if (parent != tmp) { /* parent == gparent->rb_left */ if (tmp && rb_is_red(tmp)) { /* - * Case 1 - color flips + * Case 1 - node's uncle is red (color flips). * * G g * / \ / \ @@ -117,7 +155,8 @@ __rb_insert(struct rb_node *node, struct rb_root *root, tmp = parent->rb_right; if (node == tmp) { /* - * Case 2 - left rotate at parent + * Case 2 - node's uncle is black and node is + * the parent's right child (left rotate at parent). * * G G * / \ / \ @@ -128,8 +167,9 @@ __rb_insert(struct rb_node *node, struct rb_root *root, * This still leaves us in violation of 4), the * continuation into Case 3 will fix that. */ - parent->rb_right = tmp = node->rb_left; - node->rb_left = parent; + tmp = node->rb_left; + WRITE_ONCE(parent->rb_right, tmp); + WRITE_ONCE(node->rb_left, parent); if (tmp) rb_set_parent_color(tmp, parent, RB_BLACK); @@ -140,7 +180,8 @@ __rb_insert(struct rb_node *node, struct rb_root *root, } /* - * Case 3 - right rotate at gparent + * Case 3 - node's uncle is black and node is + * the parent's left child (right rotate at gparent). * * G P * / \ / \ @@ -148,8 +189,8 @@ __rb_insert(struct rb_node *node, struct rb_root *root, * / \ * n U */ - gparent->rb_left = tmp; /* == parent->rb_right */ - parent->rb_right = gparent; + WRITE_ONCE(gparent->rb_left, tmp); /* == parent->rb_right */ + WRITE_ONCE(parent->rb_right, gparent); if (tmp) rb_set_parent_color(tmp, gparent, RB_BLACK); __rb_rotate_set_parents(gparent, parent, root, RB_RED); @@ -170,8 +211,9 @@ __rb_insert(struct rb_node *node, struct rb_root *root, tmp = parent->rb_left; if (node == tmp) { /* Case 2 - right rotate at parent */ - parent->rb_left = tmp = node->rb_right; - node->rb_right = parent; + tmp = node->rb_right; + WRITE_ONCE(parent->rb_left, tmp); + WRITE_ONCE(node->rb_right, parent); if (tmp) rb_set_parent_color(tmp, parent, RB_BLACK); @@ -182,8 +224,8 @@ __rb_insert(struct rb_node *node, struct rb_root *root, } /* Case 3 - left rotate at gparent */ - gparent->rb_right = tmp; /* == parent->rb_left */ - parent->rb_left = gparent; + WRITE_ONCE(gparent->rb_right, tmp); /* == parent->rb_left */ + WRITE_ONCE(parent->rb_left, gparent); if (tmp) rb_set_parent_color(tmp, gparent, RB_BLACK); __rb_rotate_set_parents(gparent, parent, root, RB_RED); @@ -223,8 +265,9 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, * / \ / \ * Sl Sr N Sl */ - parent->rb_right = tmp1 = sibling->rb_left; - sibling->rb_left = parent; + tmp1 = sibling->rb_left; + WRITE_ONCE(parent->rb_right, tmp1); + WRITE_ONCE(sibling->rb_left, parent); rb_set_parent_color(tmp1, parent, RB_BLACK); __rb_rotate_set_parents(parent, sibling, root, RB_RED); @@ -268,15 +311,31 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, * * (p) (p) * / \ / \ - * N S --> N Sl + * N S --> N sl * / \ \ - * sl Sr s + * sl Sr S * \ * Sr + * + * Note: p might be red, and then both + * p and sl are red after rotation(which + * breaks property 4). This is fixed in + * Case 4 (in __rb_rotate_set_parents() + * which set sl the color of p + * and set p RB_BLACK) + * + * (p) (sl) + * / \ / \ + * N sl --> P S + * \ / \ + * S N Sr + * \ + * Sr */ - sibling->rb_left = tmp1 = tmp2->rb_right; - tmp2->rb_right = sibling; - parent->rb_right = tmp2; + tmp1 = tmp2->rb_right; + WRITE_ONCE(sibling->rb_left, tmp1); + WRITE_ONCE(tmp2->rb_right, sibling); + WRITE_ONCE(parent->rb_right, tmp2); if (tmp1) rb_set_parent_color(tmp1, sibling, RB_BLACK); @@ -296,8 +355,9 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, * / \ / \ * (sl) sr N (sl) */ - parent->rb_right = tmp2 = sibling->rb_left; - sibling->rb_left = parent; + tmp2 = sibling->rb_left; + WRITE_ONCE(parent->rb_right, tmp2); + WRITE_ONCE(sibling->rb_left, parent); rb_set_parent_color(tmp1, sibling, RB_BLACK); if (tmp2) rb_set_parent(tmp2, parent); @@ -309,8 +369,9 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, sibling = parent->rb_left; if (rb_is_red(sibling)) { /* Case 1 - right rotate at parent */ - parent->rb_left = tmp1 = sibling->rb_right; - sibling->rb_right = parent; + tmp1 = sibling->rb_right; + WRITE_ONCE(parent->rb_left, tmp1); + WRITE_ONCE(sibling->rb_right, parent); rb_set_parent_color(tmp1, parent, RB_BLACK); __rb_rotate_set_parents(parent, sibling, root, RB_RED); @@ -334,10 +395,11 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, } break; } - /* Case 3 - right rotate at sibling */ - sibling->rb_right = tmp1 = tmp2->rb_left; - tmp2->rb_left = sibling; - parent->rb_left = tmp2; + /* Case 3 - left rotate at sibling */ + tmp1 = tmp2->rb_left; + WRITE_ONCE(sibling->rb_right, tmp1); + WRITE_ONCE(tmp2->rb_left, sibling); + WRITE_ONCE(parent->rb_left, tmp2); if (tmp1) rb_set_parent_color(tmp1, sibling, RB_BLACK); @@ -345,9 +407,10 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, tmp1 = sibling; sibling = tmp2; } - /* Case 4 - left rotate at parent + color flips */ - parent->rb_left = tmp2 = sibling->rb_right; - sibling->rb_right = parent; + /* Case 4 - right rotate at parent + color flips */ + tmp2 = sibling->rb_right; + WRITE_ONCE(parent->rb_left, tmp2); + WRITE_ONCE(sibling->rb_right, parent); rb_set_parent_color(tmp1, sibling, RB_BLACK); if (tmp2) rb_set_parent(tmp2, parent); @@ -378,22 +441,41 @@ static inline void dummy_copy(struct rb_node *old, struct rb_node *new) {} static inline void dummy_rotate(struct rb_node *old, struct rb_node *new) {} static const struct rb_augment_callbacks dummy_callbacks = { - dummy_propagate, dummy_copy, dummy_rotate + .propagate = dummy_propagate, + .copy = dummy_copy, + .rotate = dummy_rotate }; void rb_insert_color(struct rb_node *node, struct rb_root *root) { - __rb_insert(node, root, dummy_rotate); + __rb_insert(node, root, false, NULL, dummy_rotate); } void rb_erase(struct rb_node *node, struct rb_root *root) { struct rb_node *rebalance; - rebalance = __rb_erase_augmented(node, root, &dummy_callbacks); + rebalance = __rb_erase_augmented(node, root, + NULL, &dummy_callbacks); if (rebalance) ____rb_erase_color(rebalance, root, dummy_rotate); } +void rb_insert_color_cached(struct rb_node *node, + struct rb_root_cached *root, bool leftmost) +{ + __rb_insert(node, &root->rb_root, leftmost, + &root->rb_leftmost, dummy_rotate); +} + +void rb_erase_cached(struct rb_node *node, struct rb_root_cached *root) +{ + struct rb_node *rebalance; + rebalance = __rb_erase_augmented(node, &root->rb_root, + &root->rb_leftmost, &dummy_callbacks); + if (rebalance) + ____rb_erase_color(rebalance, &root->rb_root, dummy_rotate); +} + /* * Augmented rbtree manipulation functions. * @@ -402,9 +484,10 @@ void rb_erase(struct rb_node *node, struct rb_root *root) */ void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, + bool newleft, struct rb_node **leftmost, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { - __rb_insert(node, root, augment_rotate); + __rb_insert(node, root, newleft, leftmost, augment_rotate); } /* @@ -498,15 +581,24 @@ void rb_replace_node(struct rb_node *victim, struct rb_node *new, { struct rb_node *parent = rb_parent(victim); + /* Copy the pointers/colour from the victim to the replacement */ + *new = *victim; + /* Set the surrounding nodes to point to the replacement */ - __rb_change_child(victim, new, parent, root); if (victim->rb_left) rb_set_parent(victim->rb_left, new); if (victim->rb_right) rb_set_parent(victim->rb_right, new); + __rb_change_child(victim, new, parent, root); +} - /* Copy the pointers/colour from the victim to the replacement */ - *new = *victim; +void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new, + struct rb_root_cached *root) +{ + rb_replace_node(victim, new, &root->rb_root); + + if (root->rb_leftmost == victim) + root->rb_leftmost = new; } static struct rb_node *rb_left_deepest_node(const struct rb_node *node) diff --git a/tools/memory-model/.gitignore b/tools/memory-model/.gitignore new file mode 100644 index 000000000000..b1d34c52f3c3 --- /dev/null +++ b/tools/memory-model/.gitignore @@ -0,0 +1 @@ +litmus diff --git a/tools/memory-model/README b/tools/memory-model/README index acf9077cffaa..0f2c366518c6 100644 --- a/tools/memory-model/README +++ b/tools/memory-model/README @@ -156,6 +156,8 @@ lock.cat README This file. +scripts Various scripts, see scripts/README. + =========== LIMITATIONS diff --git a/tools/memory-model/linux-kernel.bell b/tools/memory-model/linux-kernel.bell index b84fb2f67109..796513362c05 100644 --- a/tools/memory-model/linux-kernel.bell +++ b/tools/memory-model/linux-kernel.bell @@ -29,7 +29,8 @@ enum Barriers = 'wmb (*smp_wmb*) || 'sync-rcu (*synchronize_rcu*) || 'before-atomic (*smp_mb__before_atomic*) || 'after-atomic (*smp_mb__after_atomic*) || - 'after-spinlock (*smp_mb__after_spinlock*) + 'after-spinlock (*smp_mb__after_spinlock*) || + 'after-unlock-lock (*smp_mb__after_unlock_lock*) instructions F[Barriers] (* Compute matching pairs of nested Rcu-lock and Rcu-unlock *) diff --git a/tools/memory-model/linux-kernel.cat b/tools/memory-model/linux-kernel.cat index 882fc33274ac..8f23c74a96fd 100644 --- a/tools/memory-model/linux-kernel.cat +++ b/tools/memory-model/linux-kernel.cat @@ -30,7 +30,9 @@ let wmb = [W] ; fencerel(Wmb) ; [W] let mb = ([M] ; fencerel(Mb) ; [M]) | ([M] ; fencerel(Before-atomic) ; [RMW] ; po? ; [M]) | ([M] ; po? ; [RMW] ; fencerel(After-atomic) ; [M]) | - ([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) + ([M] ; po? ; [LKW] ; fencerel(After-spinlock) ; [M]) | + ([M] ; po ; [UL] ; (co | po) ; [LKW] ; + fencerel(After-unlock-lock) ; [M]) let gp = po ; [Sync-rcu] ; po? let strong-fence = mb | gp diff --git a/tools/memory-model/linux-kernel.def b/tools/memory-model/linux-kernel.def index 6fa3eb28d40b..b27911cc087d 100644 --- a/tools/memory-model/linux-kernel.def +++ b/tools/memory-model/linux-kernel.def @@ -23,6 +23,7 @@ smp_wmb() { __fence{wmb}; } smp_mb__before_atomic() { __fence{before-atomic}; } smp_mb__after_atomic() { __fence{after-atomic}; } smp_mb__after_spinlock() { __fence{after-spinlock}; } +smp_mb__after_unlock_lock() { __fence{after-unlock-lock}; } // Exchange xchg(X,V) __xchg{mb}(X,V) diff --git a/tools/memory-model/scripts/README b/tools/memory-model/scripts/README new file mode 100644 index 000000000000..29375a1fbbfa --- /dev/null +++ b/tools/memory-model/scripts/README @@ -0,0 +1,70 @@ + ============ + LKMM SCRIPTS + ============ + + +These scripts are run from the tools/memory-model directory. + +checkalllitmus.sh + + Run all litmus tests in the litmus-tests directory, checking + the results against the expected results recorded in the + "Result:" comment lines. + +checkghlitmus.sh + + Run all litmus tests in the https://github.com/paulmckrcu/litmus + archive that are C-language and that have "Result:" comment lines + documenting expected results, comparing the actual results to + those expected. + +checklitmushist.sh + + Run all litmus tests having .litmus.out files from previous + initlitmushist.sh or newlitmushist.sh runs, comparing the + herd output to that of the original runs. + +checklitmus.sh + + Check a single litmus test against its "Result:" expected result. + +cmplitmushist.sh + + Compare output from two different runs of the same litmus tests, + with the absolute pathnames of the tests to run provided one + name per line on standard input. Not normally run manually, + provided instead for use by other scripts. + +initlitmushist.sh + + Run all litmus tests having no more than the specified number + of processes given a specified timeout, recording the results + in .litmus.out files. + +judgelitmus.sh + + Given a .litmus file and its .litmus.out herd output, check the + .litmus.out file against the .litmus file's "Result:" comment to + judge whether the test ran correctly. Not normally run manually, + provided instead for use by other scripts. + +newlitmushist.sh + + For all new or updated litmus tests having no more than the + specified number of processes given a specified timeout, run + and record the results in .litmus.out files. + +parseargs.sh + + Parse command-line arguments. Not normally run manually, + provided instead for use by other scripts. + +runlitmushist.sh + + Run the litmus tests whose absolute pathnames are provided one + name per line on standard input. Not normally run manually, + provided instead for use by other scripts. + +README + + This file diff --git a/tools/memory-model/scripts/checkalllitmus.sh b/tools/memory-model/scripts/checkalllitmus.sh index ca528f9a24d4..b35fcd61ecf6 100755 --- a/tools/memory-model/scripts/checkalllitmus.sh +++ b/tools/memory-model/scripts/checkalllitmus.sh @@ -1,42 +1,27 @@ #!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ # -# Run herd tests on all .litmus files in the specified directory (which -# defaults to litmus-tests) and check each file's result against a "Result:" -# comment within that litmus test. If the verification result does not -# match that specified in the litmus test, this script prints an error -# message prefixed with "^^^". It also outputs verification results to -# a file whose name is that of the specified litmus test, but with ".out" -# appended. +# Run herd tests on all .litmus files in the litmus-tests directory +# and check each file's result against a "Result:" comment within that +# litmus test. If the verification result does not match that specified +# in the litmus test, this script prints an error message prefixed with +# "^^^". It also outputs verification results to a file whose name is +# that of the specified litmus test, but with ".out" appended. # # Usage: -# checkalllitmus.sh [ directory ] +# checkalllitmus.sh # -# The LINUX_HERD_OPTIONS environment variable may be used to specify -# arguments to herd, whose default is defined by the checklitmus.sh script. -# Thus, one would normally run this in the directory containing the memory -# model, specifying the pathname of the litmus test to check. +# Run this in the directory containing the memory model. # # This script makes no attempt to run the litmus tests concurrently. # -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, you can access it online at -# http://www.gnu.org/licenses/gpl-2.0.html. -# # Copyright IBM Corporation, 2018 # # Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> -litmusdir=${1-litmus-tests} +. scripts/parseargs.sh + +litmusdir=litmus-tests if test -d "$litmusdir" -a -r "$litmusdir" -a -x "$litmusdir" then : @@ -45,6 +30,14 @@ else exit 255 fi +# Create any new directories that have appeared in the github litmus +# repo since the last run. +if test "$LKMM_DESTDIR" != "." +then + find $litmusdir -type d -print | + ( cd "$LKMM_DESTDIR"; sed -e 's/^/mkdir -p /' | sh ) +fi + # Find the checklitmus script. If it is not where we expect it, then # assume that the caller has the PATH environment variable set # appropriately. @@ -57,7 +50,7 @@ fi # Run the script on all the litmus tests in the specified directory ret=0 -for i in litmus-tests/*.litmus +for i in $litmusdir/*.litmus do if ! $clscript $i then @@ -66,8 +59,8 @@ do done if test "$ret" -ne 0 then - echo " ^^^ VERIFICATION MISMATCHES" + echo " ^^^ VERIFICATION MISMATCHES" 1>&2 else - echo All litmus tests verified as was expected. + echo All litmus tests verified as was expected. 1>&2 fi exit $ret diff --git a/tools/memory-model/scripts/checkghlitmus.sh b/tools/memory-model/scripts/checkghlitmus.sh new file mode 100644 index 000000000000..6589fbb6f653 --- /dev/null +++ b/tools/memory-model/scripts/checkghlitmus.sh @@ -0,0 +1,65 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# +# Runs the C-language litmus tests having a maximum number of processes +# to run, defaults to 6. +# +# sh checkghlitmus.sh +# +# Run from the Linux kernel tools/memory-model directory. See the +# parseargs.sh scripts for arguments. + +. scripts/parseargs.sh + +T=/tmp/checkghlitmus.sh.$$ +trap 'rm -rf $T' 0 +mkdir $T + +# Clone the repository if it is not already present. +if test -d litmus +then + : +else + git clone https://github.com/paulmckrcu/litmus + ( cd litmus; git checkout origin/master ) +fi + +# Create any new directories that have appeared in the github litmus +# repo since the last run. +if test "$LKMM_DESTDIR" != "." +then + find litmus -type d -print | + ( cd "$LKMM_DESTDIR"; sed -e 's/^/mkdir -p /' | sh ) +fi + +# Create a list of the C-language litmus tests previously run. +( cd $LKMM_DESTDIR; find litmus -name '*.litmus.out' -print ) | + sed -e 's/\.out$//' | + xargs -r egrep -l '^ \* Result: (Never|Sometimes|Always|DEADLOCK)' | + xargs -r grep -L "^P${LKMM_PROCS}"> $T/list-C-already + +# Create a list of C-language litmus tests with "Result:" commands and +# no more than the specified number of processes. +find litmus -name '*.litmus' -exec grep -l -m 1 "^C " {} \; > $T/list-C +xargs < $T/list-C -r egrep -l '^ \* Result: (Never|Sometimes|Always|DEADLOCK)' > $T/list-C-result +xargs < $T/list-C-result -r grep -L "^P${LKMM_PROCS}" > $T/list-C-result-short + +# Form list of tests without corresponding .litmus.out files +sort $T/list-C-already $T/list-C-result-short | uniq -u > $T/list-C-needed + +# Run any needed tests. +if scripts/runlitmushist.sh < $T/list-C-needed > $T/run.stdout 2> $T/run.stderr +then + errs= +else + errs=1 +fi + +sed < $T/list-C-result-short -e 's,^,scripts/judgelitmus.sh ,' | + sh > $T/judge.stdout 2> $T/judge.stderr + +if test -n "$errs" +then + cat $T/run.stderr 1>&2 +fi +grep '!!!' $T/judge.stdout diff --git a/tools/memory-model/scripts/checklitmus.sh b/tools/memory-model/scripts/checklitmus.sh index bf12a75c0719..dd08801a30b0 100755 --- a/tools/memory-model/scripts/checklitmus.sh +++ b/tools/memory-model/scripts/checklitmus.sh @@ -1,40 +1,24 @@ #!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ # -# Run a herd test and check the result against a "Result:" comment within -# the litmus test. If the verification result does not match that specified -# in the litmus test, this script prints an error message prefixed with -# "^^^" and exits with a non-zero status. It also outputs verification +# Run a herd test and invokes judgelitmus.sh to check the result against +# a "Result:" comment within the litmus test. It also outputs verification # results to a file whose name is that of the specified litmus test, but # with ".out" appended. # # Usage: # checklitmus.sh file.litmus # -# The LINUX_HERD_OPTIONS environment variable may be used to specify -# arguments to herd, which default to "-conf linux-kernel.cfg". Thus, -# one would normally run this in the directory containing the memory model, -# specifying the pathname of the litmus test to check. -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, you can access it online at -# http://www.gnu.org/licenses/gpl-2.0.html. +# Run this in the directory containing the memory model, specifying the +# pathname of the litmus test to check. The caller is expected to have +# properly set up the LKMM environment variables. # # Copyright IBM Corporation, 2018 # # Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> litmus=$1 -herdoptions=${LINUX_HERD_OPTIONS--conf linux-kernel.cfg} +herdoptions=${LKMM_HERD_OPTIONS--conf linux-kernel.cfg} if test -f "$litmus" -a -r "$litmus" then @@ -43,44 +27,8 @@ else echo ' --- ' error: \"$litmus\" is not a readable file exit 255 fi -if grep -q '^ \* Result: ' $litmus -then - outcome=`grep -m 1 '^ \* Result: ' $litmus | awk '{ print $3 }'` -else - outcome=specified -fi -echo Herd options: $herdoptions > $litmus.out -/usr/bin/time herd7 -o ~/tmp $herdoptions $litmus >> $litmus.out 2>&1 -grep "Herd options:" $litmus.out -grep '^Observation' $litmus.out -if grep -q '^Observation' $litmus.out -then - : -else - cat $litmus.out - echo ' ^^^ Verification error' - echo ' ^^^ Verification error' >> $litmus.out 2>&1 - exit 255 -fi -if test "$outcome" = DEADLOCK -then - echo grep 3 and 4 - if grep '^Observation' $litmus.out | grep -q 'Never 0 0$' - then - ret=0 - else - echo " ^^^ Unexpected non-$outcome verification" - echo " ^^^ Unexpected non-$outcome verification" >> $litmus.out 2>&1 - ret=1 - fi -elif grep '^Observation' $litmus.out | grep -q $outcome || test "$outcome" = Maybe -then - ret=0 -else - echo " ^^^ Unexpected non-$outcome verification" - echo " ^^^ Unexpected non-$outcome verification" >> $litmus.out 2>&1 - ret=1 -fi -tail -2 $litmus.out | head -1 -exit $ret +echo Herd options: $herdoptions > $LKMM_DESTDIR/$litmus.out +/usr/bin/time $LKMM_TIMEOUT_CMD herd7 $herdoptions $litmus >> $LKMM_DESTDIR/$litmus.out 2>&1 + +scripts/judgelitmus.sh $litmus diff --git a/tools/memory-model/scripts/checklitmushist.sh b/tools/memory-model/scripts/checklitmushist.sh new file mode 100644 index 000000000000..1d210ffb7c8a --- /dev/null +++ b/tools/memory-model/scripts/checklitmushist.sh @@ -0,0 +1,60 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# +# Reruns the C-language litmus tests previously run that match the +# specified criteria, and compares the result to that of the previous +# runs from initlitmushist.sh and/or newlitmushist.sh. +# +# sh checklitmushist.sh +# +# Run from the Linux kernel tools/memory-model directory. +# See scripts/parseargs.sh for list of arguments. +# +# Copyright IBM Corporation, 2018 +# +# Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + +. scripts/parseargs.sh + +T=/tmp/checklitmushist.sh.$$ +trap 'rm -rf $T' 0 +mkdir $T + +if test -d litmus +then + : +else + echo Run scripts/initlitmushist.sh first, need litmus repo. + exit 1 +fi + +# Create the results directory and populate it with subdirectories. +# The initial output is created here to avoid clobbering the output +# generated earlier. +mkdir $T/results +find litmus -type d -print | ( cd $T/results; sed -e 's/^/mkdir -p /' | sh ) + +# Create the list of litmus tests already run, then remove those that +# are excluded by this run's --procs argument. +( cd $LKMM_DESTDIR; find litmus -name '*.litmus.out' -print ) | + sed -e 's/\.out$//' | + xargs -r grep -L "^P${LKMM_PROCS}"> $T/list-C-already +xargs < $T/list-C-already -r grep -L "^P${LKMM_PROCS}" > $T/list-C-short + +# Redirect output, run tests, then restore destination directory. +destdir="$LKMM_DESTDIR" +LKMM_DESTDIR=$T/results; export LKMM_DESTDIR +scripts/runlitmushist.sh < $T/list-C-short > $T/runlitmushist.sh.out 2>&1 +LKMM_DESTDIR="$destdir"; export LKMM_DESTDIR + +# Move the newly generated .litmus.out files to .litmus.out.new files +# in the destination directory. +cdir=`pwd` +ddir=`awk -v c="$cdir" -v d="$LKMM_DESTDIR" \ + 'END { if (d ~ /^\//) print d; else print c "/" d; }' < /dev/null` +( cd $T/results; find litmus -type f -name '*.litmus.out' -print | + sed -e 's,^.*$,cp & '"$ddir"'/&.new,' | sh ) + +sed < $T/list-C-short -e 's,^,'"$LKMM_DESTDIR/"',' | + sh scripts/cmplitmushist.sh +exit $? diff --git a/tools/memory-model/scripts/cmplitmushist.sh b/tools/memory-model/scripts/cmplitmushist.sh new file mode 100644 index 000000000000..0f498aeeccf5 --- /dev/null +++ b/tools/memory-model/scripts/cmplitmushist.sh @@ -0,0 +1,87 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# +# Compares .out and .out.new files for each name on standard input, +# one full pathname per line. Outputs comparison results followed by +# a summary. +# +# sh cmplitmushist.sh + +T=/tmp/cmplitmushist.sh.$$ +trap 'rm -rf $T' 0 +mkdir $T + +# comparetest oldpath newpath +perfect=0 +obsline=0 +noobsline=0 +obsresult=0 +badcompare=0 +comparetest () { + grep -v 'maxresident)k\|minor)pagefaults\|^Time' $1 > $T/oldout + grep -v 'maxresident)k\|minor)pagefaults\|^Time' $2 > $T/newout + if cmp -s $T/oldout $T/newout && grep -q '^Observation' $1 + then + echo Exact output match: $2 + perfect=`expr "$perfect" + 1` + return 0 + fi + + grep '^Observation' $1 > $T/oldout + grep '^Observation' $2 > $T/newout + if test -s $T/oldout -o -s $T/newout + then + if cmp -s $T/oldout $T/newout + then + echo Matching Observation result and counts: $2 + obsline=`expr "$obsline" + 1` + return 0 + fi + else + echo Missing Observation line "(e.g., herd7 timeout)": $2 + noobsline=`expr "$noobsline" + 1` + return 0 + fi + + grep '^Observation' $1 | awk '{ print $3 }' > $T/oldout + grep '^Observation' $2 | awk '{ print $3 }' > $T/newout + if cmp -s $T/oldout $T/newout + then + echo Matching Observation Always/Sometimes/Never result: $2 + obsresult=`expr "$obsresult" + 1` + return 0 + fi + echo ' !!!' Result changed: $2 + badcompare=`expr "$badcompare" + 1` + return 1 +} + +sed -e 's/^.*$/comparetest &.out &.out.new/' > $T/cmpscript +. $T/cmpscript > $T/cmpscript.out +cat $T/cmpscript.out + +echo ' ---' Summary: 1>&2 +grep '!!!' $T/cmpscript.out 1>&2 +if test "$perfect" -ne 0 +then + echo Exact output matches: $perfect 1>&2 +fi +if test "$obsline" -ne 0 +then + echo Matching Observation result and counts: $obsline 1>&2 +fi +if test "$noobsline" -ne 0 +then + echo Missing Observation line "(e.g., herd7 timeout)": $noobsline 1>&2 +fi +if test "$obsresult" -ne 0 +then + echo Matching Observation Always/Sometimes/Never result: $obsresult 1>&2 +fi +if test "$badcompare" -ne 0 +then + echo "!!!" Result changed: $badcompare 1>&2 + exit 1 +fi + +exit 0 diff --git a/tools/memory-model/scripts/initlitmushist.sh b/tools/memory-model/scripts/initlitmushist.sh new file mode 100644 index 000000000000..956b6957484d --- /dev/null +++ b/tools/memory-model/scripts/initlitmushist.sh @@ -0,0 +1,68 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# +# Runs the C-language litmus tests matching the specified criteria. +# Generates the output for each .litmus file into a corresponding +# .litmus.out file, and does not judge the result. +# +# sh initlitmushist.sh +# +# Run from the Linux kernel tools/memory-model directory. +# See scripts/parseargs.sh for list of arguments. +# +# This script can consume significant wallclock time and CPU, especially as +# the value of --procs rises. On a four-core (eight hardware threads) +# 2.5GHz x86 with a one-minute per-run timeout: +# +# --procs wallclock CPU timeouts tests +# 1 0m11.241s 0m1.086s 0 19 +# 2 1m12.598s 2m8.459s 2 393 +# 3 1m30.007s 6m2.479s 4 2291 +# 4 3m26.042s 18m5.139s 9 3217 +# 5 4m26.661s 23m54.128s 13 3784 +# 6 4m41.900s 26m4.721s 13 4352 +# 7 5m51.463s 35m50.868s 13 4626 +# 8 10m5.235s 68m43.672s 34 5117 +# 9 15m57.80s 105m58.101s 69 5156 +# 10 16m14.13s 103m35.009s 69 5165 +# 20 27m48.55s 198m3.286s 156 5269 +# +# Increasing the timeout on the 20-process run to five minutes increases +# the runtime to about 90 minutes with the CPU time rising to about +# 10 hours. On the other hand, it decreases the number of timeouts to 101. +# +# Note that there are historical tests for which herd7 will fail +# completely, for example, litmus/manual/atomic/C-unlock-wait-00.litmus +# contains a call to spin_unlock_wait(), which no longer exists in either +# the kernel or LKMM. + +. scripts/parseargs.sh + +T=/tmp/initlitmushist.sh.$$ +trap 'rm -rf $T' 0 +mkdir $T + +if test -d litmus +then + : +else + git clone https://github.com/paulmckrcu/litmus + ( cd litmus; git checkout origin/master ) +fi + +# Create any new directories that have appeared in the github litmus +# repo since the last run. +if test "$LKMM_DESTDIR" != "." +then + find litmus -type d -print | + ( cd "$LKMM_DESTDIR"; sed -e 's/^/mkdir -p /' | sh ) +fi + +# Create a list of the C-language litmus tests with no more than the +# specified number of processes (per the --procs argument). +find litmus -name '*.litmus' -exec grep -l -m 1 "^C " {} \; > $T/list-C +xargs < $T/list-C -r grep -L "^P${LKMM_PROCS}" > $T/list-C-short + +scripts/runlitmushist.sh < $T/list-C-short + +exit 0 diff --git a/tools/memory-model/scripts/judgelitmus.sh b/tools/memory-model/scripts/judgelitmus.sh new file mode 100644 index 000000000000..0cc63875e395 --- /dev/null +++ b/tools/memory-model/scripts/judgelitmus.sh @@ -0,0 +1,78 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# +# Given a .litmus test and the corresponding .litmus.out file, check +# the .litmus.out file against the "Result:" comment to judge whether +# the test ran correctly. +# +# Usage: +# judgelitmus.sh file.litmus +# +# Run this in the directory containing the memory model, specifying the +# pathname of the litmus test to check. +# +# Copyright IBM Corporation, 2018 +# +# Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + +litmus=$1 + +if test -f "$litmus" -a -r "$litmus" +then + : +else + echo ' --- ' error: \"$litmus\" is not a readable file + exit 255 +fi +if test -f "$LKMM_DESTDIR/$litmus".out -a -r "$LKMM_DESTDIR/$litmus".out +then + : +else + echo ' --- ' error: \"$LKMM_DESTDIR/$litmus\".out is not a readable file + exit 255 +fi +if grep -q '^ \* Result: ' $litmus +then + outcome=`grep -m 1 '^ \* Result: ' $litmus | awk '{ print $3 }'` +else + outcome=specified +fi + +grep '^Observation' $LKMM_DESTDIR/$litmus.out +if grep -q '^Observation' $LKMM_DESTDIR/$litmus.out +then + : +else + echo ' !!! Verification error' $litmus + if ! grep -q '!!!' $LKMM_DESTDIR/$litmus.out + then + echo ' !!! Verification error' >> $LKMM_DESTDIR/$litmus.out 2>&1 + fi + exit 255 +fi +if test "$outcome" = DEADLOCK +then + if grep '^Observation' $LKMM_DESTDIR/$litmus.out | grep -q 'Never 0 0$' + then + ret=0 + else + echo " !!! Unexpected non-$outcome verification" $litmus + if ! grep -q '!!!' $LKMM_DESTDIR/$litmus.out + then + echo " !!! Unexpected non-$outcome verification" >> $LKMM_DESTDIR/$litmus.out 2>&1 + fi + ret=1 + fi +elif grep '^Observation' $LKMM_DESTDIR/$litmus.out | grep -q $outcome || test "$outcome" = Maybe +then + ret=0 +else + echo " !!! Unexpected non-$outcome verification" $litmus + if ! grep -q '!!!' $LKMM_DESTDIR/$litmus.out + then + echo " !!! Unexpected non-$outcome verification" >> $LKMM_DESTDIR/$litmus.out 2>&1 + fi + ret=1 +fi +tail -2 $LKMM_DESTDIR/$litmus.out | head -1 +exit $ret diff --git a/tools/memory-model/scripts/newlitmushist.sh b/tools/memory-model/scripts/newlitmushist.sh new file mode 100644 index 000000000000..991f8f814881 --- /dev/null +++ b/tools/memory-model/scripts/newlitmushist.sh @@ -0,0 +1,61 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# +# Runs the C-language litmus tests matching the specified criteria +# that do not already have a corresponding .litmus.out file, and does +# not judge the result. +# +# sh newlitmushist.sh +# +# Run from the Linux kernel tools/memory-model directory. +# See scripts/parseargs.sh for list of arguments. +# +# Copyright IBM Corporation, 2018 +# +# Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + +. scripts/parseargs.sh + +T=/tmp/newlitmushist.sh.$$ +trap 'rm -rf $T' 0 +mkdir $T + +if test -d litmus +then + : +else + echo Run scripts/initlitmushist.sh first, need litmus repo. + exit 1 +fi + +# Create any new directories that have appeared in the github litmus +# repo since the last run. +if test "$LKMM_DESTDIR" != "." +then + find litmus -type d -print | + ( cd "$LKMM_DESTDIR"; sed -e 's/^/mkdir -p /' | sh ) +fi + +# Create a list of the C-language litmus tests previously run. +( cd $LKMM_DESTDIR; find litmus -name '*.litmus.out' -print ) | + sed -e 's/\.out$//' | + xargs -r grep -L "^P${LKMM_PROCS}"> $T/list-C-already + +# Form full list of litmus tests with no more than the specified +# number of processes (per the --procs argument). +find litmus -name '*.litmus' -exec grep -l -m 1 "^C " {} \; > $T/list-C-all +xargs < $T/list-C-all -r grep -L "^P${LKMM_PROCS}" > $T/list-C-short + +# Form list of new tests. Note: This does not handle litmus-test deletion! +sort $T/list-C-already $T/list-C-short | uniq -u > $T/list-C-new + +# Form list of litmus tests that have changed since the last run. +sed < $T/list-C-short -e 's,^.*$,if test & -nt '"$LKMM_DESTDIR"'/&.out; then echo &; fi,' > $T/list-C-script +sh $T/list-C-script > $T/list-C-newer + +# Merge the list of new and of updated litmus tests: These must be (re)run. +sort -u $T/list-C-new $T/list-C-newer > $T/list-C-needed + +scripts/runlitmushist.sh < $T/list-C-needed + +exit 0 diff --git a/tools/memory-model/scripts/parseargs.sh b/tools/memory-model/scripts/parseargs.sh new file mode 100644 index 000000000000..859e1d581e05 --- /dev/null +++ b/tools/memory-model/scripts/parseargs.sh @@ -0,0 +1,136 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# +# the corresponding .litmus.out file, and does not judge the result. +# +# . scripts/parseargs.sh +# +# Include into other Linux kernel tools/memory-model scripts. +# +# Copyright IBM Corporation, 2018 +# +# Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + +T=/tmp/parseargs.sh.$$ +mkdir $T + +# Initialize one parameter: initparam name default +initparam () { + echo if test -z '"$'$1'"' > $T/s + echo then >> $T/s + echo $1='"'$2'"' >> $T/s + echo export $1 >> $T/s + echo fi >> $T/s + echo $1_DEF='$'$1 >> $T/s + . $T/s +} + +initparam LKMM_DESTDIR "." +initparam LKMM_HERD_OPTIONS "-conf linux-kernel.cfg" +initparam LKMM_JOBS `getconf _NPROCESSORS_ONLN` +initparam LKMM_PROCS "3" +initparam LKMM_TIMEOUT "1m" + +scriptname=$0 + +usagehelp () { + echo "Usage $scriptname [ arguments ]" + echo " --destdir path (place for .litmus.out, default by .litmus)" + echo " --herdopts -conf linux-kernel.cfg ..." + echo " --jobs N (number of jobs, default one per CPU)" + echo " --procs N (litmus tests with at most this many processes)" + echo " --timeout N (herd7 timeout (e.g., 10s, 1m, 2hr, 1d, '')" + echo "Defaults: --destdir '$LKMM_DESTDIR_DEF' --herdopts '$LKMM_HERD_OPTIONS_DEF' --jobs '$LKMM_JOBS_DEF' --procs '$LKMM_PROCS_DEF' --timeout '$LKMM_TIMEOUT_DEF'" + exit 1 +} + +usage () { + usagehelp 1>&2 +} + +# checkarg --argname argtype $# arg mustmatch cannotmatch +checkarg () { + if test $3 -le 1 + then + echo $1 needs argument $2 matching \"$5\" + usage + fi + if echo "$4" | grep -q -e "$5" + then + : + else + echo $1 $2 \"$4\" must match \"$5\" + usage + fi + if echo "$4" | grep -q -e "$6" + then + echo $1 $2 \"$4\" must not match \"$6\" + usage + fi +} + +while test $# -gt 0 +do + case "$1" in + --destdir) + checkarg --destdir "(path to directory)" "$#" "$2" '.\+' '^--' + LKMM_DESTDIR="$2" + mkdir $LKMM_DESTDIR > /dev/null 2>&1 + if ! test -e "$LKMM_DESTDIR" + then + echo "Cannot create directory --destdir '$LKMM_DESTDIR'" + usage + fi + if test -d "$LKMM_DESTDIR" -a -w "$LKMM_DESTDIR" -a -x "$LKMM_DESTDIR" + then + : + else + echo "Directory --destdir '$LKMM_DESTDIR' insufficient permissions to create files" + usage + fi + shift + ;; + --herdopts|--herdopt) + checkarg --destdir "(herd options)" "$#" "$2" '.*' '^--' + LKMM_HERD_OPTIONS="$2" + shift + ;; + -j[1-9]*) + njobs="`echo $1 | sed -e 's/^-j//'`" + trailchars="`echo $njobs | sed -e 's/[0-9]\+\(.*\)$/\1/'`" + if test -n "$trailchars" + then + echo $1 trailing characters "'$trailchars'" + usagehelp + fi + LKMM_JOBS="`echo $njobs | sed -e 's/^\([0-9]\+\).*$/\1/'`" + ;; + --jobs|--job|-j) + checkarg --jobs "(number)" "$#" "$2" '^[1-9][0-9]\+$' '^--' + LKMM_JOBS="$2" + shift + ;; + --procs|--proc) + checkarg --procs "(number)" "$#" "$2" '^[0-9]\+$' '^--' + LKMM_PROCS="$2" + shift + ;; + --timeout) + checkarg --timeout "(timeout spec)" "$#" "$2" '^\([0-9]\+[smhd]\?\|\)$' '^--' + LKMM_TIMEOUT="$2" + shift + ;; + *) + echo Unknown argument $1 + usage + ;; + esac + shift +done +if test -z "$LKMM_TIMEOUT" +then + LKMM_TIMEOUT_CMD=""; export LKMM_TIMEOUT_CMD +else + LKMM_TIMEOUT_CMD="timeout $LKMM_TIMEOUT"; export LKMM_TIMEOUT_CMD +fi +rm -rf $T diff --git a/tools/memory-model/scripts/runlitmushist.sh b/tools/memory-model/scripts/runlitmushist.sh new file mode 100644 index 000000000000..e507f5f933d5 --- /dev/null +++ b/tools/memory-model/scripts/runlitmushist.sh @@ -0,0 +1,87 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Runs the C-language litmus tests specified on standard input, using up +# to the specified number of CPUs (defaulting to all of them) and placing +# the results in the specified directory (defaulting to the same place +# the litmus test came from). +# +# sh runlitmushist.sh +# +# Run from the Linux kernel tools/memory-model directory. +# This script uses environment variables produced by parseargs.sh. +# +# Copyright IBM Corporation, 2018 +# +# Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + +T=/tmp/runlitmushist.sh.$$ +trap 'rm -rf $T' 0 +mkdir $T + +if test -d litmus +then + : +else + echo Directory \"litmus\" missing, aborting run. + exit 1 +fi + +# Prefixes for per-CPU scripts +for ((i=0;i<$LKMM_JOBS;i++)) +do + echo dir="$LKMM_DESTDIR" > $T/$i.sh + echo T=$T >> $T/$i.sh + echo herdoptions=\"$LKMM_HERD_OPTIONS\" >> $T/$i.sh + cat << '___EOF___' >> $T/$i.sh + runtest () { + echo ' ... ' /usr/bin/time $LKMM_TIMEOUT_CMD herd7 $herdoptions $1 '>' $dir/$1.out '2>&1' + if /usr/bin/time $LKMM_TIMEOUT_CMD herd7 $herdoptions $1 > $dir/$1.out 2>&1 + then + if ! grep -q '^Observation ' $dir/$1.out + then + echo ' !!! Herd failed, no Observation:' $1 + fi + else + exitcode=$? + if test "$exitcode" -eq 124 + then + exitmsg="timed out" + else + exitmsg="failed, exit code $exitcode" + fi + echo ' !!! Herd' ${exitmsg}: $1 + fi + } +___EOF___ +done + +awk -v q="'" -v b='\\' ' +{ + print "echo `grep " q "^P[0-9]" b "+(" q " " $0 " | tail -1 | sed -e " q "s/^P" b "([0-9]" b "+" b ")(.*$/" b "1/" q "` " $0 +}' | bash | +sort -k1n | +awk -v ncpu=$LKMM_JOBS -v t=$T ' +{ + print "runtest " $2 >> t "/" NR % ncpu ".sh"; +} + +END { + for (i = 0; i < ncpu; i++) { + print "sh " t "/" i ".sh > " t "/" i ".sh.out 2>&1 &"; + close(t "/" i ".sh"); + } + print "wait"; +}' | sh +cat $T/*.sh.out +if grep -q '!!!' $T/*.sh.out +then + echo ' ---' Summary: 1>&2 + grep '!!!' $T/*.sh.out 1>&2 + nfail="`grep '!!!' $T/*.sh.out | wc -l`" + echo 'Number of failed herd runs (e.g., timeout): ' $nfail 1>&2 + exit 1 +else + echo All runs completed successfully. 1>&2 + exit 0 +fi diff --git a/tools/perf/Build b/tools/perf/Build index e5232d567611..5f392dbb88fc 100644 --- a/tools/perf/Build +++ b/tools/perf/Build @@ -46,10 +46,10 @@ CFLAGS_builtin-trace.o += -DSTRACE_GROUPS_DIR="BUILD_STR($(STRACE_GROUPS_DIR_ CFLAGS_builtin-report.o += -DTIPDIR="BUILD_STR($(tipdir_SQ))" CFLAGS_builtin-report.o += -DDOCDIR="BUILD_STR($(srcdir_SQ)/Documentation)" -libperf-y += util/ -libperf-y += arch/ -libperf-y += ui/ -libperf-y += scripts/ -libperf-$(CONFIG_TRACE) += trace/beauty/ +perf-y += util/ +perf-y += arch/ +perf-y += ui/ +perf-y += scripts/ +perf-$(CONFIG_TRACE) += trace/beauty/ gtk-y += ui/gtk/ diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt index 4ac7775fbc11..86f3dcc15f83 100644 --- a/tools/perf/Documentation/perf-config.txt +++ b/tools/perf/Documentation/perf-config.txt @@ -120,6 +120,10 @@ Given a $HOME/.perfconfig like this: children = true group = true + [llvm] + dump-obj = true + clang-opt = -g + You can hide source code of annotate feature setting the config to false with % perf config annotate.hide_src_code=true @@ -553,6 +557,33 @@ trace.*:: trace.show_zeros:: Do not suppress syscall arguments that are equal to zero. +llvm.*:: + llvm.clang-path:: + Path to clang. If omit, search it from $PATH. + + llvm.clang-bpf-cmd-template:: + Cmdline template. Below lines show its default value. Environment + variable is used to pass options. + "$CLANG_EXEC -D__KERNEL__ $CLANG_OPTIONS $KERNEL_INC_OPTIONS \ + -Wno-unused-value -Wno-pointer-sign -working-directory \ + $WORKING_DIR -c $CLANG_SOURCE -target bpf -O2 -o -" + + llvm.clang-opt:: + Options passed to clang. + + llvm.kbuild-dir:: + kbuild directory. If not set, use /lib/modules/`uname -r`/build. + If set to "" deliberately, skip kernel header auto-detector. + + llvm.kbuild-opts:: + Options passed to 'make' when detecting kernel header options. + + llvm.dump-obj:: + Enable perf dump BPF object files compiled by LLVM. + + llvm.opts:: + Options passed to llc. + SEE ALSO -------- linkperf:perf[1] diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index d232b13ea713..8f0c2be34848 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -88,6 +88,20 @@ OPTIONS If you want to profile write accesses in [0x1000~1008), just set 'mem:0x1000/8:w'. + - a BPF source file (ending in .c) or a precompiled object file (ending + in .o) selects one or more BPF events. + The BPF program can attach to various perf events based on the ELF section + names. + + When processing a '.c' file, perf searches an installed LLVM to compile it + into an object file first. Optional clang options can be passed via the + '--clang-opt' command line option, e.g.: + + perf record --clang-opt "-DLINUX_VERSION_CODE=0x50000" \ + -e tests/bpf-script-example.c + + Note: '--clang-opt' must be placed before '--event/-e'. + - a group of events surrounded by a pair of brace ("{event1,event2,...}"). Each event is separated by commas and the group should be quoted to prevent the shell interpretation. You also need to use --group on @@ -440,6 +454,11 @@ Use <n> control blocks in asynchronous (Posix AIO) trace writing mode (default: Asynchronous mode is supported only when linking Perf tool with libc library providing implementation for Posix AIO API. +--affinity=mode:: +Set affinity mask of trace reading thread according to the policy defined by 'mode' value: + node - thread affinity mask is set to NUMA node cpu mask of the processed mmap buffer + cpu - thread affinity mask is set to cpu of the processed mmap buffer + --all-kernel:: Configure all used events to run in kernel space. diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index 9e4def08d569..2e19fd7ffe35 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -159,6 +159,12 @@ OPTIONS the override, and the result of the above is that only S/W and H/W events are displayed with the given fields. + It's possible tp add/remove fields only for specific event type: + + -Fsw:-cpu,-period + + removes cpu and period from software events. + For the 'wildcard' option if a user selected field is invalid for an event type, a message is displayed to the user that the option is ignored for that type. For example: diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt index 631e687be4eb..fc6e43262c41 100644 --- a/tools/perf/Documentation/perf-trace.txt +++ b/tools/perf/Documentation/perf-trace.txt @@ -210,6 +210,14 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs. may happen, for instance, when a thread gets migrated to a different CPU while processing a syscall. +--map-dump:: + Dump BPF maps setup by events passed via -e, for instance the augmented_raw_syscalls + living in tools/perf/examples/bpf/augmented_raw_syscalls.c. For now this + dumps just boolean map values and integer keys, in time this will print in hex + by default and use BTF when available, as well as use functions to do pretty + printing using the existing 'perf trace' syscall arg beautifiers to map integer + arguments to strings (pid to comm, syscall id to syscall name, etc). + PAGEFAULTS ---------- diff --git a/tools/perf/Documentation/perf.data-file-format.txt b/tools/perf/Documentation/perf.data-file-format.txt index dfb218feaad9..593ef49b273c 100644 --- a/tools/perf/Documentation/perf.data-file-format.txt +++ b/tools/perf/Documentation/perf.data-file-format.txt @@ -43,11 +43,10 @@ struct perf_file_section { Flags section: -The header is followed by different optional headers, described by the bits set -in flags. Only headers for which the bit is set are included. Each header -consists of a perf_file_section located after the initial header. -The respective perf_file_section points to the data of the additional -header and defines its size. +For each of the optional features a perf_file_section it placed after the data +section if the feature bit is set in the perf_header flags bitset. The +respective perf_file_section points to the data of the additional header and +defines its size. Some headers consist of strings, which are defined like this: @@ -131,7 +130,7 @@ An uint64_t with the total memory in bytes. HEADER_CMDLINE = 11, -A perf_header_string with the perf command line used to collect the data. +A perf_header_string_list with the perf arg-vector used to collect the data. HEADER_EVENT_DESC = 12, diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index b441c88cafa1..0f11d5891301 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -109,6 +109,13 @@ FEATURE_CHECK_LDFLAGS-libunwind = $(LIBUNWIND_LDFLAGS) $(LIBUNWIND_LIBS) FEATURE_CHECK_CFLAGS-libunwind-debug-frame = $(LIBUNWIND_CFLAGS) FEATURE_CHECK_LDFLAGS-libunwind-debug-frame = $(LIBUNWIND_LDFLAGS) $(LIBUNWIND_LIBS) +FEATURE_CHECK_LDFLAGS-libunwind-arm = -lunwind -lunwind-arm +FEATURE_CHECK_LDFLAGS-libunwind-aarch64 = -lunwind -lunwind-aarch64 +FEATURE_CHECK_LDFLAGS-libunwind-x86 = -lunwind -llzma -lunwind-x86 +FEATURE_CHECK_LDFLAGS-libunwind-x86_64 = -lunwind -llzma -lunwind-x86_64 + +FEATURE_CHECK_LDFLAGS-libcrypto = -lcrypto + ifdef CSINCLUDES LIBOPENCSD_CFLAGS := -I$(CSINCLUDES) endif @@ -218,6 +225,8 @@ FEATURE_CHECK_LDFLAGS-libpython := $(PYTHON_EMBED_LDOPTS) FEATURE_CHECK_CFLAGS-libpython-version := $(PYTHON_EMBED_CCOPTS) FEATURE_CHECK_LDFLAGS-libpython-version := $(PYTHON_EMBED_LDOPTS) +FEATURE_CHECK_LDFLAGS-libaio = -lrt + CFLAGS += -fno-omit-frame-pointer CFLAGS += -ggdb3 CFLAGS += -funwind-tables @@ -386,7 +395,8 @@ ifeq ($(feature-setns), 1) $(call detected,CONFIG_SETNS) endif -ifndef NO_CORESIGHT +ifdef CORESIGHT + $(call feature_check,libopencsd) ifeq ($(feature-libopencsd), 1) CFLAGS += -DHAVE_CSTRACE_SUPPORT $(LIBOPENCSD_CFLAGS) LDFLAGS += $(LIBOPENCSD_LDFLAGS) @@ -482,6 +492,7 @@ endif ifndef NO_LIBUNWIND have_libunwind := + $(call feature_check,libunwind-x86) ifeq ($(feature-libunwind-x86), 1) $(call detected,CONFIG_LIBUNWIND_X86) CFLAGS += -DHAVE_LIBUNWIND_X86_SUPPORT @@ -490,6 +501,7 @@ ifndef NO_LIBUNWIND have_libunwind = 1 endif + $(call feature_check,libunwind-aarch64) ifeq ($(feature-libunwind-aarch64), 1) $(call detected,CONFIG_LIBUNWIND_AARCH64) CFLAGS += -DHAVE_LIBUNWIND_AARCH64_SUPPORT diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index 0ee6795d82cc..01f7555fd933 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -102,7 +102,7 @@ include ../scripts/utilities.mak # When selected, pass LLVM_CONFIG=/path/to/llvm-config to `make' if # llvm-config is not in $PATH. # -# Define NO_CORESIGHT if you do not want support for CoreSight trace decoding. +# Define CORESIGHT if you DO WANT support for CoreSight trace decoding. # # Define NO_AIO if you do not want support of Posix AIO based trace # streaming for record mode. Currently Posix AIO trace streaming is @@ -344,9 +344,9 @@ endif export PERL_PATH -LIB_FILE=$(OUTPUT)libperf.a +LIBPERF_A=$(OUTPUT)libperf.a -PERFLIBS = $(LIB_FILE) $(LIBAPI) $(LIBTRACEEVENT) $(LIBSUBCMD) +PERFLIBS = $(LIBAPI) $(LIBTRACEEVENT) $(LIBSUBCMD) ifndef NO_LIBBPF PERFLIBS += $(LIBBPF) endif @@ -549,6 +549,8 @@ JEVENTS_IN := $(OUTPUT)pmu-events/jevents-in.o PMU_EVENTS_IN := $(OUTPUT)pmu-events/pmu-events-in.o +LIBPERF_IN := $(OUTPUT)libperf-in.o + export JEVENTS build := -f $(srctree)/tools/build/Makefile.build dir=. obj @@ -565,9 +567,12 @@ $(JEVENTS): $(JEVENTS_IN) $(PMU_EVENTS_IN): $(JEVENTS) FORCE $(Q)$(MAKE) -f $(srctree)/tools/build/Makefile.build dir=pmu-events obj=pmu-events -$(OUTPUT)perf: $(PERFLIBS) $(PERF_IN) $(PMU_EVENTS_IN) $(LIBTRACEEVENT_DYNAMIC_LIST) +$(LIBPERF_IN): prepare FORCE + $(Q)$(MAKE) $(build)=libperf + +$(OUTPUT)perf: $(PERFLIBS) $(PERF_IN) $(PMU_EVENTS_IN) $(LIBPERF_IN) $(LIBTRACEEVENT_DYNAMIC_LIST) $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $(LIBTRACEEVENT_DYNAMIC_LIST_LDFLAGS) \ - $(PERF_IN) $(PMU_EVENTS_IN) $(LIBS) -o $@ + $(PERF_IN) $(PMU_EVENTS_IN) $(LIBPERF_IN) $(LIBS) -o $@ $(GTK_IN): FORCE $(Q)$(MAKE) $(build)=gtk @@ -683,12 +688,7 @@ endif $(patsubst perf-%,%.o,$(PROGRAMS)): $(wildcard */*.h) -LIBPERF_IN := $(OUTPUT)libperf-in.o - -$(LIBPERF_IN): prepare FORCE - $(Q)$(MAKE) $(build)=libperf - -$(LIB_FILE): $(LIBPERF_IN) +$(LIBPERF_A): $(LIBPERF_IN) $(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIBPERF_IN) $(LIB_OBJS) LIBTRACEEVENT_FLAGS += plugin_dir=$(plugindir_SQ) 'EXTRA_CFLAGS=$(EXTRA_CFLAGS)' 'LDFLAGS=$(LDFLAGS)' @@ -863,8 +863,8 @@ ifndef NO_LIBPYTHON $(call QUIET_INSTALL, python-scripts) \ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python/Perf-Trace-Util/lib/Perf/Trace'; \ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python/bin'; \ - $(INSTALL) scripts/python/Perf-Trace-Util/lib/Perf/Trace/* -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python/Perf-Trace-Util/lib/Perf/Trace'; \ - $(INSTALL) scripts/python/*.py -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python'; \ + $(INSTALL) scripts/python/Perf-Trace-Util/lib/Perf/Trace/* -m 644 -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python/Perf-Trace-Util/lib/Perf/Trace'; \ + $(INSTALL) scripts/python/*.py -m 644 -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python'; \ $(INSTALL) scripts/python/bin/* -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python/bin' endif $(call QUIET_INSTALL, perf_completion-script) \ @@ -910,7 +910,7 @@ python-clean: $(python-clean) clean:: $(LIBTRACEEVENT)-clean $(LIBAPI)-clean $(LIBBPF)-clean $(LIBSUBCMD)-clean config-clean fixdep-clean python-clean - $(call QUIET_CLEAN, core-objs) $(RM) $(LIB_FILE) $(OUTPUT)perf-archive $(OUTPUT)perf-with-kcore $(LANG_BINDINGS) + $(call QUIET_CLEAN, core-objs) $(RM) $(LIBPERF_A) $(OUTPUT)perf-archive $(OUTPUT)perf-with-kcore $(LANG_BINDINGS) $(Q)find $(if $(OUTPUT),$(OUTPUT),.) -name '*.o' -delete -o -name '\.*.cmd' -delete -o -name '\.*.d' -delete $(Q)$(RM) $(OUTPUT).config-detected $(call QUIET_CLEAN, core-progs) $(RM) $(ALL_PROGRAMS) perf perf-read-vdso32 perf-read-vdsox32 $(OUTPUT)pmu-events/jevents $(OUTPUT)$(LIBJVMTI).so diff --git a/tools/perf/arch/Build b/tools/perf/arch/Build index d9b6af837c7d..688818844c11 100644 --- a/tools/perf/arch/Build +++ b/tools/perf/arch/Build @@ -1,2 +1,2 @@ -libperf-y += common.o -libperf-y += $(SRCARCH)/ +perf-y += common.o +perf-y += $(SRCARCH)/ diff --git a/tools/perf/arch/arm/Build b/tools/perf/arch/arm/Build index 41bf61da476a..36222e64bbf7 100644 --- a/tools/perf/arch/arm/Build +++ b/tools/perf/arch/arm/Build @@ -1,2 +1,2 @@ -libperf-y += util/ -libperf-$(CONFIG_DWARF_UNWIND) += tests/ +perf-y += util/ +perf-$(CONFIG_DWARF_UNWIND) += tests/ diff --git a/tools/perf/arch/arm/tests/Build b/tools/perf/arch/arm/tests/Build index d9ae2733f9cc..bc8e97380c82 100644 --- a/tools/perf/arch/arm/tests/Build +++ b/tools/perf/arch/arm/tests/Build @@ -1,5 +1,5 @@ -libperf-y += regs_load.o -libperf-y += dwarf-unwind.o -libperf-y += vectors-page.o +perf-y += regs_load.o +perf-y += dwarf-unwind.o +perf-y += vectors-page.o -libperf-y += arch-tests.o +perf-y += arch-tests.o diff --git a/tools/perf/arch/arm/tests/dwarf-unwind.c b/tools/perf/arch/arm/tests/dwarf-unwind.c index 9a0242e74cfc..2c35e532bc9a 100644 --- a/tools/perf/arch/arm/tests/dwarf-unwind.c +++ b/tools/perf/arch/arm/tests/dwarf-unwind.c @@ -3,6 +3,7 @@ #include "perf_regs.h" #include "thread.h" #include "map.h" +#include "map_groups.h" #include "event.h" #include "debug.h" #include "tests/tests.h" diff --git a/tools/perf/arch/arm/util/Build b/tools/perf/arch/arm/util/Build index e64c5f216448..296f0eac5e18 100644 --- a/tools/perf/arch/arm/util/Build +++ b/tools/perf/arch/arm/util/Build @@ -1,6 +1,6 @@ -libperf-$(CONFIG_DWARF) += dwarf-regs.o +perf-$(CONFIG_DWARF) += dwarf-regs.o -libperf-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o -libperf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o +perf-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o +perf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o -libperf-$(CONFIG_AUXTRACE) += pmu.o auxtrace.o cs-etm.o +perf-$(CONFIG_AUXTRACE) += pmu.o auxtrace.o cs-etm.o diff --git a/tools/perf/arch/arm/util/cs-etm.c b/tools/perf/arch/arm/util/cs-etm.c index 2f595cd73da6..911426721170 100644 --- a/tools/perf/arch/arm/util/cs-etm.c +++ b/tools/perf/arch/arm/util/cs-etm.c @@ -5,6 +5,7 @@ */ #include <api/fs/fs.h> +#include <linux/bits.h> #include <linux/bitops.h> #include <linux/compiler.h> #include <linux/coresight-pmu.h> @@ -22,12 +23,10 @@ #include "../../util/thread_map.h" #include "../../util/cs-etm.h" +#include <errno.h> #include <stdlib.h> #include <sys/stat.h> -#define ENABLE_SINK_MAX 128 -#define CS_BUS_DEVICE_PATH "/bus/coresight/devices/" - struct cs_etm_recording { struct auxtrace_record itr; struct perf_pmu *cs_etm_pmu; @@ -60,10 +59,48 @@ static int cs_etm_parse_snapshot_options(struct auxtrace_record *itr, return 0; } +static int cs_etm_set_sink_attr(struct perf_pmu *pmu, + struct perf_evsel *evsel) +{ + char msg[BUFSIZ], path[PATH_MAX], *sink; + struct perf_evsel_config_term *term; + int ret = -EINVAL; + u32 hash; + + if (evsel->attr.config2 & GENMASK(31, 0)) + return 0; + + list_for_each_entry(term, &evsel->config_terms, list) { + if (term->type != PERF_EVSEL__CONFIG_TERM_DRV_CFG) + continue; + + sink = term->val.drv_cfg; + snprintf(path, PATH_MAX, "sinks/%s", sink); + + ret = perf_pmu__scan_file(pmu, path, "%x", &hash); + if (ret != 1) { + pr_err("failed to set sink \"%s\" on event %s with %d (%s)\n", + sink, perf_evsel__name(evsel), errno, + str_error_r(errno, msg, sizeof(msg))); + return ret; + } + + evsel->attr.config2 |= hash; + return 0; + } + + /* + * No sink was provided on the command line - for _now_ treat + * this as an error. + */ + return ret; +} + static int cs_etm_recording_options(struct auxtrace_record *itr, struct perf_evlist *evlist, struct record_opts *opts) { + int ret; struct cs_etm_recording *ptr = container_of(itr, struct cs_etm_recording, itr); struct perf_pmu *cs_etm_pmu = ptr->cs_etm_pmu; @@ -92,6 +129,10 @@ static int cs_etm_recording_options(struct auxtrace_record *itr, if (!cs_etm_evsel) return 0; + ret = cs_etm_set_sink_attr(cs_etm_pmu, cs_etm_evsel); + if (ret) + return ret; + if (opts->use_clockid) { pr_err("Cannot use clockid (-k option) with %s\n", CORESIGHT_ETM_PMU_NAME); @@ -598,54 +639,3 @@ struct auxtrace_record *cs_etm_record_init(int *err) out: return NULL; } - -static FILE *cs_device__open_file(const char *name) -{ - struct stat st; - char path[PATH_MAX]; - const char *sysfs; - - sysfs = sysfs__mountpoint(); - if (!sysfs) - return NULL; - - snprintf(path, PATH_MAX, - "%s" CS_BUS_DEVICE_PATH "%s", sysfs, name); - - if (stat(path, &st) < 0) - return NULL; - - return fopen(path, "w"); - -} - -static int __printf(2, 3) cs_device__print_file(const char *name, const char *fmt, ...) -{ - va_list args; - FILE *file; - int ret = -EINVAL; - - va_start(args, fmt); - file = cs_device__open_file(name); - if (file) { - ret = vfprintf(file, fmt, args); - fclose(file); - } - va_end(args); - return ret; -} - -int cs_etm_set_drv_config(struct perf_evsel_config_term *term) -{ - int ret; - char enable_sink[ENABLE_SINK_MAX]; - - snprintf(enable_sink, ENABLE_SINK_MAX, "%s/%s", - term->val.drv_cfg, "enable_sink"); - - ret = cs_device__print_file(enable_sink, "%d", 1); - if (ret < 0) - return ret; - - return 0; -} diff --git a/tools/perf/arch/arm/util/cs-etm.h b/tools/perf/arch/arm/util/cs-etm.h index 1a12e64f5127..a3354bda4fe8 100644 --- a/tools/perf/arch/arm/util/cs-etm.h +++ b/tools/perf/arch/arm/util/cs-etm.h @@ -7,9 +7,6 @@ #ifndef INCLUDE__PERF_CS_ETM_H__ #define INCLUDE__PERF_CS_ETM_H__ -#include "../../util/evsel.h" - struct auxtrace_record *cs_etm_record_init(int *err); -int cs_etm_set_drv_config(struct perf_evsel_config_term *term); #endif diff --git a/tools/perf/arch/arm/util/pmu.c b/tools/perf/arch/arm/util/pmu.c index e047571e6080..bbc297a7e2e3 100644 --- a/tools/perf/arch/arm/util/pmu.c +++ b/tools/perf/arch/arm/util/pmu.c @@ -7,8 +7,8 @@ #include <string.h> #include <linux/coresight-pmu.h> #include <linux/perf_event.h> +#include <linux/string.h> -#include "cs-etm.h" #include "arm-spe.h" #include "../../util/pmu.h" @@ -19,7 +19,6 @@ struct perf_event_attr if (!strcmp(pmu->name, CORESIGHT_ETM_PMU_NAME)) { /* add ETM default config here */ pmu->selectable = true; - pmu->set_drv_config = cs_etm_set_drv_config; #if defined(__aarch64__) } else if (strstarts(pmu->name, ARM_SPE_PMU_NAME)) { return arm_spe_pmu_default_config(pmu); diff --git a/tools/perf/arch/arm64/Build b/tools/perf/arch/arm64/Build index 41bf61da476a..36222e64bbf7 100644 --- a/tools/perf/arch/arm64/Build +++ b/tools/perf/arch/arm64/Build @@ -1,2 +1,2 @@ -libperf-y += util/ -libperf-$(CONFIG_DWARF_UNWIND) += tests/ +perf-y += util/ +perf-$(CONFIG_DWARF_UNWIND) += tests/ diff --git a/tools/perf/arch/arm64/tests/Build b/tools/perf/arch/arm64/tests/Build index 883c57ff0c08..41707fea74b3 100644 --- a/tools/perf/arch/arm64/tests/Build +++ b/tools/perf/arch/arm64/tests/Build @@ -1,4 +1,4 @@ -libperf-y += regs_load.o -libperf-y += dwarf-unwind.o +perf-y += regs_load.o +perf-y += dwarf-unwind.o -libperf-y += arch-tests.o +perf-y += arch-tests.o diff --git a/tools/perf/arch/arm64/tests/dwarf-unwind.c b/tools/perf/arch/arm64/tests/dwarf-unwind.c index 5522ce384723..a6a407fa1b8b 100644 --- a/tools/perf/arch/arm64/tests/dwarf-unwind.c +++ b/tools/perf/arch/arm64/tests/dwarf-unwind.c @@ -3,6 +3,7 @@ #include "perf_regs.h" #include "thread.h" #include "map.h" +#include "map_groups.h" #include "event.h" #include "debug.h" #include "tests/tests.h" diff --git a/tools/perf/arch/arm64/util/Build b/tools/perf/arch/arm64/util/Build index 68f8a8eb3ad0..3cde540d2fcf 100644 --- a/tools/perf/arch/arm64/util/Build +++ b/tools/perf/arch/arm64/util/Build @@ -1,10 +1,10 @@ -libperf-y += header.o -libperf-y += sym-handling.o -libperf-$(CONFIG_DWARF) += dwarf-regs.o -libperf-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o -libperf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o +perf-y += header.o +perf-y += sym-handling.o +perf-$(CONFIG_DWARF) += dwarf-regs.o +perf-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o +perf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o -libperf-$(CONFIG_AUXTRACE) += ../../arm/util/pmu.o \ +perf-$(CONFIG_AUXTRACE) += ../../arm/util/pmu.o \ ../../arm/util/auxtrace.o \ ../../arm/util/cs-etm.o \ arm-spe.o diff --git a/tools/perf/arch/nds32/Build b/tools/perf/arch/nds32/Build index 54afe4a467e7..e4e5f33c84d8 100644 --- a/tools/perf/arch/nds32/Build +++ b/tools/perf/arch/nds32/Build @@ -1 +1 @@ -libperf-y += util/ +perf-y += util/ diff --git a/tools/perf/arch/nds32/util/Build b/tools/perf/arch/nds32/util/Build index ca623bbf993c..d0bc205fe49a 100644 --- a/tools/perf/arch/nds32/util/Build +++ b/tools/perf/arch/nds32/util/Build @@ -1 +1 @@ -libperf-y += header.o +perf-y += header.o diff --git a/tools/perf/arch/powerpc/Build b/tools/perf/arch/powerpc/Build index db52fa22d3a1..a7dd46a5b678 100644 --- a/tools/perf/arch/powerpc/Build +++ b/tools/perf/arch/powerpc/Build @@ -1,2 +1,2 @@ -libperf-y += util/ -libperf-y += tests/ +perf-y += util/ +perf-y += tests/ diff --git a/tools/perf/arch/powerpc/tests/Build b/tools/perf/arch/powerpc/tests/Build index d827ef384b33..3526ab0af9f9 100644 --- a/tools/perf/arch/powerpc/tests/Build +++ b/tools/perf/arch/powerpc/tests/Build @@ -1,4 +1,4 @@ -libperf-$(CONFIG_DWARF_UNWIND) += regs_load.o -libperf-$(CONFIG_DWARF_UNWIND) += dwarf-unwind.o +perf-$(CONFIG_DWARF_UNWIND) += regs_load.o +perf-$(CONFIG_DWARF_UNWIND) += dwarf-unwind.o -libperf-y += arch-tests.o +perf-y += arch-tests.o diff --git a/tools/perf/arch/powerpc/tests/dwarf-unwind.c b/tools/perf/arch/powerpc/tests/dwarf-unwind.c index 5f39efef0856..5c178e4a1995 100644 --- a/tools/perf/arch/powerpc/tests/dwarf-unwind.c +++ b/tools/perf/arch/powerpc/tests/dwarf-unwind.c @@ -3,6 +3,7 @@ #include "perf_regs.h" #include "thread.h" #include "map.h" +#include "map_groups.h" #include "event.h" #include "debug.h" #include "tests/tests.h" diff --git a/tools/perf/arch/powerpc/util/Build b/tools/perf/arch/powerpc/util/Build index ba98bd006488..7cf0b8803097 100644 --- a/tools/perf/arch/powerpc/util/Build +++ b/tools/perf/arch/powerpc/util/Build @@ -1,11 +1,11 @@ -libperf-y += header.o -libperf-y += sym-handling.o -libperf-y += kvm-stat.o -libperf-y += perf_regs.o -libperf-y += mem-events.o +perf-y += header.o +perf-y += sym-handling.o +perf-y += kvm-stat.o +perf-y += perf_regs.o +perf-y += mem-events.o -libperf-$(CONFIG_DWARF) += dwarf-regs.o -libperf-$(CONFIG_DWARF) += skip-callchain-idx.o +perf-$(CONFIG_DWARF) += dwarf-regs.o +perf-$(CONFIG_DWARF) += skip-callchain-idx.o -libperf-$(CONFIG_LIBUNWIND) += unwind-libunwind.o -libperf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o +perf-$(CONFIG_LIBUNWIND) += unwind-libunwind.o +perf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o diff --git a/tools/perf/arch/powerpc/util/kvm-stat.c b/tools/perf/arch/powerpc/util/kvm-stat.c index 596ad6aedaac..f9db341c47b6 100644 --- a/tools/perf/arch/powerpc/util/kvm-stat.c +++ b/tools/perf/arch/powerpc/util/kvm-stat.c @@ -3,6 +3,8 @@ #include "util/kvm-stat.h" #include "util/parse-events.h" #include "util/debug.h" +#include "util/evsel.h" +#include "util/evlist.h" #include "book3s_hv_exits.h" #include "book3s_hcalls.h" diff --git a/tools/perf/arch/powerpc/util/skip-callchain-idx.c b/tools/perf/arch/powerpc/util/skip-callchain-idx.c index 7c6eeb4633fe..2918bb16c892 100644 --- a/tools/perf/arch/powerpc/util/skip-callchain-idx.c +++ b/tools/perf/arch/powerpc/util/skip-callchain-idx.c @@ -16,6 +16,9 @@ #include "util/thread.h" #include "util/callchain.h" #include "util/debug.h" +#include "util/dso.h" +#include "util/map.h" +#include "util/symbol.h" /* * When saving the callchain on Power, the kernel conservatively saves diff --git a/tools/perf/arch/s390/Build b/tools/perf/arch/s390/Build index 54afe4a467e7..e4e5f33c84d8 100644 --- a/tools/perf/arch/s390/Build +++ b/tools/perf/arch/s390/Build @@ -1 +1 @@ -libperf-y += util/ +perf-y += util/ diff --git a/tools/perf/arch/s390/util/Build b/tools/perf/arch/s390/util/Build index 4a233683c684..22797f043b84 100644 --- a/tools/perf/arch/s390/util/Build +++ b/tools/perf/arch/s390/util/Build @@ -1,9 +1,9 @@ -libperf-y += header.o -libperf-y += kvm-stat.o +perf-y += header.o +perf-y += kvm-stat.o -libperf-$(CONFIG_DWARF) += dwarf-regs.o -libperf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o +perf-$(CONFIG_DWARF) += dwarf-regs.o +perf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o -libperf-y += machine.o +perf-y += machine.o -libperf-$(CONFIG_AUXTRACE) += auxtrace.o +perf-$(CONFIG_AUXTRACE) += auxtrace.o diff --git a/tools/perf/arch/s390/util/kvm-stat.c b/tools/perf/arch/s390/util/kvm-stat.c index aaabab5e2830..7e3961a4b292 100644 --- a/tools/perf/arch/s390/util/kvm-stat.c +++ b/tools/perf/arch/s390/util/kvm-stat.c @@ -11,6 +11,7 @@ #include <errno.h> #include "../../util/kvm-stat.h" +#include "../../util/evsel.h" #include <asm/sie.h> define_exit_reasons_table(sie_exit_reasons, sie_intercept_code); diff --git a/tools/perf/arch/sh/Build b/tools/perf/arch/sh/Build index 54afe4a467e7..e4e5f33c84d8 100644 --- a/tools/perf/arch/sh/Build +++ b/tools/perf/arch/sh/Build @@ -1 +1 @@ -libperf-y += util/ +perf-y += util/ diff --git a/tools/perf/arch/sh/util/Build b/tools/perf/arch/sh/util/Build index 954e287bbb89..e813e618954b 100644 --- a/tools/perf/arch/sh/util/Build +++ b/tools/perf/arch/sh/util/Build @@ -1 +1 @@ -libperf-$(CONFIG_DWARF) += dwarf-regs.o +perf-$(CONFIG_DWARF) += dwarf-regs.o diff --git a/tools/perf/arch/sparc/Build b/tools/perf/arch/sparc/Build index 54afe4a467e7..e4e5f33c84d8 100644 --- a/tools/perf/arch/sparc/Build +++ b/tools/perf/arch/sparc/Build @@ -1 +1 @@ -libperf-y += util/ +perf-y += util/ diff --git a/tools/perf/arch/sparc/util/Build b/tools/perf/arch/sparc/util/Build index 954e287bbb89..e813e618954b 100644 --- a/tools/perf/arch/sparc/util/Build +++ b/tools/perf/arch/sparc/util/Build @@ -1 +1 @@ -libperf-$(CONFIG_DWARF) += dwarf-regs.o +perf-$(CONFIG_DWARF) += dwarf-regs.o diff --git a/tools/perf/arch/x86/Build b/tools/perf/arch/x86/Build index db52fa22d3a1..a7dd46a5b678 100644 --- a/tools/perf/arch/x86/Build +++ b/tools/perf/arch/x86/Build @@ -1,2 +1,2 @@ -libperf-y += util/ -libperf-y += tests/ +perf-y += util/ +perf-y += tests/ diff --git a/tools/perf/arch/x86/tests/Build b/tools/perf/arch/x86/tests/Build index 586849ff83a0..3d83d0c6982d 100644 --- a/tools/perf/arch/x86/tests/Build +++ b/tools/perf/arch/x86/tests/Build @@ -1,8 +1,8 @@ -libperf-$(CONFIG_DWARF_UNWIND) += regs_load.o -libperf-$(CONFIG_DWARF_UNWIND) += dwarf-unwind.o +perf-$(CONFIG_DWARF_UNWIND) += regs_load.o +perf-$(CONFIG_DWARF_UNWIND) += dwarf-unwind.o -libperf-y += arch-tests.o -libperf-y += rdpmc.o -libperf-y += perf-time-to-tsc.o -libperf-$(CONFIG_AUXTRACE) += insn-x86.o -libperf-$(CONFIG_X86_64) += bp-modify.o +perf-y += arch-tests.o +perf-y += rdpmc.o +perf-y += perf-time-to-tsc.o +perf-$(CONFIG_AUXTRACE) += insn-x86.o +perf-$(CONFIG_X86_64) += bp-modify.o diff --git a/tools/perf/arch/x86/tests/dwarf-unwind.c b/tools/perf/arch/x86/tests/dwarf-unwind.c index 7879df34569a..6ad0a1cedb13 100644 --- a/tools/perf/arch/x86/tests/dwarf-unwind.c +++ b/tools/perf/arch/x86/tests/dwarf-unwind.c @@ -3,6 +3,7 @@ #include "perf_regs.h" #include "thread.h" #include "map.h" +#include "map_groups.h" #include "event.h" #include "debug.h" #include "tests/tests.h" diff --git a/tools/perf/arch/x86/util/Build b/tools/perf/arch/x86/util/Build index 844b8f335532..7aab0be5fc5f 100644 --- a/tools/perf/arch/x86/util/Build +++ b/tools/perf/arch/x86/util/Build @@ -1,18 +1,18 @@ -libperf-y += header.o -libperf-y += tsc.o -libperf-y += pmu.o -libperf-y += kvm-stat.o -libperf-y += perf_regs.o -libperf-y += group.o -libperf-y += machine.o -libperf-y += event.o +perf-y += header.o +perf-y += tsc.o +perf-y += pmu.o +perf-y += kvm-stat.o +perf-y += perf_regs.o +perf-y += group.o +perf-y += machine.o +perf-y += event.o -libperf-$(CONFIG_DWARF) += dwarf-regs.o -libperf-$(CONFIG_BPF_PROLOGUE) += dwarf-regs.o +perf-$(CONFIG_DWARF) += dwarf-regs.o +perf-$(CONFIG_BPF_PROLOGUE) += dwarf-regs.o -libperf-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o -libperf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o +perf-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o +perf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o -libperf-$(CONFIG_AUXTRACE) += auxtrace.o -libperf-$(CONFIG_AUXTRACE) += intel-pt.o -libperf-$(CONFIG_AUXTRACE) += intel-bts.o +perf-$(CONFIG_AUXTRACE) += auxtrace.o +perf-$(CONFIG_AUXTRACE) += intel-pt.o +perf-$(CONFIG_AUXTRACE) += intel-bts.o diff --git a/tools/perf/arch/x86/util/kvm-stat.c b/tools/perf/arch/x86/util/kvm-stat.c index 081353d7b095..865a9762f22e 100644 --- a/tools/perf/arch/x86/util/kvm-stat.c +++ b/tools/perf/arch/x86/util/kvm-stat.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <errno.h> #include "../../util/kvm-stat.h" +#include "../../util/evsel.h" #include <asm/svm.h> #include <asm/vmx.h> #include <asm/kvm.h> diff --git a/tools/perf/arch/xtensa/Build b/tools/perf/arch/xtensa/Build index 54afe4a467e7..e4e5f33c84d8 100644 --- a/tools/perf/arch/xtensa/Build +++ b/tools/perf/arch/xtensa/Build @@ -1 +1 @@ -libperf-y += util/ +perf-y += util/ diff --git a/tools/perf/arch/xtensa/util/Build b/tools/perf/arch/xtensa/util/Build index 954e287bbb89..e813e618954b 100644 --- a/tools/perf/arch/xtensa/util/Build +++ b/tools/perf/arch/xtensa/util/Build @@ -1 +1 @@ -libperf-$(CONFIG_DWARF) += dwarf-regs.o +perf-$(CONFIG_DWARF) += dwarf-regs.o diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c index 44195514b19e..98ad783efc69 100644 --- a/tools/perf/bench/numa.c +++ b/tools/perf/bench/numa.c @@ -34,6 +34,7 @@ #include <sys/types.h> #include <linux/kernel.h> #include <linux/time64.h> +#include <linux/numa.h> #include <numa.h> #include <numaif.h> @@ -298,7 +299,7 @@ static cpu_set_t bind_to_node(int target_node) CPU_ZERO(&mask); - if (target_node == -1) { + if (target_node == NUMA_NO_NODE) { for (cpu = 0; cpu < g->p.nr_cpus; cpu++) CPU_SET(cpu, &mask); } else { @@ -339,7 +340,7 @@ static void bind_to_memnode(int node) unsigned long nodemask; int ret; - if (node == -1) + if (node == NUMA_NO_NODE) return; BUG_ON(g->p.nr_nodes > (int)sizeof(nodemask)*8); @@ -1363,7 +1364,7 @@ static void init_thread_data(void) int cpu; /* Allow all nodes by default: */ - td->bind_node = -1; + td->bind_node = NUMA_NO_NODE; /* Allow all CPUs by default: */ CPU_ZERO(&td->bind_cpumask); diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 93d679eaf1f4..67f9d9ffacfb 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -27,6 +27,7 @@ #include "util/thread.h" #include "util/sort.h" #include "util/hist.h" +#include "util/map.h" #include "util/session.h" #include "util/tool.h" #include "util/data.h" @@ -227,7 +228,7 @@ static int perf_evsel__add_sample(struct perf_evsel *evsel, * the DSO? */ if (al->sym != NULL) { - rb_erase(&al->sym->rb_node, + rb_erase_cached(&al->sym->rb_node, &al->map->dso->symbols); symbol__delete(al->sym); dso__reset_find_symbol_cache(al->map->dso); @@ -305,7 +306,7 @@ static void hists__find_annotations(struct hists *hists, struct perf_evsel *evsel, struct perf_annotate *ann) { - struct rb_node *nd = rb_first(&hists->entries), *next; + struct rb_node *nd = rb_first_cached(&hists->entries), *next; int key = K_RIGHT; while (nd) { @@ -440,7 +441,7 @@ static int __cmd_annotate(struct perf_annotate *ann) } if (total_nr_samples == 0) { - ui__error("The %s file has no samples!\n", session->data->file.path); + ui__error("The %s data has no samples!\n", session->data->path); goto out; } @@ -577,7 +578,7 @@ int cmd_annotate(int argc, const char **argv) if (quiet) perf_quiet_option(); - data.file.path = input_name; + data.path = input_name; annotate.session = perf_session__new(&data, false, &annotate.tool); if (annotate.session == NULL) diff --git a/tools/perf/builtin-buildid-cache.c b/tools/perf/builtin-buildid-cache.c index 115110a4796a..10457b10e568 100644 --- a/tools/perf/builtin-buildid-cache.c +++ b/tools/perf/builtin-buildid-cache.c @@ -416,8 +416,8 @@ int cmd_buildid_cache(int argc, const char **argv) nsi = nsinfo__new(ns_id); if (missing_filename) { - data.file.path = missing_filename; - data.force = force; + data.path = missing_filename; + data.force = force; session = perf_session__new(&data, false, NULL); if (session == NULL) diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c index 78abbe8d9d5f..f403e19488b5 100644 --- a/tools/perf/builtin-buildid-list.c +++ b/tools/perf/builtin-buildid-list.c @@ -52,11 +52,9 @@ static int perf_session__list_build_ids(bool force, bool with_hits) { struct perf_session *session; struct perf_data data = { - .file = { - .path = input_name, - }, - .mode = PERF_DATA_MODE_READ, - .force = force, + .path = input_name, + .mode = PERF_DATA_MODE_READ, + .force = force, }; symbol__elf_init(); diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index d340d2e42776..4272763a5e96 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -33,6 +33,7 @@ #include "ui/browsers/hists.h" #include "thread.h" #include "mem2node.h" +#include "symbol.h" struct c2c_hists { struct hists hists; @@ -1969,7 +1970,7 @@ static void calc_width(struct c2c_hist_entry *c2c_he) set_nodestr(c2c_he); } -static int filter_cb(struct hist_entry *he) +static int filter_cb(struct hist_entry *he, void *arg __maybe_unused) { struct c2c_hist_entry *c2c_he; @@ -1986,7 +1987,7 @@ static int filter_cb(struct hist_entry *he) return 0; } -static int resort_cl_cb(struct hist_entry *he) +static int resort_cl_cb(struct hist_entry *he, void *arg __maybe_unused) { struct c2c_hist_entry *c2c_he; struct c2c_hists *c2c_hists; @@ -2073,7 +2074,7 @@ static int setup_nodes(struct perf_session *session) #define HAS_HITMS(__h) ((__h)->stats.lcl_hitm || (__h)->stats.rmt_hitm) -static int resort_hitm_cb(struct hist_entry *he) +static int resort_hitm_cb(struct hist_entry *he, void *arg __maybe_unused) { struct c2c_hist_entry *c2c_he; c2c_he = container_of(he, struct c2c_hist_entry, he); @@ -2088,14 +2089,14 @@ static int resort_hitm_cb(struct hist_entry *he) static int hists__iterate_cb(struct hists *hists, hists__resort_cb_t cb) { - struct rb_node *next = rb_first(&hists->entries); + struct rb_node *next = rb_first_cached(&hists->entries); int ret = 0; while (next) { struct hist_entry *he; he = rb_entry(next, struct hist_entry, rb_node); - ret = cb(he); + ret = cb(he, NULL); if (ret) break; next = rb_next(&he->rb_node); @@ -2215,7 +2216,7 @@ static void print_pareto(FILE *out) if (WARN_ONCE(ret, "failed to setup sort entries\n")) return; - nd = rb_first(&c2c.hists.hists.entries); + nd = rb_first_cached(&c2c.hists.hists.entries); for (; nd; nd = rb_next(nd)) { struct hist_entry *he = rb_entry(nd, struct hist_entry, rb_node); @@ -2283,7 +2284,7 @@ static void perf_c2c__hists_fprintf(FILE *out, struct perf_session *session) static void c2c_browser__update_nr_entries(struct hist_browser *hb) { u64 nr_entries = 0; - struct rb_node *nd = rb_first(&hb->hists->entries); + struct rb_node *nd = rb_first_cached(&hb->hists->entries); while (nd) { struct hist_entry *he = rb_entry(nd, struct hist_entry, rb_node); @@ -2343,7 +2344,7 @@ static int perf_c2c__browse_cacheline(struct hist_entry *he) struct c2c_cacheline_browser *cl_browser; struct hist_browser *browser; int key = -1; - const char help[] = + static const char help[] = " ENTER Toggle callchains (if present) \n" " n Toggle Node details info \n" " s Toggle full length of symbol and source line columns \n" @@ -2424,7 +2425,7 @@ static int perf_c2c__hists_browse(struct hists *hists) { struct hist_browser *browser; int key = -1; - const char help[] = + static const char help[] = " d Display cacheline details \n" " ENTER Toggle callchains (if present) \n" " q Quit \n"; @@ -2749,8 +2750,8 @@ static int perf_c2c__report(int argc, const char **argv) if (!input_name || !strlen(input_name)) input_name = "perf.data"; - data.file.path = input_name; - data.force = symbol_conf.force; + data.path = input_name; + data.force = symbol_conf.force; err = setup_display(display); if (err) diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index 39db2ee32d48..58fe0e88215c 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -429,7 +429,7 @@ get_pair_fmt(struct hist_entry *he, struct diff_hpp_fmt *dfmt) static void hists__baseline_only(struct hists *hists) { - struct rb_root *root; + struct rb_root_cached *root; struct rb_node *next; if (hists__has(hists, need_collapse)) @@ -437,13 +437,13 @@ static void hists__baseline_only(struct hists *hists) else root = hists->entries_in; - next = rb_first(root); + next = rb_first_cached(root); while (next != NULL) { struct hist_entry *he = rb_entry(next, struct hist_entry, rb_node_in); next = rb_next(&he->rb_node_in); if (!hist_entry__next_pair(he)) { - rb_erase(&he->rb_node_in, root); + rb_erase_cached(&he->rb_node_in, root); hist_entry__delete(he); } } @@ -451,7 +451,7 @@ static void hists__baseline_only(struct hists *hists) static void hists__precompute(struct hists *hists) { - struct rb_root *root; + struct rb_root_cached *root; struct rb_node *next; if (hists__has(hists, need_collapse)) @@ -459,7 +459,7 @@ static void hists__precompute(struct hists *hists) else root = hists->entries_in; - next = rb_first(root); + next = rb_first_cached(root); while (next != NULL) { struct hist_entry *he, *pair; struct data__file *d; @@ -708,7 +708,7 @@ static void data__fprintf(void) data__for_each_file(i, d) fprintf(stdout, "# [%d] %s %s\n", - d->idx, d->data.file.path, + d->idx, d->data.path, !d->idx ? "(Baseline)" : ""); fprintf(stdout, "#\n"); @@ -779,14 +779,14 @@ static int __cmd_diff(void) data__for_each_file(i, d) { d->session = perf_session__new(&d->data, false, &tool); if (!d->session) { - pr_err("Failed to open %s\n", d->data.file.path); + pr_err("Failed to open %s\n", d->data.path); ret = -1; goto out_delete; } ret = perf_session__process_events(d->session); if (ret) { - pr_err("Failed to process %s\n", d->data.file.path); + pr_err("Failed to process %s\n", d->data.path); goto out_delete; } @@ -1289,9 +1289,9 @@ static int data_init(int argc, const char **argv) data__for_each_file(i, d) { struct perf_data *data = &d->data; - data->file.path = use_default ? defaults[i] : argv[i]; - data->mode = PERF_DATA_MODE_READ, - data->force = force, + data->path = use_default ? defaults[i] : argv[i]; + data->mode = PERF_DATA_MODE_READ, + data->force = force, d->idx = i; } diff --git a/tools/perf/builtin-evlist.c b/tools/perf/builtin-evlist.c index e06e822ce634..6e4f63b0da4a 100644 --- a/tools/perf/builtin-evlist.c +++ b/tools/perf/builtin-evlist.c @@ -23,9 +23,7 @@ static int __cmd_evlist(const char *file_name, struct perf_attr_details *details struct perf_session *session; struct perf_evsel *pos; struct perf_data data = { - .file = { - .path = file_name, - }, + .path = file_name, .mode = PERF_DATA_MODE_READ, .force = details->force, }; diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index eda41673c4f3..24086b7f1b14 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -12,6 +12,7 @@ #include "util/color.h" #include "util/evlist.h" #include "util/evsel.h" +#include "util/map.h" #include "util/session.h" #include "util/tool.h" #include "util/debug.h" @@ -19,6 +20,7 @@ #include "util/data.h" #include "util/auxtrace.h" #include "util/jit.h" +#include "util/symbol.h" #include "util/thread.h" #include <subcmd/parse-options.h> @@ -768,10 +770,8 @@ int cmd_inject(int argc, const char **argv) .input_name = "-", .samples = LIST_HEAD_INIT(inject.samples), .output = { - .file = { - .path = "-", - }, - .mode = PERF_DATA_MODE_WRITE, + .path = "-", + .mode = PERF_DATA_MODE_WRITE, }, }; struct perf_data data = { @@ -784,7 +784,7 @@ int cmd_inject(int argc, const char **argv) "Inject build-ids into the output stream"), OPT_STRING('i', "input", &inject.input_name, "file", "input file name"), - OPT_STRING('o', "output", &inject.output.file.path, "file", + OPT_STRING('o', "output", &inject.output.path, "file", "output file name"), OPT_BOOLEAN('s', "sched-stat", &inject.sched_stat, "Merge sched-stat and sched-switch for getting events " @@ -832,7 +832,7 @@ int cmd_inject(int argc, const char **argv) inject.tool.ordered_events = inject.sched_stat; - data.file.path = inject.input_name; + data.path = inject.input_name; inject.session = perf_session__new(&data, true, &inject.tool); if (inject.session == NULL) return -1; diff --git a/tools/perf/builtin-kallsyms.c b/tools/perf/builtin-kallsyms.c index 90d1a2305b72..bc7a2bc7aed7 100644 --- a/tools/perf/builtin-kallsyms.c +++ b/tools/perf/builtin-kallsyms.c @@ -13,6 +13,7 @@ #include <subcmd/parse-options.h> #include "debug.h" #include "machine.h" +#include "map.h" #include "symbol.h" static int __cmd_kallsyms(int argc, const char **argv) diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index b63bca4b0c2a..fa520f4b8095 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -6,6 +6,7 @@ #include "util/evsel.h" #include "util/util.h" #include "util/config.h" +#include "util/map.h" #include "util/symbol.h" #include "util/thread.h" #include "util/header.h" @@ -334,7 +335,7 @@ static int build_alloc_func_list(void) struct alloc_func *func; struct machine *machine = &kmem_session->machines.host; regex_t alloc_func_regex; - const char pattern[] = "^_?_?(alloc|get_free|get_zeroed)_pages?"; + static const char pattern[] = "^_?_?(alloc|get_free|get_zeroed)_pages?"; ret = regcomp(&alloc_func_regex, pattern, REG_EXTENDED); if (ret) { @@ -1924,7 +1925,7 @@ int cmd_kmem(int argc, const char **argv) NULL }; struct perf_session *session; - const char errmsg[] = "No %s allocation events found. Have you run 'perf kmem record --%s'?\n"; + static const char errmsg[] = "No %s allocation events found. Have you run 'perf kmem record --%s'?\n"; int ret = perf_config(kmem_config, NULL); if (ret) @@ -1948,7 +1949,7 @@ int cmd_kmem(int argc, const char **argv) return __cmd_record(argc, argv); } - data.file.path = input_name; + data.path = input_name; kmem_session = session = perf_session__new(&data, false, &perf_kmem); if (session == NULL) diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index 3d4cbc4e87c7..dbb6f737a3e2 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -1080,11 +1080,9 @@ static int read_events(struct perf_kvm_stat *kvm) .ordered_events = true, }; struct perf_data file = { - .file = { - .path = kvm->file_name, - }, - .mode = PERF_DATA_MODE_READ, - .force = kvm->force, + .path = kvm->file_name, + .mode = PERF_DATA_MODE_READ, + .force = kvm->force, }; kvm->tool = eops; diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c index ead221e49f00..c9f98d00c0e9 100644 --- a/tools/perf/builtin-list.c +++ b/tools/perf/builtin-list.c @@ -82,9 +82,9 @@ int cmd_list(int argc, const char **argv) else if (strcmp(argv[i], "sdt") == 0) print_sdt_events(NULL, NULL, raw_dump); else if (strcmp(argv[i], "metric") == 0) - metricgroup__print(true, false, NULL, raw_dump); + metricgroup__print(true, false, NULL, raw_dump, details_flag); else if (strcmp(argv[i], "metricgroup") == 0) - metricgroup__print(false, true, NULL, raw_dump); + metricgroup__print(false, true, NULL, raw_dump, details_flag); else if ((sep = strchr(argv[i], ':')) != NULL) { int sep_idx; @@ -102,7 +102,7 @@ int cmd_list(int argc, const char **argv) s[sep_idx] = '\0'; print_tracepoint_events(s, s + sep_idx + 1, raw_dump); print_sdt_events(s, s + sep_idx + 1, raw_dump); - metricgroup__print(true, true, s, raw_dump); + metricgroup__print(true, true, s, raw_dump, details_flag); free(s); } else { if (asprintf(&s, "*%s*", argv[i]) < 0) { @@ -119,7 +119,7 @@ int cmd_list(int argc, const char **argv) details_flag); print_tracepoint_events(NULL, s, raw_dump); print_sdt_events(NULL, s, raw_dump); - metricgroup__print(true, true, NULL, raw_dump); + metricgroup__print(true, true, NULL, raw_dump, details_flag); free(s); } } diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 6e0189df2b3b..b9810a8d350a 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -866,11 +866,9 @@ static int __cmd_report(bool display_info) .ordered_events = true, }; struct perf_data data = { - .file = { - .path = input_name, - }, - .mode = PERF_DATA_MODE_READ, - .force = force, + .path = input_name, + .mode = PERF_DATA_MODE_READ, + .force = force, }; session = perf_session__new(&data, false, &eops); diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index 57393e94d156..f45c8b502f63 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -13,6 +13,7 @@ #include "util/data.h" #include "util/mem-events.h" #include "util/debug.h" +#include "util/map.h" #include "util/symbol.h" #define MEM_OPERATION_LOAD 0x1 @@ -238,11 +239,9 @@ static int process_sample_event(struct perf_tool *tool, static int report_raw_events(struct perf_mem *mem) { struct perf_data data = { - .file = { - .path = input_name, - }, - .mode = PERF_DATA_MODE_READ, - .force = mem->force, + .path = input_name, + .mode = PERF_DATA_MODE_READ, + .force = mem->force, }; int ret; struct perf_session *session = perf_session__new(&data, false, diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c index 99de91698de1..46d3c2deeb40 100644 --- a/tools/perf/builtin-probe.c +++ b/tools/perf/builtin-probe.c @@ -32,6 +32,7 @@ #include "perf.h" #include "builtin.h" +#include "namespaces.h" #include "util/util.h" #include "util/strlist.h" #include "util/strfilter.h" diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 882285fb9f64..f3f7f3100336 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -23,7 +23,6 @@ #include "util/evlist.h" #include "util/evsel.h" #include "util/debug.h" -#include "util/drv_configs.h" #include "util/session.h" #include "util/tool.h" #include "util/symbol.h" @@ -39,8 +38,10 @@ #include "util/bpf-loader.h" #include "util/trigger.h" #include "util/perf-hooks.h" +#include "util/cpu-set-sched.h" #include "util/time-utils.h" #include "util/units.h" +#include "util/bpf-event.h" #include "asm/bug.h" #include <errno.h> @@ -81,12 +82,17 @@ struct record { bool timestamp_boundary; struct switch_output switch_output; unsigned long long samples; + cpu_set_t affinity_mask; }; static volatile int auxtrace_record__snapshot_started; static DEFINE_TRIGGER(auxtrace_snapshot_trigger); static DEFINE_TRIGGER(switch_output_trigger); +static const char *affinity_tags[PERF_AFFINITY_MAX] = { + "SYS", "NODE", "CPU" +}; + static bool switch_output_signal(struct record *rec) { return rec->switch_output.signal && @@ -531,9 +537,13 @@ static int record__mmap_evlist(struct record *rec, struct record_opts *opts = &rec->opts; char msg[512]; + if (opts->affinity != PERF_AFFINITY_SYS) + cpu__setup_cpunode_map(); + if (perf_evlist__mmap_ex(evlist, opts->mmap_pages, opts->auxtrace_mmap_pages, - opts->auxtrace_snapshot_mode, opts->nr_cblocks) < 0) { + opts->auxtrace_snapshot_mode, + opts->nr_cblocks, opts->affinity) < 0) { if (errno == EPERM) { pr_err("Permission error mapping pages.\n" "Consider increasing " @@ -566,7 +576,6 @@ static int record__open(struct record *rec) struct perf_evlist *evlist = rec->evlist; struct perf_session *session = rec->session; struct record_opts *opts = &rec->opts; - struct perf_evsel_config_term *err_term; int rc = 0; /* @@ -619,14 +628,6 @@ try_again: goto out; } - if (perf_evlist__apply_drv_configs(evlist, &pos, &err_term)) { - pr_err("failed to set config \"%s\" on event %s with %d (%s)\n", - err_term->val.drv_cfg, perf_evsel__name(pos), errno, - str_error_r(errno, msg, sizeof(msg))); - rc = -1; - goto out; - } - rc = record__mmap(rec); if (rc) goto out; @@ -659,10 +660,9 @@ static int process_sample_event(struct perf_tool *tool, static int process_buildids(struct record *rec) { - struct perf_data *data = &rec->data; struct perf_session *session = rec->session; - if (data->size == 0) + if (perf_data__size(&rec->data) == 0) return 0; /* @@ -722,6 +722,16 @@ static struct perf_event_header finished_round_event = { .type = PERF_RECORD_FINISHED_ROUND, }; +static void record__adjust_affinity(struct record *rec, struct perf_mmap *map) +{ + if (rec->opts.affinity != PERF_AFFINITY_SYS && + !CPU_EQUAL(&rec->affinity_mask, &map->affinity_mask)) { + CPU_ZERO(&rec->affinity_mask); + CPU_OR(&rec->affinity_mask, &rec->affinity_mask, &map->affinity_mask); + sched_setaffinity(0, sizeof(rec->affinity_mask), &rec->affinity_mask); + } +} + static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evlist, bool overwrite) { @@ -749,6 +759,7 @@ static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evli struct perf_mmap *map = &maps[i]; if (map->base) { + record__adjust_affinity(rec, map); if (!record__aio_enabled(rec)) { if (perf_mmap__push(map, rec, record__pushfn) != 0) { rc = -1; @@ -839,7 +850,7 @@ record__finish_output(struct record *rec) return; rec->session->header.data_size += rec->bytes_written; - data->size = lseek(perf_data__fd(data), 0, SEEK_CUR); + data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR); if (!rec->no_buildid) { process_buildids(rec); @@ -907,7 +918,7 @@ record__switch_output(struct record *rec, bool at_exit) if (!quiet) fprintf(stderr, "[ perf record: Dump %s.%s ]\n", - data->file.path, timestamp); + data->path, timestamp); /* Output tracking events */ if (!at_exit) { @@ -1082,6 +1093,11 @@ static int record__synthesize(struct record *rec, bool tail) return err; } + err = perf_event__synthesize_bpf_events(tool, process_synthesized_event, + machine, opts); + if (err < 0) + pr_warning("Couldn't synthesize bpf events.\n"); + err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads, process_synthesized_event, opts->sample_address, 1); @@ -1445,7 +1461,7 @@ out_child: fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s%s%s ]\n", perf_data__size(data) / 1024.0 / 1024.0, - data->file.path, postfix, samples); + data->path, postfix, samples); } out_delete_session: @@ -1639,6 +1655,21 @@ static int parse_clockid(const struct option *opt, const char *str, int unset) return -1; } +static int record__parse_affinity(const struct option *opt, const char *str, int unset) +{ + struct record_opts *opts = (struct record_opts *)opt->value; + + if (unset || !str) + return 0; + + if (!strcasecmp(str, "node")) + opts->affinity = PERF_AFFINITY_NODE; + else if (!strcasecmp(str, "cpu")) + opts->affinity = PERF_AFFINITY_CPU; + + return 0; +} + static int record__parse_mmap_pages(const struct option *opt, const char *str, int unset __maybe_unused) @@ -1831,7 +1862,7 @@ static struct option __record_options[] = { OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu", "list of cpus to monitor"), OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"), - OPT_STRING('o', "output", &record.data.file.path, "file", + OPT_STRING('o', "output", &record.data.path, "file", "output file name"), OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit, &record.opts.no_inherit_set, @@ -1839,6 +1870,7 @@ static struct option __record_options[] = { OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize, "synthesize non-sample events at the end of output"), OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"), + OPT_BOOLEAN(0, "bpf-event", &record.opts.bpf_event, "record bpf events"), OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq, "Fail if the specified frequency can't be used"), OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'", @@ -1946,6 +1978,9 @@ static struct option __record_options[] = { &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)", record__aio_parse), #endif + OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu", + "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer", + record__parse_affinity), OPT_END() }; @@ -1980,6 +2015,9 @@ int cmd_record(int argc, const char **argv) # undef REASON #endif + CPU_ZERO(&rec->affinity_mask); + rec->opts.affinity = PERF_AFFINITY_SYS; + rec->evlist = perf_evlist__new(); if (rec->evlist == NULL) return -ENOMEM; @@ -2143,6 +2181,8 @@ int cmd_record(int argc, const char **argv) if (verbose > 0) pr_info("nr_cblocks: %d\n", rec->opts.nr_cblocks); + pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]); + err = __cmd_record(&record, argc, argv); out: perf_evlist__delete(rec->evlist); diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 4958095be4fc..1532ebde6c4b 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -16,6 +16,7 @@ #include <linux/list.h> #include <linux/rbtree.h> #include <linux/err.h> +#include "util/map.h" #include "util/symbol.h" #include "util/callchain.h" #include "util/values.h" @@ -615,6 +616,21 @@ static int report__collapse_hists(struct report *rep) return ret; } +static int hists__resort_cb(struct hist_entry *he, void *arg) +{ + struct report *rep = arg; + struct symbol *sym = he->ms.sym; + + if (rep->symbol_ipc && sym && !sym->annotate2) { + struct perf_evsel *evsel = hists_to_evsel(he->hists); + + symbol__annotate2(sym, he->ms.map, evsel, + &annotation__default_options, NULL); + } + + return 0; +} + static void report__output_resort(struct report *rep) { struct ui_progress prog; @@ -622,8 +638,10 @@ static void report__output_resort(struct report *rep) ui_progress__init(&prog, rep->nr_entries, "Sorting events for output..."); - evlist__for_each_entry(rep->session->evlist, pos) - perf_evsel__output_resort(pos, &prog); + evlist__for_each_entry(rep->session->evlist, pos) { + perf_evsel__output_resort_cb(pos, &prog, + hists__resort_cb, rep); + } ui_progress__finish(); } @@ -753,7 +771,8 @@ static int tasks_print(struct report *rep, FILE *fp) for (i = 0; i < THREADS__TABLE_SIZE; i++) { struct threads *threads = &machine->threads[i]; - for (nd = rb_first(&threads->entries); nd; nd = rb_next(nd)) { + for (nd = rb_first_cached(&threads->entries); nd; + nd = rb_next(nd)) { task = tasks + itask++; task->thread = rb_entry(nd, struct thread, rb_node); @@ -880,7 +899,7 @@ static int __cmd_report(struct report *rep) rep->nr_entries += evsel__hists(pos)->nr_entries; if (rep->nr_entries == 0) { - ui__error("The %s file has no samples!\n", data->file.path); + ui__error("The %s data has no samples!\n", data->path); return 0; } @@ -956,9 +975,9 @@ int cmd_report(int argc, const char **argv) int branch_mode = -1; bool branch_call_mode = false; #define CALLCHAIN_DEFAULT_OPT "graph,0.5,caller,function,percent" - const char report_callchain_help[] = "Display call graph (stack chain/backtrace):\n\n" - CALLCHAIN_REPORT_HELP - "\n\t\t\t\tDefault: " CALLCHAIN_DEFAULT_OPT; + static const char report_callchain_help[] = "Display call graph (stack chain/backtrace):\n\n" + CALLCHAIN_REPORT_HELP + "\n\t\t\t\tDefault: " CALLCHAIN_DEFAULT_OPT; char callchain_default_opt[] = CALLCHAIN_DEFAULT_OPT; const char * const report_usage[] = { "perf report [<options>]", @@ -1188,8 +1207,8 @@ int cmd_report(int argc, const char **argv) input_name = "perf.data"; } - data.file.path = input_name; - data.force = symbol_conf.force; + data.path = input_name; + data.force = symbol_conf.force; repeat: session = perf_session__new(&data, false, &report.tool); diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index cbf39dab19c1..275f2d92a7bf 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -213,7 +213,7 @@ struct perf_sched { u64 all_runtime; u64 all_count; u64 cpu_last_switched[MAX_CPUS]; - struct rb_root atom_root, sorted_atom_root, merged_atom_root; + struct rb_root_cached atom_root, sorted_atom_root, merged_atom_root; struct list_head sort_list, cmp_pid; bool force; bool skip_merge; @@ -271,7 +271,7 @@ struct evsel_runtime { struct idle_thread_runtime { struct thread_runtime tr; struct thread *last_thread; - struct rb_root sorted_root; + struct rb_root_cached sorted_root; struct callchain_root callchain; struct callchain_cursor cursor; }; @@ -950,10 +950,10 @@ thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms * } static struct work_atoms * -thread_atoms_search(struct rb_root *root, struct thread *thread, +thread_atoms_search(struct rb_root_cached *root, struct thread *thread, struct list_head *sort_list) { - struct rb_node *node = root->rb_node; + struct rb_node *node = root->rb_root.rb_node; struct work_atoms key = { .thread = thread }; while (node) { @@ -976,10 +976,11 @@ thread_atoms_search(struct rb_root *root, struct thread *thread, } static void -__thread_latency_insert(struct rb_root *root, struct work_atoms *data, +__thread_latency_insert(struct rb_root_cached *root, struct work_atoms *data, struct list_head *sort_list) { - struct rb_node **new = &(root->rb_node), *parent = NULL; + struct rb_node **new = &(root->rb_root.rb_node), *parent = NULL; + bool leftmost = true; while (*new) { struct work_atoms *this; @@ -992,12 +993,14 @@ __thread_latency_insert(struct rb_root *root, struct work_atoms *data, if (cmp > 0) new = &((*new)->rb_left); - else + else { new = &((*new)->rb_right); + leftmost = false; + } } rb_link_node(&data->node, parent, new); - rb_insert_color(&data->node, root); + rb_insert_color_cached(&data->node, root, leftmost); } static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread) @@ -1447,15 +1450,15 @@ static int sort_dimension__add(const char *tok, struct list_head *list) static void perf_sched__sort_lat(struct perf_sched *sched) { struct rb_node *node; - struct rb_root *root = &sched->atom_root; + struct rb_root_cached *root = &sched->atom_root; again: for (;;) { struct work_atoms *data; - node = rb_first(root); + node = rb_first_cached(root); if (!node) break; - rb_erase(node, root); + rb_erase_cached(node, root); data = rb_entry(node, struct work_atoms, node); __thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list); } @@ -1782,11 +1785,9 @@ static int perf_sched__read_events(struct perf_sched *sched) }; struct perf_session *session; struct perf_data data = { - .file = { - .path = input_name, - }, - .mode = PERF_DATA_MODE_READ, - .force = sched->force, + .path = input_name, + .mode = PERF_DATA_MODE_READ, + .force = sched->force, }; int rc = -1; @@ -2762,12 +2763,12 @@ static size_t callchain__fprintf_folded(FILE *fp, struct callchain_node *node) return ret; } -static size_t timehist_print_idlehist_callchain(struct rb_root *root) +static size_t timehist_print_idlehist_callchain(struct rb_root_cached *root) { size_t ret = 0; FILE *fp = stdout; struct callchain_node *chain; - struct rb_node *rb_node = rb_first(root); + struct rb_node *rb_node = rb_first_cached(root); printf(" %16s %8s %s\n", "Idle time (msec)", "Count", "Callchains"); printf(" %.16s %.8s %.50s\n", graph_dotted_line, graph_dotted_line, @@ -2868,7 +2869,7 @@ static void timehist_print_summary(struct perf_sched *sched, if (itr == NULL) continue; - callchain_param.sort(&itr->sorted_root, &itr->callchain, + callchain_param.sort(&itr->sorted_root.rb_root, &itr->callchain, 0, &callchain_param); printf(" CPU %2d:", i); @@ -2955,11 +2956,9 @@ static int perf_sched__timehist(struct perf_sched *sched) { "sched:sched_migrate_task", timehist_migrate_task_event, }, }; struct perf_data data = { - .file = { - .path = input_name, - }, - .mode = PERF_DATA_MODE_READ, - .force = sched->force, + .path = input_name, + .mode = PERF_DATA_MODE_READ, + .force = sched->force, }; struct perf_session *session; @@ -3074,11 +3073,12 @@ static void print_bad_events(struct perf_sched *sched) } } -static void __merge_work_atoms(struct rb_root *root, struct work_atoms *data) +static void __merge_work_atoms(struct rb_root_cached *root, struct work_atoms *data) { - struct rb_node **new = &(root->rb_node), *parent = NULL; + struct rb_node **new = &(root->rb_root.rb_node), *parent = NULL; struct work_atoms *this; const char *comm = thread__comm_str(data->thread), *this_comm; + bool leftmost = true; while (*new) { int cmp; @@ -3092,6 +3092,7 @@ static void __merge_work_atoms(struct rb_root *root, struct work_atoms *data) new = &((*new)->rb_left); } else if (cmp < 0) { new = &((*new)->rb_right); + leftmost = false; } else { this->num_merged++; this->total_runtime += data->total_runtime; @@ -3109,7 +3110,7 @@ static void __merge_work_atoms(struct rb_root *root, struct work_atoms *data) data->num_merged++; rb_link_node(&data->node, parent, new); - rb_insert_color(&data->node, root); + rb_insert_color_cached(&data->node, root, leftmost); } static void perf_sched__merge_lat(struct perf_sched *sched) @@ -3120,8 +3121,8 @@ static void perf_sched__merge_lat(struct perf_sched *sched) if (sched->skip_merge) return; - while ((node = rb_first(&sched->atom_root))) { - rb_erase(node, &sched->atom_root); + while ((node = rb_first_cached(&sched->atom_root))) { + rb_erase_cached(node, &sched->atom_root); data = rb_entry(node, struct work_atoms, node); __merge_work_atoms(&sched->merged_atom_root, data); } @@ -3143,7 +3144,7 @@ static int perf_sched__lat(struct perf_sched *sched) printf(" Task | Runtime ms | Switches | Average delay ms | Maximum delay ms | Maximum delay at |\n"); printf(" -----------------------------------------------------------------------------------------------------------------\n"); - next = rb_first(&sched->sorted_atom_root); + next = rb_first_cached(&sched->sorted_atom_root); while (next) { struct work_atoms *work_list; @@ -3336,7 +3337,7 @@ static int __cmd_record(int argc, const char **argv) int cmd_sched(int argc, const char **argv) { - const char default_sort_order[] = "avg, max, switch, runtime"; + static const char default_sort_order[] = "avg, max, switch, runtime"; struct perf_sched sched = { .tool = { .sample = perf_sched__process_tracepoint_sample, diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index ac221f137ed2..2d8cb1d1682c 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -10,6 +10,7 @@ #include "util/perf_regs.h" #include "util/session.h" #include "util/tool.h" +#include "util/map.h" #include "util/symbol.h" #include "util/thread.h" #include "util/trace-event.h" @@ -148,6 +149,7 @@ static struct { unsigned int print_ip_opts; u64 fields; u64 invalid_fields; + u64 user_set_fields; } output[OUTPUT_TYPE_MAX] = { [PERF_TYPE_HARDWARE] = { @@ -344,7 +346,7 @@ static int perf_evsel__do_check_stype(struct perf_evsel *evsel, if (attr->sample_type & sample_type) return 0; - if (output[type].user_set) { + if (output[type].user_set_fields & field) { if (allow_user_set) return 0; evname = perf_evsel__name(evsel); @@ -2559,6 +2561,10 @@ static int parse_output_fields(const struct option *opt __maybe_unused, pr_warning("Overriding previous field request for %s events.\n", event_type(type)); + /* Don't override defaults for +- */ + if (strchr(tok, '+') || strchr(tok, '-')) + goto parse; + output[type].fields = 0; output[type].user_set = true; output[type].wildcard_set = false; @@ -2627,10 +2633,13 @@ parse: pr_warning("\'%s\' not valid for %s events. Ignoring.\n", all_output_options[i].str, event_type(j)); } else { - if (change == REMOVE) + if (change == REMOVE) { output[j].fields &= ~all_output_options[i].field; - else + output[j].user_set_fields &= ~all_output_options[i].field; + } else { output[j].fields |= all_output_options[i].field; + output[j].user_set_fields |= all_output_options[i].field; + } output[j].user_set = true; output[j].wildcard_set = true; } @@ -2643,6 +2652,10 @@ parse: rc = -EINVAL; goto out; } + if (change == REMOVE) + output[type].fields &= ~all_output_options[i].field; + else + output[type].fields |= all_output_options[i].field; output[type].user_set = true; output[type].wildcard_set = true; } @@ -2942,10 +2955,8 @@ int find_scripts(char **scripts_array, char **scripts_path_array) DIR *scripts_dir, *lang_dir; struct perf_session *session; struct perf_data data = { - .file = { - .path = input_name, - }, - .mode = PERF_DATA_MODE_READ, + .path = input_name, + .mode = PERF_DATA_MODE_READ, }; char *temp; int i = 0; @@ -3418,8 +3429,8 @@ int cmd_script(int argc, const char **argv) argc = parse_options_subcommand(argc, argv, options, script_subcommands, script_usage, PARSE_OPT_STOP_AT_NON_OPTION); - data.file.path = input_name; - data.force = symbol_conf.force; + data.path = input_name; + data.force = symbol_conf.force; if (argc > 1 && !strncmp(argv[0], "rec", strlen("rec"))) { rec_script_path = get_script_path(argv[1], RECORD_SUFFIX); @@ -3645,7 +3656,7 @@ int cmd_script(int argc, const char **argv) goto out_delete; } - input = open(data.file.path, O_RDONLY); /* input_name */ + input = open(data.path, O_RDONLY); /* input_name */ if (input < 0) { err = -errno; perror("failed to open file"); diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 63a3afc7f32b..7b8f09b0b8bf 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -52,7 +52,6 @@ #include "util/evlist.h" #include "util/evsel.h" #include "util/debug.h" -#include "util/drv_configs.h" #include "util/color.h" #include "util/stat.h" #include "util/header.h" @@ -83,7 +82,6 @@ #include <unistd.h> #include <sys/time.h> #include <sys/resource.h> -#include <sys/wait.h> #include "sane_ctype.h" @@ -418,7 +416,6 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx) int status = 0; const bool forks = (argc > 0); bool is_pipe = STAT_RECORD ? perf_stat.data.is_pipe : false; - struct perf_evsel_config_term *err_term; if (interval) { ts.tv_sec = interval / USEC_PER_MSEC; @@ -515,13 +512,6 @@ try_again: return -1; } - if (perf_evlist__apply_drv_configs(evsel_list, &counter, &err_term)) { - pr_err("failed to set config \"%s\" on event %s with %d (%s)\n", - err_term->val.drv_cfg, perf_evsel__name(counter), errno, - str_error_r(errno, msg, sizeof(msg))); - return -1; - } - if (STAT_RECORD) { int err, fd = perf_data__fd(&perf_stat.data); @@ -1332,7 +1322,7 @@ static int __cmd_record(int argc, const char **argv) PARSE_OPT_STOP_AT_NON_OPTION); if (output_name) - data->file.path = output_name; + data->path = output_name; if (stat_config.run_count != 1 || forever) { pr_err("Cannot use -r option with perf stat record.\n"); @@ -1533,8 +1523,8 @@ static int __cmd_report(int argc, const char **argv) input_name = "perf.data"; } - perf_stat.data.file.path = input_name; - perf_stat.data.mode = PERF_DATA_MODE_READ; + perf_stat.data.path = input_name; + perf_stat.data.mode = PERF_DATA_MODE_READ; session = perf_session__new(&perf_stat.data, false, &perf_stat.tool); if (session == NULL) diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c index 775b99833e51..9b98687a27b9 100644 --- a/tools/perf/builtin-timechart.c +++ b/tools/perf/builtin-timechart.c @@ -1602,11 +1602,9 @@ static int __cmd_timechart(struct timechart *tchart, const char *output_name) { "syscalls:sys_exit_select", process_exit_poll }, }; struct perf_data data = { - .file = { - .path = input_name, - }, - .mode = PERF_DATA_MODE_READ, - .force = tchart->force, + .path = input_name, + .mode = PERF_DATA_MODE_READ, + .force = tchart->force, }; struct perf_session *session = perf_session__new(&data, false, diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index f64e312db787..231a90daa958 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -22,13 +22,14 @@ #include "perf.h" #include "util/annotate.h" +#include "util/bpf-event.h" #include "util/config.h" #include "util/color.h" -#include "util/drv_configs.h" #include "util/evlist.h" #include "util/evsel.h" #include "util/event.h" #include "util/machine.h" +#include "util/map.h" #include "util/session.h" #include "util/symbol.h" #include "util/thread.h" @@ -366,7 +367,7 @@ static void perf_top__prompt_symbol(struct perf_top *top, const char *msg) if (p) *p = 0; - next = rb_first(&hists->entries); + next = rb_first_cached(&hists->entries); while (next) { n = rb_entry(next, struct hist_entry, rb_node); if (n->ms.sym && !strcmp(buf, n->ms.sym->name)) { @@ -1184,10 +1185,6 @@ static void init_process_thread(struct perf_top *top) static int __cmd_top(struct perf_top *top) { - char msg[512]; - struct perf_evsel *pos; - struct perf_evsel_config_term *err_term; - struct perf_evlist *evlist = top->evlist; struct record_opts *opts = &top->record_opts; pthread_t thread, thread_process; int ret; @@ -1215,6 +1212,12 @@ static int __cmd_top(struct perf_top *top) init_process_thread(top); + ret = perf_event__synthesize_bpf_events(&top->tool, perf_event__process, + &top->session->machines.host, + &top->record_opts); + if (ret < 0) + pr_warning("Couldn't synthesize bpf events.\n"); + machine__synthesize_threads(&top->session->machines.host, &opts->target, top->evlist->threads, false, top->nr_threads_synthesize); @@ -1232,14 +1235,6 @@ static int __cmd_top(struct perf_top *top) if (ret) goto out_delete; - ret = perf_evlist__apply_drv_configs(evlist, &pos, &err_term); - if (ret) { - pr_err("failed to set config \"%s\" on event %s with %d (%s)\n", - err_term->val.drv_cfg, perf_evsel__name(pos), errno, - str_error_r(errno, msg, sizeof(msg))); - goto out_delete; - } - top->session->evlist = top->evlist; perf_session__set_id_hdr_size(top->session); diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index b36061cd1ab8..f5b3a1e9c1dd 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -19,6 +19,7 @@ #include <traceevent/event-parse.h> #include <api/fs/tracing_path.h> #include <bpf/bpf.h> +#include "util/bpf_map.h" #include "builtin.h" #include "util/cgroup.h" #include "util/color.h" @@ -29,6 +30,8 @@ #include "util/evlist.h" #include <subcmd/exec-cmd.h> #include "util/machine.h" +#include "util/map.h" +#include "util/symbol.h" #include "util/path.h" #include "util/session.h" #include "util/thread.h" @@ -85,6 +88,9 @@ struct trace { *augmented; } events; } syscalls; + struct { + struct bpf_map *map; + } dump; struct record_opts opts; struct perf_evlist *evlist; struct machine *host; @@ -1039,6 +1045,9 @@ static const size_t trace__entry_str_size = 2048; static struct file *thread_trace__files_entry(struct thread_trace *ttrace, int fd) { + if (fd < 0) + return NULL; + if (fd > ttrace->files.max) { struct file *nfiles = realloc(ttrace->files.table, (fd + 1) * sizeof(struct file)); @@ -2766,7 +2775,8 @@ static int trace__set_filter_loop_pids(struct trace *trace) if (parent == NULL) break; - if (!strcmp(thread__comm_str(parent), "sshd")) { + if (!strcmp(thread__comm_str(parent), "sshd") || + strstarts(thread__comm_str(parent), "gnome-terminal")) { pids[nr++] = parent->tid; break; } @@ -2991,6 +3001,9 @@ static int trace__run(struct trace *trace, int argc, const char **argv) if (err < 0) goto out_error_apply_filters; + if (trace->dump.map) + bpf_map__fprintf(trace->dump.map, trace->output); + err = perf_evlist__mmap(evlist, trace->opts.mmap_pages); if (err < 0) goto out_error_mmap; @@ -3141,11 +3154,9 @@ static int trace__replay(struct trace *trace) { "probe:vfs_getname", trace__vfs_getname, }, }; struct perf_data data = { - .file = { - .path = input_name, - }, - .mode = PERF_DATA_MODE_READ, - .force = trace->force, + .path = input_name, + .mode = PERF_DATA_MODE_READ, + .force = trace->force, }; struct perf_session *session; struct perf_evsel *evsel; @@ -3680,6 +3691,7 @@ int cmd_trace(int argc, const char **argv) .max_stack = UINT_MAX, .max_events = ULONG_MAX, }; + const char *map_dump_str = NULL; const char *output_name = NULL; const struct option trace_options[] = { OPT_CALLBACK('e', "event", &trace, "event", @@ -3712,6 +3724,9 @@ int cmd_trace(int argc, const char **argv) OPT_CALLBACK(0, "duration", &trace, "float", "show only events with duration > N.M ms", trace__set_duration), +#ifdef HAVE_LIBBPF_SUPPORT + OPT_STRING(0, "map-dump", &map_dump_str, "BPF map", "BPF map to periodically dump"), +#endif OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"), OPT_INCR('v', "verbose", &verbose, "be more verbose"), OPT_BOOLEAN('T', "time", &trace.full_time, @@ -3806,6 +3821,14 @@ int cmd_trace(int argc, const char **argv) err = -1; + if (map_dump_str) { + trace.dump.map = bpf__find_map_by_name(map_dump_str); + if (trace.dump.map == NULL) { + pr_err("ERROR: BPF map \"%s\" not found\n", map_dump_str); + goto out; + } + } + if (trace.trace_pgfaults) { trace.opts.sample_address = true; trace.opts.sample_time = true; @@ -3865,7 +3888,8 @@ int cmd_trace(int argc, const char **argv) goto init_augmented_syscall_tp; } - if (strcmp(perf_evsel__name(evsel), "raw_syscalls:sys_enter") == 0) { + if (trace.syscalls.events.augmented->priv == NULL && + strstr(perf_evsel__name(evsel), "syscalls:sys_enter")) { struct perf_evsel *augmented = trace.syscalls.events.augmented; if (perf_evsel__init_augmented_syscall_tp(augmented, evsel) || perf_evsel__init_augmented_syscall_tp_args(augmented)) diff --git a/tools/perf/design.txt b/tools/perf/design.txt index a28dca2582aa..0453ba26cdbd 100644 --- a/tools/perf/design.txt +++ b/tools/perf/design.txt @@ -222,6 +222,10 @@ The 'exclude_user', 'exclude_kernel' and 'exclude_hv' bits provide a way to request that counting of events be restricted to times when the CPU is in user, kernel and/or hypervisor mode. +Furthermore the 'exclude_host' and 'exclude_guest' bits provide a way +to request counting of events restricted to guest and host contexts when +using Linux as the hypervisor. + The 'mmap' and 'munmap' bits allow recording of PROT_EXEC mmap/munmap operations, these can be used to relate userspace IP addresses to actual code, even after the mapping (or even the whole process) is gone, diff --git a/tools/perf/examples/bpf/augmented_raw_syscalls.c b/tools/perf/examples/bpf/augmented_raw_syscalls.c index 53c233370fae..f9b2161e1ca4 100644 --- a/tools/perf/examples/bpf/augmented_raw_syscalls.c +++ b/tools/perf/examples/bpf/augmented_raw_syscalls.c @@ -18,23 +18,13 @@ #include <pid_filter.h> /* bpf-output associated map */ -struct bpf_map SEC("maps") __augmented_syscalls__ = { - .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(u32), - .max_entries = __NR_CPUS__, -}; +bpf_map(__augmented_syscalls__, PERF_EVENT_ARRAY, int, u32, __NR_CPUS__); struct syscall { bool enabled; }; -struct bpf_map SEC("maps") syscalls = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(struct syscall), - .max_entries = 512, -}; +bpf_map(syscalls, ARRAY, int, struct syscall, 512); struct syscall_enter_args { unsigned long long common_tp_fields; @@ -141,8 +131,8 @@ int sys_enter(struct syscall_enter_args *args) len = sizeof(augmented_args.args); } - perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, &augmented_args, len); - return 0; + /* If perf_event_output fails, return non-zero so that it gets recorded unaugmented */ + return perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, &augmented_args, len); } SEC("raw_syscalls:sys_exit") diff --git a/tools/perf/examples/bpf/augmented_syscalls.c b/tools/perf/examples/bpf/augmented_syscalls.c index 2ae44813ef2d..524fdb8534b3 100644 --- a/tools/perf/examples/bpf/augmented_syscalls.c +++ b/tools/perf/examples/bpf/augmented_syscalls.c @@ -19,12 +19,8 @@ #include <stdio.h> #include <linux/socket.h> -struct bpf_map SEC("maps") __augmented_syscalls__ = { - .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(u32), - .max_entries = __NR_CPUS__, -}; +/* bpf-output associated map */ +bpf_map(__augmented_syscalls__, PERF_EVENT_ARRAY, int, u32, __NR_CPUS__); struct syscall_exit_args { unsigned long long common_tp_fields; @@ -55,9 +51,9 @@ int syscall_enter(syscall)(struct syscall_enter_##syscall##_args *args) \ len -= sizeof(augmented_args.filename.value) - augmented_args.filename.size; \ len &= sizeof(augmented_args.filename.value) - 1; \ } \ - perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, \ - &augmented_args, len); \ - return 0; \ + /* If perf_event_output fails, return non-zero so that it gets recorded unaugmented */ \ + return perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, \ + &augmented_args, len); \ } \ int syscall_exit(syscall)(struct syscall_exit_args *args) \ { \ @@ -125,10 +121,10 @@ int syscall_enter(syscall)(struct syscall_enter_##syscall##_args *args) \ /* addrlen = augmented_args.args.addrlen; */ \ /* */ \ probe_read(&augmented_args.addr, addrlen, args->addr_ptr); \ - perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, \ - &augmented_args, \ - sizeof(augmented_args) - sizeof(augmented_args.addr) + addrlen); \ - return 0; \ + /* If perf_event_output fails, return non-zero so that it gets recorded unaugmented */ \ + return perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, \ + &augmented_args, \ + sizeof(augmented_args) - sizeof(augmented_args.addr) + addrlen);\ } \ int syscall_exit(syscall)(struct syscall_exit_args *args) \ { \ diff --git a/tools/perf/examples/bpf/etcsnoop.c b/tools/perf/examples/bpf/etcsnoop.c index b59e8812ee8c..e81b535346c0 100644 --- a/tools/perf/examples/bpf/etcsnoop.c +++ b/tools/perf/examples/bpf/etcsnoop.c @@ -21,12 +21,8 @@ #include <stdio.h> -struct bpf_map SEC("maps") __augmented_syscalls__ = { - .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(u32), - .max_entries = __NR_CPUS__, -}; +/* bpf-output associated map */ +bpf_map(__augmented_syscalls__, PERF_EVENT_ARRAY, int, u32, __NR_CPUS__); struct augmented_filename { int size; @@ -49,11 +45,11 @@ int syscall_enter(syscall)(struct syscall_enter_##syscall##_args *args) \ args->filename_ptr); \ if (__builtin_memcmp(augmented_args.filename.value, etc, 4) != 0) \ return 0; \ - perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, \ - &augmented_args, \ - (sizeof(augmented_args) - sizeof(augmented_args.filename.value) + \ - augmented_args.filename.size)); \ - return 0; \ + /* If perf_event_output fails, return non-zero so that it gets recorded unaugmented */ \ + return perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, \ + &augmented_args, \ + (sizeof(augmented_args) - sizeof(augmented_args.filename.value) + \ + augmented_args.filename.size)); \ } struct syscall_enter_openat_args { diff --git a/tools/perf/include/bpf/bpf.h b/tools/perf/include/bpf/bpf.h index e667577207dc..5df7ed9d9020 100644 --- a/tools/perf/include/bpf/bpf.h +++ b/tools/perf/include/bpf/bpf.h @@ -18,6 +18,14 @@ struct bpf_map { unsigned int numa_node; }; +#define bpf_map(name, _type, type_key, type_val, _max_entries) \ +struct bpf_map SEC("maps") name = { \ + .type = BPF_MAP_TYPE_##_type, \ + .key_size = sizeof(type_key), \ + .value_size = sizeof(type_val), \ + .max_entries = _max_entries, \ +} + /* * FIXME: this should receive .max_entries as a parameter, as careful * tuning of these limits is needed to avoid hitting limits that @@ -26,13 +34,7 @@ struct bpf_map { * For the current need, 'perf trace --filter-pids', 64 should * be good enough, but this surely needs to be revisited. */ -#define pid_map(name, value_type) \ -struct bpf_map SEC("maps") name = { \ - .type = BPF_MAP_TYPE_HASH, \ - .key_size = sizeof(pid_t), \ - .value_size = sizeof(value_type), \ - .max_entries = 64, \ -} +#define pid_map(name, value_type) bpf_map(name, HASH, pid_t, value_type, 64) static int (*bpf_map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags) = (void *)BPF_FUNC_map_update_elem; static void *(*bpf_map_lookup_elem)(struct bpf_map *map, void *key) = (void *)BPF_FUNC_map_lookup_elem; diff --git a/tools/perf/perf.h b/tools/perf/perf.h index 388c6dd128b8..b120e547ddc7 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -66,6 +66,7 @@ struct record_opts { bool ignore_missing_thread; bool strict_freq; bool sample_id; + bool bpf_event; unsigned int freq; unsigned int mmap_pages; unsigned int auxtrace_mmap_pages; @@ -83,6 +84,14 @@ struct record_opts { clockid_t clockid; u64 clockid_res_ns; int nr_cblocks; + int affinity; +}; + +enum perf_affinity { + PERF_AFFINITY_SYS = 0, + PERF_AFFINITY_NODE, + PERF_AFFINITY_CPU, + PERF_AFFINITY_MAX }; struct option; diff --git a/tools/perf/pmu-events/arch/powerpc/power8/metrics.json b/tools/perf/pmu-events/arch/powerpc/power8/metrics.json new file mode 100644 index 000000000000..bffb2d4a6420 --- /dev/null +++ b/tools/perf/pmu-events/arch/powerpc/power8/metrics.json @@ -0,0 +1,2245 @@ +[ + { + "BriefDescription": "% of finished branches that were treated as BC+8", + "MetricExpr": "PM_BR_BC_8_CONV / PM_BRU_FIN * 100", + "MetricGroup": "branch_prediction", + "MetricName": "bc_8_branch_ratio_percent" + }, + { + "BriefDescription": "% of finished branches that were pairable but not treated as BC+8", + "MetricExpr": "PM_BR_BC_8 / PM_BRU_FIN * 100", + "MetricGroup": "branch_prediction", + "MetricName": "bc_8_not_converted_branch_ratio_percent" + }, + { + "BriefDescription": "Percent of mispredicted branches out of all predicted (correctly and incorrectly) branches that completed", + "MetricExpr": "PM_BR_MPRED_CMPL / (PM_BR_PRED_BR0 + PM_BR_PRED_BR1) * 100", + "MetricGroup": "branch_prediction", + "MetricName": "br_misprediction_percent" + }, + { + "BriefDescription": "% of Branch miss predictions per instruction", + "MetricExpr": "PM_BR_MPRED_CMPL / PM_RUN_INST_CMPL * 100", + "MetricGroup": "branch_prediction", + "MetricName": "branch_mispredict_rate_percent" + }, + { + "BriefDescription": "Count cache branch misprediction per instruction", + "MetricExpr": "PM_BR_MPRED_CCACHE / PM_RUN_INST_CMPL * 100", + "MetricGroup": "branch_prediction", + "MetricName": "ccache_mispredict_rate_percent" + }, + { + "BriefDescription": "Percent of count catch mispredictions out of all completed branches that required count cache predictionn", + "MetricExpr": "PM_BR_MPRED_CCACHE / (PM_BR_PRED_CCACHE_BR0 + PM_BR_PRED_CCACHE_BR1) * 100", + "MetricGroup": "branch_prediction", + "MetricName": "ccache_misprediction_percent" + }, + { + "BriefDescription": "CR MisPredictions per Instruction", + "MetricExpr": "PM_BR_MPRED_CR / PM_RUN_INST_CMPL * 100", + "MetricGroup": "branch_prediction", + "MetricName": "cr_mispredict_rate_percent" + }, + { + "BriefDescription": "Link stack branch misprediction", + "MetricExpr": "(PM_BR_MPRED_TA - PM_BR_MPRED_CCACHE) / PM_RUN_INST_CMPL * 100", + "MetricGroup": "branch_prediction", + "MetricName": "lstack_mispredict_rate_percent" + }, + { + "BriefDescription": "Percent of link stack mispredictions out of all completed branches that required link stack prediction", + "MetricExpr": "(PM_BR_MPRED_TA - PM_BR_MPRED_CCACHE) / (PM_BR_PRED_LSTACK_BR0 + PM_BR_PRED_LSTACK_BR1) * 100", + "MetricGroup": "branch_prediction", + "MetricName": "lstack_misprediction_percent" + }, + { + "BriefDescription": "TA MisPredictions per Instruction", + "MetricExpr": "PM_BR_MPRED_TA / PM_RUN_INST_CMPL * 100", + "MetricGroup": "branch_prediction", + "MetricName": "ta_mispredict_rate_percent" + }, + { + "BriefDescription": "Percent of target address mispredictions out of all completed branches that required address prediction", + "MetricExpr": "PM_BR_MPRED_TA / (PM_BR_PRED_CCACHE_BR0 + PM_BR_PRED_CCACHE_BR1 + PM_BR_PRED_LSTACK_BR0 + PM_BR_PRED_LSTACK_BR1) * 100", + "MetricGroup": "branch_prediction", + "MetricName": "ta_misprediction_percent" + }, + { + "BriefDescription": "Percent of branches completed that were taken", + "MetricExpr": "PM_BR_TAKEN_CMPL * 100 / PM_BR_CMPL", + "MetricGroup": "branch_prediction", + "MetricName": "taken_branches_percent" + }, + { + "BriefDescription": "Percent of chip+group+sys pumps that were incorrectly predicted", + "MetricExpr": "PM_PUMP_MPRED * 100 / (PM_PUMP_CPRED + PM_PUMP_MPRED)", + "MetricGroup": "bus_stats", + "MetricName": "any_pump_mpred_percent" + }, + { + "BriefDescription": "Percent of chip pumps that were correctly predicted as chip pumps the first time", + "MetricExpr": "PM_CHIP_PUMP_CPRED * 100 / PM_L2_CHIP_PUMP", + "MetricGroup": "bus_stats", + "MetricName": "chip_pump_cpred_percent" + }, + { + "BriefDescription": "Percent of group pumps that were correctly predicted as group pumps the first time", + "MetricExpr": "PM_GRP_PUMP_CPRED * 100 / PM_L2_GROUP_PUMP", + "MetricGroup": "bus_stats", + "MetricName": "group_pump_cpred_percent" + }, + { + "BriefDescription": "Percent of system pumps that were correctly predicted as group pumps the first time", + "MetricExpr": "PM_SYS_PUMP_CPRED * 100 / PM_L2_GROUP_PUMP", + "MetricGroup": "bus_stats", + "MetricName": "sys_pump_cpred_percent" + }, + { + "BriefDescription": "Cycles stalled due to CRU or BRU operations", + "MetricExpr": "PM_CMPLU_STALL_BRU_CRU / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "bru_cru_stall_cpi" + }, + { + "BriefDescription": "Cycles stalled due to ISU Branch Operations", + "MetricExpr": "PM_CMPLU_STALL_BRU / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "bru_stall_cpi" + }, + { + "BriefDescription": "Cycles in which a Group Completed", + "MetricExpr": "PM_GRP_CMPL / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "completion_cpi" + }, + { + "BriefDescription": "Cycles stalled by CO queue full", + "MetricExpr": "PM_CMPLU_STALL_COQ_FULL / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "coq_full_stall_cpi" + }, + { + "BriefDescription": "Cycles stalled due to CRU Operations", + "MetricExpr": "(PM_CMPLU_STALL_BRU_CRU - PM_CMPLU_STALL_BRU) / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "cru_stall_cpi" + }, + { + "BriefDescription": "Cycles stalled by flushes", + "MetricExpr": "PM_CMPLU_STALL_FLUSH / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "flush_stall_cpi" + }, + { + "BriefDescription": "Cycles stalled by FXU Multi-Cycle Instructions", + "MetricExpr": "PM_CMPLU_STALL_FXLONG / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "fxu_multi_cyc_cpi" + }, + { + "BriefDescription": "Cycles stalled by FXU", + "MetricExpr": "PM_CMPLU_STALL_FXU / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "fxu_stall_cpi" + }, + { + "BriefDescription": "Other cycles stalled by FXU", + "MetricExpr": "(PM_CMPLU_STALL_FXU / PM_RUN_INST_CMPL) - (PM_CMPLU_STALL_FXLONG / PM_RUN_INST_CMPL)", + "MetricGroup": "cpi_breakdown", + "MetricName": "fxu_stall_other_cpi" + }, + { + "BriefDescription": "Cycles GCT empty due to Branch Mispredicts", + "MetricExpr": "PM_GCT_NOSLOT_BR_MPRED / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "gct_empty_br_mpred_cpi" + }, + { + "BriefDescription": "Cycles GCT empty due to Branch Mispredicts and Icache Misses", + "MetricExpr": "PM_GCT_NOSLOT_BR_MPRED_ICMISS / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "gct_empty_br_mpred_ic_miss_cpi" + }, + { + "BriefDescription": "GCT empty cycles", + "MetricExpr": "PM_GCT_NOSLOT_CYC / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "gct_empty_cpi" + }, + { + "BriefDescription": "Cycles GCT empty where dispatch was held", + "MetricExpr": "(PM_GCT_NOSLOT_DISP_HELD_MAP + PM_GCT_NOSLOT_DISP_HELD_SRQ + PM_GCT_NOSLOT_DISP_HELD_ISSQ + PM_GCT_NOSLOT_DISP_HELD_OTHER) / PM_RUN_INST_CMPL)", + "MetricGroup": "cpi_breakdown", + "MetricName": "gct_empty_disp_held_cpi" + }, + { + "BriefDescription": "Cycles GCT empty where dispatch was held due to issue queue", + "MetricExpr": "PM_GCT_NOSLOT_DISP_HELD_ISSQ / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "gct_empty_disp_held_issq_cpi" + }, + { + "BriefDescription": "Cycles GCT empty where dispatch was held due to maps", + "MetricExpr": "PM_GCT_NOSLOT_DISP_HELD_MAP / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "gct_empty_disp_held_map_cpi" + }, + { + "BriefDescription": "Cycles GCT empty where dispatch was held due to syncs and other effects", + "MetricExpr": "PM_GCT_NOSLOT_DISP_HELD_OTHER / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "gct_empty_disp_held_other_cpi" + }, + { + "BriefDescription": "Cycles GCT empty where dispatch was held due to SRQ", + "MetricExpr": "PM_GCT_NOSLOT_DISP_HELD_SRQ / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "gct_empty_disp_held_srq_cpi" + }, + { + "BriefDescription": "Cycles stalled by GCT empty due to Icache misses", + "MetricExpr": "PM_GCT_NOSLOT_IC_MISS / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "gct_empty_ic_miss_cpi" + }, + { + "BriefDescription": "Cycles stalled by GCT empty due to Icache misses that resolve in the local L2 or L3", + "MetricExpr": "(PM_GCT_NOSLOT_IC_MISS - PM_GCT_NOSLOT_IC_L3MISS) / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "gct_empty_ic_miss_l2l3_cpi" + }, + { + "BriefDescription": "Cycles stalled by GCT empty due to Icache misses that resolve off-chip", + "MetricExpr": "PM_GCT_NOSLOT_IC_L3MISS / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "gct_empty_ic_miss_l3miss_cpi" + }, + { + "BriefDescription": "Other GCT empty cycles", + "MetricExpr": "(PM_GCT_NOSLOT_CYC / PM_RUN_INST_CMPL) - (PM_GCT_NOSLOT_IC_MISS / PM_RUN_INST_CMPL) - (PM_GCT_NOSLOT_BR_MPRED / PM_RUN_INST_CMPL) - (PM_GCT_NOSLOT_BR_MPRED_ICMISS / PM_RUN_INST_CMPL) - ((PM_GCT_NOSLOT_DISP_HELD_MAP / PM_RUN_INST_CMPL) + (PM_GCT_NOSLOT_DISP_HELD_SRQ / PM_RUN_INST_CMPL) + (PM_GCT_NOSLOT_DISP_HELD_ISSQ / PM_RUN_INST_CMPL) + (PM_GCT_NOSLOT_DISP_HELD_OTHER / PM_RUN_INST_CMPL))", + "MetricGroup": "cpi_breakdown", + "MetricName": "gct_empty_other_cpi" + }, + { + "BriefDescription": "Cycles stalled by heavyweight syncs", + "MetricExpr": "PM_CMPLU_STALL_HWSYNC / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "hwsync_stall_cpi" + }, + { + "BriefDescription": "Cycles stalled by LSU", + "MetricExpr": "PM_CMPLU_STALL_LSU / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "lsu_stall_cpi" + }, + { + "BriefDescription": "Cycles stalled by D-Cache Misses", + "MetricExpr": "PM_CMPLU_STALL_DCACHE_MISS / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "lsu_stall_dcache_miss_cpi" + }, + { + "BriefDescription": "Cycles stalled by D-Cache Misses that resolved in distant interventions and memory", + "MetricExpr": "(PM_CMPLU_STALL_DMISS_L3MISS - PM_CMPLU_STALL_DMISS_LMEM - PM_CMPLU_STALL_DMISS_L21_L31 - PM_CMPLU_STALL_DMISS_REMOTE) / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "lsu_stall_dcache_miss_distant_cpi" + }, + { + "BriefDescription": "Cycles stalled by D-Cache Misses that resolved in remote or distant caches", + "MetricExpr": "PM_CMPLU_STALL_DMISS_L21_L31 / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "lsu_stall_dcache_miss_l21l31_cpi" + }, + { + "BriefDescription": "Cycles stalled by D-Cache Misses that resolved in the local L2 or L3, where there was a conflict", + "MetricExpr": "PM_CMPLU_STALL_DMISS_L2L3_CONFLICT / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "lsu_stall_dcache_miss_l2l3_conflict_cpi" + }, + { + "BriefDescription": "Cycles stalled by D-Cache Misses that resolved in the local L2 or L3", + "MetricExpr": "PM_CMPLU_STALL_DMISS_L2L3 / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "lsu_stall_dcache_miss_l2l3_cpi" + }, + { + "BriefDescription": "Cycles stalled by D-Cache Misses that resolved in the local L2 or L3, where there was no conflict", + "MetricExpr": "(PM_CMPLU_STALL_DMISS_L2L3 - PM_CMPLU_STALL_DMISS_L2L3_CONFLICT) / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "lsu_stall_dcache_miss_l2l3_noconflict_cpi" + }, + { + "BriefDescription": "Cycles stalled by D-Cache Misses that resolved in other core's caches or memory", + "MetricExpr": "PM_CMPLU_STALL_DMISS_L3MISS / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "lsu_stall_dcache_miss_l3miss_cpi" + }, + { + "BriefDescription": "Cycles stalled by D-Cache Misses that resolved in local memory or local L4", + "MetricExpr": "PM_CMPLU_STALL_DMISS_LMEM / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "lsu_stall_dcache_miss_lmem_cpi" + }, + { + "BriefDescription": "Cycles stalled by D-Cache Misses that resolved in remote interventions and memory", + "MetricExpr": "PM_CMPLU_STALL_DMISS_REMOTE / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "lsu_stall_dcache_miss_remote_cpi" + }, + { + "BriefDescription": "Cycles stalled by ERAT Translation rejects", + "MetricExpr": "PM_CMPLU_STALL_ERAT_MISS / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "lsu_stall_erat_miss_cpi" + }, + { + "BriefDescription": "Cycles stalled by LSU load finishes", + "MetricExpr": "PM_CMPLU_STALL_LOAD_FINISH / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "lsu_stall_ld_fin_cpi" + }, + { + "BriefDescription": "Cycles stalled by LHS rejects", + "MetricExpr": "PM_CMPLU_STALL_REJECT_LHS / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "lsu_stall_lhs_cpi" + }, + { + "BriefDescription": "Cycles stalled by LMQ Full rejects", + "MetricExpr": "PM_CMPLU_STALL_REJ_LMQ_FULL / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "lsu_stall_lmq_full_cpi" + }, + { + "BriefDescription": "Cycles stalled by Other LSU Operations", + "MetricExpr": "(PM_CMPLU_STALL_LSU / PM_RUN_INST_CMPL) - (PM_CMPLU_STALL_DCACHE_MISS / PM_RUN_INST_CMPL) - (PM_CMPLU_STALL_REJECT / PM_RUN_INST_CMPL) - (PM_CMPLU_STALL_STORE / PM_RUN_INST_CMPL) - (PM_CMPLU_STALL_LOAD_FINISH / PM_RUN_INST_CMPL) - (PM_CMPLU_STALL_ST_FWD / PM_RUN_INST_CMPL)", + "MetricGroup": "cpi_breakdown", + "MetricName": "lsu_stall_other_cpi" + }, + { + "BriefDescription": "Cycles stalled by LSU Rejects", + "MetricExpr": "PM_CMPLU_STALL_REJECT / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "lsu_stall_reject_cpi" + }, + { + "BriefDescription": "Cycles stalled by Other LSU Rejects", + "MetricExpr": "(PM_CMPLU_STALL_REJECT / PM_RUN_INST_CMPL) - (PM_CMPLU_STALL_REJECT_LHS / PM_RUN_INST_CMPL) - (PM_CMPLU_STALL_ERAT_MISS / PM_RUN_INST_CMPL) - (PM_CMPLU_STALL_REJ_LMQ_FULL / PM_RUN_INST_CMPL)", + "MetricGroup": "cpi_breakdown", + "MetricName": "lsu_stall_reject_other_cpi" + }, + { + "BriefDescription": "Cycles stalled by LSU store forwarding", + "MetricExpr": "PM_CMPLU_STALL_ST_FWD / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "lsu_stall_st_fwd_cpi" + }, + { + "BriefDescription": "Cycles stalled by LSU Stores", + "MetricExpr": "PM_CMPLU_STALL_STORE / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "lsu_stall_store_cpi" + }, + { + "BriefDescription": "Cycles stalled by lightweight syncs", + "MetricExpr": "PM_CMPLU_STALL_LWSYNC / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "lwsync_stall_cpi" + }, + { + "MetricExpr": "PM_CMPLU_STALL_MEM_ECC_DELAY / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "mem_ecc_delay_stall_cpi" + }, + { + "BriefDescription": "Cycles stalled by nops (nothing next to finish)", + "MetricExpr": "PM_CMPLU_STALL_NO_NTF / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "no_ntf_stall_cpi" + }, + { + "MetricExpr": "PM_NTCG_ALL_FIN / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "ntcg_all_fin_cpi" + }, + { + "MetricExpr": "PM_CMPLU_STALL_NTCG_FLUSH / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "ntcg_flush_cpi" + }, + { + "BriefDescription": "Other thread block stall cycles", + "MetricExpr": "(PM_CMPLU_STALL_THRD - PM_CMPLU_STALL_LWSYNC - PM_CMPLU_STALL_HWSYNC - PM_CMPLU_STALL_MEM_ECC_DELAY - PM_CMPLU_STALL_FLUSH - PM_CMPLU_STALL_COQ_FULL) / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "other_block_stall_cpi" + }, + { + "BriefDescription": "Cycles unaccounted for", + "MetricExpr": "(PM_RUN_CYC / PM_RUN_INST_CMPL) - (PM_CMPLU_STALL / PM_RUN_INST_CMPL) - (PM_GCT_NOSLOT_CYC / PM_RUN_INST_CMPL) - (PM_NTCG_ALL_FIN / PM_RUN_INST_CMPL) - (PM_CMPLU_STALL_THRD / PM_RUN_INST_CMPL) - (PM_GRP_CMPL / PM_RUN_INST_CMPL)", + "MetricGroup": "cpi_breakdown", + "MetricName": "other_cpi" + }, + { + "BriefDescription": "Stall cycles unaccounted for", + "MetricExpr": "(PM_CMPLU_STALL / PM_RUN_INST_CMPL) - (PM_CMPLU_STALL_BRU_CRU / PM_RUN_INST_CMPL) - (PM_CMPLU_STALL_FXU / PM_RUN_INST_CMPL) - (PM_CMPLU_STALL_VSU / PM_RUN_INST_CMPL) - (PM_CMPLU_STALL_LSU / PM_RUN_INST_CMPL) - (PM_CMPLU_STALL_NTCG_FLUSH / PM_RUN_INST_CMPL) - (PM_CMPLU_STALL_NO_NTF / PM_RUN_INST_CMPL)", + "MetricGroup": "cpi_breakdown", + "MetricName": "other_stall_cpi" + }, + { + "BriefDescription": "Run cycles per run instruction", + "MetricExpr": "PM_RUN_CYC / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "run_cpi" + }, + { + "BriefDescription": "Completion Stall Cycles", + "MetricExpr": "PM_CMPLU_STALL / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "stall_cpi" + }, + { + "BriefDescription": "Cycles a thread was blocked", + "MetricExpr": "PM_CMPLU_STALL_THRD / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "thread_block_stall_cpi" + }, + { + "BriefDescription": "Cycles stalled by VSU", + "MetricExpr": "PM_CMPLU_STALL_VSU / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "vsu_stall_cpi" + }, + { + "BriefDescription": "Cycles stalled by other VSU Operations", + "MetricExpr": "(PM_CMPLU_STALL_VSU - PM_CMPLU_STALL_VECTOR - PM_CMPLU_STALL_SCALAR) / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "vsu_stall_other_cpi" + }, + { + "BriefDescription": "Cycles stalled by VSU Scalar Operations", + "MetricExpr": "PM_CMPLU_STALL_SCALAR / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "vsu_stall_scalar_cpi" + }, + { + "BriefDescription": "Cycles stalled by VSU Scalar Long Operations", + "MetricExpr": "PM_CMPLU_STALL_SCALAR_LONG / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "vsu_stall_scalar_long_cpi" + }, + { + "BriefDescription": "Cycles stalled by Other VSU Scalar Operations", + "MetricExpr": "(PM_CMPLU_STALL_SCALAR / PM_RUN_INST_CMPL) - (PM_CMPLU_STALL_SCALAR_LONG / PM_RUN_INST_CMPL)", + "MetricGroup": "cpi_breakdown", + "MetricName": "vsu_stall_scalar_other_cpi" + }, + { + "BriefDescription": "Cycles stalled by VSU Vector Operations", + "MetricExpr": "PM_CMPLU_STALL_VECTOR / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "vsu_stall_vector_cpi" + }, + { + "BriefDescription": "Cycles stalled by VSU Vector Long Operations", + "MetricExpr": "PM_CMPLU_STALL_VECTOR_LONG / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "vsu_stall_vector_long_cpi" + }, + { + "BriefDescription": "Cycles stalled by other VSU Vector Operations", + "MetricExpr": "(PM_CMPLU_STALL_VECTOR - PM_CMPLU_STALL_VECTOR_LONG) / PM_RUN_INST_CMPL", + "MetricGroup": "cpi_breakdown", + "MetricName": "vsu_stall_vector_other_cpi" + }, + { + "BriefDescription": "% of DL1 Reloads from Distant L2 or L3 (Modified) per Inst", + "MetricExpr": "PM_DATA_FROM_DL2L3_MOD * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "dl1_reloads_percent_per_inst", + "MetricName": "dl1_reload_from_dl2l3_mod_rate_percent" + }, + { + "BriefDescription": "% of DL1 Reloads from Distant L2 or L3 (Shared) per Inst", + "MetricExpr": "PM_DATA_FROM_DL2L3_SHR * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "dl1_reloads_percent_per_inst", + "MetricName": "dl1_reload_from_dl2l3_shr_rate_percent" + }, + { + "BriefDescription": "% of DL1 Reloads from Distant L4 per Inst", + "MetricExpr": "PM_DATA_FROM_DL4 * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "dl1_reloads_percent_per_inst", + "MetricName": "dl1_reload_from_dl4_rate_percent" + }, + { + "BriefDescription": "% of DL1 Reloads from Distant Memory per Inst", + "MetricExpr": "PM_DATA_FROM_DMEM * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "dl1_reloads_percent_per_inst", + "MetricName": "dl1_reload_from_dmem_rate_percent" + }, + { + "BriefDescription": "% of DL1 reloads from Private L2, other core per Inst", + "MetricExpr": "PM_DATA_FROM_L21_MOD * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "dl1_reloads_percent_per_inst", + "MetricName": "dl1_reload_from_l21_mod_rate_percent" + }, + { + "BriefDescription": "% of DL1 reloads from Private L2, other core per Inst", + "MetricExpr": "PM_DATA_FROM_L21_SHR * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "dl1_reloads_percent_per_inst", + "MetricName": "dl1_reload_from_l21_shr_rate_percent" + }, + { + "BriefDescription": "Percentage of L2 load hits per instruction where the L2 experienced a Load-Hit-Store conflict", + "MetricExpr": "PM_DATA_FROM_L2_DISP_CONFLICT_LDHITST * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "dl1_reloads_percent_per_inst", + "MetricName": "dl1_reload_from_l2_lhs_rate_percent" + }, + { + "BriefDescription": "% of DL1 reloads from L2 per Inst", + "MetricExpr": "PM_DATA_FROM_L2MISS * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "dl1_reloads_percent_per_inst", + "MetricName": "dl1_reload_from_l2_miss_rate_percent" + }, + { + "BriefDescription": "Percentage of L2 load hits per instruction where the L2 did not experience a conflict", + "MetricExpr": "PM_DATA_FROM_L2_NO_CONFLICT * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "dl1_reloads_percent_per_inst", + "MetricName": "dl1_reload_from_l2_no_conflict_rate_percent" + }, + { + "BriefDescription": "Percentage of L2 load hits per instruction where the L2 experienced some conflict other than Load-Hit-Store", + "MetricExpr": "PM_DATA_FROM_L2_DISP_CONFLICT_OTHER * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "dl1_reloads_percent_per_inst", + "MetricName": "dl1_reload_from_l2_other_conflict_rate_percent" + }, + { + "BriefDescription": "% of DL1 reloads from L2 per Inst", + "MetricExpr": "PM_DATA_FROM_L2 * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "dl1_reloads_percent_per_inst", + "MetricName": "dl1_reload_from_l2_rate_percent" + }, + { + "BriefDescription": "% of DL1 reloads from Private L3 M state, other core per Inst", + "MetricExpr": "PM_DATA_FROM_L31_MOD * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "dl1_reloads_percent_per_inst", + "MetricName": "dl1_reload_from_l31_mod_rate_percent" + }, + { + "BriefDescription": "% of DL1 reloads from Private L3 S tate, other core per Inst", + "MetricExpr": "PM_DATA_FROM_L31_SHR * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "dl1_reloads_percent_per_inst", + "MetricName": "dl1_reload_from_l31_shr_rate_percent" + }, + { + "BriefDescription": "Percentage of L3 load hits per instruction where the load collided with a pending prefetch", + "MetricExpr": "PM_DATA_FROM_L3_DISP_CONFLICT * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "dl1_reloads_percent_per_inst", + "MetricName": "dl1_reload_from_l3_conflict_rate_percent" + }, + { + "BriefDescription": "% of DL1 reloads from L3 per Inst", + "MetricExpr": "PM_DATA_FROM_L3MISS * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "dl1_reloads_percent_per_inst", + "MetricName": "dl1_reload_from_l3_miss_rate_percent" + }, + { + "BriefDescription": "Percentage of L3 load hits per instruction where the L3 did not experience a conflict", + "MetricExpr": "PM_DATA_FROM_L3_NO_CONFLICT * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "dl1_reloads_percent_per_inst", + "MetricName": "dl1_reload_from_l3_no_conflict_rate_percent" + }, + { + "BriefDescription": "% of DL1 Reloads from L3 per Inst", + "MetricExpr": "PM_DATA_FROM_L3 * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "dl1_reloads_percent_per_inst", + "MetricName": "dl1_reload_from_l3_rate_percent" + }, + { + "BriefDescription": "% of DL1 Reloads from Local L4 per Inst", + "MetricExpr": "PM_DATA_FROM_LL4 * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "dl1_reloads_percent_per_inst", + "MetricName": "dl1_reload_from_ll4_rate_percent" + }, + { + "BriefDescription": "% of DL1 Reloads from Local Memory per Inst", + "MetricExpr": "PM_DATA_FROM_LMEM * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "dl1_reloads_percent_per_inst", + "MetricName": "dl1_reload_from_lmem_rate_percent" + }, + { + "BriefDescription": "% of DL1 reloads from Private L3, other core per Inst", + "MetricExpr": "PM_DATA_FROM_RL2L3_MOD * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "dl1_reloads_percent_per_inst", + "MetricName": "dl1_reload_from_rl2l3_mod_rate_percent" + }, + { + "BriefDescription": "% of DL1 reloads from Private L3, other core per Inst", + "MetricExpr": "PM_DATA_FROM_RL2L3_SHR * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "dl1_reloads_percent_per_inst", + "MetricName": "dl1_reload_from_rl2l3_shr_rate_percent" + }, + { + "BriefDescription": "% of DL1 Reloads from Remote Memory per Inst", + "MetricExpr": "PM_DATA_FROM_RL4 * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "dl1_reloads_percent_per_inst", + "MetricName": "dl1_reload_from_rl4_rate_percent" + }, + { + "BriefDescription": "% of DL1 Reloads from Remote Memory per Inst", + "MetricExpr": "PM_DATA_FROM_RMEM * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "dl1_reloads_percent_per_inst", + "MetricName": "dl1_reload_from_rmem_rate_percent" + }, + { + "BriefDescription": "Percentage of L1 demand load misses per run instruction", + "MetricExpr": "PM_LD_MISS_L1 * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "dl1_reloads_percent_per_inst", + "MetricName": "l1_ld_miss_rate_percent" + }, + { + "BriefDescription": "% of DL1 misses that result in a cache reload", + "MetricExpr": "PM_L1_DCACHE_RELOAD_VALID * 100 / PM_LD_MISS_L1", + "MetricGroup": "dl1_reloads_percent_per_ref", + "MetricName": "dl1_miss_reloads_percent" + }, + { + "BriefDescription": "% of DL1 dL1_Reloads from Distant L2 or L3 (Modified)", + "MetricExpr": "PM_DATA_FROM_DL2L3_MOD * 100 / PM_L1_DCACHE_RELOAD_VALID", + "MetricGroup": "dl1_reloads_percent_per_ref", + "MetricName": "dl1_reload_from_dl2l3_mod_percent" + }, + { + "BriefDescription": "% of DL1 dL1_Reloads from Distant L2 or L3 (Shared)", + "MetricExpr": "PM_DATA_FROM_DL2L3_SHR * 100 / PM_L1_DCACHE_RELOAD_VALID", + "MetricGroup": "dl1_reloads_percent_per_ref", + "MetricName": "dl1_reload_from_dl2l3_shr_percent" + }, + { + "BriefDescription": "% of DL1 dL1_Reloads from Distant L4", + "MetricExpr": "PM_DATA_FROM_DL4 * 100 / PM_L1_DCACHE_RELOAD_VALID", + "MetricGroup": "dl1_reloads_percent_per_ref", + "MetricName": "dl1_reload_from_dl4_percent" + }, + { + "BriefDescription": "% of DL1 dL1_Reloads from Distant Memory", + "MetricExpr": "PM_DATA_FROM_DMEM * 100 / PM_L1_DCACHE_RELOAD_VALID", + "MetricGroup": "dl1_reloads_percent_per_ref", + "MetricName": "dl1_reload_from_dmem_percent" + }, + { + "BriefDescription": "% of DL1 reloads from Private L2, other core", + "MetricExpr": "PM_DATA_FROM_L21_MOD * 100 / PM_L1_DCACHE_RELOAD_VALID", + "MetricGroup": "dl1_reloads_percent_per_ref", + "MetricName": "dl1_reload_from_l21_mod_percent" + }, + { + "BriefDescription": "% of DL1 reloads from Private L2, other core", + "MetricExpr": "PM_DATA_FROM_L21_SHR * 100 / PM_L1_DCACHE_RELOAD_VALID", + "MetricGroup": "dl1_reloads_percent_per_ref", + "MetricName": "dl1_reload_from_l21_shr_percent" + }, + { + "BriefDescription": "Percentage of DL1 reloads from L2 with a Load-Hit-Store conflict", + "MetricExpr": "PM_DATA_FROM_L2_DISP_CONFLICT_LDHITST * 100 / PM_L1_DCACHE_RELOAD_VALID", + "MetricGroup": "dl1_reloads_percent_per_ref", + "MetricName": "dl1_reload_from_l2_lhs_percent" + }, + { + "BriefDescription": "Percentage of DL1 reloads from L2 with no conflicts", + "MetricExpr": "PM_DATA_FROM_L2_NO_CONFLICT * 100 / PM_L1_DCACHE_RELOAD_VALID", + "MetricGroup": "dl1_reloads_percent_per_ref", + "MetricName": "dl1_reload_from_l2_no_conflict_percent" + }, + { + "BriefDescription": "Percentage of DL1 reloads from L2 with some conflict other than Load-Hit-Store", + "MetricExpr": "PM_DATA_FROM_L2_DISP_CONFLICT_OTHER * 100 / PM_L1_DCACHE_RELOAD_VALID", + "MetricGroup": "dl1_reloads_percent_per_ref", + "MetricName": "dl1_reload_from_l2_other_conflict_percent" + }, + { + "BriefDescription": "% of DL1 reloads from L2", + "MetricExpr": "PM_DATA_FROM_L2 * 100 / PM_L1_DCACHE_RELOAD_VALID", + "MetricGroup": "dl1_reloads_percent_per_ref", + "MetricName": "dl1_reload_from_l2_percent" + }, + { + "BriefDescription": "% of DL1 reloads from Private L3, other core", + "MetricExpr": "PM_DATA_FROM_L31_MOD * 100 / PM_L1_DCACHE_RELOAD_VALID", + "MetricGroup": "dl1_reloads_percent_per_ref", + "MetricName": "dl1_reload_from_l31_mod_percent" + }, + { + "BriefDescription": "% of DL1 reloads from Private L3, other core", + "MetricExpr": "PM_DATA_FROM_L31_SHR * 100 / PM_L1_DCACHE_RELOAD_VALID", + "MetricGroup": "dl1_reloads_percent_per_ref", + "MetricName": "dl1_reload_from_l31_shr_percent" + }, + { + "BriefDescription": "Percentage of DL1 reloads from L3 where the load collided with a pending prefetch", + "MetricExpr": "PM_DATA_FROM_L3_DISP_CONFLICT * 100 / PM_L1_DCACHE_RELOAD_VALID", + "MetricGroup": "dl1_reloads_percent_per_ref", + "MetricName": "dl1_reload_from_l3_conflict_percent" + }, + { + "BriefDescription": "Percentage of L3 load hits per instruction where the line was brought into the L3 by a prefetch operation", + "MetricExpr": "PM_DATA_FROM_L3_MEPF * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "dl1_reloads_percent_per_ref", + "MetricName": "dl1_reload_from_l3_mepf_rate_percent" + }, + { + "BriefDescription": "Percentage of DL1 reloads from L3 without conflicts", + "MetricExpr": "PM_DATA_FROM_L3_NO_CONFLICT * 100 / PM_L1_DCACHE_RELOAD_VALID", + "MetricGroup": "dl1_reloads_percent_per_ref", + "MetricName": "dl1_reload_from_l3_no_conflict_percent" + }, + { + "BriefDescription": "% of DL1 Reloads from L3", + "MetricExpr": "PM_DATA_FROM_L3 * 100 / PM_L1_DCACHE_RELOAD_VALID", + "MetricGroup": "dl1_reloads_percent_per_ref", + "MetricName": "dl1_reload_from_l3_percent" + }, + { + "BriefDescription": "% of DL1 dL1_Reloads from Local L4", + "MetricExpr": "PM_DATA_FROM_LL4 * 100 / PM_L1_DCACHE_RELOAD_VALID", + "MetricGroup": "dl1_reloads_percent_per_ref", + "MetricName": "dl1_reload_from_ll4_percent" + }, + { + "BriefDescription": "% of DL1 dL1_Reloads from Local Memory", + "MetricExpr": "PM_DATA_FROM_LMEM * 100 / PM_L1_DCACHE_RELOAD_VALID", + "MetricGroup": "dl1_reloads_percent_per_ref", + "MetricName": "dl1_reload_from_lmem_percent" + }, + { + "BriefDescription": "% of DL1 dL1_Reloads from Remote L2 or L3 (Modified)", + "MetricExpr": "PM_DATA_FROM_RL2L3_MOD * 100 / PM_L1_DCACHE_RELOAD_VALID", + "MetricGroup": "dl1_reloads_percent_per_ref", + "MetricName": "dl1_reload_from_rl2l3_mod_percent" + }, + { + "BriefDescription": "% of DL1 dL1_Reloads from Remote L2 or L3 (Shared)", + "MetricExpr": "PM_DATA_FROM_RL2L3_SHR * 100 / PM_L1_DCACHE_RELOAD_VALID", + "MetricGroup": "dl1_reloads_percent_per_ref", + "MetricName": "dl1_reload_from_rl2l3_shr_percent" + }, + { + "BriefDescription": "% of DL1 dL1_Reloads from Remote L4", + "MetricExpr": "PM_DATA_FROM_RL4 * 100 / PM_L1_DCACHE_RELOAD_VALID", + "MetricGroup": "dl1_reloads_percent_per_ref", + "MetricName": "dl1_reload_from_rl4_percent" + }, + { + "BriefDescription": "% of DL1 dL1_Reloads from Remote Memory", + "MetricExpr": "PM_DATA_FROM_RMEM * 100 / PM_L1_DCACHE_RELOAD_VALID", + "MetricGroup": "dl1_reloads_percent_per_ref", + "MetricName": "dl1_reload_from_rmem_percent" + }, + { + "BriefDescription": "dL1 miss portion of CPI", + "MetricExpr": "( (PM_CMPLU_STALL_DCACHE_MISS / PM_RUN_INST_CMPL)/ (PM_RUN_CYC / PM_RUN_INST_CMPL)) * 100", + "MetricGroup": "estimated_dcache_miss_cpi", + "MetricName": "dcache_miss_cpi_percent" + }, + { + "BriefDescription": "estimate of dl2l3 distant MOD miss rates with measured DL2L3 MOD latency as a %of dcache miss cpi", + "MetricExpr": "(((PM_DATA_FROM_DL2L3_MOD / PM_RUN_INST_CMPL) * (PM_MRK_DATA_FROM_DL2L3_MOD_CYC/ PM_MRK_DATA_FROM_DL2L3_MOD)) / (PM_CMPLU_STALL_DCACHE_MISS / PM_RUN_INST_CMPL)) *100", + "MetricGroup": "estimated_dcache_miss_cpi", + "MetricName": "dl2l3_mod_cpi_percent" + }, + { + "BriefDescription": "estimate of dl2l3 distant SHR miss rates with measured DL2L3 SHR latency as a %of dcache miss cpi", + "MetricExpr": "(((PM_DATA_FROM_DL2L3_SHR / PM_RUN_INST_CMPL) * (PM_MRK_DATA_FROM_DL2L3_SHR_CYC/ PM_MRK_DATA_FROM_DL2L3_SHR)) / (PM_CMPLU_STALL_DCACHE_MISS / PM_RUN_INST_CMPL)) *100", + "MetricGroup": "estimated_dcache_miss_cpi", + "MetricName": "dl2l3_shr_cpi_percent" + }, + { + "BriefDescription": "estimate of distant L4 miss rates with measured DL4 latency as a %of dcache miss cpi", + "MetricExpr": "(((PM_DATA_FROM_DL4 / PM_RUN_INST_CMPL) * (PM_MRK_DATA_FROM_DL4_CYC/ PM_MRK_DATA_FROM_DL4)) / (PM_CMPLU_STALL_DCACHE_MISS / PM_RUN_INST_CMPL)) *100", + "MetricGroup": "estimated_dcache_miss_cpi", + "MetricName": "dl4_cpi_percent" + }, + { + "BriefDescription": "estimate of distant memory miss rates with measured DMEM latency as a %of dcache miss cpi", + "MetricExpr": "(((PM_DATA_FROM_DMEM / PM_RUN_INST_CMPL) * (PM_MRK_DATA_FROM_DMEM_CYC/ PM_MRK_DATA_FROM_DMEM)) / (PM_CMPLU_STALL_DCACHE_MISS / PM_RUN_INST_CMPL)) *100", + "MetricGroup": "estimated_dcache_miss_cpi", + "MetricName": "dmem_cpi_percent" + }, + { + "BriefDescription": "estimate of dl21 MOD miss rates with measured L21 MOD latency as a %of dcache miss cpi", + "MetricExpr": "(((PM_DATA_FROM_L21_MOD / PM_RUN_INST_CMPL) * (PM_MRK_DATA_FROM_L21_MOD_CYC/ PM_MRK_DATA_FROM_L21_MOD)) / (PM_CMPLU_STALL_DCACHE_MISS / PM_RUN_INST_CMPL)) *100", + "MetricGroup": "estimated_dcache_miss_cpi", + "MetricName": "l21_mod_cpi_percent" + }, + { + "BriefDescription": "estimate of dl21 SHR miss rates with measured L21 SHR latency as a %of dcache miss cpi", + "MetricExpr": "(((PM_DATA_FROM_L21_SHR / PM_RUN_INST_CMPL) * (PM_MRK_DATA_FROM_L21_SHR_CYC/ PM_MRK_DATA_FROM_L21_SHR)) / (PM_CMPLU_STALL_DCACHE_MISS / PM_RUN_INST_CMPL)) *100", + "MetricGroup": "estimated_dcache_miss_cpi", + "MetricName": "l21_shr_cpi_percent" + }, + { + "BriefDescription": "estimate of dl2 miss rates with measured L2 latency as a %of dcache miss cpi", + "MetricExpr": "(((PM_DATA_FROM_L2 / PM_RUN_INST_CMPL) * (PM_MRK_DATA_FROM_L2_CYC/ PM_MRK_DATA_FROM_L2)) / (PM_CMPLU_STALL_DCACHE_MISS / PM_RUN_INST_CMPL) ) *100", + "MetricGroup": "estimated_dcache_miss_cpi", + "MetricName": "l2_cpi_percent" + }, + { + "BriefDescription": "estimate of dl31 MOD miss rates with measured L31 MOD latency as a %of dcache miss cpi", + "MetricExpr": "(((PM_DATA_FROM_L31_MOD / PM_RUN_INST_CMPL) * (PM_MRK_DATA_FROM_L31_MOD_CYC/ PM_MRK_DATA_FROM_L31_MOD)) / (PM_CMPLU_STALL_DCACHE_MISS / PM_RUN_INST_CMPL)) *100", + "MetricGroup": "estimated_dcache_miss_cpi", + "MetricName": "l31_mod_cpi_percent" + }, + { + "BriefDescription": "estimate of dl31 SHR miss rates with measured L31 SHR latency as a %of dcache miss cpi", + "MetricExpr": "(((PM_DATA_FROM_L31_SHR / PM_RUN_INST_CMPL) * (PM_MRK_DATA_FROM_L31_SHR_CYC/ PM_MRK_DATA_FROM_L31_SHR)) / (PM_CMPLU_STALL_DCACHE_MISS / PM_RUN_INST_CMPL)) *100", + "MetricGroup": "estimated_dcache_miss_cpi", + "MetricName": "l31_shr_cpi_percent" + }, + { + "BriefDescription": "estimate of dl3 miss rates with measured L3 latency as a % of dcache miss cpi", + "MetricExpr": "(((PM_DATA_FROM_L3 / PM_RUN_INST_CMPL) * (PM_MRK_DATA_FROM_L3_CYC/ PM_MRK_DATA_FROM_L3)) / (PM_CMPLU_STALL_DCACHE_MISS / PM_RUN_INST_CMPL)) * 100", + "MetricGroup": "estimated_dcache_miss_cpi", + "MetricName": "l3_cpi_percent" + }, + { + "BriefDescription": "estimate of Local L4 miss rates with measured LL4 latency as a %of dcache miss cpi", + "MetricExpr": "(((PM_DATA_FROM_LL4 / PM_RUN_INST_CMPL) * (PM_MRK_DATA_FROM_LL4_CYC/ PM_MRK_DATA_FROM_LL4)) / (PM_CMPLU_STALL_DCACHE_MISS / PM_RUN_INST_CMPL)) *100", + "MetricGroup": "estimated_dcache_miss_cpi", + "MetricName": "ll4_cpi_percent" + }, + { + "BriefDescription": "estimate of Local memory miss rates with measured LMEM latency as a %of dcache miss cpi", + "MetricExpr": "(((PM_DATA_FROM_LMEM / PM_RUN_INST_CMPL) * (PM_MRK_DATA_FROM_LMEM_CYC/ PM_MRK_DATA_FROM_LMEM)) / (PM_CMPLU_STALL_DCACHE_MISS / PM_RUN_INST_CMPL)) *100", + "MetricGroup": "estimated_dcache_miss_cpi", + "MetricName": "lmem_cpi_percent" + }, + { + "BriefDescription": "estimate of dl2l3 remote MOD miss rates with measured RL2L3 MOD latency as a %of dcache miss cpi", + "MetricExpr": "(((PM_DATA_FROM_RL2L3_MOD / PM_RUN_INST_CMPL) * (PM_MRK_DATA_FROM_RL2L3_MOD_CYC/ PM_MRK_DATA_FROM_RL2L3_MOD)) / (PM_CMPLU_STALL_DCACHE_MISS / PM_RUN_INST_CMPL)) *100", + "MetricGroup": "estimated_dcache_miss_cpi", + "MetricName": "rl2l3_mod_cpi_percent" + }, + { + "BriefDescription": "estimate of dl2l3 shared miss rates with measured RL2L3 SHR latency as a %of dcache miss cpi", + "MetricExpr": "(((PM_DATA_FROM_RL2L3_SHR / PM_RUN_INST_CMPL) * (PM_MRK_DATA_FROM_RL2L3_SHR_CYC/ PM_MRK_DATA_FROM_RL2L3_SHR)) / (PM_CMPLU_STALL_DCACHE_MISS / PM_RUN_INST_CMPL)) * 100", + "MetricGroup": "estimated_dcache_miss_cpi", + "MetricName": "rl2l3_shr_cpi_percent" + }, + { + "BriefDescription": "estimate of remote L4 miss rates with measured RL4 latency as a %of dcache miss cpi", + "MetricExpr": "(((PM_DATA_FROM_RL4 / PM_RUN_INST_CMPL) * (PM_MRK_DATA_FROM_RL4_CYC/ PM_MRK_DATA_FROM_RL4)) / (PM_CMPLU_STALL_DCACHE_MISS / PM_RUN_INST_CMPL)) *100", + "MetricGroup": "estimated_dcache_miss_cpi", + "MetricName": "rl4_cpi_percent" + }, + { + "BriefDescription": "estimate of remote memory miss rates with measured RMEM latency as a %of dcache miss cpi", + "MetricExpr": "(((PM_DATA_FROM_RMEM / PM_RUN_INST_CMPL) * (PM_MRK_DATA_FROM_RMEM_CYC/ PM_MRK_DATA_FROM_RMEM)) / (PM_CMPLU_STALL_DCACHE_MISS / PM_RUN_INST_CMPL)) *100", + "MetricGroup": "estimated_dcache_miss_cpi", + "MetricName": "rmem_cpi_percent" + }, + { + "BriefDescription": "Branch Mispredict flushes per instruction", + "MetricExpr": "PM_FLUSH_BR_MPRED / PM_RUN_INST_CMPL * 100", + "MetricGroup": "general", + "MetricName": "br_mpred_flush_rate_percent" + }, + { + "BriefDescription": "Cycles per instruction", + "MetricExpr": "PM_CYC / PM_INST_CMPL", + "MetricGroup": "general", + "MetricName": "cpi" + }, + { + "BriefDescription": "Percentage Cycles a group completed", + "MetricExpr": "PM_GRP_CMPL / PM_CYC * 100", + "MetricGroup": "general", + "MetricName": "cyc_grp_completed_percent" + }, + { + "BriefDescription": "Percentage Cycles a group dispatched", + "MetricExpr": "PM_1PLUS_PPC_DISP / PM_CYC * 100", + "MetricGroup": "general", + "MetricName": "cyc_grp_dispatched_percent" + }, + { + "BriefDescription": "Cycles per group", + "MetricExpr": "PM_CYC / PM_1PLUS_PPC_CMPL", + "MetricGroup": "general", + "MetricName": "cyc_per_group" + }, + { + "BriefDescription": "GCT empty cycles", + "MetricExpr": "(PM_FLUSH_DISP / PM_RUN_INST_CMPL) * 100", + "MetricGroup": "general", + "MetricName": "disp_flush_rate_percent" + }, + { + "BriefDescription": "% DTLB miss rate per inst", + "MetricExpr": "PM_DTLB_MISS / PM_RUN_INST_CMPL *100", + "MetricGroup": "general", + "MetricName": "dtlb_miss_rate_percent" + }, + { + "BriefDescription": "Flush rate (%)", + "MetricExpr": "PM_FLUSH * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "general", + "MetricName": "flush_rate_percent" + }, + { + "BriefDescription": "GCT slot utilization (11 to 14) as a % of cycles this thread had atleast 1 slot valid", + "MetricExpr": "PM_GCT_UTIL_11_14_ENTRIES / ( PM_RUN_CYC - PM_GCT_NOSLOT_CYC) * 100", + "MetricGroup": "general", + "MetricName": "gct_util_11to14_slots_percent" + }, + { + "BriefDescription": "GCT slot utilization (15 to 17) as a % of cycles this thread had atleast 1 slot valid", + "MetricExpr": "PM_GCT_UTIL_15_17_ENTRIES / ( PM_RUN_CYC - PM_GCT_NOSLOT_CYC) * 100", + "MetricGroup": "general", + "MetricName": "gct_util_15to17_slots_percent" + }, + { + "BriefDescription": "GCT slot utilization 18+ as a % of cycles this thread had atleast 1 slot valid", + "MetricExpr": "PM_GCT_UTIL_18_ENTRIES / ( PM_RUN_CYC - PM_GCT_NOSLOT_CYC) * 100", + "MetricGroup": "general", + "MetricName": "gct_util_18plus_slots_percent" + }, + { + "BriefDescription": "GCT slot utilization (1 to 2) as a % of cycles this thread had atleast 1 slot valid", + "MetricExpr": "PM_GCT_UTIL_1_2_ENTRIES / ( PM_RUN_CYC - PM_GCT_NOSLOT_CYC) * 100", + "MetricGroup": "general", + "MetricName": "gct_util_1to2_slots_percent" + }, + { + "BriefDescription": "GCT slot utilization (3 to 6) as a % of cycles this thread had atleast 1 slot valid", + "MetricExpr": "PM_GCT_UTIL_3_6_ENTRIES / ( PM_RUN_CYC - PM_GCT_NOSLOT_CYC) * 100", + "MetricGroup": "general", + "MetricName": "gct_util_3to6_slots_percent" + }, + { + "BriefDescription": "GCT slot utilization (7 to 10) as a % of cycles this thread had atleast 1 slot valid", + "MetricExpr": "PM_GCT_UTIL_7_10_ENTRIES / ( PM_RUN_CYC - PM_GCT_NOSLOT_CYC) * 100", + "MetricGroup": "general", + "MetricName": "gct_util_7to10_slots_percent" + }, + { + "BriefDescription": "Avg. group size", + "MetricExpr": "PM_INST_CMPL / PM_1PLUS_PPC_CMPL", + "MetricGroup": "general", + "MetricName": "group_size" + }, + { + "BriefDescription": "Instructions per group", + "MetricExpr": "PM_INST_CMPL / PM_1PLUS_PPC_CMPL", + "MetricGroup": "general", + "MetricName": "inst_per_group" + }, + { + "BriefDescription": "Instructions per cycles", + "MetricExpr": "PM_INST_CMPL / PM_CYC", + "MetricGroup": "general", + "MetricName": "ipc" + }, + { + "BriefDescription": "% ITLB miss rate per inst", + "MetricExpr": "PM_ITLB_MISS / PM_RUN_INST_CMPL *100", + "MetricGroup": "general", + "MetricName": "itlb_miss_rate_percent" + }, + { + "BriefDescription": "Percentage of L1 load misses per L1 load ref", + "MetricExpr": "PM_LD_MISS_L1 / PM_LD_REF_L1 * 100", + "MetricGroup": "general", + "MetricName": "l1_ld_miss_ratio_percent" + }, + { + "BriefDescription": "Percentage of L1 store misses per run instruction", + "MetricExpr": "PM_ST_MISS_L1 * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "general", + "MetricName": "l1_st_miss_rate_percent" + }, + { + "BriefDescription": "Percentage of L1 store misses per L1 store ref", + "MetricExpr": "PM_ST_MISS_L1 / PM_ST_FIN * 100", + "MetricGroup": "general", + "MetricName": "l1_st_miss_ratio_percent" + }, + { + "BriefDescription": "L2 Instruction Miss Rate (per instruction)(%)", + "MetricExpr": "PM_INST_FROM_L2MISS * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "general", + "MetricName": "l2_inst_miss_rate_percent" + }, + { + "BriefDescription": "L2 dmand Load Miss Rate (per run instruction)(%)", + "MetricExpr": "PM_DATA_FROM_L2MISS * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "general", + "MetricName": "l2_ld_miss_rate_percent" + }, + { + "BriefDescription": "L2 PTEG Miss Rate (per run instruction)(%)", + "MetricExpr": "PM_DPTEG_FROM_L2MISS * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "general", + "MetricName": "l2_pteg_miss_rate_percent" + }, + { + "BriefDescription": "Percentage of L2 store misses per run instruction", + "MetricExpr": "PM_ST_MISS_L1 * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "general", + "MetricName": "l2_st_miss_rate_percent" + }, + { + "BriefDescription": "L3 Instruction Miss Rate (per instruction)(%)", + "MetricExpr": "PM_INST_FROM_L3MISS * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "general", + "MetricName": "l3_inst_miss_rate_percent" + }, + { + "BriefDescription": "L3 demand Load Miss Rate (per run instruction)(%)", + "MetricExpr": "PM_DATA_FROM_L3MISS * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "general", + "MetricName": "l3_ld_miss_rate_percent" + }, + { + "BriefDescription": "L3 PTEG Miss Rate (per run instruction)(%)", + "MetricExpr": "PM_DPTEG_FROM_L3MISS * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "general", + "MetricName": "l3_pteg_miss_rate_percent" + }, + { + "BriefDescription": "Run cycles per cycle", + "MetricExpr": "PM_RUN_CYC / PM_CYC*100", + "MetricGroup": "general", + "MetricName": "run_cycles_percent" + }, + { + "BriefDescription": "Percentage of cycles spent in SMT2 Mode", + "MetricExpr": "(PM_RUN_CYC_SMT2_MODE/PM_RUN_CYC) * 100", + "MetricGroup": "general", + "MetricName": "smt2_cycles_percent" + }, + { + "BriefDescription": "Percentage of cycles spent in SMT4 Mode", + "MetricExpr": "(PM_RUN_CYC_SMT4_MODE/PM_RUN_CYC) * 100", + "MetricGroup": "general", + "MetricName": "smt4_cycles_percent" + }, + { + "BriefDescription": "Percentage of cycles spent in SMT8 Mode", + "MetricExpr": "(PM_RUN_CYC_SMT8_MODE/PM_RUN_CYC) * 100", + "MetricGroup": "general", + "MetricName": "smt8_cycles_percent" + }, + { + "BriefDescription": "IPC of all instructions completed by the core while this thread was stalled", + "MetricExpr": "PM_CMPLU_STALL_OTHER_CMPL/PM_RUN_CYC", + "MetricGroup": "general", + "MetricName": "smt_benefit" + }, + { + "BriefDescription": "Instruction dispatch-to-completion ratio", + "MetricExpr": "PM_INST_DISP / PM_INST_CMPL", + "MetricGroup": "general", + "MetricName": "speculation" + }, + { + "BriefDescription": "Percentage of cycles spent in Single Thread Mode", + "MetricExpr": "(PM_RUN_CYC_ST_MODE/PM_RUN_CYC) * 100", + "MetricGroup": "general", + "MetricName": "st_cycles_percent" + }, + { + "BriefDescription": "% of ICache reloads from Distant L2 or L3 (Modified) per Inst", + "MetricExpr": "PM_INST_FROM_DL2L3_MOD * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "instruction_misses_percent_per_inst", + "MetricName": "inst_from_dl2l3_mod_rate_percent" + }, + { + "BriefDescription": "% of ICache reloads from Distant L2 or L3 (Shared) per Inst", + "MetricExpr": "PM_INST_FROM_DL2L3_SHR * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "instruction_misses_percent_per_inst", + "MetricName": "inst_from_dl2l3_shr_rate_percent" + }, + { + "BriefDescription": "% of ICache reloads from Distant L4 per Inst", + "MetricExpr": "PM_INST_FROM_DL4 * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "instruction_misses_percent_per_inst", + "MetricName": "inst_from_dl4_rate_percent" + }, + { + "BriefDescription": "% of ICache reloads from Distant Memory per Inst", + "MetricExpr": "PM_INST_FROM_DMEM * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "instruction_misses_percent_per_inst", + "MetricName": "inst_from_dmem_rate_percent" + }, + { + "BriefDescription": "% of ICache reloads from Private L2, other core per Inst", + "MetricExpr": "PM_INST_FROM_L21_MOD * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "instruction_misses_percent_per_inst", + "MetricName": "inst_from_l21_mod_rate_percent" + }, + { + "BriefDescription": "% of ICache reloads from Private L2, other core per Inst", + "MetricExpr": "PM_INST_FROM_L21_SHR * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "instruction_misses_percent_per_inst", + "MetricName": "inst_from_l21_shr_rate_percent" + }, + { + "BriefDescription": "% of ICache reloads from L2 per Inst", + "MetricExpr": "PM_INST_FROM_L2 * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "instruction_misses_percent_per_inst", + "MetricName": "inst_from_l2_rate_percent" + }, + { + "BriefDescription": "% of ICache reloads from Private L3, other core per Inst", + "MetricExpr": "PM_INST_FROM_L31_MOD * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "instruction_misses_percent_per_inst", + "MetricName": "inst_from_l31_mod_rate_percent" + }, + { + "BriefDescription": "% of ICache reloads from Private L3 other core per Inst", + "MetricExpr": "PM_INST_FROM_L31_SHR * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "instruction_misses_percent_per_inst", + "MetricName": "inst_from_l31_shr_rate_percent" + }, + { + "BriefDescription": "% of ICache reloads from L3 per Inst", + "MetricExpr": "PM_INST_FROM_L3 * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "instruction_misses_percent_per_inst", + "MetricName": "inst_from_l3_rate_percent" + }, + { + "BriefDescription": "% of ICache reloads from Local L4 per Inst", + "MetricExpr": "PM_INST_FROM_LL4 * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "instruction_misses_percent_per_inst", + "MetricName": "inst_from_ll4_rate_percent" + }, + { + "BriefDescription": "% of ICache reloads from Local Memory per Inst", + "MetricExpr": "PM_INST_FROM_LMEM * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "instruction_misses_percent_per_inst", + "MetricName": "inst_from_lmem_rate_percent" + }, + { + "BriefDescription": "% of ICache reloads from Remote L2 or L3 (Modified) per Inst", + "MetricExpr": "PM_INST_FROM_RL2L3_MOD * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "instruction_misses_percent_per_inst", + "MetricName": "inst_from_rl2l3_mod_rate_percent" + }, + { + "BriefDescription": "% of ICache reloads from Remote L2 or L3 (Shared) per Inst", + "MetricExpr": "PM_INST_FROM_RL2L3_SHR * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "instruction_misses_percent_per_inst", + "MetricName": "inst_from_rl2l3_shr_rate_percent" + }, + { + "BriefDescription": "% of ICache reloads from Remote L4 per Inst", + "MetricExpr": "PM_INST_FROM_RL4 * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "instruction_misses_percent_per_inst", + "MetricName": "inst_from_rl4_rate_percent" + }, + { + "BriefDescription": "% of ICache reloads from Remote Memory per Inst", + "MetricExpr": "PM_INST_FROM_RMEM * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "instruction_misses_percent_per_inst", + "MetricName": "inst_from_rmem_rate_percent" + }, + { + "BriefDescription": "Instruction Cache Miss Rate (Per run Instruction)(%)", + "MetricExpr": "PM_L1_ICACHE_MISS * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "instruction_misses_percent_per_inst", + "MetricName": "l1_inst_miss_rate_percent" + }, + { + "BriefDescription": "% Branches per instruction", + "MetricExpr": "PM_BRU_FIN / PM_RUN_INST_CMPL", + "MetricGroup": "instruction_mix", + "MetricName": "branches_per_inst" + }, + { + "BriefDescription": "Total Fixed point operations", + "MetricExpr": "(PM_FXU0_FIN + PM_FXU1_FIN)/PM_RUN_INST_CMPL", + "MetricGroup": "instruction_mix", + "MetricName": "fixed_per_inst" + }, + { + "BriefDescription": "FXU0 balance", + "MetricExpr": "PM_FXU0_FIN / (PM_FXU0_FIN + PM_FXU1_FIN)", + "MetricGroup": "instruction_mix", + "MetricName": "fxu0_balance" + }, + { + "BriefDescription": "Fraction of cycles that FXU0 is in use", + "MetricExpr": "PM_FXU0_FIN / PM_RUN_CYC", + "MetricGroup": "instruction_mix", + "MetricName": "fxu0_fin" + }, + { + "BriefDescription": "FXU0 only Busy", + "MetricExpr": "PM_FXU0_BUSY_FXU1_IDLE / PM_CYC", + "MetricGroup": "instruction_mix", + "MetricName": "fxu0_only_busy" + }, + { + "BriefDescription": "Fraction of cycles that FXU1 is in use", + "MetricExpr": "PM_FXU1_FIN / PM_RUN_CYC", + "MetricGroup": "instruction_mix", + "MetricName": "fxu1_fin" + }, + { + "BriefDescription": "FXU1 only Busy", + "MetricExpr": "PM_FXU1_BUSY_FXU0_IDLE / PM_CYC", + "MetricGroup": "instruction_mix", + "MetricName": "fxu1_only_busy" + }, + { + "BriefDescription": "Both FXU Busy", + "MetricExpr": "PM_FXU_BUSY / PM_CYC", + "MetricGroup": "instruction_mix", + "MetricName": "fxu_both_busy" + }, + { + "BriefDescription": "Both FXU Idle", + "MetricExpr": "PM_FXU_IDLE / PM_CYC", + "MetricGroup": "instruction_mix", + "MetricName": "fxu_both_idle" + }, + { + "BriefDescription": "PCT instruction loads", + "MetricExpr": "PM_LD_REF_L1 / PM_RUN_INST_CMPL", + "MetricGroup": "instruction_mix", + "MetricName": "loads_per_inst" + }, + { + "BriefDescription": "PCT instruction stores", + "MetricExpr": "PM_ST_FIN / PM_RUN_INST_CMPL", + "MetricGroup": "instruction_mix", + "MetricName": "stores_per_inst" + }, + { + "BriefDescription": "Icache Fetchs per Icache Miss", + "MetricExpr": "(PM_L1_ICACHE_MISS - PM_IC_PREF_WRITE) / PM_L1_ICACHE_MISS", + "MetricGroup": "instruction_stats_percent_per_ref", + "MetricName": "icache_miss_reload" + }, + { + "BriefDescription": "% of ICache reloads due to prefetch", + "MetricExpr": "PM_IC_PREF_WRITE * 100 / PM_L1_ICACHE_MISS", + "MetricGroup": "instruction_stats_percent_per_ref", + "MetricName": "icache_pref_percent" + }, + { + "BriefDescription": "% of ICache reloads from Distant L2 or L3 (Modified)", + "MetricExpr": "PM_INST_FROM_DL2L3_MOD * 100 / PM_L1_ICACHE_MISS", + "MetricGroup": "instruction_stats_percent_per_ref", + "MetricName": "inst_from_dl2l3_mod_percent" + }, + { + "BriefDescription": "% of ICache reloads from Distant L2 or L3 (Shared)", + "MetricExpr": "PM_INST_FROM_DL2L3_SHR * 100 / PM_L1_ICACHE_MISS", + "MetricGroup": "instruction_stats_percent_per_ref", + "MetricName": "inst_from_dl2l3_shr_percent" + }, + { + "BriefDescription": "% of ICache reloads from Distant L4", + "MetricExpr": "PM_INST_FROM_DL4 * 100 / PM_L1_ICACHE_MISS", + "MetricGroup": "instruction_stats_percent_per_ref", + "MetricName": "inst_from_dl4_percent" + }, + { + "BriefDescription": "% of ICache reloads from Distant Memory", + "MetricExpr": "PM_INST_FROM_DMEM * 100 / PM_L1_ICACHE_MISS", + "MetricGroup": "instruction_stats_percent_per_ref", + "MetricName": "inst_from_dmem_percent" + }, + { + "BriefDescription": "% of ICache reloads from Private L2, other core", + "MetricExpr": "PM_INST_FROM_L21_MOD * 100 / PM_L1_ICACHE_MISS", + "MetricGroup": "instruction_stats_percent_per_ref", + "MetricName": "inst_from_l21_mod_percent" + }, + { + "BriefDescription": "% of ICache reloads from Private L2, other core", + "MetricExpr": "PM_INST_FROM_L21_SHR * 100 / PM_L1_ICACHE_MISS", + "MetricGroup": "instruction_stats_percent_per_ref", + "MetricName": "inst_from_l21_shr_percent" + }, + { + "BriefDescription": "% of ICache reloads from L2", + "MetricExpr": "PM_INST_FROM_L2 * 100 / PM_L1_ICACHE_MISS", + "MetricGroup": "instruction_stats_percent_per_ref", + "MetricName": "inst_from_l2_percent" + }, + { + "BriefDescription": "% of ICache reloads from Private L3, other core", + "MetricExpr": "PM_INST_FROM_L31_MOD * 100 / PM_L1_ICACHE_MISS", + "MetricGroup": "instruction_stats_percent_per_ref", + "MetricName": "inst_from_l31_mod_percent" + }, + { + "BriefDescription": "% of ICache reloads from Private L3, other core", + "MetricExpr": "PM_INST_FROM_L31_SHR * 100 / PM_L1_ICACHE_MISS", + "MetricGroup": "instruction_stats_percent_per_ref", + "MetricName": "inst_from_l31_shr_percent" + }, + { + "BriefDescription": "% of ICache reloads from L3", + "MetricExpr": "PM_INST_FROM_L3 * 100 / PM_L1_ICACHE_MISS", + "MetricGroup": "instruction_stats_percent_per_ref", + "MetricName": "inst_from_l3_percent" + }, + { + "BriefDescription": "% of ICache reloads from Local L4", + "MetricExpr": "PM_INST_FROM_LL4 * 100 / PM_L1_ICACHE_MISS", + "MetricGroup": "instruction_stats_percent_per_ref", + "MetricName": "inst_from_ll4_percent" + }, + { + "BriefDescription": "% of ICache reloads from Local Memory", + "MetricExpr": "PM_INST_FROM_LMEM * 100 / PM_L1_ICACHE_MISS", + "MetricGroup": "instruction_stats_percent_per_ref", + "MetricName": "inst_from_lmem_percent" + }, + { + "BriefDescription": "% of ICache reloads from Remote L2 or L3 (Modified)", + "MetricExpr": "PM_INST_FROM_RL2L3_MOD * 100 / PM_L1_ICACHE_MISS", + "MetricGroup": "instruction_stats_percent_per_ref", + "MetricName": "inst_from_rl2l3_mod_percent" + }, + { + "BriefDescription": "% of ICache reloads from Remote L2 or L3 (Shared)", + "MetricExpr": "PM_INST_FROM_RL2L3_SHR * 100 / PM_L1_ICACHE_MISS", + "MetricGroup": "instruction_stats_percent_per_ref", + "MetricName": "inst_from_rl2l3_shr_percent" + }, + { + "BriefDescription": "% of ICache reloads from Remote L4", + "MetricExpr": "PM_INST_FROM_RL4 * 100 / PM_L1_ICACHE_MISS", + "MetricGroup": "instruction_stats_percent_per_ref", + "MetricName": "inst_from_rl4_percent" + }, + { + "BriefDescription": "% of ICache reloads from Remote Memory", + "MetricExpr": "PM_INST_FROM_RMEM * 100 / PM_L1_ICACHE_MISS", + "MetricGroup": "instruction_stats_percent_per_ref", + "MetricName": "inst_from_rmem_percent" + }, + { + "BriefDescription": "Average number of stores that gather in the store buffer before being sent to an L2 RC machine", + "MetricExpr": "PM_ST_CMPL / (PM_L2_ST / 2)", + "MetricGroup": "l2_stats", + "MetricName": "avg_stores_gathered" + }, + { + "BriefDescription": "L2 Store misses as a % of total L2 Store dispatches (per thread)", + "MetricExpr": "PM_L2_ST_MISS / PM_L2_ST * 100", + "MetricGroup": "l2_stats", + "MetricName": "l2_st_miss_ratio_percent" + }, + { + "BriefDescription": "Percentage of L2 store misses per drained store. A drained store may contain multiple individual stores if they target the same line", + "MetricExpr": "PM_L2_ST_MISS / (PM_L2_ST / 2)", + "MetricGroup": "l2_stats", + "MetricName": "l2_store_miss_ratio_percent" + }, + { + "BriefDescription": "average L1 miss latency using marked events", + "MetricExpr": "PM_MRK_LD_MISS_L1_CYC / PM_MRK_LD_MISS_L1", + "MetricGroup": "latency", + "MetricName": "average_dl1miss_latency" + }, + { + "BriefDescription": "Average icache miss latency", + "MetricExpr": "(PM_IC_DEMAND_CYC / PM_IC_DEMAND_REQ)", + "MetricGroup": "latency", + "MetricName": "average_il1_miss_latency" + }, + { + "BriefDescription": "average service time for SYNC", + "MetricExpr": "PM_LSU_SRQ_SYNC_CYC / PM_LSU_SRQ_SYNC", + "MetricGroup": "latency", + "MetricName": "average_sync_cyc" + }, + { + "BriefDescription": "Cycles LMQ slot0 was active on an average", + "MetricExpr": "PM_LSU_LMQ_S0_VALID / PM_LSU_LMQ_S0_ALLOC", + "MetricGroup": "latency", + "MetricName": "avg_lmq_life_time" + }, + { + "BriefDescription": "Average number of cycles LRQ stays active for one load. Slot 0 is VALID ONLY FOR EVEN THREADS", + "MetricExpr": "PM_LSU_LRQ_S0_VALID / PM_LSU_LRQ_S0_ALLOC", + "MetricGroup": "latency", + "MetricName": "avg_lrq_life_time_even" + }, + { + "BriefDescription": "Average number of cycles LRQ stays active for one load. Slot 43 is valid ONLY FOR ODD THREADS", + "MetricExpr": "PM_LSU_LRQ_S43_VALID / PM_LSU_LRQ_S43_ALLOC", + "MetricGroup": "latency", + "MetricName": "avg_lrq_life_time_odd" + }, + { + "BriefDescription": "Average number of cycles SRQ stays active for one load. Slot 0 is VALID ONLY FOR EVEN THREADS", + "MetricExpr": "PM_LSU_SRQ_S0_VALID / PM_LSU_SRQ_S0_ALLOC", + "MetricGroup": "latency", + "MetricName": "avg_srq_life_time_even" + }, + { + "BriefDescription": "Average number of cycles SRQ stays active for one load. Slot 39 is valid ONLY FOR ODD THREADS", + "MetricExpr": "PM_LSU_SRQ_S39_VALID / PM_LSU_SRQ_S39_ALLOC", + "MetricGroup": "latency", + "MetricName": "avg_srq_life_time_odd" + }, + { + "BriefDescription": "Marked background kill latency, measured in L2", + "MetricExpr": "PM_MRK_FAB_RSP_BKILL_CYC / PM_MRK_FAB_RSP_BKILL", + "MetricGroup": "latency", + "MetricName": "bkill_latency" + }, + { + "BriefDescription": "Marked dclaim latency, measured in L2", + "MetricExpr": "PM_MRK_FAB_RSP_DCLAIM_CYC / PM_MRK_FAB_RSP_DCLAIM", + "MetricGroup": "latency", + "MetricName": "dclaim_latency" + }, + { + "BriefDescription": "Marked L2L3 remote Load latency", + "MetricExpr": "PM_MRK_DATA_FROM_DL2L3_MOD_CYC/ PM_MRK_DATA_FROM_DL2L3_MOD", + "MetricGroup": "latency", + "MetricName": "dl2l3_mod_latency" + }, + { + "BriefDescription": "Marked L2L3 distant Load latency", + "MetricExpr": "PM_MRK_DATA_FROM_DL2L3_SHR_CYC/ PM_MRK_DATA_FROM_DL2L3_SHR", + "MetricGroup": "latency", + "MetricName": "dl2l3_shr_latency" + }, + { + "BriefDescription": "Distant L4 average load latency", + "MetricExpr": "PM_MRK_DATA_FROM_DL4_CYC/ PM_MRK_DATA_FROM_DL4", + "MetricGroup": "latency", + "MetricName": "dl4_latency" + }, + { + "BriefDescription": "Marked Dmem Load latency", + "MetricExpr": "PM_MRK_DATA_FROM_DMEM_CYC/ PM_MRK_DATA_FROM_DMEM", + "MetricGroup": "latency", + "MetricName": "dmem_latency" + }, + { + "BriefDescription": "estimated exposed miss latency for dL1 misses, ie load miss when we were NTC", + "MetricExpr": "PM_MRK_LD_MISS_EXPOSED_CYC / PM_MRK_LD_MISS_EXPOSED", + "MetricGroup": "latency", + "MetricName": "exposed_dl1miss_latency" + }, + { + "BriefDescription": "Average load latency for all marked demand loads that came from L2.1 in the M state", + "MetricExpr": "PM_MRK_DATA_FROM_L21_MOD_CYC/ PM_MRK_DATA_FROM_L21_MOD", + "MetricGroup": "latency", + "MetricName": "l21_mod_latency" + }, + { + "BriefDescription": "Average load latency for all marked demand loads that came from L2.1 in the S state", + "MetricExpr": "PM_MRK_DATA_FROM_L21_SHR_CYC/ PM_MRK_DATA_FROM_L21_SHR", + "MetricGroup": "latency", + "MetricName": "l21_shr_latency" + }, + { + "BriefDescription": "Average load latency for all marked demand loads that came from the L2 and suffered a conflict at RC machine dispatch time due to load-hit-store", + "MetricExpr": "PM_MRK_DATA_FROM_L2_DISP_CONFLICT_LDHITST_CYC/ PM_MRK_DATA_FROM_L2_DISP_CONFLICT_LDHITST", + "MetricGroup": "latency", + "MetricName": "l2_disp_conflict_ldhitst_latency" + }, + { + "BriefDescription": "Average load latency for all marked demand loads that came from the L2 and suffered a conflict at RC machine dispatch time NOT due load-hit-store", + "MetricExpr": "PM_MRK_DATA_FROM_L2_DISP_CONFLICT_OTHER_CYC/ PM_MRK_DATA_FROM_L2_DISP_CONFLICT_OTHER", + "MetricGroup": "latency", + "MetricName": "l2_disp_conflict_other_latency" + }, + { + "BriefDescription": "Average load latency for all marked demand loads that came from the L2", + "MetricExpr": "PM_MRK_DATA_FROM_L2_CYC/ PM_MRK_DATA_FROM_L2", + "MetricGroup": "latency", + "MetricName": "l2_latency" + }, + { + "BriefDescription": "Average load latency for all marked demand loads that were satisfied by lines prefetched into the L3. This information is forwarded from the L3", + "MetricExpr": "PM_MRK_DATA_FROM_L2_MEPF_CYC/ PM_MRK_DATA_FROM_L2", + "MetricGroup": "latency", + "MetricName": "l2_mepf_latency" + }, + { + "BriefDescription": "Average load latency for all marked demand loads that came from the L2 and suffered no conflicts", + "MetricExpr": "PM_MRK_DATA_FROM_L2_NO_CONFLICT_CYC/ PM_MRK_DATA_FROM_L2", + "MetricGroup": "latency", + "MetricName": "l2_no_conflict_latency" + }, + { + "BriefDescription": "Average load latency for all marked demand loads that came from the L3 and beyond", + "MetricExpr": "PM_MRK_DATA_FROM_L2MISS_CYC/ PM_MRK_DATA_FROM_L2MISS", + "MetricGroup": "latency", + "MetricName": "l2miss_latency" + }, + { + "BriefDescription": "Marked L31 Load latency", + "MetricExpr": "PM_MRK_DATA_FROM_L31_MOD_CYC/ PM_MRK_DATA_FROM_L31_MOD", + "MetricGroup": "latency", + "MetricName": "l31_mod_latency" + }, + { + "BriefDescription": "Marked L31 Load latency", + "MetricExpr": "PM_MRK_DATA_FROM_L31_SHR_CYC/ PM_MRK_DATA_FROM_L31_SHR", + "MetricGroup": "latency", + "MetricName": "l31_shr_latency" + }, + { + "BriefDescription": "Average load latency for all marked demand loads that came from the L3", + "MetricExpr": "PM_MRK_DATA_FROM_L3_CYC/ PM_MRK_DATA_FROM_L3", + "MetricGroup": "latency", + "MetricName": "l3_latency" + }, + { + "BriefDescription": "Average load latency for all marked demand loads that came from the L3 and suffered no conflicts", + "MetricExpr": "PM_MRK_DATA_FROM_L3_NO_CONFLICT_CYC/ PM_MRK_DATA_FROM_L2", + "MetricGroup": "latency", + "MetricName": "l3_no_conflict_latency" + }, + { + "BriefDescription": "Average load latency for all marked demand loads that come from beyond the L3", + "MetricExpr": "PM_MRK_DATA_FROM_L3MISS_CYC/ PM_MRK_DATA_FROM_L3MISS", + "MetricGroup": "latency", + "MetricName": "l3miss_latency" + }, + { + "BriefDescription": "Average latency for marked reloads that hit in the L3 on the MEPF state. i.e. lines that were prefetched into the L3", + "MetricExpr": "PM_MRK_DATA_FROM_L3_MEPF_CYC/ PM_MRK_DATA_FROM_L3_MEPF", + "MetricGroup": "latency", + "MetricName": "l3pref_latency" + }, + { + "BriefDescription": "Local L4 average load latency", + "MetricExpr": "PM_MRK_DATA_FROM_LL4_CYC/ PM_MRK_DATA_FROM_LL4", + "MetricGroup": "latency", + "MetricName": "ll4_latency" + }, + { + "BriefDescription": "Marked Lmem Load latency", + "MetricExpr": "PM_MRK_DATA_FROM_LMEM_CYC/ PM_MRK_DATA_FROM_LMEM", + "MetricGroup": "latency", + "MetricName": "lmem_latency" + }, + { + "BriefDescription": "Latency for marked reloads that hit in the L2 or L3 of any other core on a different chip", + "MetricExpr": "PM_MRK_DATA_FROM_OFF_CHIP_CACHE_CYC/ PM_MRK_DATA_FROM_OFF_CHIP_CACHE", + "MetricGroup": "latency", + "MetricName": "off_chip_cache_latency" + }, + { + "BriefDescription": "Latency for marked reloads that hit in the L2 or L3 of any other core on the same chip", + "MetricExpr": "PM_MRK_DATA_FROM_ON_CHIP_CACHE_CYC/ PM_MRK_DATA_FROM_ON_CHIP_CACHE", + "MetricGroup": "latency", + "MetricName": "on_chip_cache_latency" + }, + { + "BriefDescription": "Marked L2L3 remote Load latency", + "MetricExpr": "PM_MRK_DATA_FROM_RL2L3_MOD_CYC/ PM_MRK_DATA_FROM_RL2L3_MOD", + "MetricGroup": "latency", + "MetricName": "rl2l3_mod_latency" + }, + { + "BriefDescription": "Marked L2L3 remote Load latency", + "MetricExpr": "PM_MRK_DATA_FROM_RL2L3_SHR_CYC/ PM_MRK_DATA_FROM_RL2L3_SHR", + "MetricGroup": "latency", + "MetricName": "rl2l3_shr_latency" + }, + { + "BriefDescription": "Remote L4 average load latency", + "MetricExpr": "PM_MRK_DATA_FROM_RL4_CYC/ PM_MRK_DATA_FROM_RL4", + "MetricGroup": "latency", + "MetricName": "rl4_latency" + }, + { + "BriefDescription": "Marked Rmem Load latency", + "MetricExpr": "PM_MRK_DATA_FROM_RMEM_CYC/ PM_MRK_DATA_FROM_RMEM", + "MetricGroup": "latency", + "MetricName": "rmem_latency" + }, + { + "BriefDescription": "ERAT miss reject ratio", + "MetricExpr": "PM_LSU_REJECT_ERAT_MISS * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "lsu_rejects", + "MetricName": "erat_reject_rate_percent" + }, + { + "BriefDescription": "ERAT miss reject ratio", + "MetricExpr": "PM_LSU_REJECT_ERAT_MISS * 100 / (PM_LSU_FIN - PM_LSU_FX_FIN)", + "MetricGroup": "lsu_rejects", + "MetricName": "erat_reject_ratio_percent" + }, + { + "BriefDescription": "LHS reject ratio", + "MetricExpr": "PM_LSU_REJECT_LHS *100/ PM_RUN_INST_CMPL", + "MetricGroup": "lsu_rejects", + "MetricName": "lhs_reject_rate_percent" + }, + { + "BriefDescription": "LHS reject ratio", + "MetricExpr": "PM_LSU_REJECT_LHS *100/ (PM_LSU_FIN - PM_LSU_FX_FIN)", + "MetricGroup": "lsu_rejects", + "MetricName": "lhs_reject_ratio_percent" + }, + { + "BriefDescription": "LMQ full reject ratio", + "MetricExpr": "PM_LSU_REJECT_LMQ_FULL * 100 / PM_RUN_INST_CMPL", + "MetricGroup": "lsu_rejects", + "MetricName": "lmq_full_reject_rate_percent" + }, + { + "BriefDescription": "ERAT miss reject ratio", + "MetricExpr": "PM_LSU_REJECT_LMQ_FULL * 100 / PM_LD_REF_L1", + "MetricGroup": "lsu_rejects", + |