Diffstat (limited to 'tools'): 130 files changed, 7638 insertions, 1526 deletions
diff --git a/tools/bpf/bpftool/Documentation/bpftool-gen.rst b/tools/bpf/bpftool/Documentation/bpftool-gen.rst index 84cf0639696f..7cd6681137f3 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-gen.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-gen.rst @@ -14,16 +14,37 @@ SYNOPSIS *OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] } - *COMMAND* := { **skeleton** | **help** } + *COMMAND* := { **object** | **skeleton** | **help** } GEN COMMANDS ============= -| **bpftool** **gen skeleton** *FILE* +| **bpftool** **gen object** *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...] +| **bpftool** **gen skeleton** *FILE* [**name** *OBJECT_NAME*] | **bpftool** **gen help** DESCRIPTION =========== + **bpftool gen object** *OUTPUT_FILE* *INPUT_FILE* [*INPUT_FILE*...] + Statically link (combine) together one or more *INPUT_FILE*'s + into a single resulting *OUTPUT_FILE*. All the files involved + are BPF ELF object files. + + The rules of BPF static linking are mostly the same as for + user-space object files, but in addition to combining data + and instruction sections, .BTF and .BTF.ext (if present in + any of the input files) data are combined together. .BTF + data is deduplicated, so all the common types across + *INPUT_FILE*'s will only be represented once in the resulting + BTF information. + + BPF static linking allows to partition BPF source code into + individually compiled files that are then linked into + a single resulting BPF object file, which can be used to + generated BPF skeleton (with **gen skeleton** command) or + passed directly into **libbpf** (using **bpf_object__open()** + family of APIs). + **bpftool gen skeleton** *FILE* Generate BPF skeleton C header file for a given *FILE*. @@ -75,10 +96,13 @@ DESCRIPTION specific maps, programs, etc. As part of skeleton, few custom functions are generated. - Each of them is prefixed with object name, derived from - object file name. I.e., if BPF object file name is - **example.o**, BPF object name will be **example**. The - following custom functions are provided in such case: + Each of them is prefixed with object name. Object name can + either be derived from object file name, i.e., if BPF object + file name is **example.o**, BPF object name will be + **example**. Object name can be also specified explicitly + through **name** *OBJECT_NAME* parameter. The following + custom functions are provided (assuming **example** as + the object name): - **example__open** and **example__open_opts**. These functions are used to instantiate skeleton. 
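The static linking described here is built on libbpf's new bpf_linker API (added further down in this diff and driven by bpftool's do_object() in gen.c). A minimal standalone sketch using the same calls, reusing the example file names from this page and assuming the bpf_linker declarations are exported through bpf/libbpf.h::

    /* sketch only: link example1.bpf.o + example2.bpf.o into example.bpf.o,
     * mirroring what bpftool's do_object() does */
    #include <stdio.h>
    #include <bpf/libbpf.h>

    int main(void)
    {
        struct bpf_linker *linker;
        int err;

        linker = bpf_linker__new("example.bpf.o", NULL);
        if (!linker) {
            fprintf(stderr, "failed to create BPF linker\n");
            return 1;
        }

        err = bpf_linker__add_file(linker, "example1.bpf.o");
        if (!err)
            err = bpf_linker__add_file(linker, "example2.bpf.o");
        if (!err)
            err = bpf_linker__finalize(linker);
        if (err)
            fprintf(stderr, "linking failed: %d\n", err);

        bpf_linker__free(linker);
        return err ? 1 : 0;
    }

The resulting object can then be fed to **gen skeleton** or opened directly with the **bpf_object__open()** family, as the text above notes.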
It @@ -130,26 +154,19 @@ OPTIONS EXAMPLES ======== -**$ cat example.c** +**$ cat example1.bpf.c** :: #include <stdbool.h> #include <linux/ptrace.h> #include <linux/bpf.h> - #include "bpf_helpers.h" + #include <bpf/bpf_helpers.h> const volatile int param1 = 42; bool global_flag = true; struct { int x; } data = {}; - struct { - __uint(type, BPF_MAP_TYPE_HASH); - __uint(max_entries, 128); - __type(key, int); - __type(value, long); - } my_map SEC(".maps"); - SEC("raw_tp/sys_enter") int handle_sys_enter(struct pt_regs *ctx) { @@ -161,6 +178,21 @@ EXAMPLES return 0; } +**$ cat example2.bpf.c** + +:: + + #include <linux/ptrace.h> + #include <linux/bpf.h> + #include <bpf/bpf_helpers.h> + + struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 128); + __type(key, int); + __type(value, long); + } my_map SEC(".maps"); + SEC("raw_tp/sys_exit") int handle_sys_exit(struct pt_regs *ctx) { @@ -170,9 +202,17 @@ EXAMPLES } This is example BPF application with two BPF programs and a mix of BPF maps -and global variables. +and global variables. Source code is split across two source code files. -**$ bpftool gen skeleton example.o** +**$ clang -target bpf -g example1.bpf.c -o example1.bpf.o** +**$ clang -target bpf -g example2.bpf.c -o example2.bpf.o** +**$ bpftool gen object example.bpf.o example1.bpf.o example2.bpf.o** + +This set of commands compiles *example1.bpf.c* and *example2.bpf.c* +individually and then statically links respective object files into the final +BPF ELF object file *example.bpf.o*. + +**$ bpftool gen skeleton example.bpf.o name example | tee example.skel.h** :: @@ -227,7 +267,7 @@ and global variables. #endif /* __EXAMPLE_SKEL_H__ */ -**$ cat example_user.c** +**$ cat example.c** :: @@ -270,7 +310,7 @@ and global variables. return err; } -**# ./example_user** +**# ./example** :: diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index fdffbc64c65c..d67518bcbd44 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -981,12 +981,25 @@ _bpftool() ;; gen) case $command in - skeleton) + object) _filedir + return 0 + ;; + skeleton) + case $prev in + $command) + _filedir + return 0 + ;; + *) + _bpftool_once_attr 'name' + return 0 + ;; + esac ;; *) [[ $prev == $object ]] && \ - COMPREPLY=( $( compgen -W 'skeleton help' -- "$cur" ) ) + COMPREPLY=( $( compgen -W 'object skeleton help' -- "$cur" ) ) ;; esac ;; diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index 985610c3f193..62953bbf68b4 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -546,6 +546,7 @@ static int do_dump(int argc, char **argv) NEXT_ARG(); if (argc < 1) { p_err("expecting value for 'format' option\n"); + err = -EINVAL; goto done; } if (strcmp(*argv, "c") == 0) { @@ -555,11 +556,13 @@ static int do_dump(int argc, char **argv) } else { p_err("unrecognized format specifier: '%s', possible values: raw, c", *argv); + err = -EINVAL; goto done; } NEXT_ARG(); } else { p_err("unrecognized option: '%s'", *argv); + err = -EINVAL; goto done; } } diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index 65303664417e..1828bba19020 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -57,6 +57,7 @@ const char * const attach_type_name[__MAX_BPF_ATTACH_TYPE] = { [BPF_SK_SKB_STREAM_PARSER] = "sk_skb_stream_parser", [BPF_SK_SKB_STREAM_VERDICT] = "sk_skb_stream_verdict", + [BPF_SK_SKB_VERDICT] = "sk_skb_verdict", [BPF_SK_MSG_VERDICT] = "sk_msg_verdict", 
[BPF_LIRC_MODE2] = "lirc_mode2", [BPF_FLOW_DISSECTOR] = "flow_dissector", diff --git a/tools/bpf/bpftool/gen.c b/tools/bpf/bpftool/gen.c index 4033c46d83e7..31ade77f5ef8 100644 --- a/tools/bpf/bpftool/gen.c +++ b/tools/bpf/bpftool/gen.c @@ -273,7 +273,7 @@ static int do_skeleton(int argc, char **argv) char header_guard[MAX_OBJ_NAME_LEN + sizeof("__SKEL_H__")]; size_t i, map_cnt = 0, prog_cnt = 0, file_sz, mmap_sz; DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts); - char obj_name[MAX_OBJ_NAME_LEN], *obj_data; + char obj_name[MAX_OBJ_NAME_LEN] = "", *obj_data; struct bpf_object *obj = NULL; const char *file, *ident; struct bpf_program *prog; @@ -288,6 +288,28 @@ static int do_skeleton(int argc, char **argv) } file = GET_ARG(); + while (argc) { + if (!REQ_ARGS(2)) + return -1; + + if (is_prefix(*argv, "name")) { + NEXT_ARG(); + + if (obj_name[0] != '\0') { + p_err("object name already specified"); + return -1; + } + + strncpy(obj_name, *argv, MAX_OBJ_NAME_LEN - 1); + obj_name[MAX_OBJ_NAME_LEN - 1] = '\0'; + } else { + p_err("unknown arg %s", *argv); + return -1; + } + + NEXT_ARG(); + } + if (argc) { p_err("extra unknown arguments"); return -1; @@ -310,7 +332,8 @@ static int do_skeleton(int argc, char **argv) p_err("failed to mmap() %s: %s", file, strerror(errno)); goto out; } - get_obj_name(obj_name, file); + if (obj_name[0] == '\0') + get_obj_name(obj_name, file); opts.object_name = obj_name; obj = bpf_object__open_mem(obj_data, file_sz, &opts); if (IS_ERR(obj)) { @@ -591,6 +614,47 @@ out: return err; } +static int do_object(int argc, char **argv) +{ + struct bpf_linker *linker; + const char *output_file, *file; + int err = 0; + + if (!REQ_ARGS(2)) { + usage(); + return -1; + } + + output_file = GET_ARG(); + + linker = bpf_linker__new(output_file, NULL); + if (!linker) { + p_err("failed to create BPF linker instance"); + return -1; + } + + while (argc) { + file = GET_ARG(); + + err = bpf_linker__add_file(linker, file); + if (err) { + p_err("failed to link '%s': %s (%d)", file, strerror(err), err); + goto out; + } + } + + err = bpf_linker__finalize(linker); + if (err) { + p_err("failed to finalize ELF file: %s (%d)", strerror(err), err); + goto out; + } + + err = 0; +out: + bpf_linker__free(linker); + return err; +} + static int do_help(int argc, char **argv) { if (json_output) { @@ -599,7 +663,8 @@ static int do_help(int argc, char **argv) } fprintf(stderr, - "Usage: %1$s %2$s skeleton FILE\n" + "Usage: %1$s %2$s object OUTPUT_FILE INPUT_FILE [INPUT_FILE...]\n" + " %1$s %2$s skeleton FILE [name OBJECT_NAME]\n" " %1$s %2$s help\n" "\n" " " HELP_SPEC_OPTIONS "\n" @@ -610,6 +675,7 @@ static int do_help(int argc, char **argv) } static const struct cmd cmds[] = { + { "object", do_object }, { "skeleton", do_skeleton }, { "help", do_help }, { 0 } diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c index b86f450e6fce..d9afb730136a 100644 --- a/tools/bpf/bpftool/main.c +++ b/tools/bpf/bpftool/main.c @@ -276,7 +276,7 @@ static int do_batch(int argc, char **argv) int n_argc; FILE *fp; char *cp; - int err; + int err = 0; int i; if (argc < 2) { @@ -370,7 +370,6 @@ static int do_batch(int argc, char **argv) } else { if (!json_output) printf("processed %d commands\n", lines); - err = 0; } err_close: if (fp != stdin) diff --git a/tools/bpf/bpftool/map.c b/tools/bpf/bpftool/map.c index b400364ee054..09ae0381205b 100644 --- a/tools/bpf/bpftool/map.c +++ b/tools/bpf/bpftool/map.c @@ -100,7 +100,7 @@ static int do_dump_btf(const struct btf_dumper *d, void *value) { __u32 value_id; - int ret; + int 
ret = 0; /* start of key-value pair */ jsonw_start_object(d->jw); diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index f2b915b20546..3f067d2d7584 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -76,6 +76,7 @@ enum dump_mode { static const char * const attach_type_strings[] = { [BPF_SK_SKB_STREAM_PARSER] = "stream_parser", [BPF_SK_SKB_STREAM_VERDICT] = "stream_verdict", + [BPF_SK_SKB_VERDICT] = "skb_verdict", [BPF_SK_MSG_VERDICT] = "msg_verdict", [BPF_FLOW_DISSECTOR] = "flow_dissector", [__MAX_BPF_ATTACH_TYPE] = NULL, diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c index 80d966cfcaa1..7550fd9c3188 100644 --- a/tools/bpf/resolve_btfids/main.c +++ b/tools/bpf/resolve_btfids/main.c @@ -115,10 +115,10 @@ struct object { static int verbose; -int eprintf(int level, int var, const char *fmt, ...) +static int eprintf(int level, int var, const char *fmt, ...) { va_list args; - int ret; + int ret = 0; if (var >= level) { va_start(args, fmt); @@ -385,7 +385,7 @@ static int elf_collect(struct object *obj) static int symbols_collect(struct object *obj) { Elf_Scn *scn = NULL; - int n, i, err = 0; + int n, i; GElf_Shdr sh; char *name; @@ -402,11 +402,10 @@ static int symbols_collect(struct object *obj) * Scan symbols and look for the ones starting with * __BTF_ID__* over .BTF_ids section. */ - for (i = 0; !err && i < n; i++) { - char *tmp, *prefix; + for (i = 0; i < n; i++) { + char *prefix; struct btf_id *id; GElf_Sym sym; - int err = -1; if (!gelf_getsym(obj->efile.symbols, i, &sym)) return -1; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2d3036e292a9..69902603012c 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -957,6 +957,7 @@ enum bpf_attach_type { BPF_XDP_CPUMAP, BPF_SK_LOOKUP, BPF_XDP, + BPF_SK_SKB_VERDICT, __MAX_BPF_ATTACH_TYPE }; @@ -1117,6 +1118,10 @@ enum bpf_link_type { * offset to another bpf function */ #define BPF_PSEUDO_CALL 1 +/* when bpf_call->src_reg == BPF_PSEUDO_KFUNC_CALL, + * bpf_call->imm == btf_id of a BTF_KIND_FUNC in the running kernel + */ +#define BPF_PSEUDO_KFUNC_CALL 2 /* flags for BPF_MAP_UPDATE_ELEM command */ enum { diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 8b281f722e5b..f6afee209620 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1154,6 +1154,7 @@ struct kvm_x86_mce { #define KVM_XEN_HVM_CONFIG_HYPERCALL_MSR (1 << 0) #define KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL (1 << 1) #define KVM_XEN_HVM_CONFIG_SHARED_INFO (1 << 2) +#define KVM_XEN_HVM_CONFIG_RUNSTATE (1 << 3) struct kvm_xen_hvm_config { __u32 flags; @@ -1621,12 +1622,24 @@ struct kvm_xen_vcpu_attr { union { __u64 gpa; __u64 pad[8]; + struct { + __u64 state; + __u64 state_entry_time; + __u64 time_running; + __u64 time_runnable; + __u64 time_blocked; + __u64 time_offline; + } runstate; } u; }; /* Available with KVM_CAP_XEN_HVM / KVM_XEN_HVM_CONFIG_SHARED_INFO */ #define KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO 0x0 #define KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO 0x1 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR 0x2 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT 0x3 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA 0x4 +#define KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST 0x5 /* Secure Encrypted Virtualization command */ enum sev_cmd_id { diff --git a/tools/kvm/kvm_stat/kvm_stat.service b/tools/kvm/kvm_stat/kvm_stat.service index 71aabaffe779..8f13b843d5b4 100644 --- a/tools/kvm/kvm_stat/kvm_stat.service +++ 
b/tools/kvm/kvm_stat/kvm_stat.service @@ -9,6 +9,7 @@ Type=simple ExecStart=/usr/bin/kvm_stat -dtcz -s 10 -L /var/log/kvm_stat.csv ExecReload=/bin/kill -HUP $MAINPID Restart=always +RestartSec=60s SyslogIdentifier=kvm_stat SyslogLevel=debug diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build index 190366d05588..9b057cc7650a 100644 --- a/tools/lib/bpf/Build +++ b/tools/lib/bpf/Build @@ -1,3 +1,3 @@ libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o \ netlink.o bpf_prog_linfo.o libbpf_probes.o xsk.o hashmap.o \ - btf_dump.o ringbuf.o + btf_dump.o ringbuf.o strset.o linker.o diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 8170f88e8ea6..e43e1896cb4b 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -215,7 +215,7 @@ define do_install if [ ! -d '$(DESTDIR_SQ)$2' ]; then \ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$2'; \ fi; \ - $(INSTALL) $1 $(if $3,-m $3,) '$(DESTDIR_SQ)$2' + $(INSTALL) $(if $3,-m $3,) $1 '$(DESTDIR_SQ)$2' endef install_lib: all_cmd @@ -228,7 +228,6 @@ install_headers: $(BPF_HELPER_DEFS) $(call do_install,bpf.h,$(prefix)/include/bpf,644); \ $(call do_install,libbpf.h,$(prefix)/include/bpf,644); \ $(call do_install,btf.h,$(prefix)/include/bpf,644); \ - $(call do_install,libbpf_util.h,$(prefix)/include/bpf,644); \ $(call do_install,libbpf_common.h,$(prefix)/include/bpf,644); \ $(call do_install,xsk.h,$(prefix)/include/bpf,644); \ $(call do_install,bpf_helpers.h,$(prefix)/include/bpf,644); \ diff --git a/tools/lib/bpf/bpf_helpers.h b/tools/lib/bpf/bpf_helpers.h index ae6c975e0b87..cc2e51c64a54 100644 --- a/tools/lib/bpf/bpf_helpers.h +++ b/tools/lib/bpf/bpf_helpers.h @@ -29,9 +29,10 @@ */ #define SEC(NAME) __attribute__((section(NAME), used)) -#ifndef __always_inline +/* Avoid 'linux/stddef.h' definition of '__always_inline'. */ +#undef __always_inline #define __always_inline inline __attribute__((always_inline)) -#endif + #ifndef __noinline #define __noinline __attribute__((noinline)) #endif @@ -39,8 +40,22 @@ #define __weak __attribute__((weak)) #endif +/* When utilizing vmlinux.h with BPF CO-RE, user BPF programs can't include + * any system-level headers (such as stddef.h, linux/version.h, etc), and + * commonly-used macros like NULL and KERNEL_VERSION aren't available through + * vmlinux.h. This just adds unnecessary hurdles and forces users to re-define + * them on their own. So as a convenience, provide such definitions here. + */ +#ifndef NULL +#define NULL ((void *)0) +#endif + +#ifndef KERNEL_VERSION +#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + ((c) > 255 ? 
255 : (c)) +#endif + /* - * Helper macro to manipulate data structures + * Helper macros to manipulate data structures */ #ifndef offsetof #define offsetof(TYPE, MEMBER) ((unsigned long)&((TYPE *)0)->MEMBER) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 3aa58f2ac183..d30e67e7e1e5 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -21,6 +21,7 @@ #include "libbpf.h" #include "libbpf_internal.h" #include "hashmap.h" +#include "strset.h" #define BTF_MAX_NR_TYPES 0x7fffffffU #define BTF_MAX_STR_OFFSET 0x7fffffffU @@ -67,7 +68,7 @@ struct btf { * | | | * hdr | | * types_data----+ | - * strs_data------------------+ + * strset__data(strs_set)-----+ * * +----------+---------+-----------+ * | Header | Types | Strings | @@ -105,20 +106,15 @@ struct btf { */ int start_str_off; + /* only one of strs_data or strs_set can be non-NULL, depending on + * whether BTF is in a modifiable state (strs_set is used) or not + * (strs_data points inside raw_data) + */ void *strs_data; - size_t strs_data_cap; /* used size stored in hdr->str_len */ - - /* lookup index for each unique string in strings section */ - struct hashmap *strs_hash; + /* a set of unique strings */ + struct strset *strs_set; /* whether strings are already deduplicated */ bool strs_deduped; - /* extra indirection layer to make strings hashmap work with stable - * string offsets and ability to transparently choose between - * btf->strs_data or btf_dedup->strs_data as a source of strings. - * This is used for BTF strings dedup to transfer deduplicated strings - * data back to struct btf without re-building strings index. - */ - void **strs_data_ptr; /* BTF object FD, if loaded into kernel */ int fd; @@ -142,8 +138,8 @@ static inline __u64 ptr_to_u64(const void *ptr) * On success, memory pointer to the beginning of unused memory is returned. * On error, NULL is returned. 
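A note on the bpf_helpers.h hunk above: with NULL and KERNEL_VERSION now provided there, a vmlinux.h-based program can avoid system headers entirely. A minimal sketch (map and program names are invented; assumes a generated vmlinux.h)::

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>

    /* libbpf's pre-existing kernel-version extern pairs naturally with the
     * new KERNEL_VERSION() convenience macro */
    extern int LINUX_KERNEL_VERSION __kconfig;

    struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(max_entries, 128);
        __type(key, int);
        __type(value, long);
    } counts SEC(".maps");

    SEC("raw_tp/sys_enter")
    int count_enters(void *ctx)
    {
        int key = 0;
        long *val;

        if (LINUX_KERNEL_VERSION < KERNEL_VERSION(5, 5, 0))
            return 0;

        val = bpf_map_lookup_elem(&counts, &key);
        if (val == NULL) {  /* NULL comes from bpf_helpers.h now */
            long init = 1;

            bpf_map_update_elem(&counts, &key, &init, BPF_ANY);
            return 0;
        }
        __sync_fetch_and_add(val, 1);
        return 0;
    }

    char LICENSE[] SEC("license") = "GPL";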
*/ -void *btf_add_mem(void **data, size_t *cap_cnt, size_t elem_sz, - size_t cur_cnt, size_t max_cnt, size_t add_cnt) +void *libbpf_add_mem(void **data, size_t *cap_cnt, size_t elem_sz, + size_t cur_cnt, size_t max_cnt, size_t add_cnt) { size_t new_cnt; void *new_data; @@ -179,14 +175,14 @@ void *btf_add_mem(void **data, size_t *cap_cnt, size_t elem_sz, /* Ensure given dynamically allocated memory region has enough allocated space * to accommodate *need_cnt* elements of size *elem_sz* bytes each */ -int btf_ensure_mem(void **data, size_t *cap_cnt, size_t elem_sz, size_t need_cnt) +int libbpf_ensure_mem(void **data, size_t *cap_cnt, size_t elem_sz, size_t need_cnt) { void *p; if (need_cnt <= *cap_cnt) return 0; - p = btf_add_mem(data, cap_cnt, elem_sz, *cap_cnt, SIZE_MAX, need_cnt - *cap_cnt); + p = libbpf_add_mem(data, cap_cnt, elem_sz, *cap_cnt, SIZE_MAX, need_cnt - *cap_cnt); if (!p) return -ENOMEM; @@ -197,8 +193,8 @@ static int btf_add_type_idx_entry(struct btf *btf, __u32 type_off) { __u32 *p; - p = btf_add_mem((void **)&btf->type_offs, &btf->type_offs_cap, sizeof(__u32), - btf->nr_types, BTF_MAX_NR_TYPES, 1); + p = libbpf_add_mem((void **)&btf->type_offs, &btf->type_offs_cap, sizeof(__u32), + btf->nr_types, BTF_MAX_NR_TYPES, 1); if (!p) return -ENOMEM; @@ -435,7 +431,7 @@ const struct btf *btf__base_btf(const struct btf *btf) } /* internal helper returning non-const pointer to a type */ -static struct btf_type *btf_type_by_id(struct btf *btf, __u32 type_id) +struct btf_type *btf_type_by_id(struct btf *btf, __u32 type_id) { if (type_id == 0) return &btf_void; @@ -738,7 +734,7 @@ void btf__free(struct btf *btf) */ free(btf->hdr); free(btf->types_data); - free(btf->strs_data); + strset__free(btf->strs_set); } free(btf->raw_data); free(btf->raw_data_swapped); @@ -1246,6 +1242,11 @@ void btf__set_fd(struct btf *btf, int fd) btf->fd = fd; } +static const void *btf_strs_data(const struct btf *btf) +{ + return btf->strs_data ? 
btf->strs_data : strset__data(btf->strs_set); +} + static void *btf_get_raw_data(const struct btf *btf, __u32 *size, bool swap_endian) { struct btf_header *hdr = btf->hdr; @@ -1286,7 +1287,7 @@ static void *btf_get_raw_data(const struct btf *btf, __u32 *size, bool swap_endi } p += hdr->type_len; - memcpy(p, btf->strs_data, hdr->str_len); + memcpy(p, btf_strs_data(btf), hdr->str_len); p += hdr->str_len; *size = data_sz; @@ -1320,7 +1321,7 @@ const char *btf__str_by_offset(const struct btf *btf, __u32 offset) if (offset < btf->start_str_off) return btf__str_by_offset(btf->base_btf, offset); else if (offset - btf->start_str_off < btf->hdr->str_len) - return btf->strs_data + (offset - btf->start_str_off); + return btf_strs_data(btf) + (offset - btf->start_str_off); else return NULL; } @@ -1474,25 +1475,6 @@ int btf__get_map_kv_tids(const struct btf *btf, const char *map_name, return 0; } -static size_t strs_hash_fn(const void *key, void *ctx) -{ - const struct btf *btf = ctx; - const char *strs = *btf->strs_data_ptr; - const char *str = strs + (long)key; - - return str_hash(str); -} - -static bool strs_hash_equal_fn(const void *key1, const void *key2, void *ctx) -{ - const struct btf *btf = ctx; - const char *strs = *btf->strs_data_ptr; - const char *str1 = strs + (long)key1; - const char *str2 = strs + (long)key2; - - return strcmp(str1, str2) == 0; -} - static void btf_invalidate_raw_data(struct btf *btf) { if (btf->raw_data) { @@ -1511,10 +1493,9 @@ static void btf_invalidate_raw_data(struct btf *btf) */ static int btf_ensure_modifiable(struct btf *btf) { - void *hdr, *types, *strs, *strs_end, *s; - struct hashmap *hash = NULL; - long off; - int err; + void *hdr, *types; + struct strset *set = NULL; + int err = -ENOMEM; if (btf_is_modifiable(btf)) { /* any BTF modification invalidates raw_data */ @@ -1525,44 +1506,25 @@ static int btf_ensure_modifiable(struct btf *btf) /* split raw data into three memory regions */ hdr = malloc(btf->hdr->hdr_len); types = malloc(btf->hdr->type_len); - strs = malloc(btf->hdr->str_len); - if (!hdr || !types || !strs) + if (!hdr || !types) goto err_out; memcpy(hdr, btf->hdr, btf->hdr->hdr_len); memcpy(types, btf->types_data, btf->hdr->type_len); - memcpy(strs, btf->strs_data, btf->hdr->str_len); - - /* make hashmap below use btf->strs_data as a source of strings */ - btf->strs_data_ptr = &btf->strs_data; /* build lookup index for all strings */ - hash = hashmap__new(strs_hash_fn, strs_hash_equal_fn, btf); - if (IS_ERR(hash)) { - err = PTR_ERR(hash); - hash = NULL; + set = strset__new(BTF_MAX_STR_OFFSET, btf->strs_data, btf->hdr->str_len); + if (IS_ERR(set)) { + err = PTR_ERR(set); goto err_out; } - strs_end = strs + btf->hdr->str_len; - for (off = 0, s = strs; s < strs_end; off += strlen(s) + 1, s = strs + off) { - /* hashmap__add() returns EEXIST if string with the same - * content already is in the hash map - */ - err = hashmap__add(hash, (void *)off, (void *)off); - if (err == -EEXIST) - continue; /* duplicate */ - if (err) - goto err_out; - } - /* only when everything was successful, update internal state */ btf->hdr = hdr; btf->types_data = types; btf->types_data_cap = btf->hdr->type_len; - btf->strs_data = strs; - btf->strs_data_cap = btf->hdr->str_len; - btf->strs_hash = hash; + btf->strs_data = NULL; + btf->strs_set = set; /* if BTF was created from scratch, all strings are guaranteed to be * unique and deduplicated */ @@ -1577,17 +1539,10 @@ static int btf_ensure_modifiable(struct btf *btf) return 0; err_out: - hashmap__free(hash); + 
strset__free(set); free(hdr); free(types); - free(strs); - return -ENOMEM; -} - -static void *btf_add_str_mem(struct btf *btf, size_t add_sz) -{ - return btf_add_mem(&btf->strs_data, &btf->strs_data_cap, 1, - btf->hdr->str_len, BTF_MAX_STR_OFFSET, add_sz); + return err; } /* Find an offset in BTF string section that corresponds to a given string *s*. @@ -1598,34 +1553,23 @@ static void *btf_add_str_mem(struct btf *btf, size_t add_sz) */ int btf__find_str(struct btf *btf, const char *s) { - long old_off, new_off, len; - void *p; + int off; if (btf->base_btf) { - int ret; - - ret = btf__find_str(btf->base_btf, s); - if (ret != -ENOENT) - return ret; + off = btf__find_str(btf->base_btf, s); + if (off != -ENOENT) + return off; } /* BTF needs to be in a modifiable state to build string lookup index */ if (btf_ensure_modifiable(btf)) return -ENOMEM; - /* see btf__add_str() for why we do this */ - len = strlen(s) + 1; - p = btf_add_str_mem(btf, len); - if (!p) - return -ENOMEM; - - new_off = btf->hdr->str_len; - memcpy(p, s, len); + off = strset__find_str(btf->strs_set, s); + if (off < 0) + return off; - if (hashmap__find(btf->strs_hash, (void *)new_off, (void **)&old_off)) - return btf->start_str_off + old_off; - - return -ENOENT; + return btf->start_str_off + off; } /* Add a string s to the BTF string section. @@ -1635,56 +1579,30 @@ int btf__find_str(struct btf *btf, const char *s) */ int btf__add_str(struct btf *btf, const char *s) { - long old_off, new_off, len; - void *p; - int err; + int off; if (btf->base_btf) { - int ret; - - ret = btf__find_str(btf->base_btf, s); - if (ret != -ENOENT) - return ret; + off = btf__find_str(btf->base_btf, s); + if (off != -ENOENT) + return off; } if (btf_ensure_modifiable(btf)) return -ENOMEM; - /* Hashmap keys are always offsets within btf->strs_data, so to even - * look up some string from the "outside", we need to first append it - * at the end, so that it can be addressed with an offset. Luckily, - * until btf->hdr->str_len is incremented, that string is just a piece - * of garbage for the rest of BTF code, so no harm, no foul. On the - * other hand, if the string is unique, it's already appended and - * ready to be used, only a simple btf->hdr->str_len increment away. - */ - len = strlen(s) + 1; - p = btf_add_str_mem(btf, len); - if (!p) - return -ENOMEM; + off = strset__add_str(btf->strs_set, s); + if (off < 0) + return off; - new_off = btf->hdr->str_len; - memcpy(p, s, len); + btf->hdr->str_len = strset__data_size(btf->strs_set); - /* Now attempt to add the string, but only if the string with the same - * contents doesn't exist already (HASHMAP_ADD strategy). If such - * string exists, we'll get its offset in old_off (that's old_key). 
- */ - err = hashmap__insert(btf->strs_hash, (void *)new_off, (void *)new_off, - HASHMAP_ADD, (const void **)&old_off, NULL); - if (err == -EEXIST) - return btf->start_str_off + old_off; /* duplicated string, return existing offset */ - if (err) - return err; - - btf->hdr->str_len += len; /* new unique string, adjust data length */ - return btf->start_str_off + new_off; + return btf->start_str_off + off; } static void *btf_add_type_mem(struct btf *btf, size_t add_sz) { - return btf_add_mem(&btf->types_data, &btf->types_data_cap, 1, - btf->hdr->type_len, UINT_MAX, add_sz); + return libbpf_add_mem(&btf->types_data, &btf->types_data_cap, 1, + btf->hdr->type_len, UINT_MAX, add_sz); } static __u32 btf_type_info(int kind, int vlen, int kflag) @@ -1711,6 +1629,54 @@ static int btf_commit_type(struct btf *btf, int data_sz) return btf->start_id + btf->nr_types - 1; } +struct btf_pipe { + const struct btf *src; + struct btf *dst; +}; + +static int btf_rewrite_str(__u32 *str_off, void *ctx) +{ + struct btf_pipe *p = ctx; + int off; + + if (!*str_off) /* nothing to do for empty strings */ + return 0; + + off = btf__add_str(p->dst, btf__str_by_offset(p->src, *str_off)); + if (off < 0) + return off; + + *str_off = off; + return 0; +} + +int btf__add_type(struct btf *btf, const struct btf *src_btf, const struct btf_type *src_type) +{ + struct btf_pipe p = { .src = src_btf, .dst = btf }; + struct btf_type *t; + int sz, err; + + sz = btf_type_size(src_type); + if (sz < 0) + return sz; + + /* deconstruct BTF, if necessary, and invalidate raw_data */ + if (btf_ensure_modifiable(btf)) + return -ENOMEM; + + t = btf_add_type_mem(btf, sz); + if (!t) + return -ENOMEM; + + memcpy(t, src_type, sz); + + err = btf_type_visit_str_offs(t, btf_rewrite_str, &p); + if (err) + return err; + + return btf_commit_type(btf, sz); +} + /* * Append new BTF_KIND_INT type with: * - *name* - non-empty, non-NULL type name; @@ -3016,10 +2982,7 @@ struct btf_dedup { /* Various option modifying behavior of algorithm */ struct btf_dedup_opts opts; /* temporary strings deduplication state */ - void *strs_data; - size_t strs_cap; - size_t strs_len; - struct hashmap* strs_hash; + struct strset *strs_set; }; static long hash_combine(long h, long value) @@ -3155,95 +3118,28 @@ done: return d; } -typedef int (*str_off_fn_t)(__u32 *str_off_ptr, void *ctx); - /* * Iterate over all possible places in .BTF and .BTF.ext that can reference * string and pass pointer to it to a provided callback `fn`. 
*/ -static int btf_for_each_str_off(struct btf_dedup *d, str_off_fn_t fn, void *ctx) +static int btf_for_each_str_off(struct btf_dedup *d, str_off_visit_fn fn, void *ctx) { - void *line_data_cur, *line_data_end; - int i, j, r, rec_size; - struct btf_type *t; + int i, r; for (i = 0; i < d->btf->nr_types; i++) { - t = btf_type_by_id(d->btf, d->btf->start_id + i); - r = fn(&t->name_off, ctx); + struct btf_type *t = btf_type_by_id(d->btf, d->btf->start_id + i); + + r = btf_type_visit_str_offs(t, fn, ctx); if (r) return r; - - switch (btf_kind(t)) { - case BTF_KIND_STRUCT: - case BTF_KIND_UNION: { - struct btf_member *m = btf_members(t); - __u16 vlen = btf_vlen(t); - - for (j = 0; j < vlen; j++) { - r = fn(&m->name_off, ctx); - if (r) - return r; - m++; - } - break; - } - case BTF_KIND_ENUM: { - struct btf_enum *m = btf_enum(t); - __u16 vlen = btf_vlen(t); - - for (j = 0; j < vlen; j++) { - r = fn(&m->name_off, ctx); - if (r) - return r; - m++; - } - break; - } - case BTF_KIND_FUNC_PROTO: { - struct btf_param *m = btf_params(t); - __u16 vlen = btf_vlen(t); - - for (j = 0; j < vlen; j++) { - r = fn(&m->name_off, ctx); - if (r) - return r; - m++; - } - break; - } - default: - break; - } } if (!d->btf_ext) return 0; - line_data_cur = d->btf_ext->line_info.info; - line_data_end = d->btf_ext->line_info.info + d->btf_ext->line_info.len; - rec_size = d->btf_ext->line_info.rec_size; - - while (line_data_cur < line_data_end) { - struct btf_ext_info_sec *sec = line_data_cur; - struct bpf_line_info_min *line_info; - __u32 num_info = sec->num_info; - - r = fn(&sec->sec_name_off, ctx); - if (r) - return r; - - line_data_cur += sizeof(struct btf_ext_info_sec); - for (i = 0; i < num_info; i++) { - line_info = line_data_cur; - r = fn(&line_info->file_name_off, ctx); - if (r) - return r; - r = fn(&line_info->line_off, ctx); - if (r) - return r; - line_data_cur += rec_size; - } - } + r = btf_ext_visit_str_offs(d->btf_ext, fn, ctx); + if (r) + return r; return 0; } @@ -3252,10 +3148,8 @@ static int strs_dedup_remap_str_off(__u32 *str_off_ptr, void *ctx) { struct btf_dedup *d = ctx; __u32 str_off = *str_off_ptr; - long old_off, new_off, len; const char *s; - void *p; - int err; + int off, err; /* don't touch empty string or string in main BTF */ if (str_off == 0 || str_off < d->btf->start_str_off) @@ -3272,29 +3166,11 @@ static int strs_dedup_remap_str_off(__u32 *str_off_ptr, void *ctx) return err; } - len = strlen(s) + 1; - - new_off = d->strs_len; - p = btf_add_mem(&d->strs_data, &d->strs_cap, 1, new_off, BTF_MAX_STR_OFFSET, len); - if (!p) - return -ENOMEM; - - memcpy(p, s, len); + off = strset__add_str(d->strs_set, s); + if (off < 0) + return off; - /* Now attempt to add the string, but only if the string with the same - * contents doesn't exist already (HASHMAP_ADD strategy). If such - * string exists, we'll get its offset in old_off (that's old_key). 
- */ - err = hashmap__insert(d->strs_hash, (void *)new_off, (void *)new_off, - HASHMAP_ADD, (const void **)&old_off, NULL); - if (err == -EEXIST) { - *str_off_ptr = d->btf->start_str_off + old_off; - } else if (err) { - return err; - } else { - *str_off_ptr = d->btf->start_str_off + new_off; - d->strs_len += len; - } + *str_off_ptr = d->btf->start_str_off + off; return 0; } @@ -3311,39 +3187,23 @@ static int strs_dedup_remap_str_off(__u32 *str_off_ptr, void *ctx) */ static int btf_dedup_strings(struct btf_dedup *d) { - char *s; int err; if (d->btf->strs_deduped) return 0; - /* temporarily switch to use btf_dedup's strs_data for strings for hash - * functions; later we'll just transfer hashmap to struct btf as is, - * along the strs_data - */ - d->btf->strs_data_ptr = &d->strs_data; - - d->strs_hash = hashmap__new(strs_hash_fn, strs_hash_equal_fn, d->btf); - if (IS_ERR(d->strs_hash)) { - err = PTR_ERR(d->strs_hash); - d->strs_hash = NULL; + d->strs_set = strset__new(BTF_MAX_STR_OFFSET, NULL, 0); + if (IS_ERR(d->strs_set)) { + err = PTR_ERR(d->strs_set); goto err_out; } if (!d->btf->base_btf) { - s = btf_add_mem(&d->strs_data, &d->strs_cap, 1, d->strs_len, BTF_MAX_STR_OFFSET, 1); - if (!s) - return -ENOMEM; - /* initial empty string */ - s[0] = 0; - d->strs_len = 1; - /* insert empty string; we won't be looking it up during strings * dedup, but it's good to have it for generic BTF string lookups */ - err = hashmap__insert(d->strs_hash, (void *)0, (void *)0, - HASHMAP_ADD, NULL, NULL); - if (err) + err = strset__add_str(d->strs_set, ""); + if (err < 0) goto err_out; } @@ -3353,28 +3213,16 @@ static int btf_dedup_strings(struct btf_dedup *d) goto err_out; /* replace BTF string data and hash with deduped ones */ - free(d->btf->strs_data); - hashmap__free(d->btf->strs_hash); - d->btf->strs_data = d->strs_data; - d->btf->strs_data_cap = d->strs_cap; - d->btf->hdr->str_len = d->strs_len; - d->btf->strs_hash = d->strs_hash; - /* now point strs_data_ptr back to btf->strs_data */ - d->btf->strs_data_ptr = &d->btf->strs_data; - - d->strs_data = d->strs_hash = NULL; - d->strs_len = d->strs_cap = 0; + strset__free(d->btf->strs_set); + d->btf->hdr->str_len = strset__data_size(d->strs_set); + d->btf->strs_set = d->strs_set; + d->strs_set = NULL; d->btf->strs_deduped = true; return 0; err_out: - free(d->strs_data); - hashmap__free(d->strs_hash); - d->strs_data = d->strs_hash = NULL; - d->strs_len = d->strs_cap = 0; - - /* restore strings pointer for existing d->btf->strs_hash back */ - d->btf->strs_data_ptr = &d->strs_data; + strset__free(d->strs_set); + d->strs_set = NULL; return err; } @@ -4498,15 +4346,18 @@ static int btf_dedup_compact_types(struct btf_dedup *d) * then mapping it to a deduplicated type ID, stored in btf_dedup->hypot_map, * which is populated during compaction phase. */ -static int btf_dedup_remap_type_id(struct btf_dedup *d, __u32 type_id) +static int btf_dedup_remap_type_id(__u32 *type_id, void *ctx) { + struct btf_dedup *d = ctx; __u32 resolved_type_id, new_type_id; - resolved_type_id = resolve_type_id(d, type_id); + resolved_type_id = resolve_type_id(d, *type_id); new_type_id = d->hypot_map[resolved_type_id]; if (new_type_id > BTF_MAX_NR_TYPES) return -EINVAL; - return new_type_id; + + *type_id = new_type_id; + return 0; } /* @@ -4519,109 +4370,25 @@ static int btf_dedup_remap_type_id(struct btf_dedup *d, __u32 type_id) * referenced from any BTF type (e.g., struct fields, func proto args, etc) to * their final deduped type IDs. 
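The btf__add_type() API added earlier in this file (and declared in btf.h below) copies one type from a source BTF into a destination BTF, re-adding any referenced strings via the string-offset visitor. A small usage sketch, assuming libbpf headers installed under bpf/::

    #include <stdio.h>
    #include <bpf/btf.h>
    #include <bpf/libbpf.h>

    int main(void)
    {
        struct btf *src = btf__new_empty();
        struct btf *dst = btf__new_empty();
        int id, copied_id;

        if (libbpf_get_error(src) || libbpf_get_error(dst))
            return 1;

        /* add a signed 8-byte "long" INT type to the source BTF */
        id = btf__add_int(src, "long", 8, BTF_INT_SIGNED);
        if (id < 0)
            return 1;

        /* copy it into dst; the "long" name string is appended to dst's
         * string section as part of the copy */
        copied_id = btf__add_type(dst, src, btf__type_by_id(src, id));
        printf("type id in dst: %d\n", copied_id);

        btf__free(src);
        btf__free(dst);
        return copied_id < 0;
    }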
*/ -static int btf_dedup_remap_type(struct btf_dedup *d, __u32 type_id) +static int btf_dedup_remap_types(struct btf_dedup *d) { - struct btf_type *t = btf_type_by_id(d->btf, type_id); int i, r; - switch (btf_kind(t)) { - case BTF_KIND_INT: - case BTF_KIND_ENUM: - case BTF_KIND_FLOAT: - break; - - case BTF_KIND_FWD: - case BTF_KIND_CONST: - case BTF_KIND_VOLATILE: - case BTF_KIND_RESTRICT: - case BTF_KIND_PTR: - case BTF_KIND_TYPEDEF: - case BTF_KIND_FUNC: - case BTF_KIND_VAR: - r = btf_dedup_remap_type_id(d, t->type); - if (r < 0) - return r; - t->type = r; - break; - - case BTF_KIND_ARRAY: { - struct btf_array *arr_info = btf_array(t); - - r = btf_dedup_remap_type_id(d, arr_info->type); - if (r < 0) - return r; - arr_info->type = r; - r = btf_dedup_remap_type_id(d, arr_info->index_type); - if (r < 0) - return r; - arr_info->index_type = r; - break; - } - - case BTF_KIND_STRUCT: - case BTF_KIND_UNION: { - struct btf_member *member = btf_members(t); - __u16 vlen = btf_vlen(t); - - for (i = 0; i < vlen; i++) { - r = btf_dedup_remap_type_id(d, member->type); - if (r < 0) - return r; - member->type = r; - member++; - } - break; - } - - case BTF_KIND_FUNC_PROTO: { - struct btf_param *param = btf_params(t); - __u16 vlen = btf_vlen(t); + for (i = 0; i < d->btf->nr_types; i++) { + struct btf_type *t = btf_type_by_id(d->btf, d->btf->start_id + i); - r = btf_dedup_remap_type_id(d, t->type); - if (r < 0) + r = btf_type_visit_type_ids(t, btf_dedup_remap_type_id, d); + if (r) return r; - t->type = r; - - for (i = 0; i < vlen; i++) { - r = btf_dedup_remap_type_id(d, param->type); - if (r < 0) - return r; - param->type = r; - param++; - } - break; - } - - case BTF_KIND_DATASEC: { - struct btf_var_secinfo *var = btf_var_secinfos(t); - __u16 vlen = btf_vlen(t); - - for (i = 0; i < vlen; i++) { - r = btf_dedup_remap_type_id(d, var->type); - if (r < 0) - return r; - var->type = r; - var++; - } - break; - } - - default: - return -EINVAL; } - return 0; -} + if (!d->btf_ext) + return 0; -static int btf_dedup_remap_types(struct btf_dedup *d) -{ - int i, r; + r = btf_ext_visit_type_ids(d->btf_ext, btf_dedup_remap_type_id, d); + if (r) + return r; - for (i = 0; i < d->btf->nr_types; i++) { - r = btf_dedup_remap_type(d, d->btf->start_id + i); - if (r < 0) - return r; - } return 0; } @@ -4675,3 +4442,200 @@ struct btf *libbpf_find_kernel_btf(void) pr_warn("failed to find valid kernel BTF\n"); return ERR_PTR(-ESRCH); } + +int btf_type_visit_type_ids(struct btf_type *t, type_id_visit_fn visit, void *ctx) +{ + int i, n, err; + + switch (btf_kind(t)) { + case BTF_KIND_INT: + case BTF_KIND_FLOAT: + case BTF_KIND_ENUM: + return 0; + + case BTF_KIND_FWD: + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FUNC: + case BTF_KIND_VAR: + return visit(&t->type, ctx); + + case BTF_KIND_ARRAY: { + struct btf_array *a = btf_array(t); + + err = visit(&a->type, ctx); + err = err ?: visit(&a->index_type, ctx); + return err; + } + + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + struct btf_member *m = btf_members(t); + + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->type, ctx); + if (err) + return err; + } + return 0; + } + + case BTF_KIND_FUNC_PROTO: { + struct btf_param *m = btf_params(t); + + err = visit(&t->type, ctx); + if (err) + return err; + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->type, ctx); + if (err) + return err; + } + return 0; + } + + case BTF_KIND_DATASEC: { + struct btf_var_secinfo *m = 
btf_var_secinfos(t); + + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->type, ctx); + if (err) + return err; + } + return 0; + } + + default: + return -EINVAL; + } +} + +int btf_type_visit_str_offs(struct btf_type *t, str_off_visit_fn visit, void *ctx) +{ + int i, n, err; + + err = visit(&t->name_off, ctx); + if (err) + return err; + + switch (btf_kind(t)) { + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + struct btf_member *m = btf_members(t); + + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->name_off, ctx); + if (err) + return err; + } + break; + } + case BTF_KIND_ENUM: { + struct btf_enum *m = btf_enum(t); + + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->name_off, ctx); + if (err) + return err; + } + break; + } + case BTF_KIND_FUNC_PROTO: { + struct btf_param *m = btf_params(t); + + for (i = 0, n = btf_vlen(t); i < n; i++, m++) { + err = visit(&m->name_off, ctx); + if (err) + return err; + } + break; + } + default: + break; + } + + return 0; +} + +int btf_ext_visit_type_ids(struct btf_ext *btf_ext, type_id_visit_fn visit, void *ctx) +{ + const struct btf_ext_info *seg; + struct btf_ext_info_sec *sec; + int i, err; + + seg = &btf_ext->func_info; + for_each_btf_ext_sec(seg, sec) { + struct bpf_func_info_min *rec; + + for_each_btf_ext_rec(seg, sec, i, rec) { + err = visit(&rec->type_id, ctx); + if (err < 0) + return err; + } + } + + seg = &btf_ext->core_relo_info; + for_each_btf_ext_sec(seg, sec) { + struct bpf_core_relo *rec; + + for_each_btf_ext_rec(seg, sec, i, rec) { + err = visit(&rec->type_id, ctx); + if (err < 0) + return err; + } + } + + return 0; +} + +int btf_ext_visit_str_offs(struct btf_ext *btf_ext, str_off_visit_fn visit, void *ctx) +{ + const struct btf_ext_info *seg; + struct btf_ext_info_sec *sec; + int i, err; + + seg = &btf_ext->func_info; + for_each_btf_ext_sec(seg, sec) { + err = visit(&sec->sec_name_off, ctx); + if (err) + return err; + } + + seg = &btf_ext->line_info; + for_each_btf_ext_sec(seg, sec) { + struct bpf_line_info_min *rec; + + err = visit(&sec->sec_name_off, ctx); + if (err) + return err; + + for_each_btf_ext_rec(seg, sec, i, rec) { + err = visit(&rec->file_name_off, ctx); + if (err) + return err; + err = visit(&rec->line_off, ctx); + if (err) + return err; + } + } + + seg = &btf_ext->core_relo_info; + for_each_btf_ext_sec(seg, sec) { + struct bpf_core_relo *rec; + + err = visit(&sec->sec_name_off, ctx); + if (err) + return err; + + for_each_btf_ext_rec(seg, sec, i, rec) { + err = visit(&rec->access_str_off, ctx); + if (err) + return err; + } + } + + return 0; +} diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h index 029a9cfc8c2d..b54f1c3ebd57 100644 --- a/tools/lib/bpf/btf.h +++ b/tools/lib/bpf/btf.h @@ -93,6 +93,8 @@ LIBBPF_API struct btf *libbpf_find_kernel_btf(void); LIBBPF_API int btf__find_str(struct btf *btf, const char *s); LIBBPF_API int btf__add_str(struct btf *btf, const char *s); +LIBBPF_API int btf__add_type(struct btf *btf, const struct btf *src_btf, + const struct btf_type *src_type); LIBBPF_API int btf__add_int(struct btf *btf, const char *name, size_t byte_sz, int encoding); LIBBPF_API int btf__add_float(struct btf *btf, const char *name, size_t byte_sz); @@ -174,6 +176,7 @@ struct btf_dump_emit_type_decl_opts { int indent_level; /* strip all the const/volatile/restrict mods */ bool strip_mods; + size_t :0; }; #define btf_dump_emit_type_decl_opts__last_field strip_mods diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index 5e957fcceee6..5e2809d685bf 100644 
--- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -166,11 +166,11 @@ static int btf_dump_resize(struct btf_dump *d) if (last_id <= d->last_id) return 0; - if (btf_ensure_mem((void **)&d->type_states, &d->type_states_cap, - sizeof(*d->type_states), last_id + 1)) + if (libbpf_ensure_mem((void **)&d->type_states, &d->type_states_cap, + sizeof(*d->type_states), last_id + 1)) return -ENOMEM; - if (btf_ensure_mem((void **)&d->cached_names, &d->cached_names_cap, - sizeof(*d->cached_names), last_id + 1)) + if (libbpf_ensure_mem((void **)&d->cached_names, &d->cached_names_cap, + sizeof(*d->cached_names), last_id + 1)) return -ENOMEM; if (d->last_id == 0) { @@ -464,7 +464,7 @@ static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr) return err; case BTF_KIND_ARRAY: - return btf_dump_order_type(d, btf_array(t)->type, through_ptr); + return btf_dump_order_type(d, btf_array(t)->type, false); case BTF_KIND_STRUCT: case BTF_KIND_UNION: { diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 2f351d3ad3e7..7aad78dbb4b4 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -55,10 +55,6 @@ #include "libbpf_internal.h" #include "hashmap.h" -#ifndef EM_BPF -#define EM_BPF 247 -#endif - #ifndef BPF_FS_MAGIC #define BPF_FS_MAGIC 0xcafe4a11 #endif @@ -189,7 +185,8 @@ enum reloc_type { RELO_LD64, RELO_CALL, RELO_DATA, - RELO_EXTERN, + RELO_EXTERN_VAR, + RELO_EXTERN_FUNC, RELO_SUBPROG_ADDR, }; @@ -577,14 +574,19 @@ static bool insn_is_subprog_call(const struct bpf_insn *insn) insn->off == 0; } -static bool is_ldimm64(struct bpf_insn *insn) +static bool is_ldimm64_insn(struct bpf_insn *insn) { return insn->code == (BPF_LD | BPF_IMM | BPF_DW); } +static bool is_call_insn(const struct bpf_insn *insn) +{ + return insn->code == (BPF_JMP | BPF_CALL); +} + static bool insn_is_pseudo_func(struct bpf_insn *insn) { - return is_ldimm64(insn) && insn->src_reg == BPF_PSEUDO_FUNC; + return is_ldimm64_insn(insn) && insn->src_reg == BPF_PSEUDO_FUNC; } static int @@ -1134,11 +1136,6 @@ static void bpf_object__elf_finish(struct bpf_object *obj) obj->efile.obj_buf_sz = 0; } -/* if libelf is old and doesn't support mmap(), fall back to read() */ -#ifndef ELF_C_READ_MMAP -#define ELF_C_READ_MMAP ELF_C_READ -#endif - static int bpf_object__elf_init(struct bpf_object *obj) { int err = 0; @@ -1194,7 +1191,8 @@ static int bpf_object__elf_init(struct bpf_object *obj) if (!elf_rawdata(elf_getscn(obj->efile.elf, obj->efile.shstrndx), NULL)) { pr_warn("elf: failed to get section names strings from %s: %s\n", obj->path, elf_errmsg(-1)); - return -LIBBPF_ERRNO__FORMAT; + err = -LIBBPF_ERRNO__FORMAT; + goto errout; } /* Old LLVM set e_machine to EM_NONE */ @@ -1929,9 +1927,9 @@ resolve_func_ptr(const struct btf *btf, __u32 id, __u32 *res_id) return btf_is_func_proto(t) ? t : NULL; } -static const char *btf_kind_str(const struct btf_type *t) +static const char *__btf_kind_str(__u16 kind) { - switch (btf_kind(t)) { + switch (kind) { case BTF_KIND_UNKN: return "void"; case BTF_KIND_INT: return "int"; case BTF_KIND_PTR: return "ptr"; @@ -1953,6 +1951,16 @@ static const char *btf_kind_str(const struct btf_type *t) } } +static const char *btf_kind_str(const struct btf_type *t) +{ + return __btf_kind_str(btf_kind(t)); +} + +static enum btf_func_linkage btf_func_linkage(const struct btf_type *t) +{ + return (enum btf_func_linkage)BTF_INFO_VLEN(t->info); +} + /* * Fetch integer attribute of BTF map definition. 
Such attributes are * represented using a pointer to an array, in which dimensionality of array @@ -2807,7 +2815,7 @@ static bool ignore_elf_section(GElf_Shdr *hdr, const char *name) return true; /* ignore .llvm_addrsig section as well */ - if (hdr->sh_type == 0x6FFF4C03 /* SHT_LLVM_ADDRSIG */) + if (hdr->sh_type == SHT_LLVM_ADDRSIG) return true; /* no subprograms will lead to an empty .text section, ignore it */ @@ -3017,7 +3025,7 @@ static bool sym_is_subprog(const GElf_Sym *sym, int text_shndx) static int find_extern_btf_id(const struct btf *btf, const char *ext_name) { const struct btf_type *t; - const char *var_name; + const char *tname; int i, n; if (!btf) @@ -3027,14 +3035,18 @@ static int find_extern_btf_id(const struct btf *btf, const char *ext_name) for (i = 1; i <= n; i++) { t = btf__type_by_id(btf, i); - if (!btf_is_var(t)) + if (!btf_is_var(t) && !btf_is_func(t)) continue; - var_name = btf__name_by_offset(btf, t->name_off); - if (strcmp(var_name, ext_name)) + tname = btf__name_by_offset(btf, t->name_off); + if (strcmp(tname, ext_name)) continue; - if (btf_var(t)->linkage != BTF_VAR_GLOBAL_EXTERN) + if (btf_is_var(t) && + btf_var(t)->linkage != BTF_VAR_GLOBAL_EXTERN) + return -EINVAL; + + if (btf_is_func(t) && btf_func_linkage(t) != BTF_FUNC_EXTERN) return -EINVAL; return i; @@ -3147,12 +3159,48 @@ static int find_int_btf_id(const struct btf *btf) return 0; } +static int add_dummy_ksym_var(struct btf *btf) +{ + int i, int_btf_id, sec_btf_id, dummy_var_btf_id; + const struct btf_var_secinfo *vs; + const struct btf_type *sec; + + sec_btf_id = btf__find_by_name_kind(btf, KSYMS_SEC, + BTF_KIND_DATASEC); + if (sec_btf_id < 0) + return 0; + + sec = btf__type_by_id(btf, sec_btf_id); + vs = btf_var_secinfos(sec); + for (i = 0; i < btf_vlen(sec); i++, vs++) { + const struct btf_type *vt; + + vt = btf__type_by_id(btf, vs->type); + if (btf_is_func(vt)) + break; + } + + /* No func in ksyms sec. No need to add dummy var. 
*/ + if (i == btf_vlen(sec)) + return 0; + + int_btf_id = find_int_btf_id(btf); + dummy_var_btf_id = btf__add_var(btf, + "dummy_ksym", + BTF_VAR_GLOBAL_ALLOCATED, + int_btf_id); + if (dummy_var_btf_id < 0) + pr_warn("cannot create a dummy_ksym var\n"); + + return dummy_var_btf_id; +} + static int bpf_object__collect_externs(struct bpf_object *obj) { struct btf_type *sec, *kcfg_sec = NULL, *ksym_sec = NULL; const struct btf_type *t; struct extern_desc *ext; - int i, n, off; + int i, n, off, dummy_var_btf_id; const char *ext_name, *sec_name; Elf_Scn *scn; GElf_Shdr sh; @@ -3164,6 +3212,10 @@ static int bpf_object__collect_externs(struct bpf_object *obj) if (elf_sec_hdr(obj, scn, &sh)) return -LIBBPF_ERRNO__FORMAT; + dummy_var_btf_id = add_dummy_ksym_var(obj->btf); + if (dummy_var_btf_id < 0) + return dummy_var_btf_id; + n = sh.sh_size / sh.sh_entsize; pr_debug("looking for externs among %d symbols...\n", n); @@ -3208,6 +3260,11 @@ static int bpf_object__collect_externs(struct bpf_object *obj) sec_name = btf__name_by_offset(obj->btf, sec->name_off); if (strcmp(sec_name, KCONFIG_SEC) == 0) { + if (btf_is_func(t)) { + pr_warn("extern function %s is unsupported under %s section\n", + ext->name, KCONFIG_SEC); + return -ENOTSUP; + } kcfg_sec = sec; ext->type = EXT_KCFG; ext->kcfg.sz = btf__resolve_size(obj->btf, t->type); @@ -3229,6 +3286,11 @@ static int bpf_object__collect_externs(struct bpf_object *obj) return -ENOTSUP; } } else if (strcmp(sec_name, KSYMS_SEC) == 0) { + if (btf_is_func(t) && ext->is_weak) { + pr_warn("extern weak function %s is unsupported\n", + ext->name); + return -ENOTSUP; + } ksym_sec = sec; ext->type = EXT_KSYM; skip_mods_and_typedefs(obj->btf, t->type, @@ -3255,7 +3317,14 @@ static int bpf_object__collect_externs(struct bpf_object *obj) * extern variables in DATASEC */ int int_btf_id = find_int_btf_id(obj->btf); + /* For extern function, a dummy_var added earlier + * will be used to replace the vs->type and + * its name string will be used to refill + * the missing param's name. + */ + const struct btf_type *dummy_var; + dummy_var = btf__type_by_id(obj->btf, dummy_var_btf_id); for (i = 0; i < obj->nr_extern; i++) { ext = &obj->externs[i]; if (ext->type != EXT_KSYM) @@ -3274,12 +3343,32 @@ static int bpf_object__collect_externs(struct bpf_object *obj) ext_name = btf__name_by_offset(obj->btf, vt->name_off); ext = find_extern_by_name(obj, ext_name); if (!ext) { - pr_warn("failed to find extern definition for BTF var '%s'\n", - ext_name); + pr_warn("failed to find extern definition for BTF %s '%s'\n", + btf_kind_str(vt), ext_name); return -ESRCH; } - btf_var(vt)->linkage = BTF_VAR_GLOBAL_ALLOCATED; - vt->type = int_btf_id; + if (btf_is_func(vt)) { + const struct btf_type *func_proto; + struct btf_param *param; + int j; + + func_proto = btf__type_by_id(obj->btf, + vt->type); + param = btf_params(func_proto); + /* Reuse the dummy_var string if the + * func proto does not have param name. 
+ */ + for (j = 0; j < btf_vlen(func_proto); j++) + if (param[j].type && !param[j].name_off) + param[j].name_off = + dummy_var->name_off; + vs->type = dummy_var_btf_id; + vt->info &= ~0xffff; + vt->info |= BTF_FUNC_GLOBAL; + } else { + btf_var(vt)->linkage = BTF_VAR_GLOBAL_ALLOCATED; + vt->type = int_btf_id; + } vs->offset = off; vs->size = sizeof(int); } @@ -3411,31 +3500,7 @@ static int bpf_program__record_reloc(struct bpf_program *prog, reloc_desc->processed = false; - /* sub-program call relocation */ - if (insn->code == (BPF_JMP | BPF_CALL)) { - if (insn->src_reg != BPF_PSEUDO_CALL) { - pr_warn("prog '%s': incorrect bpf_call opcode\n", prog->name); - return -LIBBPF_ERRNO__RELOC; - } - /* text_shndx can be 0, if no default "main" program exists */ - if (!shdr_idx || shdr_idx != obj->efile.text_shndx) { - sym_sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, shdr_idx)); - pr_warn("prog '%s': bad call relo against '%s' in section '%s'\n", - prog->name, sym_name, sym_sec_name); - return -LIBBPF_ERRNO__RELOC; - } - if (sym->st_value % BPF_INSN_SZ) { - pr_warn("prog '%s': bad call relo against '%s' at offset %zu\n", - prog->name, sym_name, (size_t)sym->st_value); - return -LIBBPF_ERRNO__RELOC; - } - reloc_desc->type = RELO_CALL; - reloc_desc->insn_idx = insn_idx; - reloc_desc->sym_off = sym->st_value; - return 0; - } - - if (!is_ldimm64(insn)) { + if (!is_call_insn(insn) && !is_ldimm64_insn(insn)) { pr_warn("prog '%s': invalid relo against '%s' for insns[%d].code 0x%x\n", prog->name, sym_name, insn_idx, insn->code); return -LIBBPF_ERRNO__RELOC; @@ -3458,12 +3523,39 @@ static int bpf_program__record_reloc(struct bpf_program *prog, } pr_debug("prog '%s': found extern #%d '%s' (sym %d) for insn #%u\n", prog->name, i, ext->name, ext->sym_idx, insn_idx); - reloc_desc->type = RELO_EXTERN; + if (insn->code == (BPF_JMP | BPF_CALL)) + reloc_desc->type = RELO_EXTERN_FUNC; + else + reloc_desc->type = RELO_EXTERN_VAR; reloc_desc->insn_idx = insn_idx; reloc_desc->sym_off = i; /* sym_off stores extern index */ return 0; } + /* sub-program call relocation */ + if (is_call_insn(insn)) { + if (insn->src_reg != BPF_PSEUDO_CALL) { + pr_warn("prog '%s': incorrect bpf_call opcode\n", prog->name); + return -LIBBPF_ERRNO__RELOC; + } + /* text_shndx can be 0, if no default "main" program exists */ + if (!shdr_idx || shdr_idx != obj->efile.text_shndx) { + sym_sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, shdr_idx)); + pr_warn("prog '%s': bad call relo against '%s' in section '%s'\n", + prog->name, sym_name, sym_sec_name); + return -LIBBPF_ERRNO__RELOC; + } + if (sym->st_value % BPF_INSN_SZ) { + pr_warn("prog '%s': bad call relo against '%s' at offset %zu\n", + prog->name, sym_name, (size_t)sym->st_value); + return -LIBBPF_ERRNO__RELOC; + } + reloc_desc->type = RELO_CALL; + reloc_desc->insn_idx = insn_idx; + reloc_desc->sym_off = sym->st_value; + return 0; + } + if (!shdr_idx || shdr_idx >= SHN_LORESERVE) { pr_warn("prog '%s': invalid relo against '%s' in special section 0x%x; forgot to initialize global var?..\n", prog->name, sym_name, shdr_idx); @@ -4867,8 +4959,8 @@ static int load_module_btfs(struct bpf_object *obj) goto err_out; } - err = btf_ensure_mem((void **)&obj->btf_modules, &obj->btf_module_cap, - sizeof(*obj->btf_modules), obj->btf_module_cnt + 1); + err = libbpf_ensure_mem((void **)&obj->btf_modules, &obj->btf_module_cap, + sizeof(*obj->btf_modules), obj->btf_module_cnt + 1); if (err) goto err_out; @@ -5703,7 +5795,7 @@ poison: /* poison second part of ldimm64 to avoid confusing error from * verifier 
about "unknown opcode 00" */ - if (is_ldimm64(insn)) + if (is_ldimm64_insn(insn)) bpf_core_poison_insn(prog, relo_idx, insn_idx + 1, insn + 1); bpf_core_poison_insn(prog, relo_idx, insn_idx, insn); return 0; @@ -5779,7 +5871,7 @@ poison: case BPF_LD: { __u64 imm; - if (!is_ldimm64(insn) || + if (!is_ldimm64_insn(insn) || insn[0].src_reg != 0 || insn[0].off != 0 || insn_idx + 1 >= prog->insns_cnt || insn[1].code != 0 || insn[1].dst_reg != 0 || @@ -6221,7 +6313,7 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog) insn[0].imm = obj->maps[relo->map_idx].fd; relo->processed = true; break; - case RELO_EXTERN: + case RELO_EXTERN_VAR: ext = &obj->externs[relo->sym_off]; if (ext->type == EXT_KCFG) { insn[0].src_reg = BPF_PSEUDO_MAP_VALUE; @@ -6239,6 +6331,12 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog) } relo->processed = true; break; + case RELO_EXTERN_FUNC: + ext = &obj->externs[relo->sym_off]; + insn[0].src_reg = BPF_PSEUDO_KFUNC_CALL; + insn[0].imm = ext->ksym.kernel_btf_id; + relo->processed = true; + break; case RELO_SUBPROG_ADDR: insn[0].src_reg = BPF_PSEUDO_FUNC; /* will be handled as a follow up pass */ @@ -7359,6 +7457,7 @@ static int bpf_object__read_kallsyms_file(struct bpf_object *obj) { char sym_type, sym_name[500]; unsigned long long sym_addr; + const struct btf_type *t; struct extern_desc *ext; int ret, err = 0; FILE *f; @@ -7385,6 +7484,10 @@ static int bpf_object__read_kallsyms_file(struct bpf_object *obj) if (!ext || ext->type != EXT_KSYM) continue; + t = btf__type_by_id(obj->btf, ext->btf_id); + if (!btf_is_var(t)) + continue; + if (ext->is_set && ext->ksym.addr != sym_addr) { pr_warn("extern (ksym) '%s' resolution is ambiguous: 0x%llx or 0x%llx\n", sym_name, ext->ksym.addr, sym_addr); @@ -7403,75 +7506,151 @@ out: return err; } -static int bpf_object__resolve_ksyms_btf_id(struct bpf_object *obj) +static int find_ksym_btf_id(struct bpf_object *obj, const char *ksym_name, + __u16 kind, struct btf **res_btf, + int *res_btf_fd) { - struct extern_desc *ext; + int i, id, btf_fd, err; struct btf *btf; - int i, j, id, btf_fd, err; - for (i = 0; i < obj->nr_extern; i++) { - const struct btf_type *targ_var, *targ_type; - __u32 targ_type_id, local_type_id; - const char *targ_var_name; - int ret; - - ext = &obj->externs[i]; - if (ext->type != EXT_KSYM || !ext->ksym.type_id) - continue; + btf = obj->btf_vmlinux; + btf_fd = 0; + id = btf__find_by_name_kind(btf, ksym_name, kind); - btf = obj->btf_vmlinux; - btf_fd = 0; - id = btf__find_by_name_kind(btf, ext->name, BTF_KIND_VAR); - if (id == -ENOENT) { - err = load_module_btfs(obj); - if (err) - return err; + if (id == -ENOENT) { + err = load_module_btfs(obj); + if (err) + return err; - for (j = 0; j < obj->btf_module_cnt; j++) { - btf = obj->btf_modules[j].btf; - /* we assume module BTF FD is always >0 */ - btf_fd = obj->btf_modules[j].fd; - id = btf__find_by_name_kind(btf, ext->name, BTF_KIND_VAR); - if (id != -ENOENT) - break; - } - } - if (id <= 0) { - pr_warn("extern (ksym) '%s': failed to find BTF ID in kernel BTF(s).\n", - ext->name); - return -ESRCH; + for (i = 0; i < obj->btf_module_cnt; i++) { + btf = obj->btf_modules[i].btf; + /* we assume module BTF FD is always >0 */ + btf_fd = obj->btf_modules[i].fd; + id = btf__find_by_name_kind(btf, ksym_name, kind); + if (id != -ENOENT) + break; } + } + if (id <= 0) { + pr_warn("extern (%s ksym) '%s': failed to find BTF ID in kernel BTF(s).\n", + __btf_kind_str(kind), ksym_name); + return -ESRCH; + } - /* find local type_id */ - 
local_type_id = ext->ksym.type_id; + *res_btf = btf; + *res_btf_fd = btf_fd; + return id; +} - /* find target type_id */ - targ_var = btf__type_by_id(btf, id); - targ_var_name = btf__name_by_offset(btf, targ_var->name_off); - targ_type = skip_mods_and_typedefs(btf, targ_var->type, &targ_type_id); +static int bpf_object__resolve_ksym_var_btf_id(struct bpf_object *obj, + struct extern_desc *ext) +{ + const struct btf_type *targ_var, *targ_type; + __u32 targ_type_id, local_type_id; + const char *targ_var_name; + int id, btf_fd = 0, err; + struct btf *btf = NULL; - ret = bpf_core_types_are_compat(obj->btf, local_type_id, - btf, targ_type_id); - if (ret <= 0) { - const struct btf_type *local_type; - const char *targ_name, *local_name; + id = find_ksym_btf_id(obj, ext->name, BTF_KIND_VAR, &btf, &btf_fd); + if (id < 0) + return id; - local_type = btf__type_by_id(obj->btf, local_type_id); - local_name = btf__name_by_offset(obj->btf, local_type->name_off); - targ_name = btf__name_by_offset(btf, targ_type->name_off); + /* find local type_id */ + local_type_id = ext->ksym.type_id; - pr_warn("extern (ksym) '%s': incompatible types, expected [%d] %s %s, but kernel has [%d] %s %s\n", - ext->name, local_type_id, - btf_kind_str(local_type), local_name, targ_type_id, - btf_kind_str(targ_type), targ_name); - return -EINVAL; - } + /* find target type_id */ + targ_var = btf__type_by_id(btf, id); + targ_var_name = btf__name_by_offset(btf, targ_var->name_off); + targ_type = skip_mods_and_typedefs(btf, targ_var->type, &targ_type_id); - ext->is_set = true; - ext->ksym.kernel_btf_obj_fd = btf_fd; - ext->ksym.kernel_btf_id = id; - pr_debug("extern (ksym) '%s': resolved to [%d] %s %s\n", - ext->name, id, btf_kind_str(targ_var), targ_var_name); + err = bpf_core_types_are_compat(obj->btf, local_type_id, + btf, targ_type_id); + if (err <= 0) { + const struct btf_type *local_type; + const char *targ_name, *local_name; + + local_type = btf__type_by_id(obj->btf, local_type_id); + local_name = btf__name_by_offset(obj->btf, local_type->name_off); + targ_name = btf__name_by_offset(btf, targ_type->name_off); + + pr_warn("extern (var ksym) '%s': incompatible types, expected [%d] %s %s, but kernel has [%d] %s %s\n", + ext->name, local_type_id, + btf_kind_str(local_type), local_name, targ_type_id, + btf_kind_str(targ_type), targ_name); + return -EINVAL; + } + + ext->is_set = true; + ext->ksym.kernel_btf_obj_fd = btf_fd; + ext->ksym.kernel_btf_id = id; + pr_debug("extern (var ksym) '%s': resolved to [%d] %s %s\n", + ext->name, id, btf_kind_str(targ_var), targ_var_name); + + return 0; +} + +static int bpf_object__resolve_ksym_func_btf_id(struct bpf_object *obj, + struct extern_desc *ext) +{ + int local_func_proto_id, kfunc_proto_id, kfunc_id; + const struct btf_type *kern_func; + struct btf *kern_btf = NULL; + int ret, kern_btf_fd = 0; + + local_func_proto_id = ext->ksym.type_id; + + kfunc_id = find_ksym_btf_id(obj, ext->name, BTF_KIND_FUNC, + &kern_btf, &kern_btf_fd); + if (kfunc_id < 0) { + pr_warn("extern (func ksym) '%s': not found in kernel BTF\n", + ext->name); + return kfunc_id; + } + + if (kern_btf != obj->btf_vmlinux) { + pr_warn("extern (func ksym) '%s': function in kernel module is not supported\n", + ext->name); + return -ENOTSUP; + } + + kern_func = btf__type_by_id(kern_btf, kfunc_id); + kfunc_proto_id = kern_func->type; + + ret = bpf_core_types_are_compat(obj->btf, local_func_proto_id, + kern_btf, kfunc_proto_id); + if (ret <= 0) { + pr_warn("extern (func ksym) '%s': func_proto [%d] incompatible with kernel [%d]\n", 
+ ext->name, local_func_proto_id, kfunc_proto_id); + return -EINVAL; + } + + ext->is_set = true; + ext->ksym.kernel_btf_obj_fd = kern_btf_fd; + ext->ksym.kernel_btf_id = kfunc_id; + pr_debug("extern (func ksym) '%s': resolved to kernel [%d]\n", + ext->name, kfunc_id); + + return 0; +} + +static int bpf_object__resolve_ksyms_btf_id(struct bpf_object *obj) +{ + const struct btf_type *t; + struct extern_desc *ext; + int i, err; + + for (i = 0; i < obj->nr_extern; i++) { + ext = &obj->externs[i]; + if (ext->type != EXT_KSYM || !ext->ksym.type_id) + continue; + + t = btf__type_by_id(obj->btf, ext->btf_id); + if (btf_is_var(t)) + err = bpf_object__resolve_ksym_var_btf_id(obj, ext); + else + err = bpf_object__resolve_ksym_func_btf_id(obj, ext); + if (err) + return err; } return 0; } @@ -8278,6 +8457,16 @@ int bpf_object__btf_fd(const struct bpf_object *obj) return obj->btf ? btf__fd(obj->btf) : -1; } +int bpf_object__set_kversion(struct bpf_object *obj, __u32 kern_version) +{ + if (obj->loaded) + return -EINVAL; + + obj->kern_version = kern_version; + + return 0; +} + int bpf_object__set_priv(struct bpf_object *obj, void *priv, bpf_object_clear_priv_t clear_priv) { @@ -8466,7 +8655,7 @@ int bpf_program__nth_fd(const struct bpf_program *prog, int n) return fd; } -enum bpf_prog_type bpf_program__get_type(struct bpf_program *prog) +enum bpf_prog_type bpf_program__get_type(const struct bpf_program *prog) { return prog->type; } @@ -8511,7 +8700,7 @@ BPF_PROG_TYPE_FNS(extension, BPF_PROG_TYPE_EXT); BPF_PROG_TYPE_FNS(sk_lookup, BPF_PROG_TYPE_SK_LOOKUP); enum bpf_attach_type -bpf_program__get_expected_attach_type(struct bpf_program *prog) +bpf_program__get_expected_attach_type(const struct bpf_program *prog) { return prog->expected_attach_type; } diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 3c35eb401931..f500621d28e5 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -143,6 +143,7 @@ LIBBPF_API int bpf_object__unload(struct bpf_object *obj); LIBBPF_API const char *bpf_object__name(const struct bpf_object *obj); LIBBPF_API unsigned int bpf_object__kversion(const struct bpf_object *obj); +LIBBPF_API int bpf_object__set_kversion(struct bpf_object *obj, __u32 kern_version); struct btf; LIBBPF_API struct btf *bpf_object__btf(const struct bpf_object *obj); @@ -361,12 +362,12 @@ LIBBPF_API int bpf_program__set_struct_ops(struct bpf_program *prog); LIBBPF_API int bpf_program__set_extension(struct bpf_program *prog); LIBBPF_API int bpf_program__set_sk_lookup(struct bpf_program *prog); -LIBBPF_API enum bpf_prog_type bpf_program__get_type(struct bpf_program *prog); +LIBBPF_API enum bpf_prog_type bpf_program__get_type(const struct bpf_program *prog); LIBBPF_API void bpf_program__set_type(struct bpf_program *prog, enum bpf_prog_type type); LIBBPF_API enum bpf_attach_type -bpf_program__get_expected_attach_type(struct bpf_program *prog); +bpf_program__get_expected_attach_type(const struct bpf_program *prog); LIBBPF_API void bpf_program__set_expected_attach_type(struct bpf_program *prog, enum bpf_attach_type type); @@ -507,6 +508,7 @@ struct xdp_link_info { struct bpf_xdp_set_link_opts { size_t sz; int old_fd; + size_t :0; }; #define bpf_xdp_set_link_opts__last_field old_fd @@ -759,6 +761,19 @@ enum libbpf_tristate { TRI_MODULE = 2, }; +struct bpf_linker_opts { + /* size of this struct, for forward/backward compatiblity */ + size_t sz; +}; +#define bpf_linker_opts__last_field sz + +struct bpf_linker; + +LIBBPF_API struct bpf_linker *bpf_linker__new(const char *filename, struct 
bpf_linker_opts *opts); +LIBBPF_API int bpf_linker__add_file(struct bpf_linker *linker, const char *filename); +LIBBPF_API int bpf_linker__finalize(struct bpf_linker *linker); +LIBBPF_API void bpf_linker__free(struct bpf_linker *linker); + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index ec898f464ab9..f5990f7208ce 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -354,4 +354,10 @@ LIBBPF_0.3.0 { LIBBPF_0.4.0 { global: btf__add_float; + btf__add_type; + bpf_linker__add_file; + bpf_linker__finalize; + bpf_linker__free; + bpf_linker__new; + bpf_object__set_kversion; } LIBBPF_0.3.0; diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h index 343f6eb05637..6017902c687e 100644 --- a/tools/lib/bpf/libbpf_internal.h +++ b/tools/lib/bpf/libbpf_internal.h @@ -20,6 +20,26 @@ #include "libbpf.h" +#ifndef EM_BPF +#define EM_BPF 247 +#endif + +#ifndef R_BPF_64_64 +#define R_BPF_64_64 1 +#endif +#ifndef R_BPF_64_32 +#define R_BPF_64_32 10 +#endif + +#ifndef SHT_LLVM_ADDRSIG +#define SHT_LLVM_ADDRSIG 0x6FFF4C03 +#endif + +/* if libelf is old and doesn't support mmap(), fall back to read() */ +#ifndef ELF_C_READ_MMAP +#define ELF_C_READ_MMAP ELF_C_READ +#endif + #define BTF_INFO_ENC(kind, kind_flag, vlen) \ ((!!(kind_flag) << 31) | ((kind) << 24) | ((vlen) & BTF_MAX_VLEN)) #define BTF_TYPE_ENC(name, info, size_or_type) (name), (info), (size_or_type) @@ -107,9 +127,14 @@ static inline void *libbpf_reallocarray(void *ptr, size_t nmemb, size_t size) return realloc(ptr, total); } -void *btf_add_mem(void **data, size_t *cap_cnt, size_t elem_sz, - size_t cur_cnt, size_t max_cnt, size_t add_cnt); -int btf_ensure_mem(void **data, size_t *cap_cnt, size_t elem_sz, size_t need_cnt); +struct btf; +struct btf_type; + +struct btf_type *btf_type_by_id(struct btf *btf, __u32 type_id); + +void *libbpf_add_mem(void **data, size_t *cap_cnt, size_t elem_sz, + size_t cur_cnt, size_t max_cnt, size_t add_cnt); +int libbpf_ensure_mem(void **data, size_t *cap_cnt, size_t elem_sz, size_t need_cnt); static inline bool libbpf_validate_opts(const char *opts, size_t opts_sz, size_t user_sz, @@ -351,4 +376,11 @@ struct bpf_core_relo { enum bpf_core_relo_kind kind; }; +typedef int (*type_id_visit_fn)(__u32 *type_id, void *ctx); +typedef int (*str_off_visit_fn)(__u32 *str_off, void *ctx); +int btf_type_visit_type_ids(struct btf_type *t, type_id_visit_fn visit, void *ctx); +int btf_type_visit_str_offs(struct btf_type *t, str_off_visit_fn visit, void *ctx); +int btf_ext_visit_type_ids(struct btf_ext *btf_ext, type_id_visit_fn visit, void *ctx); +int btf_ext_visit_str_offs(struct btf_ext *btf_ext, str_off_visit_fn visit, void *ctx); + #endif /* __LIBBPF_LIBBPF_INTERNAL_H */ diff --git a/tools/lib/bpf/libbpf_util.h b/tools/lib/bpf/libbpf_util.h deleted file mode 100644 index cfbcfc063c81..000000000000 --- a/tools/lib/bpf/libbpf_util.h +++ /dev/null @@ -1,75 +0,0 @@ -/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ -/* Copyright (c) 2019 Facebook */ - -#ifndef __LIBBPF_LIBBPF_UTIL_H -#define __LIBBPF_LIBBPF_UTIL_H - -#include <stdbool.h> -#include <linux/compiler.h> - -#ifdef __cplusplus -extern "C" { -#endif - -/* Use these barrier functions instead of smp_[rw]mb() when they are - * used in a libbpf header file. That way they can be built into the - * application that uses libbpf. 
- */ -#if defined(__i386__) || defined(__x86_64__) -# define libbpf_smp_store_release(p, v) \ - do { \ - asm volatile("" : : : "memory"); \ - WRITE_ONCE(*p, v); \ - } while (0) -# define libbpf_smp_load_acquire(p) \ - ({ \ - typeof(*p) ___p1 = READ_ONCE(*p); \ - asm volatile("" : : : "memory"); \ - ___p1; \ - }) -#elif defined(__aarch64__) -# define libbpf_smp_store_release(p, v) \ - asm volatile ("stlr %w1, %0" : "=Q" (*p) : "r" (v) : "memory") -# define libbpf_smp_load_acquire(p) \ - ({ \ - typeof(*p) ___p1; \ - asm volatile ("ldar %w0, %1" \ - : "=r" (___p1) : "Q" (*p) : "memory"); \ - ___p1; \ - }) -#elif defined(__riscv) -# define libbpf_smp_store_release(p, v) \ - do { \ - asm volatile ("fence rw,w" : : : "memory"); \ - WRITE_ONCE(*p, v); \ - } while (0) -# define libbpf_smp_load_acquire(p) \ - ({ \ - typeof(*p) ___p1 = READ_ONCE(*p); \ - asm volatile ("fence r,rw" : : : "memory"); \ - ___p1; \ - }) -#endif - -#ifndef libbpf_smp_store_release -#define libbpf_smp_store_release(p, v) \ - do { \ - __sync_synchronize(); \ - WRITE_ONCE(*p, v); \ - } while (0) -#endif - -#ifndef libbpf_smp_load_acquire -#define libbpf_smp_load_acquire(p) \ - ({ \ - typeof(*p) ___p1 = READ_ONCE(*p); \ - __sync_synchronize(); \ - ___p1; \ - }) -#endif - -#ifdef __cplusplus -} /* extern "C" */ -#endif - -#endif diff --git a/tools/lib/bpf/linker.c b/tools/lib/bpf/linker.c new file mode 100644 index 000000000000..46b16cbdcda3 --- /dev/null +++ b/tools/lib/bpf/linker.c @@ -0,0 +1,1963 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* + * BPF static linker + * + * Copyright (c) 2021 Facebook + */ +#include <stdbool.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <errno.h> +#include <linux/err.h> +#include <linux/btf.h> +#include <elf.h> +#include <libelf.h> +#include <gelf.h> +#include <fcntl.h> +#include "libbpf.h" +#include "btf.h" +#include "libbpf_internal.h" +#include "strset.h" + +struct src_sec { + const char *sec_name; + /* positional (not necessarily ELF) index in an array of sections */ + int id; + /* positional (not necessarily ELF) index of a matching section in a final object file */ + int dst_id; + /* section data offset in a matching output section */ + int dst_off; + /* whether section is omitted from the final ELF file */ + bool skipped; + /* whether section is an ephemeral section, not mapped to an ELF section */ + bool ephemeral; + + /* ELF info */ + size_t sec_idx; + Elf_Scn *scn; + Elf64_Shdr *shdr; + Elf_Data *data; + + /* corresponding BTF DATASEC type ID */ + int sec_type_id; +}; + +struct src_obj { + const char *filename; + int fd; + Elf *elf; + /* Section header strings section index */ + size_t shstrs_sec_idx; + /* SYMTAB section index */ + size_t symtab_sec_idx; + + struct btf *btf; + struct btf_ext *btf_ext; + + /* List of sections (including ephemeral). Slot zero is unused. 
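+	 * (add_src_sec() keeps index 0 zeroed and assigns ids starting from 1;
+	 * all iteration over this array starts at index 1.)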
*/ + struct src_sec *secs; + int sec_cnt; + + /* mapping of symbol indices from src to dst ELF */ + int *sym_map; + /* mapping from the src BTF type IDs to dst ones */ + int *btf_type_map; +}; + +/* single .BTF.ext data section */ +struct btf_ext_sec_data { + size_t rec_cnt; + __u32 rec_sz; + void *recs; +}; + +struct dst_sec { + char *sec_name; + /* positional (not necessarily ELF) index in an array of sections */ + int id; + + /* ELF info */ + size_t sec_idx; + Elf_Scn *scn; + Elf64_Shdr *shdr; + Elf_Data *data; + + /* final output section size */ + int sec_sz; + /* final output contents of the section */ + void *raw_data; + + /* corresponding STT_SECTION symbol index in SYMTAB */ + int sec_sym_idx; + + /* section's DATASEC variable info, emitted on BTF finalization */ + bool has_btf; + int sec_var_cnt; + struct btf_var_secinfo *sec_vars; + + /* section's .BTF.ext data */ + struct btf_ext_sec_data func_info; + struct btf_ext_sec_data line_info; + struct btf_ext_sec_data core_relo_info; +}; + +struct bpf_linker { + char *filename; + int fd; + Elf *elf; + Elf64_Ehdr *elf_hdr; + + /* Output sections metadata */ + struct dst_sec *secs; + int sec_cnt; + + struct strset *strtab_strs; /* STRTAB unique strings */ + size_t strtab_sec_idx; /* STRTAB section index */ + size_t symtab_sec_idx; /* SYMTAB section index */ + + struct btf *btf; + struct btf_ext *btf_ext; +}; + +#define pr_warn_elf(fmt, ...) \ +do { \ + libbpf_print(LIBBPF_WARN, "libbpf: " fmt ": %s\n", ##__VA_ARGS__, elf_errmsg(-1)); \ +} while (0) + +static int init_output_elf(struct bpf_linker *linker, const char *file); + +static int linker_load_obj_file(struct bpf_linker *linker, const char *filename, struct src_obj *obj); +static int linker_sanity_check_elf(struct src_obj *obj); +static int linker_sanity_check_btf(struct src_obj *obj); +static int linker_sanity_check_btf_ext(struct src_obj *obj); +static int linker_fixup_btf(struct src_obj *obj); +static int linker_append_sec_data(struct bpf_linker *linker, struct src_obj *obj); +static int linker_append_elf_syms(struct bpf_linker *linker, struct src_obj *obj); +static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *obj); +static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj); +static int linker_append_btf_ext(struct bpf_linker *linker, struct src_obj *obj); + +static int finalize_btf(struct bpf_linker *linker); +static int finalize_btf_ext(struct bpf_linker *linker); + +void bpf_linker__free(struct bpf_linker *linker) +{ + int i; + + if (!linker) + return; + + free(linker->filename); + + if (linker->elf) + elf_end(linker->elf); + + if (linker->fd >= 0) + close(linker->fd); + + strset__free(linker->strtab_strs); + + btf__free(linker->btf); + btf_ext__free(linker->btf_ext); + + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + free(sec->sec_name); + free(sec->raw_data); + free(sec->sec_vars); + + free(sec->func_info.recs); + free(sec->line_info.recs); + free(sec->core_relo_info.recs); + } + free(linker->secs); + + free(linker); +} + +struct bpf_linker *bpf_linker__new(const char *filename, struct bpf_linker_opts *opts) +{ + struct bpf_linker *linker; + int err; + + if (!OPTS_VALID(opts, bpf_linker_opts)) + return NULL; + + if (elf_version(EV_CURRENT) == EV_NONE) { + pr_warn_elf("libelf initialization failed"); + return NULL; + } + + linker = calloc(1, sizeof(*linker)); + if (!linker) + return NULL; + + linker->fd = -1; + + err = init_output_elf(linker, filename); + if (err) + goto err_out; + + return 
linker; + +err_out: + bpf_linker__free(linker); + return NULL; +} + +static struct dst_sec *add_dst_sec(struct bpf_linker *linker, const char *sec_name) +{ + struct dst_sec *secs = linker->secs, *sec; + size_t new_cnt = linker->sec_cnt ? linker->sec_cnt + 1 : 2; + + secs = libbpf_reallocarray(secs, new_cnt, sizeof(*secs)); + if (!secs) + return NULL; + + /* zero out newly allocated memory */ + memset(secs + linker->sec_cnt, 0, (new_cnt - linker->sec_cnt) * sizeof(*secs)); + + linker->secs = secs; + linker->sec_cnt = new_cnt; + + sec = &linker->secs[new_cnt - 1]; + sec->id = new_cnt - 1; + sec->sec_name = strdup(sec_name); + if (!sec->sec_name) + return NULL; + + return sec; +} + +static Elf64_Sym *add_new_sym(struct bpf_linker *linker, size_t *sym_idx) +{ + struct dst_sec *symtab = &linker->secs[linker->symtab_sec_idx]; + Elf64_Sym *syms, *sym; + size_t sym_cnt = symtab->sec_sz / sizeof(*sym); + + syms = libbpf_reallocarray(symtab->raw_data, sym_cnt + 1, sizeof(*sym)); + if (!syms) + return NULL; + + sym = &syms[sym_cnt]; + memset(sym, 0, sizeof(*sym)); + + symtab->raw_data = syms; + symtab->sec_sz += sizeof(*sym); + symtab->shdr->sh_size += sizeof(*sym); + symtab->data->d_size += sizeof(*sym); + + if (sym_idx) + *sym_idx = sym_cnt; + + return sym; +} + +static int init_output_elf(struct bpf_linker *linker, const char *file) +{ + int err, str_off; + Elf64_Sym *init_sym; + struct dst_sec *sec; + + linker->filename = strdup(file); + if (!linker->filename) + return -ENOMEM; + + linker->fd = open(file, O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (linker->fd < 0) { + err = -errno; + pr_warn("failed to create '%s': %d\n", file, err); + return err; + } + + linker->elf = elf_begin(linker->fd, ELF_C_WRITE, NULL); + if (!linker->elf) { + pr_warn_elf("failed to create ELF object"); + return -EINVAL; + } + + /* ELF header */ + linker->elf_hdr = elf64_newehdr(linker->elf); + if (!linker->elf_hdr){ + pr_warn_elf("failed to create ELF header"); + return -EINVAL; + } + + linker->elf_hdr->e_machine = EM_BPF; + linker->elf_hdr->e_type = ET_REL; +#if __BYTE_ORDER == __LITTLE_ENDIAN + linker->elf_hdr->e_ident[EI_DATA] = ELFDATA2LSB; +#elif __BYTE_ORDER == __BIG_ENDIAN + linker->elf_hdr->e_ident[EI_DATA] = ELFDATA2MSB; +#else +#error "Unknown __BYTE_ORDER" +#endif + + /* STRTAB */ + /* initialize strset with an empty string to conform to ELF */ + linker->strtab_strs = strset__new(INT_MAX, "", sizeof("")); + if (libbpf_get_error(linker->strtab_strs)) + return libbpf_get_error(linker->strtab_strs); + + sec = add_dst_sec(linker, ".strtab"); + if (!sec) + return -ENOMEM; + + sec->scn = elf_newscn(linker->elf); + if (!sec->scn) { + pr_warn_elf("failed to create STRTAB section"); + return -EINVAL; + } + + sec->shdr = elf64_getshdr(sec->scn); + if (!sec->shdr) + return -EINVAL; + + sec->data = elf_newdata(sec->scn); + if (!sec->data) { + pr_warn_elf("failed to create STRTAB data"); + return -EINVAL; + } + + str_off = strset__add_str(linker->strtab_strs, sec->sec_name); + if (str_off < 0) + return str_off; + + sec->sec_idx = elf_ndxscn(sec->scn); + linker->elf_hdr->e_shstrndx = sec->sec_idx; + linker->strtab_sec_idx = sec->sec_idx; + + sec->shdr->sh_name = str_off; + sec->shdr->sh_type = SHT_STRTAB; + sec->shdr->sh_flags = SHF_STRINGS; + sec->shdr->sh_offset = 0; + sec->shdr->sh_link = 0; + sec->shdr->sh_info = 0; + sec->shdr->sh_addralign = 1; + sec->shdr->sh_size = sec->sec_sz = 0; + sec->shdr->sh_entsize = 0; + + /* SYMTAB */ + sec = add_dst_sec(linker, ".symtab"); + if (!sec) + return -ENOMEM; + + sec->scn = 
elf_newscn(linker->elf); + if (!sec->scn) { + pr_warn_elf("failed to create SYMTAB section"); + return -EINVAL; + } + + sec->shdr = elf64_getshdr(sec->scn); + if (!sec->shdr) + return -EINVAL; + + sec->data = elf_newdata(sec->scn); + if (!sec->data) { + pr_warn_elf("failed to create SYMTAB data"); + return -EINVAL; + } + + str_off = strset__add_str(linker->strtab_strs, sec->sec_name); + if (str_off < 0) + return str_off; + + sec->sec_idx = elf_ndxscn(sec->scn); + linker->symtab_sec_idx = sec->sec_idx; + + sec->shdr->sh_name = str_off; + sec->shdr->sh_type = SHT_SYMTAB; + sec->shdr->sh_flags = 0; + sec->shdr->sh_offset = 0; + sec->shdr->sh_link = linker->strtab_sec_idx; + /* sh_info should be one greater than the index of the last local + * symbol (i.e., binding is STB_LOCAL). But why and who cares? + */ + sec->shdr->sh_info = 0; + sec->shdr->sh_addralign = 8; + sec->shdr->sh_entsize = sizeof(Elf64_Sym); + + /* .BTF */ + linker->btf = btf__new_empty(); + err = libbpf_get_error(linker->btf); + if (err) + return err; + + /* add the special all-zero symbol */ + init_sym = add_new_sym(linker, NULL); + if (!init_sym) + return -EINVAL; + + init_sym->st_name = 0; + init_sym->st_info = 0; + init_sym->st_other = 0; + init_sym->st_shndx = SHN_UNDEF; + init_sym->st_value = 0; + init_sym->st_size = 0; + + return 0; +} + +int bpf_linker__add_file(struct bpf_linker *linker, const char *filename) +{ + struct src_obj obj = {}; + int err = 0; + + if (!linker->elf) + return -EINVAL; + + err = err ?: linker_load_obj_file(linker, filename, &obj); + err = err ?: linker_append_sec_data(linker, &obj); + err = err ?: linker_append_elf_syms(linker, &obj); + err = err ?: linker_append_elf_relos(linker, &obj); + err = err ?: linker_append_btf(linker, &obj); + err = err ?: linker_append_btf_ext(linker, &obj); + + /* free up src_obj resources */ + free(obj.btf_type_map); + btf__free(obj.btf); + btf_ext__free(obj.btf_ext); + free(obj.secs); + free(obj.sym_map); + if (obj.elf) + elf_end(obj.elf); + if (obj.fd >= 0) + close(obj.fd); + + return err; +} + +static bool is_dwarf_sec_name(const char *name) +{ + /* approximation, but the actual list is too long */ + return strncmp(name, ".debug_", sizeof(".debug_") - 1) == 0; +} + +static bool is_ignored_sec(struct src_sec *sec) +{ + Elf64_Shdr *shdr = sec->shdr; + const char *name = sec->sec_name; + + /* no special handling of .strtab */ + if (shdr->sh_type == SHT_STRTAB) + return true; + + /* ignore .llvm_addrsig section as well */ + if (shdr->sh_type == SHT_LLVM_ADDRSIG) + return true; + + /* no subprograms will lead to an empty .text section, ignore it */ + if (shdr->sh_type == SHT_PROGBITS && shdr->sh_size == 0 && + strcmp(sec->sec_name, ".text") == 0) + return true; + + /* DWARF sections */ + if (is_dwarf_sec_name(sec->sec_name)) + return true; + + if (strncmp(name, ".rel", sizeof(".rel") - 1) == 0) { + name += sizeof(".rel") - 1; + /* DWARF section relocations */ + if (is_dwarf_sec_name(name)) + return true; + + /* .BTF and .BTF.ext don't need relocations */ + if (strcmp(name, BTF_ELF_SEC) == 0 || + strcmp(name, BTF_EXT_ELF_SEC) == 0) + return true; + } + + return false; +} + +static struct src_sec *add_src_sec(struct src_obj *obj, const char *sec_name) +{ + struct src_sec *secs = obj->secs, *sec; + size_t new_cnt = obj->sec_cnt ? 
obj->sec_cnt + 1 : 2; + + secs = libbpf_reallocarray(secs, new_cnt, sizeof(*secs)); + if (!secs) + return NULL; + + /* zero out newly allocated memory */ + memset(secs + obj->sec_cnt, 0, (new_cnt - obj->sec_cnt) * sizeof(*secs)); + + obj->secs = secs; + obj->sec_cnt = new_cnt; + + sec = &obj->secs[new_cnt - 1]; + sec->id = new_cnt - 1; + sec->sec_name = sec_name; + + return sec; +} + +static int linker_load_obj_file(struct bpf_linker *linker, const char *filename, struct src_obj *obj) +{ +#if __BYTE_ORDER == __LITTLE_ENDIAN + const int host_endianness = ELFDATA2LSB; +#elif __BYTE_ORDER == __BIG_ENDIAN + const int host_endianness = ELFDATA2MSB; +#else +#error "Unknown __BYTE_ORDER" +#endif + int err = 0; + Elf_Scn *scn; + Elf_Data *data; + Elf64_Ehdr *ehdr; + Elf64_Shdr *shdr; + struct src_sec *sec; + + pr_debug("linker: adding object file '%s'...\n", filename); + + obj->filename = filename; + + obj->fd = open(filename, O_RDONLY); + if (obj->fd < 0) { + err = -errno; + pr_warn("failed to open file '%s': %d\n", filename, err); + return err; + } + obj->elf = elf_begin(obj->fd, ELF_C_READ_MMAP, NULL); + if (!obj->elf) { + err = -errno; + pr_warn_elf("failed to parse ELF file '%s'", filename); + return err; + } + + /* Sanity check ELF file high-level properties */ + ehdr = elf64_getehdr(obj->elf); + if (!ehdr) { + err = -errno; + pr_warn_elf("failed to get ELF header for %s", filename); + return err; + } + if (ehdr->e_ident[EI_DATA] != host_endianness) { + err = -EOPNOTSUPP; + pr_warn_elf("unsupported byte order of ELF file %s", filename); + return err; + } + if (ehdr->e_type != ET_REL + || ehdr->e_machine != EM_BPF + || ehdr->e_ident[EI_CLASS] != ELFCLASS64) { + err = -EOPNOTSUPP; + pr_warn_elf("unsupported kind of ELF file %s", filename); + return err; + } + + if (elf_getshdrstrndx(obj->elf, &obj->shstrs_sec_idx)) { + err = -errno; + pr_warn_elf("failed to get SHSTRTAB section index for %s", filename); + return err; + } + + scn = NULL; + while ((scn = elf_nextscn(obj->elf, scn)) != NULL) { + size_t sec_idx = elf_ndxscn(scn); + const char *sec_name; + + shdr = elf64_getshdr(scn); + if (!shdr) { + err = -errno; + pr_warn_elf("failed to get section #%zu header for %s", + sec_idx, filename); + return err; + } + + sec_name = elf_strptr(obj->elf, obj->shstrs_sec_idx, shdr->sh_name); + if (!sec_name) { + err = -errno; + pr_warn_elf("failed to get section #%zu name for %s", + sec_idx, filename); + return err; + } + + data = elf_getdata(scn, 0); + if (!data) { + err = -errno; + pr_warn_elf("failed to get section #%zu (%s) data from %s", + sec_idx, sec_name, filename); + return err; + } + + sec = add_src_sec(obj, sec_name); + if (!sec) + return -ENOMEM; + + sec->scn = scn; + sec->shdr = shdr; + sec->data = data; + sec->sec_idx = elf_ndxscn(scn); + + if (is_ignored_sec(sec)) { + sec->skipped = true; + continue; + } + + switch (shdr->sh_type) { + case SHT_SYMTAB: + if (obj->symtab_sec_idx) { + err = -EOPNOTSUPP; + pr_warn("multiple SYMTAB sections found, not supported\n"); + return err; + } + obj->symtab_sec_idx = sec_idx; + break; + case SHT_STRTAB: + /* we'll construct our own string table */ + break; + case SHT_PROGBITS: + if (strcmp(sec_name, BTF_ELF_SEC) == 0) { + obj->btf = btf__new(data->d_buf, shdr->sh_size); + err = libbpf_get_error(obj->btf); + if (err) { + pr_warn("failed to parse .BTF from %s: %d\n", filename, err); + return err; + } + sec->skipped = true; + continue; + } + if (strcmp(sec_name, BTF_EXT_ELF_SEC) == 0) { + obj->btf_ext = btf_ext__new(data->d_buf, shdr->sh_size); + err = 
libbpf_get_error(obj->btf_ext); + if (err) { + pr_warn("failed to parse .BTF.ext from '%s': %d\n", filename, err); + return err; + } + sec->skipped = true; + continue; + } + + /* data & code */ + break; + case SHT_NOBITS: + /* BSS */ + break; + case SHT_REL: + /* relocations */ + break; + default: + pr_warn("unrecognized section #%zu (%s) in %s\n", + sec_idx, sec_name, filename); + err = -EINVAL; + return err; + } + } + + err = err ?: linker_sanity_check_elf(obj); + err = err ?: linker_sanity_check_btf(obj); + err = err ?: linker_sanity_check_btf_ext(obj); + err = err ?: linker_fixup_btf(obj); + + return err; +} + +static bool is_pow_of_2(size_t x) +{ + return x && (x & (x - 1)) == 0; +} + +static int linker_sanity_check_elf(struct src_obj *obj) +{ + struct src_sec *sec, *link_sec; + int i, j, n; + + if (!obj->symtab_sec_idx) { + pr_warn("ELF is missing SYMTAB section in %s\n", obj->filename); + return -EINVAL; + } + if (!obj->shstrs_sec_idx) { + pr_warn("ELF is missing section headers STRTAB section in %s\n", obj->filename); + return -EINVAL; + } + + for (i = 1; i < obj->sec_cnt; i++) { + sec = &obj->secs[i]; + + if (sec->sec_name[0] == '\0') { + pr_warn("ELF section #%zu has empty name in %s\n", sec->sec_idx, obj->filename); + return -EINVAL; + } + + if (sec->shdr->sh_addralign && !is_pow_of_2(sec->shdr->sh_addralign)) + return -EINVAL; + if (sec->shdr->sh_addralign != sec->data->d_align) + return -EINVAL; + + if (sec->shdr->sh_size != sec->data->d_size) + return -EINVAL; + + switch (sec->shdr->sh_type) { + case SHT_SYMTAB: { + Elf64_Sym *sym; + + if (sec->shdr->sh_entsize != sizeof(Elf64_Sym)) + return -EINVAL; + if (sec->shdr->sh_size % sec->shdr->sh_entsize != 0) + return -EINVAL; + + if (!sec->shdr->sh_link || sec->shdr->sh_link >= obj->sec_cnt) { + pr_warn("ELF SYMTAB section #%zu points to missing STRTAB section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); + return -EINVAL; + } + link_sec = &obj->secs[sec->shdr->sh_link]; + if (link_sec->shdr->sh_type != SHT_STRTAB) { + pr_warn("ELF SYMTAB section #%zu points to invalid STRTAB section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); + return -EINVAL; + } + + n = sec->shdr->sh_size / sec->shdr->sh_entsize; + sym = sec->data->d_buf; + for (j = 0; j < n; j++, sym++) { + if (sym->st_shndx + && sym->st_shndx < SHN_LORESERVE + && sym->st_shndx >= obj->sec_cnt) { + pr_warn("ELF sym #%d in section #%zu points to missing section #%zu in %s\n", + j, sec->sec_idx, (size_t)sym->st_shndx, obj->filename); + return -EINVAL; + } + if (ELF64_ST_TYPE(sym->st_info) == STT_SECTION) { + if (sym->st_value != 0) + return -EINVAL; + } + } + break; + } + case SHT_STRTAB: + break; + case SHT_PROGBITS: + if (sec->shdr->sh_flags & SHF_EXECINSTR) { + if (sec->shdr->sh_size % sizeof(struct bpf_insn) != 0) + return -EINVAL; + } + break; + case SHT_NOBITS: + break; + case SHT_REL: { + Elf64_Rel *relo; + struct src_sec *sym_sec; + + if (sec->shdr->sh_entsize != sizeof(Elf64_Rel)) + return -EINVAL; + if (sec->shdr->sh_size % sec->shdr->sh_entsize != 0) + return -EINVAL; + + /* SHT_REL's sh_link should point to SYMTAB */ + if (sec->shdr->sh_link != obj->symtab_sec_idx) { + pr_warn("ELF relo section #%zu points to invalid SYMTAB section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_link, obj->filename); + return -EINVAL; + } + + /* SHT_REL's sh_info points to relocated section */ + if (!sec->shdr->sh_info || sec->shdr->sh_info >= obj->sec_cnt) { + pr_warn("ELF relo section #%zu points to missing section 
#%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_info, obj->filename); + return -EINVAL; + } + link_sec = &obj->secs[sec->shdr->sh_info]; + + /* .rel<secname> -> <secname> pattern is followed */ + if (strncmp(sec->sec_name, ".rel", sizeof(".rel") - 1) != 0 + || strcmp(sec->sec_name + sizeof(".rel") - 1, link_sec->sec_name) != 0) { + pr_warn("ELF relo section #%zu name has invalid name in %s\n", + sec->sec_idx, obj->filename); + return -EINVAL; + } + + /* don't further validate relocations for ignored sections */ + if (link_sec->skipped) + break; + + /* relocatable section is data or instructions */ + if (link_sec->shdr->sh_type != SHT_PROGBITS + && link_sec->shdr->sh_type != SHT_NOBITS) { + pr_warn("ELF relo section #%zu points to invalid section #%zu in %s\n", + sec->sec_idx, (size_t)sec->shdr->sh_info, obj->filename); + return -EINVAL; + } + + /* check sanity of each relocation */ + n = sec->shdr->sh_size / sec->shdr->sh_entsize; + relo = sec->data->d_buf; + sym_sec = &obj->secs[obj->symtab_sec_idx]; + for (j = 0; j < n; j++, relo++) { + size_t sym_idx = ELF64_R_SYM(relo->r_info); + size_t sym_type = ELF64_R_TYPE(relo->r_info); + + if (sym_type != R_BPF_64_64 && sym_type != R_BPF_64_32) { + pr_warn("ELF relo #%d in section #%zu has unexpected type %zu in %s\n", + j, sec->sec_idx, sym_type, obj->filename); + return -EINVAL; + } + + if (!sym_idx || sym_idx * sizeof(Elf64_Sym) >= sym_sec->shdr->sh_size) { + pr_warn("ELF relo #%d in section #%zu points to invalid symbol #%zu in %s\n", + j, sec->sec_idx, sym_idx, obj->filename); + return -EINVAL; + } + + if (link_sec->shdr->sh_flags & SHF_EXECINSTR) { + if (relo->r_offset % sizeof(struct bpf_insn) != 0) { + pr_warn("ELF relo #%d in section #%zu points to missing symbol #%zu in %s\n", + j, sec->sec_idx, sym_idx, obj->filename); + return -EINVAL; + } + } + } + break; + } + case SHT_LLVM_ADDRSIG: + break; + default: + pr_warn("ELF section #%zu (%s) has unrecognized type %zu in %s\n", + sec->sec_idx, sec->sec_name, (size_t)sec->shdr->sh_type, obj->filename); + return -EINVAL; + } + } + + return 0; +} + +static int check_btf_type_id(__u32 *type_id, void *ctx) +{ + struct btf *btf = ctx; + + if (*type_id > btf__get_nr_types(btf)) + return -EINVAL; + + return 0; +} + +static int check_btf_str_off(__u32 *str_off, void *ctx) +{ + struct btf *btf = ctx; + const char *s; + + s = btf__str_by_offset(btf, *str_off); + + if (!s) + return -EINVAL; + + return 0; +} + +static int linker_sanity_check_btf(struct src_obj *obj) +{ + struct btf_type *t; + int i, n, err = 0; + + if (!obj->btf) + return 0; + + n = btf__get_nr_types(obj->btf); + for (i = 1; i <= n; i++) { + t = btf_type_by_id(obj->btf, i); + + err = err ?: btf_type_visit_type_ids(t, check_btf_type_id, obj->btf); + err = err ?: btf_type_visit_str_offs(t, check_btf_str_off, obj->btf); + if (err) + return err; + } + + return 0; +} + +static int linker_sanity_check_btf_ext(struct src_obj *obj) +{ + int err = 0; + + if (!obj->btf_ext) + return 0; + + /* can't use .BTF.ext without .BTF */ + if (!obj->btf) + return -EINVAL; + + err = err ?: btf_ext_visit_type_ids(obj->btf_ext, check_btf_type_id, obj->btf); + err = err ?: btf_ext_visit_str_offs(obj->btf_ext, check_btf_str_off, obj->btf); + if (err) + return err; + + return 0; +} + +static int init_sec(struct bpf_linker *linker, struct dst_sec *dst_sec, struct src_sec *src_sec) +{ + Elf_Scn *scn; + Elf_Data *data; + Elf64_Shdr *shdr; + int name_off; + + dst_sec->sec_sz = 0; + dst_sec->sec_idx = 0; + + /* ephemeral sections are just thin section shells 
lacking most parts */ + if (src_sec->ephemeral) + return 0; + + scn = elf_newscn(linker->elf); + if (!scn) + return -1; + data = elf_newdata(scn); + if (!data) + return -1; + shdr = elf64_getshdr(scn); + if (!shdr) + return -1; + + dst_sec->scn = scn; + dst_sec->shdr = shdr; + dst_sec->data = data; + dst_sec->sec_idx = elf_ndxscn(scn); + + name_off = strset__add_str(linker->strtab_strs, src_sec->sec_name); + if (name_off < 0) + return name_off; + + shdr->sh_name = name_off; + shdr->sh_type = src_sec->shdr->sh_type; + shdr->sh_flags = src_sec->shdr->sh_flags; + shdr->sh_size = 0; + /* sh_link and sh_info have different meaning for different types of + * sections, so we leave it up to the caller code to fill them in, if + * necessary + */ + shdr->sh_link = 0; + shdr->sh_info = 0; + shdr->sh_addralign = src_sec->shdr->sh_addralign; + shdr->sh_entsize = src_sec->shdr->sh_entsize; + + data->d_type = src_sec->data->d_type; + data->d_size = 0; + data->d_buf = NULL; + data->d_align = src_sec->data->d_align; + data->d_off = 0; + + return 0; +} + +static struct dst_sec *find_dst_sec_by_name(struct bpf_linker *linker, const char *sec_name) +{ + struct dst_sec *sec; + int i; + + for (i = 1; i < linker->sec_cnt; i++) { + sec = &linker->secs[i]; + + if (strcmp(sec->sec_name, sec_name) == 0) + return sec; + } + + return NULL; +} + +static bool secs_match(struct dst_sec *dst, struct src_sec *src) +{ + if (dst->shdr->sh_type != src->shdr->sh_type) { + pr_warn("sec %s types mismatch\n", dst->sec_name); + return false; + } + if (dst->shdr->sh_flags != src->shdr->sh_flags) { + pr_warn("sec %s flags mismatch\n", dst->sec_name); + return false; + } + if (dst->shdr->sh_entsize != src->shdr->sh_entsize) { + pr_warn("sec %s entsize mismatch\n", dst->sec_name); + return false; + } + + return true; +} + +static bool sec_content_is_same(struct dst_sec *dst_sec, struct src_sec *src_sec) +{ + if (dst_sec->sec_sz != src_sec->shdr->sh_size) + return false; + if (memcmp(dst_sec->raw_data, src_sec->data->d_buf, dst_sec->sec_sz) != 0) + return false; + return true; +} + +static int extend_sec(struct dst_sec *dst, struct src_sec *src) +{ + void *tmp; + size_t dst_align = dst->shdr->sh_addralign; + size_t src_align = src->shdr->sh_addralign; + size_t dst_align_sz, dst_final_sz; + + if (dst_align == 0) + dst_align = 1; + if (dst_align < src_align) + dst_align = src_align; + + dst_align_sz = (dst->sec_sz + dst_align - 1) / dst_align * dst_align; + + /* no need to re-align final size */ + dst_final_sz = dst_align_sz + src->shdr->sh_size; + + if (src->shdr->sh_type != SHT_NOBITS) { + tmp = realloc(dst->raw_data, dst_final_sz); + if (!tmp) + return -ENOMEM; + dst->raw_data = tmp; + + /* pad dst section, if it's alignment forced size increase */ + memset(dst->raw_data + dst->sec_sz, 0, dst_align_sz - dst->sec_sz); + /* now copy src data at a properly aligned offset */ + memcpy(dst->raw_data + dst_align_sz, src->data->d_buf, src->shdr->sh_size); + } + + dst->sec_sz = dst_final_sz; + dst->shdr->sh_size = dst_final_sz; + dst->data->d_size = dst_final_sz; + + dst->shdr->sh_addralign = dst_align; + dst->data->d_align = dst_align; + + src->dst_off = dst_align_sz; + + return 0; +} + +static bool is_data_sec(struct src_sec *sec) +{ + if (!sec || sec->skipped) + return false; + /* ephemeral sections are data sections, e.g., .kconfig, .ksyms */ + if (sec->ephemeral) + return true; + return sec->shdr->sh_type == SHT_PROGBITS || sec->shdr->sh_type == SHT_NOBITS; +} + +static bool is_relo_sec(struct src_sec *sec) +{ + if (!sec || sec->skipped 
|| sec->ephemeral) + return false; + return sec->shdr->sh_type == SHT_REL; +} + +static int linker_append_sec_data(struct bpf_linker *linker, struct src_obj *obj) +{ + int i, err; + + for (i = 1; i < obj->sec_cnt; i++) { + struct src_sec *src_sec; + struct dst_sec *dst_sec; + + src_sec = &obj->secs[i]; + if (!is_data_sec(src_sec)) + continue; + + dst_sec = find_dst_sec_by_name(linker, src_sec->sec_name); + if (!dst_sec) { + dst_sec = add_dst_sec(linker, src_sec->sec_name); + if (!dst_sec) + return -ENOMEM; + err = init_sec(linker, dst_sec, src_sec); + if (err) { + pr_warn("failed to init section '%s'\n", src_sec->sec_name); + return err; + } + } else { + if (!secs_match(dst_sec, src_sec)) { + pr_warn("ELF sections %s are incompatible\n", src_sec->sec_name); + return -1; + } + + /* "license" and "version" sections are deduped */ + if (strcmp(src_sec->sec_name, "license") == 0 + || strcmp(src_sec->sec_name, "version") == 0) { + if (!sec_content_is_same(dst_sec, src_sec)) { + pr_warn("non-identical contents of section '%s' are not supported\n", src_sec->sec_name); + return -EINVAL; + } + src_sec->skipped = true; + src_sec->dst_id = dst_sec->id; + continue; + } + } + + /* record mapped section index */ + src_sec->dst_id = dst_sec->id; + + if (src_sec->ephemeral) + continue; + + err = extend_sec(dst_sec, src_sec); + if (err) + return err; + } + + return 0; +} + +static int linker_append_elf_syms(struct bpf_linker *linker, struct src_obj *obj) +{ + struct src_sec *symtab = &obj->secs[obj->symtab_sec_idx]; + Elf64_Sym *sym = symtab->data->d_buf, *dst_sym; + int i, n = symtab->shdr->sh_size / symtab->shdr->sh_entsize; + int str_sec_idx = symtab->shdr->sh_link; + + obj->sym_map = calloc(n + 1, sizeof(*obj->sym_map)); + if (!obj->sym_map) + return -ENOMEM; + + for (i = 0; i < n; i++, sym++) { + struct src_sec *src_sec = NULL; + struct dst_sec *dst_sec = NULL; + const char *sym_name; + size_t dst_sym_idx; + int name_off; + + /* we already have all-zero initial symbol */ + if (sym->st_name == 0 && sym->st_info == 0 && + sym->st_other == 0 && sym->st_shndx == SHN_UNDEF && + sym->st_value == 0 && sym->st_size ==0) + continue; + + sym_name = elf_strptr(obj->elf, str_sec_idx, sym->st_name); + if (!sym_name) { + pr_warn("can't fetch symbol name for symbol #%d in '%s'\n", i, obj->filename); + return -1; + } + + if (sym->st_shndx && sym->st_shndx < SHN_LORESERVE) { + src_sec = &obj->secs[sym->st_shndx]; + if (src_sec->skipped) + continue; + dst_sec = &linker->secs[src_sec->dst_id]; + + /* allow only one STT_SECTION symbol per section */ + if (ELF64_ST_TYPE(sym->st_info) == STT_SECTION && dst_sec->sec_sym_idx) { + obj->sym_map[i] = dst_sec->sec_sym_idx; + continue; + } + } + + name_off = strset__add_str(linker->strtab_strs, sym_name); + if (name_off < 0) + return name_off; + + dst_sym = add_new_sym(linker, &dst_sym_idx); + if (!dst_sym) + return -ENOMEM; + + dst_sym->st_name = name_off; + dst_sym->st_info = sym->st_info; + dst_sym->st_other = sym->st_other; + dst_sym->st_shndx = src_sec ? dst_sec->sec_idx : sym->st_shndx; + dst_sym->st_value = (src_sec ? 
src_sec->dst_off : 0) + sym->st_value; + dst_sym->st_size = sym->st_size; + + obj->sym_map[i] = dst_sym_idx; + + if (ELF64_ST_TYPE(sym->st_info) == STT_SECTION && dst_sym) { + dst_sec->sec_sym_idx = dst_sym_idx; + dst_sym->st_value = 0; + } + + } + + return 0; +} + +static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *obj) +{ + struct src_sec *src_symtab = &obj->secs[obj->symtab_sec_idx]; + struct dst_sec *dst_symtab = &linker->secs[linker->symtab_sec_idx]; + int i, err; + + for (i = 1; i < obj->sec_cnt; i++) { + struct src_sec *src_sec, *src_linked_sec; + struct dst_sec *dst_sec, *dst_linked_sec; + Elf64_Rel *src_rel, *dst_rel; + int j, n; + + src_sec = &obj->secs[i]; + if (!is_relo_sec(src_sec)) + continue; + + /* shdr->sh_info points to relocatable section */ + src_linked_sec = &obj->secs[src_sec->shdr->sh_info]; + if (src_linked_sec->skipped) + continue; + + dst_sec = find_dst_sec_by_name(linker, src_sec->sec_name); + if (!dst_sec) { + dst_sec = add_dst_sec(linker, src_sec->sec_name); + if (!dst_sec) + return -ENOMEM; + err = init_sec(linker, dst_sec, src_sec); + if (err) { + pr_warn("failed to init section '%s'\n", src_sec->sec_name); + return err; + } + } else if (!secs_match(dst_sec, src_sec)) { + pr_warn("Secs %s are not compatible\n", src_sec->sec_name); + return -1; + } + + /* shdr->sh_link points to SYMTAB */ + dst_sec->shdr->sh_link = linker->symtab_sec_idx; + + /* shdr->sh_info points to relocated section */ + dst_linked_sec = &linker->secs[src_linked_sec->dst_id]; + dst_sec->shdr->sh_info = dst_linked_sec->sec_idx; + + src_sec->dst_id = dst_sec->id; + err = extend_sec(dst_sec, src_sec); + if (err) + return err; + + src_rel = src_sec->data->d_buf; + dst_rel = dst_sec->raw_data + src_sec->dst_off; + n = src_sec->shdr->sh_size / src_sec->shdr->sh_entsize; + for (j = 0; j < n; j++, src_rel++, dst_rel++) { + size_t src_sym_idx = ELF64_R_SYM(src_rel->r_info); + size_t sym_type = ELF64_R_TYPE(src_rel->r_info); + Elf64_Sym *src_sym, *dst_sym; + size_t dst_sym_idx; + + src_sym_idx = ELF64_R_SYM(src_rel->r_info); + src_sym = src_symtab->data->d_buf + sizeof(*src_sym) * src_sym_idx; + + dst_sym_idx = obj->sym_map[src_sym_idx]; + dst_sym = dst_symtab->raw_data + sizeof(*dst_sym) * dst_sym_idx; + dst_rel->r_offset += src_linked_sec->dst_off; + sym_type = ELF64_R_TYPE(src_rel->r_info); + dst_rel->r_info = ELF64_R_INFO(dst_sym_idx, sym_type); + + if (ELF64_ST_TYPE(src_sym->st_info) == STT_SECTION) { + struct src_sec *sec = &obj->secs[src_sym->st_shndx]; + struct bpf_insn *insn; + + if (src_linked_sec->shdr->sh_flags & SHF_EXECINSTR) { + /* calls to the very first static function inside + * .text section at offset 0 will + * reference section symbol, not the + * function symbol. 
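+					 * In that case the section's placement offset in
+					 * the combined output (sec->dst_off) has to be
+					 * folded into the instruction's immediate, scaled
+					 * to instruction units for calls.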
Fix that up, + * otherwise it won't be possible to + * relocate calls to two different + * static functions with the same name + * (rom two different object files) + */ + insn = dst_linked_sec->raw_data + dst_rel->r_offset; + if (insn->code == (BPF_JMP | BPF_CALL)) + insn->imm += sec->dst_off / sizeof(struct bpf_insn); + else + insn->imm += sec->dst_off; + } else { + pr_warn("relocation against STT_SECTION in non-exec section is not supported!\n"); + return -EINVAL; + } + } + + } + } + + return 0; +} + +static struct src_sec *find_src_sec_by_name(struct src_obj *obj, const char *sec_name) +{ + struct src_sec *sec; + int i; + + for (i = 1; i < obj->sec_cnt; i++) { + sec = &obj->secs[i]; + + if (strcmp(sec->sec_name, sec_name) == 0) + return sec; + } + + return NULL; +} + +static Elf64_Sym *find_sym_by_name(struct src_obj *obj, size_t sec_idx, + int sym_type, const char *sym_name) +{ + struct src_sec *symtab = &obj->secs[obj->symtab_sec_idx]; + Elf64_Sym *sym = symtab->data->d_buf; + int i, n = symtab->shdr->sh_size / symtab->shdr->sh_entsize; + int str_sec_idx = symtab->shdr->sh_link; + const char *name; + + for (i = 0; i < n; i++, sym++) { + if (sym->st_shndx != sec_idx) + continue; + if (ELF64_ST_TYPE(sym->st_info) != sym_type) + continue; + + name = elf_strptr(obj->elf, str_sec_idx, sym->st_name); + if (!name) + return NULL; + + if (strcmp(sym_name, name) != 0) + continue; + + return sym; + } + + return NULL; +} + +static int linker_fixup_btf(struct src_obj *obj) +{ + const char *sec_name; + struct src_sec *sec; + int i, j, n, m; + + if (!obj->btf) + return 0; + + n = btf__get_nr_types(obj->btf); + for (i = 1; i <= n; i++) { + struct btf_var_secinfo *vi; + struct btf_type *t; + + t = btf_type_by_id(obj->btf, i); + if (btf_kind(t) != BTF_KIND_DATASEC) + continue; + + sec_name = btf__str_by_offset(obj->btf, t->name_off); + sec = find_src_sec_by_name(obj, sec_name); + if (sec) { + /* record actual section size, unless ephemeral */ + if (sec->shdr) + t->size = sec->shdr->sh_size; + } else { + /* BTF can have some sections that are not represented + * in ELF, e.g., .kconfig and .ksyms, which are used + * for special extern variables. Here we'll + * pre-create "section shells" for them to be able to + * keep track of extra per-section metadata later + * (e.g., BTF variables). 
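+			 * Such shells are marked ephemeral and keep sec_idx 0
+			 * (SHN_UNDEF), so init_sec() never creates a real ELF
+			 * section for them in the output.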
+ */ + sec = add_src_sec(obj, sec_name); + if (!sec) + return -ENOMEM; + + sec->ephemeral = true; + sec->sec_idx = 0; /* will match UNDEF shndx in ELF */ + } + + /* remember ELF section and its BTF type ID match */ + sec->sec_type_id = i; + + /* fix up variable offsets */ + vi = btf_var_secinfos(t); + for (j = 0, m = btf_vlen(t); j < m; j++, vi++) { + const struct btf_type *vt = btf__type_by_id(obj->btf, vi->type); + const char *var_name = btf__str_by_offset(obj->btf, vt->name_off); + int var_linkage = btf_var(vt)->linkage; + Elf64_Sym *sym; + + /* no need to patch up static or extern vars */ + if (var_linkage != BTF_VAR_GLOBAL_ALLOCATED) + continue; + + sym = find_sym_by_name(obj, sec->sec_idx, STT_OBJECT, var_name); + if (!sym) { + pr_warn("failed to find symbol for variable '%s' in section '%s'\n", var_name, sec_name); + return -ENOENT; + } + + vi->offset = sym->st_value; + } + } + + return 0; +} + +static int remap_type_id(__u32 *type_id, void *ctx) +{ + int *id_map = ctx; + + *type_id = id_map[*type_id]; + + return 0; +} + +static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj) +{ + const struct btf_type *t; + int i, j, n, start_id, id; + + if (!obj->btf) + return 0; + + start_id = btf__get_nr_types(linker->btf) + 1; + n = btf__get_nr_types(obj->btf); + + obj->btf_type_map = calloc(n + 1, sizeof(int)); + if (!obj->btf_type_map) + return -ENOMEM; + + for (i = 1; i <= n; i++) { + t = btf__type_by_id(obj->btf, i); + + /* DATASECs are handled specially below */ + if (btf_kind(t) == BTF_KIND_DATASEC) + continue; + + id = btf__add_type(linker->btf, obj->btf, t); + if (id < 0) { + pr_warn("failed to append BTF type #%d from file '%s'\n", i, obj->filename); + return id; + } + + obj->btf_type_map[i] = id; + } + + /* remap all the types except DATASECs */ + n = btf__get_nr_types(linker->btf); + for (i = start_id; i <= n; i++) { + struct btf_type *dst_t = btf_type_by_id(linker->btf, i); + + if (btf_type_visit_type_ids(dst_t, remap_type_id, obj->btf_type_map)) + return -EINVAL; + } + + /* append DATASEC info */ + for (i = 1; i < obj->sec_cnt; i++) { + struct src_sec *src_sec; + struct dst_sec *dst_sec; + const struct btf_var_secinfo *src_var; + struct btf_var_secinfo *dst_var; + + src_sec = &obj->secs[i]; + if (!src_sec->sec_type_id || src_sec->skipped) + continue; + dst_sec = &linker->secs[src_sec->dst_id]; + + /* Mark section as having BTF regardless of the presence of + * variables. In some cases compiler might generate empty BTF + * with no variables information. E.g., when promoting local + * array/structure variable initial values and BPF object + * file otherwise has no read-only static variables in + * .rodata. We need to preserve such empty BTF and just set + * correct section size. 
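+		 * The DATASEC type itself is not copied here; it is
+		 * re-created with the final section size in finalize_btf().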
+ */ + dst_sec->has_btf = true; + + t = btf__type_by_id(obj->btf, src_sec->sec_type_id); + src_var = btf_var_secinfos(t); + n = btf_vlen(t); + for (j = 0; j < n; j++, src_var++) { + void *sec_vars = dst_sec->sec_vars; + + sec_vars = libbpf_reallocarray(sec_vars, + dst_sec->sec_var_cnt + 1, + sizeof(*dst_sec->sec_vars)); + if (!sec_vars) + return -ENOMEM; + + dst_sec->sec_vars = sec_vars; + dst_sec->sec_var_cnt++; + + dst_var = &dst_sec->sec_vars[dst_sec->sec_var_cnt - 1]; + dst_var->type = obj->btf_type_map[src_var->type]; + dst_var->size = src_var->size; + dst_var->offset = src_sec->dst_off + src_var->offset; + } + } + + return 0; +} + +static void *add_btf_ext_rec(struct btf_ext_sec_data *ext_data, const void *src_rec) +{ + void *tmp; + + tmp = libbpf_reallocarray(ext_data->recs, ext_data->rec_cnt + 1, ext_data->rec_sz); + if (!tmp) + return NULL; + ext_data->recs = tmp; + + tmp += ext_data->rec_cnt * ext_data->rec_sz; + memcpy(tmp, src_rec, ext_data->rec_sz); + + ext_data->rec_cnt++; + + return tmp; +} + +static int linker_append_btf_ext(struct bpf_linker *linker, struct src_obj *obj) +{ + const struct btf_ext_info_sec *ext_sec; + const char *sec_name, *s; + struct src_sec *src_sec; + struct dst_sec *dst_sec; + int rec_sz, str_off, i; + + if (!obj->btf_ext) + return 0; + + rec_sz = obj->btf_ext->func_info.rec_size; + for_each_btf_ext_sec(&obj->btf_ext->func_info, ext_sec) { + struct bpf_func_info_min *src_rec, *dst_rec; + + sec_name = btf__name_by_offset(obj->btf, ext_sec->sec_name_off); + src_sec = find_src_sec_by_name(obj, sec_name); + if (!src_sec) { + pr_warn("can't find section '%s' referenced from .BTF.ext\n", sec_name); + return -EINVAL; + } + dst_sec = &linker->secs[src_sec->dst_id]; + + if (dst_sec->func_info.rec_sz == 0) + dst_sec->func_info.rec_sz = rec_sz; + if (dst_sec->func_info.rec_sz != rec_sz) { + pr_warn("incompatible .BTF.ext record sizes for section '%s'\n", sec_name); + return -EINVAL; + } + + for_each_btf_ext_rec(&obj->btf_ext->func_info, ext_sec, i, src_rec) { + dst_rec = add_btf_ext_rec(&dst_sec->func_info, src_rec); + if (!dst_rec) + return -ENOMEM; + + dst_rec->insn_off += src_sec->dst_off; + dst_rec->type_id = obj->btf_type_map[dst_rec->type_id]; + } + } + + rec_sz = obj->btf_ext->line_info.rec_size; + for_each_btf_ext_sec(&obj->btf_ext->line_info, ext_sec) { + struct bpf_line_info_min *src_rec, *dst_rec; + + sec_name = btf__name_by_offset(obj->btf, ext_sec->sec_name_off); + src_sec = find_src_sec_by_name(obj, sec_name); + if (!src_sec) { + pr_warn("can't find section '%s' referenced from .BTF.ext\n", sec_name); + return -EINVAL; + } + dst_sec = &linker->secs[src_sec->dst_id]; + + if (dst_sec->line_info.rec_sz == 0) + dst_sec->line_info.rec_sz = rec_sz; + if (dst_sec->line_info.rec_sz != rec_sz) { + pr_warn("incompatible .BTF.ext record sizes for section '%s'\n", sec_name); + return -EINVAL; + } + + for_each_btf_ext_rec(&obj->btf_ext->line_info, ext_sec, i, src_rec) { + dst_rec = add_btf_ext_rec(&dst_sec->line_info, src_rec); + if (!dst_rec) + return -ENOMEM; + + dst_rec->insn_off += src_sec->dst_off; + + s = btf__str_by_offset(obj->btf, src_rec->file_name_off); + str_off = btf__add_str(linker->btf, s); + if (str_off < 0) + return -ENOMEM; + dst_rec->file_name_off = str_off; + + s = btf__str_by_offset(obj->btf, src_rec->line_off); + str_off = btf__add_str(linker->btf, s); + if (str_off < 0) + return -ENOMEM; + dst_rec->line_off = str_off; + + /* dst_rec->line_col is fine */ + } + } + + rec_sz = obj->btf_ext->core_relo_info.rec_size; + 
for_each_btf_ext_sec(&obj->btf_ext->core_relo_info, ext_sec) { + struct bpf_core_relo *src_rec, *dst_rec; + + sec_name = btf__name_by_offset(obj->btf, ext_sec->sec_name_off); + src_sec = find_src_sec_by_name(obj, sec_name); + if (!src_sec) { + pr_warn("can't find section '%s' referenced from .BTF.ext\n", sec_name); + return -EINVAL; + } + dst_sec = &linker->secs[src_sec->dst_id]; + + if (dst_sec->core_relo_info.rec_sz == 0) + dst_sec->core_relo_info.rec_sz = rec_sz; + if (dst_sec->core_relo_info.rec_sz != rec_sz) { + pr_warn("incompatible .BTF.ext record sizes for section '%s'\n", sec_name); + return -EINVAL; + } + + for_each_btf_ext_rec(&obj->btf_ext->core_relo_info, ext_sec, i, src_rec) { + dst_rec = add_btf_ext_rec(&dst_sec->core_relo_info, src_rec); + if (!dst_rec) + return -ENOMEM; + + dst_rec->insn_off += src_sec->dst_off; + dst_rec->type_id = obj->btf_type_map[dst_rec->type_id]; + + s = btf__str_by_offset(obj->btf, src_rec->access_str_off); + str_off = btf__add_str(linker->btf, s); + if (str_off < 0) + return -ENOMEM; + dst_rec->access_str_off = str_off; + + /* dst_rec->kind is fine */ + } + } + + return 0; +} + +int bpf_linker__finalize(struct bpf_linker *linker) +{ + struct dst_sec *sec; + size_t strs_sz; + const void *strs; + int err, i; + + if (!linker->elf) + return -EINVAL; + + err = finalize_btf(linker); + if (err) + return err; + + /* Finalize strings */ + strs_sz = strset__data_size(linker->strtab_strs); + strs = strset__data(linker->strtab_strs); + + sec = &linker->secs[linker->strtab_sec_idx]; + sec->data->d_align = 1; + sec->data->d_off = 0LL; + sec->data->d_buf = (void *)strs; + sec->data->d_type = ELF_T_BYTE; + sec->data->d_size = strs_sz; + sec->shdr->sh_size = strs_sz; + + for (i = 1; i < linker->sec_cnt; i++) { + sec = &linker->secs[i]; + + /* STRTAB is handled specially above */ + if (sec->sec_idx == linker->strtab_sec_idx) + continue; + + /* special ephemeral sections (.ksyms, .kconfig, etc) */ + if (!sec->scn) + continue; + + sec->data->d_buf = sec->raw_data; + } + + /* Finalize ELF layout */ + if (elf_update(linker->elf, ELF_C_NULL) < 0) { + err = -errno; + pr_warn_elf("failed to finalize ELF layout"); + return err; + } + + /* Write out final ELF contents */ + if (elf_update(linker->elf, ELF_C_WRITE) < 0) { + err = -errno; + pr_warn_elf("failed to write ELF contents"); + return err; + } + + elf_end(linker->elf); + close(linker->fd); + + linker->elf = NULL; + linker->fd = -1; + + return 0; +} + +static int emit_elf_data_sec(struct bpf_linker *linker, const char *sec_name, + size_t align, const void *raw_data, size_t raw_sz) +{ + Elf_Scn *scn; + Elf_Data *data; + Elf64_Shdr *shdr; + int name_off; + + name_off = strset__add_str(linker->strtab_strs, sec_name); + if (name_off < 0) + return name_off; + + scn = elf_newscn(linker->elf); + if (!scn) + return -ENOMEM; + data = elf_newdata(scn); + if (!data) + return -ENOMEM; + shdr = elf64_getshdr(scn); + if (!shdr) + return -EINVAL; + + shdr->sh_name = name_off; + shdr->sh_type = SHT_PROGBITS; + shdr->sh_flags = 0; + shdr->sh_size = raw_sz; + shdr->sh_link = 0; + shdr->sh_info = 0; + shdr->sh_addralign = align; + shdr->sh_entsize = 0; + + data->d_type = ELF_T_BYTE; + data->d_size = raw_sz; + data->d_buf = (void *)raw_data; + data->d_align = align; + data->d_off = 0; + + return 0; +} + +static int finalize_btf(struct bpf_linker *linker) +{ + struct btf *btf = linker->btf; + const void *raw_data; + int i, j, id, err; + __u32 raw_sz; + + /* bail out if no BTF data was produced */ + if (btf__get_nr_types(linker->btf) == 0) + 
return 0; + + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + if (!sec->has_btf) + continue; + + id = btf__add_datasec(btf, sec->sec_name, sec->sec_sz); + if (id < 0) { + pr_warn("failed to add consolidated BTF type for datasec '%s': %d\n", + sec->sec_name, id); + return id; + } + + for (j = 0; j < sec->sec_var_cnt; j++) { + struct btf_var_secinfo *vi = &sec->sec_vars[j]; + + if (btf__add_datasec_var_info(btf, vi->type, vi->offset, vi->size)) + return -EINVAL; + } + } + + err = finalize_btf_ext(linker); + if (err) { + pr_warn(".BTF.ext generation failed: %d\n", err); + return err; + } + + err = btf__dedup(linker->btf, linker->btf_ext, NULL); + if (err) { + pr_warn("BTF dedup failed: %d\n", err); + return err; + } + + /* Emit .BTF section */ + raw_data = btf__get_raw_data(linker->btf, &raw_sz); + if (!raw_data) + return -ENOMEM; + + err = emit_elf_data_sec(linker, BTF_ELF_SEC, 8, raw_data, raw_sz); + if (err) { + pr_warn("failed to write out .BTF ELF section: %d\n", err); + return err; + } + + /* Emit .BTF.ext section */ + if (linker->btf_ext) { + raw_data = btf_ext__get_raw_data(linker->btf_ext, &raw_sz); + if (!raw_data) + return -ENOMEM; + + err = emit_elf_data_sec(linker, BTF_EXT_ELF_SEC, 8, raw_data, raw_sz); + if (err) { + pr_warn("failed to write out .BTF.ext ELF section: %d\n", err); + return err; + } + } + + return 0; +} + +static int emit_btf_ext_data(struct bpf_linker *linker, void *output, + const char *sec_name, struct btf_ext_sec_data *sec_data) +{ + struct btf_ext_info_sec *sec_info; + void *cur = output; + int str_off; + size_t sz; + + if (!sec_data->rec_cnt) + return 0; + + str_off = btf__add_str(linker->btf, sec_name); + if (str_off < 0) + return -ENOMEM; + + sec_info = cur; + sec_info->sec_name_off = str_off; + sec_info->num_info = sec_data->rec_cnt; + cur += sizeof(struct btf_ext_info_sec); + + sz = sec_data->rec_cnt * sec_data->rec_sz; + memcpy(cur, sec_data->recs, sz); + cur += sz; + + return cur - output; +} + +static int finalize_btf_ext(struct bpf_linker *linker) +{ + size_t funcs_sz = 0, lines_sz = 0, core_relos_sz = 0, total_sz = 0; + size_t func_rec_sz = 0, line_rec_sz = 0, core_relo_rec_sz = 0; + struct btf_ext_header *hdr; + void *data, *cur; + int i, err, sz; + + /* validate that all sections have the same .BTF.ext record sizes + * and calculate total data size for each type of data (func info, + * line info, core relos) + */ + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + if (sec->func_info.rec_cnt) { + if (func_rec_sz == 0) + func_rec_sz = sec->func_info.rec_sz; + if (func_rec_sz != sec->func_info.rec_sz) { + pr_warn("mismatch in func_info record size %zu != %u\n", + func_rec_sz, sec->func_info.rec_sz); + return -EINVAL; + } + + funcs_sz += sizeof(struct btf_ext_info_sec) + func_rec_sz * sec->func_info.rec_cnt; + } + if (sec->line_info.rec_cnt) { + if (line_rec_sz == 0) + line_rec_sz = sec->line_info.rec_sz; + if (line_rec_sz != sec->line_info.rec_sz) { + pr_warn("mismatch in line_info record size %zu != %u\n", + line_rec_sz, sec->line_info.rec_sz); + return -EINVAL; + } + + lines_sz += sizeof(struct btf_ext_info_sec) + line_rec_sz * sec->line_info.rec_cnt; + } + if (sec->core_relo_info.rec_cnt) { + if (core_relo_rec_sz == 0) + core_relo_rec_sz = sec->core_relo_info.rec_sz; + if (core_relo_rec_sz != sec->core_relo_info.rec_sz) { + pr_warn("mismatch in core_relo_info record size %zu != %u\n", + core_relo_rec_sz, sec->core_relo_info.rec_sz); + return -EINVAL; + } + + core_relos_sz += 
sizeof(struct btf_ext_info_sec) + core_relo_rec_sz * sec->core_relo_info.rec_cnt; + } + } + + if (!funcs_sz && !lines_sz && !core_relos_sz) + return 0; + + total_sz += sizeof(struct btf_ext_header); + if (funcs_sz) { + funcs_sz += sizeof(__u32); /* record size prefix */ + total_sz += funcs_sz; + } + if (lines_sz) { + lines_sz += sizeof(__u32); /* record size prefix */ + total_sz += lines_sz; + } + if (core_relos_sz) { + core_relos_sz += sizeof(__u32); /* record size prefix */ + total_sz += core_relos_sz; + } + + cur = data = calloc(1, total_sz); + if (!data) + return -ENOMEM; + + hdr = cur; + hdr->magic = BTF_MAGIC; + hdr->version = BTF_VERSION; + hdr->flags = 0; + hdr->hdr_len = sizeof(struct btf_ext_header); + cur += sizeof(struct btf_ext_header); + + /* All offsets are in bytes relative to the end of this header */ + hdr->func_info_off = 0; + hdr->func_info_len = funcs_sz; + hdr->line_info_off = funcs_sz; + hdr->line_info_len = lines_sz; + hdr->core_relo_off = funcs_sz + lines_sz; + hdr->core_relo_len = core_relos_sz; + + if (funcs_sz) { + *(__u32 *)cur = func_rec_sz; + cur += sizeof(__u32); + + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + sz = emit_btf_ext_data(linker, cur, sec->sec_name, &sec->func_info); + if (sz < 0) { + err = sz; + goto out; + } + + cur += sz; + } + } + + if (lines_sz) { + *(__u32 *)cur = line_rec_sz; + cur += sizeof(__u32); + + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + sz = emit_btf_ext_data(linker, cur, sec->sec_name, &sec->line_info); + if (sz < 0) { + err = sz; + goto out; + } + + cur += sz; + } + } + + if (core_relos_sz) { + *(__u32 *)cur = core_relo_rec_sz; + cur += sizeof(__u32); + + for (i = 1; i < linker->sec_cnt; i++) { + struct dst_sec *sec = &linker->secs[i]; + + sz = emit_btf_ext_data(linker, cur, sec->sec_name, &sec->core_relo_info); + if (sz < 0) { + err = sz; + goto out; + } + + cur += sz; + } + } + + linker->btf_ext = btf_ext__new(data, total_sz); + err = libbpf_get_error(linker->btf_ext); + if (err) { + linker->btf_ext = NULL; + pr_warn("failed to parse final .BTF.ext data: %d\n", err); + goto out; + } + +out: + free(data); + return err; +} diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c index 4dd73de00b6f..d2cb28e9ef52 100644 --- a/tools/lib/bpf/netlink.c +++ b/tools/lib/bpf/netlink.c @@ -40,7 +40,7 @@ static int libbpf_netlink_open(__u32 *nl_pid) memset(&sa, 0, sizeof(sa)); sa.nl_family = AF_NETLINK; - sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + sock = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE); if (sock < 0) return -errno; diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c index 8caaafe7e312..e7a8d847161f 100644 --- a/tools/lib/bpf/ringbuf.c +++ b/tools/lib/bpf/ringbuf.c @@ -227,7 +227,7 @@ static int ringbuf_process_ring(struct ring* r) if ((len & BPF_RINGBUF_DISCARD_BIT) == 0) { sample = (void *)len_ptr + BPF_RINGBUF_HDR_SZ; err = r->sample_cb(r->ctx, sample, len); - if (err) { + if (err < 0) { /* update consumer pos and bail out */ smp_store_release(r->consumer_pos, cons_pos); diff --git a/tools/lib/bpf/strset.c b/tools/lib/bpf/strset.c new file mode 100644 index 000000000000..1fb8b49de1d6 --- /dev/null +++ b/tools/lib/bpf/strset.c @@ -0,0 +1,176 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2021 Facebook */ +#include <stdint.h> +#include <stdlib.h> +#include <stdio.h> +#include <errno.h> +#include <linux/err.h> +#include "hashmap.h" +#include "libbpf_internal.h" +#include 
"strset.h" + +struct strset { + void *strs_data; + size_t strs_data_len; + size_t strs_data_cap; + size_t strs_data_max_len; + + /* lookup index for each unique string in strings set */ + struct hashmap *strs_hash; +}; + +static size_t strset_hash_fn(const void *key, void *ctx) +{ + const struct strset *s = ctx; + const char *str = s->strs_data + (long)key; + + return str_hash(str); +} + +static bool strset_equal_fn(const void *key1, const void *key2, void *ctx) +{ + const struct strset *s = ctx; + const char *str1 = s->strs_data + (long)key1; + const char *str2 = s->strs_data + (long)key2; + + return strcmp(str1, str2) == 0; +} + +struct strset *strset__new(size_t max_data_sz, const char *init_data, size_t init_data_sz) +{ + struct strset *set = calloc(1, sizeof(*set)); + struct hashmap *hash; + int err = -ENOMEM; + + if (!set) + return ERR_PTR(-ENOMEM); + + hash = hashmap__new(strset_hash_fn, strset_equal_fn, set); + if (IS_ERR(hash)) + goto err_out; + + set->strs_data_max_len = max_data_sz; + set->strs_hash = hash; + + if (init_data) { + long off; + + set->strs_data = malloc(init_data_sz); + if (!set->strs_data) + goto err_out; + + memcpy(set->strs_data, init_data, init_data_sz); + set->strs_data_len = init_data_sz; + set->strs_data_cap = init_data_sz; + + for (off = 0; off < set->strs_data_len; off += strlen(set->strs_data + off) + 1) { + /* hashmap__add() returns EEXIST if string with the same + * content already is in the hash map + */ + err = hashmap__add(hash, (void *)off, (void *)off); + if (err == -EEXIST) + continue; /* duplicate */ + if (err) + goto err_out; + } + } + + return set; +err_out: + strset__free(set); + return ERR_PTR(err); +} + +void strset__free(struct strset *set) +{ + if (IS_ERR_OR_NULL(set)) + return; + + hashmap__free(set->strs_hash); + free(set->strs_data); +} + +size_t strset__data_size(const struct strset *set) +{ + return set->strs_data_len; +} + +const char *strset__data(const struct strset *set) +{ + return set->strs_data; +} + +static void *strset_add_str_mem(struct strset *set, size_t add_sz) +{ + return libbpf_add_mem(&set->strs_data, &set->strs_data_cap, 1, + set->strs_data_len, set->strs_data_max_len, add_sz); +} + +/* Find string offset that corresponds to a given string *s*. + * Returns: + * - >0 offset into string data, if string is found; + * - -ENOENT, if string is not in the string data; + * - <0, on any other error. + */ +int strset__find_str(struct strset *set, const char *s) +{ + long old_off, new_off, len; + void *p; + + /* see strset__add_str() for why we do this */ + len = strlen(s) + 1; + p = strset_add_str_mem(set, len); + if (!p) + return -ENOMEM; + + new_off = set->strs_data_len; + memcpy(p, s, len); + + if (hashmap__find(set->strs_hash, (void *)new_off, (void **)&old_off)) + return old_off; + + return -ENOENT; +} + +/* Add a string s to the string data. If the string already exists, return its + * offset within string data. + * Returns: + * - > 0 offset into string data, on success; + * - < 0, on error. + */ +int strset__add_str(struct strset *set, const char *s) +{ + long old_off, new_off, len; + void *p; + int err; + + /* Hashmap keys are always offsets within set->strs_data, so to even + * look up some string from the "outside", we need to first append it + * at the end, so that it can be addressed with an offset. Luckily, + * until set->strs_data_len is incremented, that string is just a piece + * of garbage for the rest of the code, so no harm, no foul. 
On the + * other hand, if the string is unique, it's already appended and + * ready to be used, only a simple set->strs_data_len increment away. + */ + len = strlen(s) + 1; + p = strset_add_str_mem(set, len); + if (!p) + return -ENOMEM; + + new_off = set->strs_data_len; + memcpy(p, s, len); + + /* Now attempt to add the string, but only if the string with the same + * contents doesn't exist already (HASHMAP_ADD strategy). If such + * string exists, we'll get its offset in old_off (that's old_key). + */ + err = hashmap__insert(set->strs_hash, (void *)new_off, (void *)new_off, + HASHMAP_ADD, (const void **)&old_off, NULL); + if (err == -EEXIST) + return old_off; /* duplicated string, return existing offset */ + if (err) + return err; + + set->strs_data_len += len; /* new unique string, adjust data length */ + return new_off; +} diff --git a/tools/lib/bpf/strset.h b/tools/lib/bpf/strset.h new file mode 100644 index 000000000000..b6ddf77a83c2 --- /dev/null +++ b/tools/lib/bpf/strset.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +/* Copyright (c) 2021 Facebook */ +#ifndef __LIBBPF_STRSET_H +#define __LIBBPF_STRSET_H + +#include <stdbool.h> +#include <stddef.h> + +struct strset; + +struct strset *strset__new(size_t max_data_sz, const char *init_data, size_t init_data_sz); +void strset__free(struct strset *set); + +const char *strset__data(const struct strset *set); +size_t strset__data_size(const struct strset *set); + +int strset__find_str(struct strset *set, const char *s); +int strset__add_str(struct strset *set, const char *s); + +#endif /* __LIBBPF_STRSET_H */ diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c index 526fc35c0b23..cea62cc3e456 100644 --- a/tools/lib/bpf/xsk.c +++ b/tools/lib/bpf/xsk.c @@ -28,6 +28,7 @@ #include <sys/mman.h> #include <sys/socket.h> #include <sys/types.h> +#include <linux/if_link.h> #include "bpf.h" #include "libbpf.h" @@ -59,6 +60,8 @@ struct xsk_umem { int fd; int refcount; struct list_head ctx_list; + bool rx_ring_setup_done; + bool tx_ring_setup_done; }; struct xsk_ctx { @@ -70,8 +73,10 @@ struct xsk_ctx { int ifindex; struct list_head list; int prog_fd; + int link_fd; int xsks_map_fd; char ifname[IFNAMSIZ]; + bool has_bpf_link; }; struct xsk_socket { @@ -409,7 +414,7 @@ static int xsk_load_xdp_prog(struct xsk_socket *xsk) static const int log_buf_size = 16 * 1024; struct xsk_ctx *ctx = xsk->ctx; char log_buf[log_buf_size]; - int err, prog_fd; + int prog_fd; /* This is the fallback C-program: * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx) @@ -499,14 +504,41 @@ static int xsk_load_xdp_prog(struct xsk_socket *xsk) return prog_fd; } - err = bpf_set_link_xdp_fd(xsk->ctx->ifindex, prog_fd, - xsk->config.xdp_flags); + ctx->prog_fd = prog_fd; + return 0; +} + +static int xsk_create_bpf_link(struct xsk_socket *xsk) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts); + struct xsk_ctx *ctx = xsk->ctx; + __u32 prog_id = 0; + int link_fd; + int err; + + err = bpf_get_link_xdp_id(ctx->ifindex, &prog_id, xsk->config.xdp_flags); if (err) { - close(prog_fd); + pr_warn("getting XDP prog id failed\n"); return err; } - ctx->prog_fd = prog_fd; + /* if there's a netlink-based XDP prog loaded on interface, bail out + * and ask user to do the removal by himself + */ + if (prog_id) { + pr_warn("Netlink-based XDP prog detected, please unload it in order to launch AF_XDP prog\n"); + return -EINVAL; + } + + opts.flags = xsk->config.xdp_flags & ~(XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_REPLACE); + + link_fd = 
bpf_link_create(ctx->prog_fd, ctx->ifindex, BPF_XDP, &opts); + if (link_fd < 0) { + pr_warn("bpf_link_create failed: %s\n", strerror(errno)); + return link_fd; + } + + ctx->link_fd = link_fd; return 0; } @@ -625,7 +657,6 @@ static int xsk_lookup_bpf_maps(struct xsk_socket *xsk) close(fd); } - err = 0; if (ctx->xsks_map_fd == -1) err = -ENOENT; @@ -642,6 +673,98 @@ static int xsk_set_bpf_maps(struct xsk_socket *xsk) &xsk->fd, 0); } +static int xsk_link_lookup(int ifindex, __u32 *prog_id, int *link_fd) +{ + struct bpf_link_info link_info; + __u32 link_len; + __u32 id = 0; + int err; + int fd; + + while (true) { + err = bpf_link_get_next_id(id, &id); + if (err) { + if (errno == ENOENT) { + err = 0; + break; + } + pr_warn("can't get next link: %s\n", strerror(errno)); + break; + } + + fd = bpf_link_get_fd_by_id(id); + if (fd < 0) { + if (errno == ENOENT) + continue; + pr_warn("can't get link by id (%u): %s\n", id, strerror(errno)); + err = -errno; + break; + } + + link_len = sizeof(struct bpf_link_info); + memset(&link_info, 0, link_len); + err = bpf_obj_get_info_by_fd(fd, &link_info, &link_len); + if (err) { + pr_warn("can't get link info: %s\n", strerror(errno)); + close(fd); + break; + } + if (link_info.type == BPF_LINK_TYPE_XDP) { + if (link_info.xdp.ifindex == ifindex) { + *link_fd = fd; + if (prog_id) + *prog_id = link_info.prog_id; + break; + } + } + close(fd); + } + + return err; +} + +static bool xsk_probe_bpf_link(void) +{ + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts, + .flags = XDP_FLAGS_SKB_MODE); + struct bpf_load_program_attr prog_attr; + struct bpf_insn insns[2] = { + BPF_MOV64_IMM(BPF_REG_0, XDP_PASS), + BPF_EXIT_INSN() + }; + int prog_fd, link_fd = -1; + int ifindex_lo = 1; + bool ret = false; + int err; + + err = xsk_link_lookup(ifindex_lo, NULL, &link_fd); + if (err) + return ret; + + if (link_fd >= 0) + return true; + + memset(&prog_attr, 0, sizeof(prog_attr)); + prog_attr.prog_type = BPF_PROG_TYPE_XDP; + prog_attr.insns = insns; + prog_attr.insns_cnt = ARRAY_SIZE(insns); + prog_attr.license = "GPL"; + + prog_fd = bpf_load_program_xattr(&prog_attr, NULL, 0); + if (prog_fd < 0) + return ret; + + link_fd = bpf_link_create(prog_fd, ifindex_lo, BPF_XDP, &opts); + close(prog_fd); + + if (link_fd >= 0) { + ret = true; + close(link_fd); + } + + return ret; +} + static int xsk_create_xsk_struct(int ifindex, struct xsk_socket *xsk) { char ifname[IFNAMSIZ]; @@ -663,64 +786,108 @@ static int xsk_create_xsk_struct(int ifindex, struct xsk_socket *xsk) ctx->ifname[IFNAMSIZ - 1] = 0; xsk->ctx = ctx; + xsk->ctx->has_bpf_link = xsk_probe_bpf_link(); return 0; } -static int __xsk_setup_xdp_prog(struct xsk_socket *_xdp, - int *xsks_map_fd) +static int xsk_init_xdp_res(struct xsk_socket *xsk, + int *xsks_map_fd) { - struct xsk_socket *xsk = _xdp; struct xsk_ctx *ctx = xsk->ctx; - __u32 prog_id = 0; int err; - err = bpf_get_link_xdp_id(ctx->ifindex, &prog_id, - xsk->config.xdp_flags); + err = xsk_create_bpf_maps(xsk); if (err) return err; - if (!prog_id) { - err = xsk_create_bpf_maps(xsk); - if (err) - return err; + err = xsk_load_xdp_prog(xsk); + if (err) + goto err_load_xdp_prog; - err = xsk_load_xdp_prog(xsk); - if (err) { - goto err_load_xdp_prog; - } - } else { - ctx->prog_fd = bpf_prog_get_fd_by_id(prog_id); - if (ctx->prog_fd < 0) - return -errno; - err = xsk_lookup_bpf_maps(xsk); - if (err) { - close(ctx->prog_fd); - return err; - } - } + if (ctx->has_bpf_link) + err = xsk_create_bpf_link(xsk); + else + err = bpf_set_link_xdp_fd(xsk->ctx->ifindex, ctx->prog_fd, + 
xsk->config.xdp_flags); - if (xsk->rx) { - err = xsk_set_bpf_maps(xsk); - if (err) { - if (!prog_id) { - goto err_set_bpf_maps; - } else { - close(ctx->prog_fd); - return err; - } - } - } - if (xsks_map_fd) - *xsks_map_fd = ctx->xsks_map_fd; + if (err) + goto err_attach_xdp_prog; - return 0; + if (!xsk->rx) + return err; + + err = xsk_set_bpf_maps(xsk); + if (err) + goto err_set_bpf_maps; + + return err; err_set_bpf_maps: + if (ctx->has_bpf_link) + close(ctx->link_fd); + else + bpf_set_link_xdp_fd(ctx->ifindex, -1, 0); +err_attach_xdp_prog: close(ctx->prog_fd); - bpf_set_link_xdp_fd(ctx->ifindex, -1, 0); err_load_xdp_prog: xsk_delete_bpf_maps(xsk); + return err; +} + +static int xsk_lookup_xdp_res(struct xsk_socket *xsk, int *xsks_map_fd, int prog_id) +{ + struct xsk_ctx *ctx = xsk->ctx; + int err; + + ctx->prog_fd = bpf_prog_get_fd_by_id(prog_id); + if (ctx->prog_fd < 0) { + err = -errno; + goto err_prog_fd; + } + err = xsk_lookup_bpf_maps(xsk); + if (err) + goto err_lookup_maps; + + if (!xsk->rx) + return err; + + err = xsk_set_bpf_maps(xsk); + if (err) + goto err_set_maps; + + return err; + +err_set_maps: + close(ctx->xsks_map_fd); +err_lookup_maps: + close(ctx->prog_fd); +err_prog_fd: + if (ctx->has_bpf_link) + close(ctx->link_fd); + return err; +} + +static int __xsk_setup_xdp_prog(struct xsk_socket *_xdp, int *xsks_map_fd) +{ + struct xsk_socket *xsk = _xdp; + struct xsk_ctx *ctx = xsk->ctx; + __u32 prog_id = 0; + int err; + + if (ctx->has_bpf_link) + err = xsk_link_lookup(ctx->ifindex, &prog_id, &ctx->link_fd); + else + err = bpf_get_link_xdp_id(ctx->ifindex, &prog_id, xsk->config.xdp_flags); + + if (err) + return err; + + err = !prog_id ? xsk_init_xdp_res(xsk, xsks_map_fd) : + xsk_lookup_xdp_res(xsk, xsks_map_fd, prog_id); + + if (!err && xsks_map_fd) + *xsks_map_fd = ctx->xsks_map_fd; return err; } @@ -743,26 +910,30 @@ static struct xsk_ctx *xsk_get_ctx(struct xsk_umem *umem, int ifindex, return NULL; } -static void xsk_put_ctx(struct xsk_ctx *ctx) +static void xsk_put_ctx(struct xsk_ctx *ctx, bool unmap) { struct xsk_umem *umem = ctx->umem; struct xdp_mmap_offsets off; int err; - if (--ctx->refcount == 0) { - err = xsk_get_mmap_offsets(umem->fd, &off); - if (!err) { - munmap(ctx->fill->ring - off.fr.desc, - off.fr.desc + umem->config.fill_size * - sizeof(__u64)); - munmap(ctx->comp->ring - off.cr.desc, - off.cr.desc + umem->config.comp_size * - sizeof(__u64)); - } + if (--ctx->refcount) + return; - list_del(&ctx->list); - free(ctx); - } + if (!unmap) + goto out_free; + + err = xsk_get_mmap_offsets(umem->fd, &off); + if (err) + goto out_free; + + munmap(ctx->fill->ring - off.fr.desc, off.fr.desc + umem->config.fill_size * + sizeof(__u64)); + munmap(ctx->comp->ring - off.cr.desc, off.cr.desc + umem->config.comp_size * + sizeof(__u64)); + +out_free: + list_del(&ctx->list); + free(ctx); } static struct xsk_ctx *xsk_create_ctx(struct xsk_socket *xsk, @@ -797,8 +968,6 @@ static struct xsk_ctx *xsk_create_ctx(struct xsk_socket *xsk, memcpy(ctx->ifname, ifname, IFNAMSIZ - 1); ctx->ifname[IFNAMSIZ - 1] = '\0'; - umem->fill_save = NULL; - umem->comp_save = NULL; ctx->fill = fill; ctx->comp = comp; list_add(&ctx->list, &umem->ctx_list); @@ -854,6 +1023,8 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, struct xsk_socket *xsk; struct xsk_ctx *ctx; int err, ifindex; + bool unmap = umem->fill_save != fill; + bool rx_setup_done = false, tx_setup_done = false; if (!umem || !xsk_ptr || !(rx || tx)) return -EFAULT; @@ -881,6 +1052,8 @@ int xsk_socket__create_shared(struct xsk_socket 
**xsk_ptr, } } else { xsk->fd = umem->fd; + rx_setup_done = umem->rx_ring_setup_done; + tx_setup_done = umem->tx_ring_setup_done; } ctx = xsk_get_ctx(umem, ifindex, queue_id); @@ -898,8 +1071,9 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, } } xsk->ctx = ctx; + xsk->ctx->has_bpf_link = xsk_probe_bpf_link(); - if (rx) { + if (rx && !rx_setup_done) { err = setsockopt(xsk->fd, SOL_XDP, XDP_RX_RING, &xsk->config.rx_size, sizeof(xsk->config.rx_size)); @@ -907,8 +1081,10 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, err = -errno; goto out_put_ctx; } + if (xsk->fd == umem->fd) + umem->rx_ring_setup_done = true; } - if (tx) { + if (tx && !tx_setup_done) { err = setsockopt(xsk->fd, SOL_XDP, XDP_TX_RING, &xsk->config.tx_size, sizeof(xsk->config.tx_size)); @@ -916,6 +1092,8 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, err = -errno; goto out_put_ctx; } + if (xsk->fd == umem->fd) + umem->tx_ring_setup_done = true; } err = xsk_get_mmap_offsets(xsk->fd, &off); @@ -994,6 +1172,8 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr, } *xsk_ptr = xsk; + umem->fill_save = NULL; + umem->comp_save = NULL; return 0; out_mmap_tx: @@ -1005,7 +1185,7 @@ out_mmap_rx: munmap(rx_map, off.rx.desc + xsk->config.rx_size * sizeof(struct xdp_desc)); out_put_ctx: - xsk_put_ctx(ctx); + xsk_put_ctx(ctx, unmap); out_socket: if (--umem->refcount) close(xsk->fd); @@ -1019,6 +1199,9 @@ int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname, struct xsk_ring_cons *rx, struct xsk_ring_prod *tx, const struct xsk_socket_config *usr_config) { + if (!umem) + return -EFAULT; + return xsk_socket__create_shared(xsk_ptr, ifname, queue_id, umem, rx, tx, umem->fill_save, umem->comp_save, usr_config); @@ -1054,6 +1237,8 @@ void xsk_socket__delete(struct xsk_socket *xsk) if (ctx->prog_fd != -1) { xsk_delete_bpf_maps(xsk); close(ctx->prog_fd); + if (ctx->has_bpf_link) + close(ctx->link_fd); } err = xsk_get_mmap_offsets(xsk->fd, &off); @@ -1068,7 +1253,7 @@ void xsk_socket__delete(struct xsk_socket *xsk) } } - xsk_put_ctx(ctx); + xsk_put_ctx(ctx, true); umem->refcount--; /* Do not close an fd that also has an associated umem connected diff --git a/tools/lib/bpf/xsk.h b/tools/lib/bpf/xsk.h index a9fdea87b5cd..01c12dca9c10 100644 --- a/tools/lib/bpf/xsk.h +++ b/tools/lib/bpf/xsk.h @@ -3,7 +3,8 @@ /* * AF_XDP user-space access library. * - * Copyright(c) 2018 - 2019 Intel Corporation. + * Copyright (c) 2018 - 2019 Intel Corporation. + * Copyright (c) 2019 Facebook * * Author(s): Magnus Karlsson <magnus.karlsson@intel.com> */ @@ -13,15 +14,80 @@ #include <stdio.h> #include <stdint.h> +#include <stdbool.h> #include <linux/if_xdp.h> #include "libbpf.h" -#include "libbpf_util.h" #ifdef __cplusplus extern "C" { #endif +/* Load-Acquire Store-Release barriers used by the XDP socket + * library. The following macros should *NOT* be considered part of + * the xsk.h API, and are subject to change anytime. 
+ * + * LIBRARY INTERNAL + */ + +#define __XSK_READ_ONCE(x) (*(volatile typeof(x) *)&x) +#define __XSK_WRITE_ONCE(x, v) (*(volatile typeof(x) *)&x) = (v) + +#if defined(__i386__) || defined(__x86_64__) +# define libbpf_smp_store_release(p, v) \ + do { \ + asm volatile("" : : : "memory"); \ + __XSK_WRITE_ONCE(*p, v); \ + } while (0) +# define libbpf_smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1 = __XSK_READ_ONCE(*p); \ + asm volatile("" : : : "memory"); \ + ___p1; \ + }) +#elif defined(__aarch64__) +# define libbpf_smp_store_release(p, v) \ + asm volatile ("stlr %w1, %0" : "=Q" (*p) : "r" (v) : "memory") +# define libbpf_smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1; \ + asm volatile ("ldar %w0, %1" \ + : "=r" (___p1) : "Q" (*p) : "memory"); \ + ___p1; \ + }) +#elif defined(__riscv) +# define libbpf_smp_store_release(p, v) \ + do { \ + asm volatile ("fence rw,w" : : : "memory"); \ + __XSK_WRITE_ONCE(*p, v); \ + } while (0) +# define libbpf_smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1 = __XSK_READ_ONCE(*p); \ + asm volatile ("fence r,rw" : : : "memory"); \ + ___p1; \ + }) +#endif + +#ifndef libbpf_smp_store_release +#define libbpf_smp_store_release(p, v) \ + do { \ + __sync_synchronize(); \ + __XSK_WRITE_ONCE(*p, v); \ + } while (0) +#endif + +#ifndef libbpf_smp_load_acquire +#define libbpf_smp_load_acquire(p) \ + ({ \ + typeof(*p) ___p1 = __XSK_READ_ONCE(*p); \ + __sync_synchronize(); \ + ___p1; \ + }) +#endif + +/* LIBRARY INTERNAL -- END */ + /* Do not access these members directly. Use the functions below. */ #define DEFINE_XSK_RING(name) \ struct name { \ diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 068cdb41f76f..5e5388a38e2a 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2442,6 +2442,9 @@ static int handle_insn_ops(struct instruction *insn, struct insn_state *state) if (update_cfi_state(insn, &state->cfi, op)) return 1; + if (!insn->alt_group) + continue; + if (op->dest.type == OP_DEST_PUSHF) { if (!state->uaccess_stack) { state->uaccess_stack = 1; diff --git a/tools/perf/builtin-daemon.c b/tools/perf/builtin-daemon.c index ace8772a4f03..7c4a9d424a64 100644 --- a/tools/perf/builtin-daemon.c +++ b/tools/perf/builtin-daemon.c @@ -402,35 +402,42 @@ static pid_t handle_signalfd(struct daemon *daemon) int status; pid_t pid; + /* + * Take signal fd data as pure signal notification and check all + * the sessions state. The reason is that multiple signals can get + * coalesced in kernel and we can receive only single signal even + * if multiple SIGCHLD were generated. 
+ */ err = read(daemon->signal_fd, &si, sizeof(struct signalfd_siginfo)); - if (err != sizeof(struct signalfd_siginfo)) + if (err != sizeof(struct signalfd_siginfo)) { + pr_err("failed to read signal fd\n"); return -1; + } list_for_each_entry(session, &daemon->sessions, list) { + if (session->pid == -1) + continue; - if (session->pid != (int) si.ssi_pid) + pid = waitpid(session->pid, &status, WNOHANG); + if (pid <= 0) continue; - pid = waitpid(session->pid, &status, 0); - if (pid == session->pid) { - if (WIFEXITED(status)) { - pr_info("session '%s' exited, status=%d\n", - session->name, WEXITSTATUS(status)); - } else if (WIFSIGNALED(status)) { - pr_info("session '%s' killed (signal %d)\n", - session->name, WTERMSIG(status)); - } else if (WIFSTOPPED(status)) { - pr_info("session '%s' stopped (signal %d)\n", - session->name, WSTOPSIG(status)); - } else { - pr_info("session '%s' Unexpected status (0x%x)\n", - session->name, status); - } + if (WIFEXITED(status)) { + pr_info("session '%s' exited, status=%d\n", + session->name, WEXITSTATUS(status)); + } else if (WIFSIGNALED(status)) { + pr_info("session '%s' killed (signal %d)\n", + session->name, WTERMSIG(status)); + } else if (WIFSTOPPED(status)) { + pr_info("session '%s' stopped (signal %d)\n", + session->name, WSTOPSIG(status)); + } else { + pr_info("session '%s' Unexpected status (0x%x)\n", + session->name, status); } session->state = KILL; session->pid = -1; - return pid; } return 0; @@ -443,7 +450,6 @@ static int daemon_session__wait(struct daemon_session *session, struct daemon *d .fd = daemon->signal_fd, .events = POLLIN, }; - pid_t wpid = 0, pid = session->pid; time_t start; start = time(NULL); @@ -452,7 +458,7 @@ static int daemon_session__wait(struct daemon_session *session, struct daemon *d int err = poll(&pollfd, 1, 1000); if (err > 0) { - wpid = handle_signalfd(daemon); + handle_signalfd(daemon); } else if (err < 0) { perror("failed: poll\n"); return -1; @@ -460,7 +466,7 @@ static int daemon_session__wait(struct daemon_session *session, struct daemon *d if (start + secs < time(NULL)) return -1; - } while (wpid != pid); + } while (session->pid != -1); return 0; } @@ -902,7 +908,9 @@ static void daemon_session__kill(struct daemon_session *session, daemon_session__signal(session, SIGKILL); break; default: - break; + pr_err("failed to wait for session %s\n", + session->name); + return; } how++; @@ -955,7 +963,8 @@ static void daemon__kill(struct daemon *daemon) daemon__signal(daemon, SIGKILL); break; default: - break; + pr_err("failed to wait for sessions\n"); + return; } how++; @@ -1344,7 +1353,7 @@ out: close(sock_fd); if (conf_fd != -1) close(conf_fd); - if (conf_fd != -1) + if (signal_fd != -1) close(signal_fd); pr_info("daemon exited\n"); diff --git a/tools/perf/tests/bpf.c b/tools/perf/tests/bpf.c index f57e075b0ed2..c72adbd67386 100644 --- a/tools/perf/tests/bpf.c +++ b/tools/perf/tests/bpf.c @@ -86,7 +86,7 @@ static struct { .msg_load_fail = "check your vmlinux setting?", .target_func = &epoll_pwait_loop, .expect_result = (NR_ITERS + 1) / 2, - .pin = true, + .pin = true, }, #ifdef HAVE_BPF_PROLOGUE { @@ -99,13 +99,6 @@ static struct { .expect_result = (NR_ITERS + 1) / 4, }, #endif - { - .prog_id = LLVM_TESTCASE_BPF_RELOCATION, - .desc = "BPF relocation checker", - .name = "[bpf_relocation_test]", - .msg_compile_fail = "fix 'perf test LLVM' first", - .msg_load_fail = "libbpf error when dealing with relocation", - }, }; static int do_test(struct bpf_object *obj, int (*func)(void), diff --git a/tools/perf/tests/shell/daemon.sh 
b/tools/perf/tests/shell/daemon.sh index 5ad3ca8d681b..58984380b211 100755 --- a/tools/perf/tests/shell/daemon.sh +++ b/tools/perf/tests/shell/daemon.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # daemon operations # SPDX-License-Identifier: GPL-2.0 diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index 953f4afacd3b..5b6ccb90b397 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -298,10 +298,6 @@ static int auxtrace_queues__queue_buffer(struct auxtrace_queues *queues, queue->set = true; queue->tid = buffer->tid; queue->cpu = buffer->cpu; - } else if (buffer->cpu != queue->cpu || buffer->tid != queue->tid) { - pr_err("auxtrace queue conflict: cpu %d, tid %d vs cpu %d, tid %d\n", - queue->cpu, queue->tid, buffer->cpu, buffer->tid); - return -EINVAL; } buffer->buffer_nr = queues->next_buffer_nr++; diff --git a/tools/perf/util/bpf-event.c b/tools/perf/util/bpf-event.c index 57d58c81a5f8..cdecda1ddd36 100644 --- a/tools/perf/util/bpf-event.c +++ b/tools/perf/util/bpf-event.c @@ -196,25 +196,32 @@ static int perf_event__synthesize_one_bpf_prog(struct perf_session *session, } if (info_linear->info_len < offsetof(struct bpf_prog_info, prog_tags)) { + free(info_linear); pr_debug("%s: the kernel is too old, aborting\n", __func__); return -2; } info = &info_linear->info; + if (!info->jited_ksyms) { + free(info_linear); + return -1; + } /* number of ksyms, func_lengths, and tags should match */ sub_prog_cnt = info->nr_jited_ksyms; if (sub_prog_cnt != info->nr_prog_tags || - sub_prog_cnt != info->nr_jited_func_lens) + sub_prog_cnt != info->nr_jited_func_lens) { + free(info_linear); return -1; + } /* check BTF func info support */ if (info->btf_id && info->nr_func_info && info->func_info_rec_size) { /* btf func info number should be same as sub_prog_cnt */ if (sub_prog_cnt != info->nr_func_info) { pr_debug("%s: mismatch in BPF sub program count and BTF function info count, aborting\n", __func__); - err = -1; - goto out; + free(info_linear); + return -1; } if (btf__get_from_id(info->btf_id, &btf)) { pr_debug("%s: failed to get BTF of id %u, aborting\n", __func__, info->btf_id); diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 42c84adeb2fb..c0c0fab22cb8 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -356,6 +356,9 @@ __add_event(struct list_head *list, int *idx, struct perf_cpu_map *cpus = pmu ? perf_cpu_map__get(pmu->cpus) : cpu_list ? perf_cpu_map__new(cpu_list) : NULL; + if (pmu && attr->type == PERF_TYPE_RAW) + perf_pmu__warn_invalid_config(pmu, attr->config, name); + if (init_attr) event_attr_init(attr); diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 44ef28302fc7..46fd0f998484 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1812,3 +1812,36 @@ int perf_pmu__caps_parse(struct perf_pmu *pmu) return nr_caps; } + +void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config, + char *name) +{ + struct perf_pmu_format *format; + __u64 masks = 0, bits; + char buf[100]; + unsigned int i; + + list_for_each_entry(format, &pmu->format, list) { + if (format->value != PERF_PMU_FORMAT_VALUE_CONFIG) + continue; + + for_each_set_bit(i, format->bits, PERF_PMU_FORMAT_BITS) + masks |= 1ULL << i; + } + + /* + * Kernel doesn't export any valid format bits. 
+ */ + if (masks == 0) + return; + + bits = config & ~masks; + if (bits == 0) + return; + + bitmap_scnprintf((unsigned long *)&bits, sizeof(bits) * 8, buf, sizeof(buf)); + + pr_warning("WARNING: event '%s' not valid (bits %s of config " + "'%llx' not supported by kernel)!\n", + name ?: "N/A", buf, config); +} diff --git a/tools/perf/util/pmu.h b/tools/perf/util/pmu.h index 8164388478c6..160b0f561771 100644 --- a/tools/perf/util/pmu.h +++ b/tools/perf/util/pmu.h @@ -123,4 +123,7 @@ int perf_pmu__convert_scale(const char *scale, char **end, double *sval); int perf_pmu__caps_parse(struct perf_pmu *pmu); +void perf_pmu__warn_invalid_config(struct perf_pmu *pmu, __u64 config, + char *name); + #endif /* __PMU_H */ diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index b698046ec2db..dff178103ce5 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -424,7 +424,7 @@ int perf_event__synthesize_mmap_events(struct perf_tool *tool, while (!io.eof) { static const char anonstr[] = "//anon"; - size_t size; + size_t size, aligned_size; /* ensure null termination since stack will be reused. */ event->mmap2.filename[0] = '\0'; @@ -484,11 +484,12 @@ out: } size = strlen(event->mmap2.filename) + 1; - size = PERF_ALIGN(size, sizeof(u64)); + aligned_size = PERF_ALIGN(size, sizeof(u64)); event->mmap2.len -= event->mmap.start; event->mmap2.header.size = (sizeof(event->mmap2) - - (sizeof(event->mmap2.filename) - size)); - memset(event->mmap2.filename + size, 0, machine->id_hdr_size); + (sizeof(event->mmap2.filename) - aligned_size)); + memset(event->mmap2.filename + size, 0, machine->id_hdr_size + + (aligned_size - size)); event->mmap2.header.size += machine->id_hdr_size; event->mmap2.pid = tgid; event->mmap2.tid = pid; @@ -758,7 +759,7 @@ static int __event__synthesize_thread(union perf_event *comm_event, for (i = 0; i < n; i++) { char *end; pid_t _pid; - bool kernel_thread; + bool kernel_thread = false; _pid = strtol(dirent[i]->d_name, &end, 10); if (*end) diff --git a/tools/perf/util/vdso.c b/tools/perf/util/vdso.c index 3cc91ad048ea..43beb169631d 100644 --- a/tools/perf/util/vdso.c +++ b/tools/perf/util/vdso.c @@ -133,6 +133,8 @@ static struct dso *__machine__addnew_vdso(struct machine *machine, const char *s if (dso != NULL) { __dsos__add(&machine->dsos, dso); dso__set_long_name(dso, long_name, false); + /* Put dso here because __dsos_add already got it */ + dso__put(dso); } return dso; diff --git a/tools/testing/kunit/configs/broken_on_uml.config b/tools/testing/kunit/configs/broken_on_uml.config index a7f0603d33f6..690870043ac0 100644 --- a/tools/testing/kunit/configs/broken_on_uml.config +++ b/tools/testing/kunit/configs/broken_on_uml.config @@ -40,3 +40,5 @@ # CONFIG_RESET_BRCMSTB_RESCAL is not set # CONFIG_RESET_INTEL_GW is not set # CONFIG_ADI_AXI_ADC is not set +# CONFIG_DEBUG_PAGEALLOC is not set +# CONFIG_PAGE_POISONING is not set diff --git a/tools/testing/kunit/kunit_config.py b/tools/testing/kunit/kunit_config.py index 0b550cbd667d..1e2683dcc0e7 100644 --- a/tools/testing/kunit/kunit_config.py +++ b/tools/testing/kunit/kunit_config.py @@ -13,7 +13,7 @@ from typing import List, Set CONFIG_IS_NOT_SET_PATTERN = r'^# CONFIG_(\w+) is not set$' CONFIG_PATTERN = r'^CONFIG_(\w+)=(\S+|".*")$' -KconfigEntryBase = collections.namedtuple('KconfigEntry', ['name', 'value']) +KconfigEntryBase = collections.namedtuple('KconfigEntryBase', ['name', 'value']) class KconfigEntry(KconfigEntryBase): diff --git 
a/tools/testing/radix-tree/idr-test.c b/tools/testing/radix-tree/idr-test.c index 3b796dd5e577..ca24f6839d50 100644 --- a/tools/testing/radix-tree/idr-test.c +++ b/tools/testing/radix-tree/idr-test.c @@ -296,21 +296,34 @@ static void *idr_throbber(void *arg) return NULL; } +/* + * There are always either 1 or 2 objects in the IDR. If we find nothing, + * or we find something at an ID we didn't expect, that's a bug. + */ void idr_find_test_1(int anchor_id, int throbber_id) { pthread_t throbber; time_t start = time(NULL); - pthread_create(&throbber, NULL, idr_throbber, &throbber_id); - BUG_ON(idr_alloc(&find_idr, xa_mk_value(anchor_id), anchor_id, anchor_id + 1, GFP_KERNEL) != anchor_id); + pthread_create(&throbber, NULL, idr_throbber, &throbber_id); + + rcu_read_lock(); do { int id = 0; void *entry = idr_get_next(&find_idr, &id); - BUG_ON(entry != xa_mk_value(id)); + rcu_read_unlock(); + if ((id != anchor_id && id != throbber_id) || + entry != xa_mk_value(id)) { + printf("%s(%d, %d): %p at %d\n", __func__, anchor_id, + throbber_id, entry, id); + abort(); + } + rcu_read_lock(); } while (time(NULL) < start + 11); + rcu_read_unlock(); pthread_join(throbber, NULL); @@ -577,6 +590,7 @@ void ida_tests(void) int __weak main(void) { + rcu_register_thread(); radix_tree_init(); idr_checks(); ida_tests(); @@ -584,5 +598,6 @@ int __weak main(void) rcu_barrier(); if (nr_allocated) printf("nr_allocated = %d\n", nr_allocated); + rcu_unregister_thread(); return 0; } diff --git a/tools/testing/radix-tree/linux/compiler_types.h b/tools/testing/radix-tree/linux/compiler_types.h deleted file mode 100644 index e69de29bb2d1..000000000000 --- a/tools/testing/radix-tree/linux/compiler_types.h +++ /dev/null diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c index 9eae0fb5a67d..e00520cc6349 100644 --- a/tools/testing/radix-tree/multiorder.c +++ b/tools/testing/radix-tree/multiorder.c @@ -224,7 +224,9 @@ void multiorder_checks(void) int __weak main(void) { + rcu_register_thread(); radix_tree_init(); multiorder_checks(); + rcu_unregister_thread(); return 0; } diff --git a/tools/testing/radix-tree/xarray.c b/tools/testing/radix-tree/xarray.c index e61e43efe463..f20e12cbbfd4 100644 --- a/tools/testing/radix-tree/xarray.c +++ b/tools/testing/radix-tree/xarray.c @@ -25,11 +25,13 @@ void xarray_tests(void) int __weak main(void) { + rcu_register_thread(); radix_tree_init(); xarray_tests(); radix_tree_cpu_dead(1); rcu_barrier(); if (nr_allocated) printf("nr_allocated = %d\n", nr_allocated); + rcu_unregister_thread(); return 0; } diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c index b2282be6f938..612d3899614a 100644 --- a/tools/testing/selftests/arm64/fp/sve-ptrace.c +++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c @@ -332,5 +332,5 @@ int main(void) ksft_print_cnts(); - return 0; + return ret; } diff --git a/tools/testing/selftests/arm64/fp/sve-test.S b/tools/testing/selftests/arm64/fp/sve-test.S index 9210691aa998..e3e08d9c7020 100644 --- a/tools/testing/selftests/arm64/fp/sve-test.S +++ b/tools/testing/selftests/arm64/fp/sve-test.S @@ -284,16 +284,28 @@ endfunction // Set up test pattern in the FFR // x0: pid // x2: generation +// +// We need to generate a canonical FFR value, which consists of a number of +// low "1" bits, followed by a number of zeros. 
This gives us 17 unique values +// per 16 bits of FFR, so we create a 4 bit signature out of the PID and +// generation, and use that as the initial number of ones in the pattern. +// We fill the upper lanes of FFR with zeros. // Beware: corrupts P0. function setup_ffr mov x4, x30 - bl pattern + and w0, w0, #0x3 + bfi w0, w2, #2, #2 + mov w1, #1 + lsl w1, w1, w0 + sub w1, w1, #1 + ldr x0, =ffrref - ldr x1, =scratch - rdvl x2, #1 - lsr x2, x2, #3 - bl memcpy + strh w1, [x0], 2 + rdvl x1, #1 + lsr x1, x1, #3 + sub x1, x1, #2 + bl memclr mov x0, #0 ldr x1, =ffrref diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index c3999587bc23..6448c626498f 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -21,7 +21,7 @@ endif BPF_GCC ?= $(shell command -v bpf-gcc;) SAN_CFLAGS ?= -CFLAGS += -g -rdynamic -Wall -O2 $(GENFLAGS) $(SAN_CFLAGS) \ +CFLAGS += -g -Og -rdynamic -Wall $(GENFLAGS) $(SAN_CFLAGS) \ -I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ -I$(TOOLSINCDIR) -I$(APIDIR) -I$(OUTPUT) \ -Dbpf_prog_load=bpf_prog_test_load \ @@ -201,6 +201,7 @@ $(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \ $(HOST_BPFOBJ) | $(HOST_BUILD_DIR)/bpftool $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOLDIR) \ CC=$(HOSTCC) LD=$(HOSTLD) \ + EXTRA_CFLAGS='-g -Og' \ OUTPUT=$(HOST_BUILD_DIR)/bpftool/ \ prefix= DESTDIR=$(HOST_SCRATCH_DIR)/ install @@ -218,6 +219,7 @@ $(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ ../../../include/uapi/linux/bpf.h \ | $(INCLUDE_DIR) $(BUILD_DIR)/libbpf $(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(BUILD_DIR)/libbpf/ \ + EXTRA_CFLAGS='-g -Og' \ DESTDIR=$(SCRATCH_DIR) prefix= all install_headers ifneq ($(BPFOBJ),$(HOST_BPFOBJ)) @@ -225,11 +227,12 @@ $(HOST_BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \ ../../../include/uapi/linux/bpf.h \ | $(INCLUDE_DIR) $(HOST_BUILD_DIR)/libbpf $(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) \ - OUTPUT=$(HOST_BUILD_DIR)/libbpf/ CC=$(HOSTCC) LD=$(HOSTLD) \ + EXTRA_CFLAGS='-g -Og' \ + OUTPUT=$(HOST_BUILD_DIR)/libbpf/ CC=$(HOSTCC) LD=$(HOSTLD) \ DESTDIR=$(HOST_SCRATCH_DIR)/ prefix= all install_headers endif -$(INCLUDE_DIR)/vmlinux.h: $(VMLINUX_BTF) | $(BPFTOOL) $(INCLUDE_DIR) +$(INCLUDE_DIR)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) | $(INCLUDE_DIR) ifeq ($(VMLINUX_H),) $(call msg,GEN,,$@) $(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@ @@ -300,6 +303,10 @@ endef SKEL_BLACKLIST := btf__% test_pinning_invalid.c test_sk_assign.c +LINKED_SKELS := test_static_linked.skel.h + +test_static_linked.skel.h-deps := test_static_linked1.o test_static_linked2.o + # Set up extra TRUNNER_XXX "temporary" variables in the environment (relies on # $eval()) and pass control to DEFINE_TEST_RUNNER_RULES. 
# Parameters: @@ -320,6 +327,7 @@ TRUNNER_BPF_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.o, $$(TRUNNER_BPF_SRCS) TRUNNER_BPF_SKELS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.skel.h, \ $$(filter-out $(SKEL_BLACKLIST), \ $$(TRUNNER_BPF_SRCS))) +TRUNNER_BPF_SKELS_LINKED := $$(addprefix $$(TRUNNER_OUTPUT)/,$(LINKED_SKELS)) TEST_GEN_FILES += $$(TRUNNER_BPF_OBJS) # Evaluate rules now with extra TRUNNER_XXX variables above already defined @@ -352,11 +360,22 @@ $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.o: \ $$(call $(TRUNNER_BPF_BUILD_RULE),$$<,$$@, \ $(TRUNNER_BPF_CFLAGS)) -$(TRUNNER_BPF_SKELS): $(TRUNNER_OUTPUT)/%.skel.h: \ - $(TRUNNER_OUTPUT)/%.o \ - | $(BPFTOOL) $(TRUNNER_OUTPUT) +$(TRUNNER_BPF_SKELS): %.skel.h: %.o $(BPFTOOL) | $(TRUNNER_OUTPUT) + $$(call msg,GEN-SKEL,$(TRUNNER_BINARY),$$@) + $(Q)$$(BPFTOOL) gen object $$(<:.o=.linked1.o) $$< + $(Q)$$(BPFTOOL) gen object $$(<:.o=.linked2.o) $$(<:.o=.linked1.o) + $(Q)$$(BPFTOOL) gen object $$(<:.o=.linked3.o) $$(<:.o=.linked2.o) + $(Q)diff $$(<:.o=.linked2.o) $$(<:.o=.linked3.o) + $(Q)$$(BPFTOOL) gen skeleton $$(<:.o=.linked3.o) name $$(notdir $$(<:.o=)) > $$@ + +$(TRUNNER_BPF_SKELS_LINKED): $(TRUNNER_BPF_OBJS) $(BPFTOOL) | $(TRUNNER_OUTPUT) + $$(call msg,LINK-BPF,$(TRUNNER_BINARY),$$(@:.skel.h=.o)) + $(Q)$$(BPFTOOL) gen object $$(@:.skel.h=.linked1.o) $$(addprefix $(TRUNNER_OUTPUT)/,$$($$(@F)-deps)) + $(Q)$$(BPFTOOL) gen object $$(@:.skel.h=.linked2.o) $$(@:.skel.h=.linked1.o) + $(Q)$$(BPFTOOL) gen object $$(@:.skel.h=.linked3.o) $$(@:.skel.h=.linked2.o) + $(Q)diff $$(@:.skel.h=.linked2.o) $$(@:.skel.h=.linked3.o) $$(call msg,GEN-SKEL,$(TRUNNER_BINARY),$$@) - $(Q)$$(BPFTOOL) gen skeleton $$< > $$@ + $(Q)$$(BPFTOOL) gen skeleton $$(@:.skel.h=.linked3.o) name $$(notdir $$(@:.skel.h=)) > $$@ endif # ensure we set up tests.h header generation rule just once @@ -378,6 +397,7 @@ $(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \ $(TRUNNER_EXTRA_HDRS) \ $(TRUNNER_BPF_OBJS) \ $(TRUNNER_BPF_SKELS) \ + $(TRUNNER_BPF_SKELS_LINKED) \ $$(BPFOBJ) | $(TRUNNER_OUTPUT) $$(call msg,TEST-OBJ,$(TRUNNER_BINARY),$$@) $(Q)cd $$(@D) && $$(CC) -I. $$(CFLAGS) -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F) diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst index 3464161c8eea..65fe318d1e71 100644 --- a/tools/testing/selftests/bpf/README.rst +++ b/tools/testing/selftests/bpf/README.rst @@ -179,3 +179,17 @@ types, which was introduced in `Clang 13`__. The older Clang versions will either crash when compiling these tests, or generate an incorrect BTF. __ https://reviews.llvm.org/D83289 + +Kernel function call test and Clang version +=========================================== + +Some selftests (e.g. kfunc_call and bpf_tcp_ca) require a LLVM support +to generate extern function in BTF. It was introduced in `Clang 13`__. + +Without it, the error from compiling bpf selftests looks like: + +.. code-block:: console + + libbpf: failed to find BTF for extern 'tcp_slow_start' [25] section: -2 + +__ https://reviews.llvm.org/D93563 diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h index 91f0fac632f4..029589c008c9 100644 --- a/tools/testing/selftests/bpf/bpf_tcp_helpers.h +++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h @@ -187,16 +187,6 @@ struct tcp_congestion_ops { typeof(y) __y = (y); \ __x == 0 ? __y : ((__y == 0) ? 
__x : min(__x, __y)); }) -static __always_inline __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked) -{ - __u32 cwnd = min(tp->snd_cwnd + acked, tp->snd_ssthresh); - - acked -= cwnd - tp->snd_cwnd; - tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); - - return acked; -} - static __always_inline bool tcp_in_slow_start(const struct tcp_sock *tp) { return tp->snd_cwnd < tp->snd_ssthresh; @@ -213,22 +203,7 @@ static __always_inline bool tcp_is_cwnd_limited(const struct sock *sk) return !!BPF_CORE_READ_BITFIELD(tp, is_cwnd_limited); } -static __always_inline void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) -{ - /* If credits accumulated at a higher w, apply them gently now. */ - if (tp->snd_cwnd_cnt >= w) { - tp->snd_cwnd_cnt = 0; - tp->snd_cwnd++; - } - - tp->snd_cwnd_cnt += acked; - if (tp->snd_cwnd_cnt >= w) { - __u32 delta = tp->snd_cwnd_cnt / w; - - tp->snd_cwnd_cnt -= delta * w; - tp->snd_cwnd += delta; - } - tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp); -} +extern __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked) __ksym; +extern void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked) __ksym; #endif diff --git a/tools/testing/selftests/bpf/get_cgroup_id_user.c b/tools/testing/selftests/bpf/get_cgroup_id_user.c index b8d6aef99db4..99628e1a1e58 100644 --- a/tools/testing/selftests/bpf/get_cgroup_id_user.c +++ b/tools/testing/selftests/bpf/get_cgroup_id_user.c @@ -57,6 +57,10 @@ int main(int argc, char **argv) __u32 key = 0, pid; int exit_code = 1; char buf[256]; + const struct timespec req = { + .tv_sec = 1, + .tv_nsec = 0, + }; cgroup_fd = cgroup_setup_and_join(TEST_CGROUP); if (CHECK(cgroup_fd < 0, "cgroup_setup_and_join", "err %d errno %d\n", cgroup_fd, errno)) @@ -115,7 +119,7 @@ int main(int argc, char **argv) goto close_pmu; /* trigger some syscalls */ - sleep(1); + syscall(__NR_nanosleep, &req, NULL); err = bpf_map_lookup_elem(cgidmap_fd, &key, &kcgid); if (CHECK(err, "bpf_map_lookup_elem", "err %d errno %d\n", err, errno)) diff --git a/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c b/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c index f0a64d8ac59a..e42ea1195d18 100644 --- a/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c +++ b/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c @@ -55,7 +55,6 @@ void test_array_map_batch_ops(void) int map_fd, *keys, *values, *visited; __u32 count, total, total_success; const __u32 max_entries = 10; - bool nospace_err; __u64 batch = 0; int err, step; DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts, @@ -90,7 +89,6 @@ void test_array_map_batch_ops(void) * elements each. */ count = step; - nospace_err = false; while (true) { err = bpf_map_lookup_batch(map_fd, total ? 
&batch : NULL, &batch, @@ -107,9 +105,6 @@ void test_array_map_batch_ops(void) } - if (nospace_err == true) - continue; - CHECK(total != max_entries, "lookup with steps", "total = %u, max_entries = %u\n", total, max_entries); diff --git a/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c b/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c new file mode 100644 index 000000000000..2e986e5e4cac --- /dev/null +++ b/tools/testing/selftests/bpf/map_tests/lpm_trie_map_batch_ops.c @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <arpa/inet.h> +#include <linux/bpf.h> +#include <netinet/in.h> +#include <stdio.h> +#include <errno.h> +#include <string.h> +#include <stdlib.h> + +#include <bpf/bpf.h> +#include <bpf/libbpf.h> + +#include <test_maps.h> + +struct test_lpm_key { + __u32 prefix; + struct in_addr ipv4; +}; + +static void map_batch_update(int map_fd, __u32 max_entries, + struct test_lpm_key *keys, int *values) +{ + __u32 i; + int err; + char buff[16] = { 0 }; + DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts, + .elem_flags = 0, + .flags = 0, + ); + + for (i = 0; i < max_entries; i++) { + keys[i].prefix = 32; + snprintf(buff, 16, "192.168.1.%d", i + 1); + inet_pton(AF_INET, buff, &keys[i].ipv4); + values[i] = i + 1; + } + + err = bpf_map_update_batch(map_fd, keys, values, &max_entries, &opts); + CHECK(err, "bpf_map_update_batch()", "error:%s\n", strerror(errno)); +} + +static void map_batch_verify(int *visited, __u32 max_entries, + struct test_lpm_key *keys, int *values) +{ + char buff[16] = { 0 }; + int lower_byte = 0; + __u32 i; + + memset(visited, 0, max_entries * sizeof(*visited)); + for (i = 0; i < max_entries; i++) { + inet_ntop(AF_INET, &keys[i].ipv4, buff, 32); + CHECK(sscanf(buff, "192.168.1.%d", &lower_byte) == EOF, + "sscanf()", "error: i %d\n", i); + CHECK(lower_byte != values[i], "key/value checking", + "error: i %d key %s value %d\n", i, buff, values[i]); + visited[i] = 1; + } + for (i = 0; i < max_entries; i++) { + CHECK(visited[i] != 1, "visited checking", + "error: keys array at index %d missing\n", i); + } +} + +void test_lpm_trie_map_batch_ops(void) +{ + struct bpf_create_map_attr xattr = { + .name = "lpm_trie_map", + .map_type = BPF_MAP_TYPE_LPM_TRIE, + .key_size = sizeof(struct test_lpm_key), + .value_size = sizeof(int), + .map_flags = BPF_F_NO_PREALLOC, + }; + struct test_lpm_key *keys, key; + int map_fd, *values, *visited; + __u32 step, count, total, total_success; + const __u32 max_entries = 10; + __u64 batch = 0; + int err; + DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts, + .elem_flags = 0, + .flags = 0, + ); + + xattr.max_entries = max_entries; + map_fd = bpf_create_map_xattr(&xattr); + CHECK(map_fd == -1, "bpf_create_map_xattr()", "error:%s\n", + strerror(errno)); + + keys = malloc(max_entries * sizeof(struct test_lpm_key)); + values = malloc(max_entries * sizeof(int)); + visited = malloc(max_entries * sizeof(int)); + CHECK(!keys || !values || !visited, "malloc()", "error:%s\n", + strerror(errno)); + + total_success = 0; + for (step = 1; step < max_entries; step++) { + map_batch_update(map_fd, max_entries, keys, values); + map_batch_verify(visited, max_entries, keys, values); + memset(keys, 0, max_entries * sizeof(*keys)); + memset(values, 0, max_entries * sizeof(*values)); + batch = 0; + total = 0; + /* iteratively lookup/delete elements with 'step' + * elements each. + */ + count = step; + while (true) { + err = bpf_map_lookup_batch(map_fd, + total ? 
&batch : NULL, &batch, + keys + total, values + total, &count, &opts); + + CHECK((err && errno != ENOENT), "lookup with steps", + "error: %s\n", strerror(errno)); + + total += count; + if (err) + break; + } + + CHECK(total != max_entries, "lookup with steps", + "total = %u, max_entries = %u\n", total, max_entries); + + map_batch_verify(visited, max_entries, keys, values); + + total = 0; + count = step; + while (total < max_entries) { + if (max_entries - total < step) + count = max_entries - total; + err = bpf_map_delete_batch(map_fd, keys + total, &count, + &opts); + CHECK((err && errno != ENOENT), "delete batch", + "error: %s\n", strerror(errno)); + total += count; + if (err) + break; + } + CHECK(total != max_entries, "delete with steps", + "total = %u, max_entries = %u\n", total, max_entries); + + /* check map is empty, errono == ENOENT */ + err = bpf_map_get_next_key(map_fd, NULL, &key); + CHECK(!err || errno != ENOENT, "bpf_map_get_next_key()", + "error: %s\n", strerror(errno)); + + total_success++; + } + + CHECK(total_success == 0, "check total_success", + "unexpected failure\n"); + + printf("%s:PASS\n", __func__); + + free(keys); + free(values); + free(visited); +} diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c index 37c5494a0381..e25917f04602 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c @@ -6,6 +6,7 @@ #include <test_progs.h> #include "bpf_dctcp.skel.h" #include "bpf_cubic.skel.h" +#include "bpf_tcp_nogpl.skel.h" #define min(a, b) ((a) < (b) ? (a) : (b)) @@ -227,10 +228,53 @@ static void test_dctcp(void) bpf_dctcp__destroy(dctcp_skel); } +static char *err_str; +static bool found; + +static int libbpf_debug_print(enum libbpf_print_level level, + const char *format, va_list args) +{ + char *log_buf; + + if (level != LIBBPF_WARN || + strcmp(format, "libbpf: \n%s\n")) { + vprintf(format, args); + return 0; + } + + log_buf = va_arg(args, char *); + if (!log_buf) + goto out; + if (err_str && strstr(log_buf, err_str) != NULL) + found = true; +out: + printf(format, log_buf); + return 0; +} + +static void test_invalid_license(void) +{ + libbpf_print_fn_t old_print_fn; + struct bpf_tcp_nogpl *skel; + + err_str = "struct ops programs must have a GPL compatible license"; + found = false; + old_print_fn = libbpf_set_print(libbpf_debug_print); + + skel = bpf_tcp_nogpl__open_and_load(); + ASSERT_NULL(skel, "bpf_tcp_nogpl"); + ASSERT_EQ(found, true, "expected_err_msg"); + + bpf_tcp_nogpl__destroy(skel); + libbpf_set_print(old_print_fn); +} + void test_bpf_tcp_ca(void) { if (test__start_subtest("dctcp")) test_dctcp(); if (test__start_subtest("cubic")) test_cubic(); + if (test__start_subtest("invalid_license")) + test_invalid_license(); } diff --git a/tools/testing/selftests/bpf/prog_tests/check_mtu.c b/tools/testing/selftests/bpf/prog_tests/check_mtu.c index 36af1c138faf..b62a39315336 100644 --- a/tools/testing/selftests/bpf/prog_tests/check_mtu.c +++ b/tools/testing/selftests/bpf/prog_tests/check_mtu.c @@ -128,6 +128,8 @@ static void test_check_mtu_xdp(__u32 mtu, __u32 ifindex) test_check_mtu_run_xdp(skel, skel->progs.xdp_use_helper, mtu); test_check_mtu_run_xdp(skel, skel->progs.xdp_exceed_mtu, mtu); test_check_mtu_run_xdp(skel, skel->progs.xdp_minus_delta, mtu); + test_check_mtu_run_xdp(skel, skel->progs.xdp_input_len, mtu); + test_check_mtu_run_xdp(skel, skel->progs.xdp_input_len_exceed, mtu); cleanup: test_check_mtu__destroy(skel); @@ -187,6 
+189,8 @@ static void test_check_mtu_tc(__u32 mtu, __u32 ifindex) test_check_mtu_run_tc(skel, skel->progs.tc_exceed_mtu, mtu); test_check_mtu_run_tc(skel, skel->progs.tc_exceed_mtu_da, mtu); test_check_mtu_run_tc(skel, skel->progs.tc_minus_delta, mtu); + test_check_mtu_run_tc(skel, skel->progs.tc_input_len, mtu); + test_check_mtu_run_tc(skel, skel->progs.tc_input_len_exceed, mtu); cleanup: test_check_mtu__destroy(skel); } diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c b/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c new file mode 100644 index 000000000000..6c4d42a2386f --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/fexit_sleep.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#define _GNU_SOURCE +#include <sched.h> +#include <test_progs.h> +#include <time.h> +#include <sys/mman.h> +#include <sys/syscall.h> +#include "fexit_sleep.skel.h" + +static int do_sleep(void *skel) +{ + struct fexit_sleep *fexit_skel = skel; + struct timespec ts1 = { .tv_nsec = 1 }; + struct timespec ts2 = { .tv_sec = 10 }; + + fexit_skel->bss->pid = getpid(); + (void)syscall(__NR_nanosleep, &ts1, NULL); + (void)syscall(__NR_nanosleep, &ts2, NULL); + return 0; +} + +#define STACK_SIZE (1024 * 1024) +static char child_stack[STACK_SIZE]; + +void test_fexit_sleep(void) +{ + struct fexit_sleep *fexit_skel = NULL; + int wstatus, duration = 0; + pid_t cpid; + int err, fexit_cnt; + + fexit_skel = fexit_sleep__open_and_load(); + if (CHECK(!fexit_skel, "fexit_skel_load", "fexit skeleton failed\n")) + goto cleanup; + + err = fexit_sleep__attach(fexit_skel); + if (CHECK(err, "fexit_attach", "fexit attach failed: %d\n", err)) + goto cleanup; + + cpid = clone(do_sleep, child_stack + STACK_SIZE, CLONE_FILES | SIGCHLD, fexit_skel); + if (CHECK(cpid == -1, "clone", strerror(errno))) + goto cleanup; + + /* wait until first sys_nanosleep ends and second sys_nanosleep starts */ + while (READ_ONCE(fexit_skel->bss->fentry_cnt) != 2); + fexit_cnt = READ_ONCE(fexit_skel->bss->fexit_cnt); + if (CHECK(fexit_cnt != 1, "fexit_cnt", "%d", fexit_cnt)) + goto cleanup; + + /* close progs and detach them. That will trigger two nop5->jmp5 rewrites + * in the trampolines to skip nanosleep_fexit prog. + * The nanosleep_fentry prog will get detached first. + * The nanosleep_fexit prog will get detached second. + * Detaching will trigger freeing of both progs JITed images. + * There will be two dying bpf_tramp_image-s, but only the initial + * bpf_tramp_image (with both _fentry and _fexit progs will be stuck + * waiting for percpu_ref_kill to confirm). The other one + * will be freed quickly. + */ + close(bpf_program__fd(fexit_skel->progs.nanosleep_fentry)); + close(bpf_program__fd(fexit_skel->progs.nanosleep_fexit)); + fexit_sleep__detach(fexit_skel); + + /* kill the thread to unwind sys_nanosleep stack through the trampoline */ + kill(cpid, 9); + + if (CHECK(waitpid(cpid, &wstatus, 0) == -1, "waitpid", strerror(errno))) + goto cleanup; + if (CHECK(WEXITSTATUS(wstatus) != 0, "exitstatus", "failed")) + goto cleanup; + + /* The bypassed nanosleep_fexit prog shouldn't have executed. + * Unlike progs the maps were not freed and directly accessible. 
+ */ + fexit_cnt = READ_ONCE(fexit_skel->bss->fexit_cnt); + if (CHECK(fexit_cnt != 1, "fexit_cnt", "%d", fexit_cnt)) + goto cleanup; + +cleanup: + fexit_sleep__destroy(fexit_skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/kfunc_call.c b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c new file mode 100644 index 000000000000..7fc0951ee75f --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/kfunc_call.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include <test_progs.h> +#include <network_helpers.h> +#include "kfunc_call_test.skel.h" +#include "kfunc_call_test_subprog.skel.h" + +static void test_main(void) +{ + struct kfunc_call_test *skel; + int prog_fd, retval, err; + + skel = kfunc_call_test__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel")) + return; + + prog_fd = bpf_program__fd(skel->progs.kfunc_call_test1); + err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), + NULL, NULL, (__u32 *)&retval, NULL); + ASSERT_OK(err, "bpf_prog_test_run(test1)"); + ASSERT_EQ(retval, 12, "test1-retval"); + + prog_fd = bpf_program__fd(skel->progs.kfunc_call_test2); + err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), + NULL, NULL, (__u32 *)&retval, NULL); + ASSERT_OK(err, "bpf_prog_test_run(test2)"); + ASSERT_EQ(retval, 3, "test2-retval"); + + kfunc_call_test__destroy(skel); +} + +static void test_subprog(void) +{ + struct kfunc_call_test_subprog *skel; + int prog_fd, retval, err; + + skel = kfunc_call_test_subprog__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel")) + return; + + prog_fd = bpf_program__fd(skel->progs.kfunc_call_test1); + err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), + NULL, NULL, (__u32 *)&retval, NULL); + ASSERT_OK(err, "bpf_prog_test_run(test1)"); + ASSERT_EQ(retval, 10, "test1-retval"); + ASSERT_NEQ(skel->data->active_res, -1, "active_res"); + ASSERT_EQ(skel->data->sk_state, BPF_TCP_CLOSE, "sk_state"); + + kfunc_call_test_subprog__destroy(skel); +} + +void test_kfunc_call(void) +{ + if (test__start_subtest("main")) + test_main(); + + if (test__start_subtest("subprog")) + test_subprog(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index b8b48cac2ac3..ab77596b64e3 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -7,6 +7,7 @@ #include "test_skmsg_load_helpers.skel.h" #include "test_sockmap_update.skel.h" #include "test_sockmap_invalid_update.skel.h" +#include "test_sockmap_skb_verdict_attach.skel.h" #include "bpf_iter_sockmap.skel.h" #define TCP_REPAIR 19 /* TCP sock is under repair right now */ @@ -281,6 +282,39 @@ out: bpf_iter_sockmap__destroy(skel); } +static void test_sockmap_skb_verdict_attach(enum bpf_attach_type first, + enum bpf_attach_type second) +{ + struct test_sockmap_skb_verdict_attach *skel; + int err, map, verdict; + + skel = test_sockmap_skb_verdict_attach__open_and_load(); + if (CHECK_FAIL(!skel)) { + perror("test_sockmap_skb_verdict_attach__open_and_load"); + return; + } + + verdict = bpf_program__fd(skel->progs.prog_skb_verdict); + map = bpf_map__fd(skel->maps.sock_map); + + err = bpf_prog_attach(verdict, map, first, 0); + if (CHECK_FAIL(err)) { + perror("bpf_prog_attach"); + goto out; + } + + err = bpf_prog_attach(verdict, map, second, 0); + assert(err == -1 && errno == EBUSY); + + err = bpf_prog_detach2(verdict, map, first); + if (CHECK_FAIL(err)) { + perror("bpf_prog_detach2"); + goto out; + } 
+out: + test_sockmap_skb_verdict_attach__destroy(skel); +} + void test_sockmap_basic(void) { if (test__start_subtest("sockmap create_update_free")) @@ -301,4 +335,10 @@ void test_sockmap_basic(void) test_sockmap_copy(BPF_MAP_TYPE_SOCKMAP); if (test__start_subtest("sockhash copy")) test_sockmap_copy(BPF_MAP_TYPE_SOCKHASH); + if (test__start_subtest("sockmap skb_verdict attach")) { + test_sockmap_skb_verdict_attach(BPF_SK_SKB_VERDICT, + BPF_SK_SKB_STREAM_VERDICT); + test_sockmap_skb_verdict_attach(BPF_SK_SKB_STREAM_VERDICT, + BPF_SK_SKB_VERDICT); + } } diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c index c26e6bf05e49..648d9ae898d2 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c @@ -1603,6 +1603,141 @@ static void test_reuseport(struct test_sockmap_listen *skel, } } +static void udp_redir_to_connected(int family, int sotype, int sock_mapfd, + int verd_mapfd, enum redir_mode mode) +{ + const char *log_prefix = redir_mode_str(mode); + struct sockaddr_storage addr; + int c0, c1, p0, p1; + unsigned int pass; + socklen_t len; + int err, n; + u64 value; + u32 key; + char b; + + zero_verdict_count(verd_mapfd); + + p0 = socket_loopback(family, sotype | SOCK_NONBLOCK); + if (p0 < 0) + return; + len = sizeof(addr); + err = xgetsockname(p0, sockaddr(&addr), &len); + if (err) + goto close_peer0; + + c0 = xsocket(family, sotype | SOCK_NONBLOCK, 0); + if (c0 < 0) + goto close_peer0; + err = xconnect(c0, sockaddr(&addr), len); + if (err) + goto close_cli0; + err = xgetsockname(c0, sockaddr(&addr), &len); + if (err) + goto close_cli0; + err = xconnect(p0, sockaddr(&addr), len); + if (err) + goto close_cli0; + + p1 = socket_loopback(family, sotype | SOCK_NONBLOCK); + if (p1 < 0) + goto close_cli0; + err = xgetsockname(p1, sockaddr(&addr), &len); + if (err) + goto close_cli0; + + c1 = xsocket(family, sotype | SOCK_NONBLOCK, 0); + if (c1 < 0) + goto close_peer1; + err = xconnect(c1, sockaddr(&addr), len); + if (err) + goto close_cli1; + err = xgetsockname(c1, sockaddr(&addr), &len); + if (err) + goto close_cli1; + err = xconnect(p1, sockaddr(&addr), len); + if (err) + goto close_cli1; + + key = 0; + value = p0; + err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); + if (err) + goto close_cli1; + + key = 1; + value = p1; + err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST); + if (err) + goto close_cli1; + + n = write(c1, "a", 1); + if (n < 0) + FAIL_ERRNO("%s: write", log_prefix); + if (n == 0) + FAIL("%s: incomplete write", log_prefix); + if (n < 1) + goto close_cli1; + + key = SK_PASS; + err = xbpf_map_lookup_elem(verd_mapfd, &key, &pass); + if (err) + goto close_cli1; + if (pass != 1) + FAIL("%s: want pass count 1, have %d", log_prefix, pass); + + n = read(mode == REDIR_INGRESS ? 
p0 : c0, &b, 1); + if (n < 0) + FAIL_ERRNO("%s: read", log_prefix); + if (n == 0) + FAIL("%s: incomplete read", log_prefix); + +close_cli1: + xclose(c1); +close_peer1: + xclose(p1); +close_cli0: + xclose(c0); +close_peer0: + xclose(p0); +} + +static void udp_skb_redir_to_connected(struct test_sockmap_listen *skel, + struct bpf_map *inner_map, int family) +{ + int verdict = bpf_program__fd(skel->progs.prog_skb_verdict); + int verdict_map = bpf_map__fd(skel->maps.verdict_map); + int sock_map = bpf_map__fd(inner_map); + int err; + + err = xbpf_prog_attach(verdict, sock_map, BPF_SK_SKB_VERDICT, 0); + if (err) + return; + + skel->bss->test_ingress = false; + udp_redir_to_connected(family, SOCK_DGRAM, sock_map, verdict_map, + REDIR_EGRESS); + skel->bss->test_ingress = true; + udp_redir_to_connected(family, SOCK_DGRAM, sock_map, verdict_map, + REDIR_INGRESS); + + xbpf_prog_detach2(verdict, sock_map, BPF_SK_SKB_VERDICT); +} + +static void test_udp_redir(struct test_sockmap_listen *skel, struct bpf_map *map, + int family) +{ + const char *family_name, *map_name; + char s[MAX_TEST_NAME]; + + family_name = family_str(family); + map_name = map_type_str(map); + snprintf(s, sizeof(s), "%s %s %s", map_name, family_name, __func__); + if (!test__start_subtest(s)) + return; + udp_skb_redir_to_connected(skel, map, family); +} + static void run_tests(struct test_sockmap_listen *skel, struct bpf_map *map, int family) { @@ -1611,6 +1746,7 @@ static void run_tests(struct test_sockmap_listen *skel, struct bpf_map *map, test_redir(skel, map, family, SOCK_STREAM); test_reuseport(skel, map, family, SOCK_STREAM); test_reuseport(skel, map, family, SOCK_DGRAM); + test_udp_redir(skel, map, family); } void test_sockmap_listen(void) diff --git a/tools/testing/selftests/bpf/prog_tests/static_linked.c b/tools/testing/selftests/bpf/prog_tests/static_linked.c new file mode 100644 index 000000000000..46556976dccc --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/static_linked.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook */ + +#include <test_progs.h> +#include "test_static_linked.skel.h" + +void test_static_linked(void) +{ + int err; + struct test_static_linked* skel; + + skel = test_static_linked__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + skel->rodata->rovar1 = 1; + skel->bss->static_var1 = 2; + skel->bss->static_var11 = 3; + + skel->rodata->rovar2 = 4; + skel->bss->static_var2 = 5; + skel->bss->static_var22 = 6; + + err = test_static_linked__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto cleanup; + + err = test_static_linked__attach(skel); + if (!ASSERT_OK(err, "skel_attach")) + goto cleanup; + + /* trigger */ + usleep(1); + + ASSERT_EQ(skel->bss->var1, 1 * 2 + 2 + 3, "var1"); + ASSERT_EQ(skel->bss->var2, 4 * 3 + 5 + 6, "var2"); + +cleanup: + test_static_linked__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/test_ima.c b/tools/testing/selftests/bpf/prog_tests/test_ima.c index b54bc0c351b7..0252f61d611a 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_ima.c +++ b/tools/testing/selftests/bpf/prog_tests/test_ima.c @@ -68,7 +68,8 @@ void test_test_ima(void) goto close_prog; snprintf(cmd, sizeof(cmd), "./ima_setup.sh setup %s", measured_dir); - if (CHECK_FAIL(system(cmd))) + err = system(cmd); + if (CHECK(err, "failed to run command", "%s, errno = %d\n", cmd, errno)) goto close_clean; err = run_measured_process(measured_dir, &skel->bss->monitored_pid); @@ -81,7 +82,8 @@ void test_test_ima(void) close_clean: snprintf(cmd, 
sizeof(cmd), "./ima_setup.sh cleanup %s", measured_dir); - CHECK_FAIL(system(cmd)); + err = system(cmd); + CHECK(err, "failed to run command", "%s, errno = %d\n", cmd, errno); close_prog: ima__destroy(skel); } diff --git a/tools/testing/selftests/bpf/progs/bind4_prog.c b/tools/testing/selftests/bpf/progs/bind4_prog.c index 115a3b0ad984..474c6a62078a 100644 --- a/tools/testing/selftests/bpf/progs/bind4_prog.c +++ b/tools/testing/selftests/bpf/progs/bind4_prog.c @@ -57,6 +57,27 @@ static __inline int bind_to_device(struct bpf_sock_addr *ctx) return 0; } +static __inline int bind_reuseport(struct bpf_sock_addr *ctx) +{ + int val = 1; + + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_REUSEPORT, + &val, sizeof(val))) + return 1; + if (bpf_getsockopt(ctx, SOL_SOCKET, SO_REUSEPORT, + &val, sizeof(val)) || !val) + return 1; + val = 0; + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_REUSEPORT, + &val, sizeof(val))) + return 1; + if (bpf_getsockopt(ctx, SOL_SOCKET, SO_REUSEPORT, + &val, sizeof(val)) || val) + return 1; + + return 0; +} + static __inline int misc_opts(struct bpf_sock_addr *ctx, int opt) { int old, tmp, new = 0xeb9f; @@ -127,6 +148,10 @@ int bind_v4_prog(struct bpf_sock_addr *ctx) if (misc_opts(ctx, SO_MARK) || misc_opts(ctx, SO_PRIORITY)) return 0; + /* Set reuseport and unset */ + if (bind_reuseport(ctx)) + return 0; + ctx->user_ip4 = bpf_htonl(SERV4_REWRITE_IP); ctx->user_port = bpf_htons(SERV4_REWRITE_PORT); diff --git a/tools/testing/selftests/bpf/progs/bind6_prog.c b/tools/testing/selftests/bpf/progs/bind6_prog.c index 4c0d348034b9..c19cfa869f30 100644 --- a/tools/testing/selftests/bpf/progs/bind6_prog.c +++ b/tools/testing/selftests/bpf/progs/bind6_prog.c @@ -63,6 +63,27 @@ static __inline int bind_to_device(struct bpf_sock_addr *ctx) return 0; } +static __inline int bind_reuseport(struct bpf_sock_addr *ctx) +{ + int val = 1; + + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_REUSEPORT, + &val, sizeof(val))) + return 1; + if (bpf_getsockopt(ctx, SOL_SOCKET, SO_REUSEPORT, + &val, sizeof(val)) || !val) + return 1; + val = 0; + if (bpf_setsockopt(ctx, SOL_SOCKET, SO_REUSEPORT, + &val, sizeof(val))) + return 1; + if (bpf_getsockopt(ctx, SOL_SOCKET, SO_REUSEPORT, + &val, sizeof(val)) || val) + return 1; + + return 0; +} + static __inline int misc_opts(struct bpf_sock_addr *ctx, int opt) { int old, tmp, new = 0xeb9f; @@ -141,6 +162,10 @@ int bind_v6_prog(struct bpf_sock_addr *ctx) if (misc_opts(ctx, SO_MARK) || misc_opts(ctx, SO_PRIORITY)) return 0; + /* Set reuseport and unset */ + if (bind_reuseport(ctx)) + return 0; + ctx->user_ip6[0] = bpf_htonl(SERV6_REWRITE_IP_0); ctx->user_ip6[1] = bpf_htonl(SERV6_REWRITE_IP_1); ctx->user_ip6[2] = bpf_htonl(SERV6_REWRITE_IP_2); diff --git a/tools/testing/selftests/bpf/progs/bpf_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cubic.c index 6939bfd8690f..f62df4d023f9 100644 --- a/tools/testing/selftests/bpf/progs/bpf_cubic.c +++ b/tools/testing/selftests/bpf/progs/bpf_cubic.c @@ -174,8 +174,8 @@ static __always_inline void bictcp_hystart_reset(struct sock *sk) * as long as it is used in one of the func ptr * under SEC(".struct_ops"). */ -SEC("struct_ops/bictcp_init") -void BPF_PROG(bictcp_init, struct sock *sk) +SEC("struct_ops/bpf_cubic_init") +void BPF_PROG(bpf_cubic_init, struct sock *sk) { struct bictcp *ca = inet_csk_ca(sk); @@ -192,7 +192,7 @@ void BPF_PROG(bictcp_init, struct sock *sk) * The remaining tcp-cubic functions have an easier way. 
*/ SEC("no-sec-prefix-bictcp_cwnd_event") -void BPF_PROG(bictcp_cwnd_event, struct sock *sk, enum tcp_ca_event event) +void BPF_PROG(bpf_cubic_cwnd_event, struct sock *sk, enum tcp_ca_event event) { if (event == CA_EVENT_TX_START) { struct bictcp *ca = inet_csk_ca(sk); @@ -384,7 +384,7 @@ tcp_friendliness: } /* Or simply use the BPF_STRUCT_OPS to avoid the SEC boiler plate. */ -void BPF_STRUCT_OPS(bictcp_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) +void BPF_STRUCT_OPS(bpf_cubic_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); @@ -403,7 +403,7 @@ void BPF_STRUCT_OPS(bictcp_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) tcp_cong_avoid_ai(tp, ca->cnt, acked); } -__u32 BPF_STRUCT_OPS(bictcp_recalc_ssthresh, struct sock *sk) +__u32 BPF_STRUCT_OPS(bpf_cubic_recalc_ssthresh, struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); @@ -420,7 +420,7 @@ __u32 BPF_STRUCT_OPS(bictcp_recalc_ssthresh, struct sock *sk) return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); } -void BPF_STRUCT_OPS(bictcp_state, struct sock *sk, __u8 new_state) +void BPF_STRUCT_OPS(bpf_cubic_state, struct sock *sk, __u8 new_state) { if (new_state == TCP_CA_Loss) { bictcp_reset(inet_csk_ca(sk)); @@ -496,7 +496,7 @@ static __always_inline void hystart_update(struct sock *sk, __u32 delay) } } -void BPF_STRUCT_OPS(bictcp_acked, struct sock *sk, +void BPF_STRUCT_OPS(bpf_cubic_acked, struct sock *sk, const struct ack_sample *sample) { const struct tcp_sock *tp = tcp_sk(sk); @@ -525,21 +525,21 @@ void BPF_STRUCT_OPS(bictcp_acked, struct sock *sk, hystart_update(sk, delay); } -__u32 BPF_STRUCT_OPS(tcp_reno_undo_cwnd, struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); +extern __u32 tcp_reno_undo_cwnd(struct sock *sk) __ksym; - return max(tp->snd_cwnd, tp->prior_cwnd); +__u32 BPF_STRUCT_OPS(bpf_cubic_undo_cwnd, struct sock *sk) +{ + return tcp_reno_undo_cwnd(sk); } SEC(".struct_ops") struct tcp_congestion_ops cubic = { - .init = (void *)bictcp_init, - .ssthresh = (void *)bictcp_recalc_ssthresh, - .cong_avoid = (void *)bictcp_cong_avoid, - .set_state = (void *)bictcp_state, - .undo_cwnd = (void *)tcp_reno_undo_cwnd, - .cwnd_event = (void *)bictcp_cwnd_event, - .pkts_acked = (void *)bictcp_acked, + .init = (void *)bpf_cubic_init, + .ssthresh = (void *)bpf_cubic_recalc_ssthresh, + .cong_avoid = (void *)bpf_cubic_cong_avoid, + .set_state = (void *)bpf_cubic_state, + .undo_cwnd = (void *)bpf_cubic_undo_cwnd, + .cwnd_event = (void *)bpf_cubic_cwnd_event, + .pkts_acked = (void *)bpf_cubic_acked, .name = "bpf_cubic", }; diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c index 4dc1a967776a..fd42247da8b4 100644 --- a/tools/testing/selftests/bpf/progs/bpf_dctcp.c +++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c @@ -194,22 +194,12 @@ __u32 BPF_PROG(dctcp_cwnd_undo, struct sock *sk) return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); } -SEC("struct_ops/tcp_reno_cong_avoid") -void BPF_PROG(tcp_reno_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (!tcp_is_cwnd_limited(sk)) - return; +extern void tcp_reno_cong_avoid(struct sock *sk, __u32 ack, __u32 acked) __ksym; - /* In "safe" area, increase. */ - if (tcp_in_slow_start(tp)) { - acked = tcp_slow_start(tp, acked); - if (!acked) - return; - } - /* In dangerous area, increase slowly. 
*/ - tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked); +SEC("struct_ops/dctcp_reno_cong_avoid") +void BPF_PROG(dctcp_cong_avoid, struct sock *sk, __u32 ack, __u32 acked) +{ + tcp_reno_cong_avoid(sk, ack, acked); } SEC(".struct_ops") @@ -226,7 +216,7 @@ struct tcp_congestion_ops dctcp = { .in_ack_event = (void *)dctcp_update_alpha, .cwnd_event = (void *)dctcp_cwnd_event, .ssthresh = (void *)dctcp_ssthresh, - .cong_avoid = (void *)tcp_reno_cong_avoid, + .cong_avoid = (void *)dctcp_cong_avoid, .undo_cwnd = (void *)dctcp_cwnd_undo, .set_state = (void *)dctcp_state, .flags = TCP_CONG_NEEDS_ECN, diff --git a/tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c b/tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c new file mode 100644 index 000000000000..2ecd833dcd41 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_tcp_nogpl.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/bpf.h> +#include <linux/types.h> +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include "bpf_tcp_helpers.h" + +char _license[] SEC("license") = "X"; + +void BPF_STRUCT_OPS(nogpltcp_init, struct sock *sk) +{ +} + +SEC(".struct_ops") +struct tcp_congestion_ops bpf_nogpltcp = { + .init = (void *)nogpltcp_init, + .name = "bpf_nogpltcp", +}; diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c index 12b40dc81e14..8aaa24a00322 100644 --- a/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c +++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c @@ -174,6 +174,12 @@ struct struct_in_struct { }; }; +struct struct_in_array {}; + +struct struct_in_array_typed {}; + +typedef struct struct_in_array_typed struct_in_array_t[2]; + struct struct_with_embedded_stuff { int a; struct { @@ -203,6 +209,8 @@ struct struct_with_embedded_stuff { } r[5]; struct struct_in_struct s[10]; int t[11]; + struct struct_in_array (*u)[2]; + struct_in_array_t *v; }; struct float_struct { diff --git a/tools/testing/selftests/bpf/progs/fentry_test.c b/tools/testing/selftests/bpf/progs/fentry_test.c index 5f645fdaba6f..52a550d281d9 100644 --- a/tools/testing/selftests/bpf/progs/fentry_test.c +++ b/tools/testing/selftests/bpf/progs/fentry_test.c @@ -64,7 +64,7 @@ __u64 test7_result = 0; SEC("fentry/bpf_fentry_test7") int BPF_PROG(test7, struct bpf_fentry_test_t *arg) { - if (arg == 0) + if (!arg) test7_result = 1; return 0; } diff --git a/tools/testing/selftests/bpf/progs/fexit_sleep.c b/tools/testing/selftests/bpf/progs/fexit_sleep.c new file mode 100644 index 000000000000..03a672d76353 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/fexit_sleep.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> + +char LICENSE[] SEC("license") = "GPL"; + +int pid = 0; +int fentry_cnt = 0; +int fexit_cnt = 0; + +SEC("fentry/__x64_sys_nanosleep") +int BPF_PROG(nanosleep_fentry, const struct pt_regs *regs) +{ + if ((int)bpf_get_current_pid_tgid() != pid) + return 0; + + fentry_cnt++; + return 0; +} + +SEC("fexit/__x64_sys_nanosleep") +int BPF_PROG(nanosleep_fexit, const struct pt_regs *regs, int ret) +{ + if ((int)bpf_get_current_pid_tgid() != pid) + return 0; + + fexit_cnt++; + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/fexit_test.c b/tools/testing/selftests/bpf/progs/fexit_test.c index 0952affb22a6..8f1ccb7302e1 100644 --- a/tools/testing/selftests/bpf/progs/fexit_test.c +++ 
b/tools/testing/selftests/bpf/progs/fexit_test.c @@ -65,7 +65,7 @@ __u64 test7_result = 0; SEC("fexit/bpf_fentry_test7") int BPF_PROG(test7, struct bpf_fentry_test_t *arg) { - if (arg == 0) + if (!arg) test7_result = 1; return 0; } @@ -74,7 +74,7 @@ __u64 test8_result = 0; SEC("fexit/bpf_fentry_test8") int BPF_PROG(test8, struct bpf_fentry_test_t *arg) { - if (arg->a == 0) + if (!arg->a) test8_result = 1; return 0; } diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_test.c b/tools/testing/selftests/bpf/progs/kfunc_call_test.c new file mode 100644 index 000000000000..470f8723e463 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/kfunc_call_test.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include "bpf_tcp_helpers.h" + +extern int bpf_kfunc_call_test2(struct sock *sk, __u32 a, __u32 b) __ksym; +extern __u64 bpf_kfunc_call_test1(struct sock *sk, __u32 a, __u64 b, + __u32 c, __u64 d) __ksym; + +SEC("classifier") +int kfunc_call_test2(struct __sk_buff *skb) +{ + struct bpf_sock *sk = skb->sk; + + if (!sk) + return -1; + + sk = bpf_sk_fullsock(sk); + if (!sk) + return -1; + + return bpf_kfunc_call_test2((struct sock *)sk, 1, 2); +} + +SEC("classifier") +int kfunc_call_test1(struct __sk_buff *skb) +{ + struct bpf_sock *sk = skb->sk; + __u64 a = 1ULL << 32; + __u32 ret; + + if (!sk) + return -1; + + sk = bpf_sk_fullsock(sk); + if (!sk) + return -1; + + a = bpf_kfunc_call_test1((struct sock *)sk, 1, a | 2, 3, a | 4); + ret = a >> 32; /* ret should be 2 */ + ret += (__u32)a; /* ret should be 12 */ + + return ret; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c b/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c new file mode 100644 index 000000000000..b2dcb7d9cb03 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/kfunc_call_test_subprog.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> +#include "bpf_tcp_helpers.h" + +extern const int bpf_prog_active __ksym; +extern __u64 bpf_kfunc_call_test1(struct sock *sk, __u32 a, __u64 b, + __u32 c, __u64 d) __ksym; +extern struct sock *bpf_kfunc_call_test3(struct sock *sk) __ksym; +int active_res = -1; +int sk_state = -1; + +int __noinline f1(struct __sk_buff *skb) +{ + struct bpf_sock *sk = skb->sk; + int *active; + + if (!sk) + return -1; + + sk = bpf_sk_fullsock(sk); + if (!sk) + return -1; + + active = (int *)bpf_per_cpu_ptr(&bpf_prog_active, + bpf_get_smp_processor_id()); + if (active) + active_res = *active; + + sk_state = bpf_kfunc_call_test3((struct sock *)sk)->__sk_common.skc_state; + + return (__u32)bpf_kfunc_call_test1((struct sock *)sk, 1, 2, 3, 4); +} + +SEC("classifier") +int kfunc_call_test1(struct __sk_buff *skb) +{ + return f1(skb); +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/skb_pkt_end.c b/tools/testing/selftests/bpf/progs/skb_pkt_end.c index cf6823f42e80..7f2eaa2f89f8 100644 --- a/tools/testing/selftests/bpf/progs/skb_pkt_end.c +++ b/tools/testing/selftests/bpf/progs/skb_pkt_end.c @@ -4,7 +4,6 @@ #include <bpf/bpf_core_read.h> #include <bpf/bpf_helpers.h> -#define NULL 0 #define INLINE __always_inline #define skb_shorter(skb, len) ((void *)(long)(skb)->data + (len) > (void *)(long)skb->data_end) diff --git a/tools/testing/selftests/bpf/progs/test_check_mtu.c 
b/tools/testing/selftests/bpf/progs/test_check_mtu.c index b7787b43f9db..c4a9bae96e75 100644 --- a/tools/testing/selftests/bpf/progs/test_check_mtu.c +++ b/tools/testing/selftests/bpf/progs/test_check_mtu.c @@ -105,6 +105,54 @@ int xdp_minus_delta(struct xdp_md *ctx) return retval; } +SEC("xdp") +int xdp_input_len(struct xdp_md *ctx) +{ + int retval = XDP_PASS; /* Expected retval on successful test */ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + __u32 ifindex = GLOBAL_USER_IFINDEX; + __u32 data_len = data_end - data; + + /* API allow user give length to check as input via mtu_len param, + * resulting MTU value is still output in mtu_len param after call. + * + * Input len is L3, like MTU and iph->tot_len. + * Remember XDP data_len is L2. + */ + __u32 mtu_len = data_len - ETH_HLEN; + + if (bpf_check_mtu(ctx, ifindex, &mtu_len, 0, 0)) + retval = XDP_ABORTED; + + global_bpf_mtu_xdp = mtu_len; + return retval; +} + +SEC("xdp") +int xdp_input_len_exceed(struct xdp_md *ctx) +{ + int retval = XDP_ABORTED; /* Fail */ + __u32 ifindex = GLOBAL_USER_IFINDEX; + int err; + + /* API allow user give length to check as input via mtu_len param, + * resulting MTU value is still output in mtu_len param after call. + * + * Input length value is L3 size like MTU. + */ + __u32 mtu_len = GLOBAL_USER_MTU; + + mtu_len += 1; /* Exceed with 1 */ + + err = bpf_check_mtu(ctx, ifindex, &mtu_len, 0, 0); + if (err == BPF_MTU_CHK_RET_FRAG_NEEDED) + retval = XDP_PASS ; /* Success in exceeding MTU check */ + + global_bpf_mtu_xdp = mtu_len; + return retval; +} + SEC("classifier") int tc_use_helper(struct __sk_buff *ctx) { @@ -196,3 +244,47 @@ int tc_minus_delta(struct __sk_buff *ctx) global_bpf_mtu_xdp = mtu_len; return retval; } + +SEC("classifier") +int tc_input_len(struct __sk_buff *ctx) +{ + int retval = BPF_OK; /* Expected retval on successful test */ + __u32 ifindex = GLOBAL_USER_IFINDEX; + + /* API allow user give length to check as input via mtu_len param, + * resulting MTU value is still output in mtu_len param after call. + * + * Input length value is L3 size. + */ + __u32 mtu_len = GLOBAL_USER_MTU; + + if (bpf_check_mtu(ctx, ifindex, &mtu_len, 0, 0)) + retval = BPF_DROP; + + global_bpf_mtu_xdp = mtu_len; + return retval; +} + +SEC("classifier") +int tc_input_len_exceed(struct __sk_buff *ctx) +{ + int retval = BPF_DROP; /* Fail */ + __u32 ifindex = GLOBAL_USER_IFINDEX; + int err; + + /* API allow user give length to check as input via mtu_len param, + * resulting MTU value is still output in mtu_len param after call. + * + * Input length value is L3 size like MTU. 
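+	 * For illustration (values assumed, not set by this test): with an
+	 * interface MTU of 1500, passing mtu_len = 1501 makes bpf_check_mtu()
+	 * return BPF_MTU_CHK_RET_FRAG_NEEDED, which this test treats as success.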
+ */ + __u32 mtu_len = GLOBAL_USER_MTU; + + mtu_len += 1; /* Exceed with 1 */ + + err = bpf_check_mtu(ctx, ifindex, &mtu_len, 0, 0); + if (err == BPF_MTU_CHK_RET_FRAG_NEEDED) + retval = BPF_OK; /* Success in exceeding MTU check */ + + global_bpf_mtu_xdp = mtu_len; + return retval; +} diff --git a/tools/testing/selftests/bpf/progs/test_global_func10.c b/tools/testing/selftests/bpf/progs/test_global_func10.c index 61c2ae92ce41..97b7031d0e22 100644 --- a/tools/testing/selftests/bpf/progs/test_global_func10.c +++ b/tools/testing/selftests/bpf/progs/test_global_func10.c @@ -14,7 +14,7 @@ struct Big { __noinline int foo(const struct Big *big) { - if (big == 0) + if (!big) return 0; return bpf_get_prandom_u32() < big->y; diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_listen.c b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c index fa221141e9c1..a39eba9f5201 100644 --- a/tools/testing/selftests/bpf/progs/test_sockmap_listen.c +++ b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c @@ -29,6 +29,7 @@ struct { } verdict_map SEC(".maps"); static volatile bool test_sockmap; /* toggled by user-space */ +static volatile bool test_ingress; /* toggled by user-space */ SEC("sk_skb/stream_parser") int prog_stream_parser(struct __sk_buff *skb) @@ -55,6 +56,27 @@ int prog_stream_verdict(struct __sk_buff *skb) return verdict; } +SEC("sk_skb/skb_verdict") +int prog_skb_verdict(struct __sk_buff *skb) +{ + unsigned int *count; + __u32 zero = 0; + int verdict; + + if (test_sockmap) + verdict = bpf_sk_redirect_map(skb, &sock_map, zero, + test_ingress ? BPF_F_INGRESS : 0); + else + verdict = bpf_sk_redirect_hash(skb, &sock_hash, &zero, + test_ingress ? BPF_F_INGRESS : 0); + + count = bpf_map_lookup_elem(&verdict_map, &verdict); + if (count) + (*count)++; + + return verdict; +} + SEC("sk_msg") int prog_msg_verdict(struct sk_msg_md *msg) { diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c b/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c new file mode 100644 index 000000000000..2d31f66e4f23 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_sockmap_skb_verdict_attach.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> + +struct { + __uint(type, BPF_MAP_TYPE_SOCKMAP); + __uint(max_entries, 2); + __type(key, __u32); + __type(value, __u64); +} sock_map SEC(".maps"); + +SEC("sk_skb/skb_verdict") +int prog_skb_verdict(struct __sk_buff *skb) +{ + return SK_DROP; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_static_linked1.c b/tools/testing/selftests/bpf/progs/test_static_linked1.c new file mode 100644 index 000000000000..ea1a6c4c7172 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_static_linked1.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +/* 8-byte aligned .bss */ +static volatile long static_var1; +static volatile int static_var11; +int var1 = 0; +/* 4-byte aligned .rodata */ +const volatile int rovar1; + +/* same "subprog" name in both files */ +static __noinline int subprog(int x) +{ + /* but different formula */ + return x * 2; +} + +SEC("raw_tp/sys_enter") +int handler1(const void *ctx) +{ + var1 = subprog(rovar1) + static_var1 + static_var11; + + return 0; +} + +char LICENSE[] SEC("license") = "GPL"; +int VERSION SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/test_static_linked2.c 
b/tools/testing/selftests/bpf/progs/test_static_linked2.c new file mode 100644 index 000000000000..54d8d1ab577c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_static_linked2.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Facebook */ + +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +/* 4-byte aligned .bss */ +static volatile int static_var2; +static volatile int static_var22; +int var2 = 0; +/* 8-byte aligned .rodata */ +const volatile long rovar2; + +/* same "subprog" name in both files */ +static __noinline int subprog(int x) +{ + /* but different formula */ + return x * 3; +} + +SEC("raw_tp/sys_enter") +int handler2(const void *ctx) +{ + var2 = subprog(rovar2) + static_var2 + static_var22; + + return 0; +} + +/* different name and/or type of the variable doesn't matter */ +char _license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c index 9afe947cfae9..ba6eadfec565 100644 --- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c @@ -508,10 +508,8 @@ int _ip6geneve_get_tunnel(struct __sk_buff *skb) } ret = bpf_skb_get_tunnel_opt(skb, &gopt, sizeof(gopt)); - if (ret < 0) { - ERROR(ret); - return TC_ACT_SHOT; - } + if (ret < 0) + gopt.opt_class = 0; bpf_trace_printk(fmt, sizeof(fmt), key.tunnel_id, key.remote_ipv4, gopt.opt_class); diff --git a/tools/testing/selftests/bpf/test_xsk.sh b/tools/testing/selftests/bpf/test_xsk.sh index 56d4474e2c83..46633a3bfb0b 100755 --- a/tools/testing/selftests/bpf/test_xsk.sh +++ b/tools/testing/selftests/bpf/test_xsk.sh @@ -107,7 +107,7 @@ setup_vethPairs() { echo "setting up ${VETH0}: namespace: ${NS0}" fi ip netns add ${NS1} - ip link add ${VETH0} type veth peer name ${VETH1} + ip link add ${VETH0} numtxqueues 4 numrxqueues 4 type veth peer name ${VETH1} numtxqueues 4 numrxqueues 4 if [ -f /proc/net/if_inet6 ]; then echo 1 > /proc/sys/net/ipv6/conf/${VETH0}/disable_ipv6 fi @@ -118,6 +118,7 @@ setup_vethPairs() { ip netns exec ${NS1} ip link set ${VETH1} mtu ${MTU} ip link set ${VETH0} mtu ${MTU} ip netns exec ${NS1} ip link set ${VETH1} up + ip netns exec ${NS1} ip link set dev lo up ip link set ${VETH0} up } diff --git a/tools/testing/selftests/bpf/verifier/bounds_deduction.c b/tools/testing/selftests/bpf/verifier/bounds_deduction.c index 1fd07a4f27ac..c162498a64fc 100644 --- a/tools/testing/selftests/bpf/verifier/bounds_deduction.c +++ b/tools/testing/selftests/bpf/verifier/bounds_deduction.c @@ -6,8 +6,9 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .result = REJECT, + .errstr_unpriv = "R0 tried to sub from different maps, paths, or prohibited types", .errstr = "R0 tried to subtract pointer from scalar", + .result = REJECT, }, { "check deducing bounds from const, 2", @@ -20,6 +21,8 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_0), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R1 tried to sub from different maps, paths, or prohibited types", + .result_unpriv = REJECT, .result = ACCEPT, .retval = 1, }, @@ -31,8 +34,9 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .result = REJECT, + .errstr_unpriv = "R0 tried to sub from different maps, paths, or prohibited types", .errstr = "R0 tried to subtract pointer from scalar", + .result = REJECT, }, { "check deducing bounds from const, 4", @@ -45,6 +49,8 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_0), BPF_EXIT_INSN(), }, + 
.errstr_unpriv = "R1 tried to sub from different maps, paths, or prohibited types", + .result_unpriv = REJECT, .result = ACCEPT, }, { @@ -55,8 +61,9 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .result = REJECT, + .errstr_unpriv = "R0 tried to sub from different maps, paths, or prohibited types", .errstr = "R0 tried to subtract pointer from scalar", + .result = REJECT, }, { "check deducing bounds from const, 6", @@ -67,8 +74,9 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .result = REJECT, + .errstr_unpriv = "R0 tried to sub from different maps, paths, or prohibited types", .errstr = "R0 tried to subtract pointer from scalar", + .result = REJECT, }, { "check deducing bounds from const, 7", @@ -80,8 +88,9 @@ offsetof(struct __sk_buff, mark)), BPF_EXIT_INSN(), }, - .result = REJECT, + .errstr_unpriv = "R1 tried to sub from different maps, paths, or prohibited types", .errstr = "dereference of modified ctx ptr", + .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { @@ -94,8 +103,9 @@ offsetof(struct __sk_buff, mark)), BPF_EXIT_INSN(), }, - .result = REJECT, + .errstr_unpriv = "R1 tried to add from different maps, paths, or prohibited types", .errstr = "dereference of modified ctx ptr", + .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, { @@ -106,8 +116,9 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .result = REJECT, + .errstr_unpriv = "R0 tried to sub from different maps, paths, or prohibited types", .errstr = "R0 tried to subtract pointer from scalar", + .result = REJECT, }, { "check deducing bounds from const, 10", @@ -119,6 +130,6 @@ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .result = REJECT, .errstr = "math between ctx pointer and register with unbounded min value is not allowed", + .result = REJECT, }, diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index eb888c8479c3..336a749673d1 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -19,7 +19,7 @@ BPF_MOV64_IMM(BPF_REG_0, 2), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 1, @@ -136,7 +136,7 @@ { "calls: wrong src reg", .insns = { - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 2, 0, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 3, 0, 0), BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, @@ -397,7 +397,7 @@ BPF_MOV64_IMM(BPF_REG_0, 1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .fixup_map_hash_48b = { 3 }, .result_unpriv = REJECT, .result = ACCEPT, @@ -1977,7 +1977,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, }, @@ -2003,7 +2003,7 @@ BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SOCKET_FILTER, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .errstr = "!read_ok", .result = REJECT, }, @@ -2028,7 +2028,7 @@ BPF_EXIT_INSN(), }, .prog_type = 
BPF_PROG_TYPE_SOCKET_FILTER, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .errstr = "!read_ok", .result = REJECT, }, diff --git a/tools/testing/selftests/bpf/verifier/dead_code.c b/tools/testing/selftests/bpf/verifier/dead_code.c index 5cf361d8eb1c..17fe33a75034 100644 --- a/tools/testing/selftests/bpf/verifier/dead_code.c +++ b/tools/testing/selftests/bpf/verifier/dead_code.c @@ -85,7 +85,7 @@ BPF_MOV64_IMM(BPF_REG_0, 12), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 7, @@ -103,7 +103,7 @@ BPF_MOV64_IMM(BPF_REG_0, 12), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 7, @@ -121,7 +121,7 @@ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -5), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 7, @@ -137,7 +137,7 @@ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, @@ -152,7 +152,7 @@ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), BPF_EXIT_INSN(), }, - .errstr_unpriv = "function calls to other bpf functions are allowed for", + .errstr_unpriv = "loading/calling other bpf or kernel functions are allowed for", .result_unpriv = REJECT, .result = ACCEPT, .retval = 2, diff --git a/tools/testing/selftests/bpf/verifier/map_ptr.c b/tools/testing/selftests/bpf/verifier/map_ptr.c index b117bdd3806d..6f610cfddae5 100644 --- a/tools/testing/selftests/bpf/verifier/map_ptr.c +++ b/tools/testing/selftests/bpf/verifier/map_ptr.c @@ -75,6 +75,8 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_16b = { 4 }, + .result_unpriv = REJECT, + .errstr_unpriv = "R1 tried to add from different maps, paths, or prohibited types", .result = ACCEPT, }, { @@ -91,5 +93,7 @@ BPF_EXIT_INSN(), }, .fixup_map_hash_16b = { 4 }, + .result_unpriv = REJECT, + .errstr_unpriv = "R1 tried to add from different maps, paths, or prohibited types", .result = ACCEPT, }, diff --git a/tools/testing/selftests/bpf/verifier/unpriv.c b/tools/testing/selftests/bpf/verifier/unpriv.c index b018ad71e0a8..3e32400c4b44 100644 --- a/tools/testing/selftests/bpf/verifier/unpriv.c +++ b/tools/testing/selftests/bpf/verifier/unpriv.c @@ -497,7 +497,7 @@ .result = ACCEPT, }, { - "unpriv: adding of fp", + "unpriv: adding of fp, reg", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), BPF_MOV64_IMM(BPF_REG_1, 0), @@ -505,6 +505,19 @@ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, -8), BPF_EXIT_INSN(), }, + .errstr_unpriv = "R1 tried to add from different maps, paths, or prohibited types", + .result_unpriv = REJECT, + .result = ACCEPT, +}, +{ + "unpriv: adding of fp, imm", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, -8), + BPF_EXIT_INSN(), + }, .errstr_unpriv = "R1 stack pointer arithmetic goes out of range", 
.result_unpriv = REJECT, .result = ACCEPT, diff --git a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c index ed4e76b24649..feb91266db39 100644 --- a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c +++ b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c @@ -169,7 +169,7 @@ .fixup_map_array_48b = { 1 }, .result = ACCEPT, .result_unpriv = REJECT, - .errstr_unpriv = "R2 tried to add from different maps or paths", + .errstr_unpriv = "R2 tried to add from different maps, paths, or prohibited types", .retval = 0, }, { @@ -517,6 +517,27 @@ .retval = 0xabcdef12, }, { + "map access: value_ptr += N, value_ptr -= N known scalar", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), + BPF_MOV32_IMM(BPF_REG_1, 0x12345678), + BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 2), + BPF_MOV64_IMM(BPF_REG_1, 2), + BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_array_48b = { 3 }, + .result = ACCEPT, + .retval = 0x12345678, +}, +{ "map access: unknown scalar += value_ptr, 1", .insns = { BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), diff --git a/tools/testing/selftests/bpf/vmtest.sh b/tools/testing/selftests/bpf/vmtest.sh index 22554894db99..8889b3f55236 100755 --- a/tools/testing/selftests/bpf/vmtest.sh +++ b/tools/testing/selftests/bpf/vmtest.sh @@ -24,15 +24,15 @@ EXIT_STATUS_FILE="${LOG_FILE_BASE}.exit_status" usage() { cat <<EOF -Usage: $0 [-i] [-d <output_dir>] -- [<command>] +Usage: $0 [-i] [-s] [-d <output_dir>] -- [<command>] <command> is the command you would normally run when you are in tools/testing/selftests/bpf. e.g: $0 -- ./test_progs -t test_lsm -If no command is specified, "${DEFAULT_COMMAND}" will be run by -default. +If no command is specified and a debug shell (-s) is not requested, +"${DEFAULT_COMMAND}" will be run by default. If you build your kernel using KBUILD_OUTPUT= or O= options, these can be passed as environment variables to the script: @@ -49,6 +49,9 @@ Options: -d) Update the output directory (default: ${OUTPUT_DIR}) -j) Number of jobs for compilation, similar to -j in make (default: ${NUM_COMPILE_JOBS}) + -s) Instead of powering off the VM, start an interactive + shell. If <command> is specified, the shell runs after + the command finishes executing EOF } @@ -149,6 +152,7 @@ update_init_script() local init_script_dir="${OUTPUT_DIR}/${MOUNT_DIR}/etc/rcS.d" local init_script="${init_script_dir}/S50-startup" local command="$1" + local exit_command="$2" mount_image @@ -162,9 +166,10 @@ EOF fi - sudo bash -c "cat >${init_script}" <<EOF -#!/bin/bash + sudo bash -c "echo '#!/bin/bash' > ${init_script}" + if [[ "${command}" != "" ]]; then + sudo bash -c "cat >>${init_script}" <<EOF # Have a default value in the exit status file # incase the VM is forcefully stopped. echo "130" > "/root/${EXIT_STATUS_FILE}" @@ -175,9 +180,12 @@ echo "130" > "/root/${EXIT_STATUS_FILE}" stdbuf -oL -eL ${command} echo "\$?" 
> "/root/${EXIT_STATUS_FILE}" } 2>&1 | tee "/root/${LOG_FILE}" -poweroff -f +# Ensure that the logs are written to disk +sync EOF + fi + sudo bash -c "echo ${exit_command} >> ${init_script}" sudo chmod a+x "${init_script}" unmount_image } @@ -277,8 +285,10 @@ main() local kernel_bzimage="${kernel_checkout}/${X86_BZIMAGE}" local command="${DEFAULT_COMMAND}" local update_image="no" + local exit_command="poweroff -f" + local debug_shell="no" - while getopts 'hkid:j:' opt; do + while getopts 'hskid:j:' opt; do case ${opt} in i) update_image="yes" @@ -289,6 +299,11 @@ main() j) NUM_COMPILE_JOBS="$OPTARG" ;; + s) + command="" + debug_shell="yes" + exit_command="bash" + ;; h) usage exit 0 @@ -307,7 +322,7 @@ main() done shift $((OPTIND -1)) - if [[ $# -eq 0 ]]; then + if [[ $# -eq 0 && "${debug_shell}" == "no" ]]; then echo "No command specified, will run ${DEFAULT_COMMAND} in the vm" else command="$@" @@ -355,10 +370,12 @@ main() fi update_selftests "${kernel_checkout}" "${make_command}" - update_init_script "${command}" + update_init_script "${command}" "${exit_command}" run_vm "${kernel_bzimage}" - copy_logs - echo "Logs saved in ${OUTPUT_DIR}/${LOG_FILE}" + if [[ "${command}" != "" ]]; then + copy_logs + echo "Logs saved in ${OUTPUT_DIR}/${LOG_FILE}" + fi } catch() diff --git a/tools/testing/selftests/bpf/xdpxceiver.c b/tools/testing/selftests/bpf/xdpxceiver.c index 8b0f7fdd9003..1135fb980814 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.c +++ b/tools/testing/selftests/bpf/xdpxceiver.c @@ -41,8 +41,12 @@ * Reduce the size of the RX ring to a fraction of the fill ring size. * iv. fill queue empty * Do not populate the fill queue and then try to receive pkts. + * f. bpf_link resource persistence + * Configure sockets at indexes 0 and 1, run a traffic on queue ids 0, + * then remove xsk sockets from queue 0 on both veth interfaces and + * finally run a traffic on queues ids 1 * - * Total tests: 10 + * Total tests: 12 * * Flow: * ----- @@ -93,6 +97,13 @@ typedef __u16 __sum16; #include "xdpxceiver.h" #include "../kselftest.h" +static const char *MAC1 = "\x00\x0A\x56\x9E\xEE\x62"; +static const char *MAC2 = "\x00\x0A\x56\x9E\xEE\x61"; +static const char *IP1 = "192.168.100.162"; +static const char *IP2 = "192.168.100.161"; +static const u16 UDP_PORT1 = 2020; +static const u16 UDP_PORT2 = 2121; + static void __exit_with_error(int error, const char *file, const char *func, int line) { if (configured_mode == TEST_MODE_UNCONFIGURED) { @@ -108,27 +119,12 @@ static void __exit_with_error(int error, const char *file, const char *func, int #define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, __LINE__) #define print_ksft_result(void)\ - (ksft_test_result_pass("PASS: %s %s %s%s%s\n", configured_mode ? "DRV" : "SKB",\ + (ksft_test_result_pass("PASS: %s %s %s%s%s%s\n", configured_mode ? "DRV" : "SKB",\ test_type == TEST_TYPE_POLL ? "POLL" : "NOPOLL",\ test_type == TEST_TYPE_TEARDOWN ? "Socket Teardown" : "",\ test_type == TEST_TYPE_BIDI ? "Bi-directional Sockets" : "",\ - test_type == TEST_TYPE_STATS ? 
"Stats" : "")) - -static void pthread_init_mutex(void) -{ - pthread_mutex_init(&sync_mutex, NULL); - pthread_mutex_init(&sync_mutex_tx, NULL); - pthread_cond_init(&signal_rx_condition, NULL); - pthread_cond_init(&signal_tx_condition, NULL); -} - -static void pthread_destroy_mutex(void) -{ - pthread_mutex_destroy(&sync_mutex); - pthread_mutex_destroy(&sync_mutex_tx); - pthread_cond_destroy(&signal_rx_condition); - pthread_cond_destroy(&signal_tx_condition); -} + test_type == TEST_TYPE_STATS ? "Stats" : "",\ + test_type == TEST_TYPE_BPF_RES ? "BPF RES" : "")) static void *memset32_htonl(void *dest, u32 val, u32 size) { @@ -147,24 +143,11 @@ static void *memset32_htonl(void *dest, u32 val, u32 size) } /* - * This function code has been taken from - * Linux kernel lib/checksum.c - */ -static inline unsigned short from32to16(unsigned int x) -{ - /* add up 16-bit and 16-bit for 16+c bit */ - x = (x & 0xffff) + (x >> 16); - /* add up carry.. */ - x = (x & 0xffff) + (x >> 16); - return x; -} - -/* * Fold a partial checksum * This function code has been taken from * Linux kernel include/asm-generic/checksum.h */ -static inline __u16 csum_fold(__u32 csum) +static __u16 csum_fold(__u32 csum) { u32 sum = (__force u32)csum; @@ -177,7 +160,7 @@ static inline __u16 csum_fold(__u32 csum) * This function code has been taken from * Linux kernel lib/checksum.c */ -static inline u32 from64to32(u64 x) +static u32 from64to32(u64 x) { /* add up 32-bit and 32-bit for 32+c bit */ x = (x & 0xffffffff) + (x >> 32); @@ -186,13 +169,11 @@ static inline u32 from64to32(u64 x) return (u32)x; } -__u32 csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum); - /* * This function code has been taken from * Linux kernel lib/checksum.c */ -__u32 csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum) +static __u32 csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum) { unsigned long long s = (__force u32)sum; @@ -210,13 +191,12 @@ __u32 csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u3 * This function has been taken from * Linux kernel include/asm-generic/checksum.h */ -static inline __u16 -csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum) +static __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __u32 sum) { return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); } -static inline u16 udp_csum(u32 saddr, u32 daddr, u32 len, u8 proto, u16 *udp_pkt) +static u16 udp_csum(u32 saddr, u32 daddr, u32 len, u8 proto, u16 *udp_pkt) { u32 csum = 0; u32 cnt = 0; @@ -271,9 +251,8 @@ static void gen_eth_frame(struct xsk_umem_info *umem, u64 addr) memcpy(xsk_umem__get_data(umem->buffer, addr), pkt_data, PKT_SIZE); } -static void xsk_configure_umem(struct ifobject *data, void *buffer, u64 size) +static void xsk_configure_umem(struct ifobject *data, void *buffer, int idx) { - int ret; struct xsk_umem_config cfg = { .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, @@ -281,23 +260,28 @@ static void xsk_configure_umem(struct ifobject *data, void *buffer, u64 size) .frame_headroom = frame_headroom, .flags = XSK_UMEM__DEFAULT_FLAGS }; + int size = num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE; + struct xsk_umem_info *umem; + int ret; - data->umem = calloc(1, sizeof(struct xsk_umem_info)); - if (!data->umem) + umem = calloc(1, sizeof(struct xsk_umem_info)); + if (!umem) exit_with_error(errno); - ret = xsk_umem__create(&data->umem->umem, 
buffer, size, - &data->umem->fq, &data->umem->cq, &cfg); + ret = xsk_umem__create(&umem->umem, buffer, size, + &umem->fq, &umem->cq, &cfg); if (ret) exit_with_error(ret); - data->umem->buffer = buffer; + umem->buffer = buffer; + + data->umem_arr[idx] = umem; } static void xsk_populate_fill_ring(struct xsk_umem_info *umem) { int ret, i; - u32 idx; + u32 idx = 0; ret = xsk_ring_prod__reserve(&umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS, &idx); if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS) @@ -307,18 +291,19 @@ static void xsk_populate_fill_ring(struct xsk_umem_info *umem) xsk_ring_prod__submit(&umem->fq, XSK_RING_PROD__DEFAULT_NUM_DESCS); } -static int xsk_configure_socket(struct ifobject *ifobject) +static int xsk_configure_socket(struct ifobject *ifobject, int idx) { struct xsk_socket_config cfg; + struct xsk_socket_info *xsk; struct xsk_ring_cons *rxr; struct xsk_ring_prod *txr; int ret; - ifobject->xsk = calloc(1, sizeof(struct xsk_socket_info)); - if (!ifobject->xsk) + xsk = calloc(1, sizeof(struct xsk_socket_info)); + if (!xsk) exit_with_error(errno); - ifobject->xsk->umem = ifobject->umem; + xsk->umem = ifobject->umem; cfg.rx_size = rxqsize; cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; cfg.libbpf_flags = 0; @@ -326,19 +311,20 @@ static int xsk_configure_socket(struct ifobject *ifobject) cfg.bind_flags = xdp_bind_flags; if (test_type != TEST_TYPE_BIDI) { - rxr = (ifobject->fv.vector == rx) ? &ifobject->xsk->rx : NULL; - txr = (ifobject->fv.vector == tx) ? &ifobject->xsk->tx : NULL; + rxr = (ifobject->fv.vector == rx) ? &xsk->rx : NULL; + txr = (ifobject->fv.vector == tx) ? &xsk->tx : NULL; } else { - rxr = &ifobject->xsk->rx; - txr = &ifobject->xsk->tx; + rxr = &xsk->rx; + txr = &xsk->tx; } - ret = xsk_socket__create(&ifobject->xsk->xsk, ifobject->ifname, - opt_queue, ifobject->umem->umem, rxr, txr, &cfg); - + ret = xsk_socket__create(&xsk->xsk, ifobject->ifname, idx, + ifobject->umem->umem, rxr, txr, &cfg); if (ret) return 1; + ifobject->xsk_arr[idx] = xsk; + return 0; } @@ -364,12 +350,15 @@ static void usage(const char *prog) ksft_print_msg(str, prog); } -static bool switch_namespace(int idx) +static int switch_namespace(const char *nsname) { char fqns[26] = "/var/run/netns/"; int nsfd; - strncat(fqns, ifdict[idx]->nsname, sizeof(fqns) - strlen(fqns) - 1); + if (!nsname || strlen(nsname) == 0) + return -1; + + strncat(fqns, nsname, sizeof(fqns) - strlen(fqns) - 1); nsfd = open(fqns, O_RDONLY); if (nsfd == -1) @@ -378,26 +367,9 @@ static bool switch_namespace(int idx) if (setns(nsfd, 0) == -1) exit_with_error(errno); - return true; -} - -static void *nsswitchthread(void *args) -{ - struct targs *targs = args; + print_verbose("NS switched: %s\n", nsname); - targs->retptr = false; - - if (switch_namespace(targs->idx)) { - ifdict[targs->idx]->ifindex = if_nametoindex(ifdict[targs->idx]->ifname); - if (!ifdict[targs->idx]->ifindex) { - ksft_test_result_fail("ERROR: [%s] interface \"%s\" does not exist\n", - __func__, ifdict[targs->idx]->ifname); - } else { - print_verbose("Interface found: %s\n", ifdict[targs->idx]->ifname); - targs->retptr = true; - } - } - pthread_exit(NULL); + return nsfd; } static int validate_interfaces(void) @@ -409,33 +381,6 @@ static int validate_interfaces(void) ret = false; ksft_test_result_fail("ERROR: interfaces: -i <int>,<ns> -i <int>,<ns>."); } - if (strcmp(ifdict[i]->nsname, "")) { - struct targs *targs; - - targs = malloc(sizeof(*targs)); - if (!targs) - exit_with_error(errno); - - targs->idx = i; - if (pthread_create(&ns_thread, NULL, nsswitchthread, 
targs)) - exit_with_error(errno); - - pthread_join(ns_thread, NULL); - - if (targs->retptr) - print_verbose("NS switched: %s\n", ifdict[i]->nsname); - - free(targs); - } else { - ifdict[i]->ifindex = if_nametoindex(ifdict[i]->ifname); - if (!ifdict[i]->ifindex) { - ksft_test_result_fail - ("ERROR: interface \"%s\" does not exist\n", ifdict[i]->ifname); - ret = false; - } else { - print_verbose("Interface found: %s\n", ifdict[i]->ifname); - } - } } return ret; } @@ -447,7 +392,7 @@ static void parse_command_line(int argc, char **argv) opterr = 0; for (;;) { - c = getopt_long(argc, argv, "i:q:DC:v", long_options, &option_index); + c = getopt_long(argc, argv, "i:DC:v", long_options, &option_index); if (c == -1) break; @@ -467,9 +412,6 @@ static void parse_command_line(int argc, char **argv) MAX_INTERFACES_NAMESPACE_CHARS); interface_index++; break; - case 'q': - opt_queue = atoi(optarg); - break; case 'D': debug_pkt_dump = 1; break; @@ -506,7 +448,7 @@ static void kick_tx(struct xsk_socket_info *xsk) exit_with_error(errno); } -static inline void complete_tx_only(struct xsk_socket_info *xsk, int batch_size) +static void complete_tx_only(struct xsk_socket_info *xsk, int batch_size) { unsigned int rcvd; u32 idx; @@ -514,7 +456,7 @@ static inline void complete_tx_only(struct xsk_socket_info *xsk, int batch_size) if (!xsk->outstanding_tx) return; - if (!NEED_WAKEUP || xsk_ring_prod__needs_wakeup(&xsk->tx)) + if (xsk_ring_prod__needs_wakeup(&xsk->tx)) kick_tx(xsk); rcvd = xsk_ring_cons__peek(&xsk->umem->cq, batch_size, &idx); @@ -584,7 +526,7 @@ static void rx_pkt(struct xsk_socket_info *xsk, struct pollfd *fds) static void tx_only(struct xsk_socket_info *xsk, u32 *frameptr, int batch_size) { - u32 idx; + u32 idx = 0; unsigned int i; bool tx_invalid_test = stat_test_type == STAT_TEST_TX_INVALID; u32 len = tx_invalid_test ? 
XSK_UMEM__DEFAULT_FRAME_SIZE + 1 : PKT_SIZE; @@ -602,16 +544,15 @@ static void tx_only(struct xsk_socket_info *xsk, u32 *frameptr, int batch_size) xsk_ring_prod__submit(&xsk->tx, batch_size); if (!tx_invalid_test) { xsk->outstanding_tx += batch_size; - } else { - if (!NEED_WAKEUP || xsk_ring_prod__needs_wakeup(&xsk->tx)) - kick_tx(xsk); + } else if (xsk_ring_prod__needs_wakeup(&xsk->tx)) { + kick_tx(xsk); } *frameptr += batch_size; *frameptr %= num_frames; complete_tx_only(xsk, batch_size); } -static inline int get_batch_size(int pkt_cnt) +static int get_batch_size(int pkt_cnt) { if (!opt_pkt_count) return BATCH_SIZE; @@ -667,45 +608,40 @@ static void tx_only_all(struct ifobject *ifobject) static void worker_pkt_dump(void) { - struct in_addr ipaddr; + struct ethhdr *ethhdr; + struct iphdr *iphdr; + struct udphdr *udphdr; + char s[128]; + int payload; + void *ptr; fprintf(stdout, "---------------------------------------\n"); for (int iter = 0; iter < num_frames - 1; iter++) { + ptr = pkt_buf[iter]->payload; + ethhdr = ptr; + iphdr = ptr + sizeof(*ethhdr); + udphdr = ptr + sizeof(*ethhdr) + sizeof(*iphdr); + /*extract L2 frame */ fprintf(stdout, "DEBUG>> L2: dst mac: "); for (int i = 0; i < ETH_ALEN; i++) - fprintf(stdout, "%02X", ((struct ethhdr *) - pkt_buf[iter]->payload)->h_dest[i]); + fprintf(stdout, "%02X", ethhdr->h_dest[i]); fprintf(stdout, "\nDEBUG>> L2: src mac: "); for (int i = 0; i < ETH_ALEN; i++) - fprintf(stdout, "%02X", ((struct ethhdr *) - pkt_buf[iter]->payload)->h_source[i]); + fprintf(stdout, "%02X", ethhdr->h_source[i]); /*extract L3 frame */ - fprintf(stdout, "\nDEBUG>> L3: ip_hdr->ihl: %02X\n", - ((struct iphdr *)(pkt_buf[iter]->payload + sizeof(struct ethhdr)))->ihl); - - ipaddr.s_addr = - ((struct iphdr *)(pkt_buf[iter]->payload + sizeof(struct ethhdr)))->saddr; - fprintf(stdout, "DEBUG>> L3: ip_hdr->saddr: %s\n", inet_ntoa(ipaddr)); - - ipaddr.s_addr = - ((struct iphdr *)(pkt_buf[iter]->payload + sizeof(struct ethhdr)))->daddr; - fprintf(stdout, "DEBUG>> L3: ip_hdr->daddr: %s\n", inet_ntoa(ipaddr)); - + fprintf(stdout, "\nDEBUG>> L3: ip_hdr->ihl: %02X\n", iphdr->ihl); + fprintf(stdout, "DEBUG>> L3: ip_hdr->saddr: %s\n", + inet_ntop(AF_INET, &iphdr->saddr, s, sizeof(s))); + fprintf(stdout, "DEBUG>> L3: ip_hdr->daddr: %s\n", + inet_ntop(AF_INET, &iphdr->daddr, s, sizeof(s))); /*extract L4 frame */ - fprintf(stdout, "DEBUG>> L4: udp_hdr->src: %d\n", - ntohs(((struct udphdr *)(pkt_buf[iter]->payload + - sizeof(struct ethhdr) + - sizeof(struct iphdr)))->source)); - - fprintf(stdout, "DEBUG>> L4: udp_hdr->dst: %d\n", - ntohs(((struct udphdr *)(pkt_buf[iter]->payload + - sizeof(struct ethhdr) + - sizeof(struct iphdr)))->dest)); + fprintf(stdout, "DEBUG>> L4: udp_hdr->src: %d\n", ntohs(udphdr->source)); + fprintf(stdout, "DEBUG>> L4: udp_hdr->dst: %d\n", ntohs(udphdr->dest)); /*extract L5 frame */ - int payload = *((uint32_t *)(pkt_buf[iter]->payload + PKT_HDR_SIZE)); + payload = *((uint32_t *)(ptr + PKT_HDR_SIZE)); if (payload == EOT) { print_verbose("End-of-transmission frame received\n"); @@ -809,37 +745,69 @@ static void worker_pkt_validate(void) } } -static void thread_common_ops(struct ifobject *ifobject, void *bufs, pthread_mutex_t *mutexptr, - atomic_int *spinningptr) +static void thread_common_ops(struct ifobject *ifobject, void *bufs) { + int umem_sz = num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE; int ctr = 0; int ret; - xsk_configure_umem(ifobject, bufs, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE); - ret = xsk_configure_socket(ifobject); + ifobject->ns_fd = 
switch_namespace(ifobject->nsname); + + if (test_type == TEST_TYPE_BPF_RES) + umem_sz *= 2; + + bufs = mmap(NULL, umem_sz, + PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (bufs == MAP_FAILED) + exit_with_error(errno); + + xsk_configure_umem(ifobject, bufs, 0); + ifobject->umem = ifobject->umem_arr[0]; + ret = xsk_configure_socket(ifobject, 0); /* Retry Create Socket if it fails as xsk_socket__create() * is asynchronous - * - * Essential to lock Mutex here to prevent Tx thread from - * entering before Rx and causing a deadlock */ - pthread_mutex_lock(mutexptr); while (ret && ctr < SOCK_RECONF_CTR) { - atomic_store(spinningptr, 1); - xsk_configure_umem(ifobject, bufs, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE); - ret = xsk_configure_socket(ifobject); + xsk_configure_umem(ifobject, bufs, 0); + ifobject->umem = ifobject->umem_arr[0]; + ret = xsk_configure_socket(ifobject, 0); usleep(USLEEP_MAX); ctr++; } - atomic_store(spinningptr, 0); - pthread_mutex_unlock(mutexptr); if (ctr >= SOCK_RECONF_CTR) exit_with_error(ret); + + ifobject->umem = ifobject->umem_arr[0]; + ifobject->xsk = ifobject->xsk_arr[0]; + + if (test_type == TEST_TYPE_BPF_RES) { + xsk_configure_umem(ifobject, (u8 *)bufs + (umem_sz / 2), 1); + ifobject->umem = ifobject->umem_arr[1]; + ret = xsk_configure_socket(ifobject, 1); + } + + ifobject->umem = ifobject->umem_arr[0]; + ifobject->xsk = ifobject->xsk_arr[0]; + print_verbose("Interface [%s] vector [%s]\n", + ifobject->ifname, ifobject->fv.vector == tx ? "Tx" : "Rx"); } -static void *worker_testapp_validate(void *arg) +static bool testapp_is_test_two_stepped(void) +{ + return (test_type != TEST_TYPE_BIDI && test_type != TEST_TYPE_BPF_RES) || second_step; +} + +static void testapp_cleanup_xsk_res(struct ifobject *ifobj) +{ + if (testapp_is_test_two_stepped()) { + xsk_socket__delete(ifobj->xsk->xsk); + (void)xsk_umem__delete(ifobj->umem->umem); + } +} + +static void *worker_testapp_validate_tx(void *arg) { struct udphdr *udp_hdr = (struct udphdr *)(pkt_data + sizeof(struct ethhdr) + sizeof(struct iphdr)); @@ -849,157 +817,97 @@ static void *worker_testapp_validate(void *arg) struct generic_data data; void *bufs = NULL; - pthread_attr_setstacksize(&attr, THREAD_STACK); - - if (!bidi_pass) { - bufs = mmap(NULL, num_frames * XSK_UMEM__DEFAULT_FRAME_SIZE, - PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (bufs == MAP_FAILED) - exit_with_error(errno); - - if (strcmp(ifobject->nsname, "")) - switch_namespace(ifobject->ifdict_index); + if (!second_step) + thread_common_ops(ifobject, bufs); + + for (int i = 0; i < num_frames; i++) { + /*send EOT frame */ + if (i == (num_frames - 1)) + data.seqnum = -1; + else + data.seqnum = i; + gen_udp_hdr(&data, ifobject, udp_hdr); + gen_ip_hdr(ifobject, ip_hdr); + gen_udp_csum(udp_hdr, ip_hdr); + gen_eth_hdr(ifobject, eth_hdr); + gen_eth_frame(ifobject->umem, i * XSK_UMEM__DEFAULT_FRAME_SIZE); } - if (ifobject->fv.vector == tx) { - int spinningrxctr = 0; - - if (!bidi_pass) - thread_common_ops(ifobject, bufs, &sync_mutex_tx, &spinning_tx); + print_verbose("Sending %d packets on interface %s\n", + (opt_pkt_count - 1), ifobject->ifname); + tx_only_all(ifobject); - while (atomic_load(&spinning_rx) && spinningrxctr < SOCK_RECONF_CTR) { - spinningrxctr++; - usleep(USLEEP_MAX); - } + testapp_cleanup_xsk_res(ifobject); + pthread_exit(NULL); +} - print_verbose("Interface [%s] vector [Tx]\n", ifobject->ifname); - for (int i = 0; i < num_frames; i++) { - /*send EOT frame */ - if (i == (num_frames - 1)) - data.seqnum = -1; - 
else - data.seqnum = i; - gen_udp_hdr(&data, ifobject, udp_hdr); - gen_ip_hdr(ifobject, ip_hdr); - gen_udp_csum(udp_hdr, ip_hdr); - gen_eth_hdr(ifobject, eth_hdr); - gen_eth_frame(ifobject->umem, i * XSK_UMEM__DEFAULT_FRAME_SIZE); - } +static void *worker_testapp_validate_rx(void *arg) +{ + struct ifobject *ifobject = (struct ifobject *)arg; + struct pollfd fds[MAX_SOCKS] = { }; + void *bufs = NULL; - print_verbose("Sending %d packets on interface %s\n", - (opt_pkt_count - 1), ifobject->ifname); - tx_only_all(ifobject); - } else if (ifobject->fv.vector == rx) { - struct pollfd fds[MAX_SOCKS] = { }; - int ret; - - if (!bidi_pass) - thread_common_ops(ifobject, bufs, &sync_mutex_tx, &spinning_rx); - - print_verbose("Interface [%s] vector [Rx]\n", ifobject->ifname); - if (stat_test_type != STAT_TEST_RX_FILL_EMPTY) - xsk_populate_fill_ring(ifobject->umem); - - TAILQ_INIT(&head); - if (debug_pkt_dump) { - pkt_buf = calloc(num_frames, sizeof(*pkt_buf)); - if (!pkt_buf) - exit_with_error(errno); - } + if (!second_step) + thread_common_ops(ifobject, bufs); - fds[0].fd = xsk_socket__fd(ifobject->xsk->xsk); - fds[0].events = POLLIN; + if (stat_test_type != STAT_TEST_RX_FILL_EMPTY) + xsk_populate_fill_ring(ifobject->umem); - pthread_mutex_lock(&sync_mutex); - pthread_cond_signal(&signal_rx_condition); - pthread_mutex_unlock(&sync_mutex); + TAILQ_INIT(&head); + if (debug_pkt_dump) { + pkt_buf = calloc(num_frames, sizeof(*pkt_buf)); + if (!pkt_buf) + exit_with_error(errno); + } - while (1) { - if (test_type == TEST_TYPE_POLL) { - ret = poll(fds, 1, POLL_TMOUT); - if (ret <= 0) - continue; - } + fds[0].fd = xsk_socket__fd(ifobject->xsk->xsk); + fds[0].events = POLLIN; - if (test_type != TEST_TYPE_STATS) { - rx_pkt(ifobject->xsk, fds); - worker_pkt_validate(); - } else { - worker_stats_validate(ifobject); - } + pthread_barrier_wait(&barr); - if (sigvar) - break; + while (1) { + if (test_type != TEST_TYPE_STATS) { + rx_pkt(ifobject->xsk, fds); + worker_pkt_validate(); + } else { + worker_stats_validate(ifobject); } + if (sigvar) + break; + } - if (test_type != TEST_TYPE_STATS) - print_verbose("Received %d packets on interface %s\n", - pkt_counter, ifobject->ifname); + print_verbose("Received %d packets on interface %s\n", + pkt_counter, ifobject->ifname); - if (test_type == TEST_TYPE_TEARDOWN) - print_verbose("Destroying socket\n"); - } + if (test_type == TEST_TYPE_TEARDOWN) + print_verbose("Destroying socket\n"); - if ((test_type != TEST_TYPE_BIDI) || bidi_pass) { - xsk_socket__delete(ifobject->xsk->xsk); - (void)xsk_umem__delete(ifobject->umem->umem); - } + testapp_cleanup_xsk_res(ifobject); pthread_exit(NULL); } static void testapp_validate(void) { - struct timespec max_wait = { 0, 0 }; bool bidi = test_type == TEST_TYPE_BIDI; + bool bpf = test_type == TEST_TYPE_BPF_RES; - pthread_attr_init(&attr); - pthread_attr_setstacksize(&attr, THREAD_STACK); - - if ((test_type == TEST_TYPE_BIDI) && bidi_pass) { - pthread_init_mutex(); - if (!switching_notify) { - print_verbose("Switching Tx/Rx vectors\n"); - switching_notify++; - } - } - - pthread_mutex_lock(&sync_mutex); + if (pthread_barrier_init(&barr, NULL, 2)) + exit_with_error(errno); /*Spawn RX thread */ - if (!bidi || !bidi_pass) { - if (pthread_create(&t0, &attr, worker_testapp_validate, ifdict[1])) - exit_with_error(errno); - } else if (bidi && bidi_pass) { - /*switch Tx/Rx vectors */ - ifdict[0]->fv.vector = rx; - if (pthread_create(&t0, &attr, worker_testapp_validate, ifdict[0])) - exit_with_error(errno); - } + pthread_create(&t0, NULL, 
ifdict_rx->func_ptr, ifdict_rx); - if (clock_gettime(CLOCK_REALTIME, &max_wait)) + pthread_barrier_wait(&barr); + if (pthread_barrier_destroy(&barr)) exit_with_error(errno); - max_wait.tv_sec += TMOUT_SEC; - - if (pthread_cond_timedwait(&signal_rx_condition, &sync_mutex, &max_wait) == ETIMEDOUT) - exit_with_error(errno); - - pthread_mutex_unlock(&sync_mutex); /*Spawn TX thread */ - if (!bidi || !bidi_pass) { - if (pthread_create(&t1, &attr, worker_testapp_validate, ifdict[0])) - exit_with_error(errno); - } else if (bidi && bidi_pass) { - /*switch Tx/Rx vectors */ - ifdict[1]->fv.vector = tx; - if (pthread_create(&t1, &attr, worker_testapp_validate, ifdict[1])) - exit_with_error(errno); - } + pthread_create(&t1, NULL, ifdict_tx->func_ptr, ifdict_tx); pthread_join(t1, NULL); pthread_join(t0, NULL); - if (debug_pkt_dump) { + if (debug_pkt_dump && test_type != TEST_TYPE_STATS) { worker_pkt_dump(); for (int iter = 0; iter < num_frames - 1; iter++) { free(pkt_buf[iter]->payload); @@ -1008,20 +916,85 @@ static void testapp_validate(void) free(pkt_buf); } - if (!(test_type == TEST_TYPE_TEARDOWN) && !bidi && !(test_type == TEST_TYPE_STATS)) + if (!(test_type == TEST_TYPE_TEARDOWN) && !bidi && !bpf && !(test_type == TEST_TYPE_STATS)) print_ksft_result(); } -static void testapp_sockets(void) +static void testapp_teardown(void) +{ + int i; + + for (i = 0; i < MAX_TEARDOWN_ITER; i++) { + pkt_counter = 0; + prev_pkt = -1; + sigvar = 0; + print_verbose("Creating socket\n"); + testapp_validate(); + } + + print_ksft_result(); +} + +static void swap_vectors(struct ifobject *ifobj1, struct ifobject *ifobj2) +{ + void *(*tmp_func_ptr)(void *) = ifobj1->func_ptr; + enum fvector tmp_vector = ifobj1->fv.vector; + + ifobj1->func_ptr = ifobj2->func_ptr; + ifobj1->fv.vector = ifobj2->fv.vector; + + ifobj2->func_ptr = tmp_func_ptr; + ifobj2->fv.vector = tmp_vector; + + ifdict_tx = ifobj1; + ifdict_rx = ifobj2; +} + +static void testapp_bidi(void) +{ + for (int i = 0; i < MAX_BIDI_ITER; i++) { + pkt_counter = 0; + prev_pkt = -1; + sigvar = 0; + print_verbose("Creating socket\n"); + testapp_validate(); + if (!second_step) { + print_verbose("Switching Tx/Rx vectors\n"); + swap_vectors(ifdict[1], ifdict[0]); + } + second_step = true; + } + + swap_vectors(ifdict[0], ifdict[1]); + + print_ksft_result(); +} + +static void swap_xsk_res(void) +{ + xsk_socket__delete(ifdict_tx->xsk->xsk); + xsk_umem__delete(ifdict_tx->umem->umem); + xsk_socket__delete(ifdict_rx->xsk->xsk); + xsk_umem__delete(ifdict_rx->umem->umem); + ifdict_tx->umem = ifdict_tx->umem_arr[1]; + ifdict_tx->xsk = ifdict_tx->xsk_arr[1]; + ifdict_rx->umem = ifdict_rx->umem_arr[1]; + ifdict_rx->xsk = ifdict_rx->xsk_arr[1]; +} + +static void testapp_bpf_res(void) { - for (int i = 0; i < ((test_type == TEST_TYPE_TEARDOWN) ? MAX_TEARDOWN_ITER : MAX_BIDI_ITER); - i++) { + int i; + + for (i = 0; i < MAX_BPF_ITER; i++) { pkt_counter = 0; prev_pkt = -1; sigvar = 0; print_verbose("Creating socket\n"); testapp_validate(); - test_type == TEST_TYPE_BIDI ? 
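
The mutex/condition-variable handshake (and its CLOCK_REALTIME timeout) is replaced above by a two-party pthread barrier: the Rx worker and the spawning thread both reach the barrier before the Tx worker is created. Here is a minimal sketch of that rendezvous, not part of the patch; build with -pthread.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_barrier_t barr;

static void *rx_worker(void *arg)
{
	/* ... socket/umem setup would go here ... */
	pthread_barrier_wait(&barr);	/* signal "Rx is ready" */
	return NULL;
}

static void *tx_worker(void *arg)
{
	/* safe to transmit: Rx setup completed before this thread existed */
	return NULL;
}

int main(void)
{
	pthread_t rx, tx;

	if (pthread_barrier_init(&barr, NULL, 2))
		exit(EXIT_FAILURE);

	pthread_create(&rx, NULL, rx_worker, NULL);
	pthread_barrier_wait(&barr);	/* blocks until rx_worker arrives */
	pthread_barrier_destroy(&barr);

	pthread_create(&tx, NULL, tx_worker, NULL);
	pthread_join(tx, NULL);
	pthread_join(rx, NULL);
	return 0;
}
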
bidi_pass++ : bidi_pass; + if (!second_step) + swap_xsk_res(); + second_step = true; } print_ksft_result(); @@ -1053,78 +1026,33 @@ static void testapp_stats(void) print_ksft_result(); } -static void init_iface_config(struct ifaceconfigobj *ifaceconfig) +static void init_iface(struct ifobject *ifobj, const char *dst_mac, + const char *src_mac, const char *dst_ip, + const char *src_ip, const u16 dst_port, + const u16 src_port, enum fvector vector) { - /*Init interface0 */ - ifdict[0]->fv.vector = tx; - memcpy(ifdict[0]->dst_mac, ifaceconfig->dst_mac, ETH_ALEN); - memcpy(ifdict[0]->src_mac, ifaceconfig->src_mac, ETH_ALEN); - ifdict[0]->dst_ip = ifaceconfig->dst_ip.s_addr; - ifdict[0]->src_ip = ifaceconfig->src_ip.s_addr; - ifdict[0]->dst_port = ifaceconfig->dst_port; - ifdict[0]->src_port = ifaceconfig->src_port; - - /*Init interface1 */ - ifdict[1]->fv.vector = rx; - memcpy(ifdict[1]->dst_mac, ifaceconfig->src_mac, ETH_ALEN); - memcpy(ifdict[1]->src_mac, ifaceconfig->dst_mac, ETH_ALEN); - ifdict[1]->dst_ip = ifaceconfig->src_ip.s_addr; - ifdict[1]->src_ip = ifaceconfig->dst_ip.s_addr; - ifdict[1]->dst_port = ifaceconfig->src_port; - ifdict[1]->src_port = ifaceconfig->dst_port; -} + struct in_addr ip; -static void *nsdisablemodethread(void *args) -{ - struct targs *targs = args; + memcpy(ifobj->dst_mac, dst_mac, ETH_ALEN); + memcpy(ifobj->src_mac, src_mac, ETH_ALEN); - targs->retptr = false; + inet_aton(dst_ip, &ip); + ifobj->dst_ip = ip.s_addr; - if (switch_namespace(targs->idx)) { - targs->retptr = bpf_set_link_xdp_fd(ifdict[targs->idx]->ifindex, -1, targs->flags); - } else { - targs->retptr = errno; - print_verbose("Failed to switch namespace to %s\n", ifdict[targs->idx]->nsname); - } - - pthread_exit(NULL); -} - -static void disable_xdp_mode(int mode) -{ - int err = 0; - __u32 flags = XDP_FLAGS_UPDATE_IF_NOEXIST | mode; - char *mode_str = mode & XDP_FLAGS_SKB_MODE ? "skb" : "drv"; + inet_aton(src_ip, &ip); + ifobj->src_ip = ip.s_addr; - for (int i = 0; i < MAX_INTERFACES; i++) { - if (strcmp(ifdict[i]->nsname, "")) { - struct targs *targs; - - targs = malloc(sizeof(*targs)); - memset(targs, 0, sizeof(*targs)); - if (!targs) - exit_with_error(errno); - - targs->idx = i; - targs->flags = flags; - if (pthread_create(&ns_thread, NULL, nsdisablemodethread, targs)) - exit_with_error(errno); - - pthread_join(ns_thread, NULL); - err = targs->retptr; - free(targs); - } else { - err = bpf_set_link_xdp_fd(ifdict[i]->ifindex, -1, flags); - } - - if (err) { - print_verbose("Failed to disable %s mode on interface %s\n", - mode_str, ifdict[i]->ifname); - exit_with_error(err); - } + ifobj->dst_port = dst_port; + ifobj->src_port = src_port; - print_verbose("Disabled %s mode for interface: %s\n", mode_str, ifdict[i]->ifname); - configured_mode = mode & XDP_FLAGS_SKB_MODE ? 
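
init_iface() relies on inet_aton() to turn the dotted-quad constants into network-byte-order addresses. A tiny stand-alone example of that conversion follows; it is illustrative only and, unlike the test, it checks the return value, since inet_aton() returns 0 on malformed input.

#include <stdio.h>
#include <stdlib.h>
#include <arpa/inet.h>

int main(void)
{
	struct in_addr ip;

	if (!inet_aton("192.168.100.162", &ip)) {
		fprintf(stderr, "invalid address\n");
		return EXIT_FAILURE;
	}

	/* s_addr is network byte order, ready to drop into an iphdr */
	printf("host order: 0x%08x (%s)\n", ntohl(ip.s_addr), inet_ntoa(ip));
	return 0;
}
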
TEST_MODE_DRV : TEST_MODE_SKB; + if (vector == tx) { + ifobj->fv.vector = tx; + ifobj->func_ptr = worker_testapp_validate_tx; + ifdict_tx = ifobj; + } else { + ifobj->fv.vector = rx; + ifobj->func_ptr = worker_testapp_validate_rx; + ifdict_rx = ifobj; } } @@ -1135,72 +1063,70 @@ static void run_pkt_test(int mode, int type) /* reset defaults after potential previous test */ xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST; pkt_counter = 0; - switching_notify = 0; - bidi_pass = 0; + second_step = 0; prev_pkt = -1; - ifdict[0]->fv.vector = tx; - ifdict[1]->fv.vector = rx; sigvar = 0; stat_test_type = -1; rxqsize = XSK_RING_CONS__DEFAULT_NUM_DESCS; frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM; + configured_mode = mode; + switch (mode) { case (TEST_MODE_SKB): - if (configured_mode == TEST_MODE_DRV) - disable_xdp_mode(XDP_FLAGS_DRV_MODE); xdp_flags |= XDP_FLAGS_SKB_MODE; break; case (TEST_MODE_DRV): - if (configured_mode == TEST_MODE_SKB) - disable_xdp_mode(XDP_FLAGS_SKB_MODE); xdp_flags |= XDP_FLAGS_DRV_MODE; break; default: break; } - pthread_init_mutex(); - - if (test_type == TEST_TYPE_STATS) + switch (test_type) { + case TEST_TYPE_STATS: testapp_stats(); - else if ((test_type != TEST_TYPE_TEARDOWN) && (test_type != TEST_TYPE_BIDI)) + break; + case TEST_TYPE_TEARDOWN: + testapp_teardown(); + break; + case TEST_TYPE_BIDI: + testapp_bidi(); + break; + case TEST_TYPE_BPF_RES: + testapp_bpf_res(); + break; + default: testapp_validate(); - else - testapp_sockets(); - - pthread_destroy_mutex(); + break; + } } int main(int argc, char **argv) { struct rlimit _rlim = { RLIM_INFINITY, RLIM_INFINITY }; + bool failure = false; + int i, j; if (setrlimit(RLIMIT_MEMLOCK, &_rlim)) exit_with_error(errno); - const char *MAC1 = "\x00\x0A\x56\x9E\xEE\x62"; - const char *MAC2 = "\x00\x0A\x56\x9E\xEE\x61"; - const char *IP1 = "192.168.100.162"; - const char *IP2 = "192.168.100.161"; - u16 UDP_DST_PORT = 2020; - u16 UDP_SRC_PORT = 2121; - int i, j; - - ifaceconfig = malloc(sizeof(struct ifaceconfigobj)); - memcpy(ifaceconfig->dst_mac, MAC1, ETH_ALEN); - memcpy(ifaceconfig->src_mac, MAC2, ETH_ALEN); - inet_aton(IP1, &ifaceconfig->dst_ip); - inet_aton(IP2, &ifaceconfig->src_ip); - ifaceconfig->dst_port = UDP_DST_PORT; - ifaceconfig->src_port = UDP_SRC_PORT; - for (int i = 0; i < MAX_INTERFACES; i++) { ifdict[i] = malloc(sizeof(struct ifobject)); if (!ifdict[i]) exit_with_error(errno); ifdict[i]->ifdict_index = i; + ifdict[i]->xsk_arr = calloc(2, sizeof(struct xsk_socket_info *)); + if (!ifdict[i]->xsk_arr) { + failure = true; + goto cleanup; + } + ifdict[i]->umem_arr = calloc(2, sizeof(struct xsk_umem_info *)); + if (!ifdict[i]->umem_arr) { + failure = true; + goto cleanup; + } } setlocale(LC_ALL, ""); @@ -1209,9 +1135,8 @@ int main(int argc, char **argv) num_frames = ++opt_pkt_count; - init_iface_config(ifaceconfig); - - disable_xdp_mode(XDP_FLAGS_DRV_MODE); + init_iface(ifdict[0], MAC1, MAC2, IP1, IP2, UDP_PORT1, UDP_PORT2, tx); + init_iface(ifdict[1], MAC2, MAC1, IP2, IP1, UDP_PORT2, UDP_PORT1, rx); ksft_set_plan(TEST_MODE_MAX * TEST_TYPE_MAX); @@ -1220,8 +1145,17 @@ int main(int argc, char **argv) run_pkt_test(i, j); } - for (int i = 0; i < MAX_INTERFACES; i++) +cleanup: + for (int i = 0; i < MAX_INTERFACES; i++) { + if (ifdict[i]->ns_fd != -1) + close(ifdict[i]->ns_fd); + free(ifdict[i]->xsk_arr); + free(ifdict[i]->umem_arr); free(ifdict[i]); + } + + if (failure) + exit_with_error(errno); ksft_exit_pass(); diff --git a/tools/testing/selftests/bpf/xdpxceiver.h b/tools/testing/selftests/bpf/xdpxceiver.h index 
30314ef305c2..6c428b276ab6 100644 --- a/tools/testing/selftests/bpf/xdpxceiver.h +++ b/tools/testing/selftests/bpf/xdpxceiver.h @@ -23,6 +23,7 @@ #define MAX_SOCKS 1 #define MAX_TEARDOWN_ITER 10 #define MAX_BIDI_ITER 2 +#define MAX_BPF_ITER 2 #define PKT_HDR_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \ sizeof(struct udphdr)) #define MIN_PKT_SIZE 64 @@ -33,14 +34,11 @@ #define IP_PKT_TOS 0x9 #define UDP_PKT_SIZE (IP_PKT_SIZE - sizeof(struct iphdr)) #define UDP_PKT_DATA_SIZE (UDP_PKT_SIZE - sizeof(struct udphdr)) -#define TMOUT_SEC (3) #define EOT (-1) #define USLEEP_MAX 200000 -#define THREAD_STACK 60000000 #define SOCK_RECONF_CTR 10 #define BATCH_SIZE 64 #define POLL_TMOUT 1000 -#define NEED_WAKEUP true #define DEFAULT_PKT_CNT 10000 #define RX_FULL_RXQSIZE 32 @@ -63,6 +61,7 @@ enum TEST_TYPES { TEST_TYPE_TEARDOWN, TEST_TYPE_BIDI, TEST_TYPE_STATS, + TEST_TYPE_BPF_RES, TEST_TYPE_MAX }; @@ -77,11 +76,9 @@ enum STAT_TEST_TYPES { static int configured_mode = TEST_MODE_UNCONFIGURED; static u8 debug_pkt_dump; static u32 num_frames; -static u8 switching_notify; -static u8 bidi_pass; +static bool second_step; static int test_type; -static int opt_queue; static int opt_pkt_count; static u8 opt_verbose; @@ -125,48 +122,32 @@ struct generic_data { u32 seqnum; }; -struct ifaceconfigobj { - u8 dst_mac[ETH_ALEN]; - u8 src_mac[ETH_ALEN]; - struct in_addr dst_ip; - struct in_addr src_ip; - u16 src_port; - u16 dst_port; -} *ifaceconfig; - struct ifobject { - int ifindex; - int ifdict_index; char ifname[MAX_INTERFACE_NAME_CHARS]; char nsname[MAX_INTERFACES_NAMESPACE_CHARS]; - struct flow_vector fv; struct xsk_socket_info *xsk; + struct xsk_socket_info **xsk_arr; + struct xsk_umem_info **umem_arr; struct xsk_umem_info *umem; - u8 dst_mac[ETH_ALEN]; - u8 src_mac[ETH_ALEN]; + void *(*func_ptr)(void *arg); + struct flow_vector fv; + int ns_fd; + int ifdict_index; u32 dst_ip; u32 src_ip; u16 src_port; u16 dst_port; + u8 dst_mac[ETH_ALEN]; + u8 src_mac[ETH_ALEN]; }; static struct ifobject *ifdict[MAX_INTERFACES]; +static struct ifobject *ifdict_rx; +static struct ifobject *ifdict_tx; /*threads*/ -atomic_int spinning_tx; -atomic_int spinning_rx; -pthread_mutex_t sync_mutex; -pthread_mutex_t sync_mutex_tx; -pthread_cond_t signal_rx_condition; -pthread_cond_t signal_tx_condition; -pthread_t t0, t1, ns_thread; -pthread_attr_t attr; - -struct targs { - u8 retptr; - int idx; - u32 flags; -}; +pthread_barrier_t barr; +pthread_t t0, t1; TAILQ_HEAD(head_s, pkt) head = TAILQ_HEAD_INITIALIZER(head); struct head_s *head_p; diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh index 1fedfc9da434..42d44e27802c 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh @@ -446,6 +446,35 @@ __invalid_nexthop_test() log_test "Unresolved neigh: nexthop does not exist: $desc" } +__invalid_nexthop_bucket_test() +{ + local desc=$1; shift + local dip=$1; shift + local via_add=$1; shift + local trap_name="unresolved_neigh" + + RET=0 + + # Check that route to nexthop that does not exist triggers + # unresolved_neigh + ip nexthop add id 1 via $via_add dev $rp2 + ip nexthop add id 10 group 1 type resilient buckets 32 + ip route add $dip nhid 10 + + t0_packets=$(devlink_trap_rx_packets_get $trap_name) + ping_do $h1 $dip + t1_packets=$(devlink_trap_rx_packets_get $trap_name) + + if [[ $t0_packets -eq $t1_packets ]]; then + 
check_err 1 "Trap counter did not increase" + fi + + ip route del $dip nhid 10 + ip nexthop del id 10 + ip nexthop del id 1 + log_test "Unresolved neigh: nexthop bucket does not exist: $desc" +} + unresolved_neigh_test() { __host_miss_test "IPv4" 198.51.100.1 @@ -453,6 +482,8 @@ unresolved_neigh_test() __invalid_nexthop_test "IPv4" 198.51.100.1 198.51.100.3 24 198.51.100.4 __invalid_nexthop_test "IPv6" 2001:db8:2::1 2001:db8:2::3 64 \ 2001:db8:2::4 + __invalid_nexthop_bucket_test "IPv4" 198.51.100.1 198.51.100.4 + __invalid_nexthop_bucket_test "IPv6" 2001:db8:2::1 2001:db8:2::4 } vrf_without_routes_create() diff --git a/tools/testing/selftests/drivers/net/mlxsw/rtnetlink.sh b/tools/testing/selftests/drivers/net/mlxsw/rtnetlink.sh index ed346da5d3cb..a217f9f6775b 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/rtnetlink.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/rtnetlink.sh @@ -33,6 +33,7 @@ ALL_TESTS=" nexthop_obj_invalid_test nexthop_obj_offload_test nexthop_obj_group_offload_test + nexthop_obj_bucket_offload_test nexthop_obj_blackhole_offload_test nexthop_obj_route_offload_test devlink_reload_test @@ -739,11 +740,28 @@ nexthop_obj_invalid_test() ip nexthop add id 1 dev $swp1 ip nexthop add id 2 dev $swp1 + ip nexthop add id 3 via 192.0.2.3 dev $swp1 ip nexthop add id 10 group 1/2 check_fail $? "managed to configure a nexthop group with device-only nexthops when should not" + ip nexthop add id 10 group 3 type resilient buckets 7 + check_fail $? "managed to configure a too small resilient nexthop group when should not" + + ip nexthop add id 10 group 3 type resilient buckets 129 + check_fail $? "managed to configure a resilient nexthop group with invalid number of buckets when should not" + + ip nexthop add id 10 group 1/2 type resilient buckets 32 + check_fail $? "managed to configure a resilient nexthop group with device-only nexthops when should not" + + ip nexthop add id 10 group 3 type resilient buckets 32 + check_err $? "failed to configure a valid resilient nexthop group" + ip nexthop replace id 3 dev $swp1 + check_fail $? "managed to populate a nexthop bucket with a device-only nexthop when should not" + log_test "nexthop objects - invalid configurations" + ip nexthop del id 10 + ip nexthop del id 3 ip nexthop del id 2 ip nexthop del id 1 @@ -858,6 +876,70 @@ nexthop_obj_group_offload_test() simple_if_fini $swp1 192.0.2.1/24 2001:db8:1::1/64 } +nexthop_obj_bucket_offload_test() +{ + # Test offload indication of nexthop buckets + RET=0 + + simple_if_init $swp1 192.0.2.1/24 2001:db8:1::1/64 + simple_if_init $swp2 + setup_wait + + ip nexthop add id 1 via 192.0.2.2 dev $swp1 + ip nexthop add id 2 via 2001:db8:1::2 dev $swp1 + ip nexthop add id 10 group 1/2 type resilient buckets 32 idle_timer 0 + ip neigh replace 192.0.2.2 lladdr 00:11:22:33:44:55 nud reachable \ + dev $swp1 + ip neigh replace 192.0.2.3 lladdr 00:11:22:33:44:55 nud reachable \ + dev $swp1 + ip neigh replace 2001:db8:1::2 lladdr 00:11:22:33:44:55 nud reachable \ + dev $swp1 + + busywait "$TIMEOUT" wait_for_offload \ + ip nexthop bucket show nhid 1 + check_err $? "IPv4 nexthop buckets not marked as offloaded when should" + busywait "$TIMEOUT" wait_for_offload \ + ip nexthop bucket show nhid 2 + check_err $? "IPv6 nexthop buckets not marked as offloaded when should" + + # Invalidate nexthop id 1 + ip neigh replace 192.0.2.2 nud failed dev $swp1 + busywait "$TIMEOUT" wait_for_trap \ + ip nexthop bucket show nhid 1 + check_err $? 
"IPv4 nexthop buckets not marked with trap when should" + + # Invalidate nexthop id 2 + ip neigh replace 2001:db8:1::2 nud failed dev $swp1 + busywait "$TIMEOUT" wait_for_trap \ + ip nexthop bucket show nhid 2 + check_err $? "IPv6 nexthop buckets not marked with trap when should" + + # Revalidate nexthop id 1 by changing its configuration + ip nexthop replace id 1 via 192.0.2.3 dev $swp1 + busywait "$TIMEOUT" wait_for_offload \ + ip nexthop bucket show nhid 1 + check_err $? "nexthop bucket not marked as offloaded after revalidating nexthop" + + # Revalidate nexthop id 2 by changing its neighbour + ip neigh replace 2001:db8:1::2 lladdr 00:11:22:33:44:55 nud reachable \ + dev $swp1 + busywait "$TIMEOUT" wait_for_offload \ + ip nexthop bucket show nhid 2 + check_err $? "nexthop bucket not marked as offloaded after revalidating neighbour" + + log_test "nexthop bucket offload indication" + + ip neigh del 2001:db8:1::2 dev $swp1 + ip neigh del 192.0.2.3 dev $swp1 + ip neigh del 192.0.2.2 dev $swp1 + ip nexthop del id 10 + ip nexthop del id 2 + ip nexthop del id 1 + + simple_if_fini $swp2 + simple_if_fini $swp1 192.0.2.1/24 2001:db8:1::1/64 +} + nexthop_obj_blackhole_offload_test() { # Test offload indication of blackhole nexthop objects diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/q_in_vni_veto.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/q_in_vni_veto.sh deleted file mode 100755 index 0231205a7147..000000000000 --- a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/q_in_vni_veto.sh +++ /dev/null @@ -1,77 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -lib_dir=$(dirname $0)/../../../../net/forwarding - -VXPORT=4789 - -ALL_TESTS=" - create_dot1d_and_dot1ad_vxlans -" -NUM_NETIFS=2 -source $lib_dir/lib.sh - -setup_prepare() -{ - swp1=${NETIFS[p1]} - swp2=${NETIFS[p2]} - - ip link set dev $swp1 up - ip link set dev $swp2 up -} - -cleanup() -{ - pre_cleanup - - ip link set dev $swp2 down - ip link set dev $swp1 down -} - -create_dot1d_and_dot1ad_vxlans() -{ - RET=0 - - ip link add dev br0 type bridge vlan_filtering 1 vlan_protocol 802.1ad \ - vlan_default_pvid 0 mcast_snooping 0 - ip link set dev br0 up - - ip link add name vx100 type vxlan id 1000 local 192.0.2.17 dstport \ - "$VXPORT" nolearning noudpcsum tos inherit ttl 100 - ip link set dev vx100 up - - ip link set dev $swp1 master br0 - ip link set dev vx100 master br0 - bridge vlan add vid 100 dev vx100 pvid untagged - - ip link add dev br1 type bridge vlan_filtering 0 mcast_snooping 0 - ip link set dev br1 up - - ip link add name vx200 type vxlan id 2000 local 192.0.2.17 dstport \ - "$VXPORT" nolearning noudpcsum tos inherit ttl 100 - ip link set dev vx200 up - - ip link set dev $swp2 master br1 - ip link set dev vx200 master br1 2>/dev/null - check_fail $? "802.1d and 802.1ad VxLANs at the same time not rejected" - - ip link set dev vx200 master br1 2>&1 >/dev/null \ - | grep -q mlxsw_spectrum - check_err $? 
"802.1d and 802.1ad VxLANs at the same time rejected without extack" - - log_test "create 802.1d and 802.1ad VxLANs" - - ip link del dev vx200 - ip link del dev br1 - ip link del dev vx100 - ip link del dev br0 -} - -trap cleanup EXIT - -setup_prepare -setup_wait - -tests_run - -exit $EXIT_STATUS diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh index 553cb9fad508..5ec3beb637c8 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh @@ -11,6 +11,7 @@ ALL_TESTS=" matchall_mirror_behind_flower_ingress_test matchall_sample_behind_flower_ingress_test matchall_mirror_behind_flower_egress_test + matchall_proto_match_test police_limits_test multi_police_test " @@ -18,6 +19,7 @@ NUM_NETIFS=2 source $lib_dir/tc_common.sh source $lib_dir/lib.sh +source $lib_dir/devlink_lib.sh switch_create() { @@ -166,7 +168,8 @@ matchall_sample_egress_test() RET=0 # It is forbidden in mlxsw driver to have matchall with sample action - # bound on egress + # bound on egress. Spectrum-1 specific restriction + [[ "$DEVLINK_VIDDID" != "15b3:cb84" ]] && return tc qdisc add dev $swp1 clsact @@ -289,6 +292,22 @@ matchall_mirror_behind_flower_egress_test() matchall_behind_flower_egress_test "mirror" "mirred egress mirror dev $swp2" } +matchall_proto_match_test() +{ + RET=0 + + tc qdisc add dev $swp1 clsact + + tc filter add dev $swp1 ingress pref 1 proto ip handle 101 \ + matchall skip_sw \ + action sample group 1 rate 100 + check_fail $? "Incorrect success to add matchall rule with protocol match" + + tc qdisc del dev $swp1 clsact + + log_test "matchall protocol match" +} + police_limits_test() { RET=0 diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh index 75d00104f291..093bed088ad0 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh @@ -34,6 +34,7 @@ lib_dir=$(dirname $0)/../../../net/forwarding ALL_TESTS=" tc_sample_rate_test tc_sample_max_rate_test + tc_sample_conflict_test tc_sample_group_conflict_test tc_sample_md_iif_test tc_sample_md_lag_iif_test @@ -41,6 +42,10 @@ ALL_TESTS=" tc_sample_md_lag_oif_test tc_sample_md_out_tc_test tc_sample_md_out_tc_occ_test + tc_sample_md_latency_test + tc_sample_acl_group_conflict_test + tc_sample_acl_rate_test + tc_sample_acl_max_rate_test " NUM_NETIFS=8 CAPTURE_FILE=$(mktemp) @@ -268,6 +273,35 @@ tc_sample_max_rate_test() log_test "tc sample maximum rate" } +tc_sample_conflict_test() +{ + RET=0 + + # Test that two sampling rules cannot be configured on the same port, + # even when they share the same parameters. + + tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate 1024 group 1 + check_err $? "Failed to configure sampling rule" + + tc filter add dev $rp1 ingress protocol all pref 2 handle 102 matchall \ + skip_sw action sample rate 1024 group 1 &> /dev/null + check_fail $? "Managed to configure second sampling rule" + + # Delete the first rule and make sure the second rule can now be + # configured. + + tc filter del dev $rp1 ingress protocol all pref 1 handle 101 matchall + + tc filter add dev $rp1 ingress protocol all pref 2 handle 102 matchall \ + skip_sw action sample rate 1024 group 1 + check_err $? 
"Failed to configure sampling rule after deletion" + + log_test "tc sample conflict test" + + tc filter del dev $rp1 ingress protocol all pref 2 handle 102 matchall +} + tc_sample_group_conflict_test() { RET=0 @@ -482,6 +516,137 @@ tc_sample_md_out_tc_occ_test() tc filter del dev $rp1 ingress protocol all pref 1 handle 101 matchall } +tc_sample_md_latency_test() +{ + RET=0 + + # Egress sampling not supported on Spectrum-1. + [[ "$DEVLINK_VIDDID" == "15b3:cb84" ]] && return + + tc filter add dev $rp2 egress protocol all pref 1 handle 101 matchall \ + skip_sw action sample rate 5 group 1 + check_err $? "Failed to configure sampling rule" + + psample_capture_start + + ip vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64 -A 192.0.2.1 \ + -B 198.51.100.1 -t udp dp=52768,sp=42768 -q + + psample_capture_stop + + grep -q -e "latency " $CAPTURE_FILE + check_err $? "Sampled packets do not have latency attribute" + + log_test "tc sample latency" + + tc filter del dev $rp2 egress protocol all pref 1 handle 101 matchall +} + +tc_sample_acl_group_conflict_test() +{ + RET=0 + + # Test that two flower sampling rules cannot be configured on the same + # port with different groups. + + # Policy-based sampling is not supported on Spectrum-1. + [[ "$DEVLINK_VIDDID" == "15b3:cb84" ]] && return + + tc filter add dev $rp1 ingress protocol ip pref 1 handle 101 flower \ + skip_sw action sample rate 1024 group 1 + check_err $? "Failed to configure sampling rule" + + tc filter add dev $rp1 ingress protocol ip pref 2 handle 102 flower \ + skip_sw action sample rate 1024 group 1 + check_err $? "Failed to configure sampling rule with same group" + + tc filter add dev $rp1 ingress protocol ip pref 3 handle 103 flower \ + skip_sw action sample rate 1024 group 2 &> /dev/null + check_fail $? "Managed to configure sampling rule with conflicting group" + + log_test "tc sample (w/ flower) group conflict test" + + tc filter del dev $rp1 ingress protocol ip pref 2 handle 102 flower + tc filter del dev $rp1 ingress protocol ip pref 1 handle 101 flower +} + +__tc_sample_acl_rate_test() +{ + local bind=$1; shift + local port=$1; shift + local pkts pct + + RET=0 + + # Policy-based sampling is not supported on Spectrum-1. + [[ "$DEVLINK_VIDDID" == "15b3:cb84" ]] && return + + tc filter add dev $port $bind protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 198.51.100.1 action sample rate 32 group 1 + check_err $? "Failed to configure sampling rule" + + psample_capture_start + + ip vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64 -A 192.0.2.1 \ + -B 198.51.100.1 -t udp dp=52768,sp=42768 -q + + psample_capture_stop + + pkts=$(grep -e "group 1 " $CAPTURE_FILE | wc -l) + pct=$((100 * (pkts - 100) / 100)) + (( -25 <= pct && pct <= 25)) + check_err $? "Expected 100 packets, got $pkts packets, which is $pct% off. Required accuracy is +-25%" + + # Setup a filter that should not match any packet and make sure packets + # are not sampled. + tc filter del dev $port $bind protocol ip pref 1 handle 101 flower + + tc filter add dev $port $bind protocol ip pref 1 handle 101 flower \ + skip_sw dst_ip 198.51.100.10 action sample rate 32 group 1 + check_err $? "Failed to configure sampling rule" + + psample_capture_start + + ip vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64 -A 192.0.2.1 \ + -B 198.51.100.1 -t udp dp=52768,sp=42768 -q + + psample_capture_stop + + grep -q -e "group 1 " $CAPTURE_FILE + check_fail $? 
"Sampled packets when should not" + + log_test "tc sample (w/ flower) rate ($bind)" + + tc filter del dev $port $bind protocol ip pref 1 handle 101 flower +} + +tc_sample_acl_rate_test() +{ + __tc_sample_acl_rate_test ingress $rp1 + __tc_sample_acl_rate_test egress $rp2 +} + +tc_sample_acl_max_rate_test() +{ + RET=0 + + # Policy-based sampling is not supported on Spectrum-1. + [[ "$DEVLINK_VIDDID" == "15b3:cb84" ]] && return + + tc filter add dev $rp1 ingress protocol ip pref 1 handle 101 flower \ + skip_sw action sample rate $((2 ** 24 - 1)) group 1 + check_err $? "Failed to configure sampling rule with max rate" + + tc filter del dev $rp1 ingress protocol ip pref 1 handle 101 flower + + tc filter add dev $rp1 ingress protocol ip pref 1 handle 101 flower \ + skip_sw action sample rate $((2 ** 24)) \ + group 1 &> /dev/null + check_fail $? "Managed to configure sampling rate above maximum" + + log_test "tc sample (w/ flower) maximum rate" +} + trap cleanup EXIT setup_prepare diff --git a/tools/testing/selftests/drivers/net/netdevsim/ethtool-common.sh b/tools/testing/selftests/drivers/net/netdevsim/ethtool-common.sh index 9f64d5c7107b..7ca1f030d209 100644 --- a/tools/testing/selftests/drivers/net/netdevsim/ethtool-common.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/ethtool-common.sh @@ -24,8 +24,11 @@ function check { local code=$1 local str=$2 local exp_str=$3 + local exp_fail=$4 - if [ $code -ne 0 ]; then + [ -z "$exp_fail" ] && cop="-ne" || cop="-eq" + + if [ $code $cop 0 ]; then ((num_errors++)) return fi diff --git a/tools/testing/selftests/drivers/net/netdevsim/ethtool-fec.sh b/tools/testing/selftests/drivers/net/netdevsim/ethtool-fec.sh new file mode 100755 index 000000000000..0c56746e9ce0 --- /dev/null +++ b/tools/testing/selftests/drivers/net/netdevsim/ethtool-fec.sh @@ -0,0 +1,110 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0-only + +source ethtool-common.sh + +NSIM_NETDEV=$(make_netdev) +[ a$ETHTOOL == a ] && ETHTOOL=ethtool + +set -o pipefail + +# netdevsim starts out with None/None +s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) +check $? "$s" "Configured FEC encodings: None +Active FEC encoding: None" + +# Test Auto +$ETHTOOL --set-fec $NSIM_NETDEV encoding auto +check $? +s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) +check $? "$s" "Configured FEC encodings: Auto +Active FEC encoding: Off" + +# Test case in-sensitivity +for o in off Off OFF; do + $ETHTOOL --set-fec $NSIM_NETDEV encoding $o + check $? + s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) + check $? "$s" "Configured FEC encodings: Off +Active FEC encoding: Off" +done + +for o in BaseR baser BAser; do + $ETHTOOL --set-fec $NSIM_NETDEV encoding $o + check $? + s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) + check $? "$s" "Configured FEC encodings: BaseR +Active FEC encoding: BaseR" +done + +for o in llrs rs; do + $ETHTOOL --set-fec $NSIM_NETDEV encoding $o + check $? + s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) + check $? "$s" "Configured FEC encodings: ${o^^} +Active FEC encoding: ${o^^}" +done + +# Test mutliple bits +$ETHTOOL --set-fec $NSIM_NETDEV encoding rs llrs +check $? +s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) +check $? "$s" "Configured FEC encodings: RS LLRS +Active FEC encoding: LLRS" + +$ETHTOOL --set-fec $NSIM_NETDEV encoding rs off auto +check $? +s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) +check $? 
"$s" "Configured FEC encodings: Auto Off RS +Active FEC encoding: RS" + +# Make sure other link modes are rejected +$ETHTOOL --set-fec $NSIM_NETDEV encoding FIBRE 2>/dev/null +check $? '' '' 1 + +$ETHTOOL --set-fec $NSIM_NETDEV encoding bla-bla-bla 2>/dev/null +check $? '' '' 1 + +# Try JSON +$ETHTOOL --json --show-fec $NSIM_NETDEV | jq empty >>/dev/null 2>&1 +if [ $? -eq 0 ]; then + $ETHTOOL --set-fec $NSIM_NETDEV encoding auto + check $? + + s=$($ETHTOOL --json --show-fec $NSIM_NETDEV | jq '.[].config[]') + check $? "$s" '"Auto"' + s=$($ETHTOOL --json --show-fec $NSIM_NETDEV | jq '.[].active[]') + check $? "$s" '"Off"' + + $ETHTOOL --set-fec $NSIM_NETDEV encoding auto RS + check $? + + s=$($ETHTOOL --json --show-fec $NSIM_NETDEV | jq '.[].config[]') + check $? "$s" '"Auto" +"RS"' + s=$($ETHTOOL --json --show-fec $NSIM_NETDEV | jq '.[].active[]') + check $? "$s" '"RS"' +fi + +# Test error injection +echo 11 > $NSIM_DEV_DFS/ethtool/get_err + +$ETHTOOL --show-fec $NSIM_NETDEV >>/dev/null 2>&1 +check $? '' '' 1 + +echo 0 > $NSIM_DEV_DFS/ethtool/get_err +echo 11 > $NSIM_DEV_DFS/ethtool/set_err + +$ETHTOOL --show-fec $NSIM_NETDEV >>/dev/null 2>&1 +check $? + +$ETHTOOL --set-fec $NSIM_NETDEV encoding RS 2>/dev/null +check $? '' '' 1 + +if [ $num_errors -eq 0 ]; then + echo "PASSED all $((num_passes)) checks" + exit 0 +else + echo "FAILED $num_errors/$((num_errors+num_passes)) checks" + exit 1 +fi diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 32b87cc77c8e..7bd7e776c266 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -8,10 +8,13 @@ /x86_64/debug_regs /x86_64/evmcs_test /x86_64/get_cpuid_test +/x86_64/get_msr_index_features /x86_64/kvm_pv_test +/x86_64/hyperv_clock /x86_64/hyperv_cpuid /x86_64/mmio_warning_test /x86_64/platform_info_test +/x86_64/set_boot_cpu_id /x86_64/set_sregs_test /x86_64/smm_test /x86_64/state_test diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index a6d61f451f88..67eebb53235f 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -39,12 +39,15 @@ LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c lib/s390x/diag318_test_handler.c TEST_GEN_PROGS_x86_64 = x86_64/cr4_cpuid_sync_test +TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test TEST_GEN_PROGS_x86_64 += x86_64/get_cpuid_test +TEST_GEN_PROGS_x86_64 += x86_64/hyperv_clock TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test +TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test TEST_GEN_PROGS_x86_64 += x86_64/smm_test TEST_GEN_PROGS_x86_64 += x86_64/state_test diff --git a/tools/testing/selftests/kvm/hardware_disable_test.c b/tools/testing/selftests/kvm/hardware_disable_test.c index 2f2eeb8a1d86..5aadf84c91c0 100644 --- a/tools/testing/selftests/kvm/hardware_disable_test.c +++ b/tools/testing/selftests/kvm/hardware_disable_test.c @@ -108,7 +108,7 @@ static void run_test(uint32_t run) kvm_vm_elf_load(vm, program_invocation_name, 0, 0); vm_create_irqchip(vm); - fprintf(stderr, "%s: [%d] start vcpus\n", __func__, run); + pr_debug("%s: [%d] start vcpus\n", __func__, run); for (i = 0; i < VCPU_NUM; ++i) { vm_vcpu_add_default(vm, i, guest_code); payloads[i].vm = 
vm; @@ -124,7 +124,7 @@ static void run_test(uint32_t run) check_set_affinity(throw_away, &cpu_set); } } - fprintf(stderr, "%s: [%d] all threads launched\n", __func__, run); + pr_debug("%s: [%d] all threads launched\n", __func__, run); sem_post(sem); for (i = 0; i < VCPU_NUM; ++i) check_join(threads[i], &b); @@ -147,16 +147,16 @@ int main(int argc, char **argv) if (pid == 0) run_test(i); /* This function always exits */ - fprintf(stderr, "%s: [%d] waiting semaphore\n", __func__, i); + pr_debug("%s: [%d] waiting semaphore\n", __func__, i); sem_wait(sem); r = (rand() % DELAY_US_MAX) + 1; - fprintf(stderr, "%s: [%d] waiting %dus\n", __func__, i, r); + pr_debug("%s: [%d] waiting %dus\n", __func__, i, r); usleep(r); r = waitpid(pid, &s, WNOHANG); TEST_ASSERT(r != pid, "%s: [%d] child exited unexpectedly status: [%d]", __func__, i, s); - fprintf(stderr, "%s: [%d] killing child\n", __func__, i); + pr_debug("%s: [%d] killing child\n", __func__, i); kill(pid, SIGKILL); } diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index 2d7eb6989e83..0f4258eaa629 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -16,6 +16,7 @@ #include "sparsebit.h" +#define KVM_DEV_PATH "/dev/kvm" #define KVM_MAX_VCPUS 512 /* @@ -133,6 +134,7 @@ void vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl, int _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl, void *arg); void vm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg); +int _vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg); void kvm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg); int _kvm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg); void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index e5fbf16f725b..b8849a1aca79 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1697,11 +1697,16 @@ void vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg) { int ret; - ret = ioctl(vm->fd, cmd, arg); + ret = _vm_ioctl(vm, cmd, arg); TEST_ASSERT(ret == 0, "vm ioctl %lu failed, rc: %i errno: %i (%s)", cmd, ret, errno, strerror(errno)); } +int _vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg) +{ + return ioctl(vm->fd, cmd, arg); +} + /* * KVM system ioctl * diff --git a/tools/testing/selftests/kvm/lib/kvm_util_internal.h b/tools/testing/selftests/kvm/lib/kvm_util_internal.h index 34465dc562d8..91ce1b5d480b 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util_internal.h +++ b/tools/testing/selftests/kvm/lib/kvm_util_internal.h @@ -10,8 +10,6 @@ #include "sparsebit.h" -#define KVM_DEV_PATH "/dev/kvm" - struct userspace_mem_region { struct kvm_userspace_memory_region region; struct sparsebit *unused_phy_pages; diff --git a/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c b/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c new file mode 100644 index 000000000000..cb953df4d7d0 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/get_msr_index_features.c @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test that KVM_GET_MSR_INDEX_LIST and + * KVM_GET_MSR_FEATURE_INDEX_LIST work as intended + * + * Copyright (C) 2020, Red Hat, Inc. 
+ */ +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +static int kvm_num_index_msrs(int kvm_fd, int nmsrs) +{ + struct kvm_msr_list *list; + int r; + + list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0])); + list->nmsrs = nmsrs; + r = ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list); + TEST_ASSERT(r == -1 && errno == E2BIG, + "Unexpected result from KVM_GET_MSR_INDEX_LIST probe, r: %i", + r); + + r = list->nmsrs; + free(list); + return r; +} + +static void test_get_msr_index(void) +{ + int old_res, res, kvm_fd, r; + struct kvm_msr_list *list; + + kvm_fd = open(KVM_DEV_PATH, O_RDONLY); + if (kvm_fd < 0) + exit(KSFT_SKIP); + + old_res = kvm_num_index_msrs(kvm_fd, 0); + TEST_ASSERT(old_res != 0, "Expecting nmsrs to be > 0"); + + if (old_res != 1) { + res = kvm_num_index_msrs(kvm_fd, 1); + TEST_ASSERT(res > 1, "Expecting nmsrs to be > 1"); + TEST_ASSERT(res == old_res, "Expecting nmsrs to be identical"); + } + + list = malloc(sizeof(*list) + old_res * sizeof(list->indices[0])); + list->nmsrs = old_res; + r = ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list); + + TEST_ASSERT(r == 0, + "Unexpected result from KVM_GET_MSR_FEATURE_INDEX_LIST, r: %i", + r); + TEST_ASSERT(list->nmsrs == old_res, "Expecting nmsrs to be identical"); + free(list); + + close(kvm_fd); +} + +static int kvm_num_feature_msrs(int kvm_fd, int nmsrs) +{ + struct kvm_msr_list *list; + int r; + + list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0])); + list->nmsrs = nmsrs; + r = ioctl(kvm_fd, KVM_GET_MSR_FEATURE_INDEX_LIST, list); + TEST_ASSERT(r == -1 && errno == E2BIG, + "Unexpected result from KVM_GET_MSR_FEATURE_INDEX_LIST probe, r: %i", + r); + + r = list->nmsrs; + free(list); + return r; +} + +struct kvm_msr_list *kvm_get_msr_feature_list(int kvm_fd, int nmsrs) +{ + struct kvm_msr_list *list; + int r; + + list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0])); + list->nmsrs = nmsrs; + r = ioctl(kvm_fd, KVM_GET_MSR_FEATURE_INDEX_LIST, list); + + TEST_ASSERT(r == 0, + "Unexpected result from KVM_GET_MSR_FEATURE_INDEX_LIST, r: %i", + r); + + return list; +} + +static void test_get_msr_feature(void) +{ + int res, old_res, i, kvm_fd; + struct kvm_msr_list *feature_list; + + kvm_fd = open(KVM_DEV_PATH, O_RDONLY); + if (kvm_fd < 0) + exit(KSFT_SKIP); + + old_res = kvm_num_feature_msrs(kvm_fd, 0); + TEST_ASSERT(old_res != 0, "Expecting nmsrs to be > 0"); + + if (old_res != 1) { + res = kvm_num_feature_msrs(kvm_fd, 1); + TEST_ASSERT(res > 1, "Expecting nmsrs to be > 1"); + TEST_ASSERT(res == old_res, "Expecting nmsrs to be identical"); + } + + feature_list = kvm_get_msr_feature_list(kvm_fd, old_res); + TEST_ASSERT(old_res == feature_list->nmsrs, + "Unmatching number of msr indexes"); + + for (i = 0; i < feature_list->nmsrs; i++) + kvm_get_feature_msr(feature_list->indices[i]); + + free(feature_list); + close(kvm_fd); +} + +int main(int argc, char *argv[]) +{ + if (kvm_check_cap(KVM_CAP_GET_MSR_FEATURES)) + test_get_msr_feature(); + + test_get_msr_index(); +} diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c new file mode 100644 index 000000000000..7f1d2765572c --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c @@ -0,0 +1,269 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2021, Red Hat, Inc. 
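
The new test leans on the usual KVM list-probing convention: call KVM_GET_MSR_INDEX_LIST with nmsrs set too small, expect -1 with errno == E2BIG and the required count written back, then allocate and call again. A stand-alone sketch of that two-pass probe follows (illustrative, with minimal error handling).

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/kvm.h>

int main(void)
{
	struct kvm_msr_list probe = { .nmsrs = 0 };
	struct kvm_msr_list *list;
	int kvm_fd, r;

	kvm_fd = open("/dev/kvm", O_RDONLY);
	if (kvm_fd < 0)
		return EXIT_FAILURE;

	/* First pass: deliberately too small, learn the real count */
	r = ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &probe);
	if (r != -1 || errno != E2BIG)
		return EXIT_FAILURE;

	/* Second pass: allocate the right size and fetch the indices */
	list = malloc(sizeof(*list) + probe.nmsrs * sizeof(list->indices[0]));
	list->nmsrs = probe.nmsrs;
	r = ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list);
	if (r == 0)
		printf("KVM exposes %u MSR indices\n", list->nmsrs);

	free(list);
	close(kvm_fd);
	return r ? EXIT_FAILURE : EXIT_SUCCESS;
}
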
+ * + * Tests for Hyper-V clocksources + */ +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +struct ms_hyperv_tsc_page { + volatile u32 tsc_sequence; + u32 reserved1; + volatile u64 tsc_scale; + volatile s64 tsc_offset; +} __packed; + +#define HV_X64_MSR_GUEST_OS_ID 0x40000000 +#define HV_X64_MSR_TIME_REF_COUNT 0x40000020 +#define HV_X64_MSR_REFERENCE_TSC 0x40000021 +#define HV_X64_MSR_TSC_FREQUENCY 0x40000022 +#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106 +#define HV_X64_MSR_TSC_EMULATION_CONTROL 0x40000107 + +/* Simplified mul_u64_u64_shr() */ +static inline u64 mul_u64_u64_shr64(u64 a, u64 b) +{ + union { + u64 ll; + struct { + u32 low, high; + } l; + } rm, rn, rh, a0, b0; + u64 c; + + a0.ll = a; + b0.ll = b; + + rm.ll = (u64)a0.l.low * b0.l.high; + rn.ll = (u64)a0.l.high * b0.l.low; + rh.ll = (u64)a0.l.high * b0.l.high; + + rh.l.low = c = rm.l.high + rn.l.high + rh.l.low; + rh.l.high = (c >> 32) + rh.l.high; + + return rh.ll; +} + +static inline void nop_loop(void) +{ + int i; + + for (i = 0; i < 1000000; i++) + asm volatile("nop"); +} + +static inline void check_tsc_msr_rdtsc(void) +{ + u64 tsc_freq, r1, r2, t1, t2; + s64 delta_ns; + + tsc_freq = rdmsr(HV_X64_MSR_TSC_FREQUENCY); + GUEST_ASSERT(tsc_freq > 0); + + /* First, check MSR-based clocksource */ + r1 = rdtsc(); + t1 = rdmsr(HV_X64_MSR_TIME_REF_COUNT); + nop_loop(); + r2 = rdtsc(); + t2 = rdmsr(HV_X64_MSR_TIME_REF_COUNT); + + GUEST_ASSERT(r2 > r1 && t2 > t1); + + /* HV_X64_MSR_TIME_REF_COUNT is in 100ns */ + delta_ns = ((t2 - t1) * 100) - ((r2 - r1) * 1000000000 / tsc_freq); + if (delta_ns < 0) + delta_ns = -delta_ns; + + /* 1% tolerance */ + GUEST_ASSERT(delta_ns * 100 < (t2 - t1) * 100); +} + +static inline u64 get_tscpage_ts(struct ms_hyperv_tsc_page *tsc_page) +{ + return mul_u64_u64_shr64(rdtsc(), tsc_page->tsc_scale) + tsc_page->tsc_offset; +} + +static inline void check_tsc_msr_tsc_page(struct ms_hyperv_tsc_page *tsc_page) +{ + u64 r1, r2, t1, t2; + + /* Compare TSC page clocksource with HV_X64_MSR_TIME_REF_COUNT */ + t1 = get_tscpage_ts(tsc_page); + r1 = rdmsr(HV_X64_MSR_TIME_REF_COUNT); + + /* 10 ms tolerance */ + GUEST_ASSERT(r1 >= t1 && r1 - t1 < 100000); + nop_loop(); + + t2 = get_tscpage_ts(tsc_page); + r2 = rdmsr(HV_X64_MSR_TIME_REF_COUNT); + GUEST_ASSERT(r2 >= t1 && r2 - t2 < 100000); +} + +static void guest_main(struct ms_hyperv_tsc_page *tsc_page, vm_paddr_t tsc_page_gpa) +{ + u64 tsc_scale, tsc_offset; + + /* Set Guest OS id to enable Hyper-V emulation */ + GUEST_SYNC(1); + wrmsr(HV_X64_MSR_GUEST_OS_ID, (u64)0x8100 << 48); + GUEST_SYNC(2); + + check_tsc_msr_rdtsc(); + + GUEST_SYNC(3); + + /* Set up TSC page is disabled state, check that it's clean */ + wrmsr(HV_X64_MSR_REFERENCE_TSC, tsc_page_gpa); + GUEST_ASSERT(tsc_page->tsc_sequence == 0); + GUEST_ASSERT(tsc_page->tsc_scale == 0); + GUEST_ASSERT(tsc_page->tsc_offset == 0); + + GUEST_SYNC(4); + + /* Set up TSC page is enabled state */ + wrmsr(HV_X64_MSR_REFERENCE_TSC, tsc_page_gpa | 0x1); + GUEST_ASSERT(tsc_page->tsc_sequence != 0); + + GUEST_SYNC(5); + + check_tsc_msr_tsc_page(tsc_page); + + GUEST_SYNC(6); + + tsc_offset = tsc_page->tsc_offset; + /* Call KVM_SET_CLOCK from userspace, check that TSC page was updated */ + + GUEST_SYNC(7); + /* Sanity check TSC page timestamp, it should be close to 0 */ + GUEST_ASSERT(get_tscpage_ts(tsc_page) < 100000); + + GUEST_ASSERT(tsc_page->tsc_offset != tsc_offset); + + nop_loop(); + + /* + * Enable Re-enlightenment and check that TSC page stays constant across + * KVM_SET_CLOCK. 
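
mul_u64_u64_shr64() above builds the high 64 bits of a 64x64-bit product from 32-bit partial products, which is the core of the TSC-page conversion (timestamp = (tsc * tsc_scale) >> 64 + tsc_offset). On compilers that provide __int128 the same value can be cross-checked directly; the small sketch below shows that reference computation and is not part of the patch.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Reference: high 64 bits of a 64x64 multiply via __int128 */
static uint64_t mul_u64_u64_shr64_ref(uint64_t a, uint64_t b)
{
	return (uint64_t)(((unsigned __int128)a * b) >> 64);
}

int main(void)
{
	uint64_t tsc = 0x123456789abcdef0ull;
	uint64_t scale = 0xfedcba9876543210ull;

	printf("high 64 bits: %#llx\n",
	       (unsigned long long)mul_u64_u64_shr64_ref(tsc, scale));

	/* (2^64-1)^2 >> 64 == 2^64-2, a handy boundary check */
	assert(mul_u64_u64_shr64_ref(UINT64_MAX, UINT64_MAX) ==
	       UINT64_MAX - 1);
	return 0;
}
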
+ */ + wrmsr(HV_X64_MSR_REENLIGHTENMENT_CONTROL, 0x1 << 16 | 0xff); + wrmsr(HV_X64_MSR_TSC_EMULATION_CONTROL, 0x1); + tsc_offset = tsc_page->tsc_offset; + tsc_scale = tsc_page->tsc_scale; + GUEST_SYNC(8); + GUEST_ASSERT(tsc_page->tsc_offset == tsc_offset); + GUEST_ASSERT(tsc_page->tsc_scale == tsc_scale); + + GUEST_SYNC(9); + + check_tsc_msr_tsc_page(tsc_page); + + /* + * Disable re-enlightenment and TSC page, check that KVM doesn't update + * it anymore. + */ + wrmsr(HV_X64_MSR_REENLIGHTENMENT_CONTROL, 0); + wrmsr(HV_X64_MSR_TSC_EMULATION_CONTROL, 0); + wrmsr(HV_X64_MSR_REFERENCE_TSC, 0); + memset(tsc_page, 0, sizeof(*tsc_page)); + + GUEST_SYNC(10); + GUEST_ASSERT(tsc_page->tsc_sequence == 0); + GUEST_ASSERT(tsc_page->tsc_offset == 0); + GUEST_ASSERT(tsc_page->tsc_scale == 0); + + GUEST_DONE(); +} + +#define VCPU_ID 0 + +static void host_check_tsc_msr_rdtsc(struct kvm_vm *vm) +{ + u64 tsc_freq, r1, r2, t1, t2; + s64 delta_ns; + + tsc_freq = vcpu_get_msr(vm, VCPU_ID, HV_X64_MSR_TSC_FREQUENCY); + TEST_ASSERT(tsc_freq > 0, "TSC frequency must be nonzero"); + + /* First, check MSR-based clocksource */ + r1 = rdtsc(); + t1 = vcpu_get_msr(vm, VCPU_ID, HV_X64_MSR_TIME_REF_COUNT); + nop_loop(); + r2 = rdtsc(); + t2 = vcpu_get_msr(vm, VCPU_ID, HV_X64_MSR_TIME_REF_COUNT); + + TEST_ASSERT(t2 > t1, "Time reference MSR is not monotonic (%ld <= %ld)", t1, t2); + + /* HV_X64_MSR_TIME_REF_COUNT is in 100ns */ + delta_ns = ((t2 - t1) * 100) - ((r2 - r1) * 1000000000 / tsc_freq); + if (delta_ns < 0) + delta_ns = -delta_ns; + + /* 1% tolerance */ + TEST_ASSERT(delta_ns * 100 < (t2 - t1) * 100, + "Elapsed time does not match (MSR=%ld, TSC=%ld)", + (t2 - t1) * 100, (r2 - r1) * 1000000000 / tsc_freq); +} + +int main(void) +{ + struct kvm_vm *vm; + struct kvm_run *run; + struct ucall uc; + vm_vaddr_t tsc_page_gva; + int stage; + + vm = vm_create_default(VCPU_ID, 0, guest_main); + run = vcpu_state(vm, VCPU_ID); + + vcpu_set_hv_cpuid(vm, VCPU_ID); + + tsc_page_gva = vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0); + memset(addr_gpa2hva(vm, tsc_page_gva), 0x0, getpagesize()); + TEST_ASSERT((addr_gva2gpa(vm, tsc_page_gva) & (getpagesize() - 1)) == 0, + "TSC page has to be page aligned\n"); + vcpu_args_set(vm, VCPU_ID, 2, tsc_page_gva, addr_gva2gpa(vm, tsc_page_gva)); + + host_check_tsc_msr_rdtsc(vm); + + for (stage = 1;; stage++) { + _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Stage %d: unexpected exit reason: %u (%s),\n", + stage, run->exit_reason, + exit_reason_str(run->exit_reason)); + + switch (get_ucall(vm, VCPU_ID, &uc)) { + case UCALL_ABORT: + TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], + __FILE__, uc.args[1]); + /* NOT REACHED */ + case UCALL_SYNC: + break; + case UCALL_DONE: + /* Keep in sync with guest_main() */ + TEST_ASSERT(stage == 11, "Testing ended prematurely, stage %d\n", + stage); + goto out; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && + uc.args[1] == stage, + "Stage %d: Unexpected register values vmexit, got %lx", + stage, (ulong)uc.args[1]); + + /* Reset kvmclock triggering TSC page update */ + if (stage == 7 || stage == 8 || stage == 10) { + struct kvm_clock_data clock = {0}; + + vm_ioctl(vm, KVM_SET_CLOCK, &clock); + } + } + +out: + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c new file mode 100644 index 000000000000..12c558fc8074 --- /dev/null +++ 
b/tools/testing/selftests/kvm/x86_64/set_boot_cpu_id.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test that KVM_SET_BOOT_CPU_ID works as intended + * + * Copyright (C) 2020, Red Hat, Inc. + */ +#define _GNU_SOURCE /* for program_invocation_name */ +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" + +#define N_VCPU 2 +#define VCPU_ID0 0 +#define VCPU_ID1 1 + +static uint32_t get_bsp_flag(void) +{ + return rdmsr(MSR_IA32_APICBASE) & MSR_IA32_APICBASE_BSP; +} + +static void guest_bsp_vcpu(void *arg) +{ + GUEST_SYNC(1); + + GUEST_ASSERT(get_bsp_flag() != 0); + + GUEST_DONE(); +} + +static void guest_not_bsp_vcpu(void *arg) +{ + GUEST_SYNC(1); + + GUEST_ASSERT(get_bsp_flag() == 0); + + GUEST_DONE(); +} + +static void test_set_boot_busy(struct kvm_vm *vm) +{ + int res; + + res = _vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *) VCPU_ID0); + TEST_ASSERT(res == -1 && errno == EBUSY, + "KVM_SET_BOOT_CPU_ID set while running vm"); +} + +static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid) +{ + struct ucall uc; + int stage; + + for (stage = 0; stage < 2; stage++) { + + vcpu_run(vm, vcpuid); + + switch (get_ucall(vm, vcpuid, &uc)) { + case UCALL_SYNC: + TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") && + uc.args[1] == stage + 1, + "Stage %d: Unexpected register values vmexit, got %lx", + stage + 1, (ulong)uc.args[1]); + test_set_boot_busy(vm); + break; + case UCALL_DONE: + TEST_ASSERT(stage == 1, + "Expected GUEST_DONE in stage 2, got stage %d", + stage); + break; + case UCALL_ABORT: + TEST_ASSERT(false, "%s at %s:%ld\n\tvalues: %#lx, %#lx", + (const char *)uc.args[0], __FILE__, + uc.args[1], uc.args[2], uc.args[3]); + default: + TEST_ASSERT(false, "Unexpected exit: %s", + exit_reason_str(vcpu_state(vm, vcpuid)->exit_reason)); + } + } +} + +static struct kvm_vm *create_vm(void) +{ + struct kvm_vm *vm; + uint64_t vcpu_pages = (DEFAULT_STACK_PGS) * 2; + uint64_t extra_pg_pages = vcpu_pages / PTES_PER_MIN_PAGE * N_VCPU; + uint64_t pages = DEFAULT_GUEST_PHY_PAGES + vcpu_pages + extra_pg_pages; + + pages = vm_adjust_num_guest_pages(VM_MODE_DEFAULT, pages); + vm = vm_create(VM_MODE_DEFAULT, pages, O_RDWR); + + kvm_vm_elf_load(vm, program_invocation_name, 0, 0); + vm_create_irqchip(vm); + + return vm; +} + +static void add_x86_vcpu(struct kvm_vm *vm, uint32_t vcpuid, bool bsp_code) +{ + if (bsp_code) + vm_vcpu_add_default(vm, vcpuid, guest_bsp_vcpu); + else + vm_vcpu_add_default(vm, vcpuid, guest_not_bsp_vcpu); + + vcpu_set_cpuid(vm, vcpuid, kvm_get_supported_cpuid()); +} + +static void run_vm_bsp(uint32_t bsp_vcpu) +{ + struct kvm_vm *vm; + bool is_bsp_vcpu1 = bsp_vcpu == VCPU_ID1; + + vm = create_vm(); + + if (is_bsp_vcpu1) + vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *) VCPU_ID1); + + add_x86_vcpu(vm, VCPU_ID0, !is_bsp_vcpu1); + add_x86_vcpu(vm, VCPU_ID1, is_bsp_vcpu1); + + run_vcpu(vm, VCPU_ID0); + run_vcpu(vm, VCPU_ID1); + + kvm_vm_free(vm); +} + +static void check_set_bsp_busy(void) +{ + struct kvm_vm *vm; + int res; + + vm = create_vm(); + + add_x86_vcpu(vm, VCPU_ID0, true); + add_x86_vcpu(vm, VCPU_ID1, false); + + res = _vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *) VCPU_ID1); + TEST_ASSERT(res == -1 && errno == EBUSY, "KVM_SET_BOOT_CPU_ID set after adding vcpu"); + + run_vcpu(vm, VCPU_ID0); + run_vcpu(vm, VCPU_ID1); + + res = _vm_ioctl(vm, KVM_SET_BOOT_CPU_ID, (void *) VCPU_ID1); + TEST_ASSERT(res == -1 && errno == EBUSY, "KVM_SET_BOOT_CPU_ID set to a 
terminated vcpu"); + + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + if (!kvm_check_cap(KVM_CAP_SET_BOOT_CPU_ID)) { + print_skip("set_boot_cpu_id not available"); + return 0; + } + + run_vm_bsp(VCPU_ID0); + run_vm_bsp(VCPU_ID1); + run_vm_bsp(VCPU_ID0); + + check_set_bsp_busy(); +} diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 25f198bec0b2..f4242a961088 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -23,6 +23,8 @@ TEST_PROGS += drop_monitor_tests.sh TEST_PROGS += vrf_route_leaking.sh TEST_PROGS += bareudp.sh TEST_PROGS += unicast_extensions.sh +TEST_PROGS += udpgro_fwd.sh +TEST_PROGS += veth.sh TEST_PROGS_EXTENDED := in_netns.sh TEST_GEN_FILES = socket nettest TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any diff --git a/tools/testing/selftests/net/forwarding/dual_vxlan_bridge.sh b/tools/testing/selftests/net/forwarding/dual_vxlan_bridge.sh new file mode 100755 index 000000000000..5148d97a5df8 --- /dev/null +++ b/tools/testing/selftests/net/forwarding/dual_vxlan_bridge.sh @@ -0,0 +1,366 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# +--------------------+ +----------------------+ +# | H1 (vrf) | | H2 (vrf) | +# | + h1.10 | | + h2.20 | +# | | 192.0.2.1/28 | | | 192.0.2.2/28 | +# | | | | | | +# | + $h1 | | + $h2 | +# | | | | | | +# +----|---------------+ +--|-------------------+ +# | | +# +----|--------------------------------------------------|--------------------+ +# | SW | | | +# | +--|-------------------------------+ +----------------|------------------+ | +# | | + $swp1 BR1 (802.1ad) | | BR2 (802.1d) + $swp2 | | +# | | vid 100 pvid untagged | | | | | +# | | | | + $swp2.20 | | +# | | | | | | +# | | + vx100 (vxlan) | | + vx200 (vxlan) | | +# | | local 192.0.2.17 | | local 192.0.2.17 | | +# | | remote 192.0.2.34 | | remote 192.0.2.50 | | +# | | id 1000 dstport $VXPORT | | id 2000 dstport $VXPORT | | +# | | vid 100 pvid untagged | | | | +# | +--------------------------------- + +-----------------------------------+ | +# | | +# | 192.0.2.32/28 via 192.0.2.18 | +# | 192.0.2.48/28 via 192.0.2.18 | +# | | +# | + $rp1 | +# | | 192.0.2.17/28 | +# +----|-----------------------------------------------------------------------+ +# | +# +----|--------------------------------------------------------+ +# | | VRP2 (vrf) | +# | + $rp2 | +# | 192.0.2.18/28 | +# | | (maybe) HW +# ============================================================================= +# | | (likely) SW +# | + v1 (veth) + v3 (veth) | +# | | 192.0.2.33/28 | 192.0.2.49/28 | +# +----|---------------------------------------|----------------+ +# | | +# +----|------------------------------+ +----|------------------------------+ +# | + v2 (veth) NS1 (netns) | | + v4 (veth) NS2 (netns) | +# | 192.0.2.34/28 | | 192.0.2.50/28 | +# | | | | +# | 192.0.2.16/28 via 192.0.2.33 | | 192.0.2.16/28 via 192.0.2.49 | +# | 192.0.2.50/32 via 192.0.2.33 | | 192.0.2.34/32 via 192.0.2.49 | +# | | | | +# | +-------------------------------+ | | +-------------------------------+ | +# | | BR3 (802.1ad) | | | | BR3 (802.1d) | | +# | | + vx100 (vxlan) | | | | + vx200 (vxlan) | | +# | | local 192.0.2.34 | | | | local 192.0.2.50 | | +# | | remote 192.0.2.17 | | | | remote 192.0.2.17 | | +# | | remote 192.0.2.50 | | | | remote 192.0.2.34 | | +# | | id 1000 dstport $VXPORT | | | | id 2000 dstport $VXPORT | | +# | | vid 100 pvid untagged | | | | | | +# | | | | | | + w1.20 | | +# | | | | | | | | | +# | | + w1 (veth) | | | | + 
w1 (veth) | | +# | | | vid 100 pvid untagged | | | | | | | +# | +--|----------------------------+ | | +--|----------------------------+ | +# | | | | | | +# | +--|----------------------------+ | | +--|----------------------------+ | +# | | | VW2 (vrf) | | | | | VW2 (vrf) | | +# | | + w2 (veth) | | | | + w2 (veth) | | +# | | | | | | | | | | +# | | | | | | | | | | +# | | + w2.10 | | | | + w2.20 | | +# | | 192.0.2.3/28 | | | | 192.0.2.4/28 | | +# | +-------------------------------+ | | +-------------------------------+ | +# +-----------------------------------+ +-----------------------------------+ + +: ${VXPORT:=4789} +export VXPORT + +: ${ALL_TESTS:=" + ping_ipv4 + "} + +NUM_NETIFS=6 +source lib.sh + +h1_create() +{ + simple_if_init $h1 + tc qdisc add dev $h1 clsact + vlan_create $h1 10 v$h1 192.0.2.1/28 +} + +h1_destroy() +{ + vlan_destroy $h1 10 + tc qdisc del dev $h1 clsact + simple_if_fini $h1 +} + +h2_create() +{ + simple_if_init $h2 + tc qdisc add dev $h2 clsact + vlan_create $h2 20 v$h2 192.0.2.2/28 +} + +h2_destroy() +{ + vlan_destroy $h2 20 + tc qdisc del dev $h2 clsact + simple_if_fini $h2 +} + +rp1_set_addr() +{ + ip address add dev $rp1 192.0.2.17/28 + + ip route add 192.0.2.32/28 nexthop via 192.0.2.18 + ip route add 192.0.2.48/28 nexthop via 192.0.2.18 +} + +rp1_unset_addr() +{ + ip route del 192.0.2.48/28 nexthop via 192.0.2.18 + ip route del 192.0.2.32/28 nexthop via 192.0.2.18 + + ip address del dev $rp1 192.0.2.17/28 +} + +switch_create() +{ + #### BR1 #### + ip link add name br1 type bridge vlan_filtering 1 \ + vlan_protocol 802.1ad vlan_default_pvid 0 mcast_snooping 0 + # Make sure the bridge uses the MAC address of the local port and not + # that of the VxLAN's device. + ip link set dev br1 address $(mac_get $swp1) + ip link set dev br1 up + + #### BR2 #### + ip link add name br2 type bridge vlan_filtering 0 mcast_snooping 0 + # Make sure the bridge uses the MAC address of the local port and not + # that of the VxLAN's device. 
+ ip link set dev br2 address $(mac_get $swp2) + ip link set dev br2 up + + ip link set dev $rp1 up + rp1_set_addr + + #### VX100 #### + ip link add name vx100 type vxlan id 1000 local 192.0.2.17 \ + dstport "$VXPORT" nolearning noudpcsum tos inherit ttl 100 + ip link set dev vx100 up + + ip link set dev vx100 master br1 + bridge vlan add vid 100 dev vx100 pvid untagged + + ip link set dev $swp1 master br1 + ip link set dev $swp1 up + bridge vlan add vid 100 dev $swp1 pvid untagged + + #### VX200 #### + ip link add name vx200 type vxlan id 2000 local 192.0.2.17 \ + dstport "$VXPORT" nolearning noudpcsum tos inherit ttl 100 + ip link set dev vx200 up + + ip link set dev vx200 master br2 + + ip link set dev $swp2 up + ip link add name $swp2.20 link $swp2 type vlan id 20 + ip link set dev $swp2.20 master br2 + ip link set dev $swp2.20 up + + bridge fdb append dev vx100 00:00:00:00:00:00 dst 192.0.2.34 self + bridge fdb append dev vx200 00:00:00:00:00:00 dst 192.0.2.50 self +} + +switch_destroy() +{ + bridge fdb del dev vx200 00:00:00:00:00:00 dst 192.0.2.50 self + bridge fdb del dev vx100 00:00:00:00:00:00 dst 192.0.2.34 self + + ip link set dev vx200 nomaster + ip link set dev vx200 down + ip link del dev vx200 + + ip link del dev $swp2.20 + ip link set dev $swp2 down + ip link set dev $swp2 nomaster + + bridge vlan del vid 100 dev $swp1 + ip link set dev $swp1 down + ip link set dev $swp1 nomaster + + ip link set dev vx100 nomaster + ip link set dev vx100 down + ip link del dev vx100 + + rp1_unset_addr + ip link set dev $rp1 down + + ip link set dev br2 down + ip link del dev br2 + + ip link set dev br1 down + ip link del dev br1 +} + +vrp2_create() +{ + simple_if_init $rp2 192.0.2.18/28 + __simple_if_init v1 v$rp2 192.0.2.33/28 + __simple_if_init v3 v$rp2 192.0.2.49/28 + tc qdisc add dev v1 clsact +} + +vrp2_destroy() +{ + tc qdisc del dev v1 clsact + __simple_if_fini v3 192.0.2.49/28 + __simple_if_fini v1 192.0.2.33/28 + simple_if_fini $rp2 192.0.2.18/28 +} + +ns_init_common() +{ + local in_if=$1; shift + local in_addr=$1; shift + local other_in_addr=$1; shift + local vxlan_name=$1; shift + local vxlan_id=$1; shift + local vlan_id=$1; shift + local host_addr=$1; shift + local nh_addr=$1; shift + + ip link set dev $in_if up + ip address add dev $in_if $in_addr/28 + tc qdisc add dev $in_if clsact + + ip link add name br3 type bridge vlan_filtering 0 + ip link set dev br3 up + + ip link add name w1 type veth peer name w2 + + ip link set dev w1 master br3 + ip link set dev w1 up + + ip link add name $vxlan_name type vxlan id $vxlan_id local $in_addr \ + dstport "$VXPORT" + ip link set dev $vxlan_name up + bridge fdb append dev $vxlan_name 00:00:00:00:00:00 dst 192.0.2.17 self + bridge fdb append dev $vxlan_name 00:00:00:00:00:00 dst $other_in_addr self + + ip link set dev $vxlan_name master br3 + tc qdisc add dev $vxlan_name clsact + + simple_if_init w2 + vlan_create w2 $vlan_id vw2 $host_addr/28 + + ip route add 192.0.2.16/28 nexthop via $nh_addr + ip route add $other_in_addr/32 nexthop via $nh_addr +} +export -f ns_init_common + +ns1_create() +{ + ip netns add ns1 + ip link set dev v2 netns ns1 + in_ns ns1 \ + ns_init_common v2 192.0.2.34 192.0.2.50 vx100 1000 10 192.0.2.3 \ + 192.0.2.33 + + in_ns ns1 bridge vlan add vid 100 dev vx100 pvid untagged +} + +ns1_destroy() +{ + ip netns exec ns1 ip link set dev v2 netns 1 + ip netns del ns1 +} + +ns2_create() +{ + ip netns add ns2 + ip link set dev v4 netns ns2 + in_ns ns2 \ + ns_init_common v4 192.0.2.50 192.0.2.34 vx200 2000 20 192.0.2.4 \ + 
192.0.2.49 + + in_ns ns2 ip link add name w1.20 link w1 type vlan id 20 + in_ns ns2 ip link set dev w1.20 master br3 + in_ns ns2 ip link set dev w1.20 up +} + +ns2_destroy() +{ + ip netns exec ns2 ip link set dev v4 netns 1 + ip netns del ns2 +} + +setup_prepare() +{ + h1=${NETIFS[p1]} + swp1=${NETIFS[p2]} + + swp2=${NETIFS[p3]} + h2=${NETIFS[p4]} + + rp1=${NETIFS[p5]} + rp2=${NETIFS[p6]} + + vrf_prepare + forwarding_enable + + h1_create + h2_create + switch_create + + ip link add name v1 type veth peer name v2 + ip link add name v3 type veth peer name v4 + vrp2_create + ns1_create + ns2_create + + r1_mac=$(in_ns ns1 mac_get w2) + r2_mac=$(in_ns ns2 mac_get w2) + h2_mac=$(mac_get $h2) +} + +cleanup() +{ + pre_cleanup + + ns2_destroy + ns1_destroy + vrp2_destroy + ip link del dev v3 + ip link del dev v1 + + switch_destroy + h2_destroy + h1_destroy + + forwarding_restore + vrf_cleanup +} + +ping_ipv4() +{ + ping_test $h1 192.0.2.3 ": local->remote 1 through VxLAN with an 802.1ad bridge" + ping_test $h2 192.0.2.4 ": local->remote 2 through VxLAN with an 802.1d bridge" +} + +test_all() +{ + echo "Running tests with UDP port $VXPORT" + tests_run +} + +trap cleanup EXIT + +setup_prepare +setup_wait +test_all + +exit $EXIT_STATUS diff --git a/tools/testing/selftests/net/forwarding/fib_offload_lib.sh b/tools/testing/selftests/net/forwarding/fib_offload_lib.sh index 66496659bea7..e134a5f529c9 100644 --- a/tools/testing/selftests/net/forwarding/fib_offload_lib.sh +++ b/tools/testing/selftests/net/forwarding/fib_offload_lib.sh @@ -224,7 +224,7 @@ fib_ipv4_plen_test() ip -n $ns link set dev dummy1 up # Add two routes with the same key and different prefix length and - # make sure both are in hardware. It can be verfied that both are + # make sure both are in hardware. 
It can be verified that both are # sharing the same leaf by checking the /proc/net/fib_trie ip -n $ns route add 192.0.2.0/24 dev dummy1 ip -n $ns route add 192.0.2.0/25 dev dummy1 diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index be71012b8fc5..42e28c983d41 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -353,6 +353,11 @@ wait_for_offload() "$@" | grep -q offload } +wait_for_trap() +{ + "$@" | grep -q trap +} + until_counter_is() { local expr=$1; shift @@ -767,6 +772,15 @@ rate() echo $((8 * (t1 - t0) / interval)) } +packets_rate() +{ + local t0=$1; shift + local t1=$1; shift + local interval=$1; shift + + echo $(((t1 - t0) / interval)) +} + mac_get() { local if_name=$1 diff --git a/tools/testing/selftests/net/forwarding/tc_police.sh b/tools/testing/selftests/net/forwarding/tc_police.sh index 160f9cccdfb7..4f9f17cb45d6 100755 --- a/tools/testing/selftests/net/forwarding/tc_police.sh +++ b/tools/testing/selftests/net/forwarding/tc_police.sh @@ -35,6 +35,8 @@ ALL_TESTS=" police_shared_test police_rx_mirror_test police_tx_mirror_test + police_pps_rx_test + police_pps_tx_test " NUM_NETIFS=6 source tc_common.sh @@ -290,6 +292,60 @@ police_tx_mirror_test() police_mirror_common_test $rp2 egress "police tx and mirror" } +police_pps_common_test() +{ + local test_name=$1; shift + + RET=0 + + # Rule to measure bandwidth on ingress of $h2 + tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \ + dst_ip 198.51.100.1 ip_proto udp dst_port 54321 \ + action drop + + mausezahn $h1 -a own -b $(mac_get $rp1) -A 192.0.2.1 -B 198.51.100.1 \ + -t udp sp=12345,dp=54321 -p 1000 -c 0 -q & + + local t0=$(tc_rule_stats_get $h2 1 ingress .packets) + sleep 10 + local t1=$(tc_rule_stats_get $h2 1 ingress .packets) + + local er=$((2000)) + local nr=$(packets_rate $t0 $t1 10) + local nr_pct=$((100 * (nr - er) / er)) + ((-10 <= nr_pct && nr_pct <= 10)) + check_err $? "Expected rate $(humanize $er), got $(humanize $nr), which is $nr_pct% off. Required accuracy is +-10%." 
+ + log_test "$test_name" + + { kill %% && wait %%; } 2>/dev/null + tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower +} + +police_pps_rx_test() +{ + # Rule to police traffic destined to $h2 on ingress of $rp1 + tc filter add dev $rp1 ingress protocol ip pref 1 handle 101 flower \ + dst_ip 198.51.100.1 ip_proto udp dst_port 54321 \ + action police pkts_rate 2000 pkts_burst 400 conform-exceed drop/ok + + police_pps_common_test "police pps on rx" + + tc filter del dev $rp1 ingress protocol ip pref 1 handle 101 flower +} + +police_pps_tx_test() +{ + # Rule to police traffic destined to $h2 on egress of $rp2 + tc filter add dev $rp2 egress protocol ip pref 1 handle 101 flower \ + dst_ip 198.51.100.1 ip_proto udp dst_port 54321 \ + action police pkts_rate 2000 pkts_burst 400 conform-exceed drop/ok + + police_pps_common_test "police pps on tx" + + tc filter del dev $rp2 egress protocol ip pref 1 handle 101 flower +} + setup_prepare() { h1=${NETIFS[p1]} diff --git a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh index ce6bea9675c0..eb307ca37bfa 100755 --- a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh +++ b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh @@ -657,10 +657,21 @@ test_ecn_decap() { # In accordance with INET_ECN_decapsulate() __test_ecn_decap 00 00 0x00 + __test_ecn_decap 00 01 0x00 + __test_ecn_decap 00 02 0x00 + # 00 03 is tested in test_ecn_decap_error() + __test_ecn_decap 01 00 0x01 __test_ecn_decap 01 01 0x01 - __test_ecn_decap 02 01 0x02 + __test_ecn_decap 01 02 0x01 __test_ecn_decap 01 03 0x03 + __test_ecn_decap 02 00 0x02 + __test_ecn_decap 02 01 0x01 + __test_ecn_decap 02 02 0x02 __test_ecn_decap 02 03 0x03 + __test_ecn_decap 03 00 0x03 + __test_ecn_decap 03 01 0x03 + __test_ecn_decap 03 02 0x03 + __test_ecn_decap 03 03 0x03 test_ecn_decap_error } diff --git a/tools/testing/selftests/net/mptcp/diag.sh b/tools/testing/selftests/net/mptcp/diag.sh index 39edce4f541c..2674ba20d524 100755 --- a/tools/testing/selftests/net/mptcp/diag.sh +++ b/tools/testing/selftests/net/mptcp/diag.sh @@ -5,8 +5,9 @@ rndh=$(printf %x $sec)-$(mktemp -u XXXXXX) ns="ns1-$rndh" ksft_skip=4 test_cnt=1 +timeout_poll=100 +timeout_test=$((timeout_poll * 2 + 1)) ret=0 -pids=() flush_pids() { @@ -14,18 +15,14 @@ flush_pids() # give it some time sleep 1.1 - for pid in ${pids[@]}; do - [ -d /proc/$pid ] && kill -SIGUSR1 $pid >/dev/null 2>&1 - done - pids=() + ip netns pids "${ns}" | xargs --no-run-if-empty kill -SIGUSR1 &>/dev/null } cleanup() { + ip netns pids "${ns}" | xargs --no-run-if-empty kill -SIGKILL &>/dev/null + ip netns del $ns - for pid in ${pids[@]}; do - [ -d /proc/$pid ] && kill -9 $pid >/dev/null 2>&1 - done } ip -Version > /dev/null 2>&1 @@ -79,39 +76,57 @@ trap cleanup EXIT ip netns add $ns ip -n $ns link set dev lo up -echo "a" | ip netns exec $ns ./mptcp_connect -p 10000 -l 0.0.0.0 -t 100 >/dev/null & +echo "a" | \ + timeout ${timeout_test} \ + ip netns exec $ns \ + ./mptcp_connect -p 10000 -l -t ${timeout_poll} \ + 0.0.0.0 >/dev/null & sleep 0.1 -pids[0]=$! chk_msk_nr 0 "no msk on netns creation" -echo "b" | ip netns exec $ns ./mptcp_connect -p 10000 127.0.0.1 -j -t 100 >/dev/null & +echo "b" | \ + timeout ${timeout_test} \ + ip netns exec $ns \ + ./mptcp_connect -p 10000 -j -t ${timeout_poll} \ + 127.0.0.1 >/dev/null & sleep 0.1 -pids[1]=$! 
chk_msk_nr 2 "after MPC handshake " chk_msk_remote_key_nr 2 "....chk remote_key" chk_msk_fallback_nr 0 "....chk no fallback" flush_pids -echo "a" | ip netns exec $ns ./mptcp_connect -p 10001 -s TCP -l 0.0.0.0 -t 100 >/dev/null & -pids[0]=$! +echo "a" | \ + timeout ${timeout_test} \ + ip netns exec $ns \ + ./mptcp_connect -p 10001 -l -s TCP -t ${timeout_poll} \ + 0.0.0.0 >/dev/null & sleep 0.1 -echo "b" | ip netns exec $ns ./mptcp_connect -p 10001 127.0.0.1 -j -t 100 >/dev/null & -pids[1]=$! +echo "b" | \ + timeout ${timeout_test} \ + ip netns exec $ns \ + ./mptcp_connect -p 10001 -j -t ${timeout_poll} \ + 127.0.0.1 >/dev/null & sleep 0.1 chk_msk_fallback_nr 1 "check fallback" flush_pids NR_CLIENTS=100 for I in `seq 1 $NR_CLIENTS`; do - echo "a" | ip netns exec $ns ./mptcp_connect -p $((I+10001)) -l 0.0.0.0 -t 100 -w 10 >/dev/null & - pids[$((I*2))]=$! + echo "a" | \ + timeout ${timeout_test} \ + ip netns exec $ns \ + ./mptcp_connect -p $((I+10001)) -l -w 10 \ + -t ${timeout_poll} 0.0.0.0 >/dev/null & done sleep 0.1 for I in `seq 1 $NR_CLIENTS`; do - echo "b" | ip netns exec $ns ./mptcp_connect -p $((I+10001)) 127.0.0.1 -t 100 -w 10 >/dev/null & - pids[$((I*2 + 1))]=$! + echo "b" | \ + timeout ${timeout_test} \ + ip netns exec $ns \ + ./mptcp_connect -p $((I+10001)) -w 10 \ + -t ${timeout_poll} 127.0.0.1 >/dev/null & done sleep 1.5 diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index 77bb62feb872..69d89b5d666f 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -55,6 +55,7 @@ static int cfg_sndbuf; static int cfg_rcvbuf; static bool cfg_join; static bool cfg_remove; +static unsigned int cfg_do_w; static int cfg_wait; static void die_usage(void) @@ -272,8 +273,8 @@ static size_t do_rnd_write(const int fd, char *buf, const size_t len) if (cfg_join && first && do_w > 100) do_w = 100; - if (cfg_remove && do_w > 50) - do_w = 50; + if (cfg_remove && do_w > cfg_do_w) + do_w = cfg_do_w; bw = write(fd, buf, do_w); if (bw < 0) @@ -829,7 +830,7 @@ static void parse_opts(int argc, char **argv) { int c; - while ((c = getopt(argc, argv, "6jrlp:s:hut:m:S:R:w:")) != -1) { + while ((c = getopt(argc, argv, "6jr:lp:s:hut:m:S:R:w:")) != -1) { switch (c) { case 'j': cfg_join = true; @@ -840,6 +841,9 @@ static void parse_opts(int argc, char **argv) cfg_remove = true; cfg_mode = CFG_MODE_POLL; cfg_wait = 400000; + cfg_do_w = atoi(optarg); + if (cfg_do_w <= 0) + cfg_do_w = 50; break; case 'l': listen_mode = true; diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh index 10a030b53b23..385cdc98aed8 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh @@ -11,7 +11,8 @@ cin="" cout="" ksft_skip=4 capture=false -timeout=30 +timeout_poll=30 +timeout_test=$((timeout_poll * 2 + 1)) ipv6=true ethtool_random_on=true tc_delay="$((RANDOM%50))" @@ -273,7 +274,7 @@ check_mptcp_disabled() ip netns exec ${disabled_ns} sysctl -q net.mptcp.enabled=0 local err=0 - LANG=C ip netns exec ${disabled_ns} ./mptcp_connect -t $timeout -p 10000 -s MPTCP 127.0.0.1 < "$cin" 2>&1 | \ + LANG=C ip netns exec ${disabled_ns} ./mptcp_connect -p 10000 -s MPTCP 127.0.0.1 < "$cin" 2>&1 | \ grep -q "^socket: Protocol not available$" && err=1 ip netns delete ${disabled_ns} @@ -425,19 +426,32 @@ do_transfer() sleep 1 fi + NSTAT_HISTORY=/tmp/${listener_ns}.nstat ip netns exec ${listener_ns} 
\ + nstat -n + if [ ${listener_ns} != ${connector_ns} ]; then + NSTAT_HISTORY=/tmp/${connector_ns}.nstat ip netns exec ${connector_ns} \ + nstat -n + fi + local stat_synrx_last_l=$(get_mib_counter "${listener_ns}" "MPTcpExtMPCapableSYNRX") local stat_ackrx_last_l=$(get_mib_counter "${listener_ns}" "MPTcpExtMPCapableACKRX") local stat_cookietx_last=$(get_mib_counter "${listener_ns}" "TcpExtSyncookiesSent") local stat_cookierx_last=$(get_mib_counter "${listener_ns}" "TcpExtSyncookiesRecv") - ip netns exec ${listener_ns} ./mptcp_connect -t $timeout -l -p $port -s ${srv_proto} $extra_args $local_addr < "$sin" > "$sout" & + timeout ${timeout_test} \ + ip netns exec ${listener_ns} \ + ./mptcp_connect -t ${timeout_poll} -l -p $port -s ${srv_proto} \ + $extra_args $local_addr < "$sin" > "$sout" & local spid=$! wait_local_port_listen "${listener_ns}" "${port}" local start start=$(date +%s%3N) - ip netns exec ${connector_ns} ./mptcp_connect -t $timeout -p $port -s ${cl_proto} $extra_args $connect_addr < "$cin" > "$cout" & + timeout ${timeout_test} \ + ip netns exec ${connector_ns} \ + ./mptcp_connect -t ${timeout_poll} -p $port -s ${cl_proto} \ + $extra_args $connect_addr < "$cin" > "$cout" & local cpid=$! wait $cpid diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh index 191303b652a6..fd99485cf2a4 100755 --- a/tools/testing/selftests/net/mptcp/mptcp_join.sh +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -8,9 +8,11 @@ cin="" cinsent="" cout="" ksft_skip=4 -timeout=30 +timeout_poll=30 +timeout_test=$((timeout_poll * 2 + 1)) mptcp_connect="" capture=0 +do_all_tests=1 TEST_COUNT=0 @@ -76,6 +78,7 @@ cleanup_partial() for netns in "$ns1" "$ns2"; do ip netns del $netns + rm -f /tmp/$netns.{nstat,out} done } @@ -121,12 +124,6 @@ reset_with_add_addr_timeout() -j DROP } -for arg in "$@"; do - if [ "$arg" = "-c" ]; then - capture=1 - fi -done - ip -Version > /dev/null 2>&1 if [ $? -ne 0 ];then echo "SKIP: Could not run test without ip tool" @@ -237,10 +234,17 @@ do_transfer() sleep 1 fi + NSTAT_HISTORY=/tmp/${listener_ns}.nstat ip netns exec ${listener_ns} \ + nstat -n + NSTAT_HISTORY=/tmp/${connector_ns}.nstat ip netns exec ${connector_ns} \ + nstat -n + if [ $speed = "fast" ]; then mptcp_connect="./mptcp_connect -j" - else - mptcp_connect="./mptcp_connect -r" + elif [ $speed = "slow" ]; then + mptcp_connect="./mptcp_connect -r 50" + elif [ $speed = "least" ]; then + mptcp_connect="./mptcp_connect -r 10" fi local local_addr @@ -250,17 +254,26 @@ do_transfer() local_addr="0.0.0.0" fi - ip netns exec ${listener_ns} $mptcp_connect -t $timeout -l -p $port \ - -s ${srv_proto} ${local_addr} < "$sin" > "$sout" & + timeout ${timeout_test} \ + ip netns exec ${listener_ns} \ + $mptcp_connect -t ${timeout_poll} -l -p $port -s ${srv_proto} \ + ${local_addr} < "$sin" > "$sout" & spid=$! 
sleep 1 if [ "$test_link_fail" -eq 0 ];then - ip netns exec ${connector_ns} $mptcp_connect -t $timeout -p $port -s ${cl_proto} $connect_addr < "$cin" > "$cout" & + timeout ${timeout_test} \ + ip netns exec ${connector_ns} \ + $mptcp_connect -t ${timeout_poll} -p $port -s ${cl_proto} \ + $connect_addr < "$cin" > "$cout" & else - ( cat "$cin" ; sleep 2; link_failure $listener_ns ; cat "$cin" ) | tee "$cinsent" | \ - ip netns exec ${connector_ns} $mptcp_connect -t $timeout -p $port -s ${cl_proto} $connect_addr > "$cout" & + ( cat "$cin" ; sleep 2; link_failure $listener_ns ; cat "$cin" ) | \ + tee "$cinsent" | \ + timeout ${timeout_test} \ + ip netns exec ${connector_ns} \ + $mptcp_connect -t ${timeout_poll} -p $port -s ${cl_proto} \ + $connect_addr > "$cout" & fi cpid=$! @@ -297,9 +310,12 @@ do_transfer() let id+=1 done fi - else + elif [ $rm_nr_ns1 -eq 8 ]; then sleep 1 ip netns exec ${listener_ns} ./pm_nl_ctl flush + elif [ $rm_nr_ns1 -eq 9 ]; then + sleep 1 + ip netns exec ${listener_ns} ./pm_nl_ctl del 0 ${connect_addr} fi fi @@ -336,9 +352,18 @@ do_transfer() let id+=1 done fi - else + elif [ $rm_nr_ns2 -eq 8 ]; then sleep 1 ip netns exec ${connector_ns} ./pm_nl_ctl flush + elif [ $rm_nr_ns2 -eq 9 ]; then + local addr + if is_v6 "${connect_addr}"; then + addr="dead:beef:1::2" + else + addr="10.0.1.2" + fi + sleep 1 + ip netns exec ${connector_ns} ./pm_nl_ctl del 0 $addr fi fi @@ -364,12 +389,19 @@ do_transfer() kill $cappid fi + NSTAT_HISTORY=/tmp/${listener_ns}.nstat ip netns exec ${listener_ns} \ + nstat | grep Tcp > /tmp/${listener_ns}.out + NSTAT_HISTORY=/tmp/${connector_ns}.nstat ip netns exec ${connector_ns} \ + nstat | grep Tcp > /tmp/${connector_ns}.out + if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then echo " client exit code $retc, server $rets" 1>&2 echo -e "\nnetns ${listener_ns} socket stat for ${port}:" 1>&2 - ip netns exec ${listener_ns} ss -nita 1>&2 -o "sport = :$port" + ip netns exec ${listener_ns} ss -Menita 1>&2 -o "sport = :$port" + cat /tmp/${listener_ns}.out echo -e "\nnetns ${connector_ns} socket stat for ${port}:" 1>&2 - ip netns exec ${connector_ns} ss -nita 1>&2 -o "dport = :$port" + ip netns exec ${connector_ns} ss -Menita 1>&2 -o "dport = :$port" + cat /tmp/${connector_ns}.out cat "$capout" ret=1 @@ -745,6 +777,14 @@ subflows_tests() ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow run_tests $ns1 $ns2 10.0.1.1 chk_join_nr "multiple subflows, limited by server" 2 2 1 + + # single subflow, dev + reset + ip netns exec $ns1 ./pm_nl_ctl limits 0 1 + ip netns exec $ns2 ./pm_nl_ctl limits 0 1 + ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow dev ns2eth3 + run_tests $ns1 $ns2 10.0.1.1 + chk_join_nr "single subflow, dev" 1 1 1 } signal_address_tests() @@ -788,6 +828,28 @@ signal_address_tests() run_tests $ns1 $ns2 10.0.1.1 chk_join_nr "multiple subflows and signal" 3 3 3 chk_add_nr 1 1 + + # signal addresses + reset + ip netns exec $ns1 ./pm_nl_ctl limits 3 3 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.4.1 flags signal + ip netns exec $ns2 ./pm_nl_ctl limits 3 3 + run_tests $ns1 $ns2 10.0.1.1 + chk_join_nr "signal addresses" 3 3 3 + chk_add_nr 3 3 + + # signal invalid addresses + reset + ip netns exec $ns1 ./pm_nl_ctl limits 3 3 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.12.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.14.1 flags signal + ip netns exec $ns2 
./pm_nl_ctl limits 3 3 + run_tests $ns1 $ns2 10.0.1.1 + chk_join_nr "signal invalid addresses" 1 1 1 + chk_add_nr 3 3 } link_failure_tests() @@ -823,6 +885,26 @@ add_addr_timeout_tests() run_tests $ns1 $ns2 dead:beef:1::1 0 0 0 slow chk_join_nr "signal address, ADD_ADDR6 timeout" 1 1 1 chk_add_nr 4 0 + + # signal addresses timeout + reset_with_add_addr_timeout + ip netns exec $ns1 ./pm_nl_ctl limits 2 2 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal + ip netns exec $ns2 ./pm_nl_ctl limits 2 2 + run_tests $ns1 $ns2 10.0.1.1 0 0 0 least + chk_join_nr "signal addresses, ADD_ADDR timeout" 2 2 2 + chk_add_nr 8 0 + + # signal invalid addresses timeout + reset_with_add_addr_timeout + ip netns exec $ns1 ./pm_nl_ctl limits 2 2 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.12.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal + ip netns exec $ns2 ./pm_nl_ctl limits 2 2 + run_tests $ns1 $ns2 10.0.1.1 0 0 0 least + chk_join_nr "invalid address, ADD_ADDR timeout" 1 1 1 + chk_add_nr 8 0 } remove_tests() @@ -879,6 +961,30 @@ remove_tests() chk_add_nr 1 1 chk_rm_nr 2 2 + # addresses remove + reset + ip netns exec $ns1 ./pm_nl_ctl limits 3 3 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal id 250 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.4.1 flags signal + ip netns exec $ns2 ./pm_nl_ctl limits 3 3 + run_tests $ns1 $ns2 10.0.1.1 0 -3 0 slow + chk_join_nr "remove addresses" 3 3 3 + chk_add_nr 3 3 + chk_rm_nr 3 3 invert + + # invalid addresses remove + reset + ip netns exec $ns1 ./pm_nl_ctl limits 3 3 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.12.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.14.1 flags signal + ip netns exec $ns2 ./pm_nl_ctl limits 3 3 + run_tests $ns1 $ns2 10.0.1.1 0 -3 0 slow + chk_join_nr "remove invalid addresses" 1 1 1 + chk_add_nr 3 3 + chk_rm_nr 3 1 invert + # subflows and signal, flush reset ip netns exec $ns1 ./pm_nl_ctl limits 0 3 @@ -913,6 +1019,37 @@ remove_tests() chk_join_nr "flush addresses" 3 3 3 chk_add_nr 3 3 chk_rm_nr 3 3 invert + + # invalid addresses flush + reset + ip netns exec $ns1 ./pm_nl_ctl limits 3 3 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.12.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal + ip netns exec $ns1 ./pm_nl_ctl add 10.0.14.1 flags signal + ip netns exec $ns2 ./pm_nl_ctl limits 3 3 + run_tests $ns1 $ns2 10.0.1.1 0 -8 0 slow + chk_join_nr "flush invalid addresses" 1 1 1 + chk_add_nr 3 3 + chk_rm_nr 3 1 invert + + # remove id 0 subflow + reset + ip netns exec $ns1 ./pm_nl_ctl limits 0 1 + ip netns exec $ns2 ./pm_nl_ctl limits 0 1 + ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow + run_tests $ns1 $ns2 10.0.1.1 0 0 -9 slow + chk_join_nr "remove id 0 subflow" 1 1 1 + chk_rm_nr 1 1 + + # remove id 0 address + reset + ip netns exec $ns1 ./pm_nl_ctl limits 0 1 + ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal + ip netns exec $ns2 ./pm_nl_ctl limits 1 1 + run_tests $ns1 $ns2 10.0.1.1 0 -9 0 slow + chk_join_nr "remove id 0 address" 1 1 1 + chk_add_nr 1 1 + chk_rm_nr 1 1 invert } add_tests() @@ -1265,7 +1402,8 @@ usage() echo " -4 v4mapped_tests" echo " -b backup_tests" echo " -p add_addr_ports_tests" - echo " -c syncookies_tests" + echo " -k syncookies_tests" + echo " -c capture pcap files" echo " -h help" } @@ -1279,12 +1417,24 @@ make_file "$cin" "client" 1 make_file "$sin" "server" 1 trap 
cleanup EXIT -if [ -z $1 ]; then +for arg in "$@"; do + # check for "capture" arg before launching tests + if [[ "${arg}" =~ ^"-"[0-9a-zA-Z]*"c"[0-9a-zA-Z]*$ ]]; then + capture=1 + fi + + # exception for the capture option, the rest means: a part of the tests + if [ "${arg}" != "-c" ]; then + do_all_tests=0 + fi +done + +if [ $do_all_tests -eq 1 ]; then all_tests exit $ret fi -while getopts 'fsltra64bpch' opt; do +while getopts 'fsltra64bpkch' opt; do case $opt in f) subflows_tests @@ -1316,9 +1466,11 @@ while getopts 'fsltra64bpch' opt; do p) add_addr_ports_tests ;; - c) + k) syncookies_tests ;; + c) + ;; h | *) usage ;; diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh index a617e293734c..3c741abe034e 100755 --- a/tools/testing/selftests/net/mptcp/pm_netlink.sh +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -100,12 +100,12 @@ done check "ip netns exec $ns1 ./pm_nl_ctl get 9" "id 9 flags signal 10.0.1.9" "hard addr limit" check "ip netns exec $ns1 ./pm_nl_ctl get 10" "" "above hard addr limit" -for i in `seq 9 256`; do +ip netns exec $ns1 ./pm_nl_ctl del 9 +for i in `seq 10 255`; do + ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.9 id $i ip netns exec $ns1 ./pm_nl_ctl del $i - ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.9 id $((i+1)) done check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags 10.0.1.1 -id 2 flags 10.0.0.9 id 3 flags signal,backup 10.0.1.3 id 4 flags signal 10.0.1.4 id 5 flags signal 10.0.1.5 diff --git a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c index 7b4167f3f9a2..115decfdc1ef 100644 --- a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c +++ b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c @@ -26,7 +26,7 @@ static void syntax(char *argv[]) { fprintf(stderr, "%s add|get|set|del|flush|dump|accept [<args>]\n", argv[0]); fprintf(stderr, "\tadd [flags signal|subflow|backup] [id <nr>] [dev <name>] <ip>\n"); - fprintf(stderr, "\tdel <id>\n"); + fprintf(stderr, "\tdel <id> [<ip>]\n"); fprintf(stderr, "\tget <id>\n"); fprintf(stderr, "\tset <ip> [flags backup|nobackup]\n"); fprintf(stderr, "\tflush\n"); @@ -301,6 +301,7 @@ int del_addr(int fd, int pm_family, int argc, char *argv[]) 1024]; struct rtattr *rta, *nest; struct nlmsghdr *nh; + u_int16_t family; int nest_start; u_int8_t id; int off = 0; @@ -310,11 +311,14 @@ int del_addr(int fd, int pm_family, int argc, char *argv[]) off = init_genl_req(data, pm_family, MPTCP_PM_CMD_DEL_ADDR, MPTCP_PM_VER); - /* the only argument is the address id */ - if (argc != 3) + /* the only argument is the address id (nonzero) */ + if (argc != 3 && argc != 4) syntax(argv); id = atoi(argv[2]); + /* zero id with the IP address */ + if (!id && argc != 4) + syntax(argv); nest_start = off; nest = (void *)(data + off); @@ -328,6 +332,30 @@ int del_addr(int fd, int pm_family, int argc, char *argv[]) rta->rta_len = RTA_LENGTH(1); memcpy(RTA_DATA(rta), &id, 1); off += NLMSG_ALIGN(rta->rta_len); + + if (!id) { + /* addr data */ + rta = (void *)(data + off); + if (inet_pton(AF_INET, argv[3], RTA_DATA(rta))) { + family = AF_INET; + rta->rta_type = MPTCP_PM_ADDR_ATTR_ADDR4; + rta->rta_len = RTA_LENGTH(4); + } else if (inet_pton(AF_INET6, argv[3], RTA_DATA(rta))) { + family = AF_INET6; + rta->rta_type = MPTCP_PM_ADDR_ATTR_ADDR6; + rta->rta_len = RTA_LENGTH(16); + } else { + error(1, errno, "can't parse ip %s", argv[3]); + } + off += NLMSG_ALIGN(rta->rta_len); + + /* family */ + rta = (void *)(data + off); + rta->rta_type = 
MPTCP_PM_ADDR_ATTR_FAMILY; + rta->rta_len = RTA_LENGTH(2); + memcpy(RTA_DATA(rta), &family, 2); + off += NLMSG_ALIGN(rta->rta_len); + } nest->rta_len = off - nest_start; do_nl_req(fd, nh, off, 0); diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh index f039ee57eb3c..3aeef3bcb101 100755 --- a/tools/testing/selftests/net/mptcp/simult_flows.sh +++ b/tools/testing/selftests/net/mptcp/simult_flows.sh @@ -7,7 +7,8 @@ ns2="ns2-$rndh" ns3="ns3-$rndh" capture=false ksft_skip=4 -timeout=30 +timeout_poll=30 +timeout_test=$((timeout_poll * 2 + 1)) test_cnt=1 ret=0 bail=0 @@ -157,14 +158,20 @@ do_transfer() sleep 1 fi - ip netns exec ${ns3} ./mptcp_connect -jt $timeout -l -p $port 0.0.0.0 < "$sin" > "$sout" & + timeout ${timeout_test} \ + ip netns exec ${ns3} \ + ./mptcp_connect -jt ${timeout_poll} -l -p $port \ + 0.0.0.0 < "$sin" > "$sout" & local spid=$! wait_local_port_listen "${ns3}" "${port}" local start start=$(date +%s%3N) - ip netns exec ${ns1} ./mptcp_connect -jt $timeout -p $port 10.0.3.3 < "$cin" > "$cout" & + timeout ${timeout_test} \ + ip netns exec ${ns1} \ + ./mptcp_connect -jt ${timeout_poll} -p $port \ + 10.0.3.3 < "$cin" > "$cout" & local cpid=$! wait $cpid diff --git a/tools/testing/selftests/net/reuseaddr_ports_exhausted.c b/tools/testing/selftests/net/reuseaddr_ports_exhausted.c index 7b01b7c2ec10..066efd30e294 100644 --- a/tools/testing/selftests/net/reuseaddr_ports_exhausted.c +++ b/tools/testing/selftests/net/reuseaddr_ports_exhausted.c @@ -30,25 +30,25 @@ struct reuse_opts { }; struct reuse_opts unreusable_opts[12] = { - {0, 0, 0, 0}, - {0, 0, 0, 1}, - {0, 0, 1, 0}, - {0, 0, 1, 1}, - {0, 1, 0, 0}, - {0, 1, 0, 1}, - {0, 1, 1, 0}, - {0, 1, 1, 1}, - {1, 0, 0, 0}, - {1, 0, 0, 1}, - {1, 0, 1, 0}, - {1, 0, 1, 1}, + {{0, 0}, {0, 0}}, + {{0, 0}, {0, 1}}, + {{0, 0}, {1, 0}}, + {{0, 0}, {1, 1}}, + {{0, 1}, {0, 0}}, + {{0, 1}, {0, 1}}, + {{0, 1}, {1, 0}}, + {{0, 1}, {1, 1}}, + {{1, 0}, {0, 0}}, + {{1, 0}, {0, 1}}, + {{1, 0}, {1, 0}}, + {{1, 0}, {1, 1}}, }; struct reuse_opts reusable_opts[4] = { - {1, 1, 0, 0}, - {1, 1, 0, 1}, - {1, 1, 1, 0}, - {1, 1, 1, 1}, + {{1, 1}, {0, 0}}, + {{1, 1}, {0, 1}}, + {{1, 1}, {1, 0}}, + {{1, 1}, {1, 1}}, }; int bind_port(struct __test_metadata *_metadata, int reuseaddr, int reuseport) diff --git a/tools/testing/selftests/net/so_txtime.c b/tools/testing/selftests/net/so_txtime.c index b4cca382d125..59067f64b775 100644 --- a/tools/testing/selftests/net/so_txtime.c +++ b/tools/testing/selftests/net/so_txtime.c @@ -2,9 +2,12 @@ /* * Test the SO_TXTIME API * - * Takes two streams of { payload, delivery time }[], one input and one output. - * Sends the input stream and verifies arrival matches the output stream. - * The two streams can differ due to out-of-order delivery and drops. + * Takes a stream of { payload, delivery time }[], to be sent across two + * processes. Start this program on two separate network namespaces or + * connected hosts, one instance in transmit mode and the other in receive + * mode using the '-r' option. Receiver will compare arrival timestamps to + * the expected stream. Sender will read transmit timestamps from the error + * queue. The streams can differ due to out-of-order delivery and drops. 
*/ #define _GNU_SOURCE @@ -28,14 +31,17 @@ #include <sys/types.h> #include <time.h> #include <unistd.h> +#include <poll.h> static int cfg_clockid = CLOCK_TAI; -static bool cfg_do_ipv4; -static bool cfg_do_ipv6; static uint16_t cfg_port = 8000; static int cfg_variance_us = 4000; +static uint64_t cfg_start_time_ns; +static int cfg_mark; +static bool cfg_rx; static uint64_t glob_tstart; +static uint64_t tdeliver_max; /* encode one timed transmission (of a 1B payload) */ struct timed_send { @@ -44,18 +50,21 @@ struct timed_send { }; #define MAX_NUM_PKT 8 -static struct timed_send cfg_in[MAX_NUM_PKT]; -static struct timed_send cfg_out[MAX_NUM_PKT]; +static struct timed_send cfg_buf[MAX_NUM_PKT]; static int cfg_num_pkt; static int cfg_errq_level; static int cfg_errq_type; -static uint64_t gettime_ns(void) +static struct sockaddr_storage cfg_dst_addr; +static struct sockaddr_storage cfg_src_addr; +static socklen_t cfg_alen; + +static uint64_t gettime_ns(clockid_t clock) { struct timespec ts; - if (clock_gettime(cfg_clockid, &ts)) + if (clock_gettime(clock, &ts)) error(1, errno, "gettime"); return ts.tv_sec * (1000ULL * 1000 * 1000) + ts.tv_nsec; @@ -75,6 +84,8 @@ static void do_send_one(int fdt, struct timed_send *ts) msg.msg_iov = &iov; msg.msg_iovlen = 1; + msg.msg_name = (struct sockaddr *)&cfg_dst_addr; + msg.msg_namelen = cfg_alen; if (ts->delay_us >= 0) { memset(control, 0, sizeof(control)); @@ -82,6 +93,8 @@ static void do_send_one(int fdt, struct timed_send *ts) msg.msg_controllen = sizeof(control); tdeliver = glob_tstart + ts->delay_us * 1000; + tdeliver_max = tdeliver_max > tdeliver ? + tdeliver_max : tdeliver; cm = CMSG_FIRSTHDR(&msg); cm->cmsg_level = SOL_SOCKET; @@ -98,7 +111,7 @@ static void do_send_one(int fdt, struct timed_send *ts) } -static bool do_recv_one(int fdr, struct timed_send *ts) +static void do_recv_one(int fdr, struct timed_send *ts) { int64_t tstop, texpect; char rbuf[2]; @@ -106,13 +119,13 @@ static bool do_recv_one(int fdr, struct timed_send *ts) ret = recv(fdr, rbuf, sizeof(rbuf), 0); if (ret == -1 && errno == EAGAIN) - return true; + error(1, EAGAIN, "recv: timeout"); if (ret == -1) error(1, errno, "read"); if (ret != 1) error(1, 0, "read: %dB", ret); - tstop = (gettime_ns() - glob_tstart) / 1000; + tstop = (gettime_ns(cfg_clockid) - glob_tstart) / 1000; texpect = ts->delay_us >= 0 ? 
ts->delay_us : 0; fprintf(stderr, "payload:%c delay:%lld expected:%lld (us)\n", @@ -123,8 +136,6 @@ static bool do_recv_one(int fdr, struct timed_send *ts) if (llabs(tstop - texpect) > cfg_variance_us) error(1, 0, "exceeds variance (%d us)", cfg_variance_us); - - return false; } static void do_recv_verify_empty(int fdr) @@ -137,18 +148,18 @@ static void do_recv_verify_empty(int fdr) error(1, 0, "recv: not empty as expected (%d, %d)", ret, errno); } -static void do_recv_errqueue_timeout(int fdt) +static int do_recv_errqueue_timeout(int fdt) { char control[CMSG_SPACE(sizeof(struct sock_extended_err)) + CMSG_SPACE(sizeof(struct sockaddr_in6))] = {0}; char data[sizeof(struct ethhdr) + sizeof(struct ipv6hdr) + sizeof(struct udphdr) + 1]; struct sock_extended_err *err; + int ret, num_tstamp = 0; struct msghdr msg = {0}; struct iovec iov = {0}; struct cmsghdr *cm; int64_t tstamp = 0; - int ret; iov.iov_base = data; iov.iov_len = sizeof(data); @@ -206,9 +217,47 @@ static void do_recv_errqueue_timeout(int fdt) msg.msg_flags = 0; msg.msg_controllen = sizeof(control); + num_tstamp++; } - error(1, 0, "recv: timeout"); + return num_tstamp; +} + +static void recv_errqueue_msgs(int fdt) +{ + struct pollfd pfd = { .fd = fdt, .events = POLLERR }; + const int timeout_ms = 10; + int ret, num_tstamp = 0; + + do { + ret = poll(&pfd, 1, timeout_ms); + if (ret == -1) + error(1, errno, "poll"); + + if (ret && (pfd.revents & POLLERR)) + num_tstamp += do_recv_errqueue_timeout(fdt); + + if (num_tstamp == cfg_num_pkt) + break; + + } while (gettime_ns(cfg_clockid) < tdeliver_max); +} + +static void start_time_wait(void) +{ + uint64_t now; + int err; + + if (!cfg_start_time_ns) + return; + + now = gettime_ns(CLOCK_REALTIME); + if (cfg_start_time_ns < now) + return; + + err = usleep((cfg_start_time_ns - now) / 1000); + if (err) + error(1, errno, "usleep"); } static void setsockopt_txtime(int fd) @@ -245,6 +294,10 @@ static int setup_tx(struct sockaddr *addr, socklen_t alen) setsockopt_txtime(fd); + if (cfg_mark && + setsockopt(fd, SOL_SOCKET, SO_MARK, &cfg_mark, sizeof(cfg_mark))) + error(1, errno, "setsockopt mark"); + return fd; } @@ -266,31 +319,70 @@ static int setup_rx(struct sockaddr *addr, socklen_t alen) return fd; } -static void do_test(struct sockaddr *addr, socklen_t alen) +static void do_test_tx(struct sockaddr *addr, socklen_t alen) { - int fdt, fdr, i; + int fdt, i; fprintf(stderr, "\nSO_TXTIME ipv%c clock %s\n", addr->sa_family == PF_INET ? '4' : '6', cfg_clockid == CLOCK_TAI ? 
"tai" : "monotonic"); fdt = setup_tx(addr, alen); - fdr = setup_rx(addr, alen); - glob_tstart = gettime_ns(); + start_time_wait(); + glob_tstart = gettime_ns(cfg_clockid); for (i = 0; i < cfg_num_pkt; i++) - do_send_one(fdt, &cfg_in[i]); + do_send_one(fdt, &cfg_buf[i]); + + recv_errqueue_msgs(fdt); + + if (close(fdt)) + error(1, errno, "close t"); +} + +static void do_test_rx(struct sockaddr *addr, socklen_t alen) +{ + int fdr, i; + + fdr = setup_rx(addr, alen); + + start_time_wait(); + glob_tstart = gettime_ns(cfg_clockid); + for (i = 0; i < cfg_num_pkt; i++) - if (do_recv_one(fdr, &cfg_out[i])) - do_recv_errqueue_timeout(fdt); + do_recv_one(fdr, &cfg_buf[i]); do_recv_verify_empty(fdr); if (close(fdr)) error(1, errno, "close r"); - if (close(fdt)) - error(1, errno, "close t"); +} + +static void setup_sockaddr(int domain, const char *str_addr, + struct sockaddr_storage *sockaddr) +{ + struct sockaddr_in6 *addr6 = (void *) sockaddr; + struct sockaddr_in *addr4 = (void *) sockaddr; + + switch (domain) { + case PF_INET: + memset(addr4, 0, sizeof(*addr4)); + addr4->sin_family = AF_INET; + addr4->sin_port = htons(cfg_port); + if (str_addr && + inet_pton(AF_INET, str_addr, &(addr4->sin_addr)) != 1) + error(1, 0, "ipv4 parse error: %s", str_addr); + break; + case PF_INET6: + memset(addr6, 0, sizeof(*addr6)); + addr6->sin6_family = AF_INET6; + addr6->sin6_port = htons(cfg_port); + if (str_addr && + inet_pton(AF_INET6, str_addr, &(addr6->sin6_addr)) != 1) + error(1, 0, "ipv6 parse error: %s", str_addr); + break; + } } static int parse_io(const char *optarg, struct timed_send *array) @@ -323,17 +415,46 @@ static int parse_io(const char *optarg, struct timed_send *array) return aoff / 2; } +static void usage(const char *progname) +{ + fprintf(stderr, "\nUsage: %s [options] <payload>\n" + "Options:\n" + " -4 only IPv4\n" + " -6 only IPv6\n" + " -c <clock> monotonic (default) or tai\n" + " -D <addr> destination IP address (server)\n" + " -S <addr> source IP address (client)\n" + " -r run rx mode\n" + " -t <nsec> start time (UTC nanoseconds)\n" + " -m <mark> socket mark\n" + "\n", + progname); + exit(1); +} + static void parse_opts(int argc, char **argv) { - int c, ilen, olen; + char *daddr = NULL, *saddr = NULL; + int domain = PF_UNSPEC; + int c; - while ((c = getopt(argc, argv, "46c:")) != -1) { + while ((c = getopt(argc, argv, "46c:S:D:rt:m:")) != -1) { switch (c) { case '4': - cfg_do_ipv4 = true; + if (domain != PF_UNSPEC) + error(1, 0, "Pass one of -4 or -6"); + domain = PF_INET; + cfg_alen = sizeof(struct sockaddr_in); + cfg_errq_level = SOL_IP; + cfg_errq_type = IP_RECVERR; break; case '6': - cfg_do_ipv6 = true; + if (domain != PF_UNSPEC) + error(1, 0, "Pass one of -4 or -6"); + domain = PF_INET6; + cfg_alen = sizeof(struct sockaddr_in6); + cfg_errq_level = SOL_IPV6; + cfg_errq_type = IPV6_RECVERR; break; case 'c': if (!strcmp(optarg, "tai")) @@ -344,50 +465,50 @@ static void parse_opts(int argc, char **argv) else error(1, 0, "unknown clock id %s", optarg); break; + case 'S': + saddr = optarg; + break; + case 'D': + daddr = optarg; + break; + case 'r': + cfg_rx = true; + break; + case 't': + cfg_start_time_ns = strtol(optarg, NULL, 0); + break; + case 'm': + cfg_mark = strtol(optarg, NULL, 0); + break; default: - error(1, 0, "parse error at %d", optind); + usage(argv[0]); } } - if (argc - optind != 2) - error(1, 0, "Usage: %s [-46] -c <clock> <in> <out>", argv[0]); + if (argc - optind != 1) + usage(argv[0]); + + if (domain == PF_UNSPEC) + error(1, 0, "Pass one of -4 or -6"); + if (!daddr) + 
error(1, 0, "-D <server addr> required\n"); + if (!cfg_rx && !saddr) + error(1, 0, "-S <client addr> required\n"); - ilen = parse_io(argv[optind], cfg_in); - olen = parse_io(argv[optind + 1], cfg_out); - if (ilen != olen) - error(1, 0, "i/o streams len mismatch (%d, %d)\n", ilen, olen); - cfg_num_pkt = ilen; + setup_sockaddr(domain, daddr, &cfg_dst_addr); + setup_sockaddr(domain, saddr, &cfg_src_addr); + + cfg_num_pkt = parse_io(argv[optind], cfg_buf); } int main(int argc, char **argv) { parse_opts(argc, argv); - if (cfg_do_ipv6) { - struct sockaddr_in6 addr6 = {0}; - - addr6.sin6_family = AF_INET6; - addr6.sin6_port = htons(cfg_port); - addr6.sin6_addr = in6addr_loopback; - - cfg_errq_level = SOL_IPV6; - cfg_errq_type = IPV6_RECVERR; - - do_test((void *)&addr6, sizeof(addr6)); - } - - if (cfg_do_ipv4) { - struct sockaddr_in addr4 = {0}; - - addr4.sin_family = AF_INET; - addr4.sin_port = htons(cfg_port); - addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK); - - cfg_errq_level = SOL_IP; - cfg_errq_type = IP_RECVERR; - - do_test((void *)&addr4, sizeof(addr4)); - } + if (cfg_rx) + do_test_rx((void *)&cfg_dst_addr, cfg_alen); + else + do_test_tx((void *)&cfg_src_addr, cfg_alen); return 0; } diff --git a/tools/testing/selftests/net/so_txtime.sh b/tools/testing/selftests/net/so_txtime.sh index 3f7800eaecb1..3f06f4d286a9 100755 --- a/tools/testing/selftests/net/so_txtime.sh +++ b/tools/testing/selftests/net/so_txtime.sh @@ -3,32 +3,85 @@ # # Regression tests for the SO_TXTIME interface -# Run in network namespace -if [[ $# -eq 0 ]]; then - if ! ./in_netns.sh $0 __subprocess; then - # test is time sensitive, can be flaky - echo "test failed: retry once" - ./in_netns.sh $0 __subprocess +set -e + +readonly DEV="veth0" +readonly BIN="./so_txtime" + +readonly RAND="$(mktemp -u XXXXXX)" +readonly NSPREFIX="ns-${RAND}" +readonly NS1="${NSPREFIX}1" +readonly NS2="${NSPREFIX}2" + +readonly SADDR4='192.168.1.1' +readonly DADDR4='192.168.1.2' +readonly SADDR6='fd::1' +readonly DADDR6='fd::2' + +cleanup() { + ip netns del "${NS2}" + ip netns del "${NS1}" +} + +trap cleanup EXIT + +# Create virtual ethernet pair between network namespaces +ip netns add "${NS1}" +ip netns add "${NS2}" + +ip link add "${DEV}" netns "${NS1}" type veth \ + peer name "${DEV}" netns "${NS2}" + +# Bring the devices up +ip -netns "${NS1}" link set "${DEV}" up +ip -netns "${NS2}" link set "${DEV}" up + +# Set fixed MAC addresses on the devices +ip -netns "${NS1}" link set dev "${DEV}" address 02:02:02:02:02:02 +ip -netns "${NS2}" link set dev "${DEV}" address 06:06:06:06:06:06 + +# Add fixed IP addresses to the devices +ip -netns "${NS1}" addr add 192.168.1.1/24 dev "${DEV}" +ip -netns "${NS2}" addr add 192.168.1.2/24 dev "${DEV}" +ip -netns "${NS1}" addr add fd::1/64 dev "${DEV}" nodad +ip -netns "${NS2}" addr add fd::2/64 dev "${DEV}" nodad + +do_test() { + local readonly IP="$1" + local readonly CLOCK="$2" + local readonly TXARGS="$3" + local readonly RXARGS="$4" + + if [[ "${IP}" == "4" ]]; then + local readonly SADDR="${SADDR4}" + local readonly DADDR="${DADDR4}" + elif [[ "${IP}" == "6" ]]; then + local readonly SADDR="${SADDR6}" + local readonly DADDR="${DADDR6}" + else + echo "Invalid IP version ${IP}" + exit 1 fi - exit $? -fi + local readonly START="$(date +%s%N --date="+ 0.1 seconds")" + ip netns exec "${NS2}" "${BIN}" -"${IP}" -c "${CLOCK}" -t "${START}" -S "${SADDR}" -D "${DADDR}" "${RXARGS}" -r & + ip netns exec "${NS1}" "${BIN}" -"${IP}" -c "${CLOCK}" -t "${START}" -S "${SADDR}" -D "${DADDR}" "${TXARGS}" + wait "$!" 
+} -set -e +ip netns exec "${NS1}" tc qdisc add dev "${DEV}" root fq +do_test 4 mono a,-1 a,-1 +do_test 6 mono a,0 a,0 +do_test 6 mono a,10 a,10 +do_test 4 mono a,10,b,20 a,10,b,20 +do_test 6 mono a,20,b,10 b,20,a,20 -tc qdisc add dev lo root fq -./so_txtime -4 -6 -c mono a,-1 a,-1 -./so_txtime -4 -6 -c mono a,0 a,0 -./so_txtime -4 -6 -c mono a,10 a,10 -./so_txtime -4 -6 -c mono a,10,b,20 a,10,b,20 -./so_txtime -4 -6 -c mono a,20,b,10 b,20,a,20 - -if tc qdisc replace dev lo root etf clockid CLOCK_TAI delta 400000; then - ! ./so_txtime -4 -6 -c tai a,-1 a,-1 - ! ./so_txtime -4 -6 -c tai a,0 a,0 - ./so_txtime -4 -6 -c tai a,10 a,10 - ./so_txtime -4 -6 -c tai a,10,b,20 a,10,b,20 - ./so_txtime -4 -6 -c tai a,20,b,10 b,10,a,20 +if ip netns exec "${NS1}" tc qdisc replace dev "${DEV}" root etf clockid CLOCK_TAI delta 400000; then + ! do_test 4 tai a,-1 a,-1 + ! do_test 6 tai a,0 a,0 + do_test 6 tai a,10 a,10 + do_test 4 tai a,10,b,20 a,10,b,20 + do_test 6 tai a,20,b,10 b,10,a,20 else echo "tc ($(tc -V)) does not support qdisc etf. skipping" fi diff --git a/tools/testing/selftests/net/udpgro_fwd.sh b/tools/testing/selftests/net/udpgro_fwd.sh new file mode 100755 index 000000000000..a8fa64136282 --- /dev/null +++ b/tools/testing/selftests/net/udpgro_fwd.sh @@ -0,0 +1,251 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +readonly BASE="ns-$(mktemp -u XXXXXX)" +readonly SRC=2 +readonly DST=1 +readonly DST_NAT=100 +readonly NS_SRC=$BASE$SRC +readonly NS_DST=$BASE$DST + +# "baremetal" network used for raw UDP traffic +readonly BM_NET_V4=192.168.1. +readonly BM_NET_V6=2001:db8:: + +# "overlay" network used for UDP over UDP tunnel traffic +readonly OL_NET_V4=172.16.1. +readonly OL_NET_V6=2001:db8:1:: +readonly NPROCS=`nproc` + +cleanup() { + local ns + local -r jobs="$(jobs -p)" + [ -n "${jobs}" ] && kill -1 ${jobs} 2>/dev/null + + for ns in $NS_SRC $NS_DST; do + ip netns del $ns 2>/dev/null + done +} + +trap cleanup EXIT + +create_ns() { + local net + local ns + + for ns in $NS_SRC $NS_DST; do + ip netns add $ns + ip -n $ns link set dev lo up + done + + ip link add name veth$SRC type veth peer name veth$DST + + for ns in $SRC $DST; do + ip link set dev veth$ns netns $BASE$ns + ip -n $BASE$ns link set dev veth$ns up + ip -n $BASE$ns addr add dev veth$ns $BM_NET_V4$ns/24 + ip -n $BASE$ns addr add dev veth$ns $BM_NET_V6$ns/64 nodad + done + ip -n $NS_DST link set veth$DST xdp object ../bpf/xdp_dummy.o section xdp_dummy 2>/dev/null +} + +create_vxlan_endpoint() { + local -r netns=$1 + local -r bm_dev=$2 + local -r bm_rem_addr=$3 + local -r vxlan_dev=$4 + local -r vxlan_id=$5 + local -r vxlan_port=4789 + + ip -n $netns link set dev $bm_dev up + ip -n $netns link add dev $vxlan_dev type vxlan id $vxlan_id \ + dstport $vxlan_port remote $bm_rem_addr + ip -n $netns link set dev $vxlan_dev up +} + +create_vxlan_pair() { + local ns + + create_ns + + for ns in $SRC $DST; do + # note that 3 - $SRC == $DST and 3 - $DST == $SRC + create_vxlan_endpoint $BASE$ns veth$ns $BM_NET_V4$((3 - $ns)) vxlan$ns 4 + ip -n $BASE$ns addr add dev vxlan$ns $OL_NET_V4$ns/24 + done + for ns in $SRC $DST; do + create_vxlan_endpoint $BASE$ns veth$ns $BM_NET_V6$((3 - $ns)) vxlan6$ns 6 + ip -n $BASE$ns addr add dev vxlan6$ns $OL_NET_V6$ns/24 nodad + done +} + +is_ipv6() { + if [[ $1 =~ .*:.* ]]; then + return 0 + fi + return 1 +} + +run_test() { + local -r msg=$1 + local -r dst=$2 + local -r pkts=$3 + local -r vxpkts=$4 + local bind=$5 + local rx_args="" + local rx_family="-4" + local family=-4 + local filter=IpInReceives + local 
ipt=iptables + + printf "%-40s" "$msg" + + if is_ipv6 $dst; then + # rx program does not support '-6' and implies ipv6 usage by default + rx_family="" + family=-6 + filter=Ip6InReceives + ipt=ip6tables + fi + + rx_args="$rx_family" + [ -n "$bind" ] && rx_args="$rx_args -b $bind" + + # send a single GSO packet, segmented in 10 UDP frames. + # Always expect 10 UDP frames on RX side as rx socket does + # not enable GRO + ip netns exec $NS_DST $ipt -A INPUT -p udp --dport 4789 + ip netns exec $NS_DST $ipt -A INPUT -p udp --dport 8000 + ip netns exec $NS_DST ./udpgso_bench_rx -C 1000 -R 10 -n 10 -l 1300 $rx_args & + local spid=$! + sleep 0.1 + ip netns exec $NS_SRC ./udpgso_bench_tx $family -M 1 -s 13000 -S 1300 -D $dst + local retc=$? + wait $spid + local rets=$? + if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then + echo " fail client exit code $retc, server $rets" + ret=1 + return + fi + + local rcv=`ip netns exec $NS_DST $ipt"-save" -c | grep 'dport 8000' | \ + sed -e 's/\[//' -e 's/:.*//'` + if [ $rcv != $pkts ]; then + echo " fail - received $rcv packets, expected $pkts" + ret=1 + return + fi + + local vxrcv=`ip netns exec $NS_DST $ipt"-save" -c | grep 'dport 4789' | \ + sed -e 's/\[//' -e 's/:.*//'` + + # upper net can generate a little noise, allow some tolerance + if [ $vxrcv -lt $vxpkts -o $vxrcv -gt $((vxpkts + 3)) ]; then + echo " fail - received $vxrcv vxlan packets, expected $vxpkts" + ret=1 + return + fi + echo " ok" +} + +run_bench() { + local -r msg=$1 + local -r dst=$2 + local family=-4 + + printf "%-40s" "$msg" + if [ $NPROCS -lt 2 ]; then + echo " skip - needed 2 CPUs found $NPROCS" + return + fi + + is_ipv6 $dst && family=-6 + + # bind the sender and the receiver to different CPUs to try to + # get reproducible results + ip netns exec $NS_DST bash -c "echo 2 > /sys/class/net/veth$DST/queues/rx-0/rps_cpus" + ip netns exec $NS_DST taskset 0x2 ./udpgso_bench_rx -C 1000 -R 10 & + local spid=$! + sleep 0.1 + ip netns exec $NS_SRC taskset 0x1 ./udpgso_bench_tx $family -l 3 -S 1300 -D $dst + local retc=$? + wait $spid + local rets=$? + if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then + echo " fail client exit code $retc, server $rets" + ret=1 + return + fi +} + +for family in 4 6; do + BM_NET=$BM_NET_V4 + OL_NET=$OL_NET_V4 + IPT=iptables + SUFFIX=24 + VXDEV=vxlan + + if [ $family = 6 ]; then + BM_NET=$BM_NET_V6 + OL_NET=$OL_NET_V6 + SUFFIX="64 nodad" + VXDEV=vxlan6 + IPT=ip6tables + fi + + echo "IPv$family" + + create_ns + run_test "No GRO" $BM_NET$DST 10 0 + cleanup + + create_ns + ip netns exec $NS_DST ethtool -K veth$DST rx-gro-list on + run_test "GRO frag list" $BM_NET$DST 1 0 + cleanup + + # UDP GRO fwd skips aggregation when it finds a UDP socket with the GRO option; + # if there is a UDP tunnel in the running system, such a lookup does + # take place. 
+ # use NAT to circumvent GRO FWD check + create_ns + ip -n $NS_DST addr add dev veth$DST $BM_NET$DST_NAT/$SUFFIX + ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on + ip netns exec $NS_DST $IPT -t nat -I PREROUTING -d $BM_NET$DST_NAT \ + -j DNAT --to-destination $BM_NET$DST + run_test "GRO fwd" $BM_NET$DST_NAT 1 0 $BM_NET$DST + cleanup + + create_ns + run_bench "UDP fwd perf" $BM_NET$DST + ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on + run_bench "UDP GRO fwd perf" $BM_NET$DST + cleanup + + create_vxlan_pair + ip netns exec $NS_DST ethtool -K veth$DST rx-gro-list on + run_test "GRO frag list over UDP tunnel" $OL_NET$DST 1 1 + cleanup + + # use NAT to circumvent GRO FWD check + create_vxlan_pair + ip -n $NS_DST addr add dev $VXDEV$DST $OL_NET$DST_NAT/$SUFFIX + ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on + ip netns exec $NS_DST $IPT -t nat -I PREROUTING -d $OL_NET$DST_NAT \ + -j DNAT --to-destination $OL_NET$DST + + # load arp cache before running the test to reduce the amount of + # stray traffic on top of the UDP tunnel + ip netns exec $NS_SRC ping -q -c 1 $OL_NET$DST_NAT >/dev/null + run_test "GRO fwd over UDP tunnel" $OL_NET$DST_NAT 1 1 $OL_NET$DST + cleanup + + create_vxlan_pair + run_bench "UDP tunnel fwd perf" $OL_NET$DST + ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on + run_bench "UDP tunnel GRO fwd perf" $OL_NET$DST + cleanup +done + +exit $ret diff --git a/tools/testing/selftests/net/veth.sh b/tools/testing/selftests/net/veth.sh new file mode 100755 index 000000000000..2fedc0781ce8 --- /dev/null +++ b/tools/testing/selftests/net/veth.sh @@ -0,0 +1,177 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +readonly STATS="$(mktemp -p /tmp ns-XXXXXX)" +readonly BASE=`basename $STATS` +readonly SRC=2 +readonly DST=1 +readonly DST_NAT=100 +readonly NS_SRC=$BASE$SRC +readonly NS_DST=$BASE$DST + +# "baremetal" network used for raw UDP traffic +readonly BM_NET_V4=192.168.1. +readonly BM_NET_V6=2001:db8:: + +readonly NPROCS=`nproc` +ret=0 + +cleanup() { + local ns + local -r jobs="$(jobs -p)" + [ -n "${jobs}" ] && kill -1 ${jobs} 2>/dev/null + rm -f $STATS + + for ns in $NS_SRC $NS_DST; do + ip netns del $ns 2>/dev/null + done +} + +trap cleanup EXIT + +create_ns() { + local ns + + for ns in $NS_SRC $NS_DST; do + ip netns add $ns + ip -n $ns link set dev lo up + done + + ip link add name veth$SRC type veth peer name veth$DST + + for ns in $SRC $DST; do + ip link set dev veth$ns netns $BASE$ns up + ip -n $BASE$ns addr add dev veth$ns $BM_NET_V4$ns/24 + ip -n $BASE$ns addr add dev veth$ns $BM_NET_V6$ns/64 nodad + done + echo "#kernel" > $BASE + chmod go-rw $BASE +} + +__chk_flag() { + local msg="$1" + local target=$2 + local expected=$3 + local flagname=$4 + + local flag=`ip netns exec $BASE$target ethtool -k veth$target |\ + grep $flagname | awk '{print $2}'` + + printf "%-60s" "$msg" + if [ "$flag" = "$expected" ]; then + echo " ok " + else + echo " fail - expected $expected found $flag" + ret=1 + fi +} + +chk_gro_flag() { + __chk_flag "$1" $2 $3 generic-receive-offload +} + +chk_tso_flag() { + __chk_flag "$1" $2 $3 tcp-segmentation-offload +} + +chk_gro() { + local msg="$1" + local expected=$2 + + ip netns exec $BASE$SRC ping -qc 1 $BM_NET_V4$DST >/dev/null + NSTAT_HISTORY=$STATS ip netns exec $NS_DST nstat -n + + printf "%-60s" "$msg" + ip netns exec $BASE$DST ./udpgso_bench_rx -C 1000 -R 10 & + local spid=$! 
+ sleep 0.1 + + ip netns exec $NS_SRC ./udpgso_bench_tx -4 -s 13000 -S 1300 -M 1 -D $BM_NET_V4$DST + local retc=$? + wait $spid + local rets=$? + if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then + echo " fail client exit code $retc, server $rets" + ret=1 + return + fi + + local pkts=`NSTAT_HISTORY=$STATS ip netns exec $NS_DST nstat IpInReceives | \ + awk '{print $2}' | tail -n 1` + if [ "$pkts" = "$expected" ]; then + echo " ok " + else + echo " fail - got $pkts packets, expected $expected " + ret=1 + fi +} + +if [ ! -f ../bpf/xdp_dummy.o ]; then + echo "Missing xdp_dummy helper. Build bpf selftest first" + exit -1 +fi + +create_ns +chk_gro_flag "default - gro flag" $SRC off +chk_gro_flag " - peer gro flag" $DST off +chk_tso_flag " - tso flag" $SRC on +chk_tso_flag " - peer tso flag" $DST on +chk_gro " - aggregation" 1 +ip netns exec $NS_SRC ethtool -K veth$SRC tx-udp-segmentation off +chk_gro " - aggregation with TSO off" 10 +cleanup + +create_ns +ip netns exec $NS_DST ethtool -K veth$DST gro on +chk_gro_flag "with gro on - gro flag" $DST on +chk_gro_flag " - peer gro flag" $SRC off +chk_tso_flag " - tso flag" $SRC on +chk_tso_flag " - peer tso flag" $DST on +ip netns exec $NS_SRC ethtool -K veth$SRC tx-udp-segmentation off +ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on +chk_gro " - aggregation with TSO off" 1 +cleanup + +create_ns +ip -n $NS_DST link set dev veth$DST down +ip netns exec $NS_DST ethtool -K veth$DST gro on +chk_gro_flag "with gro enabled on link down - gro flag" $DST on +chk_gro_flag " - peer gro flag" $SRC off +chk_tso_flag " - tso flag" $SRC on +chk_tso_flag " - peer tso flag" $DST on +ip -n $NS_DST link set dev veth$DST up +ip netns exec $NS_SRC ethtool -K veth$SRC tx-udp-segmentation off +ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on +chk_gro " - aggregation with TSO off" 1 +cleanup + +create_ns +ip -n $NS_DST link set dev veth$DST xdp object ../bpf/xdp_dummy.o section xdp_dummy 2>/dev/null +chk_gro_flag "with xdp attached - gro flag" $DST on +chk_gro_flag " - peer gro flag" $SRC off +chk_tso_flag " - tso flag" $SRC off +chk_tso_flag " - peer tso flag" $DST on +ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on +chk_gro " - aggregation" 1 + + +ip -n $NS_DST link set dev veth$DST down +ip -n $NS_SRC link set dev veth$SRC down +chk_gro_flag " - after dev off, flag" $DST on +chk_gro_flag " - peer flag" $SRC off + +ip netns exec $NS_DST ethtool -K veth$DST gro on +ip -n $NS_DST link set dev veth$DST xdp off +chk_gro_flag " - after gro on xdp off, gro flag" $DST on +chk_gro_flag " - peer gro flag" $SRC off +chk_tso_flag " - tso flag" $SRC on +chk_tso_flag " - peer tso flag" $DST on +ip -n $NS_DST link set dev veth$DST up +ip -n $NS_SRC link set dev veth$SRC up +chk_gro " - aggregation" 1 + +ip netns exec $NS_DST ethtool -K veth$DST gro off +ip netns exec $NS_SRC ethtool -K veth$SRC tx-udp-segmentation off +chk_gro "aggregation again with default and TSO off" 10 + +exit $ret diff --git a/tools/testing/selftests/netfilter/nft_flowtable.sh b/tools/testing/selftests/netfilter/nft_flowtable.sh index 431296c0f91c..427d94816f2d 100755 --- a/tools/testing/selftests/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/netfilter/nft_flowtable.sh @@ -371,6 +371,88 @@ else ip netns exec nsr1 nft list ruleset fi +# Another test: +# Add bridge interface br0 to Router1, with NAT enabled. 
+ip -net nsr1 link add name br0 type bridge +ip -net nsr1 addr flush dev veth0 +ip -net nsr1 link set up dev veth0 +ip -net nsr1 link set veth0 master br0 +ip -net nsr1 addr add 10.0.1.1/24 dev br0 +ip -net nsr1 addr add dead:1::1/64 dev br0 +ip -net nsr1 link set up dev br0 + +ip netns exec nsr1 sysctl net.ipv4.conf.br0.forwarding=1 > /dev/null + +# br0 with NAT enabled. +ip netns exec nsr1 nft -f - <<EOF +flush table ip nat +table ip nat { + chain prerouting { + type nat hook prerouting priority 0; policy accept; + meta iif "br0" ip daddr 10.6.6.6 tcp dport 1666 counter dnat ip to 10.0.2.99:12345 + } + + chain postrouting { + type nat hook postrouting priority 0; policy accept; + meta oifname "veth1" counter masquerade + } +} +EOF + +if test_tcp_forwarding_nat ns1 ns2; then + echo "PASS: flow offloaded for ns1/ns2 with bridge NAT" +else + echo "FAIL: flow offload for ns1/ns2 with bridge NAT" 1>&2 + ip netns exec nsr1 nft list ruleset + ret=1 +fi + +# Another test: +# Add bridge interface br0 to Router1, with NAT and VLAN. +ip -net nsr1 link set veth0 nomaster +ip -net nsr1 link set down dev veth0 +ip -net nsr1 link add link veth0 name veth0.10 type vlan id 10 +ip -net nsr1 link set up dev veth0 +ip -net nsr1 link set up dev veth0.10 +ip -net nsr1 link set veth0.10 master br0 + +ip -net ns1 addr flush dev eth0 +ip -net ns1 link add link eth0 name eth0.10 type vlan id 10 +ip -net ns1 link set eth0 up +ip -net ns1 link set eth0.10 up +ip -net ns1 addr add 10.0.1.99/24 dev eth0.10 +ip -net ns1 route add default via 10.0.1.1 +ip -net ns1 addr add dead:1::99/64 dev eth0.10 + +if test_tcp_forwarding_nat ns1 ns2; then + echo "PASS: flow offloaded for ns1/ns2 with bridge NAT and VLAN" +else + echo "FAIL: flow offload for ns1/ns2 with bridge NAT and VLAN" 1>&2 + ip netns exec nsr1 nft list ruleset + ret=1 +fi + +# restore test topology (remove bridge and VLAN) +ip -net nsr1 link set veth0 nomaster +ip -net nsr1 link set veth0 down +ip -net nsr1 link set veth0.10 down +ip -net nsr1 link delete veth0.10 type vlan +ip -net nsr1 link delete br0 type bridge +ip -net ns1 addr flush dev eth0.10 +ip -net ns1 link set eth0.10 down +ip -net ns1 link set eth0 down +ip -net ns1 link delete eth0.10 type vlan + +# restore address in ns1 and nsr1 +ip -net ns1 link set eth0 up +ip -net ns1 addr add 10.0.1.99/24 dev eth0 +ip -net ns1 route add default via 10.0.1.1 +ip -net ns1 addr add dead:1::99/64 dev eth0 +ip -net ns1 route add default via dead:1::1 +ip -net nsr1 addr add 10.0.1.1/24 dev veth0 +ip -net nsr1 addr add dead:1::1/64 dev veth0 +ip -net nsr1 link set up dev veth0 + KEY_SHA="0x"$(ps -xaf | sha1sum | cut -d " " -f 1) KEY_AES="0x"$(ps -xaf | md5sum | cut -d " " -f 1) SPI1=$RANDOM diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/police.json b/tools/testing/selftests/tc-testing/tc-tests/actions/police.json index b8268da5adaa..8e45792703ed 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/police.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/police.json @@ -764,5 +764,53 @@ "teardown": [ "$TC actions flush action police" ] + }, + { + "id": "cdd7", + "name": "Add valid police action with packets per second rate limit", + "category": [ + "actions", + "police" + ], + "setup": [ + [ + "$TC actions flush action police", + 0, + 1, + 255 + ] + ], + "cmdUnderTest": "$TC actions add action police pkts_rate 1000 pkts_burst 200 index 1", + "expExitCode": "0", + "verifyCmd": "$TC actions ls action police", + "matchPattern": "action order [0-9]*: police 0x1 rate 0bit burst 
0b mtu 4096Mb pkts_rate 1000 pkts_burst 200", + "matchCount": "1", + "teardown": [ + "$TC actions flush action police" + ] + }, + { + "id": "f5bc", + "name": "Add invalid police action with both bps and pps", + "category": [ + "actions", + "police" + ], + "setup": [ + [ + "$TC actions flush action police", + 0, + 1, + 255 + ] + ], + "cmdUnderTest": "$TC actions add action police rate 1kbit burst 10k pkts_rate 1000 pkts_burst 200 index 1", + "expExitCode": "255", + "verifyCmd": "$TC actions ls action police", + "matchPattern": "action order [0-9]*: police 0x1 ", + "matchCount": "0", + "teardown": [ + "$TC actions flush action police" + ] } ] diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/simple.json b/tools/testing/selftests/tc-testing/tc-tests/actions/simple.json index 8e8c1ae12260..e0c5f060ccb9 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/actions/simple.json +++ b/tools/testing/selftests/tc-testing/tc-tests/actions/simple.json @@ -24,6 +24,30 @@ ] }, { + "id": "4297", + "name": "Add simple action with change command", + "category": [ + "actions", + "simple" + ], + "setup": [ + [ + "$TC actions flush action simple", + 0, + 1, + 255 + ] + ], + "cmdUnderTest": "$TC actions change action simple sdata \"Not changed\" index 60", + "expExitCode": "0", + "verifyCmd": "$TC actions list action simple", + "matchPattern": "action order [0-9]*: Simple <Not changed>.*index 60 ref", + "matchCount": "1", + "teardown": [ + "$TC actions flush action simple" + ] + }, + { "id": "6d4c", "name": "Add simple action with duplicate index", "category": [ @@ -151,5 +175,64 @@ "teardown": [ "$TC actions flush action simple" ] + }, + { + "id": "8d07", + "name": "Verify cleanup of failed actions batch add", + "category": [ + "actions", + "simple" + ], + "setup": [ + [ + "$TC actions flush action simple", + 0, + 1, + 255 + ], + "$TC actions add action simple sdata \"2\" index 2", + [ + "$TC actions add action simple sdata \"1\" index 1 action simple sdata \"2\" index 2", + 255 + ], + "$TC actions flush action simple" + ], + "cmdUnderTest": "$TC actions add action simple sdata \"2\" index 2", + "expExitCode": "0", + "verifyCmd": "$TC actions list action simple", + "matchPattern": "action order [0-9]*: Simple <2>.*index 2 ref", + "matchCount": "1", + "teardown": [ + "$TC actions flush action simple" + ] + }, + { + "id": "a68a", + "name": "Verify cleanup of failed actions batch change", + "category": [ + "actions", + "simple" + ], + "setup": [ + [ + "$TC actions flush action simple", + 0, + 1, + 255 + ], + [ + "$TC actions change action simple sdata \"1\" index 1 action simple sdata \"2\" goto chain 42 index 2", + 255 + ], + "$TC actions flush action simple" + ], + "cmdUnderTest": "$TC actions add action simple sdata \"1\" index 1", + "expExitCode": "0", + "verifyCmd": "$TC actions list action simple", + "matchPattern": "action order [0-9]*: Simple <1>.*index 1 ref", + "matchCount": "1", + "teardown": [ + "$TC actions flush action simple" + ] } ] diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index d42115e4284d..8b0cd421ebd3 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -101,7 +101,7 @@ endef ifeq ($(CAN_BUILD_I386),1) $(BINARIES_32): CFLAGS += -m32 $(BINARIES_32): LDLIBS += -lrt -ldl -lm -$(BINARIES_32): %_32: %.c +$(BINARIES_32): $(OUTPUT)/%_32: %.c $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ $(foreach t,$(TARGETS),$(eval $(call gen-target-rule-32,$(t)))) endif @@ -109,7 +109,7 @@ endif 
 ifeq ($(CAN_BUILD_X86_64),1)
 $(BINARIES_64): CFLAGS += -m64
 $(BINARIES_64): LDLIBS += -lrt -ldl
-$(BINARIES_64): %_64: %.c
+$(BINARIES_64): $(OUTPUT)/%_64: %.c
 	$(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@
 $(foreach t,$(TARGETS),$(eval $(call gen-target-rule-64,$(t))))
 endif
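
For anyone who wants to exercise the new selftests by hand, a rough session could look like the following; the commands are illustrative only and assume an in-tree build, a root shell at the top of the kernel source tree, and that the bpf selftests have been built first so that ../bpf/xdp_dummy.o, which veth.sh expects, is present:

	# build the xdp_dummy helper used by the veth GRO script
	make -C tools/testing/selftests/bpf

	# new veth GRO/TSO coverage
	cd tools/testing/selftests/net
	./veth.sh

	# new flowtable bridge NAT coverage
	cd ../netfilter
	./nft_flowtable.sh

	# new police pps test case, selected by its tdc id
	cd ../tc-testing
	./tdc.py -e cdd7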