Diffstat (limited to 'samples')
73 files changed, 5982 insertions, 857 deletions
diff --git a/samples/Kconfig b/samples/Kconfig index 9cb63188d3ef..c332a3b9de05 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -71,11 +71,10 @@ config SAMPLE_RPMSG_CLIENT the rpmsg bus. config SAMPLE_LIVEPATCH - tristate "Build live patching sample -- loadable modules only" + tristate "Build live patching samples -- loadable modules only" depends on LIVEPATCH && m help - Builds a sample live patch that replaces the procfs handler - for /proc/cmdline to print "this has been live patched". + Build sample live patch demonstrations. config SAMPLE_CONFIGFS tristate "Build configfs patching sample -- loadable modules only" diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 9b4a66e3363e..ec3fc8d88e87 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -1,7 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -# kbuild trick to avoid linker error. Can be omitted if a module is built. -obj- := dummy.o - # List of programs to build hostprogs-y := test_lru_dist hostprogs-y += sock_example @@ -15,6 +12,7 @@ hostprogs-y += tracex3 hostprogs-y += tracex4 hostprogs-y += tracex5 hostprogs-y += tracex6 +hostprogs-y += tracex7 hostprogs-y += test_probe_write_user hostprogs-y += trace_output hostprogs-y += lathist @@ -29,6 +27,7 @@ hostprogs-y += test_cgrp2_sock hostprogs-y += test_cgrp2_sock2 hostprogs-y += xdp1 hostprogs-y += xdp2 +hostprogs-y += xdp_router_ipv4 hostprogs-y += test_current_task_under_cgroup hostprogs-y += trace_event hostprogs-y += sampleip @@ -40,11 +39,14 @@ hostprogs-y += per_socket_stats_example hostprogs-y += load_sock_ops hostprogs-y += xdp_redirect hostprogs-y += xdp_redirect_map +hostprogs-y += xdp_redirect_cpu hostprogs-y += xdp_monitor +hostprogs-y += xdp_rxq_info hostprogs-y += syscall_tp # Libbpf dependencies -LIBBPF := ../../tools/lib/bpf/bpf.o +LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o +CGROUP_HELPERS := ../../tools/testing/selftests/bpf/cgroup_helpers.o test_lru_dist-objs := test_lru_dist.o $(LIBBPF) sock_example-objs := sock_example.o $(LIBBPF) @@ -58,6 +60,7 @@ tracex3-objs := bpf_load.o $(LIBBPF) tracex3_user.o tracex4-objs := bpf_load.o $(LIBBPF) tracex4_user.o tracex5-objs := bpf_load.o $(LIBBPF) tracex5_user.o tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o +tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o @@ -68,13 +71,14 @@ map_perf_test-objs := bpf_load.o $(LIBBPF) map_perf_test_user.o test_overhead-objs := bpf_load.o $(LIBBPF) test_overhead_user.o test_cgrp2_array_pin-objs := $(LIBBPF) test_cgrp2_array_pin.o test_cgrp2_attach-objs := $(LIBBPF) test_cgrp2_attach.o -test_cgrp2_attach2-objs := $(LIBBPF) test_cgrp2_attach2.o cgroup_helpers.o +test_cgrp2_attach2-objs := $(LIBBPF) test_cgrp2_attach2.o $(CGROUP_HELPERS) test_cgrp2_sock-objs := $(LIBBPF) test_cgrp2_sock.o test_cgrp2_sock2-objs := bpf_load.o $(LIBBPF) test_cgrp2_sock2.o xdp1-objs := bpf_load.o $(LIBBPF) xdp1_user.o # reuse xdp1 source intentionally xdp2-objs := bpf_load.o $(LIBBPF) xdp1_user.o -test_current_task_under_cgroup-objs := bpf_load.o $(LIBBPF) cgroup_helpers.o \ +xdp_router_ipv4-objs := bpf_load.o $(LIBBPF) xdp_router_ipv4_user.o +test_current_task_under_cgroup-objs := bpf_load.o $(LIBBPF) $(CGROUP_HELPERS) \ test_current_task_under_cgroup_user.o trace_event-objs := bpf_load.o $(LIBBPF) trace_event_user.o sampleip-objs := bpf_load.o $(LIBBPF) 
sampleip_user.o @@ -85,7 +89,9 @@ test_map_in_map-objs := bpf_load.o $(LIBBPF) test_map_in_map_user.o per_socket_stats_example-objs := $(LIBBPF) cookie_uid_helper_example.o xdp_redirect-objs := bpf_load.o $(LIBBPF) xdp_redirect_user.o xdp_redirect_map-objs := bpf_load.o $(LIBBPF) xdp_redirect_map_user.o +xdp_redirect_cpu-objs := bpf_load.o $(LIBBPF) xdp_redirect_cpu_user.o xdp_monitor-objs := bpf_load.o $(LIBBPF) xdp_monitor_user.o +xdp_rxq_info-objs := bpf_load.o $(LIBBPF) xdp_rxq_info_user.o syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o # Tell kbuild to always build the programs @@ -99,6 +105,7 @@ always += tracex3_kern.o always += tracex4_kern.o always += tracex5_kern.o always += tracex6_kern.o +always += tracex7_kern.o always += sock_flags_kern.o always += test_probe_write_user_kern.o always += trace_output_kern.o @@ -115,6 +122,7 @@ always += parse_varlen.o parse_simple.o parse_ldabs.o always += test_cgrp2_tc_kern.o always += xdp1_kern.o always += xdp2_kern.o +always += xdp_router_ipv4_kern.o always += test_current_task_under_cgroup_kern.o always += trace_event_kern.o always += sampleip_kern.o @@ -128,9 +136,13 @@ always += tcp_bufs_kern.o always += tcp_cong_kern.o always += tcp_iw_kern.o always += tcp_clamp_kern.o +always += tcp_basertt_kern.o always += xdp_redirect_kern.o always += xdp_redirect_map_kern.o +always += xdp_redirect_cpu_kern.o always += xdp_monitor_kern.o +always += xdp_rxq_info_kern.o +always += xdp2skb_meta_kern.o always += syscall_tp_kern.o HOSTCFLAGS += -I$(objtree)/usr/include @@ -150,6 +162,7 @@ HOSTLOADLIBES_tracex3 += -lelf HOSTLOADLIBES_tracex4 += -lelf -lrt HOSTLOADLIBES_tracex5 += -lelf HOSTLOADLIBES_tracex6 += -lelf +HOSTLOADLIBES_tracex7 += -lelf HOSTLOADLIBES_test_cgrp2_sock2 += -lelf HOSTLOADLIBES_load_sock_ops += -lelf HOSTLOADLIBES_test_probe_write_user += -lelf @@ -161,6 +174,7 @@ HOSTLOADLIBES_map_perf_test += -lelf -lrt HOSTLOADLIBES_test_overhead += -lelf -lrt HOSTLOADLIBES_xdp1 += -lelf HOSTLOADLIBES_xdp2 += -lelf +HOSTLOADLIBES_xdp_router_ipv4 += -lelf HOSTLOADLIBES_test_current_task_under_cgroup += -lelf HOSTLOADLIBES_trace_event += -lelf HOSTLOADLIBES_sampleip += -lelf @@ -170,7 +184,9 @@ HOSTLOADLIBES_xdp_tx_iptunnel += -lelf HOSTLOADLIBES_test_map_in_map += -lelf HOSTLOADLIBES_xdp_redirect += -lelf HOSTLOADLIBES_xdp_redirect_map += -lelf +HOSTLOADLIBES_xdp_redirect_cpu += -lelf HOSTLOADLIBES_xdp_monitor += -lelf +HOSTLOADLIBES_xdp_rxq_info += -lelf HOSTLOADLIBES_syscall_tp += -lelf # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: @@ -178,14 +194,23 @@ HOSTLOADLIBES_syscall_tp += -lelf LLC ?= llc CLANG ?= clang +# Detect that we're cross compiling and use the cross compiler +ifdef CROSS_COMPILE +HOSTCC = $(CROSS_COMPILE)gcc +CLANG_ARCH_ARGS = -target $(ARCH) +endif + # Trick to allow make to be run from this directory -all: +all: $(LIBBPF) $(MAKE) -C ../../ $(CURDIR)/ clean: $(MAKE) -C ../../ M=$(CURDIR) clean @rm -f *~ +$(LIBBPF): FORCE + $(MAKE) -C $(dir $@) $(notdir $@) + $(obj)/syscall_nrs.s: $(src)/syscall_nrs.c $(call if_changed_dep,cc_s_c) @@ -225,9 +250,9 @@ $(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h $(obj)/%.o: $(src)/%.c $(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) -I$(obj) \ -I$(srctree)/tools/testing/selftests/bpf/ \ - -D__KERNEL__ -D__ASM_SYSREG_H -Wno-unused-value -Wno-pointer-sign \ - -Wno-compare-distinct-pointer-types \ + -D__KERNEL__ -Wno-unused-value -Wno-pointer-sign \ + -D__TARGET_ARCH_$(ARCH) -Wno-compare-distinct-pointer-types \ 
-Wno-gnu-variable-sized-type-not-at-end \ -Wno-address-of-packed-member -Wno-tautological-compare \ - -Wno-unknown-warning-option \ + -Wno-unknown-warning-option $(CLANG_ARCH_ARGS) \ -O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf -filetype=obj -o $@ diff --git a/samples/bpf/README.rst b/samples/bpf/README.rst index 79f9a58f1872..5f27e4faca50 100644 --- a/samples/bpf/README.rst +++ b/samples/bpf/README.rst @@ -64,3 +64,13 @@ It is also possible to point make to the newly compiled 'llc' or 'clang' command via redefining LLC or CLANG on the make command line:: make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang + +Cross compiling samples +----------------------- +In order to cross-compile, say for arm64 targets, export CROSS_COMPILE and ARCH +environment variables before calling make. This will direct make to build +samples for the cross target. + +export ARCH=arm64 +export CROSS_COMPILE="aarch64-linux-gnu-" +make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index 2325d7ad76df..69806d74fa53 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -193,8 +193,18 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) return -1; } event_fd[prog_cnt - 1] = efd; - ioctl(efd, PERF_EVENT_IOC_ENABLE, 0); - ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd); + err = ioctl(efd, PERF_EVENT_IOC_ENABLE, 0); + if (err < 0) { + printf("ioctl PERF_EVENT_IOC_ENABLE failed err %s\n", + strerror(errno)); + return -1; + } + err = ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd); + if (err < 0) { + printf("ioctl PERF_EVENT_IOC_SET_BPF failed err %s\n", + strerror(errno)); + return -1; + } return 0; } @@ -222,6 +232,7 @@ static int load_maps(struct bpf_map_data *maps, int nr_maps, int inner_map_fd = map_fd[maps[i].def.inner_map_idx]; map_fd[i] = bpf_create_map_in_map_node(maps[i].def.type, + maps[i].name, maps[i].def.key_size, inner_map_fd, maps[i].def.max_entries, @@ -229,6 +240,7 @@ static int load_maps(struct bpf_map_data *maps, int nr_maps, numa_node); } else { map_fd[i] = bpf_create_map_node(maps[i].def.type, + maps[i].name, maps[i].def.key_size, maps[i].def.value_size, maps[i].def.max_entries, @@ -683,105 +695,3 @@ struct ksym *ksym_search(long key) return &syms[0]; } -int set_link_xdp_fd(int ifindex, int fd, __u32 flags) -{ - struct sockaddr_nl sa; - int sock, seq = 0, len, ret = -1; - char buf[4096]; - struct nlattr *nla, *nla_xdp; - struct { - struct nlmsghdr nh; - struct ifinfomsg ifinfo; - char attrbuf[64]; - } req; - struct nlmsghdr *nh; - struct nlmsgerr *err; - - memset(&sa, 0, sizeof(sa)); - sa.nl_family = AF_NETLINK; - - sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); - if (sock < 0) { - printf("open netlink socket: %s\n", strerror(errno)); - return -1; - } - - if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { - printf("bind to netlink: %s\n", strerror(errno)); - goto cleanup; - } - - memset(&req, 0, sizeof(req)); - req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); - req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; - req.nh.nlmsg_type = RTM_SETLINK; - req.nh.nlmsg_pid = 0; - req.nh.nlmsg_seq = ++seq; - req.ifinfo.ifi_family = AF_UNSPEC; - req.ifinfo.ifi_index = ifindex; - - /* started nested attribute for XDP */ - nla = (struct nlattr *)(((char *)&req) - + NLMSG_ALIGN(req.nh.nlmsg_len)); - nla->nla_type = NLA_F_NESTED | 43/*IFLA_XDP*/; - nla->nla_len = NLA_HDRLEN; - - /* add XDP fd */ - nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len); - 
nla_xdp->nla_type = 1/*IFLA_XDP_FD*/; - nla_xdp->nla_len = NLA_HDRLEN + sizeof(int); - memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd)); - nla->nla_len += nla_xdp->nla_len; - - /* if user passed in any flags, add those too */ - if (flags) { - nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len); - nla_xdp->nla_type = 3/*IFLA_XDP_FLAGS*/; - nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags); - memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags)); - nla->nla_len += nla_xdp->nla_len; - } - - req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len); - - if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) { - printf("send to netlink: %s\n", strerror(errno)); - goto cleanup; - } - - len = recv(sock, buf, sizeof(buf), 0); - if (len < 0) { - printf("recv from netlink: %s\n", strerror(errno)); - goto cleanup; - } - - for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len); - nh = NLMSG_NEXT(nh, len)) { - if (nh->nlmsg_pid != getpid()) { - printf("Wrong pid %d, expected %d\n", - nh->nlmsg_pid, getpid()); - goto cleanup; - } - if (nh->nlmsg_seq != seq) { - printf("Wrong seq %d, expected %d\n", - nh->nlmsg_seq, seq); - goto cleanup; - } - switch (nh->nlmsg_type) { - case NLMSG_ERROR: - err = (struct nlmsgerr *)NLMSG_DATA(nh); - if (!err->error) - continue; - printf("nlmsg error %s\n", strerror(-err->error)); - goto cleanup; - case NLMSG_DONE: - break; - } - } - - ret = 0; - -cleanup: - close(sock); - return ret; -} diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h index 7d57a4248893..453c200b389b 100644 --- a/samples/bpf/bpf_load.h +++ b/samples/bpf/bpf_load.h @@ -61,5 +61,5 @@ struct ksym { int load_kallsyms(void); struct ksym *ksym_search(long key); -int set_link_xdp_fd(int ifindex, int fd, __u32 flags); +int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags); #endif diff --git a/samples/bpf/cgroup_helpers.c b/samples/bpf/cgroup_helpers.c deleted file mode 100644 index 09afaddfc9ba..000000000000 --- a/samples/bpf/cgroup_helpers.c +++ /dev/null @@ -1,178 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#define _GNU_SOURCE -#include <sched.h> -#include <sys/mount.h> -#include <sys/stat.h> -#include <sys/types.h> -#include <linux/limits.h> -#include <stdio.h> -#include <linux/sched.h> -#include <fcntl.h> -#include <unistd.h> -#include <ftw.h> - - -#include "cgroup_helpers.h" - -/* - * To avoid relying on the system setup, when setup_cgroup_env is called - * we create a new mount namespace, and cgroup namespace. The cgroup2 - * root is mounted at CGROUP_MOUNT_PATH - * - * Unfortunately, most people don't have cgroupv2 enabled at this point in time. - * It's easier to create our own mount namespace and manage it ourselves. - * - * We assume /mnt exists. - */ - -#define WALK_FD_LIMIT 16 -#define CGROUP_MOUNT_PATH "/mnt" -#define CGROUP_WORK_DIR "/cgroup-test-work-dir" -#define format_cgroup_path(buf, path) \ - snprintf(buf, sizeof(buf), "%s%s%s", CGROUP_MOUNT_PATH, \ - CGROUP_WORK_DIR, path) - -/** - * setup_cgroup_environment() - Setup the cgroup environment - * - * After calling this function, cleanup_cgroup_environment should be called - * once testing is complete. - * - * This function will print an error to stderr and return 1 if it is unable - * to setup the cgroup environment. If setup is successful, 0 is returned. 
- */ -int setup_cgroup_environment(void) -{ - char cgroup_workdir[PATH_MAX + 1]; - - format_cgroup_path(cgroup_workdir, ""); - - if (unshare(CLONE_NEWNS)) { - log_err("unshare"); - return 1; - } - - if (mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL)) { - log_err("mount fakeroot"); - return 1; - } - - if (mount("none", CGROUP_MOUNT_PATH, "cgroup2", 0, NULL)) { - log_err("mount cgroup2"); - return 1; - } - - /* Cleanup existing failed runs, now that the environment is setup */ - cleanup_cgroup_environment(); - - if (mkdir(cgroup_workdir, 0777) && errno != EEXIST) { - log_err("mkdir cgroup work dir"); - return 1; - } - - return 0; -} - -static int nftwfunc(const char *filename, const struct stat *statptr, - int fileflags, struct FTW *pfwt) -{ - if ((fileflags & FTW_D) && rmdir(filename)) - log_err("Removing cgroup: %s", filename); - return 0; -} - - -static int join_cgroup_from_top(char *cgroup_path) -{ - char cgroup_procs_path[PATH_MAX + 1]; - pid_t pid = getpid(); - int fd, rc = 0; - - snprintf(cgroup_procs_path, sizeof(cgroup_procs_path), - "%s/cgroup.procs", cgroup_path); - - fd = open(cgroup_procs_path, O_WRONLY); - if (fd < 0) { - log_err("Opening Cgroup Procs: %s", cgroup_procs_path); - return 1; - } - - if (dprintf(fd, "%d\n", pid) < 0) { - log_err("Joining Cgroup"); - rc = 1; - } - - close(fd); - return rc; -} - -/** - * join_cgroup() - Join a cgroup - * @path: The cgroup path, relative to the workdir, to join - * - * This function expects a cgroup to already be created, relative to the cgroup - * work dir, and it joins it. For example, passing "/my-cgroup" as the path - * would actually put the calling process into the cgroup - * "/cgroup-test-work-dir/my-cgroup" - * - * On success, it returns 0, otherwise on failure it returns 1. - */ -int join_cgroup(char *path) -{ - char cgroup_path[PATH_MAX + 1]; - - format_cgroup_path(cgroup_path, path); - return join_cgroup_from_top(cgroup_path); -} - -/** - * cleanup_cgroup_environment() - Cleanup Cgroup Testing Environment - * - * This is an idempotent function to delete all temporary cgroups that - * have been created during the test, including the cgroup testing work - * directory. - * - * At call time, it moves the calling process to the root cgroup, and then - * runs the deletion process. It is idempotent, and should not fail, unless - * a process is lingering. - * - * On failure, it will print an error to stderr, and try to continue. - */ -void cleanup_cgroup_environment(void) -{ - char cgroup_workdir[PATH_MAX + 1]; - - format_cgroup_path(cgroup_workdir, ""); - join_cgroup_from_top(CGROUP_MOUNT_PATH); - nftw(cgroup_workdir, nftwfunc, WALK_FD_LIMIT, FTW_DEPTH | FTW_MOUNT); -} - -/** - * create_and_get_cgroup() - Create a cgroup, relative to workdir, and get the FD - * @path: The cgroup path, relative to the workdir, to join - * - * This function creates a cgroup under the top level workdir and returns the - * file descriptor. It is idempotent. - * - * On success, it returns the file descriptor. On failure it returns 0. - * If there is a failure, it prints the error to stderr. 
- */ -int create_and_get_cgroup(char *path) -{ - char cgroup_path[PATH_MAX + 1]; - int fd; - - format_cgroup_path(cgroup_path, path); - if (mkdir(cgroup_path, 0777) && errno != EEXIST) { - log_err("mkdiring cgroup"); - return 0; - } - - fd = open(cgroup_path, O_RDONLY); - if (fd < 0) { - log_err("Opening Cgroup"); - return 0; - } - - return fd; -} diff --git a/samples/bpf/cgroup_helpers.h b/samples/bpf/cgroup_helpers.h deleted file mode 100644 index 06485e0002b3..000000000000 --- a/samples/bpf/cgroup_helpers.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __CGROUP_HELPERS_H -#define __CGROUP_HELPERS_H -#include <errno.h> -#include <string.h> - -#define clean_errno() (errno == 0 ? "None" : strerror(errno)) -#define log_err(MSG, ...) fprintf(stderr, "(%s:%d: errno: %s) " MSG "\n", \ - __FILE__, __LINE__, clean_errno(), ##__VA_ARGS__) - - -int create_and_get_cgroup(char *path); -int join_cgroup(char *path); -int setup_cgroup_environment(void); -void cleanup_cgroup_environment(void); - -#endif diff --git a/samples/bpf/map_perf_test_kern.c b/samples/bpf/map_perf_test_kern.c index 098c857f1eda..2b2ffb97018b 100644 --- a/samples/bpf/map_perf_test_kern.c +++ b/samples/bpf/map_perf_test_kern.c @@ -266,7 +266,7 @@ int stress_hash_map_lookup(struct pt_regs *ctx) return 0; } -SEC("kprobe/sys_getpgrp") +SEC("kprobe/sys_getppid") int stress_array_map_lookup(struct pt_regs *ctx) { u32 key = 1, i; diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c index f388254896f6..519d9af4b04a 100644 --- a/samples/bpf/map_perf_test_user.c +++ b/samples/bpf/map_perf_test_user.c @@ -137,6 +137,7 @@ static void do_test_lru(enum test_type test, int cpu) inner_lru_map_fds[cpu] = bpf_create_map_node(BPF_MAP_TYPE_LRU_HASH, + test_map_names[INNER_LRU_HASH_PREALLOC], sizeof(uint32_t), sizeof(long), inner_lru_hash_size, 0, @@ -282,7 +283,7 @@ static void test_array_lookup(int cpu) start_time = time_get_ns(); for (i = 0; i < max_cnt; i++) - syscall(__NR_getpgrp, 0); + syscall(__NR_getppid, 0); printf("%d:array_lookup %lld lookups per sec\n", cpu, max_cnt * 1000000000ll * 64 / (time_get_ns() - start_time)); } diff --git a/samples/bpf/syscall_tp_user.c b/samples/bpf/syscall_tp_user.c index a3cb91ebf4e7..9169d3207f18 100644 --- a/samples/bpf/syscall_tp_user.c +++ b/samples/bpf/syscall_tp_user.c @@ -23,6 +23,13 @@ * This requires kernel CONFIG_FTRACE_SYSCALLS to be set. 
*/ +static void usage(const char *cmd) +{ + printf("USAGE: %s [-i num_progs] [-h]\n", cmd); + printf(" -i num_progs # number of progs of the test\n"); + printf(" -h # help\n"); +} + static void verify_map(int map_id) { __u32 key = 0; @@ -32,22 +39,29 @@ static void verify_map(int map_id) fprintf(stderr, "map_lookup failed: %s\n", strerror(errno)); return; } - if (val == 0) + if (val == 0) { fprintf(stderr, "failed: map #%d returns value 0\n", map_id); + return; + } + val = 0; + if (bpf_map_update_elem(map_id, &key, &val, BPF_ANY) != 0) { + fprintf(stderr, "map_update failed: %s\n", strerror(errno)); + return; + } } -int main(int argc, char **argv) +static int test(char *filename, int num_progs) { - struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; - char filename[256]; - int fd; + int i, fd, map0_fds[num_progs], map1_fds[num_progs]; - setrlimit(RLIMIT_MEMLOCK, &r); - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - - if (load_bpf_file(filename)) { - fprintf(stderr, "%s", bpf_log_buf); - return 1; + for (i = 0; i < num_progs; i++) { + if (load_bpf_file(filename)) { + fprintf(stderr, "%s", bpf_log_buf); + return 1; + } + printf("prog #%d: map ids %d %d\n", i, map_fd[0], map_fd[1]); + map0_fds[i] = map_fd[0]; + map1_fds[i] = map_fd[1]; } /* current load_bpf_file has perf_event_open default pid = -1 @@ -64,8 +78,34 @@ int main(int argc, char **argv) close(fd); /* verify the map */ - verify_map(map_fd[0]); - verify_map(map_fd[1]); + for (i = 0; i < num_progs; i++) { + verify_map(map0_fds[i]); + verify_map(map1_fds[i]); + } return 0; } + +int main(int argc, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + int opt, num_progs = 1; + char filename[256]; + + while ((opt = getopt(argc, argv, "i:h")) != -1) { + switch (opt) { + case 'i': + num_progs = atoi(optarg); + break; + case 'h': + default: + usage(argv[0]); + return 0; + } + } + + setrlimit(RLIMIT_MEMLOCK, &r); + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + return test(filename, num_progs); +} diff --git a/samples/bpf/tcbpf2_kern.c b/samples/bpf/tcbpf2_kern.c index 370b749f5ee6..efdc16d195ff 100644 --- a/samples/bpf/tcbpf2_kern.c +++ b/samples/bpf/tcbpf2_kern.c @@ -15,6 +15,7 @@ #include <uapi/linux/tcp.h> #include <uapi/linux/filter.h> #include <uapi/linux/pkt_cls.h> +#include <uapi/linux/erspan.h> #include <net/ipv6.h> #include "bpf_helpers.h" #include "bpf_endian.h" @@ -39,10 +40,6 @@ struct vxlan_metadata { u32 gbp; }; -struct erspan_metadata { - __be32 index; -}; - SEC("gre_set_tunnel") int _gre_set_tunnel(struct __sk_buff *skb) { @@ -81,6 +78,49 @@ int _gre_get_tunnel(struct __sk_buff *skb) return TC_ACT_OK; } +SEC("ip6gretap_set_tunnel") +int _ip6gretap_set_tunnel(struct __sk_buff *skb) +{ + struct bpf_tunnel_key key; + int ret; + + __builtin_memset(&key, 0x0, sizeof(key)); + key.remote_ipv6[3] = _htonl(0x11); /* ::11 */ + key.tunnel_id = 2; + key.tunnel_tos = 0; + key.tunnel_ttl = 64; + key.tunnel_label = 0xabcde; + + ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), + BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX); + if (ret < 0) { + ERROR(ret); + return TC_ACT_SHOT; + } + + return TC_ACT_OK; +} + +SEC("ip6gretap_get_tunnel") +int _ip6gretap_get_tunnel(struct __sk_buff *skb) +{ + char fmt[] = "key %d remote ip6 ::%x label %x\n"; + struct bpf_tunnel_key key; + int ret; + + ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), + BPF_F_TUNINFO_IPV6); + if (ret < 0) { + ERROR(ret); + return TC_ACT_SHOT; + } + + bpf_trace_printk(fmt, sizeof(fmt), + key.tunnel_id, key.remote_ipv6[3], 
key.tunnel_label); + + return TC_ACT_OK; +} + SEC("erspan_set_tunnel") int _erspan_set_tunnel(struct __sk_buff *skb) { @@ -100,7 +140,20 @@ int _erspan_set_tunnel(struct __sk_buff *skb) return TC_ACT_SHOT; } - md.index = htonl(123); + __builtin_memset(&md, 0, sizeof(md)); +#ifdef ERSPAN_V1 + md.version = 1; + md.u.index = bpf_htonl(123); +#else + u8 direction = 1; + u8 hwid = 7; + + md.version = 2; + md.u.md2.dir = direction; + md.u.md2.hwid = hwid & 0xf; + md.u.md2.hwid_upper = (hwid >> 4) & 0x3; +#endif + ret = bpf_skb_set_tunnel_opt(skb, &md, sizeof(md)); if (ret < 0) { ERROR(ret); @@ -113,7 +166,7 @@ int _erspan_set_tunnel(struct __sk_buff *skb) SEC("erspan_get_tunnel") int _erspan_get_tunnel(struct __sk_buff *skb) { - char fmt[] = "key %d remote ip 0x%x erspan index 0x%x\n"; + char fmt[] = "key %d remote ip 0x%x erspan version %d\n"; struct bpf_tunnel_key key; struct erspan_metadata md; u32 index; @@ -131,9 +184,107 @@ int _erspan_get_tunnel(struct __sk_buff *skb) return TC_ACT_SHOT; } - index = bpf_ntohl(md.index); bpf_trace_printk(fmt, sizeof(fmt), - key.tunnel_id, key.remote_ipv4, index); + key.tunnel_id, key.remote_ipv4, md.version); + +#ifdef ERSPAN_V1 + char fmt2[] = "\tindex %x\n"; + + index = bpf_ntohl(md.u.index); + bpf_trace_printk(fmt2, sizeof(fmt2), index); +#else + char fmt2[] = "\tdirection %d hwid %x timestamp %u\n"; + + bpf_trace_printk(fmt2, sizeof(fmt2), + md.u.md2.dir, + (md.u.md2.hwid_upper << 4) + md.u.md2.hwid, + bpf_ntohl(md.u.md2.timestamp)); +#endif + + return TC_ACT_OK; +} + +SEC("ip4ip6erspan_set_tunnel") +int _ip4ip6erspan_set_tunnel(struct __sk_buff *skb) +{ + struct bpf_tunnel_key key; + struct erspan_metadata md; + int ret; + + __builtin_memset(&key, 0x0, sizeof(key)); + key.remote_ipv6[3] = _htonl(0x11); + key.tunnel_id = 2; + key.tunnel_tos = 0; + key.tunnel_ttl = 64; + + ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), + BPF_F_TUNINFO_IPV6); + if (ret < 0) { + ERROR(ret); + return TC_ACT_SHOT; + } + + __builtin_memset(&md, 0, sizeof(md)); + +#ifdef ERSPAN_V1 + md.u.index = htonl(123); + md.version = 1; +#else + u8 direction = 0; + u8 hwid = 17; + + md.version = 2; + md.u.md2.dir = direction; + md.u.md2.hwid = hwid & 0xf; + md.u.md2.hwid_upper = (hwid >> 4) & 0x3; +#endif + + ret = bpf_skb_set_tunnel_opt(skb, &md, sizeof(md)); + if (ret < 0) { + ERROR(ret); + return TC_ACT_SHOT; + } + + return TC_ACT_OK; +} + +SEC("ip4ip6erspan_get_tunnel") +int _ip4ip6erspan_get_tunnel(struct __sk_buff *skb) +{ + char fmt[] = "ip6erspan get key %d remote ip6 ::%x erspan version %d\n"; + struct bpf_tunnel_key key; + struct erspan_metadata md; + u32 index; + int ret; + + ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), BPF_F_TUNINFO_IPV6); + if (ret < 0) { + ERROR(ret); + return TC_ACT_SHOT; + } + + ret = bpf_skb_get_tunnel_opt(skb, &md, sizeof(md)); + if (ret < 0) { + ERROR(ret); + return TC_ACT_SHOT; + } + + bpf_trace_printk(fmt, sizeof(fmt), + key.tunnel_id, key.remote_ipv4, md.version); + +#ifdef ERSPAN_V1 + char fmt2[] = "\tindex %x\n"; + + index = bpf_ntohl(md.u.index); + bpf_trace_printk(fmt2, sizeof(fmt2), index); +#else + char fmt2[] = "\tdirection %d hwid %x timestamp %u\n"; + + bpf_trace_printk(fmt2, sizeof(fmt2), + md.u.md2.dir, + (md.u.md2.hwid_upper << 4) + md.u.md2.hwid, + bpf_ntohl(md.u.md2.timestamp)); +#endif return TC_ACT_OK; } diff --git a/samples/bpf/tcp_basertt_kern.c b/samples/bpf/tcp_basertt_kern.c new file mode 100644 index 000000000000..4bf4fc597db9 --- /dev/null +++ b/samples/bpf/tcp_basertt_kern.c @@ -0,0 +1,78 @@ +/* Copyright (c) 
2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * BPF program to set base_rtt to 80us when host is running TCP-NV and + * both hosts are in the same datacenter (as determined by IPv6 prefix). + * + * Use load_sock_ops to load this BPF program. + */ + +#include <uapi/linux/bpf.h> +#include <uapi/linux/tcp.h> +#include <uapi/linux/if_ether.h> +#include <uapi/linux/if_packet.h> +#include <uapi/linux/ip.h> +#include <linux/socket.h> +#include "bpf_helpers.h" +#include "bpf_endian.h" + +#define DEBUG 1 + +#define bpf_printk(fmt, ...) \ +({ \ + char ____fmt[] = fmt; \ + bpf_trace_printk(____fmt, sizeof(____fmt), \ + ##__VA_ARGS__); \ +}) + +SEC("sockops") +int bpf_basertt(struct bpf_sock_ops *skops) +{ + char cong[20]; + char nv[] = "nv"; + int rv = 0, n; + int op; + + op = (int) skops->op; + +#ifdef DEBUG + bpf_printk("BPF command: %d\n", op); +#endif + + /* Check if both hosts are in the same datacenter. For this + * example they are if the 1st 5.5 bytes in the IPv6 address + * are the same. + */ + if (skops->family == AF_INET6 && + skops->local_ip6[0] == skops->remote_ip6[0] && + (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) == + (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000)) { + switch (op) { + case BPF_SOCK_OPS_BASE_RTT: + n = bpf_getsockopt(skops, SOL_TCP, TCP_CONGESTION, + cong, sizeof(cong)); + if (!n && !__builtin_memcmp(cong, nv, sizeof(nv)+1)) { + /* Set base_rtt to 80us */ + rv = 80; + } else if (n) { + rv = n; + } else { + rv = -1; + } + break; + default: + rv = -1; + } + } else { + rv = -1; + } +#ifdef DEBUG + bpf_printk("Returning %d\n", rv); +#endif + skops->reply = rv; + return 1; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/tcp_bpf.readme b/samples/bpf/tcp_bpf.readme new file mode 100644 index 000000000000..831fb601e3c9 --- /dev/null +++ b/samples/bpf/tcp_bpf.readme @@ -0,0 +1,26 @@ +This file describes how to run the tcp_*_kern.o tcp_bpf (or socket_ops) +programs. These programs attach to a cgroupv2. The following commands create +a cgroupv2 and attach a bash shell to the group. + + mkdir -p /tmp/cgroupv2 + mount -t cgroup2 none /tmp/cgroupv2 + mkdir -p /tmp/cgroupv2/foo + bash + echo $$ >> /tmp/cgroupv2/foo/cgroup.procs + +Anything that runs under this shell belongs to the foo cgroupv2 To load +(attach) one of the tcp_*_kern.o programs: + + ./load_sock_ops -l /tmp/cgroupv2/foo tcp_basertt_kern.o + +If the "-l" flag is used, the load_sock_ops program will continue to run +printing the BPF log buffer. The tcp_*_kern.o programs use special print +functions to print logging information (if enabled by the ifdef). + +If using netperf/netserver to create traffic, you need to run them under the +cgroupv2 to which the BPF programs are attached (i.e. under bash shell +attached to the cgroupv2). 
+ +To remove (unattach) a socket_ops BPF program from a cgroupv2: + + ./load_sock_ops -r /tmp/cgroupv2/foo diff --git a/samples/bpf/tcp_bufs_kern.c b/samples/bpf/tcp_bufs_kern.c index ee83bbabd17c..0566b7fa38a1 100644 --- a/samples/bpf/tcp_bufs_kern.c +++ b/samples/bpf/tcp_bufs_kern.c @@ -41,8 +41,10 @@ int bpf_bufs(struct bpf_sock_ops *skops) * if neither port numberis 55601 */ if (bpf_ntohl(skops->remote_port) != 55601 && - skops->local_port != 55601) - return -1; + skops->local_port != 55601) { + skops->reply = -1; + return 1; + } op = (int) skops->op; @@ -61,8 +63,8 @@ int bpf_bufs(struct bpf_sock_ops *skops) /* Set sndbuf and rcvbuf of active connections */ rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize, sizeof(bufsize)); - rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, - &bufsize, sizeof(bufsize)); + rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, + &bufsize, sizeof(bufsize)); break; case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: /* Nothing to do */ @@ -71,8 +73,8 @@ int bpf_bufs(struct bpf_sock_ops *skops) /* Set sndbuf and rcvbuf of passive connections */ rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize, sizeof(bufsize)); - rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, - &bufsize, sizeof(bufsize)); + rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, + &bufsize, sizeof(bufsize)); break; default: rv = -1; diff --git a/samples/bpf/tcp_clamp_kern.c b/samples/bpf/tcp_clamp_kern.c index d68eadd9ca2d..f4225c9d2c0c 100644 --- a/samples/bpf/tcp_clamp_kern.c +++ b/samples/bpf/tcp_clamp_kern.c @@ -41,8 +41,10 @@ int bpf_clamp(struct bpf_sock_ops *skops) /* For testing purposes, only execute rest of BPF program * if neither port numberis 55601 */ - if (bpf_ntohl(skops->remote_port) != 55601 && skops->local_port != 55601) - return -1; + if (bpf_ntohl(skops->remote_port) != 55601 && skops->local_port != 55601) { + skops->reply = -1; + return 0; + } op = (int) skops->op; @@ -66,9 +68,9 @@ int bpf_clamp(struct bpf_sock_ops *skops) /* Set sndbuf and rcvbuf of active connections */ rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize, sizeof(bufsize)); - rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, - SO_RCVBUF, &bufsize, - sizeof(bufsize)); + rv += bpf_setsockopt(skops, SOL_SOCKET, + SO_RCVBUF, &bufsize, + sizeof(bufsize)); break; case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: rv = bpf_setsockopt(skops, SOL_TCP, @@ -80,12 +82,12 @@ int bpf_clamp(struct bpf_sock_ops *skops) rv = bpf_setsockopt(skops, SOL_TCP, TCP_BPF_SNDCWND_CLAMP, &clamp, sizeof(clamp)); - rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, - SO_SNDBUF, &bufsize, - sizeof(bufsize)); - rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, - SO_RCVBUF, &bufsize, - sizeof(bufsize)); + rv += bpf_setsockopt(skops, SOL_SOCKET, + SO_SNDBUF, &bufsize, + sizeof(bufsize)); + rv += bpf_setsockopt(skops, SOL_SOCKET, + SO_RCVBUF, &bufsize, + sizeof(bufsize)); break; default: rv = -1; diff --git a/samples/bpf/tcp_cong_kern.c b/samples/bpf/tcp_cong_kern.c index dac15bce1fa9..ad0f1ba8206a 100644 --- a/samples/bpf/tcp_cong_kern.c +++ b/samples/bpf/tcp_cong_kern.c @@ -39,8 +39,10 @@ int bpf_cong(struct bpf_sock_ops *skops) * if neither port numberis 55601 */ if (bpf_ntohl(skops->remote_port) != 55601 && - skops->local_port != 55601) - return -1; + skops->local_port != 55601) { + skops->reply = -1; + return 1; + } op = (int) skops->op; diff --git a/samples/bpf/tcp_iw_kern.c b/samples/bpf/tcp_iw_kern.c index 23c5122ef819..4ca5ecc9f580 100644 --- a/samples/bpf/tcp_iw_kern.c +++ b/samples/bpf/tcp_iw_kern.c @@ -42,8 
+42,10 @@ int bpf_iw(struct bpf_sock_ops *skops) * if neither port numberis 55601 */ if (bpf_ntohl(skops->remote_port) != 55601 && - skops->local_port != 55601) - return -1; + skops->local_port != 55601) { + skops->reply = -1; + return 1; + } op = (int) skops->op; @@ -62,8 +64,8 @@ int bpf_iw(struct bpf_sock_ops *skops) /* Set sndbuf and rcvbuf of active connections */ rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize, sizeof(bufsize)); - rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, - &bufsize, sizeof(bufsize)); + rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, + &bufsize, sizeof(bufsize)); break; case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: rv = bpf_setsockopt(skops, SOL_TCP, TCP_BPF_IW, &iw, @@ -73,8 +75,8 @@ int bpf_iw(struct bpf_sock_ops *skops) /* Set sndbuf and rcvbuf of passive connections */ rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize, sizeof(bufsize)); - rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, - &bufsize, sizeof(bufsize)); + rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF, + &bufsize, sizeof(bufsize)); break; default: rv = -1; diff --git a/samples/bpf/tcp_rwnd_kern.c b/samples/bpf/tcp_rwnd_kern.c index 3f2a228f81ce..09ff65b40b31 100644 --- a/samples/bpf/tcp_rwnd_kern.c +++ b/samples/bpf/tcp_rwnd_kern.c @@ -38,8 +38,10 @@ int bpf_rwnd(struct bpf_sock_ops *skops) * if neither port numberis 55601 */ if (bpf_ntohl(skops->remote_port) != - 55601 && skops->local_port != 55601) - return -1; + 55601 && skops->local_port != 55601) { + skops->reply = -1; + return 1; + } op = (int) skops->op; diff --git a/samples/bpf/tcp_synrto_kern.c b/samples/bpf/tcp_synrto_kern.c index 3c3fc83d81cb..232bb242823e 100644 --- a/samples/bpf/tcp_synrto_kern.c +++ b/samples/bpf/tcp_synrto_kern.c @@ -38,8 +38,10 @@ int bpf_synrto(struct bpf_sock_ops *skops) * if neither port numberis 55601 */ if (bpf_ntohl(skops->remote_port) != 55601 && - skops->local_port != 55601) - return -1; + skops->local_port != 55601) { + skops->reply = -1; + return 1; + } op = (int) skops->op; diff --git a/samples/bpf/test_cgrp2_attach2.c b/samples/bpf/test_cgrp2_attach2.c index 3049b1f26267..1af412ec6007 100644 --- a/samples/bpf/test_cgrp2_attach2.c +++ b/samples/bpf/test_cgrp2_attach2.c @@ -30,7 +30,7 @@ #define FOO "/foo" #define BAR "/foo/bar/" -#define PING_CMD "ping -c1 -w1 127.0.0.1" +#define PING_CMD "ping -c1 -w1 127.0.0.1 > /dev/null" char bpf_log_buf[BPF_LOG_BUF_SIZE]; @@ -55,8 +55,7 @@ static int prog_load(int verdict) return ret; } - -int main(int argc, char **argv) +static int test_foo_bar(void) { int drop_prog, allow_prog, foo = 0, bar = 0, rc = 0; @@ -79,7 +78,8 @@ int main(int argc, char **argv) if (join_cgroup(FOO)) goto err; - if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS, 1)) { + if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE)) { log_err("Attaching prog to /foo"); goto err; } @@ -98,7 +98,8 @@ int main(int argc, char **argv) printf("Attached DROP prog. 
This ping in cgroup /foo/bar should fail...\n"); assert(system(PING_CMD) != 0); - if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) { + if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE)) { log_err("Attaching prog to /foo/bar"); goto err; } @@ -115,7 +116,8 @@ int main(int argc, char **argv) "This ping in cgroup /foo/bar should fail...\n"); assert(system(PING_CMD) != 0); - if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) { + if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE)) { log_err("Attaching prog to /foo/bar"); goto err; } @@ -129,7 +131,8 @@ int main(int argc, char **argv) "This ping in cgroup /foo/bar should pass...\n"); assert(system(PING_CMD) == 0); - if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) { + if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE)) { log_err("Attaching prog to /foo/bar"); goto err; } @@ -162,13 +165,15 @@ int main(int argc, char **argv) goto err; } - if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) { + if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE)) { errno = 0; log_err("Unexpected success attaching overridable prog to /foo/bar"); goto err; } - if (!bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS, 1)) { + if (!bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE)) { errno = 0; log_err("Unexpected success attaching overridable prog to /foo"); goto err; @@ -189,8 +194,229 @@ out: close(bar); cleanup_cgroup_environment(); if (!rc) - printf("PASS\n"); + printf("### override:PASS\n"); else - printf("FAIL\n"); + printf("### override:FAIL\n"); return rc; } + +static int map_fd = -1; + +static int prog_load_cnt(int verdict, int val) +{ + if (map_fd < 0) + map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0); + if (map_fd < 0) { + printf("failed to create map '%s'\n", strerror(errno)); + return -1; + } + + struct bpf_insn prog[] = { + BPF_MOV32_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */ + BPF_LD_MAP_FD(BPF_REG_1, map_fd), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_1, val), /* r1 = 1 */ + BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */ + BPF_EXIT_INSN(), + }; + size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn); + int ret; + + ret = bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB, + prog, insns_cnt, "GPL", 0, + bpf_log_buf, BPF_LOG_BUF_SIZE); + + if (ret < 0) { + log_err("Loading program"); + printf("Output from verifier:\n%s\n-------\n", bpf_log_buf); + return 0; + } + return ret; +} + + +static int test_multiprog(void) +{ + __u32 prog_ids[4], prog_cnt = 0, attach_flags, saved_prog_id; + int cg1 = 0, cg2 = 0, cg3 = 0, cg4 = 0, cg5 = 0, key = 0; + int drop_prog, allow_prog[6] = {}, rc = 0; + unsigned long long value; + int i = 0; + + for (i = 0; i < 6; i++) { + allow_prog[i] = prog_load_cnt(1, 1 << i); + if (!allow_prog[i]) + goto err; + } + drop_prog = prog_load_cnt(0, 1); + if (!drop_prog) + goto err; + + if (setup_cgroup_environment()) + goto err; + + cg1 = create_and_get_cgroup("/cg1"); + if (!cg1) + goto err; + cg2 = create_and_get_cgroup("/cg1/cg2"); + if (!cg2) + goto err; + cg3 = 
create_and_get_cgroup("/cg1/cg2/cg3"); + if (!cg3) + goto err; + cg4 = create_and_get_cgroup("/cg1/cg2/cg3/cg4"); + if (!cg4) + goto err; + cg5 = create_and_get_cgroup("/cg1/cg2/cg3/cg4/cg5"); + if (!cg5) + goto err; + + if (join_cgroup("/cg1/cg2/cg3/cg4/cg5")) + goto err; + + if (bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_MULTI)) { + log_err("Attaching prog to cg1"); + goto err; + } + if (!bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_MULTI)) { + log_err("Unexpected success attaching the same prog to cg1"); + goto err; + } + if (bpf_prog_attach(allow_prog[1], cg1, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_MULTI)) { + log_err("Attaching prog2 to cg1"); + goto err; + } + if (bpf_prog_attach(allow_prog[2], cg2, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE)) { + log_err("Attaching prog to cg2"); + goto err; + } + if (bpf_prog_attach(allow_prog[3], cg3, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_MULTI)) { + log_err("Attaching prog to cg3"); + goto err; + } + if (bpf_prog_attach(allow_prog[4], cg4, BPF_CGROUP_INET_EGRESS, + BPF_F_ALLOW_OVERRIDE)) { + log_err("Attaching prog to cg4"); + goto err; + } + if (bpf_prog_attach(allow_prog[5], cg5, BPF_CGROUP_INET_EGRESS, 0)) { + log_err("Attaching prog to cg5"); + goto err; + } + assert(system(PING_CMD) == 0); + assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0); + assert(value == 1 + 2 + 8 + 32); + + /* query the number of effective progs in cg5 */ + assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE, + NULL, NULL, &prog_cnt) == 0); + assert(prog_cnt == 4); + /* retrieve prog_ids of effective progs in cg5 */ + assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE, + &attach_flags, prog_ids, &prog_cnt) == 0); + assert(prog_cnt == 4); + assert(attach_flags == 0); + saved_prog_id = prog_ids[0]; + /* check enospc handling */ + prog_ids[0] = 0; + prog_cnt = 2; + assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE, + &attach_flags, prog_ids, &prog_cnt) == -1 && + errno == ENOSPC); + assert(prog_cnt == 4); + /* check that prog_ids are returned even when buffer is too small */ + assert(prog_ids[0] == saved_prog_id); + /* retrieve prog_id of single attached prog in cg5 */ + prog_ids[0] = 0; + assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, 0, + NULL, prog_ids, &prog_cnt) == 0); + assert(prog_cnt == 1); + assert(prog_ids[0] == saved_prog_id); + + /* detach bottom program and ping again */ + if (bpf_prog_detach2(-1, cg5, BPF_CGROUP_INET_EGRESS)) { + log_err("Detaching prog from cg5"); + goto err; + } + value = 0; + assert(bpf_map_update_elem(map_fd, &key, &value, 0) == 0); + assert(system(PING_CMD) == 0); + assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0); + assert(value == 1 + 2 + 8 + 16); + + /* detach 3rd from bottom program and ping again */ + errno = 0; + if (!bpf_prog_detach2(0, cg3, BPF_CGROUP_INET_EGRESS)) { + log_err("Unexpected success on detach from cg3"); + goto err; + } + if (bpf_prog_detach2(allow_prog[3], cg3, BPF_CGROUP_INET_EGRESS)) { + log_err("Detaching from cg3"); + goto err; + } + value = 0; + assert(bpf_map_update_elem(map_fd, &key, &value, 0) == 0); + assert(system(PING_CMD) == 0); + assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0); + assert(value == 1 + 2 + 16); + + /* detach 2nd from bottom program and ping again */ + if (bpf_prog_detach2(-1, cg4, BPF_CGROUP_INET_EGRESS)) { + log_err("Detaching prog from cg4"); + goto err; + } + value = 0; + assert(bpf_map_update_elem(map_fd, &key, &value, 0) == 0); + 
assert(system(PING_CMD) == 0); + assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0); + assert(value == 1 + 2 + 4); + + prog_cnt = 4; + assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE, + &attach_flags, prog_ids, &prog_cnt) == 0); + assert(prog_cnt == 3); + assert(attach_flags == 0); + assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, 0, + NULL, prog_ids, &prog_cnt) == 0); + assert(prog_cnt == 0); + goto out; +err: + rc = 1; + +out: + for (i = 0; i < 6; i++) + if (allow_prog[i] > 0) + close(allow_prog[i]); + close(cg1); + close(cg2); + close(cg3); + close(cg4); + close(cg5); + cleanup_cgroup_environment(); + if (!rc) + printf("### multi:PASS\n"); + else + printf("### multi:FAIL\n"); + return rc; +} + +int main(int argc, char **argv) +{ + int rc = 0; + + rc = test_foo_bar(); + if (rc) + return rc; + + return test_multiprog(); +} diff --git a/samples/bpf/test_override_return.sh b/samples/bpf/test_override_return.sh new file mode 100755 index 000000000000..e68b9ee6814b --- /dev/null +++ b/samples/bpf/test_override_return.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +rm -f testfile.img +dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1 +DEVICE=$(losetup --show -f testfile.img) +mkfs.btrfs -f $DEVICE +mkdir tmpmnt +./tracex7 $DEVICE +if [ $? -eq 0 ] +then + echo "SUCCESS!" +else + echo "FAILED!" +fi +losetup -d $DEVICE diff --git a/samples/bpf/test_tunnel_bpf.sh b/samples/bpf/test_tunnel_bpf.sh index 312e1722a39f..43ce049996ee 100755 --- a/samples/bpf/test_tunnel_bpf.sh +++ b/samples/bpf/test_tunnel_bpf.sh @@ -33,10 +33,43 @@ function add_gre_tunnel { ip addr add dev $DEV 10.1.1.200/24 } -function add_erspan_tunnel { +function add_ip6gretap_tunnel { + + # assign ipv6 address + ip netns exec at_ns0 ip addr add ::11/96 dev veth0 + ip netns exec at_ns0 ip link set dev veth0 up + ip addr add dev veth1 ::22/96 + ip link set dev veth1 up + # in namespace ip netns exec at_ns0 \ - ip link add dev $DEV_NS type $TYPE seq key 2 local 172.16.1.100 remote 172.16.1.200 erspan 123 + ip link add dev $DEV_NS type $TYPE flowlabel 0xbcdef key 2 \ + local ::11 remote ::22 + + ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 + ip netns exec at_ns0 ip addr add dev $DEV_NS fc80::100/96 + ip netns exec at_ns0 ip link set dev $DEV_NS up + + # out of namespace + ip link add dev $DEV type $TYPE external + ip addr add dev $DEV 10.1.1.200/24 + ip addr add dev $DEV fc80::200/24 + ip link set dev $DEV up +} + +function add_erspan_tunnel { + # in namespace + if [ "$1" == "v1" ]; then + ip netns exec at_ns0 \ + ip link add dev $DEV_NS type $TYPE seq key 2 \ + local 172.16.1.100 remote 172.16.1.200 \ + erspan_ver 1 erspan 123 + else + ip netns exec at_ns0 \ + ip link add dev $DEV_NS type $TYPE seq key 2 \ + local 172.16.1.100 remote 172.16.1.200 \ + erspan_ver 2 erspan_dir egress erspan_hwid 3 + fi ip netns exec at_ns0 ip link set dev $DEV_NS up ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 @@ -46,6 +79,35 @@ function add_erspan_tunnel { ip addr add dev $DEV 10.1.1.200/24 } +function add_ip6erspan_tunnel { + + # assign ipv6 address + ip netns exec at_ns0 ip addr add ::11/96 dev veth0 + ip netns exec at_ns0 ip link set dev veth0 up + ip addr add dev veth1 ::22/96 + ip link set dev veth1 up + + # in namespace + if [ "$1" == "v1" ]; then + ip netns exec at_ns0 \ + ip link add dev $DEV_NS type $TYPE seq key 2 \ + local ::11 remote ::22 \ + erspan_ver 1 erspan 123 + else + ip netns exec at_ns0 \ + ip link add dev $DEV_NS type $TYPE seq key 2 \ + local ::11 remote ::22 \ + erspan_ver 2 
erspan_dir egress erspan_hwid 7 + fi + ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 + ip netns exec at_ns0 ip link set dev $DEV_NS up + + # out of namespace + ip link add dev $DEV type $TYPE external + ip addr add dev $DEV 10.1.1.200/24 + ip link set dev $DEV up +} + function add_vxlan_tunnel { # Set static ARP entry here because iptables set-mark works # on L3 packet, as a result not applying to ARP packets, @@ -113,18 +175,65 @@ function test_gre { cleanup } +function test_ip6gre { + TYPE=ip6gre + DEV_NS=ip6gre00 + DEV=ip6gre11 + config_device + # reuse the ip6gretap function + add_ip6gretap_tunnel + attach_bpf $DEV ip6gretap_set_tunnel ip6gretap_get_tunnel + # underlay + ping6 -c 4 ::11 + # overlay: ipv4 over ipv6 + ip netns exec at_ns0 ping -c 1 10.1.1.200 + ping -c 1 10.1.1.100 + # overlay: ipv6 over ipv6 + ip netns exec at_ns0 ping6 -c 1 fc80::200 + cleanup +} + +function test_ip6gretap { + TYPE=ip6gretap + DEV_NS=ip6gretap00 + DEV=ip6gretap11 + config_device + add_ip6gretap_tunnel + attach_bpf $DEV ip6gretap_set_tunnel ip6gretap_get_tunnel + # underlay + ping6 -c 4 ::11 + # overlay: ipv4 over ipv6 + ip netns exec at_ns0 ping -i .2 -c 1 10.1.1.200 + ping -c 1 10.1.1.100 + # overlay: ipv6 over ipv6 + ip netns exec at_ns0 ping6 -c 1 fc80::200 + cleanup +} + function test_erspan { TYPE=erspan DEV_NS=erspan00 DEV=erspan11 config_device - add_erspan_tunnel + add_erspan_tunnel $1 attach_bpf $DEV erspan_set_tunnel erspan_get_tunnel ping -c 1 10.1.1.100 ip netns exec at_ns0 ping -c 1 10.1.1.200 cleanup } +function test_ip6erspan { + TYPE=ip6erspan + DEV_NS=ip6erspan00 + DEV=ip6erspan11 + config_device + add_ip6erspan_tunnel $1 + attach_bpf $DEV ip4ip6erspan_set_tunnel ip4ip6erspan_get_tunnel + ping6 -c 3 ::11 + ip netns exec at_ns0 ping -c 1 10.1.1.200 + cleanup +} + function test_vxlan { TYPE=vxlan DEV_NS=vxlan00 @@ -175,9 +284,12 @@ function cleanup { ip link del veth1 ip link del ipip11 ip link del gretap11 + ip link del ip6gre11 + ip link del ip6gretap11 ip link del vxlan11 ip link del geneve11 ip link del erspan11 + ip link del ip6erspan11 pkill tcpdump pkill cat set -ex @@ -187,8 +299,16 @@ trap cleanup 0 2 3 6 9 cleanup echo "Testing GRE tunnel..." test_gre +echo "Testing IP6GRE tunnel..." +test_ip6gre +echo "Testing IP6GRETAP tunnel..." +test_ip6gretap echo "Testing ERSPAN tunnel..." -test_erspan +test_erspan v1 +test_erspan v2 +echo "Testing IP6ERSPAN tunnel..." +test_ip6erspan v1 +test_ip6erspan v2 echo "Testing VXLAN tunnel..." test_vxlan echo "Testing GENEVE tunnel..." 
diff --git a/samples/bpf/trace_event_kern.c b/samples/bpf/trace_event_kern.c index 41b6115a32eb..a77a583d94d4 100644 --- a/samples/bpf/trace_event_kern.c +++ b/samples/bpf/trace_event_kern.c @@ -37,10 +37,14 @@ struct bpf_map_def SEC("maps") stackmap = { SEC("perf_event") int bpf_prog1(struct bpf_perf_event_data *ctx) { + char time_fmt1[] = "Time Enabled: %llu, Time Running: %llu"; + char time_fmt2[] = "Get Time Failed, ErrCode: %d"; char fmt[] = "CPU-%d period %lld ip %llx"; u32 cpu = bpf_get_smp_processor_id(); + struct bpf_perf_event_value value_buf; struct key_t key; u64 *val, one = 1; + int ret; if (ctx->sample_period < 10000) /* ignore warmup */ @@ -54,6 +58,12 @@ int bpf_prog1(struct bpf_perf_event_data *ctx) return 0; } + ret = bpf_perf_prog_read_value(ctx, (void *)&value_buf, sizeof(struct bpf_perf_event_value)); + if (!ret) + bpf_trace_printk(time_fmt1, sizeof(time_fmt1), value_buf.enabled, value_buf.running); + else + bpf_trace_printk(time_fmt2, sizeof(time_fmt2), ret); + val = bpf_map_lookup_elem(&counts, &key); if (val) (*val)++; diff --git a/samples/bpf/trace_event_user.c b/samples/bpf/trace_event_user.c index 7bd827b84a67..bf4f1b6d9a52 100644 --- a/samples/bpf/trace_event_user.c +++ b/samples/bpf/trace_event_user.c @@ -127,6 +127,9 @@ static void test_perf_event_all_cpu(struct perf_event_attr *attr) int *pmu_fd = malloc(nr_cpus * sizeof(int)); int i, error = 0; + /* system wide perf event, no need to inherit */ + attr->inherit = 0; + /* open perf_event on all cpus */ for (i = 0; i < nr_cpus; i++) { pmu_fd[i] = sys_perf_event_open(attr, -1, i, -1, 0); @@ -154,6 +157,11 @@ static void test_perf_event_task(struct perf_event_attr *attr) { int pmu_fd; + /* per task perf event, enable inherit so the "dd ..." command can be traced properly. + * Enabling inherit will cause bpf_perf_prog_read_time helper failure. 
+ */ + attr->inherit = 1; + /* open task bound event */ pmu_fd = sys_perf_event_open(attr, 0, -1, -1, 0); if (pmu_fd < 0) { @@ -175,14 +183,12 @@ static void test_bpf_perf_event(void) .freq = 1, .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, - .inherit = 1, }; struct perf_event_attr attr_type_sw = { .sample_freq = SAMPLE_FREQ, .freq = 1, .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_CLOCK, - .inherit = 1, }; struct perf_event_attr attr_hw_cache_l1d = { .sample_freq = SAMPLE_FREQ, @@ -192,7 +198,6 @@ static void test_bpf_perf_event(void) PERF_COUNT_HW_CACHE_L1D | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16), - .inherit = 1, }; struct perf_event_attr attr_hw_cache_branch_miss = { .sample_freq = SAMPLE_FREQ, @@ -202,7 +207,6 @@ static void test_bpf_perf_event(void) PERF_COUNT_HW_CACHE_BPU | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16), - .inherit = 1, }; struct perf_event_attr attr_type_raw = { .sample_freq = SAMPLE_FREQ, @@ -210,7 +214,6 @@ static void test_bpf_perf_event(void) .type = PERF_TYPE_RAW, /* Intel Instruction Retired */ .config = 0xc0, - .inherit = 1, }; printf("Test HW_CPU_CYCLES\n"); diff --git a/samples/bpf/tracex6_kern.c b/samples/bpf/tracex6_kern.c index e7d180305974..46c557afac73 100644 --- a/samples/bpf/tracex6_kern.c +++ b/samples/bpf/tracex6_kern.c @@ -15,6 +15,12 @@ struct bpf_map_def SEC("maps") values = { .value_size = sizeof(u64), .max_entries = 64, }; +struct bpf_map_def SEC("maps") values2 = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(int), + .value_size = sizeof(struct bpf_perf_event_value), + .max_entries = 64, +}; SEC("kprobe/htab_map_get_next_key") int bpf_prog1(struct pt_regs *ctx) @@ -37,5 +43,25 @@ int bpf_prog1(struct pt_regs *ctx) return 0; } +SEC("kprobe/htab_map_lookup_elem") +int bpf_prog2(struct pt_regs *ctx) +{ + u32 key = bpf_get_smp_processor_id(); + struct bpf_perf_event_value *val, buf; + int error; + + error = bpf_perf_event_read_value(&counters, key, &buf, sizeof(buf)); + if (error) + return 0; + + val = bpf_map_lookup_elem(&values2, &key); + if (val) + *val = buf; + else + bpf_map_update_elem(&values2, &key, &buf, BPF_NOEXIST); + + return 0; +} + char _license[] SEC("license") = "GPL"; u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/tracex6_user.c b/samples/bpf/tracex6_user.c index a8c22dcf8e4e..89ab8d408474 100644 --- a/samples/bpf/tracex6_user.c +++ b/samples/bpf/tracex6_user.c @@ -23,6 +23,7 @@ static void check_on_cpu(int cpu, struct perf_event_attr *attr) { + struct bpf_perf_event_value value2; int pmu_fd, error = 0; cpu_set_t set; __u64 value; @@ -47,8 +48,18 @@ static void check_on_cpu(int cpu, struct perf_event_attr *attr) fprintf(stderr, "Value missing for CPU %d\n", cpu); error = 1; goto on_exit; + } else { + fprintf(stderr, "CPU %d: %llu\n", cpu, value); + } + /* The above bpf_map_lookup_elem should trigger the second kprobe */ + if (bpf_map_lookup_elem(map_fd[2], &cpu, &value2)) { + fprintf(stderr, "Value2 missing for CPU %d\n", cpu); + error = 1; + goto on_exit; + } else { + fprintf(stderr, "CPU %d: counter: %llu, enabled: %llu, running: %llu\n", cpu, + value2.counter, value2.enabled, value2.running); } - fprintf(stderr, "CPU %d: %llu\n", cpu, value); on_exit: assert(bpf_map_delete_elem(map_fd[0], &cpu) == 0 || error); diff --git a/samples/bpf/tracex7_kern.c b/samples/bpf/tracex7_kern.c new file mode 100644 index 000000000000..1ab308a43e0f --- /dev/null +++ b/samples/bpf/tracex7_kern.c @@ -0,0 +1,16 @@ 
+#include <uapi/linux/ptrace.h> +#include <uapi/linux/bpf.h> +#include <linux/version.h> +#include "bpf_helpers.h" + +SEC("kprobe/open_ctree") +int bpf_prog1(struct pt_regs *ctx) +{ + unsigned long rc = -12; + + bpf_override_return(ctx, rc); + return 0; +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c new file mode 100644 index 000000000000..8a52ac492e8b --- /dev/null +++ b/samples/bpf/tracex7_user.c @@ -0,0 +1,28 @@ +#define _GNU_SOURCE + +#include <stdio.h> +#include <linux/bpf.h> +#include <unistd.h> +#include "libbpf.h" +#include "bpf_load.h" + +int main(int argc, char **argv) +{ + FILE *f; + char filename[256]; + char command[256]; + int ret; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + + snprintf(command, 256, "mount %s tmpmnt/", argv[1]); + f = popen(command, "r"); + ret = pclose(f); + + return ret ? 0 : 1; +} diff --git a/samples/bpf/xdp1_user.c b/samples/bpf/xdp1_user.c index 2431c0321b71..b901ee2b3336 100644 --- a/samples/bpf/xdp1_user.c +++ b/samples/bpf/xdp1_user.c @@ -14,6 +14,7 @@ #include <string.h> #include <unistd.h> #include <libgen.h> +#include <sys/resource.h> #include "bpf_load.h" #include "bpf_util.h" @@ -24,7 +25,7 @@ static __u32 xdp_flags; static void int_exit(int sig) { - set_link_xdp_fd(ifindex, -1, xdp_flags); + bpf_set_link_xdp_fd(ifindex, -1, xdp_flags); exit(0); } @@ -69,6 +70,7 @@ static void usage(const char *prog) int main(int argc, char **argv) { + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; const char *optstr = "SN"; char filename[256]; int opt; @@ -91,6 +93,12 @@ int main(int argc, char **argv) usage(basename(argv[0])); return 1; } + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + ifindex = strtoul(argv[optind], NULL, 0); snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); @@ -108,7 +116,7 @@ int main(int argc, char **argv) signal(SIGINT, int_exit); signal(SIGTERM, int_exit); - if (set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) { + if (bpf_set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) { printf("link set xdp fd failed\n"); return 1; } diff --git a/samples/bpf/xdp2skb_meta.sh b/samples/bpf/xdp2skb_meta.sh new file mode 100755 index 000000000000..b9c9549c4c27 --- /dev/null +++ b/samples/bpf/xdp2skb_meta.sh @@ -0,0 +1,220 @@ +#!/bin/bash +# +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2018 Jesper Dangaard Brouer, Red Hat Inc. +# +# Bash-shell example on using iproute2 tools 'tc' and 'ip' to load +# eBPF programs, both for XDP and clsbpf. Shell script function +# wrappers and even long options parsing is illustrated, for ease of +# use. +# +# Related to sample/bpf/xdp2skb_meta_kern.c, which contains BPF-progs +# that need to collaborate between XDP and TC hooks. Thus, it is +# convenient that the same tool load both programs that need to work +# together. 
+# +BPF_FILE=xdp2skb_meta_kern.o +DIR=$(dirname $0) + +export TC=/usr/sbin/tc +export IP=/usr/sbin/ip + +function usage() { + echo "" + echo "Usage: $0 [-vfh] --dev ethX" + echo " -d | --dev : Network device (required)" + echo " --flush : Cleanup flush TC and XDP progs" + echo " --list : (\$LIST) List TC and XDP progs" + echo " -v | --verbose : (\$VERBOSE) Verbose" + echo " --dry-run : (\$DRYRUN) Dry-run only (echo commands)" + echo "" +} + +## -- General shell logging cmds -- +function err() { + local exitcode=$1 + shift + echo "ERROR: $@" >&2 + exit $exitcode +} + +function info() { + if [[ -n "$VERBOSE" ]]; then + echo "# $@" + fi +} + +## -- Helper function calls -- + +# Wrapper call for TC and IP +# - Will display the offending command on failure +function _call_cmd() { + local cmd="$1" + local allow_fail="$2" + shift 2 + if [[ -n "$VERBOSE" ]]; then + echo "$(basename $cmd) $@" + fi + if [[ -n "$DRYRUN" ]]; then + return + fi + $cmd "$@" + local status=$? + if (( $status != 0 )); then + if [[ "$allow_fail" == "" ]]; then + err 2 "Exec error($status) occurred cmd: \"$cmd $@\"" + fi + fi +} +function call_tc() { + _call_cmd "$TC" "" "$@" +} +function call_tc_allow_fail() { + _call_cmd "$TC" "allow_fail" "$@" +} +function call_ip() { + _call_cmd "$IP" "" "$@" +} + +## --- Parse command line arguments / parameters --- +# Using external program "getopt" to get --long-options +OPTIONS=$(getopt -o vfhd: \ + --long verbose,flush,help,list,dev:,dry-run -- "$@") +if (( $? != 0 )); then + err 4 "Error calling getopt" +fi +eval set -- "$OPTIONS" + +unset DEV +unset FLUSH +while true; do + case "$1" in + -d | --dev ) # device + DEV=$2 + info "Device set to: DEV=$DEV" >&2 + shift 2 + ;; + -v | --verbose) + VERBOSE=yes + # info "Verbose mode: VERBOSE=$VERBOSE" >&2 + shift + ;; + --dry-run ) + DRYRUN=yes + VERBOSE=yes + info "Dry-run mode: enable VERBOSE and don't call TC+IP" >&2 + shift + ;; + -f | --flush ) + FLUSH=yes + shift + ;; + --list ) + LIST=yes + shift + ;; + -- ) + shift + break + ;; + -h | --help ) + usage; + exit 0 + ;; + * ) + shift + break + ;; + esac +done + +FILE="$DIR/$BPF_FILE" +if [[ ! 
-e $FILE ]]; then + err 3 "Missing BPF object file ($FILE)" +fi + +if [[ -z $DEV ]]; then + usage + err 2 "Please specify network device -- required option --dev" +fi + +## -- Function calls -- + +function list_tc() +{ + local device="$1" + shift + info "Listing current TC ingress rules" + call_tc filter show dev $device ingress +} + +function list_xdp() +{ + local device="$1" + shift + info "Listing current XDP device($device) setting" + call_ip link show dev $device | grep --color=auto xdp +} + +function flush_tc() +{ + local device="$1" + shift + info "Flush TC on device: $device" + call_tc_allow_fail filter del dev $device ingress + call_tc_allow_fail qdisc del dev $device clsact +} + +function flush_xdp() +{ + local device="$1" + shift + info "Flush XDP on device: $device" + call_ip link set dev $device xdp off +} + +function attach_tc_mark() +{ + local device="$1" + local file="$2" + local prog="tc_mark" + shift 2 + + # Re-attach clsact to clear/flush existing role + call_tc_allow_fail qdisc del dev $device clsact 2> /dev/null + call_tc qdisc add dev $device clsact + + # Attach BPF prog + call_tc filter add dev $device ingress \ + prio 1 handle 1 bpf da obj $file sec $prog +} + +function attach_xdp_mark() +{ + local device="$1" + local file="$2" + local prog="xdp_mark" + shift 2 + + # Remove XDP prog in-case it's already loaded + # TODO: Need ip-link option to override/replace existing XDP prog + flush_xdp $device + + # Attach XDP/BPF prog + call_ip link set dev $device xdp obj $file sec $prog +} + +if [[ -n $FLUSH ]]; then + flush_tc $DEV + flush_xdp $DEV + exit 0 +fi + +if [[ -n $LIST ]]; then + list_tc $DEV + list_xdp $DEV + exit 0 +fi + +attach_tc_mark $DEV $FILE +attach_xdp_mark $DEV $FILE diff --git a/samples/bpf/xdp2skb_meta_kern.c b/samples/bpf/xdp2skb_meta_kern.c new file mode 100644 index 000000000000..0c12048ac79f --- /dev/null +++ b/samples/bpf/xdp2skb_meta_kern.c @@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: GPL-2.0 + * Copyright (c) 2018 Jesper Dangaard Brouer, Red Hat Inc. + * + * Example howto transfer info from XDP to SKB, e.g. skb->mark + * ----------------------------------------------------------- + * This uses the XDP data_meta infrastructure, and is a cooperation + * between two bpf-programs (1) XDP and (2) clsact at TC-ingress hook. + * + * Notice: This example does not use the BPF C-loader (bpf_load.c), + * but instead rely on the iproute2 TC tool for loading BPF-objects. + */ +#include <uapi/linux/bpf.h> +#include <uapi/linux/pkt_cls.h> + +#include "bpf_helpers.h" + +/* + * This struct is stored in the XDP 'data_meta' area, which is located + * just in-front-of the raw packet payload data. The meaning is + * specific to these two BPF programs that use it as a communication + * channel. XDP adjust/increase the area via a bpf-helper, and TC use + * boundary checks to see if data have been provided. + * + * The struct must be 4 byte aligned, which here is enforced by the + * struct __attribute__((aligned(4))). + */ +struct meta_info { + __u32 mark; +} __attribute__((aligned(4))); + +SEC("xdp_mark") +int _xdp_mark(struct xdp_md *ctx) +{ + struct meta_info *meta; + void *data, *data_end; + int ret; + + /* Reserve space in-front of data pointer for our meta info. + * (Notice drivers not supporting data_meta will fail here!) 
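[Note on the xdp2skb_meta_kern.c program below] The sample returns XDP_ABORTED when bpf_xdp_adjust_meta() fails, which is what the "drivers not supporting data_meta will fail here" notice refers to. A hedged variant sketch (not part of the sample) that instead degrades gracefully and passes the packet through unmarked on such drivers:

#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"

struct meta_info {
	__u32 mark;
} __attribute__((aligned(4)));	/* same layout as the struct defined below */

SEC("xdp_mark_fallback")
int _xdp_mark_fallback(struct xdp_md *ctx)	/* hypothetical variant program */
{
	struct meta_info *meta;
	void *data;

	if (bpf_xdp_adjust_meta(ctx, -(int)sizeof(*meta)) < 0)
		return XDP_PASS;	/* driver lacks data_meta; skip marking */

	data = (void *)(unsigned long)ctx->data;
	meta = (void *)(unsigned long)ctx->data_meta;
	if (meta + 1 > data)		/* verifier-required bounds check */
		return XDP_PASS;

	meta->mark = 42;
	return XDP_PASS;
}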
+ */ + ret = bpf_xdp_adjust_meta(ctx, -(int)sizeof(*meta)); + if (ret < 0) + return XDP_ABORTED; + + /* Notice: Kernel-side verifier requires that loading of + * ctx->data MUST happen _after_ helper bpf_xdp_adjust_meta(), + * as pkt-data pointers are invalidated. Helpers that require + * this are determined/marked by bpf_helper_changes_pkt_data() + */ + data = (void *)(unsigned long)ctx->data; + + /* Check data_meta have room for meta_info struct */ + meta = (void *)(unsigned long)ctx->data_meta; + if (meta + 1 > data) + return XDP_ABORTED; + + meta->mark = 42; + + return XDP_PASS; +} + +SEC("tc_mark") +int _tc_mark(struct __sk_buff *ctx) +{ + void *data = (void *)(unsigned long)ctx->data; + void *data_end = (void *)(unsigned long)ctx->data_end; + void *data_meta = (void *)(unsigned long)ctx->data_meta; + struct meta_info *meta = data_meta; + + /* Check XDP gave us some data_meta */ + if (meta + 1 > data) { + ctx->mark = 41; + /* Skip "accept" if no data_meta is avail */ + return TC_ACT_OK; + } + + /* Hint: See func tc_cls_act_is_valid_access() for BPF_WRITE access */ + ctx->mark = meta->mark; /* Transfer XDP-mark to SKB-mark */ + + return TC_ACT_OK; +} + +/* Manually attaching these programs: +export DEV=ixgbe2 +export FILE=xdp2skb_meta_kern.o + +# via TC command +tc qdisc del dev $DEV clsact 2> /dev/null +tc qdisc add dev $DEV clsact +tc filter add dev $DEV ingress prio 1 handle 1 bpf da obj $FILE sec tc_mark +tc filter show dev $DEV ingress + +# XDP via IP command: +ip link set dev $DEV xdp off +ip link set dev $DEV xdp obj $FILE sec xdp_mark + +# Use iptable to "see" if SKBs are marked +iptables -I INPUT -p icmp -m mark --mark 41 # == 0x29 +iptables -I INPUT -p icmp -m mark --mark 42 # == 0x2a + +# Hint: catch XDP_ABORTED errors via +perf record -e xdp:* +perf script + +*/ diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c index 74f3fd8ed729..211db8ded0de 100644 --- a/samples/bpf/xdp_monitor_kern.c +++ b/samples/bpf/xdp_monitor_kern.c @@ -1,6 +1,7 @@ -/* XDP monitor tool, based on tracepoints +/* SPDX-License-Identifier: GPL-2.0 + * Copyright(c) 2017-2018 Jesper Dangaard Brouer, Red Hat Inc. * - * Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat Inc. 
+ * XDP monitor tool, based on tracepoints */ #include <uapi/linux/bpf.h> #include "bpf_helpers.h" @@ -13,23 +14,27 @@ struct bpf_map_def SEC("maps") redirect_err_cnt = { /* TODO: have entries for all possible errno's */ }; +#define XDP_UNKNOWN XDP_REDIRECT + 1 +struct bpf_map_def SEC("maps") exception_cnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u64), + .max_entries = XDP_UNKNOWN + 1, +}; + /* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format * Code in: kernel/include/trace/events/xdp.h */ struct xdp_redirect_ctx { - unsigned short common_type; // offset:0; size:2; signed:0; - unsigned char common_flags; // offset:2; size:1; signed:0; - unsigned char common_preempt_count;// offset:3; size:1; signed:0; - int common_pid; // offset:4; size:4; signed:1; - - int prog_id; // offset:8; size:4; signed:1; - u32 act; // offset:12 size:4; signed:0; - int ifindex; // offset:16 size:4; signed:1; - int err; // offset:20 size:4; signed:1; - int to_ifindex; // offset:24 size:4; signed:1; - u32 map_id; // offset:28 size:4; signed:0; - int map_index; // offset:32 size:4; signed:1; -}; // offset:36 + u64 __pad; // First 8 bytes are not accessible by bpf code + int prog_id; // offset:8; size:4; signed:1; + u32 act; // offset:12 size:4; signed:0; + int ifindex; // offset:16 size:4; signed:1; + int err; // offset:20 size:4; signed:1; + int to_ifindex; // offset:24 size:4; signed:1; + u32 map_id; // offset:28 size:4; signed:0; + int map_index; // offset:32 size:4; signed:1; +}; // offset:36 enum { XDP_REDIRECT_SUCCESS = 0, @@ -48,7 +53,7 @@ int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx) cnt = bpf_map_lookup_elem(&redirect_err_cnt, &key); if (!cnt) - return 0; + return 1; *cnt += 1; return 0; /* Indicate event was filtered (no further processing)*/ @@ -86,3 +91,120 @@ int trace_xdp_redirect_map(struct xdp_redirect_ctx *ctx) { return xdp_redirect_collect_stat(ctx); } + +/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct xdp_exception_ctx { + u64 __pad; // First 8 bytes are not accessible by bpf code + int prog_id; // offset:8; size:4; signed:1; + u32 act; // offset:12; size:4; signed:0; + int ifindex; // offset:16; size:4; signed:1; +}; + +SEC("tracepoint/xdp/xdp_exception") +int trace_xdp_exception(struct xdp_exception_ctx *ctx) +{ + u64 *cnt; + u32 key; + + key = ctx->act; + if (key > XDP_REDIRECT) + key = XDP_UNKNOWN; + + cnt = bpf_map_lookup_elem(&exception_cnt, &key); + if (!cnt) + return 1; + *cnt += 1; + + return 0; +} + +/* Common stats data record shared with _user.c */ +struct datarec { + u64 processed; + u64 dropped; + u64 info; +}; +#define MAX_CPUS 64 + +struct bpf_map_def SEC("maps") cpumap_enqueue_cnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct datarec), + .max_entries = MAX_CPUS, +}; + +struct bpf_map_def SEC("maps") cpumap_kthread_cnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct datarec), + .max_entries = 1, +}; + +/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct cpumap_enqueue_ctx { + u64 __pad; // First 8 bytes are not accessible by bpf code + int map_id; // offset:8; size:4; signed:1; + u32 act; // offset:12; size:4; signed:0; + int cpu; // offset:16; size:4; signed:1; + unsigned int drops; // offset:20; size:4; signed:0; + 
unsigned int processed; // offset:24; size:4; signed:0; + int to_cpu; // offset:28; size:4; signed:1; +}; + +SEC("tracepoint/xdp/xdp_cpumap_enqueue") +int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx) +{ + u32 to_cpu = ctx->to_cpu; + struct datarec *rec; + + if (to_cpu >= MAX_CPUS) + return 1; + + rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu); + if (!rec) + return 0; + rec->processed += ctx->processed; + rec->dropped += ctx->drops; + + /* Record bulk events, then userspace can calc average bulk size */ + if (ctx->processed > 0) + rec->info += 1; + + return 0; +} + +/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct cpumap_kthread_ctx { + u64 __pad; // First 8 bytes are not accessible by bpf code + int map_id; // offset:8; size:4; signed:1; + u32 act; // offset:12; size:4; signed:0; + int cpu; // offset:16; size:4; signed:1; + unsigned int drops; // offset:20; size:4; signed:0; + unsigned int processed; // offset:24; size:4; signed:0; + int sched; // offset:28; size:4; signed:1; +}; + +SEC("tracepoint/xdp/xdp_cpumap_kthread") +int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx) +{ + struct datarec *rec; + u32 key = 0; + + rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key); + if (!rec) + return 0; + rec->processed += ctx->processed; + rec->dropped += ctx->drops; + + /* Count times kthread yielded CPU via schedule call */ + if (ctx->sched) + rec->info++; + + return 0; +} diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c index b51b4f5e3257..eec14520d513 100644 --- a/samples/bpf/xdp_monitor_user.c +++ b/samples/bpf/xdp_monitor_user.c @@ -1,4 +1,5 @@ -/* Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. +/* SPDX-License-Identifier: GPL-2.0 + * Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. 
*/ static const char *__doc__= "XDP monitor tool, based on tracepoints\n" @@ -20,6 +21,7 @@ static const char *__doc_err_only__= #include <unistd.h> #include <locale.h> +#include <sys/resource.h> #include <getopt.h> #include <net/if.h> #include <time.h> @@ -39,6 +41,9 @@ static const struct option long_options[] = { {0, 0, NULL, 0 } }; +/* C standard specifies two constants, EXIT_SUCCESS(0) and EXIT_FAILURE(1) */ +#define EXIT_FAIL_MEM 5 + static void usage(char *argv[]) { int i; @@ -61,7 +66,7 @@ static void usage(char *argv[]) } #define NANOSEC_PER_SEC 1000000000 /* 10^9 */ -__u64 gettime(void) +static __u64 gettime(void) { struct timespec t; int res; @@ -89,98 +94,437 @@ static const char *err2str(int err) return redir_names[err]; return NULL; } +/* enum xdp_action */ +#define XDP_UNKNOWN XDP_REDIRECT + 1 +#define XDP_ACTION_MAX (XDP_UNKNOWN + 1) +static const char *xdp_action_names[XDP_ACTION_MAX] = { + [XDP_ABORTED] = "XDP_ABORTED", + [XDP_DROP] = "XDP_DROP", + [XDP_PASS] = "XDP_PASS", + [XDP_TX] = "XDP_TX", + [XDP_REDIRECT] = "XDP_REDIRECT", + [XDP_UNKNOWN] = "XDP_UNKNOWN", +}; +static const char *action2str(int action) +{ + if (action < XDP_ACTION_MAX) + return xdp_action_names[action]; + return NULL; +} +/* Common stats data record shared with _kern.c */ +struct datarec { + __u64 processed; + __u64 dropped; + __u64 info; +}; +#define MAX_CPUS 64 + +/* Userspace structs for collection of stats from maps */ struct record { - __u64 counter; __u64 timestamp; + struct datarec total; + struct datarec *cpu; +}; +struct u64rec { + __u64 processed; +}; +struct record_u64 { + /* record for _kern side __u64 values */ + __u64 timestamp; + struct u64rec total; + struct u64rec *cpu; }; struct stats_record { - struct record xdp_redir[REDIR_RES_MAX]; + struct record_u64 xdp_redirect[REDIR_RES_MAX]; + struct record_u64 xdp_exception[XDP_ACTION_MAX]; + struct record xdp_cpumap_kthread; + struct record xdp_cpumap_enqueue[MAX_CPUS]; }; -static void stats_print_headers(bool err_only) -{ - if (err_only) - printf("\n%s\n", __doc_err_only__); - - printf("%-14s %-10s %-18s %-9s\n", - "XDP_REDIRECT", "pps ", "pps-human-readable", "measure-period"); -} - -static void stats_print(struct stats_record *rec, - struct stats_record *prev, - bool err_only) +static bool map_collect_record(int fd, __u32 key, struct record *rec) { - int i = 0; + /* For percpu maps, userspace gets a value per possible CPU */ + unsigned int nr_cpus = bpf_num_possible_cpus(); + struct datarec values[nr_cpus]; + __u64 sum_processed = 0; + __u64 sum_dropped = 0; + __u64 sum_info = 0; + int i; - if (err_only) - i = REDIR_ERROR; - - for (; i < REDIR_RES_MAX; i++) { - struct record *r = &rec->xdp_redir[i]; - struct record *p = &prev->xdp_redir[i]; - __u64 period = 0; - __u64 packets = 0; - double pps = 0; - double period_ = 0; - - if (p->timestamp) { - packets = r->counter - p->counter; - period = r->timestamp - p->timestamp; - if (period > 0) { - period_ = ((double) period / NANOSEC_PER_SEC); - pps = packets / period_; - } - } + if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { + fprintf(stderr, + "ERR: bpf_map_lookup_elem failed key:0x%X\n", key); + return false; + } + /* Get time as close as possible to reading map contents */ + rec->timestamp = gettime(); - printf("%-14s %-10.0f %'-18.0f %f\n", - err2str(i), pps, pps, period_); + /* Record and sum values from each CPU */ + for (i = 0; i < nr_cpus; i++) { + rec->cpu[i].processed = values[i].processed; + sum_processed += values[i].processed; + rec->cpu[i].dropped = values[i].dropped; + 
sum_dropped += values[i].dropped; + rec->cpu[i].info = values[i].info; + sum_info += values[i].info; } + rec->total.processed = sum_processed; + rec->total.dropped = sum_dropped; + rec->total.info = sum_info; + return true; } -static __u64 get_key32_value64_percpu(int fd, __u32 key) +static bool map_collect_record_u64(int fd, __u32 key, struct record_u64 *rec) { /* For percpu maps, userspace gets a value per possible CPU */ unsigned int nr_cpus = bpf_num_possible_cpus(); - __u64 values[nr_cpus]; - __u64 sum = 0; + struct u64rec values[nr_cpus]; + __u64 sum_total = 0; int i; if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { fprintf(stderr, "ERR: bpf_map_lookup_elem failed key:0x%X\n", key); - return 0; + return false; } + /* Get time as close as possible to reading map contents */ + rec->timestamp = gettime(); - /* Sum values from each CPU */ + /* Record and sum values from each CPU */ for (i = 0; i < nr_cpus; i++) { - sum += values[i]; + rec->cpu[i].processed = values[i].processed; + sum_total += values[i].processed; + } + rec->total.processed = sum_total; + return true; +} + +static double calc_period(struct record *r, struct record *p) +{ + double period_ = 0; + __u64 period = 0; + + period = r->timestamp - p->timestamp; + if (period > 0) + period_ = ((double) period / NANOSEC_PER_SEC); + + return period_; +} + +static double calc_period_u64(struct record_u64 *r, struct record_u64 *p) +{ + double period_ = 0; + __u64 period = 0; + + period = r->timestamp - p->timestamp; + if (period > 0) + period_ = ((double) period / NANOSEC_PER_SEC); + + return period_; +} + +static double calc_pps(struct datarec *r, struct datarec *p, double period) +{ + __u64 packets = 0; + double pps = 0; + + if (period > 0) { + packets = r->processed - p->processed; + pps = packets / period; + } + return pps; +} + +static double calc_pps_u64(struct u64rec *r, struct u64rec *p, double period) +{ + __u64 packets = 0; + double pps = 0; + + if (period > 0) { + packets = r->processed - p->processed; + pps = packets / period; } - return sum; + return pps; } -static bool stats_collect(int fd, struct stats_record *rec) +static double calc_drop(struct datarec *r, struct datarec *p, double period) { + __u64 packets = 0; + double pps = 0; + + if (period > 0) { + packets = r->dropped - p->dropped; + pps = packets / period; + } + return pps; +} + +static double calc_info(struct datarec *r, struct datarec *p, double period) +{ + __u64 packets = 0; + double pps = 0; + + if (period > 0) { + packets = r->info - p->info; + pps = packets / period; + } + return pps; +} + +static void stats_print(struct stats_record *stats_rec, + struct stats_record *stats_prev, + bool err_only) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + int rec_i = 0, i, to_cpu; + double t = 0, pps = 0; + + /* Header */ + printf("%-15s %-7s %-12s %-12s %-9s\n", + "XDP-event", "CPU:to", "pps", "drop-pps", "extra-info"); + + /* tracepoint: xdp:xdp_redirect_* */ + if (err_only) + rec_i = REDIR_ERROR; + + for (; rec_i < REDIR_RES_MAX; rec_i++) { + struct record_u64 *rec, *prev; + char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %s\n"; + char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %s\n"; + + rec = &stats_rec->xdp_redirect[rec_i]; + prev = &stats_prev->xdp_redirect[rec_i]; + t = calc_period_u64(rec, prev); + + for (i = 0; i < nr_cpus; i++) { + struct u64rec *r = &rec->cpu[i]; + struct u64rec *p = &prev->cpu[i]; + + pps = calc_pps_u64(r, p, t); + if (pps > 0) + printf(fmt1, "XDP_REDIRECT", i, + rec_i ? 0.0: pps, rec_i ? 
pps : 0.0, + err2str(rec_i)); + } + pps = calc_pps_u64(&rec->total, &prev->total, t); + printf(fmt2, "XDP_REDIRECT", "total", + rec_i ? 0.0: pps, rec_i ? pps : 0.0, err2str(rec_i)); + } + + /* tracepoint: xdp:xdp_exception */ + for (rec_i = 0; rec_i < XDP_ACTION_MAX; rec_i++) { + struct record_u64 *rec, *prev; + char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %s\n"; + char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %s\n"; + + rec = &stats_rec->xdp_exception[rec_i]; + prev = &stats_prev->xdp_exception[rec_i]; + t = calc_period_u64(rec, prev); + + for (i = 0; i < nr_cpus; i++) { + struct u64rec *r = &rec->cpu[i]; + struct u64rec *p = &prev->cpu[i]; + + pps = calc_pps_u64(r, p, t); + if (pps > 0) + printf(fmt1, "Exception", i, + 0.0, pps, err2str(rec_i)); + } + pps = calc_pps_u64(&rec->total, &prev->total, t); + if (pps > 0) + printf(fmt2, "Exception", "total", + 0.0, pps, action2str(rec_i)); + } + + /* cpumap enqueue stats */ + for (to_cpu = 0; to_cpu < MAX_CPUS; to_cpu++) { + char *fmt1 = "%-15s %3d:%-3d %'-12.0f %'-12.0f %'-10.2f %s\n"; + char *fmt2 = "%-15s %3s:%-3d %'-12.0f %'-12.0f %'-10.2f %s\n"; + struct record *rec, *prev; + char *info_str = ""; + double drop, info; + + rec = &stats_rec->xdp_cpumap_enqueue[to_cpu]; + prev = &stats_prev->xdp_cpumap_enqueue[to_cpu]; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps(r, p, t); + drop = calc_drop(r, p, t); + info = calc_info(r, p, t); + if (info > 0) { + info_str = "bulk-average"; + info = pps / info; /* calc average bulk size */ + } + if (pps > 0) + printf(fmt1, "cpumap-enqueue", + i, to_cpu, pps, drop, info, info_str); + } + pps = calc_pps(&rec->total, &prev->total, t); + if (pps > 0) { + drop = calc_drop(&rec->total, &prev->total, t); + info = calc_info(&rec->total, &prev->total, t); + if (info > 0) { + info_str = "bulk-average"; + info = pps / info; /* calc average bulk size */ + } + printf(fmt2, "cpumap-enqueue", + "sum", to_cpu, pps, drop, info, info_str); + } + } + + /* cpumap kthread stats */ + { + char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.0f %s\n"; + char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.0f %s\n"; + struct record *rec, *prev; + double drop, info; + char *i_str = ""; + + rec = &stats_rec->xdp_cpumap_kthread; + prev = &stats_prev->xdp_cpumap_kthread; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps(r, p, t); + drop = calc_drop(r, p, t); + info = calc_info(r, p, t); + if (info > 0) + i_str = "sched"; + if (pps > 0) + printf(fmt1, "cpumap-kthread", + i, pps, drop, info, i_str); + } + pps = calc_pps(&rec->total, &prev->total, t); + drop = calc_drop(&rec->total, &prev->total, t); + info = calc_info(&rec->total, &prev->total, t); + if (info > 0) + i_str = "sched-sum"; + printf(fmt2, "cpumap-kthread", "total", pps, drop, info, i_str); + } + + printf("\n"); +} + +static bool stats_collect(struct stats_record *rec) +{ + int fd; int i; /* TODO: Detect if someone unloaded the perf event_fd's, as * this can happen by someone running perf-record -e */ - for (i = 0; i < REDIR_RES_MAX; i++) { - rec->xdp_redir[i].timestamp = gettime(); - rec->xdp_redir[i].counter = get_key32_value64_percpu(fd, i); + fd = map_data[0].fd; /* map0: redirect_err_cnt */ + for (i = 0; i < REDIR_RES_MAX; i++) + map_collect_record_u64(fd, i, &rec->xdp_redirect[i]); + + fd = map_data[1].fd; /* map1: exception_cnt */ + for (i = 0; i < XDP_ACTION_MAX; i++) 
{ + map_collect_record_u64(fd, i, &rec->xdp_exception[i]); } + + fd = map_data[2].fd; /* map2: cpumap_enqueue_cnt */ + for (i = 0; i < MAX_CPUS; i++) + map_collect_record(fd, i, &rec->xdp_cpumap_enqueue[i]); + + fd = map_data[3].fd; /* map3: cpumap_kthread_cnt */ + map_collect_record(fd, 0, &rec->xdp_cpumap_kthread); + return true; } +static void *alloc_rec_per_cpu(int record_size) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + void *array; + size_t size; + + size = record_size * nr_cpus; + array = malloc(size); + memset(array, 0, size); + if (!array) { + fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus); + exit(EXIT_FAIL_MEM); + } + return array; +} + +static struct stats_record *alloc_stats_record(void) +{ + struct stats_record *rec; + int rec_sz; + int i; + + /* Alloc main stats_record structure */ + rec = malloc(sizeof(*rec)); + memset(rec, 0, sizeof(*rec)); + if (!rec) { + fprintf(stderr, "Mem alloc error\n"); + exit(EXIT_FAIL_MEM); + } + + /* Alloc stats stored per CPU for each record */ + rec_sz = sizeof(struct u64rec); + for (i = 0; i < REDIR_RES_MAX; i++) + rec->xdp_redirect[i].cpu = alloc_rec_per_cpu(rec_sz); + + for (i = 0; i < XDP_ACTION_MAX; i++) + rec->xdp_exception[i].cpu = alloc_rec_per_cpu(rec_sz); + + rec_sz = sizeof(struct datarec); + rec->xdp_cpumap_kthread.cpu = alloc_rec_per_cpu(rec_sz); + + for (i = 0; i < MAX_CPUS; i++) + rec->xdp_cpumap_enqueue[i].cpu = alloc_rec_per_cpu(rec_sz); + + return rec; +} + +static void free_stats_record(struct stats_record *r) +{ + int i; + + for (i = 0; i < REDIR_RES_MAX; i++) + free(r->xdp_redirect[i].cpu); + + for (i = 0; i < XDP_ACTION_MAX; i++) + free(r->xdp_exception[i].cpu); + + free(r->xdp_cpumap_kthread.cpu); + + for (i = 0; i < MAX_CPUS; i++) + free(r->xdp_cpumap_enqueue[i].cpu); + + free(r); +} + +/* Pointer swap trick */ +static inline void swap(struct stats_record **a, struct stats_record **b) +{ + struct stats_record *tmp; + + tmp = *a; + *a = *b; + *b = tmp; +} + static void stats_poll(int interval, bool err_only) { - struct stats_record rec, prev; - int map_fd; + struct stats_record *rec, *prev; - memset(&rec, 0, sizeof(rec)); + rec = alloc_stats_record(); + prev = alloc_stats_record(); + stats_collect(rec); + + if (err_only) + printf("\n%s\n", __doc_err_only__); /* Trick to pretty printf with thousands separators use %' */ setlocale(LC_NUMERIC, "en_US"); @@ -190,23 +534,26 @@ static void stats_poll(int interval, bool err_only) printf("\n%s", __doc__); /* TODO Need more advanced stats on error types */ - if (verbose) - printf(" - Stats map: %s\n", map_data[0].name); - map_fd = map_data[0].fd; - - stats_print_headers(err_only); + if (verbose) { + printf(" - Stats map0: %s\n", map_data[0].name); + printf(" - Stats map1: %s\n", map_data[1].name); + printf("\n"); + } fflush(stdout); while (1) { - memcpy(&prev, &rec, sizeof(rec)); - stats_collect(map_fd, &rec); - stats_print(&rec, &prev, err_only); + swap(&prev, &rec); + stats_collect(rec); + stats_print(rec, prev, err_only); fflush(stdout); sleep(interval); } + + free_stats_record(rec); + free_stats_record(prev); } -void print_bpf_prog_info(void) +static void print_bpf_prog_info(void) { int i; @@ -235,6 +582,7 @@ void print_bpf_prog_info(void) int main(int argc, char **argv) { + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; int longindex = 0, opt; int ret = EXIT_SUCCESS; char bpf_obj_file[256]; @@ -265,13 +613,18 @@ int main(int argc, char **argv) } } + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return EXIT_FAILURE; + } 
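[Note on the xdp_monitor_user.c collection code above] map_collect_record() and map_collect_record_u64() rely on the per-CPU map ABI: a single bpf_map_lookup_elem() on a BPF_MAP_TYPE_PERCPU_ARRAY fills one value slot per possible CPU, which userspace then sums. A stripped-down sketch of that idiom; sum_percpu_u64() is a hypothetical helper, roughly what the removed get_key32_value64_percpu() did:

#include <linux/types.h>
#include "libbpf.h"	/* samples' bpf_map_lookup_elem() wrapper */
#include "bpf_util.h"	/* bpf_num_possible_cpus() */

static int sum_percpu_u64(int map_fd, __u32 key, __u64 *total)
{
	unsigned int nr_cpus = bpf_num_possible_cpus();
	__u64 values[nr_cpus];	/* kernel fills one slot per possible CPU */
	__u64 sum = 0;
	unsigned int i;

	if (bpf_map_lookup_elem(map_fd, &key, values) != 0)
		return -1;
	for (i = 0; i < nr_cpus; i++)
		sum += values[i];
	*total = sum;
	return 0;
}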
+ if (load_bpf_file(bpf_obj_file)) { printf("ERROR - bpf_log_buf: %s", bpf_log_buf); - return 1; + return EXIT_FAILURE; } if (!prog_fd[0]) { printf("ERROR - load_bpf_file: %s\n", strerror(errno)); - return 1; + return EXIT_FAILURE; } if (debug) { diff --git a/samples/bpf/xdp_redirect_cpu_kern.c b/samples/bpf/xdp_redirect_cpu_kern.c new file mode 100644 index 000000000000..303e9e7161f3 --- /dev/null +++ b/samples/bpf/xdp_redirect_cpu_kern.c @@ -0,0 +1,609 @@ +/* XDP redirect to CPUs via cpumap (BPF_MAP_TYPE_CPUMAP) + * + * GPLv2, Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. + */ +#include <uapi/linux/if_ether.h> +#include <uapi/linux/if_packet.h> +#include <uapi/linux/if_vlan.h> +#include <uapi/linux/ip.h> +#include <uapi/linux/ipv6.h> +#include <uapi/linux/in.h> +#include <uapi/linux/tcp.h> +#include <uapi/linux/udp.h> + +#include <uapi/linux/bpf.h> +#include "bpf_helpers.h" + +#define MAX_CPUS 12 /* WARNING - sync with _user.c */ + +/* Special map type that can XDP_REDIRECT frames to another CPU */ +struct bpf_map_def SEC("maps") cpu_map = { + .type = BPF_MAP_TYPE_CPUMAP, + .key_size = sizeof(u32), + .value_size = sizeof(u32), + .max_entries = MAX_CPUS, +}; + +/* Common stats data record to keep userspace more simple */ +struct datarec { + __u64 processed; + __u64 dropped; + __u64 issue; +}; + +/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success + * feedback. Redirect TX errors can be caught via a tracepoint. + */ +struct bpf_map_def SEC("maps") rx_cnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct datarec), + .max_entries = 1, +}; + +/* Used by trace point */ +struct bpf_map_def SEC("maps") redirect_err_cnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct datarec), + .max_entries = 2, + /* TODO: have entries for all possible errno's */ +}; + +/* Used by trace point */ +struct bpf_map_def SEC("maps") cpumap_enqueue_cnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct datarec), + .max_entries = MAX_CPUS, +}; + +/* Used by trace point */ +struct bpf_map_def SEC("maps") cpumap_kthread_cnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct datarec), + .max_entries = 1, +}; + +/* Set of maps controlling available CPU, and for iterating through + * selectable redirect CPUs. 
+ */ +struct bpf_map_def SEC("maps") cpus_available = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u32), + .max_entries = MAX_CPUS, +}; +struct bpf_map_def SEC("maps") cpus_count = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u32), + .max_entries = 1, +}; +struct bpf_map_def SEC("maps") cpus_iterator = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u32), + .max_entries = 1, +}; + +/* Used by trace point */ +struct bpf_map_def SEC("maps") exception_cnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct datarec), + .max_entries = 1, +}; + +/* Helper parse functions */ + +/* Parse Ethernet layer 2, extract network layer 3 offset and protocol + * + * Returns false on error and non-supported ether-type + */ +struct vlan_hdr { + __be16 h_vlan_TCI; + __be16 h_vlan_encapsulated_proto; +}; + +static __always_inline +bool parse_eth(struct ethhdr *eth, void *data_end, + u16 *eth_proto, u64 *l3_offset) +{ + u16 eth_type; + u64 offset; + + offset = sizeof(*eth); + if ((void *)eth + offset > data_end) + return false; + + eth_type = eth->h_proto; + + /* Skip non 802.3 Ethertypes */ + if (unlikely(ntohs(eth_type) < ETH_P_802_3_MIN)) + return false; + + /* Handle VLAN tagged packet */ + if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) { + struct vlan_hdr *vlan_hdr; + + vlan_hdr = (void *)eth + offset; + offset += sizeof(*vlan_hdr); + if ((void *)eth + offset > data_end) + return false; + eth_type = vlan_hdr->h_vlan_encapsulated_proto; + } + /* TODO: Handle double VLAN tagged packet */ + + *eth_proto = ntohs(eth_type); + *l3_offset = offset; + return true; +} + +static __always_inline +u16 get_dest_port_ipv4_udp(struct xdp_md *ctx, u64 nh_off) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct iphdr *iph = data + nh_off; + struct udphdr *udph; + u16 dport; + + if (iph + 1 > data_end) + return 0; + if (!(iph->protocol == IPPROTO_UDP)) + return 0; + + udph = (void *)(iph + 1); + if (udph + 1 > data_end) + return 0; + + dport = ntohs(udph->dest); + return dport; +} + +static __always_inline +int get_proto_ipv4(struct xdp_md *ctx, u64 nh_off) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct iphdr *iph = data + nh_off; + + if (iph + 1 > data_end) + return 0; + return iph->protocol; +} + +static __always_inline +int get_proto_ipv6(struct xdp_md *ctx, u64 nh_off) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ipv6hdr *ip6h = data + nh_off; + + if (ip6h + 1 > data_end) + return 0; + return ip6h->nexthdr; +} + +SEC("xdp_cpu_map0") +int xdp_prognum0_no_touch(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct datarec *rec; + u32 *cpu_selected; + u32 cpu_dest; + u32 key = 0; + + /* Only use first entry in cpus_available */ + cpu_selected = bpf_map_lookup_elem(&cpus_available, &key); + if (!cpu_selected) + return XDP_ABORTED; + cpu_dest = *cpu_selected; + + /* Count RX packet in map */ + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (!rec) + return XDP_ABORTED; + rec->processed++; + + if (cpu_dest >= MAX_CPUS) { + rec->issue++; + return XDP_ABORTED; + } + + return bpf_redirect_map(&cpu_map, cpu_dest, 0); +} + +SEC("xdp_cpu_map1_touch_data") +int xdp_prognum1_touch_data(struct xdp_md *ctx) +{ + void *data_end = (void 
*)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + struct datarec *rec; + u32 *cpu_selected; + u32 cpu_dest; + u16 eth_type; + u32 key = 0; + + /* Only use first entry in cpus_available */ + cpu_selected = bpf_map_lookup_elem(&cpus_available, &key); + if (!cpu_selected) + return XDP_ABORTED; + cpu_dest = *cpu_selected; + + /* Validate packet length is minimum Eth header size */ + if (eth + 1 > data_end) + return XDP_ABORTED; + + /* Count RX packet in map */ + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (!rec) + return XDP_ABORTED; + rec->processed++; + + /* Read packet data, and use it (drop non 802.3 Ethertypes) */ + eth_type = eth->h_proto; + if (ntohs(eth_type) < ETH_P_802_3_MIN) { + rec->dropped++; + return XDP_DROP; + } + + if (cpu_dest >= MAX_CPUS) { + rec->issue++; + return XDP_ABORTED; + } + + return bpf_redirect_map(&cpu_map, cpu_dest, 0); +} + +SEC("xdp_cpu_map2_round_robin") +int xdp_prognum2_round_robin(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + struct datarec *rec; + u32 cpu_dest; + u32 *cpu_lookup; + u32 key0 = 0; + + u32 *cpu_selected; + u32 *cpu_iterator; + u32 *cpu_max; + u32 cpu_idx; + + cpu_max = bpf_map_lookup_elem(&cpus_count, &key0); + if (!cpu_max) + return XDP_ABORTED; + + cpu_iterator = bpf_map_lookup_elem(&cpus_iterator, &key0); + if (!cpu_iterator) + return XDP_ABORTED; + cpu_idx = *cpu_iterator; + + *cpu_iterator += 1; + if (*cpu_iterator == *cpu_max) + *cpu_iterator = 0; + + cpu_selected = bpf_map_lookup_elem(&cpus_available, &cpu_idx); + if (!cpu_selected) + return XDP_ABORTED; + cpu_dest = *cpu_selected; + + /* Count RX packet in map */ + rec = bpf_map_lookup_elem(&rx_cnt, &key0); + if (!rec) + return XDP_ABORTED; + rec->processed++; + + if (cpu_dest >= MAX_CPUS) { + rec->issue++; + return XDP_ABORTED; + } + + return bpf_redirect_map(&cpu_map, cpu_dest, 0); +} + +SEC("xdp_cpu_map3_proto_separate") +int xdp_prognum3_proto_separate(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + u8 ip_proto = IPPROTO_UDP; + struct datarec *rec; + u16 eth_proto = 0; + u64 l3_offset = 0; + u32 cpu_dest = 0; + u32 cpu_idx = 0; + u32 *cpu_lookup; + u32 key = 0; + + /* Count RX packet in map */ + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (!rec) + return XDP_ABORTED; + rec->processed++; + + if (!(parse_eth(eth, data_end, ð_proto, &l3_offset))) + return XDP_PASS; /* Just skip */ + + /* Extract L4 protocol */ + switch (eth_proto) { + case ETH_P_IP: + ip_proto = get_proto_ipv4(ctx, l3_offset); + break; + case ETH_P_IPV6: + ip_proto = get_proto_ipv6(ctx, l3_offset); + break; + case ETH_P_ARP: + cpu_idx = 0; /* ARP packet handled on separate CPU */ + break; + default: + cpu_idx = 0; + } + + /* Choose CPU based on L4 protocol */ + switch (ip_proto) { + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + cpu_idx = 2; + break; + case IPPROTO_TCP: + cpu_idx = 0; + break; + case IPPROTO_UDP: + cpu_idx = 1; + break; + default: + cpu_idx = 0; + } + + cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx); + if (!cpu_lookup) + return XDP_ABORTED; + cpu_dest = *cpu_lookup; + + if (cpu_dest >= MAX_CPUS) { + rec->issue++; + return XDP_ABORTED; + } + + return bpf_redirect_map(&cpu_map, cpu_dest, 0); +} + +SEC("xdp_cpu_map4_ddos_filter_pktgen") +int xdp_prognum4_ddos_filter_pktgen(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = 
(void *)(long)ctx->data; + struct ethhdr *eth = data; + u8 ip_proto = IPPROTO_UDP; + struct datarec *rec; + u16 eth_proto = 0; + u64 l3_offset = 0; + u32 cpu_dest = 0; + u32 cpu_idx = 0; + u16 dest_port; + u32 *cpu_lookup; + u32 key = 0; + + /* Count RX packet in map */ + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (!rec) + return XDP_ABORTED; + rec->processed++; + + if (!(parse_eth(eth, data_end, ð_proto, &l3_offset))) + return XDP_PASS; /* Just skip */ + + /* Extract L4 protocol */ + switch (eth_proto) { + case ETH_P_IP: + ip_proto = get_proto_ipv4(ctx, l3_offset); + break; + case ETH_P_IPV6: + ip_proto = get_proto_ipv6(ctx, l3_offset); + break; + case ETH_P_ARP: + cpu_idx = 0; /* ARP packet handled on separate CPU */ + break; + default: + cpu_idx = 0; + } + + /* Choose CPU based on L4 protocol */ + switch (ip_proto) { + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + cpu_idx = 2; + break; + case IPPROTO_TCP: + cpu_idx = 0; + break; + case IPPROTO_UDP: + cpu_idx = 1; + /* DDoS filter UDP port 9 (pktgen) */ + dest_port = get_dest_port_ipv4_udp(ctx, l3_offset); + if (dest_port == 9) { + if (rec) + rec->dropped++; + return XDP_DROP; + } + break; + default: + cpu_idx = 0; + } + + cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx); + if (!cpu_lookup) + return XDP_ABORTED; + cpu_dest = *cpu_lookup; + + if (cpu_dest >= MAX_CPUS) { + rec->issue++; + return XDP_ABORTED; + } + + return bpf_redirect_map(&cpu_map, cpu_dest, 0); +} + + +char _license[] SEC("license") = "GPL"; + +/*** Trace point code ***/ + +/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct xdp_redirect_ctx { + u64 __pad; // First 8 bytes are not accessible by bpf code + int prog_id; // offset:8; size:4; signed:1; + u32 act; // offset:12 size:4; signed:0; + int ifindex; // offset:16 size:4; signed:1; + int err; // offset:20 size:4; signed:1; + int to_ifindex; // offset:24 size:4; signed:1; + u32 map_id; // offset:28 size:4; signed:0; + int map_index; // offset:32 size:4; signed:1; +}; // offset:36 + +enum { + XDP_REDIRECT_SUCCESS = 0, + XDP_REDIRECT_ERROR = 1 +}; + +static __always_inline +int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx) +{ + u32 key = XDP_REDIRECT_ERROR; + struct datarec *rec; + int err = ctx->err; + + if (!err) + key = XDP_REDIRECT_SUCCESS; + + rec = bpf_map_lookup_elem(&redirect_err_cnt, &key); + if (!rec) + return 0; + rec->dropped += 1; + + return 0; /* Indicate event was filtered (no further processing)*/ + /* + * Returning 1 here would allow e.g. a perf-record tracepoint + * to see and record these events, but it doesn't work well + * in-practice as stopping perf-record also unload this + * bpf_prog. Plus, there is additional overhead of doing so. 
+ */ +} + +SEC("tracepoint/xdp/xdp_redirect_err") +int trace_xdp_redirect_err(struct xdp_redirect_ctx *ctx) +{ + return xdp_redirect_collect_stat(ctx); +} + +SEC("tracepoint/xdp/xdp_redirect_map_err") +int trace_xdp_redirect_map_err(struct xdp_redirect_ctx *ctx) +{ + return xdp_redirect_collect_stat(ctx); +} + +/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct xdp_exception_ctx { + u64 __pad; // First 8 bytes are not accessible by bpf code + int prog_id; // offset:8; size:4; signed:1; + u32 act; // offset:12; size:4; signed:0; + int ifindex; // offset:16; size:4; signed:1; +}; + +SEC("tracepoint/xdp/xdp_exception") +int trace_xdp_exception(struct xdp_exception_ctx *ctx) +{ + struct datarec *rec; + u32 key = 0; + + rec = bpf_map_lookup_elem(&exception_cnt, &key); + if (!rec) + return 1; + rec->dropped += 1; + + return 0; +} + +/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct cpumap_enqueue_ctx { + u64 __pad; // First 8 bytes are not accessible by bpf code + int map_id; // offset:8; size:4; signed:1; + u32 act; // offset:12; size:4; signed:0; + int cpu; // offset:16; size:4; signed:1; + unsigned int drops; // offset:20; size:4; signed:0; + unsigned int processed; // offset:24; size:4; signed:0; + int to_cpu; // offset:28; size:4; signed:1; +}; + +SEC("tracepoint/xdp/xdp_cpumap_enqueue") +int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx) +{ + u32 to_cpu = ctx->to_cpu; + struct datarec *rec; + + if (to_cpu >= MAX_CPUS) + return 1; + + rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu); + if (!rec) + return 0; + rec->processed += ctx->processed; + rec->dropped += ctx->drops; + + /* Record bulk events, then userspace can calc average bulk size */ + if (ctx->processed > 0) + rec->issue += 1; + + /* Inception: It's possible to detect overload situations, via + * this tracepoint. This can be used for creating a feedback + * loop to XDP, which can take appropriate actions to mitigate + * this overload situation. + */ + return 0; +} + +/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct cpumap_kthread_ctx { + u64 __pad; // First 8 bytes are not accessible by bpf code + int map_id; // offset:8; size:4; signed:1; + u32 act; // offset:12; size:4; signed:0; + int cpu; // offset:16; size:4; signed:1; + unsigned int drops; // offset:20; size:4; signed:0; + unsigned int processed; // offset:24; size:4; signed:0; + int sched; // offset:28; size:4; signed:1; +}; + +SEC("tracepoint/xdp/xdp_cpumap_kthread") +int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx) +{ + struct datarec *rec; + u32 key = 0; + + rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key); + if (!rec) + return 0; + rec->processed += ctx->processed; + rec->dropped += ctx->drops; + + /* Count times kthread yielded CPU via schedule call */ + if (ctx->sched) + rec->issue++; + + return 0; +} diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c new file mode 100644 index 000000000000..23744a8aaf21 --- /dev/null +++ b/samples/bpf/xdp_redirect_cpu_user.c @@ -0,0 +1,697 @@ +/* GPLv2 Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc. 
+ */ +static const char *__doc__ = + " XDP redirect with a CPU-map type \"BPF_MAP_TYPE_CPUMAP\""; + +#include <errno.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <unistd.h> +#include <locale.h> +#include <sys/resource.h> +#include <getopt.h> +#include <net/if.h> +#include <time.h> + +#include <arpa/inet.h> +#include <linux/if_link.h> + +#define MAX_CPUS 12 /* WARNING - sync with _kern.c */ + +/* How many xdp_progs are defined in _kern.c */ +#define MAX_PROG 5 + +/* Wanted to get rid of bpf_load.h and fake-"libbpf.h" (and instead + * use bpf/libbpf.h), but cannot as (currently) needed for XDP + * attaching to a device via bpf_set_link_xdp_fd() + */ +#include "libbpf.h" +#include "bpf_load.h" + +#include "bpf_util.h" + +static int ifindex = -1; +static char ifname_buf[IF_NAMESIZE]; +static char *ifname; + +static __u32 xdp_flags; + +/* Exit return codes */ +#define EXIT_OK 0 +#define EXIT_FAIL 1 +#define EXIT_FAIL_OPTION 2 +#define EXIT_FAIL_XDP 3 +#define EXIT_FAIL_BPF 4 +#define EXIT_FAIL_MEM 5 + +static const struct option long_options[] = { + {"help", no_argument, NULL, 'h' }, + {"dev", required_argument, NULL, 'd' }, + {"skb-mode", no_argument, NULL, 'S' }, + {"debug", no_argument, NULL, 'D' }, + {"sec", required_argument, NULL, 's' }, + {"prognum", required_argument, NULL, 'p' }, + {"qsize", required_argument, NULL, 'q' }, + {"cpu", required_argument, NULL, 'c' }, + {"stress-mode", no_argument, NULL, 'x' }, + {"no-separators", no_argument, NULL, 'z' }, + {0, 0, NULL, 0 } +}; + +static void int_exit(int sig) +{ + fprintf(stderr, + "Interrupted: Removing XDP program on ifindex:%d device:%s\n", + ifindex, ifname); + if (ifindex > -1) + bpf_set_link_xdp_fd(ifindex, -1, xdp_flags); + exit(EXIT_OK); +} + +static void usage(char *argv[]) +{ + int i; + + printf("\nDOCUMENTATION:\n%s\n", __doc__); + printf("\n"); + printf(" Usage: %s (options-see-below)\n", argv[0]); + printf(" Listing options:\n"); + for (i = 0; long_options[i].name != 0; i++) { + printf(" --%-12s", long_options[i].name); + if (long_options[i].flag != NULL) + printf(" flag (internal value:%d)", + *long_options[i].flag); + else + printf(" short-option: -%c", + long_options[i].val); + printf("\n"); + } + printf("\n"); +} + +/* gettime returns the current time of day in nanoseconds. + * Cost: clock_gettime (ns) => 26ns (CLOCK_MONOTONIC) + * clock_gettime (ns) => 9ns (CLOCK_MONOTONIC_COARSE) + */ +#define NANOSEC_PER_SEC 1000000000 /* 10^9 */ +static __u64 gettime(void) +{ + struct timespec t; + int res; + + res = clock_gettime(CLOCK_MONOTONIC, &t); + if (res < 0) { + fprintf(stderr, "Error with gettimeofday! 
(%i)\n", res); + exit(EXIT_FAIL); + } + return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec; +} + +/* Common stats data record shared with _kern.c */ +struct datarec { + __u64 processed; + __u64 dropped; + __u64 issue; +}; +struct record { + __u64 timestamp; + struct datarec total; + struct datarec *cpu; +}; +struct stats_record { + struct record rx_cnt; + struct record redir_err; + struct record kthread; + struct record exception; + struct record enq[MAX_CPUS]; +}; + +static bool map_collect_percpu(int fd, __u32 key, struct record *rec) +{ + /* For percpu maps, userspace gets a value per possible CPU */ + unsigned int nr_cpus = bpf_num_possible_cpus(); + struct datarec values[nr_cpus]; + __u64 sum_processed = 0; + __u64 sum_dropped = 0; + __u64 sum_issue = 0; + int i; + + if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { + fprintf(stderr, + "ERR: bpf_map_lookup_elem failed key:0x%X\n", key); + return false; + } + /* Get time as close as possible to reading map contents */ + rec->timestamp = gettime(); + + /* Record and sum values from each CPU */ + for (i = 0; i < nr_cpus; i++) { + rec->cpu[i].processed = values[i].processed; + sum_processed += values[i].processed; + rec->cpu[i].dropped = values[i].dropped; + sum_dropped += values[i].dropped; + rec->cpu[i].issue = values[i].issue; + sum_issue += values[i].issue; + } + rec->total.processed = sum_processed; + rec->total.dropped = sum_dropped; + rec->total.issue = sum_issue; + return true; +} + +static struct datarec *alloc_record_per_cpu(void) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + struct datarec *array; + size_t size; + + size = sizeof(struct datarec) * nr_cpus; + array = malloc(size); + memset(array, 0, size); + if (!array) { + fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus); + exit(EXIT_FAIL_MEM); + } + return array; +} + +static struct stats_record *alloc_stats_record(void) +{ + struct stats_record *rec; + int i; + + rec = malloc(sizeof(*rec)); + memset(rec, 0, sizeof(*rec)); + if (!rec) { + fprintf(stderr, "Mem alloc error\n"); + exit(EXIT_FAIL_MEM); + } + rec->rx_cnt.cpu = alloc_record_per_cpu(); + rec->redir_err.cpu = alloc_record_per_cpu(); + rec->kthread.cpu = alloc_record_per_cpu(); + rec->exception.cpu = alloc_record_per_cpu(); + for (i = 0; i < MAX_CPUS; i++) + rec->enq[i].cpu = alloc_record_per_cpu(); + + return rec; +} + +static void free_stats_record(struct stats_record *r) +{ + int i; + + for (i = 0; i < MAX_CPUS; i++) + free(r->enq[i].cpu); + free(r->exception.cpu); + free(r->kthread.cpu); + free(r->redir_err.cpu); + free(r->rx_cnt.cpu); + free(r); +} + +static double calc_period(struct record *r, struct record *p) +{ + double period_ = 0; + __u64 period = 0; + + period = r->timestamp - p->timestamp; + if (period > 0) + period_ = ((double) period / NANOSEC_PER_SEC); + + return period_; +} + +static __u64 calc_pps(struct datarec *r, struct datarec *p, double period_) +{ + __u64 packets = 0; + __u64 pps = 0; + + if (period_ > 0) { + packets = r->processed - p->processed; + pps = packets / period_; + } + return pps; +} + +static __u64 calc_drop_pps(struct datarec *r, struct datarec *p, double period_) +{ + __u64 packets = 0; + __u64 pps = 0; + + if (period_ > 0) { + packets = r->dropped - p->dropped; + pps = packets / period_; + } + return pps; +} + +static __u64 calc_errs_pps(struct datarec *r, + struct datarec *p, double period_) +{ + __u64 packets = 0; + __u64 pps = 0; + + if (period_ > 0) { + packets = r->issue - p->issue; + pps = packets / period_; + } + return pps; +} + +static void 
stats_print(struct stats_record *stats_rec, + struct stats_record *stats_prev, + int prog_num) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + double pps = 0, drop = 0, err = 0; + struct record *rec, *prev; + int to_cpu; + double t; + int i; + + /* Header */ + printf("Running XDP/eBPF prog_num:%d\n", prog_num); + printf("%-15s %-7s %-14s %-11s %-9s\n", + "XDP-cpumap", "CPU:to", "pps", "drop-pps", "extra-info"); + + /* XDP rx_cnt */ + { + char *fmt_rx = "%-15s %-7d %'-14.0f %'-11.0f %'-10.0f %s\n"; + char *fm2_rx = "%-15s %-7s %'-14.0f %'-11.0f\n"; + char *errstr = ""; + + rec = &stats_rec->rx_cnt; + prev = &stats_prev->rx_cnt; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps(r, p, t); + drop = calc_drop_pps(r, p, t); + err = calc_errs_pps(r, p, t); + if (err > 0) + errstr = "cpu-dest/err"; + if (pps > 0) + printf(fmt_rx, "XDP-RX", + i, pps, drop, err, errstr); + } + pps = calc_pps(&rec->total, &prev->total, t); + drop = calc_drop_pps(&rec->total, &prev->total, t); + err = calc_errs_pps(&rec->total, &prev->total, t); + printf(fm2_rx, "XDP-RX", "total", pps, drop); + } + + /* cpumap enqueue stats */ + for (to_cpu = 0; to_cpu < MAX_CPUS; to_cpu++) { + char *fmt = "%-15s %3d:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n"; + char *fm2 = "%-15s %3s:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n"; + char *errstr = ""; + + rec = &stats_rec->enq[to_cpu]; + prev = &stats_prev->enq[to_cpu]; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps(r, p, t); + drop = calc_drop_pps(r, p, t); + err = calc_errs_pps(r, p, t); + if (err > 0) { + errstr = "bulk-average"; + err = pps / err; /* calc average bulk size */ + } + if (pps > 0) + printf(fmt, "cpumap-enqueue", + i, to_cpu, pps, drop, err, errstr); + } + pps = calc_pps(&rec->total, &prev->total, t); + if (pps > 0) { + drop = calc_drop_pps(&rec->total, &prev->total, t); + err = calc_errs_pps(&rec->total, &prev->total, t); + if (err > 0) { + errstr = "bulk-average"; + err = pps / err; /* calc average bulk size */ + } + printf(fm2, "cpumap-enqueue", + "sum", to_cpu, pps, drop, err, errstr); + } + } + + /* cpumap kthread stats */ + { + char *fmt_k = "%-15s %-7d %'-14.0f %'-11.0f %'-10.0f %s\n"; + char *fm2_k = "%-15s %-7s %'-14.0f %'-11.0f %'-10.0f %s\n"; + char *e_str = ""; + + rec = &stats_rec->kthread; + prev = &stats_prev->kthread; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps(r, p, t); + drop = calc_drop_pps(r, p, t); + err = calc_errs_pps(r, p, t); + if (err > 0) + e_str = "sched"; + if (pps > 0) + printf(fmt_k, "cpumap_kthread", + i, pps, drop, err, e_str); + } + pps = calc_pps(&rec->total, &prev->total, t); + drop = calc_drop_pps(&rec->total, &prev->total, t); + err = calc_errs_pps(&rec->total, &prev->total, t); + if (err > 0) + e_str = "sched-sum"; + printf(fm2_k, "cpumap_kthread", "total", pps, drop, err, e_str); + } + + /* XDP redirect err tracepoints (very unlikely) */ + { + char *fmt_err = "%-15s %-7d %'-14.0f %'-11.0f\n"; + char *fm2_err = "%-15s %-7s %'-14.0f %'-11.0f\n"; + + rec = &stats_rec->redir_err; + prev = &stats_prev->redir_err; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps(r, p, t); + drop = calc_drop_pps(r, p, t); + if (pps 
> 0) + printf(fmt_err, "redirect_err", i, pps, drop); + } + pps = calc_pps(&rec->total, &prev->total, t); + drop = calc_drop_pps(&rec->total, &prev->total, t); + printf(fm2_err, "redirect_err", "total", pps, drop); + } + + /* XDP general exception tracepoints */ + { + char *fmt_err = "%-15s %-7d %'-14.0f %'-11.0f\n"; + char *fm2_err = "%-15s %-7s %'-14.0f %'-11.0f\n"; + + rec = &stats_rec->exception; + prev = &stats_prev->exception; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps(r, p, t); + drop = calc_drop_pps(r, p, t); + if (pps > 0) + printf(fmt_err, "xdp_exception", i, pps, drop); + } + pps = calc_pps(&rec->total, &prev->total, t); + drop = calc_drop_pps(&rec->total, &prev->total, t); + printf(fm2_err, "xdp_exception", "total", pps, drop); + } + + printf("\n"); + fflush(stdout); +} + +static void stats_collect(struct stats_record *rec) +{ + int fd, i; + + fd = map_fd[1]; /* map: rx_cnt */ + map_collect_percpu(fd, 0, &rec->rx_cnt); + + fd = map_fd[2]; /* map: redirect_err_cnt */ + map_collect_percpu(fd, 1, &rec->redir_err); + + fd = map_fd[3]; /* map: cpumap_enqueue_cnt */ + for (i = 0; i < MAX_CPUS; i++) + map_collect_percpu(fd, i, &rec->enq[i]); + + fd = map_fd[4]; /* map: cpumap_kthread_cnt */ + map_collect_percpu(fd, 0, &rec->kthread); + + fd = map_fd[8]; /* map: exception_cnt */ + map_collect_percpu(fd, 0, &rec->exception); +} + + +/* Pointer swap trick */ +static inline void swap(struct stats_record **a, struct stats_record **b) +{ + struct stats_record *tmp; + + tmp = *a; + *a = *b; + *b = tmp; +} + +static int create_cpu_entry(__u32 cpu, __u32 queue_size, + __u32 avail_idx, bool new) +{ + __u32 curr_cpus_count = 0; + __u32 key = 0; + int ret; + + /* Add a CPU entry to cpumap, as this allocate a cpu entry in + * the kernel for the cpu. + */ + ret = bpf_map_update_elem(map_fd[0], &cpu, &queue_size, 0); + if (ret) { + fprintf(stderr, "Create CPU entry failed (err:%d)\n", ret); + exit(EXIT_FAIL_BPF); + } + + /* Inform bpf_prog's that a new CPU is available to select + * from via some control maps. + */ + /* map_fd[5] = cpus_available */ + ret = bpf_map_update_elem(map_fd[5], &avail_idx, &cpu, 0); + if (ret) { + fprintf(stderr, "Add to avail CPUs failed\n"); + exit(EXIT_FAIL_BPF); + } + + /* When not replacing/updating existing entry, bump the count */ + /* map_fd[6] = cpus_count */ + ret = bpf_map_lookup_elem(map_fd[6], &key, &curr_cpus_count); + if (ret) { + fprintf(stderr, "Failed reading curr cpus_count\n"); + exit(EXIT_FAIL_BPF); + } + if (new) { + curr_cpus_count++; + ret = bpf_map_update_elem(map_fd[6], &key, &curr_cpus_count, 0); + if (ret) { + fprintf(stderr, "Failed write curr cpus_count\n"); + exit(EXIT_FAIL_BPF); + } + } + /* map_fd[7] = cpus_iterator */ + printf("%s CPU:%u as idx:%u queue_size:%d (total cpus_count:%u)\n", + new ? "Add-new":"Replace", cpu, avail_idx, + queue_size, curr_cpus_count); + + return 0; +} + +/* CPUs are zero-indexed. 
Thus, add a special sentinel default value + * in map cpus_available to mark CPU index'es not configured + */ +static void mark_cpus_unavailable(void) +{ + __u32 invalid_cpu = MAX_CPUS; + int ret, i; + + for (i = 0; i < MAX_CPUS; i++) { + /* map_fd[5] = cpus_available */ + ret = bpf_map_update_elem(map_fd[5], &i, &invalid_cpu, 0); + if (ret) { + fprintf(stderr, "Failed marking CPU unavailable\n"); + exit(EXIT_FAIL_BPF); + } + } +} + +/* Stress cpumap management code by concurrently changing underlying cpumap */ +static void stress_cpumap(void) +{ + /* Changing qsize will cause kernel to free and alloc a new + * bpf_cpu_map_entry, with an associated/complicated tear-down + * procedure. + */ + create_cpu_entry(1, 1024, 0, false); + create_cpu_entry(1, 128, 0, false); + create_cpu_entry(1, 16000, 0, false); +} + +static void stats_poll(int interval, bool use_separators, int prog_num, + bool stress_mode) +{ + struct stats_record *record, *prev; + + record = alloc_stats_record(); + prev = alloc_stats_record(); + stats_collect(record); + + /* Trick to pretty printf with thousands separators use %' */ + if (use_separators) + setlocale(LC_NUMERIC, "en_US"); + + while (1) { + swap(&prev, &record); + stats_collect(record); + stats_print(record, prev, prog_num); + sleep(interval); + if (stress_mode) + stress_cpumap(); + } + + free_stats_record(record); + free_stats_record(prev); +} + +int main(int argc, char **argv) +{ + struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY}; + bool use_separators = true; + bool stress_mode = false; + char filename[256]; + bool debug = false; + int added_cpus = 0; + int longindex = 0; + int interval = 2; + int prog_num = 0; + int add_cpu = -1; + __u32 qsize; + int opt; + + /* Notice: choosing he queue size is very important with the + * ixgbe driver, because it's driver page recycling trick is + * dependend on pages being returned quickly. The number of + * out-standing packets in the system must be less-than 2x + * RX-ring size. 
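[Note on the cpumap setup in xdp_redirect_cpu_user.c] The value written into a BPF_MAP_TYPE_CPUMAP entry is the queue size of that CPU's kthread, and writing the entry is what allocates it in the kernel; that is what create_cpu_entry() above does through map_fd[0], and what the queue-size comment here is about. A minimal hedged snippet, where cpu_map_fd, the CPU number and the queue size are illustrative values only:

#include <linux/types.h>
#include "libbpf.h"	/* samples' bpf_map_update_elem() wrapper */

/* Hypothetical helper: add CPU 2 to the cpumap with a 192-entry queue,
 * matching the default qsize = 128+64 chosen just below.
 */
static int add_cpu_to_cpumap(int cpu_map_fd)
{
	__u32 cpu = 2;		/* remote CPU that will run the network stack */
	__u32 qsize = 192;

	return bpf_map_update_elem(cpu_map_fd, &cpu, &qsize, 0);
}

From the command line the same effect comes from the tool's own options, e.g. something like "./xdp_redirect_cpu --dev <ifname> --cpu 2 --qsize 192" (the device name is an assumption; --cpu may be given multiple times).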
+ */ + qsize = 128+64; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + + if (load_bpf_file(filename)) { + fprintf(stderr, "ERR in load_bpf_file(): %s", bpf_log_buf); + return EXIT_FAIL; + } + + if (!prog_fd[0]) { + fprintf(stderr, "ERR: load_bpf_file: %s\n", strerror(errno)); + return EXIT_FAIL; + } + + mark_cpus_unavailable(); + + /* Parse commands line args */ + while ((opt = getopt_long(argc, argv, "hSd:", + long_options, &longindex)) != -1) { + switch (opt) { + case 'd': + if (strlen(optarg) >= IF_NAMESIZE) { + fprintf(stderr, "ERR: --dev name too long\n"); + goto error; + } + ifname = (char *)&ifname_buf; + strncpy(ifname, optarg, IF_NAMESIZE); + ifindex = if_nametoindex(ifname); + if (ifindex == 0) { + fprintf(stderr, + "ERR: --dev name unknown err(%d):%s\n", + errno, strerror(errno)); + goto error; + } + break; + case 's': + interval = atoi(optarg); + break; + case 'S': + xdp_flags |= XDP_FLAGS_SKB_MODE; + break; + case 'D': + debug = true; + break; + case 'x': + stress_mode = true; + break; + case 'z': + use_separators = false; + break; + case 'p': + /* Selecting eBPF prog to load */ + prog_num = atoi(optarg); + if (prog_num < 0 || prog_num >= MAX_PROG) { + fprintf(stderr, + "--prognum too large err(%d):%s\n", + errno, strerror(errno)); + goto error; + } + break; + case 'c': + /* Add multiple CPUs */ + add_cpu = strtoul(optarg, NULL, 0); + if (add_cpu >= MAX_CPUS) { + fprintf(stderr, + "--cpu nr too large for cpumap err(%d):%s\n", + errno, strerror(errno)); + goto error; + } + create_cpu_entry(add_cpu, qsize, added_cpus, true); + added_cpus++; + break; + case 'q': + qsize = atoi(optarg); + break; + case 'h': + error: + default: + usage(argv); + return EXIT_FAIL_OPTION; + } + } + /* Required option */ + if (ifindex == -1) { + fprintf(stderr, "ERR: required option --dev missing\n"); + usage(argv); + return EXIT_FAIL_OPTION; + } + /* Required option */ + if (add_cpu == -1) { + fprintf(stderr, "ERR: required option --cpu missing\n"); + fprintf(stderr, " Specify multiple --cpu option to add more\n"); + usage(argv); + return EXIT_FAIL_OPTION; + } + + /* Remove XDP program when program is interrupted */ + signal(SIGINT, int_exit); + + if (bpf_set_link_xdp_fd(ifindex, prog_fd[prog_num], xdp_flags) < 0) { + fprintf(stderr, "link set xdp fd failed\n"); + return EXIT_FAIL_XDP; + } + + if (debug) { + printf("Debug-mode reading trace pipe (fix #define DEBUG)\n"); + read_trace_pipe(); + } + + stats_poll(interval, use_separators, prog_num, stress_mode); + return EXIT_OK; +} diff --git a/samples/bpf/xdp_redirect_map_user.c b/samples/bpf/xdp_redirect_map_user.c index d4d86a273fba..7eae07d7293e 100644 --- a/samples/bpf/xdp_redirect_map_user.c +++ b/samples/bpf/xdp_redirect_map_user.c @@ -20,6 +20,7 @@ #include <string.h> #include <unistd.h> #include <libgen.h> +#include <sys/resource.h> #include "bpf_load.h" #include "bpf_util.h" @@ -33,9 +34,9 @@ static __u32 xdp_flags; static void int_exit(int sig) { - set_link_xdp_fd(ifindex_in, -1, xdp_flags); + bpf_set_link_xdp_fd(ifindex_in, -1, xdp_flags); if (ifindex_out_xdp_dummy_attached) - set_link_xdp_fd(ifindex_out, -1, xdp_flags); + bpf_set_link_xdp_fd(ifindex_out, -1, xdp_flags); exit(0); } @@ -74,6 +75,7 @@ static void usage(const char *prog) int main(int argc, char **argv) { + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; const char *optstr = "SN"; char filename[256]; int ret, opt, key = 0; @@ -97,6 +99,11 @@ int main(int argc, char 
**argv) return 1; } + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + ifindex_in = strtoul(argv[optind], NULL, 0); ifindex_out = strtoul(argv[optind + 1], NULL, 0); printf("input: %d output: %d\n", ifindex_in, ifindex_out); @@ -113,13 +120,13 @@ int main(int argc, char **argv) return 1; } - if (set_link_xdp_fd(ifindex_in, prog_fd[0], xdp_flags) < 0) { + if (bpf_set_link_xdp_fd(ifindex_in, prog_fd[0], xdp_flags) < 0) { printf("ERROR: link set xdp fd failed on %d\n", ifindex_in); return 1; } /* Loading dummy XDP prog on out-device */ - if (set_link_xdp_fd(ifindex_out, prog_fd[1], + if (bpf_set_link_xdp_fd(ifindex_out, prog_fd[1], (xdp_flags | XDP_FLAGS_UPDATE_IF_NOEXIST)) < 0) { printf("WARN: link set xdp fd failed on %d\n", ifindex_out); ifindex_out_xdp_dummy_attached = false; diff --git a/samples/bpf/xdp_redirect_user.c b/samples/bpf/xdp_redirect_user.c index 4475d837bf2c..d54e91eb6cbf 100644 --- a/samples/bpf/xdp_redirect_user.c +++ b/samples/bpf/xdp_redirect_user.c @@ -33,9 +33,9 @@ static __u32 xdp_flags; static void int_exit(int sig) { - set_link_xdp_fd(ifindex_in, -1, xdp_flags); + bpf_set_link_xdp_fd(ifindex_in, -1, xdp_flags); if (ifindex_out_xdp_dummy_attached) - set_link_xdp_fd(ifindex_out, -1, xdp_flags); + bpf_set_link_xdp_fd(ifindex_out, -1, xdp_flags); exit(0); } @@ -114,13 +114,13 @@ int main(int argc, char **argv) return 1; } - if (set_link_xdp_fd(ifindex_in, prog_fd[0], xdp_flags) < 0) { + if (bpf_set_link_xdp_fd(ifindex_in, prog_fd[0], xdp_flags) < 0) { printf("ERROR: link set xdp fd failed on %d\n", ifindex_in); return 1; } /* Loading dummy XDP prog on out-device */ - if (set_link_xdp_fd(ifindex_out, prog_fd[1], + if (bpf_set_link_xdp_fd(ifindex_out, prog_fd[1], (xdp_flags | XDP_FLAGS_UPDATE_IF_NOEXIST)) < 0) { printf("WARN: link set xdp fd failed on %d\n", ifindex_out); ifindex_out_xdp_dummy_attached = false; diff --git a/samples/bpf/xdp_router_ipv4_kern.c b/samples/bpf/xdp_router_ipv4_kern.c new file mode 100644 index 000000000000..993f56bc7b9a --- /dev/null +++ b/samples/bpf/xdp_router_ipv4_kern.c @@ -0,0 +1,186 @@ +/* Copyright (C) 2017 Cavium, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. 
+ */ +#define KBUILD_MODNAME "foo" +#include <uapi/linux/bpf.h> +#include <linux/in.h> +#include <linux/if_ether.h> +#include <linux/if_packet.h> +#include <linux/if_vlan.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include "bpf_helpers.h" +#include <linux/slab.h> +#include <net/ip_fib.h> + +struct trie_value { + __u8 prefix[4]; + __be64 value; + int ifindex; + int metric; + __be32 gw; +}; + +/* Key for lpm_trie*/ +union key_4 { + u32 b32[2]; + u8 b8[8]; +}; + +struct arp_entry { + __be64 mac; + __be32 dst; +}; + +struct direct_map { + struct arp_entry arp; + int ifindex; + __be64 mac; +}; + +/* Map for trie implementation*/ +struct bpf_map_def SEC("maps") lpm_map = { + .type = BPF_MAP_TYPE_LPM_TRIE, + .key_size = 8, + .value_size = sizeof(struct trie_value), + .max_entries = 50, + .map_flags = BPF_F_NO_PREALLOC, +}; + +/* Map for counter*/ +struct bpf_map_def SEC("maps") rxcnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u64), + .max_entries = 256, +}; + +/* Map for ARP table*/ +struct bpf_map_def SEC("maps") arp_table = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(__be32), + .value_size = sizeof(__be64), + .max_entries = 50, +}; + +/* Map to keep the exact match entries in the route table*/ +struct bpf_map_def SEC("maps") exact_match = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(__be32), + .value_size = sizeof(struct direct_map), + .max_entries = 50, +}; + +struct bpf_map_def SEC("maps") tx_port = { + .type = BPF_MAP_TYPE_DEVMAP, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 100, +}; + +/* Function to set source and destination mac of the packet */ +static inline void set_src_dst_mac(void *data, void *src, void *dst) +{ + unsigned short *source = src; + unsigned short *dest = dst; + unsigned short *p = data; + + __builtin_memcpy(p, dest, 6); + __builtin_memcpy(p + 3, source, 6); +} + +/* Parse IPV4 packet to get SRC, DST IP and protocol */ +static inline int parse_ipv4(void *data, u64 nh_off, void *data_end, + __be32 *src, __be32 *dest) +{ + struct iphdr *iph = data + nh_off; + + if (iph + 1 > data_end) + return 0; + *src = iph->saddr; + *dest = iph->daddr; + return iph->protocol; +} + +SEC("xdp_router_ipv4") +int xdp_router_ipv4_prog(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + __be64 *dest_mac = NULL, *src_mac = NULL; + void *data = (void *)(long)ctx->data; + struct trie_value *prefix_value; + int rc = XDP_DROP, forward_to; + struct ethhdr *eth = data; + union key_4 key4; + long *value; + u16 h_proto; + u32 ipproto; + u64 nh_off; + + nh_off = sizeof(*eth); + if (data + nh_off > data_end) + return rc; + + h_proto = eth->h_proto; + + if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) { + struct vlan_hdr *vhdr; + + vhdr = data + nh_off; + nh_off += sizeof(struct vlan_hdr); + if (data + nh_off > data_end) + return rc; + h_proto = vhdr->h_vlan_encapsulated_proto; + } + if (h_proto == htons(ETH_P_ARP)) { + return XDP_PASS; + } else if (h_proto == htons(ETH_P_IP)) { + struct direct_map *direct_entry; + __be32 src_ip = 0, dest_ip = 0; + + ipproto = parse_ipv4(data, nh_off, data_end, &src_ip, &dest_ip); + direct_entry = bpf_map_lookup_elem(&exact_match, &dest_ip); + /* Check for exact match, this would give a faster lookup*/ + if (direct_entry && direct_entry->mac && direct_entry->arp.mac) { + src_mac = &direct_entry->mac; + dest_mac = &direct_entry->arp.mac; + forward_to = direct_entry->ifindex; + } else { + /* Look up in the trie for lpm*/ + key4.b32[0] 
= 32; + key4.b8[4] = dest_ip & 0xff; + key4.b8[5] = (dest_ip >> 8) & 0xff; + key4.b8[6] = (dest_ip >> 16) & 0xff; + key4.b8[7] = (dest_ip >> 24) & 0xff; + prefix_value = bpf_map_lookup_elem(&lpm_map, &key4); + if (!prefix_value) + return XDP_DROP; + src_mac = &prefix_value->value; + if (!src_mac) + return XDP_DROP; + dest_mac = bpf_map_lookup_elem(&arp_table, &dest_ip); + if (!dest_mac) { + if (!prefix_value->gw) + return XDP_DROP; + dest_ip = prefix_value->gw; + dest_mac = bpf_map_lookup_elem(&arp_table, &dest_ip); + } + forward_to = prefix_value->ifindex; + } + } else { + ipproto = 0; + } + if (src_mac && dest_mac) { + set_src_dst_mac(data, src_mac, dest_mac); + value = bpf_map_lookup_elem(&rxcnt, &ipproto); + if (value) + *value += 1; + return bpf_redirect_map(&tx_port, forward_to, 0); + } + return rc; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_router_ipv4_user.c b/samples/bpf/xdp_router_ipv4_user.c new file mode 100644 index 000000000000..6296741c1fbd --- /dev/null +++ b/samples/bpf/xdp_router_ipv4_user.c @@ -0,0 +1,660 @@ +/* Copyright (C) 2017 Cavium, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. + */ +#include <linux/bpf.h> +#include <linux/netlink.h> +#include <linux/rtnetlink.h> +#include <assert.h> +#include <errno.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/socket.h> +#include <unistd.h> +#include "bpf_load.h" +#include "libbpf.h" +#include <arpa/inet.h> +#include <fcntl.h> +#include <poll.h> +#include <net/if.h> +#include <netdb.h> +#include <sys/ioctl.h> +#include <sys/syscall.h> +#include "bpf_util.h" + +int sock, sock_arp, flags = 0; +static int total_ifindex; +int *ifindex_list; +char buf[8192]; + +static int get_route_table(int rtm_family); +static void int_exit(int sig) +{ + int i = 0; + + for (i = 0; i < total_ifindex; i++) + bpf_set_link_xdp_fd(ifindex_list[i], -1, flags); + exit(0); +} + +static void close_and_exit(int sig) +{ + int i = 0; + + close(sock); + close(sock_arp); + + for (i = 0; i < total_ifindex; i++) + bpf_set_link_xdp_fd(ifindex_list[i], -1, flags); + exit(0); +} + +/* Get the mac address of the interface given interface name */ +static __be64 getmac(char *iface) +{ + struct ifreq ifr; + __be64 mac = 0; + int fd, i; + + fd = socket(AF_INET, SOCK_DGRAM, 0); + ifr.ifr_addr.sa_family = AF_INET; + strncpy(ifr.ifr_name, iface, IFNAMSIZ - 1); + if (ioctl(fd, SIOCGIFHWADDR, &ifr) < 0) { + printf("ioctl failed leaving....\n"); + return -1; + } + for (i = 0; i < 6 ; i++) + *((__u8 *)&mac + i) = (__u8)ifr.ifr_hwaddr.sa_data[i]; + close(fd); + return mac; +} + +static int recv_msg(struct sockaddr_nl sock_addr, int sock) +{ + struct nlmsghdr *nh; + int len, nll = 0; + char *buf_ptr; + + buf_ptr = buf; + while (1) { + len = recv(sock, buf_ptr, sizeof(buf) - nll, 0); + if (len < 0) + return len; + + nh = (struct nlmsghdr *)buf_ptr; + + if (nh->nlmsg_type == NLMSG_DONE) + break; + buf_ptr += len; + nll += len; + if ((sock_addr.nl_groups & RTMGRP_NEIGH) == RTMGRP_NEIGH) + break; + + if ((sock_addr.nl_groups & RTMGRP_IPV4_ROUTE) == RTMGRP_IPV4_ROUTE) + break; + } + return nll; +} + +/* Function to parse the route entry returned by netlink + * Updates the route entry related map entries + */ +static void read_route(struct nlmsghdr *nh, int nll) +{ + char dsts[24], gws[24], ifs[16], dsts_len[24], metrics[24]; + struct 
bpf_lpm_trie_key *prefix_key; + struct rtattr *rt_attr; + struct rtmsg *rt_msg; + int rtm_family; + int rtl; + int i; + struct route_table { + int dst_len, iface, metric; + char *iface_name; + __be32 dst, gw; + __be64 mac; + } route; + struct arp_table { + __be64 mac; + __be32 dst; + }; + + struct direct_map { + struct arp_table arp; + int ifindex; + __be64 mac; + } direct_entry; + + if (nh->nlmsg_type == RTM_DELROUTE) + printf("DELETING Route entry\n"); + else if (nh->nlmsg_type == RTM_GETROUTE) + printf("READING Route entry\n"); + else if (nh->nlmsg_type == RTM_NEWROUTE) + printf("NEW Route entry\n"); + else + printf("%d\n", nh->nlmsg_type); + + memset(&route, 0, sizeof(route)); + printf("Destination\t\tGateway\t\tGenmask\t\tMetric\t\tIface\n"); + for (; NLMSG_OK(nh, nll); nh = NLMSG_NEXT(nh, nll)) { + rt_msg = (struct rtmsg *)NLMSG_DATA(nh); + rtm_family = rt_msg->rtm_family; + if (rtm_family == AF_INET) + if (rt_msg->rtm_table != RT_TABLE_MAIN) + continue; + rt_attr = (struct rtattr *)RTM_RTA(rt_msg); + rtl = RTM_PAYLOAD(nh); + + for (; RTA_OK(rt_attr, rtl); rt_attr = RTA_NEXT(rt_attr, rtl)) { + switch (rt_attr->rta_type) { + case NDA_DST: + sprintf(dsts, "%u", + (*((__be32 *)RTA_DATA(rt_attr)))); + break; + case RTA_GATEWAY: + sprintf(gws, "%u", + *((__be32 *)RTA_DATA(rt_attr))); + break; + case RTA_OIF: + sprintf(ifs, "%u", + *((int *)RTA_DATA(rt_attr))); + break; + case RTA_METRICS: + sprintf(metrics, "%u", + *((int *)RTA_DATA(rt_attr))); + default: + break; + } + } + sprintf(dsts_len, "%d", rt_msg->rtm_dst_len); + route.dst = atoi(dsts); + route.dst_len = atoi(dsts_len); + route.gw = atoi(gws); + route.iface = atoi(ifs); + route.metric = atoi(metrics); + route.iface_name = alloca(sizeof(char *) * IFNAMSIZ); + route.iface_name = if_indextoname(route.iface, route.iface_name); + route.mac = getmac(route.iface_name); + if (route.mac == -1) { + int i = 0; + + for (i = 0; i < total_ifindex; i++) + bpf_set_link_xdp_fd(ifindex_list[i], -1, flags); + exit(0); + } + assert(bpf_map_update_elem(map_fd[4], &route.iface, &route.iface, 0) == 0); + if (rtm_family == AF_INET) { + struct trie_value { + __u8 prefix[4]; + __be64 value; + int ifindex; + int metric; + __be32 gw; + } *prefix_value; + + prefix_key = alloca(sizeof(*prefix_key) + 3); + prefix_value = alloca(sizeof(*prefix_value)); + + prefix_key->prefixlen = 32; + prefix_key->prefixlen = route.dst_len; + direct_entry.mac = route.mac & 0xffffffffffff; + direct_entry.ifindex = route.iface; + direct_entry.arp.mac = 0; + direct_entry.arp.dst = 0; + if (route.dst_len == 32) { + if (nh->nlmsg_type == RTM_DELROUTE) { + assert(bpf_map_delete_elem(map_fd[3], &route.dst) == 0); + } else { + if (bpf_map_lookup_elem(map_fd[2], &route.dst, &direct_entry.arp.mac) == 0) + direct_entry.arp.dst = route.dst; + assert(bpf_map_update_elem(map_fd[3], &route.dst, &direct_entry, 0) == 0); + } + } + for (i = 0; i < 4; i++) + prefix_key->data[i] = (route.dst >> i * 8) & 0xff; + + printf("%3d.%d.%d.%d\t\t%3x\t\t%d\t\t%d\t\t%s\n", + (int)prefix_key->data[0], + (int)prefix_key->data[1], + (int)prefix_key->data[2], + (int)prefix_key->data[3], + route.gw, route.dst_len, + route.metric, + route.iface_name); + if (bpf_map_lookup_elem(map_fd[0], prefix_key, + prefix_value) < 0) { + for (i = 0; i < 4; i++) + prefix_value->prefix[i] = prefix_key->data[i]; + prefix_value->value = route.mac & 0xffffffffffff; + prefix_value->ifindex = route.iface; + prefix_value->gw = route.gw; + prefix_value->metric = route.metric; + + assert(bpf_map_update_elem(map_fd[0], + prefix_key, + 
prefix_value, 0 + ) == 0); + } else { + if (nh->nlmsg_type == RTM_DELROUTE) { + printf("deleting entry\n"); + printf("prefix key=%d.%d.%d.%d/%d", + prefix_key->data[0], + prefix_key->data[1], + prefix_key->data[2], + prefix_key->data[3], + prefix_key->prefixlen); + assert(bpf_map_delete_elem(map_fd[0], + prefix_key + ) == 0); + /* Rereading the route table to check if + * there is an entry with the same + * prefix but a different metric as the + * deleted enty. + */ + get_route_table(AF_INET); + } else if (prefix_key->data[0] == + prefix_value->prefix[0] && + prefix_key->data[1] == + prefix_value->prefix[1] && + prefix_key->data[2] == + prefix_value->prefix[2] && + prefix_key->data[3] == + prefix_value->prefix[3] && + route.metric >= prefix_value->metric) { + continue; + } else { + for (i = 0; i < 4; i++) + prefix_value->prefix[i] = + prefix_key->data[i]; + prefix_value->value = + route.mac & 0xffffffffffff; + prefix_value->ifindex = route.iface; + prefix_value->gw = route.gw; + prefix_value->metric = route.metric; + assert(bpf_map_update_elem( + map_fd[0], + prefix_key, + prefix_value, + 0) == 0); + } + } + } + memset(&route, 0, sizeof(route)); + memset(dsts, 0, sizeof(dsts)); + memset(dsts_len, 0, sizeof(dsts_len)); + memset(gws, 0, sizeof(gws)); + memset(ifs, 0, sizeof(ifs)); + memset(&route, 0, sizeof(route)); + } +} + +/* Function to read the existing route table when the process is launched*/ +static int get_route_table(int rtm_family) +{ + struct sockaddr_nl sa; + struct nlmsghdr *nh; + int sock, seq = 0; + struct msghdr msg; + struct iovec iov; + int ret = 0; + int nll; + + struct { + struct nlmsghdr nl; + struct rtmsg rt; + char buf[8192]; + } req; + + sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (sock < 0) { + printf("open netlink socket: %s\n", strerror(errno)); + return -1; + } + memset(&sa, 0, sizeof(sa)); + sa.nl_family = AF_NETLINK; + if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { + printf("bind to netlink: %s\n", strerror(errno)); + ret = -1; + goto cleanup; + } + memset(&req, 0, sizeof(req)); + req.nl.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)); + req.nl.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; + req.nl.nlmsg_type = RTM_GETROUTE; + + req.rt.rtm_family = rtm_family; + req.rt.rtm_table = RT_TABLE_MAIN; + req.nl.nlmsg_pid = 0; + req.nl.nlmsg_seq = ++seq; + memset(&msg, 0, sizeof(msg)); + iov.iov_base = (void *)&req.nl; + iov.iov_len = req.nl.nlmsg_len; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + ret = sendmsg(sock, &msg, 0); + if (ret < 0) { + printf("send to netlink: %s\n", strerror(errno)); + ret = -1; + goto cleanup; + } + memset(buf, 0, sizeof(buf)); + nll = recv_msg(sa, sock); + if (nll < 0) { + printf("recv from netlink: %s\n", strerror(nll)); + ret = -1; + goto cleanup; + } + nh = (struct nlmsghdr *)buf; + read_route(nh, nll); +cleanup: + close(sock); + return ret; +} + +/* Function to parse the arp entry returned by netlink + * Updates the arp entry related map entries + */ +static void read_arp(struct nlmsghdr *nh, int nll) +{ + struct rtattr *rt_attr; + char dsts[24], mac[24]; + struct ndmsg *rt_msg; + int rtl, ndm_family; + + struct arp_table { + __be64 mac; + __be32 dst; + } arp_entry; + struct direct_map { + struct arp_table arp; + int ifindex; + __be64 mac; + } direct_entry; + + if (nh->nlmsg_type == RTM_GETNEIGH) + printf("READING arp entry\n"); + printf("Address\tHwAddress\n"); + for (; NLMSG_OK(nh, nll); nh = NLMSG_NEXT(nh, nll)) { + rt_msg = (struct ndmsg *)NLMSG_DATA(nh); + rt_attr = (struct rtattr *)RTM_RTA(rt_msg); + 
ndm_family = rt_msg->ndm_family; + rtl = RTM_PAYLOAD(nh); + for (; RTA_OK(rt_attr, rtl); rt_attr = RTA_NEXT(rt_attr, rtl)) { + switch (rt_attr->rta_type) { + case NDA_DST: + sprintf(dsts, "%u", + *((__be32 *)RTA_DATA(rt_attr))); + break; + case NDA_LLADDR: + sprintf(mac, "%lld", + *((__be64 *)RTA_DATA(rt_attr))); + break; + default: + break; + } + } + arp_entry.dst = atoi(dsts); + arp_entry.mac = atol(mac); + printf("%x\t\t%llx\n", arp_entry.dst, arp_entry.mac); + if (ndm_family == AF_INET) { + if (bpf_map_lookup_elem(map_fd[3], &arp_entry.dst, + &direct_entry) == 0) { + if (nh->nlmsg_type == RTM_DELNEIGH) { + direct_entry.arp.dst = 0; + direct_entry.arp.mac = 0; + } else if (nh->nlmsg_type == RTM_NEWNEIGH) { + direct_entry.arp.dst = arp_entry.dst; + direct_entry.arp.mac = arp_entry.mac; + } + assert(bpf_map_update_elem(map_fd[3], + &arp_entry.dst, + &direct_entry, 0 + ) == 0); + memset(&direct_entry, 0, sizeof(direct_entry)); + } + if (nh->nlmsg_type == RTM_DELNEIGH) { + assert(bpf_map_delete_elem(map_fd[2], &arp_entry.dst) == 0); + } else if (nh->nlmsg_type == RTM_NEWNEIGH) { + assert(bpf_map_update_elem(map_fd[2], + &arp_entry.dst, + &arp_entry.mac, 0 + ) == 0); + } + } + memset(&arp_entry, 0, sizeof(arp_entry)); + memset(dsts, 0, sizeof(dsts)); + } +} + +/* Function to read the existing arp table when the process is launched*/ +static int get_arp_table(int rtm_family) +{ + struct sockaddr_nl sa; + struct nlmsghdr *nh; + int sock, seq = 0; + struct msghdr msg; + struct iovec iov; + int ret = 0; + int nll; + struct { + struct nlmsghdr nl; + struct ndmsg rt; + char buf[8192]; + } req; + + sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (sock < 0) { + printf("open netlink socket: %s\n", strerror(errno)); + return -1; + } + memset(&sa, 0, sizeof(sa)); + sa.nl_family = AF_NETLINK; + if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { + printf("bind to netlink: %s\n", strerror(errno)); + ret = -1; + goto cleanup; + } + memset(&req, 0, sizeof(req)); + req.nl.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)); + req.nl.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; + req.nl.nlmsg_type = RTM_GETNEIGH; + req.rt.ndm_state = NUD_REACHABLE; + req.rt.ndm_family = rtm_family; + req.nl.nlmsg_pid = 0; + req.nl.nlmsg_seq = ++seq; + memset(&msg, 0, sizeof(msg)); + iov.iov_base = (void *)&req.nl; + iov.iov_len = req.nl.nlmsg_len; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + ret = sendmsg(sock, &msg, 0); + if (ret < 0) { + printf("send to netlink: %s\n", strerror(errno)); + ret = -1; + goto cleanup; + } + memset(buf, 0, sizeof(buf)); + nll = recv_msg(sa, sock); + if (nll < 0) { + printf("recv from netlink: %s\n", strerror(nll)); + ret = -1; + goto cleanup; + } + nh = (struct nlmsghdr *)buf; + read_arp(nh, nll); +cleanup: + close(sock); + return ret; +} + +/* Function to keep track and update changes in route and arp table + * Give regular statistics of packets forwarded + */ +static int monitor_route(void) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + const unsigned int nr_keys = 256; + struct pollfd fds_route, fds_arp; + __u64 prev[nr_keys][nr_cpus]; + struct sockaddr_nl la, lr; + __u64 values[nr_cpus]; + struct nlmsghdr *nh; + int nll, ret = 0; + int interval = 5; + __u32 key; + int i; + + sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (sock < 0) { + printf("open netlink socket: %s\n", strerror(errno)); + return -1; + } + + fcntl(sock, F_SETFL, O_NONBLOCK); + memset(&lr, 0, sizeof(lr)); + lr.nl_family = AF_NETLINK; + lr.nl_groups = RTMGRP_IPV6_ROUTE | RTMGRP_IPV4_ROUTE | 
RTMGRP_NOTIFY; + if (bind(sock, (struct sockaddr *)&lr, sizeof(lr)) < 0) { + printf("bind to netlink: %s\n", strerror(errno)); + ret = -1; + goto cleanup; + } + fds_route.fd = sock; + fds_route.events = POLL_IN; + + sock_arp = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (sock_arp < 0) { + printf("open netlink socket: %s\n", strerror(errno)); + return -1; + } + + fcntl(sock_arp, F_SETFL, O_NONBLOCK); + memset(&la, 0, sizeof(la)); + la.nl_family = AF_NETLINK; + la.nl_groups = RTMGRP_NEIGH | RTMGRP_NOTIFY; + if (bind(sock_arp, (struct sockaddr *)&la, sizeof(la)) < 0) { + printf("bind to netlink: %s\n", strerror(errno)); + ret = -1; + goto cleanup; + } + fds_arp.fd = sock_arp; + fds_arp.events = POLL_IN; + + memset(prev, 0, sizeof(prev)); + do { + signal(SIGINT, close_and_exit); + signal(SIGTERM, close_and_exit); + + sleep(interval); + for (key = 0; key < nr_keys; key++) { + __u64 sum = 0; + + assert(bpf_map_lookup_elem(map_fd[1], &key, values) == 0); + for (i = 0; i < nr_cpus; i++) + sum += (values[i] - prev[key][i]); + if (sum) + printf("proto %u: %10llu pkt/s\n", + key, sum / interval); + memcpy(prev[key], values, sizeof(values)); + } + + memset(buf, 0, sizeof(buf)); + if (poll(&fds_route, 1, 3) == POLL_IN) { + nll = recv_msg(lr, sock); + if (nll < 0) { + printf("recv from netlink: %s\n", strerror(nll)); + ret = -1; + goto cleanup; + } + + nh = (struct nlmsghdr *)buf; + printf("Routing table updated.\n"); + read_route(nh, nll); + } + memset(buf, 0, sizeof(buf)); + if (poll(&fds_arp, 1, 3) == POLL_IN) { + nll = recv_msg(la, sock_arp); + if (nll < 0) { + printf("recv from netlink: %s\n", strerror(nll)); + ret = -1; + goto cleanup; + } + + nh = (struct nlmsghdr *)buf; + read_arp(nh, nll); + } + + } while (1); +cleanup: + close(sock); + return ret; +} + +int main(int ac, char **argv) +{ + char filename[256]; + char **ifname_list; + int i = 1; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + if (ac < 2) { + printf("usage: %s [-S] Interface name list\n", argv[0]); + return 1; + } + if (!strcmp(argv[1], "-S")) { + flags = XDP_FLAGS_SKB_MODE; + total_ifindex = ac - 2; + ifname_list = (argv + 2); + } else { + flags = 0; + total_ifindex = ac - 1; + ifname_list = (argv + 1); + } + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + printf("\n**************loading bpf file*********************\n\n\n"); + if (!prog_fd[0]) { + printf("load_bpf_file: %s\n", strerror(errno)); + return 1; + } + ifindex_list = (int *)malloc(total_ifindex * sizeof(int *)); + for (i = 0; i < total_ifindex; i++) { + ifindex_list[i] = if_nametoindex(ifname_list[i]); + if (!ifindex_list[i]) { + printf("Couldn't translate interface name: %s", + strerror(errno)); + return 1; + } + } + for (i = 0; i < total_ifindex; i++) { + if (bpf_set_link_xdp_fd(ifindex_list[i], prog_fd[0], flags) < 0) { + printf("link set xdp fd failed\n"); + int recovery_index = i; + + for (i = 0; i < recovery_index; i++) + bpf_set_link_xdp_fd(ifindex_list[i], -1, flags); + + return 1; + } + printf("Attached to %d\n", ifindex_list[i]); + } + signal(SIGINT, int_exit); + signal(SIGTERM, int_exit); + + printf("*******************ROUTE TABLE*************************\n\n\n"); + get_route_table(AF_INET); + printf("*******************ARP TABLE***************************\n\n\n"); + get_arp_table(AF_INET); + if (monitor_route() < 0) { + printf("Error in receiving route update"); + return 1; + } + + return 0; +} diff --git a/samples/bpf/xdp_rxq_info_kern.c b/samples/bpf/xdp_rxq_info_kern.c new file mode 100644 index 
000000000000..3fd209291653 --- /dev/null +++ b/samples/bpf/xdp_rxq_info_kern.c @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0 + * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. + * + * Example of how to extract XDP RX-queue info + */ +#include <uapi/linux/bpf.h> +#include "bpf_helpers.h" + +/* Config setup from userspace + * + * Userspace sets up ifindex in config_map, to verify that + * ctx->ingress_ifindex is correct (against the configured ifindex) + */ +struct config { + __u32 action; + int ifindex; +}; +struct bpf_map_def SEC("maps") config_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(struct config), + .max_entries = 1, +}; + +/* Common stats data record (shared with userspace) */ +struct datarec { + __u64 processed; + __u64 issue; +}; + +struct bpf_map_def SEC("maps") stats_global_map = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct datarec), + .max_entries = 1, +}; + +#define MAX_RXQs 64 + +/* Stats per rx_queue_index (per CPU) */ +struct bpf_map_def SEC("maps") rx_queue_index_map = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct datarec), + .max_entries = MAX_RXQs + 1, +}; + +SEC("xdp_prog0") +int xdp_prognum0(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct datarec *rec, *rxq_rec; + int ingress_ifindex; + struct config *config; + u32 key = 0; + + /* Global stats record */ + rec = bpf_map_lookup_elem(&stats_global_map, &key); + if (!rec) + return XDP_ABORTED; + rec->processed++; + + /* Accessing ctx->ingress_ifindex causes BPF to rewrite BPF + * instructions inside the kernel to access xdp_rxq->dev->ifindex + */ + ingress_ifindex = ctx->ingress_ifindex; + + config = bpf_map_lookup_elem(&config_map, &key); + if (!config) + return XDP_ABORTED; + + /* Simple test: check that the ctx-provided ifindex is as expected */ + if (ingress_ifindex != config->ifindex) { + /* count this error case */ + rec->issue++; + return XDP_ABORTED; + } + + /* Update stats per rx_queue_index. Handle the case where + * rx_queue_index is larger than the stats map can hold. + */ + key = ctx->rx_queue_index; + if (key >= MAX_RXQs) + key = MAX_RXQs; + rxq_rec = bpf_map_lookup_elem(&rx_queue_index_map, &key); + if (!rxq_rec) + return XDP_ABORTED; + rxq_rec->processed++; + if (key == MAX_RXQs) + rxq_rec->issue++; + + return config->action; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_rxq_info_user.c b/samples/bpf/xdp_rxq_info_user.c new file mode 100644 index 000000000000..478d95412de4 --- /dev/null +++ b/samples/bpf/xdp_rxq_info_user.c @@ -0,0 +1,531 @@ +/* SPDX-License-Identifier: GPL-2.0 + * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
+ */ +static const char *__doc__ = " XDP RX-queue info extract example\n\n" + "Monitor how many packets per sec (pps) are received\n" + "per NIC RX queue index and which CPU processed the packet\n" + ; + +#include <errno.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <unistd.h> +#include <locale.h> +#include <sys/resource.h> +#include <getopt.h> +#include <net/if.h> +#include <time.h> + +#include <arpa/inet.h> +#include <linux/if_link.h> + +#include "libbpf.h" +#include "bpf_load.h" +#include "bpf_util.h" + +static int ifindex = -1; +static char ifname_buf[IF_NAMESIZE]; +static char *ifname; + +static __u32 xdp_flags; + +/* Exit return codes */ +#define EXIT_OK 0 +#define EXIT_FAIL 1 +#define EXIT_FAIL_OPTION 2 +#define EXIT_FAIL_XDP 3 +#define EXIT_FAIL_BPF 4 +#define EXIT_FAIL_MEM 5 + +static const struct option long_options[] = { + {"help", no_argument, NULL, 'h' }, + {"dev", required_argument, NULL, 'd' }, + {"skb-mode", no_argument, NULL, 'S' }, + {"sec", required_argument, NULL, 's' }, + {"no-separators", no_argument, NULL, 'z' }, + {"action", required_argument, NULL, 'a' }, + {0, 0, NULL, 0 } +}; + +static void int_exit(int sig) +{ + fprintf(stderr, + "Interrupted: Removing XDP program on ifindex:%d device:%s\n", + ifindex, ifname); + if (ifindex > -1) + bpf_set_link_xdp_fd(ifindex, -1, xdp_flags); + exit(EXIT_OK); +} + +struct config { + __u32 action; + int ifindex; +}; +#define XDP_ACTION_MAX (XDP_TX + 1) +#define XDP_ACTION_MAX_STRLEN 11 +static const char *xdp_action_names[XDP_ACTION_MAX] = { + [XDP_ABORTED] = "XDP_ABORTED", + [XDP_DROP] = "XDP_DROP", + [XDP_PASS] = "XDP_PASS", + [XDP_TX] = "XDP_TX", +}; + +static const char *action2str(int action) +{ + if (action < XDP_ACTION_MAX) + return xdp_action_names[action]; + return NULL; +} + +static int parse_xdp_action(char *action_str) +{ + size_t maxlen; + __u64 action = -1; + int i; + + for (i = 0; i < XDP_ACTION_MAX; i++) { + maxlen = XDP_ACTION_MAX_STRLEN; + if (strncmp(xdp_action_names[i], action_str, maxlen) == 0) { + action = i; + break; + } + } + return action; +} + +static void list_xdp_actions(void) +{ + int i; + + printf("Available XDP --action <options>\n"); + for (i = 0; i < XDP_ACTION_MAX; i++) + printf("\t%s\n", xdp_action_names[i]); + printf("\n"); +} + +static void usage(char *argv[]) +{ + int i; + + printf("\nDOCUMENTATION:\n%s\n", __doc__); + printf(" Usage: %s (options-see-below)\n", argv[0]); + printf(" Listing options:\n"); + for (i = 0; long_options[i].name != 0; i++) { + printf(" --%-12s", long_options[i].name); + if (long_options[i].flag != NULL) + printf(" flag (internal value:%d)", + *long_options[i].flag); + else + printf(" short-option: -%c", + long_options[i].val); + printf("\n"); + } + printf("\n"); + list_xdp_actions(); +} + +#define NANOSEC_PER_SEC 1000000000 /* 10^9 */ +static __u64 gettime(void) +{ + struct timespec t; + int res; + + res = clock_gettime(CLOCK_MONOTONIC, &t); + if (res < 0) { + fprintf(stderr, "Error with gettimeofday! 
(%i)\n", res); + exit(EXIT_FAIL); + } + return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec; +} + +/* Common stats data record shared with _kern.c */ +struct datarec { + __u64 processed; + __u64 issue; +}; +struct record { + __u64 timestamp; + struct datarec total; + struct datarec *cpu; +}; +struct stats_record { + struct record stats; + struct record *rxq; +}; + +static struct datarec *alloc_record_per_cpu(void) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + struct datarec *array; + size_t size; + + size = sizeof(struct datarec) * nr_cpus; + array = malloc(size); + memset(array, 0, size); + if (!array) { + fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus); + exit(EXIT_FAIL_MEM); + } + return array; +} + +static struct record *alloc_record_per_rxq(void) +{ + unsigned int nr_rxqs = map_data[2].def.max_entries; + struct record *array; + size_t size; + + size = sizeof(struct record) * nr_rxqs; + array = malloc(size); + memset(array, 0, size); + if (!array) { + fprintf(stderr, "Mem alloc error (nr_rxqs:%u)\n", nr_rxqs); + exit(EXIT_FAIL_MEM); + } + return array; +} + +static struct stats_record *alloc_stats_record(void) +{ + unsigned int nr_rxqs = map_data[2].def.max_entries; + struct stats_record *rec; + int i; + + rec = malloc(sizeof(*rec)); + memset(rec, 0, sizeof(*rec)); + if (!rec) { + fprintf(stderr, "Mem alloc error\n"); + exit(EXIT_FAIL_MEM); + } + rec->rxq = alloc_record_per_rxq(); + for (i = 0; i < nr_rxqs; i++) + rec->rxq[i].cpu = alloc_record_per_cpu(); + + rec->stats.cpu = alloc_record_per_cpu(); + return rec; +} + +static void free_stats_record(struct stats_record *r) +{ + unsigned int nr_rxqs = map_data[2].def.max_entries; + int i; + + for (i = 0; i < nr_rxqs; i++) + free(r->rxq[i].cpu); + + free(r->rxq); + free(r->stats.cpu); + free(r); +} + +static bool map_collect_percpu(int fd, __u32 key, struct record *rec) +{ + /* For percpu maps, userspace gets a value per possible CPU */ + unsigned int nr_cpus = bpf_num_possible_cpus(); + struct datarec values[nr_cpus]; + __u64 sum_processed = 0; + __u64 sum_issue = 0; + int i; + + if ((bpf_map_lookup_elem(fd, &key, values)) != 0) { + fprintf(stderr, + "ERR: bpf_map_lookup_elem failed key:0x%X\n", key); + return false; + } + /* Get time as close as possible to reading map contents */ + rec->timestamp = gettime(); + + /* Record and sum values from each CPU */ + for (i = 0; i < nr_cpus; i++) { + rec->cpu[i].processed = values[i].processed; + sum_processed += values[i].processed; + rec->cpu[i].issue = values[i].issue; + sum_issue += values[i].issue; + } + rec->total.processed = sum_processed; + rec->total.issue = sum_issue; + return true; +} + +static void stats_collect(struct stats_record *rec) +{ + int fd, i, max_rxqs; + + fd = map_data[1].fd; /* map: stats_global_map */ + map_collect_percpu(fd, 0, &rec->stats); + + fd = map_data[2].fd; /* map: rx_queue_index_map */ + max_rxqs = map_data[2].def.max_entries; + for (i = 0; i < max_rxqs; i++) + map_collect_percpu(fd, i, &rec->rxq[i]); +} + +static double calc_period(struct record *r, struct record *p) +{ + double period_ = 0; + __u64 period = 0; + + period = r->timestamp - p->timestamp; + if (period > 0) + period_ = ((double) period / NANOSEC_PER_SEC); + + return period_; +} + +static __u64 calc_pps(struct datarec *r, struct datarec *p, double period_) +{ + __u64 packets = 0; + __u64 pps = 0; + + if (period_ > 0) { + packets = r->processed - p->processed; + pps = packets / period_; + } + return pps; +} + +static __u64 calc_errs_pps(struct datarec *r, + struct datarec *p, 
double period_) +{ + __u64 packets = 0; + __u64 pps = 0; + + if (period_ > 0) { + packets = r->issue - p->issue; + pps = packets / period_; + } + return pps; +} + +static void stats_print(struct stats_record *stats_rec, + struct stats_record *stats_prev, + int action) +{ + unsigned int nr_cpus = bpf_num_possible_cpus(); + unsigned int nr_rxqs = map_data[2].def.max_entries; + double pps = 0, err = 0; + struct record *rec, *prev; + double t; + int rxq; + int i; + + /* Header */ + printf("\nRunning XDP on dev:%s (ifindex:%d) action:%s\n", + ifname, ifindex, action2str(action)); + + /* stats_global_map */ + { + char *fmt_rx = "%-15s %-7d %'-11.0f %'-10.0f %s\n"; + char *fm2_rx = "%-15s %-7s %'-11.0f\n"; + char *errstr = ""; + + printf("%-15s %-7s %-11s %-11s\n", + "XDP stats", "CPU", "pps", "issue-pps"); + + rec = &stats_rec->stats; + prev = &stats_prev->stats; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps (r, p, t); + err = calc_errs_pps(r, p, t); + if (err > 0) + errstr = "invalid-ifindex"; + if (pps > 0) + printf(fmt_rx, "XDP-RX CPU", + i, pps, err, errstr); + } + pps = calc_pps (&rec->total, &prev->total, t); + err = calc_errs_pps(&rec->total, &prev->total, t); + printf(fm2_rx, "XDP-RX CPU", "total", pps, err); + } + + /* rx_queue_index_map */ + printf("\n%-15s %-7s %-11s %-11s\n", + "RXQ stats", "RXQ:CPU", "pps", "issue-pps"); + + for (rxq = 0; rxq < nr_rxqs; rxq++) { + char *fmt_rx = "%-15s %3d:%-3d %'-11.0f %'-10.0f %s\n"; + char *fm2_rx = "%-15s %3d:%-3s %'-11.0f\n"; + char *errstr = ""; + int rxq_ = rxq; + + /* Last RXQ in map catch overflows */ + if (rxq_ == nr_rxqs - 1) + rxq_ = -1; + + rec = &stats_rec->rxq[rxq]; + prev = &stats_prev->rxq[rxq]; + t = calc_period(rec, prev); + for (i = 0; i < nr_cpus; i++) { + struct datarec *r = &rec->cpu[i]; + struct datarec *p = &prev->cpu[i]; + + pps = calc_pps (r, p, t); + err = calc_errs_pps(r, p, t); + if (err > 0) { + if (rxq_ == -1) + errstr = "map-overflow-RXQ"; + else + errstr = "err"; + } + if (pps > 0) + printf(fmt_rx, "rx_queue_index", + rxq_, i, pps, err, errstr); + } + pps = calc_pps (&rec->total, &prev->total, t); + err = calc_errs_pps(&rec->total, &prev->total, t); + if (pps || err) + printf(fm2_rx, "rx_queue_index", rxq_, "sum", pps, err); + } +} + + +/* Pointer swap trick */ +static inline void swap(struct stats_record **a, struct stats_record **b) +{ + struct stats_record *tmp; + + tmp = *a; + *a = *b; + *b = tmp; +} + +static void stats_poll(int interval, int action) +{ + struct stats_record *record, *prev; + + record = alloc_stats_record(); + prev = alloc_stats_record(); + stats_collect(record); + + while (1) { + swap(&prev, &record); + stats_collect(record); + stats_print(record, prev, action); + sleep(interval); + } + + free_stats_record(record); + free_stats_record(prev); +} + + +int main(int argc, char **argv) +{ + struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY}; + bool use_separators = true; + struct config cfg = { 0 }; + char filename[256]; + int longindex = 0; + int interval = 2; + __u32 key = 0; + int opt, err; + + char action_str_buf[XDP_ACTION_MAX_STRLEN + 1 /* for \0 */] = { 0 }; + int action = XDP_PASS; /* Default action */ + char *action_str = NULL; + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + + if (load_bpf_file(filename)) { + fprintf(stderr, "ERR in load_bpf_file(): %s", bpf_log_buf); 
+ return EXIT_FAIL; + } + + if (!prog_fd[0]) { + fprintf(stderr, "ERR: load_bpf_file: %s\n", strerror(errno)); + return EXIT_FAIL; + } + + /* Parse commands line args */ + while ((opt = getopt_long(argc, argv, "hSd:", + long_options, &longindex)) != -1) { + switch (opt) { + case 'd': + if (strlen(optarg) >= IF_NAMESIZE) { + fprintf(stderr, "ERR: --dev name too long\n"); + goto error; + } + ifname = (char *)&ifname_buf; + strncpy(ifname, optarg, IF_NAMESIZE); + ifindex = if_nametoindex(ifname); + if (ifindex == 0) { + fprintf(stderr, + "ERR: --dev name unknown err(%d):%s\n", + errno, strerror(errno)); + goto error; + } + break; + case 's': + interval = atoi(optarg); + break; + case 'S': + xdp_flags |= XDP_FLAGS_SKB_MODE; + break; + case 'z': + use_separators = false; + break; + case 'a': + action_str = (char *)&action_str_buf; + strncpy(action_str, optarg, XDP_ACTION_MAX_STRLEN); + break; + case 'h': + error: + default: + usage(argv); + return EXIT_FAIL_OPTION; + } + } + /* Required option */ + if (ifindex == -1) { + fprintf(stderr, "ERR: required option --dev missing\n"); + usage(argv); + return EXIT_FAIL_OPTION; + } + cfg.ifindex = ifindex; + + /* Parse action string */ + if (action_str) { + action = parse_xdp_action(action_str); + if (action < 0) { + fprintf(stderr, "ERR: Invalid XDP --action: %s\n", + action_str); + list_xdp_actions(); + return EXIT_FAIL_OPTION; + } + } + cfg.action = action; + + /* Trick to pretty printf with thousands separators use %' */ + if (use_separators) + setlocale(LC_NUMERIC, "en_US"); + + /* User-side setup ifindex in config_map */ + err = bpf_map_update_elem(map_fd[0], &key, &cfg, 0); + if (err) { + fprintf(stderr, "Store config failed (err:%d)\n", err); + exit(EXIT_FAIL_BPF); + } + + /* Remove XDP program when program is interrupted */ + signal(SIGINT, int_exit); + + if (bpf_set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) { + fprintf(stderr, "link set xdp fd failed\n"); + return EXIT_FAIL_XDP; + } + + stats_poll(interval, action); + return EXIT_OK; +} diff --git a/samples/bpf/xdp_tx_iptunnel_user.c b/samples/bpf/xdp_tx_iptunnel_user.c index 715cd12eaca5..f0a787268a87 100644 --- a/samples/bpf/xdp_tx_iptunnel_user.c +++ b/samples/bpf/xdp_tx_iptunnel_user.c @@ -30,7 +30,7 @@ static __u32 xdp_flags = 0; static void int_exit(int sig) { if (ifindex > -1) - set_link_xdp_fd(ifindex, -1, xdp_flags); + bpf_set_link_xdp_fd(ifindex, -1, xdp_flags); exit(0); } @@ -254,14 +254,14 @@ int main(int argc, char **argv) } } - if (set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) { + if (bpf_set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) { printf("link set xdp fd failed\n"); return 1; } poll_stats(kill_after_s); - set_link_xdp_fd(ifindex, -1, xdp_flags); + bpf_set_link_xdp_fd(ifindex, -1, xdp_flags); return 0; } diff --git a/samples/configfs/configfs_sample.c b/samples/configfs/configfs_sample.c index 1ea33119e532..004a4e201476 100644 --- a/samples/configfs/configfs_sample.c +++ b/samples/configfs/configfs_sample.c @@ -115,7 +115,7 @@ static struct configfs_attribute *childless_attrs[] = { NULL, }; -static struct config_item_type childless_type = { +static const struct config_item_type childless_type = { .ct_attrs = childless_attrs, .ct_owner = THIS_MODULE, }; @@ -193,7 +193,7 @@ static struct configfs_item_operations simple_child_item_ops = { .release = simple_child_release, }; -static struct config_item_type simple_child_type = { +static const struct config_item_type simple_child_type = { .ct_item_ops = &simple_child_item_ops, .ct_attrs = simple_child_attrs, 
.ct_owner = THIS_MODULE, @@ -261,7 +261,7 @@ static struct configfs_group_operations simple_children_group_ops = { .make_item = simple_children_make_item, }; -static struct config_item_type simple_children_type = { +static const struct config_item_type simple_children_type = { .ct_item_ops = &simple_children_item_ops, .ct_group_ops = &simple_children_group_ops, .ct_attrs = simple_children_attrs, @@ -331,7 +331,7 @@ static struct configfs_group_operations group_children_group_ops = { .make_group = group_children_make_group, }; -static struct config_item_type group_children_type = { +static const struct config_item_type group_children_type = { .ct_group_ops = &group_children_group_ops, .ct_attrs = group_children_attrs, .ct_owner = THIS_MODULE, diff --git a/samples/connector/cn_test.c b/samples/connector/cn_test.c index d12cc944b696..95cd06f4ec1e 100644 --- a/samples/connector/cn_test.c +++ b/samples/connector/cn_test.c @@ -125,12 +125,12 @@ static int cn_test_want_notify(void) #endif static u32 cn_test_timer_counter; -static void cn_test_timer_func(unsigned long __data) +static void cn_test_timer_func(struct timer_list *unused) { struct cn_msg *m; char data[32]; - pr_debug("%s: timer fired with data %lu\n", __func__, __data); + pr_debug("%s: timer fired\n", __func__); m = kzalloc(sizeof(*m) + sizeof(data), GFP_ATOMIC); if (m) { @@ -168,7 +168,7 @@ static int cn_test_init(void) goto err_out; } - setup_timer(&cn_test_timer, cn_test_timer_func, 0); + timer_setup(&cn_test_timer, cn_test_timer_func, 0); mod_timer(&cn_test_timer, jiffies + msecs_to_jiffies(1000)); pr_info("initialized with id={%u.%u}\n", diff --git a/samples/hidraw/Makefile b/samples/hidraw/Makefile index f5c3012ffa79..dec1b22adf54 100644 --- a/samples/hidraw/Makefile +++ b/samples/hidraw/Makefile @@ -1,7 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -# kbuild trick to avoid linker error. Can be omitted if a module is built. -obj- := dummy.o - # List of programs to build hostprogs-y := hid-example diff --git a/samples/kobject/kobject-example.c b/samples/kobject/kobject-example.c index 2e0740f06cd7..9e383fdbaa00 100644 --- a/samples/kobject/kobject-example.c +++ b/samples/kobject/kobject-example.c @@ -1,11 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Sample kobject implementation * * Copyright (C) 2004-2007 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2007 Novell Inc. - * - * Released under the GPL version 2 only. - * */ #include <linux/kobject.h> #include <linux/string.h> diff --git a/samples/kobject/kset-example.c b/samples/kobject/kset-example.c index a55bff52bde3..401328fd687d 100644 --- a/samples/kobject/kset-example.c +++ b/samples/kobject/kset-example.c @@ -1,11 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Sample kset and ktype implementation * * Copyright (C) 2004-2007 Greg Kroah-Hartman <greg@kroah.com> * Copyright (C) 2007 Novell Inc. - * - * Released under the GPL version 2 only. 
- * */ #include <linux/kobject.h> #include <linux/string.h> diff --git a/samples/kprobes/Makefile b/samples/kprobes/Makefile index 68739bc4fc6a..880e54d2c082 100644 --- a/samples/kprobes/Makefile +++ b/samples/kprobes/Makefile @@ -1,5 +1,5 @@ # builds the kprobes example kernel modules; # then to use one (as root): insmod <module_name.ko> -obj-$(CONFIG_SAMPLE_KPROBES) += kprobe_example.o jprobe_example.o +obj-$(CONFIG_SAMPLE_KPROBES) += kprobe_example.o obj-$(CONFIG_SAMPLE_KRETPROBES) += kretprobe_example.o diff --git a/samples/kprobes/jprobe_example.c b/samples/kprobes/jprobe_example.c deleted file mode 100644 index e3c0a40909f7..000000000000 --- a/samples/kprobes/jprobe_example.c +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Here's a sample kernel module showing the use of jprobes to dump - * the arguments of _do_fork(). - * - * For more information on theory of operation of jprobes, see - * Documentation/kprobes.txt - * - * Build and insert the kernel module as done in the kprobe example. - * You will see the trace data in /var/log/messages and on the - * console whenever _do_fork() is invoked to create a new process. - * (Some messages may be suppressed if syslogd is configured to - * eliminate duplicate messages.) - */ - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/kprobes.h> - -/* - * Jumper probe for _do_fork. - * Mirror principle enables access to arguments of the probed routine - * from the probe handler. - */ - -/* Proxy routine having the same arguments as actual _do_fork() routine */ -static long j_do_fork(unsigned long clone_flags, unsigned long stack_start, - unsigned long stack_size, int __user *parent_tidptr, - int __user *child_tidptr, unsigned long tls) -{ - pr_info("jprobe: clone_flags = 0x%lx, stack_start = 0x%lx " - "stack_size = 0x%lx\n", clone_flags, stack_start, stack_size); - - /* Always end with a call to jprobe_return(). 
*/ - jprobe_return(); - return 0; -} - -static struct jprobe my_jprobe = { - .entry = j_do_fork, - .kp = { - .symbol_name = "_do_fork", - }, -}; - -static int __init jprobe_init(void) -{ - int ret; - - ret = register_jprobe(&my_jprobe); - if (ret < 0) { - pr_err("register_jprobe failed, returned %d\n", ret); - return -1; - } - pr_info("Planted jprobe at %p, handler addr %p\n", - my_jprobe.kp.addr, my_jprobe.entry); - return 0; -} - -static void __exit jprobe_exit(void) -{ - unregister_jprobe(&my_jprobe); - pr_info("jprobe at %p unregistered\n", my_jprobe.kp.addr); -} - -module_init(jprobe_init) -module_exit(jprobe_exit) -MODULE_LICENSE("GPL"); diff --git a/samples/kprobes/kprobe_example.c b/samples/kprobes/kprobe_example.c index 88b3e2d227ae..67de3b774bc9 100644 --- a/samples/kprobes/kprobe_example.c +++ b/samples/kprobes/kprobe_example.c @@ -47,6 +47,10 @@ static int handler_pre(struct kprobe *p, struct pt_regs *regs) " pstate = 0x%lx\n", p->symbol_name, p->addr, (long)regs->pc, (long)regs->pstate); #endif +#ifdef CONFIG_S390 + pr_info("<%s> pre_handler: p->addr, 0x%p, ip = 0x%lx, flags = 0x%lx\n", + p->symbol_name, p->addr, regs->psw.addr, regs->flags); +#endif /* A dump_stack() here will give a stack backtrace */ return 0; @@ -76,6 +80,10 @@ static void handler_post(struct kprobe *p, struct pt_regs *regs, pr_info("<%s> post_handler: p->addr = 0x%p, pstate = 0x%lx\n", p->symbol_name, p->addr, (long)regs->pstate); #endif +#ifdef CONFIG_S390 + pr_info("<%s> pre_handler: p->addr, 0x%p, flags = 0x%lx\n", + p->symbol_name, p->addr, regs->flags); +#endif } /* diff --git a/samples/livepatch/Makefile b/samples/livepatch/Makefile index 10319d7ea0b1..2472ce39a18d 100644 --- a/samples/livepatch/Makefile +++ b/samples/livepatch/Makefile @@ -1 +1,7 @@ obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-sample.o +obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-mod.o +obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-fix1.o +obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-fix2.o +obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-callbacks-demo.o +obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-callbacks-mod.o +obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-callbacks-busymod.o diff --git a/samples/livepatch/livepatch-callbacks-busymod.c b/samples/livepatch/livepatch-callbacks-busymod.c new file mode 100644 index 000000000000..80d06e103f1b --- /dev/null +++ b/samples/livepatch/livepatch-callbacks-busymod.c @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * livepatch-callbacks-busymod.c - (un)patching callbacks demo support module + * + * + * Purpose + * ------- + * + * Simple module to demonstrate livepatch (un)patching callbacks. + * + * + * Usage + * ----- + * + * This module is not intended to be standalone. See the "Usage" + * section of livepatch-callbacks-mod.c. 
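 + * + * A typical invocation (borrowed from the walk-through in + * livepatch-callbacks-demo.c) is: + * + * insmod samples/livepatch/livepatch-callbacks-busymod.ko sleep_secs=30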
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/workqueue.h> +#include <linux/delay.h> + +static int sleep_secs; +module_param(sleep_secs, int, 0644); +MODULE_PARM_DESC(sleep_secs, "sleep_secs (default=0)"); + +static void busymod_work_func(struct work_struct *work); +static DECLARE_DELAYED_WORK(work, busymod_work_func); + +static void busymod_work_func(struct work_struct *work) +{ + pr_info("%s, sleeping %d seconds ...\n", __func__, sleep_secs); + msleep(sleep_secs * 1000); + pr_info("%s exit\n", __func__); +} + +static int livepatch_callbacks_mod_init(void) +{ + pr_info("%s\n", __func__); + schedule_delayed_work(&work, + msecs_to_jiffies(1000 * 0)); + return 0; +} + +static void livepatch_callbacks_mod_exit(void) +{ + cancel_delayed_work_sync(&work); + pr_info("%s\n", __func__); +} + +module_init(livepatch_callbacks_mod_init); +module_exit(livepatch_callbacks_mod_exit); +MODULE_LICENSE("GPL"); diff --git a/samples/livepatch/livepatch-callbacks-demo.c b/samples/livepatch/livepatch-callbacks-demo.c new file mode 100644 index 000000000000..72f9e6d1387b --- /dev/null +++ b/samples/livepatch/livepatch-callbacks-demo.c @@ -0,0 +1,219 @@ +/* + * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * livepatch-callbacks-demo.c - (un)patching callbacks livepatch demo + * + * + * Purpose + * ------- + * + * Demonstration of registering livepatch (un)patching callbacks. + * + * + * Usage + * ----- + * + * Step 1 - load the simple module + * + * insmod samples/livepatch/livepatch-callbacks-mod.ko + * + * + * Step 2 - load the demonstration livepatch (with callbacks) + * + * insmod samples/livepatch/livepatch-callbacks-demo.ko + * + * + * Step 3 - cleanup + * + * echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled + * rmmod livepatch_callbacks_demo + * rmmod livepatch_callbacks_mod + * + * Watch dmesg output to see livepatch enablement, callback execution + * and patching operations for both vmlinux and module targets. + * + * NOTE: swap the insmod order of livepatch-callbacks-mod.ko and + * livepatch-callbacks-demo.ko to observe what happens when a + * target module is loaded after a livepatch with callbacks. + * + * NOTE: 'pre_patch_ret' is a module parameter that sets the pre-patch + * callback return status. 
Try setting up a non-zero status + * such as -19 (-ENODEV): + * + * # Load demo livepatch, vmlinux is patched + * insmod samples/livepatch/livepatch-callbacks-demo.ko + * + * # Setup next pre-patch callback to return -ENODEV + * echo -19 > /sys/module/livepatch_callbacks_demo/parameters/pre_patch_ret + * + * # Module loader refuses to load the target module + * insmod samples/livepatch/livepatch-callbacks-mod.ko + * insmod: ERROR: could not insert module samples/livepatch/livepatch-callbacks-mod.ko: No such device + * + * NOTE: There is a second target module, + * livepatch-callbacks-busymod.ko, available for experimenting + * with livepatch (un)patch callbacks. This module contains + * a 'sleep_secs' parameter that parks the module on one of the + * functions that the livepatch demo module wants to patch. + * Modifying this value and tweaking the order of module loads can + * effectively demonstrate stalled patch transitions: + * + * # Load a target module, let it park on 'busymod_work_func' for + * # thirty seconds + * insmod samples/livepatch/livepatch-callbacks-busymod.ko sleep_secs=30 + * + * # Meanwhile load the livepatch + * insmod samples/livepatch/livepatch-callbacks-demo.ko + * + * # ... then load and unload another target module while the + * # transition is in progress + * insmod samples/livepatch/livepatch-callbacks-mod.ko + * rmmod samples/livepatch/livepatch-callbacks-mod.ko + * + * # Finally cleanup + * echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled + * rmmod samples/livepatch/livepatch-callbacks-demo.ko + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/livepatch.h> + +static int pre_patch_ret; +module_param(pre_patch_ret, int, 0644); +MODULE_PARM_DESC(pre_patch_ret, "pre_patch_ret (default=0)"); + +static const char *const module_state[] = { + [MODULE_STATE_LIVE] = "[MODULE_STATE_LIVE] Normal state", + [MODULE_STATE_COMING] = "[MODULE_STATE_COMING] Full formed, running module_init", + [MODULE_STATE_GOING] = "[MODULE_STATE_GOING] Going away", + [MODULE_STATE_UNFORMED] = "[MODULE_STATE_UNFORMED] Still setting it up", +}; + +static void callback_info(const char *callback, struct klp_object *obj) +{ + if (obj->mod) + pr_info("%s: %s -> %s\n", callback, obj->mod->name, + module_state[obj->mod->state]); + else + pr_info("%s: vmlinux\n", callback); +} + +/* Executed on object patching (ie, patch enablement) */ +static int pre_patch_callback(struct klp_object *obj) +{ + callback_info(__func__, obj); + return pre_patch_ret; +} + +/* Executed on object unpatching (ie, patch disablement) */ +static void post_patch_callback(struct klp_object *obj) +{ + callback_info(__func__, obj); +} + +/* Executed on object unpatching (ie, patch disablement) */ +static void pre_unpatch_callback(struct klp_object *obj) +{ + callback_info(__func__, obj); +} + +/* Executed on object unpatching (ie, patch disablement) */ +static void post_unpatch_callback(struct klp_object *obj) +{ + callback_info(__func__, obj); +} + +static void patched_work_func(struct work_struct *work) +{ + pr_info("%s\n", __func__); +} + +static struct klp_func no_funcs[] = { + { } +}; + +static struct klp_func busymod_funcs[] = { + { + .old_name = "busymod_work_func", + .new_func = patched_work_func, + }, { } +}; + +static struct klp_object objs[] = { + { + .name = NULL, /* vmlinux */ + .funcs = no_funcs, + .callbacks = { + .pre_patch = pre_patch_callback, + .post_patch = post_patch_callback, + .pre_unpatch = pre_unpatch_callback, + 
.post_unpatch = post_unpatch_callback, + }, + }, { + .name = "livepatch_callbacks_mod", + .funcs = no_funcs, + .callbacks = { + .pre_patch = pre_patch_callback, + .post_patch = post_patch_callback, + .pre_unpatch = pre_unpatch_callback, + .post_unpatch = post_unpatch_callback, + }, + }, { + .name = "livepatch_callbacks_busymod", + .funcs = busymod_funcs, + .callbacks = { + .pre_patch = pre_patch_callback, + .post_patch = post_patch_callback, + .pre_unpatch = pre_unpatch_callback, + .post_unpatch = post_unpatch_callback, + }, + }, { } +}; + +static struct klp_patch patch = { + .mod = THIS_MODULE, + .objs = objs, +}; + +static int livepatch_callbacks_demo_init(void) +{ + int ret; + + ret = klp_register_patch(&patch); + if (ret) + return ret; + ret = klp_enable_patch(&patch); + if (ret) { + WARN_ON(klp_unregister_patch(&patch)); + return ret; + } + return 0; +} + +static void livepatch_callbacks_demo_exit(void) +{ + WARN_ON(klp_unregister_patch(&patch)); +} + +module_init(livepatch_callbacks_demo_init); +module_exit(livepatch_callbacks_demo_exit); +MODULE_LICENSE("GPL"); +MODULE_INFO(livepatch, "Y"); diff --git a/samples/livepatch/livepatch-callbacks-mod.c b/samples/livepatch/livepatch-callbacks-mod.c new file mode 100644 index 000000000000..e610ce29ba44 --- /dev/null +++ b/samples/livepatch/livepatch-callbacks-mod.c @@ -0,0 +1,53 @@ +/* + * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * livepatch-callbacks-mod.c - (un)patching callbacks demo support module + * + * + * Purpose + * ------- + * + * Simple module to demonstrate livepatch (un)patching callbacks. + * + * + * Usage + * ----- + * + * This module is not intended to be standalone. See the "Usage" + * section of livepatch-callbacks-demo.c. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> + +static int livepatch_callbacks_mod_init(void) +{ + pr_info("%s\n", __func__); + return 0; +} + +static void livepatch_callbacks_mod_exit(void) +{ + pr_info("%s\n", __func__); +} + +module_init(livepatch_callbacks_mod_init); +module_exit(livepatch_callbacks_mod_exit); +MODULE_LICENSE("GPL"); diff --git a/samples/livepatch/livepatch-sample.c b/samples/livepatch/livepatch-sample.c index 84795223f15f..2d554dd930e2 100644 --- a/samples/livepatch/livepatch-sample.c +++ b/samples/livepatch/livepatch-sample.c @@ -71,21 +71,6 @@ static int livepatch_init(void) { int ret; - if (!klp_have_reliable_stack() && !patch.immediate) { - /* - * WARNING: Be very careful when using 'patch.immediate' in - * your patches. It's ok to use it for simple patches like - * this, but for more complex patches which change function - * semantics, locking semantics, or data structures, it may not - * be safe. Use of this option will also prevent removal of - * the patch. - * - * See Documentation/livepatch/livepatch.txt for more details. 
- */ - patch.immediate = true; - pr_notice("The consistency model isn't supported for your architecture. Bypassing safety mechanisms and applying the patch immediately.\n"); - } - ret = klp_register_patch(&patch); if (ret) return ret; diff --git a/samples/livepatch/livepatch-shadow-fix1.c b/samples/livepatch/livepatch-shadow-fix1.c new file mode 100644 index 000000000000..830c55514f9f --- /dev/null +++ b/samples/livepatch/livepatch-shadow-fix1.c @@ -0,0 +1,158 @@ +/* + * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * livepatch-shadow-fix1.c - Shadow variables, livepatch demo + * + * Purpose + * ------- + * + * Fixes the memory leak introduced in livepatch-shadow-mod through the + * use of a shadow variable. This fix demonstrates the "extending" of + * short-lived data structures by patching its allocation and release + * functions. + * + * + * Usage + * ----- + * + * This module is not intended to be standalone. See the "Usage" + * section of livepatch-shadow-mod.c. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/livepatch.h> +#include <linux/slab.h> + +/* Shadow variable enums */ +#define SV_LEAK 1 + +/* Allocate new dummies every second */ +#define ALLOC_PERIOD 1 +/* Check for expired dummies after a few new ones have been allocated */ +#define CLEANUP_PERIOD (3 * ALLOC_PERIOD) +/* Dummies expire after a few cleanup instances */ +#define EXPIRE_PERIOD (4 * CLEANUP_PERIOD) + +struct dummy { + struct list_head list; + unsigned long jiffies_expire; +}; + +struct dummy *livepatch_fix1_dummy_alloc(void) +{ + struct dummy *d; + void *leak; + + d = kzalloc(sizeof(*d), GFP_KERNEL); + if (!d) + return NULL; + + d->jiffies_expire = jiffies + + msecs_to_jiffies(1000 * EXPIRE_PERIOD); + + /* + * Patch: save the extra memory location into a SV_LEAK shadow + * variable. A patched dummy_free routine can later fetch this + * pointer to handle resource release. + */ + leak = kzalloc(sizeof(int), GFP_KERNEL); + klp_shadow_alloc(d, SV_LEAK, &leak, sizeof(leak), GFP_KERNEL); + + pr_info("%s: dummy @ %p, expires @ %lx\n", + __func__, d, d->jiffies_expire); + + return d; +} + +void livepatch_fix1_dummy_free(struct dummy *d) +{ + void **shadow_leak, *leak; + + /* + * Patch: fetch the saved SV_LEAK shadow variable, detach and + * free it. Note: handle cases where this shadow variable does + * not exist (ie, dummy structures allocated before this livepatch + * was loaded.) 
+ */ + shadow_leak = klp_shadow_get(d, SV_LEAK); + if (shadow_leak) { + leak = *shadow_leak; + klp_shadow_free(d, SV_LEAK); + kfree(leak); + pr_info("%s: dummy @ %p, prevented leak @ %p\n", + __func__, d, leak); + } else { + pr_info("%s: dummy @ %p leaked!\n", __func__, d); + } + + kfree(d); +} + +static struct klp_func funcs[] = { + { + .old_name = "dummy_alloc", + .new_func = livepatch_fix1_dummy_alloc, + }, + { + .old_name = "dummy_free", + .new_func = livepatch_fix1_dummy_free, + }, { } +}; + +static struct klp_object objs[] = { + { + .name = "livepatch_shadow_mod", + .funcs = funcs, + }, { } +}; + +static struct klp_patch patch = { + .mod = THIS_MODULE, + .objs = objs, +}; + +static int livepatch_shadow_fix1_init(void) +{ + int ret; + + ret = klp_register_patch(&patch); + if (ret) + return ret; + ret = klp_enable_patch(&patch); + if (ret) { + WARN_ON(klp_unregister_patch(&patch)); + return ret; + } + return 0; +} + +static void livepatch_shadow_fix1_exit(void) +{ + /* Cleanup any existing SV_LEAK shadow variables */ + klp_shadow_free_all(SV_LEAK); + + WARN_ON(klp_unregister_patch(&patch)); +} + +module_init(livepatch_shadow_fix1_init); +module_exit(livepatch_shadow_fix1_exit); +MODULE_LICENSE("GPL"); +MODULE_INFO(livepatch, "Y"); diff --git a/samples/livepatch/livepatch-shadow-fix2.c b/samples/livepatch/livepatch-shadow-fix2.c new file mode 100644 index 000000000000..ff9948f0ec00 --- /dev/null +++ b/samples/livepatch/livepatch-shadow-fix2.c @@ -0,0 +1,153 @@ +/* + * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * livepatch-shadow-fix2.c - Shadow variables, livepatch demo + * + * Purpose + * ------- + * + * Adds functionality to livepatch-shadow-mod's in-flight data + * structures through a shadow variable. The livepatch patches a + * routine that periodically inspects data structures, incrementing a + * per-data-structure counter, creating the counter if needed. + * + * + * Usage + * ----- + * + * This module is not intended to be standalone. See the "Usage" + * section of livepatch-shadow-mod.c. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/livepatch.h> +#include <linux/slab.h> + +/* Shadow variable enums */ +#define SV_LEAK 1 +#define SV_COUNTER 2 + +struct dummy { + struct list_head list; + unsigned long jiffies_expire; +}; + +bool livepatch_fix2_dummy_check(struct dummy *d, unsigned long jiffies) +{ + int *shadow_count; + int count; + + /* + * Patch: handle in-flight dummy structures, if they do not + * already have a SV_COUNTER shadow variable, then attach a + * new one. 
+ */ + count = 0; + shadow_count = klp_shadow_get_or_alloc(d, SV_COUNTER, + &count, sizeof(count), + GFP_NOWAIT); + if (shadow_count) + *shadow_count += 1; + + return time_after(jiffies, d->jiffies_expire); +} + +void livepatch_fix2_dummy_free(struct dummy *d) +{ + void **shadow_leak, *leak; + int *shadow_count; + + /* Patch: copy the memory leak patch from the fix1 module. */ + shadow_leak = klp_shadow_get(d, SV_LEAK); + if (shadow_leak) { + leak = *shadow_leak; + klp_shadow_free(d, SV_LEAK); + kfree(leak); + pr_info("%s: dummy @ %p, prevented leak @ %p\n", + __func__, d, leak); + } else { + pr_info("%s: dummy @ %p leaked!\n", __func__, d); + } + + /* + * Patch: fetch the SV_COUNTER shadow variable and display + * the final count. Detach the shadow variable. + */ + shadow_count = klp_shadow_get(d, SV_COUNTER); + if (shadow_count) { + pr_info("%s: dummy @ %p, check counter = %d\n", + __func__, d, *shadow_count); + klp_shadow_free(d, SV_COUNTER); + } + + kfree(d); +} + +static struct klp_func funcs[] = { + { + .old_name = "dummy_check", + .new_func = livepatch_fix2_dummy_check, + }, + { + .old_name = "dummy_free", + .new_func = livepatch_fix2_dummy_free, + }, { } +}; + +static struct klp_object objs[] = { + { + .name = "livepatch_shadow_mod", + .funcs = funcs, + }, { } +}; + +static struct klp_patch patch = { + .mod = THIS_MODULE, + .objs = objs, +}; + +static int livepatch_shadow_fix2_init(void) +{ + int ret; + + ret = klp_register_patch(&patch); + if (ret) + return ret; + ret = klp_enable_patch(&patch); + if (ret) { + WARN_ON(klp_unregister_patch(&patch)); + return ret; + } + return 0; +} + +static void livepatch_shadow_fix2_exit(void) +{ + /* Cleanup any existing SV_COUNTER shadow variables */ + klp_shadow_free_all(SV_COUNTER); + + WARN_ON(klp_unregister_patch(&patch)); +} + +module_init(livepatch_shadow_fix2_init); +module_exit(livepatch_shadow_fix2_exit); +MODULE_LICENSE("GPL"); +MODULE_INFO(livepatch, "Y"); diff --git a/samples/livepatch/livepatch-shadow-mod.c b/samples/livepatch/livepatch-shadow-mod.c new file mode 100644 index 000000000000..4c54b250332d --- /dev/null +++ b/samples/livepatch/livepatch-shadow-mod.c @@ -0,0 +1,224 @@ +/* + * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +/* + * livepatch-shadow-mod.c - Shadow variables, buggy module demo + * + * Purpose + * ------- + * + * As a demonstration of the livepatch shadow variable API, this module + * introduces memory leak behavior that livepatch modules + * livepatch-shadow-fix1.ko and livepatch-shadow-fix2.ko correct and + * enhance. + * + * WARNING - even though the livepatch-shadow-fix modules patch the + * memory leak, please load these modules at your own risk -- some + * amount of memory may be leaked before the bug is patched.
+ * + * + * Usage + * ----- + * + * Step 1 - Load the buggy demonstration module: + * + * insmod samples/livepatch/livepatch-shadow-mod.ko + * + * Watch dmesg output for a few moments to see new dummies being allocated + * and a periodic cleanup check. (Note: a small amount of memory is + * being leaked.) + * + * + * Step 2 - Load livepatch fix1: + * + * insmod samples/livepatch/livepatch-shadow-fix1.ko + * + * Continue watching dmesg and note that now livepatch_fix1_dummy_free() + * and livepatch_fix1_dummy_alloc() are logging messages about leaked + * memory and eventually leaks prevented. + * + * + * Step 3 - Load livepatch fix2 (on top of fix1): + * + * insmod samples/livepatch/livepatch-shadow-fix2.ko + * + * This module extends functionality through shadow variables, as a new + * "check" counter is added to the dummy structure. Periodic dmesg + * messages will log these as dummies are cleaned up. + * + * + * Step 4 - Cleanup + * + * Unwind the demonstration by disabling the livepatch fix modules, then + * removing them and the demo module: + * + * echo 0 > /sys/kernel/livepatch/livepatch_shadow_fix2/enabled + * echo 0 > /sys/kernel/livepatch/livepatch_shadow_fix1/enabled + * rmmod livepatch-shadow-fix2 + * rmmod livepatch-shadow-fix1 + * rmmod livepatch-shadow-mod + */ + + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/stat.h> +#include <linux/workqueue.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Joe Lawrence <joe.lawrence@redhat.com>"); +MODULE_DESCRIPTION("Buggy module for shadow variable demo"); + +/* Allocate new dummies every second */ +#define ALLOC_PERIOD 1 +/* Check for expired dummies after a few new ones have been allocated */ +#define CLEANUP_PERIOD (3 * ALLOC_PERIOD) +/* Dummies expire after a few cleanup instances */ +#define EXPIRE_PERIOD (4 * CLEANUP_PERIOD) + +/* + * Keep a list of all the dummies so we can clean up any residual ones + * on module exit + */ +LIST_HEAD(dummy_list); +DEFINE_MUTEX(dummy_list_mutex); + +struct dummy { + struct list_head list; + unsigned long jiffies_expire; +}; + +noinline struct dummy *dummy_alloc(void) +{ + struct dummy *d; + void *leak; + + d = kzalloc(sizeof(*d), GFP_KERNEL); + if (!d) + return NULL; + + d->jiffies_expire = jiffies + + msecs_to_jiffies(1000 * EXPIRE_PERIOD); + + /* Oops, forgot to save leak! */ + leak = kzalloc(sizeof(int), GFP_KERNEL); + + pr_info("%s: dummy @ %p, expires @ %lx\n", + __func__, d, d->jiffies_expire); + + return d; +} + +noinline void dummy_free(struct dummy *d) +{ + pr_info("%s: dummy @ %p, expired = %lx\n", + __func__, d, d->jiffies_expire); + + kfree(d); +} + +noinline bool dummy_check(struct dummy *d, unsigned long jiffies) +{ + return time_after(jiffies, d->jiffies_expire); +} + +/* + * alloc_work_func: allocates new dummy structures, allocates additional + * memory, aptly named "leak", but doesn't keep + * permanent record of it. + */ + +static void alloc_work_func(struct work_struct *work); +static DECLARE_DELAYED_WORK(alloc_dwork, alloc_work_func); + +static void alloc_work_func(struct work_struct *work) +{ + struct dummy *d; + + d = dummy_alloc(); + if (!d) + return; + + mutex_lock(&dummy_list_mutex); + list_add(&d->list, &dummy_list); + mutex_unlock(&dummy_list_mutex); + + schedule_delayed_work(&alloc_dwork, + msecs_to_jiffies(1000 * ALLOC_PERIOD)); +} + +/* + * cleanup_work_func: frees dummy structures. Without knowledge of + * "leak", it leaks the additional memory that + * alloc_work_func created.
+ */ + +static void cleanup_work_func(struct work_struct *work); +static DECLARE_DELAYED_WORK(cleanup_dwork, cleanup_work_func); + +static void cleanup_work_func(struct work_struct *work) +{ + struct dummy *d, *tmp; + unsigned long j; + + j = jiffies; + pr_info("%s: jiffies = %lx\n", __func__, j); + + mutex_lock(&dummy_list_mutex); + list_for_each_entry_safe(d, tmp, &dummy_list, list) { + + /* Kick out and free any expired dummies */ + if (dummy_check(d, j)) { + list_del(&d->list); + dummy_free(d); + } + } + mutex_unlock(&dummy_list_mutex); + + schedule_delayed_work(&cleanup_dwork, + msecs_to_jiffies(1000 * CLEANUP_PERIOD)); +} + +static int livepatch_shadow_mod_init(void) +{ + schedule_delayed_work(&alloc_dwork, + msecs_to_jiffies(1000 * ALLOC_PERIOD)); + schedule_delayed_work(&cleanup_dwork, + msecs_to_jiffies(1000 * CLEANUP_PERIOD)); + + return 0; +} + +static void livepatch_shadow_mod_exit(void) +{ + struct dummy *d, *tmp; + + /* Wait for any dummies at work */ + cancel_delayed_work_sync(&alloc_dwork); + cancel_delayed_work_sync(&cleanup_dwork); + + /* Cleanup residual dummies */ + list_for_each_entry_safe(d, tmp, &dummy_list, list) { + list_del(&d->list); + dummy_free(d); + } +} + +module_init(livepatch_shadow_mod_init); +module_exit(livepatch_shadow_mod_exit); diff --git a/samples/mic/mpssd/mpssd.c b/samples/mic/mpssd/mpssd.c index 49db1def1721..f42ce551bb48 100644 --- a/samples/mic/mpssd/mpssd.c +++ b/samples/mic/mpssd/mpssd.c @@ -65,7 +65,7 @@ static struct mic_info mic_list; /* to align the pointer to the (next) page boundary */ #define PAGE_ALIGN(addr) _ALIGN(addr, PAGE_SIZE) -#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) +#define READ_ONCE(x) (*(volatile typeof(x) *)&(x)) #define GSO_ENABLED 1 #define MAX_GSO_SIZE (64 * 1024) @@ -382,7 +382,7 @@ disp_iovec(struct mic_info *mic, struct mic_copy_desc *copy, static inline __u16 read_avail_idx(struct mic_vring *vr) { - return ACCESS_ONCE(vr->info->avail_idx); + return READ_ONCE(vr->info->avail_idx); } static inline void txrx_prepare(int type, bool tx, struct mic_vring *vr, @@ -523,7 +523,7 @@ spin_for_descriptors(struct mic_info *mic, struct mic_vring *vr) { __u16 avail_idx = read_avail_idx(vr); - while (avail_idx == le16toh(ACCESS_ONCE(vr->vr.avail->idx))) { + while (avail_idx == le16toh(READ_ONCE(vr->vr.avail->idx))) { #ifdef DEBUG mpsslog("%s %s waiting for desc avail %d info_avail %d\n", mic->name, __func__, diff --git a/samples/pktgen/functions.sh b/samples/pktgen/functions.sh index 205e4cde4601..f8bb3cd0f4ce 100644 --- a/samples/pktgen/functions.sh +++ b/samples/pktgen/functions.sh @@ -119,3 +119,46 @@ function root_check_run_with_sudo() { err 4 "cannot perform sudo run of $0" fi } + +# Extract the input device's NUMA node info +function get_iface_node() +{ + local node=$(</sys/class/net/$1/device/numa_node) + if [[ $node == -1 ]]; then + echo 0 + else + echo $node + fi +} + +# Given a Dev/iface, get its queues' irq numbers +function get_iface_irqs() +{ + local IFACE=$1 + local queues="${IFACE}-.*TxRx" + + irqs=$(grep "$queues" /proc/interrupts | cut -f1 -d:) + [ -z "$irqs" ] && irqs=$(grep $IFACE /proc/interrupts | cut -f1 -d:) + [ -z "$irqs" ] && irqs=$(for i in `ls -Ux /sys/class/net/$IFACE/device/msi_irqs` ;\ + do grep "$i:.*TxRx" /proc/interrupts | grep -v fdir | cut -f 1 -d : ;\ + done) + [ -z "$irqs" ] && err 3 "Could not find interrupts for $IFACE" + + echo $irqs +} + +# Given a NUMA node, return cpu ids belonging to it.
+function get_node_cpus() +{ + local node=$1 + local node_cpu_list + local node_cpu_range_list=`cut -f1- -d, --output-delimiter=" " \ + /sys/devices/system/node/node$node/cpulist` + + for cpu_range in $node_cpu_range_list + do + node_cpu_list="$node_cpu_list "`seq -s " " ${cpu_range//-/ }` + done + + echo $node_cpu_list +} diff --git a/samples/pktgen/pktgen.conf-1-1-ip6 b/samples/pktgen/pktgen.conf-1-1-ip6 deleted file mode 100755 index 62426afeef84..000000000000 --- a/samples/pktgen/pktgen.conf-1-1-ip6 +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -#modprobe pktgen - - -function pgset() { - local result - - echo $1 > $PGDEV - - result=`cat $PGDEV | fgrep "Result: OK:"` - if [ "$result" = "" ]; then - cat $PGDEV | fgrep Result: - fi -} - -# Config Start Here ----------------------------------------------------------- - - -# thread config -# Each CPU has its own thread. One CPU example. We add eth1. -# IPv6. Note increase in minimal packet length - -PGDEV=/proc/net/pktgen/kpktgend_0 - echo "Removing all devices" - pgset "rem_device_all" - echo "Adding eth1" - pgset "add_device eth1" - - -# device config -# delay 0 - -CLONE_SKB="clone_skb 1000000" -# NIC adds 4 bytes CRC -PKT_SIZE="pkt_size 66" - -# COUNT 0 means forever -#COUNT="count 0" -COUNT="count 10000000" -DELAY="delay 0" - -PGDEV=/proc/net/pktgen/eth1 - echo "Configuring $PGDEV" - pgset "$COUNT" - pgset "$CLONE_SKB" - pgset "$PKT_SIZE" - pgset "$DELAY" - pgset "dst6 fec0::1" - pgset "src6 fec0::2" - pgset "dst_mac 00:04:23:08:91:dc" - -# Time to run -PGDEV=/proc/net/pktgen/pgctrl - - echo "Running... ctrl^C to stop" - trap true INT - pgset "start" - echo "Done" - cat /proc/net/pktgen/eth1 diff --git a/samples/pktgen/pktgen.conf-1-1-ip6-rdos b/samples/pktgen/pktgen.conf-1-1-ip6-rdos deleted file mode 100755 index 3ac3eb1f3504..000000000000 --- a/samples/pktgen/pktgen.conf-1-1-ip6-rdos +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -#modprobe pktgen - - -function pgset() { - local result - - echo $1 > $PGDEV - - result=`cat $PGDEV | fgrep "Result: OK:"` - if [ "$result" = "" ]; then - cat $PGDEV | fgrep Result: - fi -} - -# Config Start Here ----------------------------------------------------------- - - -# thread config -# Each CPU has its own thread. One CPU example. We add eth1. -# IPv6. Note increase in minimal packet length - -PGDEV=/proc/net/pktgen/kpktgend_0 - echo "Removing all devices" - pgset "rem_device_all" - echo "Adding eth1" - pgset "add_device eth1" - - -# device config -# delay 0 means maximum speed. - -# We need to do alloc for every skb since we cannot clone here. -CLONE_SKB="clone_skb 0" - -# NIC adds 4 bytes CRC -PKT_SIZE="pkt_size 66" - -# COUNT 0 means forever -#COUNT="count 0" -COUNT="count 10000000" -DELAY="delay 0" - -PGDEV=/proc/net/pktgen/eth1 - echo "Configuring $PGDEV" - pgset "$COUNT" - pgset "$CLONE_SKB" - pgset "$PKT_SIZE" - pgset "$DELAY" - pgset "dst6_min fec0::1" - pgset "dst6_max fec0::FFFF:FFFF" - - pgset "dst_mac 00:04:23:08:91:dc" - -# Time to run -PGDEV=/proc/net/pktgen/pgctrl - - echo "Running... 
ctrl^C to stop" - trap true INT - pgset "start" - echo "Done" - cat /proc/net/pktgen/eth1 diff --git a/samples/pktgen/pktgen.conf-1-2 b/samples/pktgen/pktgen.conf-1-2 deleted file mode 100755 index a85552760762..000000000000 --- a/samples/pktgen/pktgen.conf-1-2 +++ /dev/null @@ -1,70 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -#modprobe pktgen - - -function pgset() { - local result - - echo $1 > $PGDEV - - result=`cat $PGDEV | fgrep "Result: OK:"` - if [ "$result" = "" ]; then - cat $PGDEV | fgrep Result: - fi -} - -# Config Start Here ----------------------------------------------------------- - - -# thread config -# One CPU means one thread. One CPU example. We add eth1, eth2 respectivly. - -PGDEV=/proc/net/pktgen/kpktgend_0 - echo "Removing all devices" - pgset "rem_device_all" - echo "Adding eth1" - pgset "add_device eth1" - echo "Adding eth2" - pgset "add_device eth2" - - -# device config -# delay 0 means maximum speed. - -CLONE_SKB="clone_skb 1000000" -# NIC adds 4 bytes CRC -PKT_SIZE="pkt_size 60" - -# COUNT 0 means forever -#COUNT="count 0" -COUNT="count 10000000" -DELAY="delay 0" - -PGDEV=/proc/net/pktgen/eth1 - echo "Configuring $PGDEV" - pgset "$COUNT" - pgset "$CLONE_SKB" - pgset "$PKT_SIZE" - pgset "$DELAY" - pgset "dst 10.10.11.2" - pgset "dst_mac 00:04:23:08:91:dc" - -PGDEV=/proc/net/pktgen/eth2 - echo "Configuring $PGDEV" - pgset "$COUNT" - pgset "$CLONE_SKB" - pgset "$PKT_SIZE" - pgset "$DELAY" - pgset "dst 192.168.2.2" - pgset "dst_mac 00:04:23:08:91:de" - -# Time to run -PGDEV=/proc/net/pktgen/pgctrl - - echo "Running... ctrl^C to stop" - trap true INT - pgset "start" - echo "Done" - cat /proc/net/pktgen/eth1 /proc/net/pktgen/eth2 diff --git a/samples/pktgen/pktgen_sample03_burst_single_flow.sh b/samples/pktgen/pktgen_sample03_burst_single_flow.sh index 4c2e4217638a..8fdd36722d9e 100755 --- a/samples/pktgen/pktgen_sample03_burst_single_flow.sh +++ b/samples/pktgen/pktgen_sample03_burst_single_flow.sh @@ -31,7 +31,7 @@ if [ -z "$DEST_IP" ]; then fi [ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff" [ -z "$BURST" ] && BURST=32 -[ -z "$CLONE_SKB" ] && CLONE_SKB="100000" +[ -z "$CLONE_SKB" ] && CLONE_SKB="0" # No need for clones when bursting [ -z "$COUNT" ] && COUNT="0" # Zero means indefinitely # Base Config diff --git a/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh b/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh new file mode 100755 index 000000000000..353adc17205e --- /dev/null +++ b/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh @@ -0,0 +1,97 @@ +#!/bin/bash +# +# Multiqueue: Using pktgen threads for sending on multiple CPUs +# * adding devices to kernel threads which are in the same NUMA node +# * bound devices queue's irq affinity to the threads, 1:1 mapping +# * notice the naming scheme for keeping device names unique +# * nameing scheme: dev@thread_number +# * flow variation via random UDP source port +# +basedir=`dirname $0` +source ${basedir}/functions.sh +root_check_run_with_sudo "$@" +# +# Required param: -i dev in $DEV +source ${basedir}/parameters.sh + +# Base Config +DELAY="0" # Zero means max speed +[ -z "$COUNT" ] && COUNT="20000000" # Zero means indefinitely +[ -z "$CLONE_SKB" ] && CLONE_SKB="0" + +# Flow variation random source port between min and max +UDP_MIN=9 +UDP_MAX=109 + +node=`get_iface_node $DEV` +irq_array=(`get_iface_irqs $DEV`) +cpu_array=(`get_node_cpus $node`) + +[ $THREADS -gt ${#irq_array[*]} -o $THREADS -gt ${#cpu_array[*]} ] && \ + err 1 "Thread number $THREADS 
exceeds: min (${#irq_array[*]},${#cpu_array[*]})" + +# (example of setting default params in your script) +if [ -z "$DEST_IP" ]; then + [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1" +fi +[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff" + +# General cleanup everything since last run +pg_ctrl "reset" + +# Threads are specified with parameter -t value in $THREADS +for ((i = 0; i < $THREADS; i++)); do + # The device name is extended with @name, using thread number to + # make then unique, but any name will do. + # Set the queue's irq affinity to this $thread (processor) + # if '-f' is designated, offset cpu id + thread=${cpu_array[$((i+F_THREAD))]} + dev=${DEV}@${thread} + echo $thread > /proc/irq/${irq_array[$i]}/smp_affinity_list + info "irq ${irq_array[$i]} is set affinity to `cat /proc/irq/${irq_array[$i]}/smp_affinity_list`" + + # Add remove all other devices and add_device $dev to thread + pg_thread $thread "rem_device_all" + pg_thread $thread "add_device" $dev + + # select queue and bind the queue and $dev in 1:1 relationship + queue_num=$i + info "queue number is $queue_num" + pg_set $dev "queue_map_min $queue_num" + pg_set $dev "queue_map_max $queue_num" + + # Notice config queue to map to cpu (mirrors smp_processor_id()) + # It is beneficial to map IRQ /proc/irq/*/smp_affinity 1:1 to CPU number + pg_set $dev "flag QUEUE_MAP_CPU" + + # Base config of dev + pg_set $dev "count $COUNT" + pg_set $dev "clone_skb $CLONE_SKB" + pg_set $dev "pkt_size $PKT_SIZE" + pg_set $dev "delay $DELAY" + + # Flag example disabling timestamping + pg_set $dev "flag NO_TIMESTAMP" + + # Destination + pg_set $dev "dst_mac $DST_MAC" + pg_set $dev "dst$IP6 $DEST_IP" + + # Setup random UDP port src range + pg_set $dev "flag UDPSRC_RND" + pg_set $dev "udp_src_min $UDP_MIN" + pg_set $dev "udp_src_max $UDP_MAX" +done + +# start_run +echo "Running... ctrl^C to stop" >&2 +pg_ctrl "start" +echo "Done" >&2 + +# Print results +for ((i = 0; i < $THREADS; i++)); do + thread=${cpu_array[$((i+F_THREAD))]} + dev=${DEV}@${thread} + echo "Device: $dev" + cat /proc/net/pktgen/$dev | grep -A2 "Result:" +done diff --git a/samples/seccomp/Makefile b/samples/seccomp/Makefile index 19a870eed82b..ba942e3ead89 100644 --- a/samples/seccomp/Makefile +++ b/samples/seccomp/Makefile @@ -1,7 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -# kbuild trick to avoid linker error. Can be omitted if a module is built. -obj- := dummy.o - +ifndef CROSS_COMPILE hostprogs-$(CONFIG_SAMPLE_SECCOMP) := bpf-fancy dropper bpf-direct HOSTCFLAGS_bpf-fancy.o += -I$(objtree)/usr/include @@ -19,7 +17,6 @@ HOSTCFLAGS_bpf-direct.o += -idirafter $(objtree)/include bpf-direct-objs := bpf-direct.o # Try to match the kernel target. -ifndef CROSS_COMPILE ifndef CONFIG_64BIT # s390 has -m31 flag to build 31 bit binaries @@ -38,12 +35,4 @@ HOSTLOADLIBES_bpf-fancy += $(MFLAG) HOSTLOADLIBES_dropper += $(MFLAG) endif always := $(hostprogs-m) -else -# MIPS system calls are defined based on the -mabi that is passed -# to the toolchain which may or may not be a valid option -# for the host toolchain. So disable tests if target architecture -# is MIPS but the host isn't. -ifndef CONFIG_MIPS -always := $(hostprogs-m) -endif endif diff --git a/samples/sockmap/Makefile b/samples/sockmap/Makefile index 9291ab8e0f8c..73f1da4d116c 100644 --- a/samples/sockmap/Makefile +++ b/samples/sockmap/Makefile @@ -1,6 +1,3 @@ -# kbuild trick to avoid linker error. Can be omitted if a module is built. 
-obj- := dummy.o - # List of programs to build hostprogs-y := sockmap diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c index 7cc9d228216f..7c25c0c112bc 100644 --- a/samples/sockmap/sockmap_user.c +++ b/samples/sockmap/sockmap_user.c @@ -23,8 +23,11 @@ #include <stdbool.h> #include <signal.h> #include <fcntl.h> +#include <sys/wait.h> +#include <time.h> #include <sys/time.h> +#include <sys/resource.h> #include <sys/types.h> #include <linux/netlink.h> @@ -35,6 +38,8 @@ #include <assert.h> #include <libgen.h> +#include <getopt.h> + #include "../bpf/bpf_load.h" #include "../bpf/bpf_util.h" #include "../bpf/libbpf.h" @@ -46,15 +51,42 @@ void running_handler(int a); #define S1_PORT 10000 #define S2_PORT 10001 -static int sockmap_test_sockets(int rate, int dot) +/* global sockets */ +int s1, s2, c1, c2, p1, p2; + +static const struct option long_options[] = { + {"help", no_argument, NULL, 'h' }, + {"cgroup", required_argument, NULL, 'c' }, + {"rate", required_argument, NULL, 'r' }, + {"verbose", no_argument, NULL, 'v' }, + {"iov_count", required_argument, NULL, 'i' }, + {"length", required_argument, NULL, 'l' }, + {"test", required_argument, NULL, 't' }, + {0, 0, NULL, 0 } +}; + +static void usage(char *argv[]) +{ + int i; + + printf(" Usage: %s --cgroup <cgroup_path>\n", argv[0]); + printf(" options:\n"); + for (i = 0; long_options[i].name != 0; i++) { + printf(" --%-12s", long_options[i].name); + if (long_options[i].flag != NULL) + printf(" flag (internal value:%d)\n", + *long_options[i].flag); + else + printf(" -%c\n", long_options[i].val); + } + printf("\n"); +} + +static int sockmap_init_sockets(void) { - int i, sc, err, max_fd, one = 1; - int s1, s2, c1, c2, p1, p2; + int i, err, one = 1; struct sockaddr_in addr; - struct timeval timeout; - char buf[1024] = {0}; int *fds[4] = {&s1, &s2, &c1, &c2}; - fd_set w; s1 = s2 = p1 = p2 = c1 = c2 = 0; @@ -63,8 +95,7 @@ static int sockmap_test_sockets(int rate, int dot) *fds[i] = socket(AF_INET, SOCK_STREAM, 0); if (*fds[i] < 0) { perror("socket s1 failed()"); - err = *fds[i]; - goto out; + return errno; } } @@ -74,16 +105,16 @@ static int sockmap_test_sockets(int rate, int dot) (char *)&one, sizeof(one)); if (err) { perror("setsockopt failed()"); - goto out; + return errno; } } /* Non-blocking sockets */ - for (i = 0; i < 4; i++) { + for (i = 0; i < 2; i++) { err = ioctl(*fds[i], FIONBIO, (char *)&one); if (err < 0) { perror("ioctl s1 failed()"); - goto out; + return errno; } } @@ -96,14 +127,14 @@ static int sockmap_test_sockets(int rate, int dot) err = bind(s1, (struct sockaddr *)&addr, sizeof(addr)); if (err < 0) { perror("bind s1 failed()\n"); - goto out; + return errno; } addr.sin_port = htons(S2_PORT); err = bind(s2, (struct sockaddr *)&addr, sizeof(addr)); if (err < 0) { perror("bind s2 failed()\n"); - goto out; + return errno; } /* Listen server sockets */ @@ -111,14 +142,14 @@ static int sockmap_test_sockets(int rate, int dot) err = listen(s1, 32); if (err < 0) { perror("listen s1 failed()\n"); - goto out; + return errno; } addr.sin_port = htons(S2_PORT); err = listen(s2, 32); if (err < 0) { perror("listen s1 failed()\n"); - goto out; + return errno; } /* Initiate Connect */ @@ -126,46 +157,232 @@ static int sockmap_test_sockets(int rate, int dot) err = connect(c1, (struct sockaddr *)&addr, sizeof(addr)); if (err < 0 && errno != EINPROGRESS) { perror("connect c1 failed()\n"); - goto out; + return errno; } addr.sin_port = htons(S2_PORT); err = connect(c2, (struct sockaddr *)&addr, sizeof(addr)); if (err < 0 && errno 
!= EINPROGRESS) { perror("connect c2 failed()\n"); - goto out; + return errno; + } else if (err < 0) { + err = 0; } /* Accept Connecrtions */ p1 = accept(s1, NULL, NULL); if (p1 < 0) { perror("accept s1 failed()\n"); - goto out; + return errno; } p2 = accept(s2, NULL, NULL); if (p2 < 0) { perror("accept s1 failed()\n"); - goto out; + return errno; } - max_fd = p2; - timeout.tv_sec = 10; - timeout.tv_usec = 0; - printf("connected sockets: c1 <-> p1, c2 <-> p2\n"); printf("cgroups binding: c1(%i) <-> s1(%i) - - - c2(%i) <-> s2(%i)\n", c1, s1, c2, s2); + return 0; +} + +struct msg_stats { + size_t bytes_sent; + size_t bytes_recvd; + struct timespec start; + struct timespec end; +}; + +static int msg_loop(int fd, int iov_count, int iov_length, int cnt, + struct msg_stats *s, bool tx) +{ + struct msghdr msg = {0}; + int err, i, flags = MSG_NOSIGNAL; + struct iovec *iov; + + iov = calloc(iov_count, sizeof(struct iovec)); + if (!iov) + return errno; + + for (i = 0; i < iov_count; i++) { + char *d = calloc(iov_length, sizeof(char)); + + if (!d) { + fprintf(stderr, "iov_count %i/%i OOM\n", i, iov_count); + goto out_errno; + } + iov[i].iov_base = d; + iov[i].iov_len = iov_length; + } + + msg.msg_iov = iov; + msg.msg_iovlen = iov_count; + + if (tx) { + clock_gettime(CLOCK_MONOTONIC, &s->start); + for (i = 0; i < cnt; i++) { + int sent = sendmsg(fd, &msg, flags); + + if (sent < 0) { + perror("send loop error:"); + goto out_errno; + } + s->bytes_sent += sent; + } + clock_gettime(CLOCK_MONOTONIC, &s->end); + } else { + int slct, recv, max_fd = fd; + struct timeval timeout; + float total_bytes; + fd_set w; + + total_bytes = (float)iov_count * (float)iov_length * (float)cnt; + err = clock_gettime(CLOCK_MONOTONIC, &s->start); + if (err < 0) + perror("recv start time: "); + while (s->bytes_recvd < total_bytes) { + timeout.tv_sec = 1; + timeout.tv_usec = 0; + + /* FD sets */ + FD_ZERO(&w); + FD_SET(fd, &w); + + slct = select(max_fd + 1, &w, NULL, NULL, &timeout); + if (slct == -1) { + perror("select()"); + clock_gettime(CLOCK_MONOTONIC, &s->end); + goto out_errno; + } else if (!slct) { + fprintf(stderr, "unexpected timeout\n"); + errno = -EIO; + clock_gettime(CLOCK_MONOTONIC, &s->end); + goto out_errno; + } + + recv = recvmsg(fd, &msg, flags); + if (recv < 0) { + if (errno != EWOULDBLOCK) { + clock_gettime(CLOCK_MONOTONIC, &s->end); + perror("recv failed()\n"); + goto out_errno; + } + } + + s->bytes_recvd += recv; + } + clock_gettime(CLOCK_MONOTONIC, &s->end); + } + + for (i = 0; i < iov_count; i++) + free(iov[i].iov_base); + free(iov); + return 0; +out_errno: + for (i = 0; i < iov_count; i++) + free(iov[i].iov_base); + free(iov); + return errno; +} + +static float giga = 1000000000; + +static inline float sentBps(struct msg_stats s) +{ + return s.bytes_sent / (s.end.tv_sec - s.start.tv_sec); +} + +static inline float recvdBps(struct msg_stats s) +{ + return s.bytes_recvd / (s.end.tv_sec - s.start.tv_sec); +} + +static int sendmsg_test(int iov_count, int iov_buf, int cnt, + int verbose, bool base) +{ + float sent_Bps = 0, recvd_Bps = 0; + int rx_fd, txpid, rxpid, err = 0; + struct msg_stats s = {0}; + int status; + + errno = 0; + + if (base) + rx_fd = p1; + else + rx_fd = p2; + + rxpid = fork(); + if (rxpid == 0) { + err = msg_loop(rx_fd, iov_count, iov_buf, cnt, &s, false); + if (err) + fprintf(stderr, + "msg_loop_rx: iov_count %i iov_buf %i cnt %i err %i\n", + iov_count, iov_buf, cnt, err); + shutdown(p2, SHUT_RDWR); + shutdown(p1, SHUT_RDWR); + if (s.end.tv_sec - s.start.tv_sec) { + sent_Bps = sentBps(s); 
+ recvd_Bps = recvdBps(s); + } + fprintf(stdout, + "rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB %fB/s %fGB/s\n", + s.bytes_sent, sent_Bps, sent_Bps/giga, + s.bytes_recvd, recvd_Bps, recvd_Bps/giga); + exit(1); + } else if (rxpid == -1) { + perror("msg_loop_rx: "); + return errno; + } + + txpid = fork(); + if (txpid == 0) { + err = msg_loop(c1, iov_count, iov_buf, cnt, &s, true); + if (err) + fprintf(stderr, + "msg_loop_tx: iov_count %i iov_buf %i cnt %i err %i\n", + iov_count, iov_buf, cnt, err); + shutdown(c1, SHUT_RDWR); + if (s.end.tv_sec - s.start.tv_sec) { + sent_Bps = sentBps(s); + recvd_Bps = recvdBps(s); + } + fprintf(stdout, + "tx_sendmsg: TX: %zuB %fB/s %f GB/s RX: %zuB %fB/s %fGB/s\n", + s.bytes_sent, sent_Bps, sent_Bps/giga, + s.bytes_recvd, recvd_Bps, recvd_Bps/giga); + exit(1); + } else if (txpid == -1) { + perror("msg_loop_tx: "); + return errno; + } + + assert(waitpid(rxpid, &status, 0) == rxpid); + assert(waitpid(txpid, &status, 0) == txpid); + return err; +} + +static int forever_ping_pong(int rate, int verbose) +{ + struct timeval timeout; + char buf[1024] = {0}; + int sc; + + timeout.tv_sec = 10; + timeout.tv_usec = 0; /* Ping/Pong data from client to server */ sc = send(c1, buf, sizeof(buf), 0); if (sc < 0) { perror("send failed()\n"); - goto out; + return sc; } do { - int s, rc, i; + int s, rc, i, max_fd = p2; + fd_set w; /* FD sets */ FD_ZERO(&w); @@ -193,7 +410,7 @@ static int sockmap_test_sockets(int rate, int dot) if (rc < 0) { if (errno != EWOULDBLOCK) { perror("recv failed()\n"); - break; + return rc; } } @@ -205,35 +422,92 @@ static int sockmap_test_sockets(int rate, int dot) sc = send(i, buf, rc, 0); if (sc < 0) { perror("send failed()\n"); - break; + return sc; } } - sleep(rate); - if (dot) { + + if (rate) + sleep(rate); + + if (verbose) { printf("."); fflush(stdout); } } while (running); -out: - close(s1); - close(s2); - close(p1); - close(p2); - close(c1); - close(c2); - return err; + return 0; } +enum { + PING_PONG, + SENDMSG, + BASE, +}; + int main(int argc, char **argv) { - int rate = 1, dot = 1; + int iov_count = 1, length = 1024, rate = 1, verbose = 0; + struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY}; + int opt, longindex, err, cg_fd = 0; + int test = PING_PONG; char filename[256]; - int err, cg_fd; - char *cg_path; - cg_path = argv[argc - 1]; + while ((opt = getopt_long(argc, argv, "hvc:r:i:l:t:", + long_options, &longindex)) != -1) { + switch (opt) { + /* Cgroup configuration */ + case 'c': + cg_fd = open(optarg, O_DIRECTORY, O_RDONLY); + if (cg_fd < 0) { + fprintf(stderr, + "ERROR: (%i) open cg path failed: %s\n", + cg_fd, optarg); + return cg_fd; + } + break; + case 'r': + rate = atoi(optarg); + break; + case 'v': + verbose = 1; + break; + case 'i': + iov_count = atoi(optarg); + break; + case 'l': + length = atoi(optarg); + break; + case 't': + if (strcmp(optarg, "ping") == 0) { + test = PING_PONG; + } else if (strcmp(optarg, "sendmsg") == 0) { + test = SENDMSG; + } else if (strcmp(optarg, "base") == 0) { + test = BASE; + } else { + usage(argv); + return -1; + } + break; + case 'h': + default: + usage(argv); + return -1; + } + } + + if (!cg_fd) { + fprintf(stderr, "%s requires cgroup option: --cgroup <path>\n", + argv[0]); + return -1; + } + + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); running = 1; @@ -241,20 +515,16 @@ int main(int argc, char **argv) /* catch SIGINT */ signal(SIGINT, running_handler); + /* If base test skip BPF setup 
*/ + if (test == BASE) + goto run; + if (load_bpf_file(filename)) { fprintf(stderr, "load_bpf_file: (%s) %s\n", filename, strerror(errno)); return 1; } - /* Cgroup configuration */ - cg_fd = open(cg_path, O_DIRECTORY, O_RDONLY); - if (cg_fd < 0) { - fprintf(stderr, "ERROR: (%i) open cg path failed: %s\n", - cg_fd, cg_path); - return cg_fd; - } - /* Attach programs to sockmap */ err = bpf_prog_attach(prog_fd[0], map_fd[0], BPF_SK_SKB_STREAM_PARSER, 0); @@ -280,12 +550,30 @@ int main(int argc, char **argv) return err; } - err = sockmap_test_sockets(rate, dot); +run: + err = sockmap_init_sockets(); if (err) { fprintf(stderr, "ERROR: test socket failed: %d\n", err); - return err; + goto out; } - return 0; + + if (test == PING_PONG) + err = forever_ping_pong(rate, verbose); + else if (test == SENDMSG) + err = sendmsg_test(iov_count, length, rate, verbose, false); + else if (test == BASE) + err = sendmsg_test(iov_count, length, rate, verbose, true); + else + fprintf(stderr, "unknown test\n"); +out: + close(s1); + close(s2); + close(p1); + close(p2); + close(c1); + close(c2); + close(cg_fd); + return err; } void running_handler(int a) diff --git a/samples/statx/Makefile b/samples/statx/Makefile index 1f80a3d8cf45..59df7c25a9d1 100644 --- a/samples/statx/Makefile +++ b/samples/statx/Makefile @@ -1,6 +1,3 @@ -# kbuild trick to avoid linker error. Can be omitted if a module is built. -obj- := dummy.o - # List of programs to build hostprogs-$(CONFIG_SAMPLE_STATX) := test-statx diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h index 5bcd91455ec8..80b4a70315b6 100644 --- a/samples/trace_events/trace-events-sample.h +++ b/samples/trace_events/trace-events-sample.h @@ -96,7 +96,7 @@ * __entry->bar.x = y; * __array: There are three fields (type, name, size). The type is the - * type of elements in teh array, the name is the name of the array. + * type of elements in the array, the name is the name of the array. * size is the number of items in the array (not the total size). * * __array( char, foo, 10) is the same as saying: char foo[10]; @@ -113,7 +113,7 @@ * type is the type of the element, name is the name of the array. * The size is different than __array. It is not a static number, * but the algorithm to figure out the length of the array for the - * specific instance of tracepoint. Again, size is the numebr of + * specific instance of tracepoint. Again, size is the number of * items in the array, not the total length in bytes. * * __dynamic_array( int, foo, bar) is similar to: int foo[bar]; @@ -126,9 +126,9 @@ * Notice, that "__entry" is not needed here. * * __string: This is a special kind of __dynamic_array. It expects to - * have a nul terminated character array passed to it (it allows + * have a null terminated character array passed to it (it allows * for NULL too, which would be converted into "(null)"). __string - * takes two paramenter (name, src), where name is the name of + * takes two parameter (name, src), where name is the name of * the string saved, and src is the string to copy into the * ring buffer. * @@ -445,7 +445,7 @@ DECLARE_EVENT_CLASS(foo_template, /* * Here's a better way for the previous samples (except, the first - * exmaple had more fields and could not be used here). + * example had more fields and could not be used here). 
+ * example had more fields and could not be used here).
*/ DEFINE_EVENT(foo_template, foo_with_template_simple, TP_PROTO(const char *foo, int bar), diff --git a/samples/uhid/Makefile b/samples/uhid/Makefile index c95a696560a7..8d7fd6190ac4 100644 --- a/samples/uhid/Makefile +++ b/samples/uhid/Makefile @@ -1,6 +1,3 @@ -# kbuild trick to avoid linker error. Can be omitted if a module is built. -obj- := dummy.o - # List of programs to build hostprogs-y := uhid-example diff --git a/samples/v4l/v4l2-pci-skeleton.c b/samples/v4l/v4l2-pci-skeleton.c index 483e9bca9444..f520e3aef9c6 100644 --- a/samples/v4l/v4l2-pci-skeleton.c +++ b/samples/v4l/v4l2-pci-skeleton.c @@ -772,8 +772,10 @@ static int skeleton_probe(struct pci_dev *pdev, const struct pci_device_id *ent) /* Allocate a new instance */ skel = devm_kzalloc(&pdev->dev, sizeof(struct skeleton), GFP_KERNEL); - if (!skel) - return -ENOMEM; + if (!skel) { + ret = -ENOMEM; + goto disable_pci; + } /* Allocate the interrupt */ ret = devm_request_irq(&pdev->dev, pdev->irq, diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index ca495686b9c3..09f255bdf3ac 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -1413,7 +1413,7 @@ struct attribute_group *mdev_type_groups[] = { NULL, }; -struct mdev_parent_ops mdev_fops = { +static const struct mdev_parent_ops mdev_fops = { .owner = THIS_MODULE, .dev_attr_groups = mtty_dev_groups, .mdev_attr_groups = mdev_dev_groups,
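The shadow variable calls added by the livepatch-shadow samples above all follow one small pattern: attach out-of-band state to an object that already exists, look it up later, and drop it when the object (or the whole livepatch) goes away. Below is a rough sketch of that pattern written against the 4.15-era API the samples use (klp_shadow_get_or_alloc(), klp_shadow_get(), klp_shadow_free(), klp_shadow_free_all()); the structure, id and function names are illustrative only and are not part of this diff.

#include <linux/livepatch.h>
#include <linux/slab.h>	/* for GFP_KERNEL */

/* Arbitrary shadow variable id; it only has to be unique per user */
#define SV_EXAMPLE	1

/* A pre-existing structure whose layout the livepatch must not change */
struct parent_obj {
	unsigned long cookie;
};

/* Attach new per-object state without touching struct parent_obj */
static void example_attach_state(struct parent_obj *p)
{
	int initial = 0;

	/* Also safe for in-flight objects: allocates on first use */
	klp_shadow_get_or_alloc(p, SV_EXAMPLE, &initial, sizeof(initial),
				GFP_KERNEL);
}

/* Look up and release the state when the parent object is torn down */
static void example_release_state(struct parent_obj *p)
{
	int *state = klp_shadow_get(p, SV_EXAMPLE);

	if (state)
		klp_shadow_free(p, SV_EXAMPLE);
}

/* On livepatch unload, drop anything still attached */
static void example_cleanup(void)
{
	klp_shadow_free_all(SV_EXAMPLE);
}

In the samples, the attach and release steps correspond to the patched dummy_alloc()/dummy_check() and dummy_free() replacements, and the free-all call sits in the fix modules' exit functions.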
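The registration flow is likewise identical across the new livepatch samples, so it is worth seeing once in isolation. This is a condensed, hypothetical skeleton assuming the klp_register_patch()/klp_enable_patch() interface used above; the callback bodies, the empty function list and every name here are made up for illustration.

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/livepatch.h>

/* Runs before an object (vmlinux or a target module) is patched */
static int example_pre_patch(struct klp_object *obj)
{
	pr_info("pre-patch: %s\n", obj->name ? obj->name : "vmlinux");
	return 0;	/* a non-zero return blocks patching of this object */
}

/* Runs after an object has been unpatched */
static void example_post_unpatch(struct klp_object *obj)
{
	pr_info("post-unpatch: %s\n", obj->name ? obj->name : "vmlinux");
}

static struct klp_func funcs[] = {
	{ }	/* empty: callbacks only, no functions redirected */
};

static struct klp_object objs[] = {
	{
		.name = NULL,		/* NULL means vmlinux */
		.funcs = funcs,
		.callbacks = {
			.pre_patch = example_pre_patch,
			.post_unpatch = example_post_unpatch,
		},
	}, { }
};

static struct klp_patch patch = {
	.mod = THIS_MODULE,
	.objs = objs,
};

static int example_init(void)
{
	int ret;

	ret = klp_register_patch(&patch);
	if (ret)
		return ret;

	ret = klp_enable_patch(&patch);
	if (ret)
		WARN_ON(klp_unregister_patch(&patch));

	return ret;
}

static void example_exit(void)
{
	WARN_ON(klp_unregister_patch(&patch));
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
MODULE_INFO(livepatch, "Y");

Disabling such a patch before unload goes through /sys/kernel/livepatch/<patch-name>/enabled, exactly as the usage notes in the sample headers describe.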