aboutsummaryrefslogtreecommitdiffstats
path: root/samples
diff options
context:
space:
mode:
Diffstat (limited to 'samples')
-rw-r--r--samples/Kconfig5
-rw-r--r--samples/bpf/Makefile45
-rw-r--r--samples/bpf/README.rst10
-rw-r--r--samples/bpf/bpf_load.c118
-rw-r--r--samples/bpf/bpf_load.h2
-rw-r--r--samples/bpf/cgroup_helpers.c178
-rw-r--r--samples/bpf/cgroup_helpers.h17
-rw-r--r--samples/bpf/map_perf_test_kern.c2
-rw-r--r--samples/bpf/map_perf_test_user.c3
-rw-r--r--samples/bpf/syscall_tp_user.c66
-rw-r--r--samples/bpf/tcbpf2_kern.c167
-rw-r--r--samples/bpf/tcp_basertt_kern.c78
-rw-r--r--samples/bpf/tcp_bpf.readme26
-rw-r--r--samples/bpf/tcp_bufs_kern.c14
-rw-r--r--samples/bpf/tcp_clamp_kern.c24
-rw-r--r--samples/bpf/tcp_cong_kern.c6
-rw-r--r--samples/bpf/tcp_iw_kern.c14
-rw-r--r--samples/bpf/tcp_rwnd_kern.c6
-rw-r--r--samples/bpf/tcp_synrto_kern.c6
-rw-r--r--samples/bpf/test_cgrp2_attach2.c248
-rwxr-xr-xsamples/bpf/test_override_return.sh15
-rwxr-xr-xsamples/bpf/test_tunnel_bpf.sh128
-rw-r--r--samples/bpf/trace_event_kern.c10
-rw-r--r--samples/bpf/trace_event_user.c13
-rw-r--r--samples/bpf/tracex6_kern.c26
-rw-r--r--samples/bpf/tracex6_user.c13
-rw-r--r--samples/bpf/tracex7_kern.c16
-rw-r--r--samples/bpf/tracex7_user.c28
-rw-r--r--samples/bpf/xdp1_user.c12
-rwxr-xr-xsamples/bpf/xdp2skb_meta.sh220
-rw-r--r--samples/bpf/xdp2skb_meta_kern.c105
-rw-r--r--samples/bpf/xdp_monitor_kern.c154
-rw-r--r--samples/bpf/xdp_monitor_user.c479
-rw-r--r--samples/bpf/xdp_redirect_cpu_kern.c609
-rw-r--r--samples/bpf/xdp_redirect_cpu_user.c697
-rw-r--r--samples/bpf/xdp_redirect_map_user.c15
-rw-r--r--samples/bpf/xdp_redirect_user.c8
-rw-r--r--samples/bpf/xdp_router_ipv4_kern.c186
-rw-r--r--samples/bpf/xdp_router_ipv4_user.c660
-rw-r--r--samples/bpf/xdp_rxq_info_kern.c96
-rw-r--r--samples/bpf/xdp_rxq_info_user.c531
-rw-r--r--samples/bpf/xdp_tx_iptunnel_user.c6
-rw-r--r--samples/configfs/configfs_sample.c8
-rw-r--r--samples/connector/cn_test.c6
-rw-r--r--samples/hidraw/Makefile3
-rw-r--r--samples/kobject/kobject-example.c4
-rw-r--r--samples/kobject/kset-example.c4
-rw-r--r--samples/kprobes/Makefile2
-rw-r--r--samples/kprobes/jprobe_example.c67
-rw-r--r--samples/kprobes/kprobe_example.c8
-rw-r--r--samples/livepatch/Makefile6
-rw-r--r--samples/livepatch/livepatch-callbacks-busymod.c72
-rw-r--r--samples/livepatch/livepatch-callbacks-demo.c219
-rw-r--r--samples/livepatch/livepatch-callbacks-mod.c53
-rw-r--r--samples/livepatch/livepatch-sample.c15
-rw-r--r--samples/livepatch/livepatch-shadow-fix1.c158
-rw-r--r--samples/livepatch/livepatch-shadow-fix2.c153
-rw-r--r--samples/livepatch/livepatch-shadow-mod.c224
-rw-r--r--samples/mic/mpssd/mpssd.c6
-rw-r--r--samples/pktgen/functions.sh43
-rwxr-xr-xsamples/pktgen/pktgen.conf-1-1-ip661
-rwxr-xr-xsamples/pktgen/pktgen.conf-1-1-ip6-rdos64
-rwxr-xr-xsamples/pktgen/pktgen.conf-1-270
-rwxr-xr-xsamples/pktgen/pktgen_sample03_burst_single_flow.sh2
-rwxr-xr-xsamples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh97
-rw-r--r--samples/seccomp/Makefile13
-rw-r--r--samples/sockmap/Makefile3
-rw-r--r--samples/sockmap/sockmap_user.c392
-rw-r--r--samples/statx/Makefile3
-rw-r--r--samples/trace_events/trace-events-sample.h10
-rw-r--r--samples/uhid/Makefile3
-rw-r--r--samples/v4l/v4l2-pci-skeleton.c6
-rw-r--r--samples/vfio-mdev/mtty.c2
73 files changed, 5982 insertions, 857 deletions
diff --git a/samples/Kconfig b/samples/Kconfig
index 9cb63188d3ef..c332a3b9de05 100644
--- a/samples/Kconfig
+++ b/samples/Kconfig
@@ -71,11 +71,10 @@ config SAMPLE_RPMSG_CLIENT
the rpmsg bus.
config SAMPLE_LIVEPATCH
- tristate "Build live patching sample -- loadable modules only"
+ tristate "Build live patching samples -- loadable modules only"
depends on LIVEPATCH && m
help
- Builds a sample live patch that replaces the procfs handler
- for /proc/cmdline to print "this has been live patched".
+ Build sample live patch demonstrations.
config SAMPLE_CONFIGFS
tristate "Build configfs patching sample -- loadable modules only"
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 9b4a66e3363e..ec3fc8d88e87 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -1,7 +1,4 @@
# SPDX-License-Identifier: GPL-2.0
-# kbuild trick to avoid linker error. Can be omitted if a module is built.
-obj- := dummy.o
-
# List of programs to build
hostprogs-y := test_lru_dist
hostprogs-y += sock_example
@@ -15,6 +12,7 @@ hostprogs-y += tracex3
hostprogs-y += tracex4
hostprogs-y += tracex5
hostprogs-y += tracex6
+hostprogs-y += tracex7
hostprogs-y += test_probe_write_user
hostprogs-y += trace_output
hostprogs-y += lathist
@@ -29,6 +27,7 @@ hostprogs-y += test_cgrp2_sock
hostprogs-y += test_cgrp2_sock2
hostprogs-y += xdp1
hostprogs-y += xdp2
+hostprogs-y += xdp_router_ipv4
hostprogs-y += test_current_task_under_cgroup
hostprogs-y += trace_event
hostprogs-y += sampleip
@@ -40,11 +39,14 @@ hostprogs-y += per_socket_stats_example
hostprogs-y += load_sock_ops
hostprogs-y += xdp_redirect
hostprogs-y += xdp_redirect_map
+hostprogs-y += xdp_redirect_cpu
hostprogs-y += xdp_monitor
+hostprogs-y += xdp_rxq_info
hostprogs-y += syscall_tp
# Libbpf dependencies
-LIBBPF := ../../tools/lib/bpf/bpf.o
+LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o
+CGROUP_HELPERS := ../../tools/testing/selftests/bpf/cgroup_helpers.o
test_lru_dist-objs := test_lru_dist.o $(LIBBPF)
sock_example-objs := sock_example.o $(LIBBPF)
@@ -58,6 +60,7 @@ tracex3-objs := bpf_load.o $(LIBBPF) tracex3_user.o
tracex4-objs := bpf_load.o $(LIBBPF) tracex4_user.o
tracex5-objs := bpf_load.o $(LIBBPF) tracex5_user.o
tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o
+tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o
load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o
test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o
trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o
@@ -68,13 +71,14 @@ map_perf_test-objs := bpf_load.o $(LIBBPF) map_perf_test_user.o
test_overhead-objs := bpf_load.o $(LIBBPF) test_overhead_user.o
test_cgrp2_array_pin-objs := $(LIBBPF) test_cgrp2_array_pin.o
test_cgrp2_attach-objs := $(LIBBPF) test_cgrp2_attach.o
-test_cgrp2_attach2-objs := $(LIBBPF) test_cgrp2_attach2.o cgroup_helpers.o
+test_cgrp2_attach2-objs := $(LIBBPF) test_cgrp2_attach2.o $(CGROUP_HELPERS)
test_cgrp2_sock-objs := $(LIBBPF) test_cgrp2_sock.o
test_cgrp2_sock2-objs := bpf_load.o $(LIBBPF) test_cgrp2_sock2.o
xdp1-objs := bpf_load.o $(LIBBPF) xdp1_user.o
# reuse xdp1 source intentionally
xdp2-objs := bpf_load.o $(LIBBPF) xdp1_user.o
-test_current_task_under_cgroup-objs := bpf_load.o $(LIBBPF) cgroup_helpers.o \
+xdp_router_ipv4-objs := bpf_load.o $(LIBBPF) xdp_router_ipv4_user.o
+test_current_task_under_cgroup-objs := bpf_load.o $(LIBBPF) $(CGROUP_HELPERS) \
test_current_task_under_cgroup_user.o
trace_event-objs := bpf_load.o $(LIBBPF) trace_event_user.o
sampleip-objs := bpf_load.o $(LIBBPF) sampleip_user.o
@@ -85,7 +89,9 @@ test_map_in_map-objs := bpf_load.o $(LIBBPF) test_map_in_map_user.o
per_socket_stats_example-objs := $(LIBBPF) cookie_uid_helper_example.o
xdp_redirect-objs := bpf_load.o $(LIBBPF) xdp_redirect_user.o
xdp_redirect_map-objs := bpf_load.o $(LIBBPF) xdp_redirect_map_user.o
+xdp_redirect_cpu-objs := bpf_load.o $(LIBBPF) xdp_redirect_cpu_user.o
xdp_monitor-objs := bpf_load.o $(LIBBPF) xdp_monitor_user.o
+xdp_rxq_info-objs := bpf_load.o $(LIBBPF) xdp_rxq_info_user.o
syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o
# Tell kbuild to always build the programs
@@ -99,6 +105,7 @@ always += tracex3_kern.o
always += tracex4_kern.o
always += tracex5_kern.o
always += tracex6_kern.o
+always += tracex7_kern.o
always += sock_flags_kern.o
always += test_probe_write_user_kern.o
always += trace_output_kern.o
@@ -115,6 +122,7 @@ always += parse_varlen.o parse_simple.o parse_ldabs.o
always += test_cgrp2_tc_kern.o
always += xdp1_kern.o
always += xdp2_kern.o
+always += xdp_router_ipv4_kern.o
always += test_current_task_under_cgroup_kern.o
always += trace_event_kern.o
always += sampleip_kern.o
@@ -128,9 +136,13 @@ always += tcp_bufs_kern.o
always += tcp_cong_kern.o
always += tcp_iw_kern.o
always += tcp_clamp_kern.o
+always += tcp_basertt_kern.o
always += xdp_redirect_kern.o
always += xdp_redirect_map_kern.o
+always += xdp_redirect_cpu_kern.o
always += xdp_monitor_kern.o
+always += xdp_rxq_info_kern.o
+always += xdp2skb_meta_kern.o
always += syscall_tp_kern.o
HOSTCFLAGS += -I$(objtree)/usr/include
@@ -150,6 +162,7 @@ HOSTLOADLIBES_tracex3 += -lelf
HOSTLOADLIBES_tracex4 += -lelf -lrt
HOSTLOADLIBES_tracex5 += -lelf
HOSTLOADLIBES_tracex6 += -lelf
+HOSTLOADLIBES_tracex7 += -lelf
HOSTLOADLIBES_test_cgrp2_sock2 += -lelf
HOSTLOADLIBES_load_sock_ops += -lelf
HOSTLOADLIBES_test_probe_write_user += -lelf
@@ -161,6 +174,7 @@ HOSTLOADLIBES_map_perf_test += -lelf -lrt
HOSTLOADLIBES_test_overhead += -lelf -lrt
HOSTLOADLIBES_xdp1 += -lelf
HOSTLOADLIBES_xdp2 += -lelf
+HOSTLOADLIBES_xdp_router_ipv4 += -lelf
HOSTLOADLIBES_test_current_task_under_cgroup += -lelf
HOSTLOADLIBES_trace_event += -lelf
HOSTLOADLIBES_sampleip += -lelf
@@ -170,7 +184,9 @@ HOSTLOADLIBES_xdp_tx_iptunnel += -lelf
HOSTLOADLIBES_test_map_in_map += -lelf
HOSTLOADLIBES_xdp_redirect += -lelf
HOSTLOADLIBES_xdp_redirect_map += -lelf
+HOSTLOADLIBES_xdp_redirect_cpu += -lelf
HOSTLOADLIBES_xdp_monitor += -lelf
+HOSTLOADLIBES_xdp_rxq_info += -lelf
HOSTLOADLIBES_syscall_tp += -lelf
# Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
@@ -178,14 +194,23 @@ HOSTLOADLIBES_syscall_tp += -lelf
LLC ?= llc
CLANG ?= clang
+# Detect that we're cross compiling and use the cross compiler
+ifdef CROSS_COMPILE
+HOSTCC = $(CROSS_COMPILE)gcc
+CLANG_ARCH_ARGS = -target $(ARCH)
+endif
+
# Trick to allow make to be run from this directory
-all:
+all: $(LIBBPF)
$(MAKE) -C ../../ $(CURDIR)/
clean:
$(MAKE) -C ../../ M=$(CURDIR) clean
@rm -f *~
+$(LIBBPF): FORCE
+ $(MAKE) -C $(dir $@) $(notdir $@)
+
$(obj)/syscall_nrs.s: $(src)/syscall_nrs.c
$(call if_changed_dep,cc_s_c)
@@ -225,9 +250,9 @@ $(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h
$(obj)/%.o: $(src)/%.c
$(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) -I$(obj) \
-I$(srctree)/tools/testing/selftests/bpf/ \
- -D__KERNEL__ -D__ASM_SYSREG_H -Wno-unused-value -Wno-pointer-sign \
- -Wno-compare-distinct-pointer-types \
+ -D__KERNEL__ -Wno-unused-value -Wno-pointer-sign \
+ -D__TARGET_ARCH_$(ARCH) -Wno-compare-distinct-pointer-types \
-Wno-gnu-variable-sized-type-not-at-end \
-Wno-address-of-packed-member -Wno-tautological-compare \
- -Wno-unknown-warning-option \
+ -Wno-unknown-warning-option $(CLANG_ARCH_ARGS) \
-O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf -filetype=obj -o $@
diff --git a/samples/bpf/README.rst b/samples/bpf/README.rst
index 79f9a58f1872..5f27e4faca50 100644
--- a/samples/bpf/README.rst
+++ b/samples/bpf/README.rst
@@ -64,3 +64,13 @@ It is also possible to point make to the newly compiled 'llc' or
'clang' command via redefining LLC or CLANG on the make command line::
make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
+
+Cross compiling samples
+-----------------------
+In order to cross-compile, say for arm64 targets, export CROSS_COMPILE and ARCH
+environment variables before calling make. This will direct make to build
+samples for the cross target.
+
+export ARCH=arm64
+export CROSS_COMPILE="aarch64-linux-gnu-"
+make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 2325d7ad76df..69806d74fa53 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -193,8 +193,18 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
return -1;
}
event_fd[prog_cnt - 1] = efd;
- ioctl(efd, PERF_EVENT_IOC_ENABLE, 0);
- ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd);
+ err = ioctl(efd, PERF_EVENT_IOC_ENABLE, 0);
+ if (err < 0) {
+ printf("ioctl PERF_EVENT_IOC_ENABLE failed err %s\n",
+ strerror(errno));
+ return -1;
+ }
+ err = ioctl(efd, PERF_EVENT_IOC_SET_BPF, fd);
+ if (err < 0) {
+ printf("ioctl PERF_EVENT_IOC_SET_BPF failed err %s\n",
+ strerror(errno));
+ return -1;
+ }
return 0;
}
@@ -222,6 +232,7 @@ static int load_maps(struct bpf_map_data *maps, int nr_maps,
int inner_map_fd = map_fd[maps[i].def.inner_map_idx];
map_fd[i] = bpf_create_map_in_map_node(maps[i].def.type,
+ maps[i].name,
maps[i].def.key_size,
inner_map_fd,
maps[i].def.max_entries,
@@ -229,6 +240,7 @@ static int load_maps(struct bpf_map_data *maps, int nr_maps,
numa_node);
} else {
map_fd[i] = bpf_create_map_node(maps[i].def.type,
+ maps[i].name,
maps[i].def.key_size,
maps[i].def.value_size,
maps[i].def.max_entries,
@@ -683,105 +695,3 @@ struct ksym *ksym_search(long key)
return &syms[0];
}
-int set_link_xdp_fd(int ifindex, int fd, __u32 flags)
-{
- struct sockaddr_nl sa;
- int sock, seq = 0, len, ret = -1;
- char buf[4096];
- struct nlattr *nla, *nla_xdp;
- struct {
- struct nlmsghdr nh;
- struct ifinfomsg ifinfo;
- char attrbuf[64];
- } req;
- struct nlmsghdr *nh;
- struct nlmsgerr *err;
-
- memset(&sa, 0, sizeof(sa));
- sa.nl_family = AF_NETLINK;
-
- sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
- if (sock < 0) {
- printf("open netlink socket: %s\n", strerror(errno));
- return -1;
- }
-
- if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
- printf("bind to netlink: %s\n", strerror(errno));
- goto cleanup;
- }
-
- memset(&req, 0, sizeof(req));
- req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
- req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
- req.nh.nlmsg_type = RTM_SETLINK;
- req.nh.nlmsg_pid = 0;
- req.nh.nlmsg_seq = ++seq;
- req.ifinfo.ifi_family = AF_UNSPEC;
- req.ifinfo.ifi_index = ifindex;
-
- /* started nested attribute for XDP */
- nla = (struct nlattr *)(((char *)&req)
- + NLMSG_ALIGN(req.nh.nlmsg_len));
- nla->nla_type = NLA_F_NESTED | 43/*IFLA_XDP*/;
- nla->nla_len = NLA_HDRLEN;
-
- /* add XDP fd */
- nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
- nla_xdp->nla_type = 1/*IFLA_XDP_FD*/;
- nla_xdp->nla_len = NLA_HDRLEN + sizeof(int);
- memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd));
- nla->nla_len += nla_xdp->nla_len;
-
- /* if user passed in any flags, add those too */
- if (flags) {
- nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
- nla_xdp->nla_type = 3/*IFLA_XDP_FLAGS*/;
- nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags);
- memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags));
- nla->nla_len += nla_xdp->nla_len;
- }
-
- req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len);
-
- if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
- printf("send to netlink: %s\n", strerror(errno));
- goto cleanup;
- }
-
- len = recv(sock, buf, sizeof(buf), 0);
- if (len < 0) {
- printf("recv from netlink: %s\n", strerror(errno));
- goto cleanup;
- }
-
- for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
- nh = NLMSG_NEXT(nh, len)) {
- if (nh->nlmsg_pid != getpid()) {
- printf("Wrong pid %d, expected %d\n",
- nh->nlmsg_pid, getpid());
- goto cleanup;
- }
- if (nh->nlmsg_seq != seq) {
- printf("Wrong seq %d, expected %d\n",
- nh->nlmsg_seq, seq);
- goto cleanup;
- }
- switch (nh->nlmsg_type) {
- case NLMSG_ERROR:
- err = (struct nlmsgerr *)NLMSG_DATA(nh);
- if (!err->error)
- continue;
- printf("nlmsg error %s\n", strerror(-err->error));
- goto cleanup;
- case NLMSG_DONE:
- break;
- }
- }
-
- ret = 0;
-
-cleanup:
- close(sock);
- return ret;
-}
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
index 7d57a4248893..453c200b389b 100644
--- a/samples/bpf/bpf_load.h
+++ b/samples/bpf/bpf_load.h
@@ -61,5 +61,5 @@ struct ksym {
int load_kallsyms(void);
struct ksym *ksym_search(long key);
-int set_link_xdp_fd(int ifindex, int fd, __u32 flags);
+int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags);
#endif
diff --git a/samples/bpf/cgroup_helpers.c b/samples/bpf/cgroup_helpers.c
deleted file mode 100644
index 09afaddfc9ba..000000000000
--- a/samples/bpf/cgroup_helpers.c
+++ /dev/null
@@ -1,178 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#define _GNU_SOURCE
-#include <sched.h>
-#include <sys/mount.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <linux/limits.h>
-#include <stdio.h>
-#include <linux/sched.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <ftw.h>
-
-
-#include "cgroup_helpers.h"
-
-/*
- * To avoid relying on the system setup, when setup_cgroup_env is called
- * we create a new mount namespace, and cgroup namespace. The cgroup2
- * root is mounted at CGROUP_MOUNT_PATH
- *
- * Unfortunately, most people don't have cgroupv2 enabled at this point in time.
- * It's easier to create our own mount namespace and manage it ourselves.
- *
- * We assume /mnt exists.
- */
-
-#define WALK_FD_LIMIT 16
-#define CGROUP_MOUNT_PATH "/mnt"
-#define CGROUP_WORK_DIR "/cgroup-test-work-dir"
-#define format_cgroup_path(buf, path) \
- snprintf(buf, sizeof(buf), "%s%s%s", CGROUP_MOUNT_PATH, \
- CGROUP_WORK_DIR, path)
-
-/**
- * setup_cgroup_environment() - Setup the cgroup environment
- *
- * After calling this function, cleanup_cgroup_environment should be called
- * once testing is complete.
- *
- * This function will print an error to stderr and return 1 if it is unable
- * to setup the cgroup environment. If setup is successful, 0 is returned.
- */
-int setup_cgroup_environment(void)
-{
- char cgroup_workdir[PATH_MAX + 1];
-
- format_cgroup_path(cgroup_workdir, "");
-
- if (unshare(CLONE_NEWNS)) {
- log_err("unshare");
- return 1;
- }
-
- if (mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL)) {
- log_err("mount fakeroot");
- return 1;
- }
-
- if (mount("none", CGROUP_MOUNT_PATH, "cgroup2", 0, NULL)) {
- log_err("mount cgroup2");
- return 1;
- }
-
- /* Cleanup existing failed runs, now that the environment is setup */
- cleanup_cgroup_environment();
-
- if (mkdir(cgroup_workdir, 0777) && errno != EEXIST) {
- log_err("mkdir cgroup work dir");
- return 1;
- }
-
- return 0;
-}
-
-static int nftwfunc(const char *filename, const struct stat *statptr,
- int fileflags, struct FTW *pfwt)
-{
- if ((fileflags & FTW_D) && rmdir(filename))
- log_err("Removing cgroup: %s", filename);
- return 0;
-}
-
-
-static int join_cgroup_from_top(char *cgroup_path)
-{
- char cgroup_procs_path[PATH_MAX + 1];
- pid_t pid = getpid();
- int fd, rc = 0;
-
- snprintf(cgroup_procs_path, sizeof(cgroup_procs_path),
- "%s/cgroup.procs", cgroup_path);
-
- fd = open(cgroup_procs_path, O_WRONLY);
- if (fd < 0) {
- log_err("Opening Cgroup Procs: %s", cgroup_procs_path);
- return 1;
- }
-
- if (dprintf(fd, "%d\n", pid) < 0) {
- log_err("Joining Cgroup");
- rc = 1;
- }
-
- close(fd);
- return rc;
-}
-
-/**
- * join_cgroup() - Join a cgroup
- * @path: The cgroup path, relative to the workdir, to join
- *
- * This function expects a cgroup to already be created, relative to the cgroup
- * work dir, and it joins it. For example, passing "/my-cgroup" as the path
- * would actually put the calling process into the cgroup
- * "/cgroup-test-work-dir/my-cgroup"
- *
- * On success, it returns 0, otherwise on failure it returns 1.
- */
-int join_cgroup(char *path)
-{
- char cgroup_path[PATH_MAX + 1];
-
- format_cgroup_path(cgroup_path, path);
- return join_cgroup_from_top(cgroup_path);
-}
-
-/**
- * cleanup_cgroup_environment() - Cleanup Cgroup Testing Environment
- *
- * This is an idempotent function to delete all temporary cgroups that
- * have been created during the test, including the cgroup testing work
- * directory.
- *
- * At call time, it moves the calling process to the root cgroup, and then
- * runs the deletion process. It is idempotent, and should not fail, unless
- * a process is lingering.
- *
- * On failure, it will print an error to stderr, and try to continue.
- */
-void cleanup_cgroup_environment(void)
-{
- char cgroup_workdir[PATH_MAX + 1];
-
- format_cgroup_path(cgroup_workdir, "");
- join_cgroup_from_top(CGROUP_MOUNT_PATH);
- nftw(cgroup_workdir, nftwfunc, WALK_FD_LIMIT, FTW_DEPTH | FTW_MOUNT);
-}
-
-/**
- * create_and_get_cgroup() - Create a cgroup, relative to workdir, and get the FD
- * @path: The cgroup path, relative to the workdir, to join
- *
- * This function creates a cgroup under the top level workdir and returns the
- * file descriptor. It is idempotent.
- *
- * On success, it returns the file descriptor. On failure it returns 0.
- * If there is a failure, it prints the error to stderr.
- */
-int create_and_get_cgroup(char *path)
-{
- char cgroup_path[PATH_MAX + 1];
- int fd;
-
- format_cgroup_path(cgroup_path, path);
- if (mkdir(cgroup_path, 0777) && errno != EEXIST) {
- log_err("mkdiring cgroup");
- return 0;
- }
-
- fd = open(cgroup_path, O_RDONLY);
- if (fd < 0) {
- log_err("Opening Cgroup");
- return 0;
- }
-
- return fd;
-}
diff --git a/samples/bpf/cgroup_helpers.h b/samples/bpf/cgroup_helpers.h
deleted file mode 100644
index 06485e0002b3..000000000000
--- a/samples/bpf/cgroup_helpers.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __CGROUP_HELPERS_H
-#define __CGROUP_HELPERS_H
-#include <errno.h>
-#include <string.h>
-
-#define clean_errno() (errno == 0 ? "None" : strerror(errno))
-#define log_err(MSG, ...) fprintf(stderr, "(%s:%d: errno: %s) " MSG "\n", \
- __FILE__, __LINE__, clean_errno(), ##__VA_ARGS__)
-
-
-int create_and_get_cgroup(char *path);
-int join_cgroup(char *path);
-int setup_cgroup_environment(void);
-void cleanup_cgroup_environment(void);
-
-#endif
diff --git a/samples/bpf/map_perf_test_kern.c b/samples/bpf/map_perf_test_kern.c
index 098c857f1eda..2b2ffb97018b 100644
--- a/samples/bpf/map_perf_test_kern.c
+++ b/samples/bpf/map_perf_test_kern.c
@@ -266,7 +266,7 @@ int stress_hash_map_lookup(struct pt_regs *ctx)
return 0;
}
-SEC("kprobe/sys_getpgrp")
+SEC("kprobe/sys_getppid")
int stress_array_map_lookup(struct pt_regs *ctx)
{
u32 key = 1, i;
diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c
index f388254896f6..519d9af4b04a 100644
--- a/samples/bpf/map_perf_test_user.c
+++ b/samples/bpf/map_perf_test_user.c
@@ -137,6 +137,7 @@ static void do_test_lru(enum test_type test, int cpu)
inner_lru_map_fds[cpu] =
bpf_create_map_node(BPF_MAP_TYPE_LRU_HASH,
+ test_map_names[INNER_LRU_HASH_PREALLOC],
sizeof(uint32_t),
sizeof(long),
inner_lru_hash_size, 0,
@@ -282,7 +283,7 @@ static void test_array_lookup(int cpu)
start_time = time_get_ns();
for (i = 0; i < max_cnt; i++)
- syscall(__NR_getpgrp, 0);
+ syscall(__NR_getppid, 0);
printf("%d:array_lookup %lld lookups per sec\n",
cpu, max_cnt * 1000000000ll * 64 / (time_get_ns() - start_time));
}
diff --git a/samples/bpf/syscall_tp_user.c b/samples/bpf/syscall_tp_user.c
index a3cb91ebf4e7..9169d3207f18 100644
--- a/samples/bpf/syscall_tp_user.c
+++ b/samples/bpf/syscall_tp_user.c
@@ -23,6 +23,13 @@
* This requires kernel CONFIG_FTRACE_SYSCALLS to be set.
*/
+static void usage(const char *cmd)
+{
+ printf("USAGE: %s [-i num_progs] [-h]\n", cmd);
+ printf(" -i num_progs # number of progs of the test\n");
+ printf(" -h # help\n");
+}
+
static void verify_map(int map_id)
{
__u32 key = 0;
@@ -32,22 +39,29 @@ static void verify_map(int map_id)
fprintf(stderr, "map_lookup failed: %s\n", strerror(errno));
return;
}
- if (val == 0)
+ if (val == 0) {
fprintf(stderr, "failed: map #%d returns value 0\n", map_id);
+ return;
+ }
+ val = 0;
+ if (bpf_map_update_elem(map_id, &key, &val, BPF_ANY) != 0) {
+ fprintf(stderr, "map_update failed: %s\n", strerror(errno));
+ return;
+ }
}
-int main(int argc, char **argv)
+static int test(char *filename, int num_progs)
{
- struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
- char filename[256];
- int fd;
+ int i, fd, map0_fds[num_progs], map1_fds[num_progs];
- setrlimit(RLIMIT_MEMLOCK, &r);
- snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
-
- if (load_bpf_file(filename)) {
- fprintf(stderr, "%s", bpf_log_buf);
- return 1;
+ for (i = 0; i < num_progs; i++) {
+ if (load_bpf_file(filename)) {
+ fprintf(stderr, "%s", bpf_log_buf);
+ return 1;
+ }
+ printf("prog #%d: map ids %d %d\n", i, map_fd[0], map_fd[1]);
+ map0_fds[i] = map_fd[0];
+ map1_fds[i] = map_fd[1];
}
/* current load_bpf_file has perf_event_open default pid = -1
@@ -64,8 +78,34 @@ int main(int argc, char **argv)
close(fd);
/* verify the map */
- verify_map(map_fd[0]);
- verify_map(map_fd[1]);
+ for (i = 0; i < num_progs; i++) {
+ verify_map(map0_fds[i]);
+ verify_map(map1_fds[i]);
+ }
return 0;
}
+
+int main(int argc, char **argv)
+{
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ int opt, num_progs = 1;
+ char filename[256];
+
+ while ((opt = getopt(argc, argv, "i:h")) != -1) {
+ switch (opt) {
+ case 'i':
+ num_progs = atoi(optarg);
+ break;
+ case 'h':
+ default:
+ usage(argv[0]);
+ return 0;
+ }
+ }
+
+ setrlimit(RLIMIT_MEMLOCK, &r);
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ return test(filename, num_progs);
+}
diff --git a/samples/bpf/tcbpf2_kern.c b/samples/bpf/tcbpf2_kern.c
index 370b749f5ee6..efdc16d195ff 100644
--- a/samples/bpf/tcbpf2_kern.c
+++ b/samples/bpf/tcbpf2_kern.c
@@ -15,6 +15,7 @@
#include <uapi/linux/tcp.h>
#include <uapi/linux/filter.h>
#include <uapi/linux/pkt_cls.h>
+#include <uapi/linux/erspan.h>
#include <net/ipv6.h>
#include "bpf_helpers.h"
#include "bpf_endian.h"
@@ -39,10 +40,6 @@ struct vxlan_metadata {
u32 gbp;
};
-struct erspan_metadata {
- __be32 index;
-};
-
SEC("gre_set_tunnel")
int _gre_set_tunnel(struct __sk_buff *skb)
{
@@ -81,6 +78,49 @@ int _gre_get_tunnel(struct __sk_buff *skb)
return TC_ACT_OK;
}
+SEC("ip6gretap_set_tunnel")
+int _ip6gretap_set_tunnel(struct __sk_buff *skb)
+{
+ struct bpf_tunnel_key key;
+ int ret;
+
+ __builtin_memset(&key, 0x0, sizeof(key));
+ key.remote_ipv6[3] = _htonl(0x11); /* ::11 */
+ key.tunnel_id = 2;
+ key.tunnel_tos = 0;
+ key.tunnel_ttl = 64;
+ key.tunnel_label = 0xabcde;
+
+ ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
+ BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX);
+ if (ret < 0) {
+ ERROR(ret);
+ return TC_ACT_SHOT;
+ }
+
+ return TC_ACT_OK;
+}
+
+SEC("ip6gretap_get_tunnel")
+int _ip6gretap_get_tunnel(struct __sk_buff *skb)
+{
+ char fmt[] = "key %d remote ip6 ::%x label %x\n";
+ struct bpf_tunnel_key key;
+ int ret;
+
+ ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key),
+ BPF_F_TUNINFO_IPV6);
+ if (ret < 0) {
+ ERROR(ret);
+ return TC_ACT_SHOT;
+ }
+
+ bpf_trace_printk(fmt, sizeof(fmt),
+ key.tunnel_id, key.remote_ipv6[3], key.tunnel_label);
+
+ return TC_ACT_OK;
+}
+
SEC("erspan_set_tunnel")
int _erspan_set_tunnel(struct __sk_buff *skb)
{
@@ -100,7 +140,20 @@ int _erspan_set_tunnel(struct __sk_buff *skb)
return TC_ACT_SHOT;
}
- md.index = htonl(123);
+ __builtin_memset(&md, 0, sizeof(md));
+#ifdef ERSPAN_V1
+ md.version = 1;
+ md.u.index = bpf_htonl(123);
+#else
+ u8 direction = 1;
+ u8 hwid = 7;
+
+ md.version = 2;
+ md.u.md2.dir = direction;
+ md.u.md2.hwid = hwid & 0xf;
+ md.u.md2.hwid_upper = (hwid >> 4) & 0x3;
+#endif
+
ret = bpf_skb_set_tunnel_opt(skb, &md, sizeof(md));
if (ret < 0) {
ERROR(ret);
@@ -113,7 +166,7 @@ int _erspan_set_tunnel(struct __sk_buff *skb)
SEC("erspan_get_tunnel")
int _erspan_get_tunnel(struct __sk_buff *skb)
{
- char fmt[] = "key %d remote ip 0x%x erspan index 0x%x\n";
+ char fmt[] = "key %d remote ip 0x%x erspan version %d\n";
struct bpf_tunnel_key key;
struct erspan_metadata md;
u32 index;
@@ -131,9 +184,107 @@ int _erspan_get_tunnel(struct __sk_buff *skb)
return TC_ACT_SHOT;
}
- index = bpf_ntohl(md.index);
bpf_trace_printk(fmt, sizeof(fmt),
- key.tunnel_id, key.remote_ipv4, index);
+ key.tunnel_id, key.remote_ipv4, md.version);
+
+#ifdef ERSPAN_V1
+ char fmt2[] = "\tindex %x\n";
+
+ index = bpf_ntohl(md.u.index);
+ bpf_trace_printk(fmt2, sizeof(fmt2), index);
+#else
+ char fmt2[] = "\tdirection %d hwid %x timestamp %u\n";
+
+ bpf_trace_printk(fmt2, sizeof(fmt2),
+ md.u.md2.dir,
+ (md.u.md2.hwid_upper << 4) + md.u.md2.hwid,
+ bpf_ntohl(md.u.md2.timestamp));
+#endif
+
+ return TC_ACT_OK;
+}
+
+SEC("ip4ip6erspan_set_tunnel")
+int _ip4ip6erspan_set_tunnel(struct __sk_buff *skb)
+{
+ struct bpf_tunnel_key key;
+ struct erspan_metadata md;
+ int ret;
+
+ __builtin_memset(&key, 0x0, sizeof(key));
+ key.remote_ipv6[3] = _htonl(0x11);
+ key.tunnel_id = 2;
+ key.tunnel_tos = 0;
+ key.tunnel_ttl = 64;
+
+ ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
+ BPF_F_TUNINFO_IPV6);
+ if (ret < 0) {
+ ERROR(ret);
+ return TC_ACT_SHOT;
+ }
+
+ __builtin_memset(&md, 0, sizeof(md));
+
+#ifdef ERSPAN_V1
+ md.u.index = htonl(123);
+ md.version = 1;
+#else
+ u8 direction = 0;
+ u8 hwid = 17;
+
+ md.version = 2;
+ md.u.md2.dir = direction;
+ md.u.md2.hwid = hwid & 0xf;
+ md.u.md2.hwid_upper = (hwid >> 4) & 0x3;
+#endif
+
+ ret = bpf_skb_set_tunnel_opt(skb, &md, sizeof(md));
+ if (ret < 0) {
+ ERROR(ret);
+ return TC_ACT_SHOT;
+ }
+
+ return TC_ACT_OK;
+}
+
+SEC("ip4ip6erspan_get_tunnel")
+int _ip4ip6erspan_get_tunnel(struct __sk_buff *skb)
+{
+ char fmt[] = "ip6erspan get key %d remote ip6 ::%x erspan version %d\n";
+ struct bpf_tunnel_key key;
+ struct erspan_metadata md;
+ u32 index;
+ int ret;
+
+ ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), BPF_F_TUNINFO_IPV6);
+ if (ret < 0) {
+ ERROR(ret);
+ return TC_ACT_SHOT;
+ }
+
+ ret = bpf_skb_get_tunnel_opt(skb, &md, sizeof(md));
+ if (ret < 0) {
+ ERROR(ret);
+ return TC_ACT_SHOT;
+ }
+
+ bpf_trace_printk(fmt, sizeof(fmt),
+ key.tunnel_id, key.remote_ipv4, md.version);
+
+#ifdef ERSPAN_V1
+ char fmt2[] = "\tindex %x\n";
+
+ index = bpf_ntohl(md.u.index);
+ bpf_trace_printk(fmt2, sizeof(fmt2), index);
+#else
+ char fmt2[] = "\tdirection %d hwid %x timestamp %u\n";
+
+ bpf_trace_printk(fmt2, sizeof(fmt2),
+ md.u.md2.dir,
+ (md.u.md2.hwid_upper << 4) + md.u.md2.hwid,
+ bpf_ntohl(md.u.md2.timestamp));
+#endif
return TC_ACT_OK;
}
diff --git a/samples/bpf/tcp_basertt_kern.c b/samples/bpf/tcp_basertt_kern.c
new file mode 100644
index 000000000000..4bf4fc597db9
--- /dev/null
+++ b/samples/bpf/tcp_basertt_kern.c
@@ -0,0 +1,78 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * BPF program to set base_rtt to 80us when host is running TCP-NV and
+ * both hosts are in the same datacenter (as determined by IPv6 prefix).
+ *
+ * Use load_sock_ops to load this BPF program.
+ */
+
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/tcp.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <linux/socket.h>
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+#define DEBUG 1
+
+#define bpf_printk(fmt, ...) \
+({ \
+ char ____fmt[] = fmt; \
+ bpf_trace_printk(____fmt, sizeof(____fmt), \
+ ##__VA_ARGS__); \
+})
+
+SEC("sockops")
+int bpf_basertt(struct bpf_sock_ops *skops)
+{
+ char cong[20];
+ char nv[] = "nv";
+ int rv = 0, n;
+ int op;
+
+ op = (int) skops->op;
+
+#ifdef DEBUG
+ bpf_printk("BPF command: %d\n", op);
+#endif
+
+ /* Check if both hosts are in the same datacenter. For this
+ * example they are if the 1st 5.5 bytes in the IPv6 address
+ * are the same.
+ */
+ if (skops->family == AF_INET6 &&
+ skops->local_ip6[0] == skops->remote_ip6[0] &&
+ (bpf_ntohl(skops->local_ip6[1]) & 0xfff00000) ==
+ (bpf_ntohl(skops->remote_ip6[1]) & 0xfff00000)) {
+ switch (op) {
+ case BPF_SOCK_OPS_BASE_RTT:
+ n = bpf_getsockopt(skops, SOL_TCP, TCP_CONGESTION,
+ cong, sizeof(cong));
+ if (!n && !__builtin_memcmp(cong, nv, sizeof(nv)+1)) {
+ /* Set base_rtt to 80us */
+ rv = 80;
+ } else if (n) {
+ rv = n;
+ } else {
+ rv = -1;
+ }
+ break;
+ default:
+ rv = -1;
+ }
+ } else {
+ rv = -1;
+ }
+#ifdef DEBUG
+ bpf_printk("Returning %d\n", rv);
+#endif
+ skops->reply = rv;
+ return 1;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tcp_bpf.readme b/samples/bpf/tcp_bpf.readme
new file mode 100644
index 000000000000..831fb601e3c9
--- /dev/null
+++ b/samples/bpf/tcp_bpf.readme
@@ -0,0 +1,26 @@
+This file describes how to run the tcp_*_kern.o tcp_bpf (or socket_ops)
+programs. These programs attach to a cgroupv2. The following commands create
+a cgroupv2 and attach a bash shell to the group.
+
+ mkdir -p /tmp/cgroupv2
+ mount -t cgroup2 none /tmp/cgroupv2
+ mkdir -p /tmp/cgroupv2/foo
+ bash
+ echo $$ >> /tmp/cgroupv2/foo/cgroup.procs
+
+Anything that runs under this shell belongs to the foo cgroupv2 To load
+(attach) one of the tcp_*_kern.o programs:
+
+ ./load_sock_ops -l /tmp/cgroupv2/foo tcp_basertt_kern.o
+
+If the "-l" flag is used, the load_sock_ops program will continue to run
+printing the BPF log buffer. The tcp_*_kern.o programs use special print
+functions to print logging information (if enabled by the ifdef).
+
+If using netperf/netserver to create traffic, you need to run them under the
+cgroupv2 to which the BPF programs are attached (i.e. under bash shell
+attached to the cgroupv2).
+
+To remove (unattach) a socket_ops BPF program from a cgroupv2:
+
+ ./load_sock_ops -r /tmp/cgroupv2/foo
diff --git a/samples/bpf/tcp_bufs_kern.c b/samples/bpf/tcp_bufs_kern.c
index ee83bbabd17c..0566b7fa38a1 100644
--- a/samples/bpf/tcp_bufs_kern.c
+++ b/samples/bpf/tcp_bufs_kern.c
@@ -41,8 +41,10 @@ int bpf_bufs(struct bpf_sock_ops *skops)
* if neither port numberis 55601
*/
if (bpf_ntohl(skops->remote_port) != 55601 &&
- skops->local_port != 55601)
- return -1;
+ skops->local_port != 55601) {
+ skops->reply = -1;
+ return 1;
+ }
op = (int) skops->op;
@@ -61,8 +63,8 @@ int bpf_bufs(struct bpf_sock_ops *skops)
/* Set sndbuf and rcvbuf of active connections */
rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize,
sizeof(bufsize));
- rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF,
- &bufsize, sizeof(bufsize));
+ rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF,
+ &bufsize, sizeof(bufsize));
break;
case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
/* Nothing to do */
@@ -71,8 +73,8 @@ int bpf_bufs(struct bpf_sock_ops *skops)
/* Set sndbuf and rcvbuf of passive connections */
rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize,
sizeof(bufsize));
- rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF,
- &bufsize, sizeof(bufsize));
+ rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF,
+ &bufsize, sizeof(bufsize));
break;
default:
rv = -1;
diff --git a/samples/bpf/tcp_clamp_kern.c b/samples/bpf/tcp_clamp_kern.c
index d68eadd9ca2d..f4225c9d2c0c 100644
--- a/samples/bpf/tcp_clamp_kern.c
+++ b/samples/bpf/tcp_clamp_kern.c
@@ -41,8 +41,10 @@ int bpf_clamp(struct bpf_sock_ops *skops)
/* For testing purposes, only execute rest of BPF program
* if neither port numberis 55601
*/
- if (bpf_ntohl(skops->remote_port) != 55601 && skops->local_port != 55601)
- return -1;
+ if (bpf_ntohl(skops->remote_port) != 55601 && skops->local_port != 55601) {
+ skops->reply = -1;
+ return 0;
+ }
op = (int) skops->op;
@@ -66,9 +68,9 @@ int bpf_clamp(struct bpf_sock_ops *skops)
/* Set sndbuf and rcvbuf of active connections */
rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF,
&bufsize, sizeof(bufsize));
- rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET,
- SO_RCVBUF, &bufsize,
- sizeof(bufsize));
+ rv += bpf_setsockopt(skops, SOL_SOCKET,
+ SO_RCVBUF, &bufsize,
+ sizeof(bufsize));
break;
case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
rv = bpf_setsockopt(skops, SOL_TCP,
@@ -80,12 +82,12 @@ int bpf_clamp(struct bpf_sock_ops *skops)
rv = bpf_setsockopt(skops, SOL_TCP,
TCP_BPF_SNDCWND_CLAMP,
&clamp, sizeof(clamp));
- rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET,
- SO_SNDBUF, &bufsize,
- sizeof(bufsize));
- rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET,
- SO_RCVBUF, &bufsize,
- sizeof(bufsize));
+ rv += bpf_setsockopt(skops, SOL_SOCKET,
+ SO_SNDBUF, &bufsize,
+ sizeof(bufsize));
+ rv += bpf_setsockopt(skops, SOL_SOCKET,
+ SO_RCVBUF, &bufsize,
+ sizeof(bufsize));
break;
default:
rv = -1;
diff --git a/samples/bpf/tcp_cong_kern.c b/samples/bpf/tcp_cong_kern.c
index dac15bce1fa9..ad0f1ba8206a 100644
--- a/samples/bpf/tcp_cong_kern.c
+++ b/samples/bpf/tcp_cong_kern.c
@@ -39,8 +39,10 @@ int bpf_cong(struct bpf_sock_ops *skops)
* if neither port numberis 55601
*/
if (bpf_ntohl(skops->remote_port) != 55601 &&
- skops->local_port != 55601)
- return -1;
+ skops->local_port != 55601) {
+ skops->reply = -1;
+ return 1;
+ }
op = (int) skops->op;
diff --git a/samples/bpf/tcp_iw_kern.c b/samples/bpf/tcp_iw_kern.c
index 23c5122ef819..4ca5ecc9f580 100644
--- a/samples/bpf/tcp_iw_kern.c
+++ b/samples/bpf/tcp_iw_kern.c
@@ -42,8 +42,10 @@ int bpf_iw(struct bpf_sock_ops *skops)
* if neither port numberis 55601
*/
if (bpf_ntohl(skops->remote_port) != 55601 &&
- skops->local_port != 55601)
- return -1;
+ skops->local_port != 55601) {
+ skops->reply = -1;
+ return 1;
+ }
op = (int) skops->op;
@@ -62,8 +64,8 @@ int bpf_iw(struct bpf_sock_ops *skops)
/* Set sndbuf and rcvbuf of active connections */
rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize,
sizeof(bufsize));
- rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF,
- &bufsize, sizeof(bufsize));
+ rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF,
+ &bufsize, sizeof(bufsize));
break;
case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
rv = bpf_setsockopt(skops, SOL_TCP, TCP_BPF_IW, &iw,
@@ -73,8 +75,8 @@ int bpf_iw(struct bpf_sock_ops *skops)
/* Set sndbuf and rcvbuf of passive connections */
rv = bpf_setsockopt(skops, SOL_SOCKET, SO_SNDBUF, &bufsize,
sizeof(bufsize));
- rv = rv*100 + bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF,
- &bufsize, sizeof(bufsize));
+ rv += bpf_setsockopt(skops, SOL_SOCKET, SO_RCVBUF,
+ &bufsize, sizeof(bufsize));
break;
default:
rv = -1;
diff --git a/samples/bpf/tcp_rwnd_kern.c b/samples/bpf/tcp_rwnd_kern.c
index 3f2a228f81ce..09ff65b40b31 100644
--- a/samples/bpf/tcp_rwnd_kern.c
+++ b/samples/bpf/tcp_rwnd_kern.c
@@ -38,8 +38,10 @@ int bpf_rwnd(struct bpf_sock_ops *skops)
* if neither port numberis 55601
*/
if (bpf_ntohl(skops->remote_port) !=
- 55601 && skops->local_port != 55601)
- return -1;
+ 55601 && skops->local_port != 55601) {
+ skops->reply = -1;
+ return 1;
+ }
op = (int) skops->op;
diff --git a/samples/bpf/tcp_synrto_kern.c b/samples/bpf/tcp_synrto_kern.c
index 3c3fc83d81cb..232bb242823e 100644
--- a/samples/bpf/tcp_synrto_kern.c
+++ b/samples/bpf/tcp_synrto_kern.c
@@ -38,8 +38,10 @@ int bpf_synrto(struct bpf_sock_ops *skops)
* if neither port numberis 55601
*/
if (bpf_ntohl(skops->remote_port) != 55601 &&
- skops->local_port != 55601)
- return -1;
+ skops->local_port != 55601) {
+ skops->reply = -1;
+ return 1;
+ }
op = (int) skops->op;
diff --git a/samples/bpf/test_cgrp2_attach2.c b/samples/bpf/test_cgrp2_attach2.c
index 3049b1f26267..1af412ec6007 100644
--- a/samples/bpf/test_cgrp2_attach2.c
+++ b/samples/bpf/test_cgrp2_attach2.c
@@ -30,7 +30,7 @@
#define FOO "/foo"
#define BAR "/foo/bar/"
-#define PING_CMD "ping -c1 -w1 127.0.0.1"
+#define PING_CMD "ping -c1 -w1 127.0.0.1 > /dev/null"
char bpf_log_buf[BPF_LOG_BUF_SIZE];
@@ -55,8 +55,7 @@ static int prog_load(int verdict)
return ret;
}
-
-int main(int argc, char **argv)
+static int test_foo_bar(void)
{
int drop_prog, allow_prog, foo = 0, bar = 0, rc = 0;
@@ -79,7 +78,8 @@ int main(int argc, char **argv)
if (join_cgroup(FOO))
goto err;
- if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS, 1)) {
+ if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS,
+ BPF_F_ALLOW_OVERRIDE)) {
log_err("Attaching prog to /foo");
goto err;
}
@@ -98,7 +98,8 @@ int main(int argc, char **argv)
printf("Attached DROP prog. This ping in cgroup /foo/bar should fail...\n");
assert(system(PING_CMD) != 0);
- if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) {
+ if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS,
+ BPF_F_ALLOW_OVERRIDE)) {
log_err("Attaching prog to /foo/bar");
goto err;
}
@@ -115,7 +116,8 @@ int main(int argc, char **argv)
"This ping in cgroup /foo/bar should fail...\n");
assert(system(PING_CMD) != 0);
- if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) {
+ if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS,
+ BPF_F_ALLOW_OVERRIDE)) {
log_err("Attaching prog to /foo/bar");
goto err;
}
@@ -129,7 +131,8 @@ int main(int argc, char **argv)
"This ping in cgroup /foo/bar should pass...\n");
assert(system(PING_CMD) == 0);
- if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) {
+ if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS,
+ BPF_F_ALLOW_OVERRIDE)) {
log_err("Attaching prog to /foo/bar");
goto err;
}
@@ -162,13 +165,15 @@ int main(int argc, char **argv)
goto err;
}
- if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 1)) {
+ if (!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS,
+ BPF_F_ALLOW_OVERRIDE)) {
errno = 0;
log_err("Unexpected success attaching overridable prog to /foo/bar");
goto err;
}
- if (!bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS, 1)) {
+ if (!bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS,
+ BPF_F_ALLOW_OVERRIDE)) {
errno = 0;
log_err("Unexpected success attaching overridable prog to /foo");
goto err;
@@ -189,8 +194,229 @@ out:
close(bar);
cleanup_cgroup_environment();
if (!rc)
- printf("PASS\n");
+ printf("### override:PASS\n");
else
- printf("FAIL\n");
+ printf("### override:FAIL\n");
return rc;
}
+
+static int map_fd = -1;
+
+static int prog_load_cnt(int verdict, int val)
+{
+ if (map_fd < 0)
+ map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0);
+ if (map_fd < 0) {
+ printf("failed to create map '%s'\n", strerror(errno));
+ return -1;
+ }
+
+ struct bpf_insn prog[] = {
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
+ BPF_LD_MAP_FD(BPF_REG_1, map_fd),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_1, val), /* r1 = 1 */
+ BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
+ BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */
+ BPF_EXIT_INSN(),
+ };
+ size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn);
+ int ret;
+
+ ret = bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB,
+ prog, insns_cnt, "GPL", 0,
+ bpf_log_buf, BPF_LOG_BUF_SIZE);
+
+ if (ret < 0) {
+ log_err("Loading program");
+ printf("Output from verifier:\n%s\n-------\n", bpf_log_buf);
+ return 0;
+ }
+ return ret;
+}
+
+
+static int test_multiprog(void)
+{
+ __u32 prog_ids[4], prog_cnt = 0, attach_flags, saved_prog_id;
+ int cg1 = 0, cg2 = 0, cg3 = 0, cg4 = 0, cg5 = 0, key = 0;
+ int drop_prog, allow_prog[6] = {}, rc = 0;
+ unsigned long long value;
+ int i = 0;
+
+ for (i = 0; i < 6; i++) {
+ allow_prog[i] = prog_load_cnt(1, 1 << i);
+ if (!allow_prog[i])
+ goto err;
+ }
+ drop_prog = prog_load_cnt(0, 1);
+ if (!drop_prog)
+ goto err;
+
+ if (setup_cgroup_environment())
+ goto err;
+
+ cg1 = create_and_get_cgroup("/cg1");
+ if (!cg1)
+ goto err;
+ cg2 = create_and_get_cgroup("/cg1/cg2");
+ if (!cg2)
+ goto err;
+ cg3 = create_and_get_cgroup("/cg1/cg2/cg3");
+ if (!cg3)
+ goto err;
+ cg4 = create_and_get_cgroup("/cg1/cg2/cg3/cg4");
+ if (!cg4)
+ goto err;
+ cg5 = create_and_get_cgroup("/cg1/cg2/cg3/cg4/cg5");
+ if (!cg5)
+ goto err;
+
+ if (join_cgroup("/cg1/cg2/cg3/cg4/cg5"))
+ goto err;
+
+ if (bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS,
+ BPF_F_ALLOW_MULTI)) {
+ log_err("Attaching prog to cg1");
+ goto err;
+ }
+ if (!bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS,
+ BPF_F_ALLOW_MULTI)) {
+ log_err("Unexpected success attaching the same prog to cg1");
+ goto err;
+ }
+ if (bpf_prog_attach(allow_prog[1], cg1, BPF_CGROUP_INET_EGRESS,
+ BPF_F_ALLOW_MULTI)) {
+ log_err("Attaching prog2 to cg1");
+ goto err;
+ }
+ if (bpf_prog_attach(allow_prog[2], cg2, BPF_CGROUP_INET_EGRESS,
+ BPF_F_ALLOW_OVERRIDE)) {
+ log_err("Attaching prog to cg2");
+ goto err;
+ }
+ if (bpf_prog_attach(allow_prog[3], cg3, BPF_CGROUP_INET_EGRESS,
+ BPF_F_ALLOW_MULTI)) {
+ log_err("Attaching prog to cg3");
+ goto err;
+ }
+ if (bpf_prog_attach(allow_prog[4], cg4, BPF_CGROUP_INET_EGRESS,
+ BPF_F_ALLOW_OVERRIDE)) {
+ log_err("Attaching prog to cg4");
+ goto err;
+ }
+ if (bpf_prog_attach(allow_prog[5], cg5, BPF_CGROUP_INET_EGRESS, 0)) {
+ log_err("Attaching prog to cg5");
+ goto err;
+ }
+ assert(system(PING_CMD) == 0);
+ assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0);
+ assert(value == 1 + 2 + 8 + 32);
+
+ /* query the number of effective progs in cg5 */
+ assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE,
+ NULL, NULL, &prog_cnt) == 0);
+ assert(prog_cnt == 4);
+ /* retrieve prog_ids of effective progs in cg5 */
+ assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE,
+ &attach_flags, prog_ids, &prog_cnt) == 0);
+ assert(prog_cnt == 4);
+ assert(attach_flags == 0);
+ saved_prog_id = prog_ids[0];
+ /* check enospc handling */
+ prog_ids[0] = 0;
+ prog_cnt = 2;
+ assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE,
+ &attach_flags, prog_ids, &prog_cnt) == -1 &&
+ errno == ENOSPC);
+ assert(prog_cnt == 4);
+ /* check that prog_ids are returned even when buffer is too small */
+ assert(prog_ids[0] == saved_prog_id);
+ /* retrieve prog_id of single attached prog in cg5 */
+ prog_ids[0] = 0;
+ assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, 0,
+ NULL, prog_ids, &prog_cnt) == 0);
+ assert(prog_cnt == 1);
+ assert(prog_ids[0] == saved_prog_id);
+
+ /* detach bottom program and ping again */
+ if (bpf_prog_detach2(-1, cg5, BPF_CGROUP_INET_EGRESS)) {
+ log_err("Detaching prog from cg5");
+ goto err;
+ }
+ value = 0;
+ assert(bpf_map_update_elem(map_fd, &key, &value, 0) == 0);
+ assert(system(PING_CMD) == 0);
+ assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0);
+ assert(value == 1 + 2 + 8 + 16);
+
+ /* detach 3rd from bottom program and ping again */
+ errno = 0;
+ if (!bpf_prog_detach2(0, cg3, BPF_CGROUP_INET_EGRESS)) {
+ log_err("Unexpected success on detach from cg3");
+ goto err;
+ }
+ if (bpf_prog_detach2(allow_prog[3], cg3, BPF_CGROUP_INET_EGRESS)) {
+ log_err("Detaching from cg3");
+ goto err;
+ }
+ value = 0;
+ assert(bpf_map_update_elem(map_fd, &key, &value, 0) == 0);
+ assert(system(PING_CMD) == 0);
+ assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0);
+ assert(value == 1 + 2 + 16);
+
+ /* detach 2nd from bottom program and ping again */
+ if (bpf_prog_detach2(-1, cg4, BPF_CGROUP_INET_EGRESS)) {
+ log_err("Detaching prog from cg4");
+ goto err;
+ }
+ value = 0;
+ assert(bpf_map_update_elem(map_fd, &key, &value, 0) == 0);
+ assert(system(PING_CMD) == 0);
+ assert(bpf_map_lookup_elem(map_fd, &key, &value) == 0);
+ assert(value == 1 + 2 + 4);
+
+ prog_cnt = 4;
+ assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE,
+ &attach_flags, prog_ids, &prog_cnt) == 0);
+ assert(prog_cnt == 3);
+ assert(attach_flags == 0);
+ assert(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, 0,
+ NULL, prog_ids, &prog_cnt) == 0);
+ assert(prog_cnt == 0);
+ goto out;
+err:
+ rc = 1;
+
+out:
+ for (i = 0; i < 6; i++)
+ if (allow_prog[i] > 0)
+ close(allow_prog[i]);
+ close(cg1);
+ close(cg2);
+ close(cg3);
+ close(cg4);
+ close(cg5);
+ cleanup_cgroup_environment();
+ if (!rc)
+ printf("### multi:PASS\n");
+ else
+ printf("### multi:FAIL\n");
+ return rc;
+}
+
+int main(int argc, char **argv)
+{
+ int rc = 0;
+
+ rc = test_foo_bar();
+ if (rc)
+ return rc;
+
+ return test_multiprog();
+}
diff --git a/samples/bpf/test_override_return.sh b/samples/bpf/test_override_return.sh
new file mode 100755
index 000000000000..e68b9ee6814b
--- /dev/null
+++ b/samples/bpf/test_override_return.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+rm -f testfile.img
+dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1
+DEVICE=$(losetup --show -f testfile.img)
+mkfs.btrfs -f $DEVICE
+mkdir tmpmnt
+./tracex7 $DEVICE
+if [ $? -eq 0 ]
+then
+ echo "SUCCESS!"
+else
+ echo "FAILED!"
+fi
+losetup -d $DEVICE
diff --git a/samples/bpf/test_tunnel_bpf.sh b/samples/bpf/test_tunnel_bpf.sh
index 312e1722a39f..43ce049996ee 100755
--- a/samples/bpf/test_tunnel_bpf.sh
+++ b/samples/bpf/test_tunnel_bpf.sh
@@ -33,10 +33,43 @@ function add_gre_tunnel {
ip addr add dev $DEV 10.1.1.200/24
}
-function add_erspan_tunnel {
+function add_ip6gretap_tunnel {
+
+ # assign ipv6 address
+ ip netns exec at_ns0 ip addr add ::11/96 dev veth0
+ ip netns exec at_ns0 ip link set dev veth0 up
+ ip addr add dev veth1 ::22/96
+ ip link set dev veth1 up
+
# in namespace
ip netns exec at_ns0 \
- ip link add dev $DEV_NS type $TYPE seq key 2 local 172.16.1.100 remote 172.16.1.200 erspan 123
+ ip link add dev $DEV_NS type $TYPE flowlabel 0xbcdef key 2 \
+ local ::11 remote ::22
+
+ ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
+ ip netns exec at_ns0 ip addr add dev $DEV_NS fc80::100/96
+ ip netns exec at_ns0 ip link set dev $DEV_NS up
+
+ # out of namespace
+ ip link add dev $DEV type $TYPE external
+ ip addr add dev $DEV 10.1.1.200/24
+ ip addr add dev $DEV fc80::200/24
+ ip link set dev $DEV up
+}
+
+function add_erspan_tunnel {
+ # in namespace
+ if [ "$1" == "v1" ]; then
+ ip netns exec at_ns0 \
+ ip link add dev $DEV_NS type $TYPE seq key 2 \
+ local 172.16.1.100 remote 172.16.1.200 \
+ erspan_ver 1 erspan 123
+ else
+ ip netns exec at_ns0 \
+ ip link add dev $DEV_NS type $TYPE seq key 2 \
+ local 172.16.1.100 remote 172.16.1.200 \
+ erspan_ver 2 erspan_dir egress erspan_hwid 3
+ fi
ip netns exec at_ns0 ip link set dev $DEV_NS up
ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
@@ -46,6 +79,35 @@ function add_erspan_tunnel {
ip addr add dev $DEV 10.1.1.200/24
}
+function add_ip6erspan_tunnel {
+
+ # assign ipv6 address
+ ip netns exec at_ns0 ip addr add ::11/96 dev veth0
+ ip netns exec at_ns0 ip link set dev veth0 up
+ ip addr add dev veth1 ::22/96
+ ip link set dev veth1 up
+
+ # in namespace
+ if [ "$1" == "v1" ]; then
+ ip netns exec at_ns0 \
+ ip link add dev $DEV_NS type $TYPE seq key 2 \
+ local ::11 remote ::22 \
+ erspan_ver 1 erspan 123
+ else
+ ip netns exec at_ns0 \
+ ip link add dev $DEV_NS type $TYPE seq key 2 \
+ local ::11 remote ::22 \
+ erspan_ver 2 erspan_dir egress erspan_hwid 7
+ fi
+ ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
+ ip netns exec at_ns0 ip link set dev $DEV_NS up
+
+ # out of namespace
+ ip link add dev $DEV type $TYPE external
+ ip addr add dev $DEV 10.1.1.200/24
+ ip link set dev $DEV up
+}
+
function add_vxlan_tunnel {
# Set static ARP entry here because iptables set-mark works
# on L3 packet, as a result not applying to ARP packets,
@@ -113,18 +175,65 @@ function test_gre {
cleanup
}
+function test_ip6gre {
+ TYPE=ip6gre
+ DEV_NS=ip6gre00
+ DEV=ip6gre11
+ config_device
+ # reuse the ip6gretap function
+ add_ip6gretap_tunnel
+ attach_bpf $DEV ip6gretap_set_tunnel ip6gretap_get_tunnel
+ # underlay
+ ping6 -c 4 ::11
+ # overlay: ipv4 over ipv6
+ ip netns exec at_ns0 ping -c 1 10.1.1.200
+ ping -c 1 10.1.1.100
+ # overlay: ipv6 over ipv6
+ ip netns exec at_ns0 ping6 -c 1 fc80::200
+ cleanup
+}
+
+function test_ip6gretap {
+ TYPE=ip6gretap
+ DEV_NS=ip6gretap00
+ DEV=ip6gretap11
+ config_device
+ add_ip6gretap_tunnel
+ attach_bpf $DEV ip6gretap_set_tunnel ip6gretap_get_tunnel
+ # underlay
+ ping6 -c 4 ::11
+ # overlay: ipv4 over ipv6
+ ip netns exec at_ns0 ping -i .2 -c 1 10.1.1.200
+ ping -c 1 10.1.1.100
+ # overlay: ipv6 over ipv6
+ ip netns exec at_ns0 ping6 -c 1 fc80::200
+ cleanup
+}
+
function test_erspan {
TYPE=erspan
DEV_NS=erspan00
DEV=erspan11
config_device
- add_erspan_tunnel
+ add_erspan_tunnel $1
attach_bpf $DEV erspan_set_tunnel erspan_get_tunnel
ping -c 1 10.1.1.100
ip netns exec at_ns0 ping -c 1 10.1.1.200
cleanup
}
+function test_ip6erspan {
+ TYPE=ip6erspan
+ DEV_NS=ip6erspan00
+ DEV=ip6erspan11
+ config_device
+ add_ip6erspan_tunnel $1
+ attach_bpf $DEV ip4ip6erspan_set_tunnel ip4ip6erspan_get_tunnel
+ ping6 -c 3 ::11
+ ip netns exec at_ns0 ping -c 1 10.1.1.200
+ cleanup
+}
+
function test_vxlan {
TYPE=vxlan
DEV_NS=vxlan00
@@ -175,9 +284,12 @@ function cleanup {
ip link del veth1
ip link del ipip11
ip link del gretap11
+ ip link del ip6gre11
+ ip link del ip6gretap11
ip link del vxlan11
ip link del geneve11
ip link del erspan11
+ ip link del ip6erspan11
pkill tcpdump
pkill cat
set -ex
@@ -187,8 +299,16 @@ trap cleanup 0 2 3 6 9
cleanup
echo "Testing GRE tunnel..."
test_gre
+echo "Testing IP6GRE tunnel..."
+test_ip6gre
+echo "Testing IP6GRETAP tunnel..."
+test_ip6gretap
echo "Testing ERSPAN tunnel..."
-test_erspan
+test_erspan v1
+test_erspan v2
+echo "Testing IP6ERSPAN tunnel..."
+test_ip6erspan v1
+test_ip6erspan v2
echo "Testing VXLAN tunnel..."
test_vxlan
echo "Testing GENEVE tunnel..."
diff --git a/samples/bpf/trace_event_kern.c b/samples/bpf/trace_event_kern.c
index 41b6115a32eb..a77a583d94d4 100644
--- a/samples/bpf/trace_event_kern.c
+++ b/samples/bpf/trace_event_kern.c
@@ -37,10 +37,14 @@ struct bpf_map_def SEC("maps") stackmap = {
SEC("perf_event")
int bpf_prog1(struct bpf_perf_event_data *ctx)
{
+ char time_fmt1[] = "Time Enabled: %llu, Time Running: %llu";
+ char time_fmt2[] = "Get Time Failed, ErrCode: %d";
char fmt[] = "CPU-%d period %lld ip %llx";
u32 cpu = bpf_get_smp_processor_id();
+ struct bpf_perf_event_value value_buf;
struct key_t key;
u64 *val, one = 1;
+ int ret;
if (ctx->sample_period < 10000)
/* ignore warmup */
@@ -54,6 +58,12 @@ int bpf_prog1(struct bpf_perf_event_data *ctx)
return 0;
}
+ ret = bpf_perf_prog_read_value(ctx, (void *)&value_buf, sizeof(struct bpf_perf_event_value));
+ if (!ret)
+ bpf_trace_printk(time_fmt1, sizeof(time_fmt1), value_buf.enabled, value_buf.running);
+ else
+ bpf_trace_printk(time_fmt2, sizeof(time_fmt2), ret);
+
val = bpf_map_lookup_elem(&counts, &key);
if (val)
(*val)++;
diff --git a/samples/bpf/trace_event_user.c b/samples/bpf/trace_event_user.c
index 7bd827b84a67..bf4f1b6d9a52 100644
--- a/samples/bpf/trace_event_user.c
+++ b/samples/bpf/trace_event_user.c
@@ -127,6 +127,9 @@ static void test_perf_event_all_cpu(struct perf_event_attr *attr)
int *pmu_fd = malloc(nr_cpus * sizeof(int));
int i, error = 0;
+ /* system wide perf event, no need to inherit */
+ attr->inherit = 0;
+
/* open perf_event on all cpus */
for (i = 0; i < nr_cpus; i++) {
pmu_fd[i] = sys_perf_event_open(attr, -1, i, -1, 0);
@@ -154,6 +157,11 @@ static void test_perf_event_task(struct perf_event_attr *attr)
{
int pmu_fd;
+ /* per task perf event, enable inherit so the "dd ..." command can be traced properly.
+ * Enabling inherit will cause bpf_perf_prog_read_time helper failure.
+ */
+ attr->inherit = 1;
+
/* open task bound event */
pmu_fd = sys_perf_event_open(attr, 0, -1, -1, 0);
if (pmu_fd < 0) {
@@ -175,14 +183,12 @@ static void test_bpf_perf_event(void)
.freq = 1,
.type = PERF_TYPE_HARDWARE,
.config = PERF_COUNT_HW_CPU_CYCLES,
- .inherit = 1,
};
struct perf_event_attr attr_type_sw = {
.sample_freq = SAMPLE_FREQ,
.freq = 1,
.type = PERF_TYPE_SOFTWARE,
.config = PERF_COUNT_SW_CPU_CLOCK,
- .inherit = 1,
};
struct perf_event_attr attr_hw_cache_l1d = {
.sample_freq = SAMPLE_FREQ,
@@ -192,7 +198,6 @@ static void test_bpf_perf_event(void)
PERF_COUNT_HW_CACHE_L1D |
(PERF_COUNT_HW_CACHE_OP_READ << 8) |
(PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16),
- .inherit = 1,
};
struct perf_event_attr attr_hw_cache_branch_miss = {
.sample_freq = SAMPLE_FREQ,
@@ -202,7 +207,6 @@ static void test_bpf_perf_event(void)
PERF_COUNT_HW_CACHE_BPU |
(PERF_COUNT_HW_CACHE_OP_READ << 8) |
(PERF_COUNT_HW_CACHE_RESULT_MISS << 16),
- .inherit = 1,
};
struct perf_event_attr attr_type_raw = {
.sample_freq = SAMPLE_FREQ,
@@ -210,7 +214,6 @@ static void test_bpf_perf_event(void)
.type = PERF_TYPE_RAW,
/* Intel Instruction Retired */
.config = 0xc0,
- .inherit = 1,
};
printf("Test HW_CPU_CYCLES\n");
diff --git a/samples/bpf/tracex6_kern.c b/samples/bpf/tracex6_kern.c
index e7d180305974..46c557afac73 100644
--- a/samples/bpf/tracex6_kern.c
+++ b/samples/bpf/tracex6_kern.c
@@ -15,6 +15,12 @@ struct bpf_map_def SEC("maps") values = {
.value_size = sizeof(u64),
.max_entries = 64,
};
+struct bpf_map_def SEC("maps") values2 = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(int),
+ .value_size = sizeof(struct bpf_perf_event_value),
+ .max_entries = 64,
+};
SEC("kprobe/htab_map_get_next_key")
int bpf_prog1(struct pt_regs *ctx)
@@ -37,5 +43,25 @@ int bpf_prog1(struct pt_regs *ctx)
return 0;
}
+SEC("kprobe/htab_map_lookup_elem")
+int bpf_prog2(struct pt_regs *ctx)
+{
+ u32 key = bpf_get_smp_processor_id();
+ struct bpf_perf_event_value *val, buf;
+ int error;
+
+ error = bpf_perf_event_read_value(&counters, key, &buf, sizeof(buf));
+ if (error)
+ return 0;
+
+ val = bpf_map_lookup_elem(&values2, &key);
+ if (val)
+ *val = buf;
+ else
+ bpf_map_update_elem(&values2, &key, &buf, BPF_NOEXIST);
+
+ return 0;
+}
+
char _license[] SEC("license") = "GPL";
u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex6_user.c b/samples/bpf/tracex6_user.c
index a8c22dcf8e4e..89ab8d408474 100644
--- a/samples/bpf/tracex6_user.c
+++ b/samples/bpf/tracex6_user.c
@@ -23,6 +23,7 @@
static void check_on_cpu(int cpu, struct perf_event_attr *attr)
{
+ struct bpf_perf_event_value value2;
int pmu_fd, error = 0;
cpu_set_t set;
__u64 value;
@@ -47,8 +48,18 @@ static void check_on_cpu(int cpu, struct perf_event_attr *attr)
fprintf(stderr, "Value missing for CPU %d\n", cpu);
error = 1;
goto on_exit;
+ } else {
+ fprintf(stderr, "CPU %d: %llu\n", cpu, value);
+ }
+ /* The above bpf_map_lookup_elem should trigger the second kprobe */
+ if (bpf_map_lookup_elem(map_fd[2], &cpu, &value2)) {
+ fprintf(stderr, "Value2 missing for CPU %d\n", cpu);
+ error = 1;
+ goto on_exit;
+ } else {
+ fprintf(stderr, "CPU %d: counter: %llu, enabled: %llu, running: %llu\n", cpu,
+ value2.counter, value2.enabled, value2.running);
}
- fprintf(stderr, "CPU %d: %llu\n", cpu, value);
on_exit:
assert(bpf_map_delete_elem(map_fd[0], &cpu) == 0 || error);
diff --git a/samples/bpf/tracex7_kern.c b/samples/bpf/tracex7_kern.c
new file mode 100644
index 000000000000..1ab308a43e0f
--- /dev/null
+++ b/samples/bpf/tracex7_kern.c
@@ -0,0 +1,16 @@
+#include <uapi/linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include <linux/version.h>
+#include "bpf_helpers.h"
+
+SEC("kprobe/open_ctree")
+int bpf_prog1(struct pt_regs *ctx)
+{
+ unsigned long rc = -12;
+
+ bpf_override_return(ctx, rc);
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c
new file mode 100644
index 000000000000..8a52ac492e8b
--- /dev/null
+++ b/samples/bpf/tracex7_user.c
@@ -0,0 +1,28 @@
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <linux/bpf.h>
+#include <unistd.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+int main(int argc, char **argv)
+{
+ FILE *f;
+ char filename[256];
+ char command[256];
+ int ret;
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ snprintf(command, 256, "mount %s tmpmnt/", argv[1]);
+ f = popen(command, "r");
+ ret = pclose(f);
+
+ return ret ? 0 : 1;
+}
diff --git a/samples/bpf/xdp1_user.c b/samples/bpf/xdp1_user.c
index 2431c0321b71..b901ee2b3336 100644
--- a/samples/bpf/xdp1_user.c
+++ b/samples/bpf/xdp1_user.c
@@ -14,6 +14,7 @@
#include <string.h>
#include <unistd.h>
#include <libgen.h>
+#include <sys/resource.h>
#include "bpf_load.h"
#include "bpf_util.h"
@@ -24,7 +25,7 @@ static __u32 xdp_flags;
static void int_exit(int sig)
{
- set_link_xdp_fd(ifindex, -1, xdp_flags);
+ bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
exit(0);
}
@@ -69,6 +70,7 @@ static void usage(const char *prog)
int main(int argc, char **argv)
{
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
const char *optstr = "SN";
char filename[256];
int opt;
@@ -91,6 +93,12 @@ int main(int argc, char **argv)
usage(basename(argv[0]));
return 1;
}
+
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ perror("setrlimit(RLIMIT_MEMLOCK)");
+ return 1;
+ }
+
ifindex = strtoul(argv[optind], NULL, 0);
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
@@ -108,7 +116,7 @@ int main(int argc, char **argv)
signal(SIGINT, int_exit);
signal(SIGTERM, int_exit);
- if (set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) {
+ if (bpf_set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) {
printf("link set xdp fd failed\n");
return 1;
}
diff --git a/samples/bpf/xdp2skb_meta.sh b/samples/bpf/xdp2skb_meta.sh
new file mode 100755
index 000000000000..b9c9549c4c27
--- /dev/null
+++ b/samples/bpf/xdp2skb_meta.sh
@@ -0,0 +1,220 @@
+#!/bin/bash
+#
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2018 Jesper Dangaard Brouer, Red Hat Inc.
+#
+# Bash-shell example on using iproute2 tools 'tc' and 'ip' to load
+# eBPF programs, both for XDP and clsbpf. Shell script function
+# wrappers and even long options parsing is illustrated, for ease of
+# use.
+#
+# Related to sample/bpf/xdp2skb_meta_kern.c, which contains BPF-progs
+# that need to collaborate between XDP and TC hooks. Thus, it is
+# convenient that the same tool load both programs that need to work
+# together.
+#
+BPF_FILE=xdp2skb_meta_kern.o
+DIR=$(dirname $0)
+
+export TC=/usr/sbin/tc
+export IP=/usr/sbin/ip
+
+function usage() {
+ echo ""
+ echo "Usage: $0 [-vfh] --dev ethX"
+ echo " -d | --dev : Network device (required)"
+ echo " --flush : Cleanup flush TC and XDP progs"
+ echo " --list : (\$LIST) List TC and XDP progs"
+ echo " -v | --verbose : (\$VERBOSE) Verbose"
+ echo " --dry-run : (\$DRYRUN) Dry-run only (echo commands)"
+ echo ""
+}
+
+## -- General shell logging cmds --
+function err() {
+ local exitcode=$1
+ shift
+ echo "ERROR: $@" >&2
+ exit $exitcode
+}
+
+function info() {
+ if [[ -n "$VERBOSE" ]]; then
+ echo "# $@"
+ fi
+}
+
+## -- Helper function calls --
+
+# Wrapper call for TC and IP
+# - Will display the offending command on failure
+function _call_cmd() {
+ local cmd="$1"
+ local allow_fail="$2"
+ shift 2
+ if [[ -n "$VERBOSE" ]]; then
+ echo "$(basename $cmd) $@"
+ fi
+ if [[ -n "$DRYRUN" ]]; then
+ return
+ fi
+ $cmd "$@"
+ local status=$?
+ if (( $status != 0 )); then
+ if [[ "$allow_fail" == "" ]]; then
+ err 2 "Exec error($status) occurred cmd: \"$cmd $@\""
+ fi
+ fi
+}
+function call_tc() {
+ _call_cmd "$TC" "" "$@"
+}
+function call_tc_allow_fail() {
+ _call_cmd "$TC" "allow_fail" "$@"
+}
+function call_ip() {
+ _call_cmd "$IP" "" "$@"
+}
+
+## --- Parse command line arguments / parameters ---
+# Using external program "getopt" to get --long-options
+OPTIONS=$(getopt -o vfhd: \
+ --long verbose,flush,help,list,dev:,dry-run -- "$@")
+if (( $? != 0 )); then
+ err 4 "Error calling getopt"
+fi
+eval set -- "$OPTIONS"
+
+unset DEV
+unset FLUSH
+while true; do
+ case "$1" in
+ -d | --dev ) # device
+ DEV=$2
+ info "Device set to: DEV=$DEV" >&2
+ shift 2
+ ;;
+ -v | --verbose)
+ VERBOSE=yes
+ # info "Verbose mode: VERBOSE=$VERBOSE" >&2
+ shift
+ ;;
+ --dry-run )
+ DRYRUN=yes
+ VERBOSE=yes
+ info "Dry-run mode: enable VERBOSE and don't call TC+IP" >&2
+ shift
+ ;;
+ -f | --flush )
+ FLUSH=yes
+ shift
+ ;;
+ --list )
+ LIST=yes
+ shift
+ ;;
+ -- )
+ shift
+ break
+ ;;
+ -h | --help )
+ usage;
+ exit 0
+ ;;
+ * )
+ shift
+ break
+ ;;
+ esac
+done
+
+FILE="$DIR/$BPF_FILE"
+if [[ ! -e $FILE ]]; then
+ err 3 "Missing BPF object file ($FILE)"
+fi
+
+if [[ -z $DEV ]]; then
+ usage
+ err 2 "Please specify network device -- required option --dev"
+fi
+
+## -- Function calls --
+
+function list_tc()
+{
+ local device="$1"
+ shift
+ info "Listing current TC ingress rules"
+ call_tc filter show dev $device ingress
+}
+
+function list_xdp()
+{
+ local device="$1"
+ shift
+ info "Listing current XDP device($device) setting"
+ call_ip link show dev $device | grep --color=auto xdp
+}
+
+function flush_tc()
+{
+ local device="$1"
+ shift
+ info "Flush TC on device: $device"
+ call_tc_allow_fail filter del dev $device ingress
+ call_tc_allow_fail qdisc del dev $device clsact
+}
+
+function flush_xdp()
+{
+ local device="$1"
+ shift
+ info "Flush XDP on device: $device"
+ call_ip link set dev $device xdp off
+}
+
+function attach_tc_mark()
+{
+ local device="$1"
+ local file="$2"
+ local prog="tc_mark"
+ shift 2
+
+ # Re-attach clsact to clear/flush existing role
+ call_tc_allow_fail qdisc del dev $device clsact 2> /dev/null
+ call_tc qdisc add dev $device clsact
+
+ # Attach BPF prog
+ call_tc filter add dev $device ingress \
+ prio 1 handle 1 bpf da obj $file sec $prog
+}
+
+function attach_xdp_mark()
+{
+ local device="$1"
+ local file="$2"
+ local prog="xdp_mark"
+ shift 2
+
+ # Remove XDP prog in-case it's already loaded
+ # TODO: Need ip-link option to override/replace existing XDP prog
+ flush_xdp $device
+
+ # Attach XDP/BPF prog
+ call_ip link set dev $device xdp obj $file sec $prog
+}
+
+if [[ -n $FLUSH ]]; then
+ flush_tc $DEV
+ flush_xdp $DEV
+ exit 0
+fi
+
+if [[ -n $LIST ]]; then
+ list_tc $DEV
+ list_xdp $DEV
+ exit 0
+fi
+
+attach_tc_mark $DEV $FILE
+attach_xdp_mark $DEV $FILE
diff --git a/samples/bpf/xdp2skb_meta_kern.c b/samples/bpf/xdp2skb_meta_kern.c
new file mode 100644
index 000000000000..0c12048ac79f
--- /dev/null
+++ b/samples/bpf/xdp2skb_meta_kern.c
@@ -0,0 +1,105 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright (c) 2018 Jesper Dangaard Brouer, Red Hat Inc.
+ *
+ * Example howto transfer info from XDP to SKB, e.g. skb->mark
+ * -----------------------------------------------------------
+ * This uses the XDP data_meta infrastructure, and is a cooperation
+ * between two bpf-programs (1) XDP and (2) clsact at TC-ingress hook.
+ *
+ * Notice: This example does not use the BPF C-loader (bpf_load.c),
+ * but instead rely on the iproute2 TC tool for loading BPF-objects.
+ */
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/pkt_cls.h>
+
+#include "bpf_helpers.h"
+
+/*
+ * This struct is stored in the XDP 'data_meta' area, which is located
+ * just in-front-of the raw packet payload data. The meaning is
+ * specific to these two BPF programs that use it as a communication
+ * channel. XDP adjust/increase the area via a bpf-helper, and TC use
+ * boundary checks to see if data have been provided.
+ *
+ * The struct must be 4 byte aligned, which here is enforced by the
+ * struct __attribute__((aligned(4))).
+ */
+struct meta_info {
+ __u32 mark;
+} __attribute__((aligned(4)));
+
+SEC("xdp_mark")
+int _xdp_mark(struct xdp_md *ctx)
+{
+ struct meta_info *meta;
+ void *data, *data_end;
+ int ret;
+
+ /* Reserve space in-front of data pointer for our meta info.
+ * (Notice drivers not supporting data_meta will fail here!)
+ */
+ ret = bpf_xdp_adjust_meta(ctx, -(int)sizeof(*meta));
+ if (ret < 0)
+ return XDP_ABORTED;
+
+ /* Notice: Kernel-side verifier requires that loading of
+ * ctx->data MUST happen _after_ helper bpf_xdp_adjust_meta(),
+ * as pkt-data pointers are invalidated. Helpers that require
+ * this are determined/marked by bpf_helper_changes_pkt_data()
+ */
+ data = (void *)(unsigned long)ctx->data;
+
+ /* Check data_meta have room for meta_info struct */
+ meta = (void *)(unsigned long)ctx->data_meta;
+ if (meta + 1 > data)
+ return XDP_ABORTED;
+
+ meta->mark = 42;
+
+ return XDP_PASS;
+}
+
+SEC("tc_mark")
+int _tc_mark(struct __sk_buff *ctx)
+{
+ void *data = (void *)(unsigned long)ctx->data;
+ void *data_end = (void *)(unsigned long)ctx->data_end;
+ void *data_meta = (void *)(unsigned long)ctx->data_meta;
+ struct meta_info *meta = data_meta;
+
+ /* Check XDP gave us some data_meta */
+ if (meta + 1 > data) {
+ ctx->mark = 41;
+ /* Skip "accept" if no data_meta is avail */
+ return TC_ACT_OK;
+ }
+
+ /* Hint: See func tc_cls_act_is_valid_access() for BPF_WRITE access */
+ ctx->mark = meta->mark; /* Transfer XDP-mark to SKB-mark */
+
+ return TC_ACT_OK;
+}
+
+/* Manually attaching these programs:
+export DEV=ixgbe2
+export FILE=xdp2skb_meta_kern.o
+
+# via TC command
+tc qdisc del dev $DEV clsact 2> /dev/null
+tc qdisc add dev $DEV clsact
+tc filter add dev $DEV ingress prio 1 handle 1 bpf da obj $FILE sec tc_mark
+tc filter show dev $DEV ingress
+
+# XDP via IP command:
+ip link set dev $DEV xdp off
+ip link set dev $DEV xdp obj $FILE sec xdp_mark
+
+# Use iptable to "see" if SKBs are marked
+iptables -I INPUT -p icmp -m mark --mark 41 # == 0x29
+iptables -I INPUT -p icmp -m mark --mark 42 # == 0x2a
+
+# Hint: catch XDP_ABORTED errors via
+perf record -e xdp:*
+perf script
+
+*/
diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c
index 74f3fd8ed729..211db8ded0de 100644
--- a/samples/bpf/xdp_monitor_kern.c
+++ b/samples/bpf/xdp_monitor_kern.c
@@ -1,6 +1,7 @@
-/* XDP monitor tool, based on tracepoints
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright(c) 2017-2018 Jesper Dangaard Brouer, Red Hat Inc.
*
- * Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
+ * XDP monitor tool, based on tracepoints
*/
#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"
@@ -13,23 +14,27 @@ struct bpf_map_def SEC("maps") redirect_err_cnt = {
/* TODO: have entries for all possible errno's */
};
+#define XDP_UNKNOWN XDP_REDIRECT + 1
+struct bpf_map_def SEC("maps") exception_cnt = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(u64),
+ .max_entries = XDP_UNKNOWN + 1,
+};
+
/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format
* Code in: kernel/include/trace/events/xdp.h
*/
struct xdp_redirect_ctx {
- unsigned short common_type; // offset:0; size:2; signed:0;
- unsigned char common_flags; // offset:2; size:1; signed:0;
- unsigned char common_preempt_count;// offset:3; size:1; signed:0;
- int common_pid; // offset:4; size:4; signed:1;
-
- int prog_id; // offset:8; size:4; signed:1;
- u32 act; // offset:12 size:4; signed:0;
- int ifindex; // offset:16 size:4; signed:1;
- int err; // offset:20 size:4; signed:1;
- int to_ifindex; // offset:24 size:4; signed:1;
- u32 map_id; // offset:28 size:4; signed:0;
- int map_index; // offset:32 size:4; signed:1;
-}; // offset:36
+ u64 __pad; // First 8 bytes are not accessible by bpf code
+ int prog_id; // offset:8; size:4; signed:1;
+ u32 act; // offset:12 size:4; signed:0;
+ int ifindex; // offset:16 size:4; signed:1;
+ int err; // offset:20 size:4; signed:1;
+ int to_ifindex; // offset:24 size:4; signed:1;
+ u32 map_id; // offset:28 size:4; signed:0;
+ int map_index; // offset:32 size:4; signed:1;
+}; // offset:36
enum {
XDP_REDIRECT_SUCCESS = 0,
@@ -48,7 +53,7 @@ int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx)
cnt = bpf_map_lookup_elem(&redirect_err_cnt, &key);
if (!cnt)
- return 0;
+ return 1;
*cnt += 1;
return 0; /* Indicate event was filtered (no further processing)*/
@@ -86,3 +91,120 @@ int trace_xdp_redirect_map(struct xdp_redirect_ctx *ctx)
{
return xdp_redirect_collect_stat(ctx);
}
+
+/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format
+ * Code in: kernel/include/trace/events/xdp.h
+ */
+struct xdp_exception_ctx {
+ u64 __pad; // First 8 bytes are not accessible by bpf code
+ int prog_id; // offset:8; size:4; signed:1;
+ u32 act; // offset:12; size:4; signed:0;
+ int ifindex; // offset:16; size:4; signed:1;
+};
+
+SEC("tracepoint/xdp/xdp_exception")
+int trace_xdp_exception(struct xdp_exception_ctx *ctx)
+{
+ u64 *cnt;
+ u32 key;
+
+ key = ctx->act;
+ if (key > XDP_REDIRECT)
+ key = XDP_UNKNOWN;
+
+ cnt = bpf_map_lookup_elem(&exception_cnt, &key);
+ if (!cnt)
+ return 1;
+ *cnt += 1;
+
+ return 0;
+}
+
+/* Common stats data record shared with _user.c */
+struct datarec {
+ u64 processed;
+ u64 dropped;
+ u64 info;
+};
+#define MAX_CPUS 64
+
+struct bpf_map_def SEC("maps") cpumap_enqueue_cnt = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(struct datarec),
+ .max_entries = MAX_CPUS,
+};
+
+struct bpf_map_def SEC("maps") cpumap_kthread_cnt = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(struct datarec),
+ .max_entries = 1,
+};
+
+/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format
+ * Code in: kernel/include/trace/events/xdp.h
+ */
+struct cpumap_enqueue_ctx {
+ u64 __pad; // First 8 bytes are not accessible by bpf code
+ int map_id; // offset:8; size:4; signed:1;
+ u32 act; // offset:12; size:4; signed:0;
+ int cpu; // offset:16; size:4; signed:1;
+ unsigned int drops; // offset:20; size:4; signed:0;
+ unsigned int processed; // offset:24; size:4; signed:0;
+ int to_cpu; // offset:28; size:4; signed:1;
+};
+
+SEC("tracepoint/xdp/xdp_cpumap_enqueue")
+int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx)
+{
+ u32 to_cpu = ctx->to_cpu;
+ struct datarec *rec;
+
+ if (to_cpu >= MAX_CPUS)
+ return 1;
+
+ rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu);
+ if (!rec)
+ return 0;
+ rec->processed += ctx->processed;
+ rec->dropped += ctx->drops;
+
+ /* Record bulk events, then userspace can calc average bulk size */
+ if (ctx->processed > 0)
+ rec->info += 1;
+
+ return 0;
+}
+
+/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format
+ * Code in: kernel/include/trace/events/xdp.h
+ */
+struct cpumap_kthread_ctx {
+ u64 __pad; // First 8 bytes are not accessible by bpf code
+ int map_id; // offset:8; size:4; signed:1;
+ u32 act; // offset:12; size:4; signed:0;
+ int cpu; // offset:16; size:4; signed:1;
+ unsigned int drops; // offset:20; size:4; signed:0;
+ unsigned int processed; // offset:24; size:4; signed:0;
+ int sched; // offset:28; size:4; signed:1;
+};
+
+SEC("tracepoint/xdp/xdp_cpumap_kthread")
+int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
+{
+ struct datarec *rec;
+ u32 key = 0;
+
+ rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key);
+ if (!rec)
+ return 0;
+ rec->processed += ctx->processed;
+ rec->dropped += ctx->drops;
+
+ /* Count times kthread yielded CPU via schedule call */
+ if (ctx->sched)
+ rec->info++;
+
+ return 0;
+}
diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c
index b51b4f5e3257..eec14520d513 100644
--- a/samples/bpf/xdp_monitor_user.c
+++ b/samples/bpf/xdp_monitor_user.c
@@ -1,4 +1,5 @@
-/* Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
*/
static const char *__doc__=
"XDP monitor tool, based on tracepoints\n"
@@ -20,6 +21,7 @@ static const char *__doc_err_only__=
#include <unistd.h>
#include <locale.h>
+#include <sys/resource.h>
#include <getopt.h>
#include <net/if.h>
#include <time.h>
@@ -39,6 +41,9 @@ static const struct option long_options[] = {
{0, 0, NULL, 0 }
};
+/* C standard specifies two constants, EXIT_SUCCESS(0) and EXIT_FAILURE(1) */
+#define EXIT_FAIL_MEM 5
+
static void usage(char *argv[])
{
int i;
@@ -61,7 +66,7 @@ static void usage(char *argv[])
}
#define NANOSEC_PER_SEC 1000000000 /* 10^9 */
-__u64 gettime(void)
+static __u64 gettime(void)
{
struct timespec t;
int res;
@@ -89,98 +94,437 @@ static const char *err2str(int err)
return redir_names[err];
return NULL;
}
+/* enum xdp_action */
+#define XDP_UNKNOWN XDP_REDIRECT + 1
+#define XDP_ACTION_MAX (XDP_UNKNOWN + 1)
+static const char *xdp_action_names[XDP_ACTION_MAX] = {
+ [XDP_ABORTED] = "XDP_ABORTED",
+ [XDP_DROP] = "XDP_DROP",
+ [XDP_PASS] = "XDP_PASS",
+ [XDP_TX] = "XDP_TX",
+ [XDP_REDIRECT] = "XDP_REDIRECT",
+ [XDP_UNKNOWN] = "XDP_UNKNOWN",
+};
+static const char *action2str(int action)
+{
+ if (action < XDP_ACTION_MAX)
+ return xdp_action_names[action];
+ return NULL;
+}
+/* Common stats data record shared with _kern.c */
+struct datarec {
+ __u64 processed;
+ __u64 dropped;
+ __u64 info;
+};
+#define MAX_CPUS 64
+
+/* Userspace structs for collection of stats from maps */
struct record {
- __u64 counter;
__u64 timestamp;
+ struct datarec total;
+ struct datarec *cpu;
+};
+struct u64rec {
+ __u64 processed;
+};
+struct record_u64 {
+ /* record for _kern side __u64 values */
+ __u64 timestamp;
+ struct u64rec total;
+ struct u64rec *cpu;
};
struct stats_record {
- struct record xdp_redir[REDIR_RES_MAX];
+ struct record_u64 xdp_redirect[REDIR_RES_MAX];
+ struct record_u64 xdp_exception[XDP_ACTION_MAX];
+ struct record xdp_cpumap_kthread;
+ struct record xdp_cpumap_enqueue[MAX_CPUS];
};
-static void stats_print_headers(bool err_only)
-{
- if (err_only)
- printf("\n%s\n", __doc_err_only__);
-
- printf("%-14s %-10s %-18s %-9s\n",
- "XDP_REDIRECT", "pps ", "pps-human-readable", "measure-period");
-}
-
-static void stats_print(struct stats_record *rec,
- struct stats_record *prev,
- bool err_only)
+static bool map_collect_record(int fd, __u32 key, struct record *rec)
{
- int i = 0;
+ /* For percpu maps, userspace gets a value per possible CPU */
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ struct datarec values[nr_cpus];
+ __u64 sum_processed = 0;
+ __u64 sum_dropped = 0;
+ __u64 sum_info = 0;
+ int i;
- if (err_only)
- i = REDIR_ERROR;
-
- for (; i < REDIR_RES_MAX; i++) {
- struct record *r = &rec->xdp_redir[i];
- struct record *p = &prev->xdp_redir[i];
- __u64 period = 0;
- __u64 packets = 0;
- double pps = 0;
- double period_ = 0;
-
- if (p->timestamp) {
- packets = r->counter - p->counter;
- period = r->timestamp - p->timestamp;
- if (period > 0) {
- period_ = ((double) period / NANOSEC_PER_SEC);
- pps = packets / period_;
- }
- }
+ if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
+ fprintf(stderr,
+ "ERR: bpf_map_lookup_elem failed key:0x%X\n", key);
+ return false;
+ }
+ /* Get time as close as possible to reading map contents */
+ rec->timestamp = gettime();
- printf("%-14s %-10.0f %'-18.0f %f\n",
- err2str(i), pps, pps, period_);
+ /* Record and sum values from each CPU */
+ for (i = 0; i < nr_cpus; i++) {
+ rec->cpu[i].processed = values[i].processed;
+ sum_processed += values[i].processed;
+ rec->cpu[i].dropped = values[i].dropped;
+ sum_dropped += values[i].dropped;
+ rec->cpu[i].info = values[i].info;
+ sum_info += values[i].info;
}
+ rec->total.processed = sum_processed;
+ rec->total.dropped = sum_dropped;
+ rec->total.info = sum_info;
+ return true;
}
-static __u64 get_key32_value64_percpu(int fd, __u32 key)
+static bool map_collect_record_u64(int fd, __u32 key, struct record_u64 *rec)
{
/* For percpu maps, userspace gets a value per possible CPU */
unsigned int nr_cpus = bpf_num_possible_cpus();
- __u64 values[nr_cpus];
- __u64 sum = 0;
+ struct u64rec values[nr_cpus];
+ __u64 sum_total = 0;
int i;
if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
fprintf(stderr,
"ERR: bpf_map_lookup_elem failed key:0x%X\n", key);
- return 0;
+ return false;
}
+ /* Get time as close as possible to reading map contents */
+ rec->timestamp = gettime();
- /* Sum values from each CPU */
+ /* Record and sum values from each CPU */
for (i = 0; i < nr_cpus; i++) {
- sum += values[i];
+ rec->cpu[i].processed = values[i].processed;
+ sum_total += values[i].processed;
+ }
+ rec->total.processed = sum_total;
+ return true;
+}
+
+static double calc_period(struct record *r, struct record *p)
+{
+ double period_ = 0;
+ __u64 period = 0;
+
+ period = r->timestamp - p->timestamp;
+ if (period > 0)
+ period_ = ((double) period / NANOSEC_PER_SEC);
+
+ return period_;
+}
+
+static double calc_period_u64(struct record_u64 *r, struct record_u64 *p)
+{
+ double period_ = 0;
+ __u64 period = 0;
+
+ period = r->timestamp - p->timestamp;
+ if (period > 0)
+ period_ = ((double) period / NANOSEC_PER_SEC);
+
+ return period_;
+}
+
+static double calc_pps(struct datarec *r, struct datarec *p, double period)
+{
+ __u64 packets = 0;
+ double pps = 0;
+
+ if (period > 0) {
+ packets = r->processed - p->processed;
+ pps = packets / period;
+ }
+ return pps;
+}
+
+static double calc_pps_u64(struct u64rec *r, struct u64rec *p, double period)
+{
+ __u64 packets = 0;
+ double pps = 0;
+
+ if (period > 0) {
+ packets = r->processed - p->processed;
+ pps = packets / period;
}
- return sum;
+ return pps;
}
-static bool stats_collect(int fd, struct stats_record *rec)
+static double calc_drop(struct datarec *r, struct datarec *p, double period)
{
+ __u64 packets = 0;
+ double pps = 0;
+
+ if (period > 0) {
+ packets = r->dropped - p->dropped;
+ pps = packets / period;
+ }
+ return pps;
+}
+
+static double calc_info(struct datarec *r, struct datarec *p, double period)
+{
+ __u64 packets = 0;
+ double pps = 0;
+
+ if (period > 0) {
+ packets = r->info - p->info;
+ pps = packets / period;
+ }
+ return pps;
+}
+
+static void stats_print(struct stats_record *stats_rec,
+ struct stats_record *stats_prev,
+ bool err_only)
+{
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ int rec_i = 0, i, to_cpu;
+ double t = 0, pps = 0;
+
+ /* Header */
+ printf("%-15s %-7s %-12s %-12s %-9s\n",
+ "XDP-event", "CPU:to", "pps", "drop-pps", "extra-info");
+
+ /* tracepoint: xdp:xdp_redirect_* */
+ if (err_only)
+ rec_i = REDIR_ERROR;
+
+ for (; rec_i < REDIR_RES_MAX; rec_i++) {
+ struct record_u64 *rec, *prev;
+ char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %s\n";
+ char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %s\n";
+
+ rec = &stats_rec->xdp_redirect[rec_i];
+ prev = &stats_prev->xdp_redirect[rec_i];
+ t = calc_period_u64(rec, prev);
+
+ for (i = 0; i < nr_cpus; i++) {
+ struct u64rec *r = &rec->cpu[i];
+ struct u64rec *p = &prev->cpu[i];
+
+ pps = calc_pps_u64(r, p, t);
+ if (pps > 0)
+ printf(fmt1, "XDP_REDIRECT", i,
+ rec_i ? 0.0: pps, rec_i ? pps : 0.0,
+ err2str(rec_i));
+ }
+ pps = calc_pps_u64(&rec->total, &prev->total, t);
+ printf(fmt2, "XDP_REDIRECT", "total",
+ rec_i ? 0.0: pps, rec_i ? pps : 0.0, err2str(rec_i));
+ }
+
+ /* tracepoint: xdp:xdp_exception */
+ for (rec_i = 0; rec_i < XDP_ACTION_MAX; rec_i++) {
+ struct record_u64 *rec, *prev;
+ char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %s\n";
+ char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %s\n";
+
+ rec = &stats_rec->xdp_exception[rec_i];
+ prev = &stats_prev->xdp_exception[rec_i];
+ t = calc_period_u64(rec, prev);
+
+ for (i = 0; i < nr_cpus; i++) {
+ struct u64rec *r = &rec->cpu[i];
+ struct u64rec *p = &prev->cpu[i];
+
+ pps = calc_pps_u64(r, p, t);
+ if (pps > 0)
+ printf(fmt1, "Exception", i,
+ 0.0, pps, err2str(rec_i));
+ }
+ pps = calc_pps_u64(&rec->total, &prev->total, t);
+ if (pps > 0)
+ printf(fmt2, "Exception", "total",
+ 0.0, pps, action2str(rec_i));
+ }
+
+ /* cpumap enqueue stats */
+ for (to_cpu = 0; to_cpu < MAX_CPUS; to_cpu++) {
+ char *fmt1 = "%-15s %3d:%-3d %'-12.0f %'-12.0f %'-10.2f %s\n";
+ char *fmt2 = "%-15s %3s:%-3d %'-12.0f %'-12.0f %'-10.2f %s\n";
+ struct record *rec, *prev;
+ char *info_str = "";
+ double drop, info;
+
+ rec = &stats_rec->xdp_cpumap_enqueue[to_cpu];
+ prev = &stats_prev->xdp_cpumap_enqueue[to_cpu];
+ t = calc_period(rec, prev);
+ for (i = 0; i < nr_cpus; i++) {
+ struct datarec *r = &rec->cpu[i];
+ struct datarec *p = &prev->cpu[i];
+
+ pps = calc_pps(r, p, t);
+ drop = calc_drop(r, p, t);
+ info = calc_info(r, p, t);
+ if (info > 0) {
+ info_str = "bulk-average";
+ info = pps / info; /* calc average bulk size */
+ }
+ if (pps > 0)
+ printf(fmt1, "cpumap-enqueue",
+ i, to_cpu, pps, drop, info, info_str);
+ }
+ pps = calc_pps(&rec->total, &prev->total, t);
+ if (pps > 0) {
+ drop = calc_drop(&rec->total, &prev->total, t);
+ info = calc_info(&rec->total, &prev->total, t);
+ if (info > 0) {
+ info_str = "bulk-average";
+ info = pps / info; /* calc average bulk size */
+ }
+ printf(fmt2, "cpumap-enqueue",
+ "sum", to_cpu, pps, drop, info, info_str);
+ }
+ }
+
+ /* cpumap kthread stats */
+ {
+ char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.0f %s\n";
+ char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.0f %s\n";
+ struct record *rec, *prev;
+ double drop, info;
+ char *i_str = "";
+
+ rec = &stats_rec->xdp_cpumap_kthread;
+ prev = &stats_prev->xdp_cpumap_kthread;
+ t = calc_period(rec, prev);
+ for (i = 0; i < nr_cpus; i++) {
+ struct datarec *r = &rec->cpu[i];
+ struct datarec *p = &prev->cpu[i];
+
+ pps = calc_pps(r, p, t);
+ drop = calc_drop(r, p, t);
+ info = calc_info(r, p, t);
+ if (info > 0)
+ i_str = "sched";
+ if (pps > 0)
+ printf(fmt1, "cpumap-kthread",
+ i, pps, drop, info, i_str);
+ }
+ pps = calc_pps(&rec->total, &prev->total, t);
+ drop = calc_drop(&rec->total, &prev->total, t);
+ info = calc_info(&rec->total, &prev->total, t);
+ if (info > 0)
+ i_str = "sched-sum";
+ printf(fmt2, "cpumap-kthread", "total", pps, drop, info, i_str);
+ }
+
+ printf("\n");
+}
+
+static bool stats_collect(struct stats_record *rec)
+{
+ int fd;
int i;
/* TODO: Detect if someone unloaded the perf event_fd's, as
* this can happen by someone running perf-record -e
*/
- for (i = 0; i < REDIR_RES_MAX; i++) {
- rec->xdp_redir[i].timestamp = gettime();
- rec->xdp_redir[i].counter = get_key32_value64_percpu(fd, i);
+ fd = map_data[0].fd; /* map0: redirect_err_cnt */
+ for (i = 0; i < REDIR_RES_MAX; i++)
+ map_collect_record_u64(fd, i, &rec->xdp_redirect[i]);
+
+ fd = map_data[1].fd; /* map1: exception_cnt */
+ for (i = 0; i < XDP_ACTION_MAX; i++) {
+ map_collect_record_u64(fd, i, &rec->xdp_exception[i]);
}
+
+ fd = map_data[2].fd; /* map2: cpumap_enqueue_cnt */
+ for (i = 0; i < MAX_CPUS; i++)
+ map_collect_record(fd, i, &rec->xdp_cpumap_enqueue[i]);
+
+ fd = map_data[3].fd; /* map3: cpumap_kthread_cnt */
+ map_collect_record(fd, 0, &rec->xdp_cpumap_kthread);
+
return true;
}
+static void *alloc_rec_per_cpu(int record_size)
+{
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ void *array;
+ size_t size;
+
+ size = record_size * nr_cpus;
+ array = malloc(size);
+ memset(array, 0, size);
+ if (!array) {
+ fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus);
+ exit(EXIT_FAIL_MEM);
+ }
+ return array;
+}
+
+static struct stats_record *alloc_stats_record(void)
+{
+ struct stats_record *rec;
+ int rec_sz;
+ int i;
+
+ /* Alloc main stats_record structure */
+ rec = malloc(sizeof(*rec));
+ memset(rec, 0, sizeof(*rec));
+ if (!rec) {
+ fprintf(stderr, "Mem alloc error\n");
+ exit(EXIT_FAIL_MEM);
+ }
+
+ /* Alloc stats stored per CPU for each record */
+ rec_sz = sizeof(struct u64rec);
+ for (i = 0; i < REDIR_RES_MAX; i++)
+ rec->xdp_redirect[i].cpu = alloc_rec_per_cpu(rec_sz);
+
+ for (i = 0; i < XDP_ACTION_MAX; i++)
+ rec->xdp_exception[i].cpu = alloc_rec_per_cpu(rec_sz);
+
+ rec_sz = sizeof(struct datarec);
+ rec->xdp_cpumap_kthread.cpu = alloc_rec_per_cpu(rec_sz);
+
+ for (i = 0; i < MAX_CPUS; i++)
+ rec->xdp_cpumap_enqueue[i].cpu = alloc_rec_per_cpu(rec_sz);
+
+ return rec;
+}
+
+static void free_stats_record(struct stats_record *r)
+{
+ int i;
+
+ for (i = 0; i < REDIR_RES_MAX; i++)
+ free(r->xdp_redirect[i].cpu);
+
+ for (i = 0; i < XDP_ACTION_MAX; i++)
+ free(r->xdp_exception[i].cpu);
+
+ free(r->xdp_cpumap_kthread.cpu);
+
+ for (i = 0; i < MAX_CPUS; i++)
+ free(r->xdp_cpumap_enqueue[i].cpu);
+
+ free(r);
+}
+
+/* Pointer swap trick */
+static inline void swap(struct stats_record **a, struct stats_record **b)
+{
+ struct stats_record *tmp;
+
+ tmp = *a;
+ *a = *b;
+ *b = tmp;
+}
+
static void stats_poll(int interval, bool err_only)
{
- struct stats_record rec, prev;
- int map_fd;
+ struct stats_record *rec, *prev;
- memset(&rec, 0, sizeof(rec));
+ rec = alloc_stats_record();
+ prev = alloc_stats_record();
+ stats_collect(rec);
+
+ if (err_only)
+ printf("\n%s\n", __doc_err_only__);
/* Trick to pretty printf with thousands separators use %' */
setlocale(LC_NUMERIC, "en_US");
@@ -190,23 +534,26 @@ static void stats_poll(int interval, bool err_only)
printf("\n%s", __doc__);
/* TODO Need more advanced stats on error types */
- if (verbose)
- printf(" - Stats map: %s\n", map_data[0].name);
- map_fd = map_data[0].fd;
-
- stats_print_headers(err_only);
+ if (verbose) {
+ printf(" - Stats map0: %s\n", map_data[0].name);
+ printf(" - Stats map1: %s\n", map_data[1].name);
+ printf("\n");
+ }
fflush(stdout);
while (1) {
- memcpy(&prev, &rec, sizeof(rec));
- stats_collect(map_fd, &rec);
- stats_print(&rec, &prev, err_only);
+ swap(&prev, &rec);
+ stats_collect(rec);
+ stats_print(rec, prev, err_only);
fflush(stdout);
sleep(interval);
}
+
+ free_stats_record(rec);
+ free_stats_record(prev);
}
-void print_bpf_prog_info(void)
+static void print_bpf_prog_info(void)
{
int i;
@@ -235,6 +582,7 @@ void print_bpf_prog_info(void)
int main(int argc, char **argv)
{
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
int longindex = 0, opt;
int ret = EXIT_SUCCESS;
char bpf_obj_file[256];
@@ -265,13 +613,18 @@ int main(int argc, char **argv)
}
}
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ perror("setrlimit(RLIMIT_MEMLOCK)");
+ return EXIT_FAILURE;
+ }
+
if (load_bpf_file(bpf_obj_file)) {
printf("ERROR - bpf_log_buf: %s", bpf_log_buf);
- return 1;
+ return EXIT_FAILURE;
}
if (!prog_fd[0]) {
printf("ERROR - load_bpf_file: %s\n", strerror(errno));
- return 1;
+ return EXIT_FAILURE;
}
if (debug) {
diff --git a/samples/bpf/xdp_redirect_cpu_kern.c b/samples/bpf/xdp_redirect_cpu_kern.c
new file mode 100644
index 000000000000..303e9e7161f3
--- /dev/null
+++ b/samples/bpf/xdp_redirect_cpu_kern.c
@@ -0,0 +1,609 @@
+/* XDP redirect to CPUs via cpumap (BPF_MAP_TYPE_CPUMAP)
+ *
+ * GPLv2, Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
+ */
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/if_vlan.h>
+#include <uapi/linux/ip.h>
+#include <uapi/linux/ipv6.h>
+#include <uapi/linux/in.h>
+#include <uapi/linux/tcp.h>
+#include <uapi/linux/udp.h>
+
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+#define MAX_CPUS 12 /* WARNING - sync with _user.c */
+
+/* Special map type that can XDP_REDIRECT frames to another CPU */
+struct bpf_map_def SEC("maps") cpu_map = {
+ .type = BPF_MAP_TYPE_CPUMAP,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(u32),
+ .max_entries = MAX_CPUS,
+};
+
+/* Common stats data record to keep userspace more simple */
+struct datarec {
+ __u64 processed;
+ __u64 dropped;
+ __u64 issue;
+};
+
+/* Count RX packets, as XDP bpf_prog doesn't get direct TX-success
+ * feedback. Redirect TX errors can be caught via a tracepoint.
+ */
+struct bpf_map_def SEC("maps") rx_cnt = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(struct datarec),
+ .max_entries = 1,
+};
+
+/* Used by trace point */
+struct bpf_map_def SEC("maps") redirect_err_cnt = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(struct datarec),
+ .max_entries = 2,
+ /* TODO: have entries for all possible errno's */
+};
+
+/* Used by trace point */
+struct bpf_map_def SEC("maps") cpumap_enqueue_cnt = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(struct datarec),
+ .max_entries = MAX_CPUS,
+};
+
+/* Used by trace point */
+struct bpf_map_def SEC("maps") cpumap_kthread_cnt = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(struct datarec),
+ .max_entries = 1,
+};
+
+/* Set of maps controlling available CPU, and for iterating through
+ * selectable redirect CPUs.
+ */
+struct bpf_map_def SEC("maps") cpus_available = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(u32),
+ .max_entries = MAX_CPUS,
+};
+struct bpf_map_def SEC("maps") cpus_count = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(u32),
+ .max_entries = 1,
+};
+struct bpf_map_def SEC("maps") cpus_iterator = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(u32),
+ .max_entries = 1,
+};
+
+/* Used by trace point */
+struct bpf_map_def SEC("maps") exception_cnt = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(struct datarec),
+ .max_entries = 1,
+};
+
+/* Helper parse functions */
+
+/* Parse Ethernet layer 2, extract network layer 3 offset and protocol
+ *
+ * Returns false on error and non-supported ether-type
+ */
+struct vlan_hdr {
+ __be16 h_vlan_TCI;
+ __be16 h_vlan_encapsulated_proto;
+};
+
+static __always_inline
+bool parse_eth(struct ethhdr *eth, void *data_end,
+ u16 *eth_proto, u64 *l3_offset)
+{
+ u16 eth_type;
+ u64 offset;
+
+ offset = sizeof(*eth);
+ if ((void *)eth + offset > data_end)
+ return false;
+
+ eth_type = eth->h_proto;
+
+ /* Skip non 802.3 Ethertypes */
+ if (unlikely(ntohs(eth_type) < ETH_P_802_3_MIN))
+ return false;
+
+ /* Handle VLAN tagged packet */
+ if (eth_type == htons(ETH_P_8021Q) || eth_type == htons(ETH_P_8021AD)) {
+ struct vlan_hdr *vlan_hdr;
+
+ vlan_hdr = (void *)eth + offset;
+ offset += sizeof(*vlan_hdr);
+ if ((void *)eth + offset > data_end)
+ return false;
+ eth_type = vlan_hdr->h_vlan_encapsulated_proto;
+ }
+ /* TODO: Handle double VLAN tagged packet */
+
+ *eth_proto = ntohs(eth_type);
+ *l3_offset = offset;
+ return true;
+}
+
+static __always_inline
+u16 get_dest_port_ipv4_udp(struct xdp_md *ctx, u64 nh_off)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct iphdr *iph = data + nh_off;
+ struct udphdr *udph;
+ u16 dport;
+
+ if (iph + 1 > data_end)
+ return 0;
+ if (!(iph->protocol == IPPROTO_UDP))
+ return 0;
+
+ udph = (void *)(iph + 1);
+ if (udph + 1 > data_end)
+ return 0;
+
+ dport = ntohs(udph->dest);
+ return dport;
+}
+
+static __always_inline
+int get_proto_ipv4(struct xdp_md *ctx, u64 nh_off)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct iphdr *iph = data + nh_off;
+
+ if (iph + 1 > data_end)
+ return 0;
+ return iph->protocol;
+}
+
+static __always_inline
+int get_proto_ipv6(struct xdp_md *ctx, u64 nh_off)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct ipv6hdr *ip6h = data + nh_off;
+
+ if (ip6h + 1 > data_end)
+ return 0;
+ return ip6h->nexthdr;
+}
+
+SEC("xdp_cpu_map0")
+int xdp_prognum0_no_touch(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct datarec *rec;
+ u32 *cpu_selected;
+ u32 cpu_dest;
+ u32 key = 0;
+
+ /* Only use first entry in cpus_available */
+ cpu_selected = bpf_map_lookup_elem(&cpus_available, &key);
+ if (!cpu_selected)
+ return XDP_ABORTED;
+ cpu_dest = *cpu_selected;
+
+ /* Count RX packet in map */
+ rec = bpf_map_lookup_elem(&rx_cnt, &key);
+ if (!rec)
+ return XDP_ABORTED;
+ rec->processed++;
+
+ if (cpu_dest >= MAX_CPUS) {
+ rec->issue++;
+ return XDP_ABORTED;
+ }
+
+ return bpf_redirect_map(&cpu_map, cpu_dest, 0);
+}
+
+SEC("xdp_cpu_map1_touch_data")
+int xdp_prognum1_touch_data(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct ethhdr *eth = data;
+ struct datarec *rec;
+ u32 *cpu_selected;
+ u32 cpu_dest;
+ u16 eth_type;
+ u32 key = 0;
+
+ /* Only use first entry in cpus_available */
+ cpu_selected = bpf_map_lookup_elem(&cpus_available, &key);
+ if (!cpu_selected)
+ return XDP_ABORTED;
+ cpu_dest = *cpu_selected;
+
+ /* Validate packet length is minimum Eth header size */
+ if (eth + 1 > data_end)
+ return XDP_ABORTED;
+
+ /* Count RX packet in map */
+ rec = bpf_map_lookup_elem(&rx_cnt, &key);
+ if (!rec)
+ return XDP_ABORTED;
+ rec->processed++;
+
+ /* Read packet data, and use it (drop non 802.3 Ethertypes) */
+ eth_type = eth->h_proto;
+ if (ntohs(eth_type) < ETH_P_802_3_MIN) {
+ rec->dropped++;
+ return XDP_DROP;
+ }
+
+ if (cpu_dest >= MAX_CPUS) {
+ rec->issue++;
+ return XDP_ABORTED;
+ }
+
+ return bpf_redirect_map(&cpu_map, cpu_dest, 0);
+}
+
+SEC("xdp_cpu_map2_round_robin")
+int xdp_prognum2_round_robin(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct ethhdr *eth = data;
+ struct datarec *rec;
+ u32 cpu_dest;
+ u32 *cpu_lookup;
+ u32 key0 = 0;
+
+ u32 *cpu_selected;
+ u32 *cpu_iterator;
+ u32 *cpu_max;
+ u32 cpu_idx;
+
+ cpu_max = bpf_map_lookup_elem(&cpus_count, &key0);
+ if (!cpu_max)
+ return XDP_ABORTED;
+
+ cpu_iterator = bpf_map_lookup_elem(&cpus_iterator, &key0);
+ if (!cpu_iterator)
+ return XDP_ABORTED;
+ cpu_idx = *cpu_iterator;
+
+ *cpu_iterator += 1;
+ if (*cpu_iterator == *cpu_max)
+ *cpu_iterator = 0;
+
+ cpu_selected = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
+ if (!cpu_selected)
+ return XDP_ABORTED;
+ cpu_dest = *cpu_selected;
+
+ /* Count RX packet in map */
+ rec = bpf_map_lookup_elem(&rx_cnt, &key0);
+ if (!rec)
+ return XDP_ABORTED;
+ rec->processed++;
+
+ if (cpu_dest >= MAX_CPUS) {
+ rec->issue++;
+ return XDP_ABORTED;
+ }
+
+ return bpf_redirect_map(&cpu_map, cpu_dest, 0);
+}
+
+SEC("xdp_cpu_map3_proto_separate")
+int xdp_prognum3_proto_separate(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct ethhdr *eth = data;
+ u8 ip_proto = IPPROTO_UDP;
+ struct datarec *rec;
+ u16 eth_proto = 0;
+ u64 l3_offset = 0;
+ u32 cpu_dest = 0;
+ u32 cpu_idx = 0;
+ u32 *cpu_lookup;
+ u32 key = 0;
+
+ /* Count RX packet in map */
+ rec = bpf_map_lookup_elem(&rx_cnt, &key);
+ if (!rec)
+ return XDP_ABORTED;
+ rec->processed++;
+
+ if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
+ return XDP_PASS; /* Just skip */
+
+ /* Extract L4 protocol */
+ switch (eth_proto) {
+ case ETH_P_IP:
+ ip_proto = get_proto_ipv4(ctx, l3_offset);
+ break;
+ case ETH_P_IPV6:
+ ip_proto = get_proto_ipv6(ctx, l3_offset);
+ break;
+ case ETH_P_ARP:
+ cpu_idx = 0; /* ARP packet handled on separate CPU */
+ break;
+ default:
+ cpu_idx = 0;
+ }
+
+ /* Choose CPU based on L4 protocol */
+ switch (ip_proto) {
+ case IPPROTO_ICMP:
+ case IPPROTO_ICMPV6:
+ cpu_idx = 2;
+ break;
+ case IPPROTO_TCP:
+ cpu_idx = 0;
+ break;
+ case IPPROTO_UDP:
+ cpu_idx = 1;
+ break;
+ default:
+ cpu_idx = 0;
+ }
+
+ cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
+ if (!cpu_lookup)
+ return XDP_ABORTED;
+ cpu_dest = *cpu_lookup;
+
+ if (cpu_dest >= MAX_CPUS) {
+ rec->issue++;
+ return XDP_ABORTED;
+ }
+
+ return bpf_redirect_map(&cpu_map, cpu_dest, 0);
+}
+
+SEC("xdp_cpu_map4_ddos_filter_pktgen")
+int xdp_prognum4_ddos_filter_pktgen(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct ethhdr *eth = data;
+ u8 ip_proto = IPPROTO_UDP;
+ struct datarec *rec;
+ u16 eth_proto = 0;
+ u64 l3_offset = 0;
+ u32 cpu_dest = 0;
+ u32 cpu_idx = 0;
+ u16 dest_port;
+ u32 *cpu_lookup;
+ u32 key = 0;
+
+ /* Count RX packet in map */
+ rec = bpf_map_lookup_elem(&rx_cnt, &key);
+ if (!rec)
+ return XDP_ABORTED;
+ rec->processed++;
+
+ if (!(parse_eth(eth, data_end, &eth_proto, &l3_offset)))
+ return XDP_PASS; /* Just skip */
+
+ /* Extract L4 protocol */
+ switch (eth_proto) {
+ case ETH_P_IP:
+ ip_proto = get_proto_ipv4(ctx, l3_offset);
+ break;
+ case ETH_P_IPV6:
+ ip_proto = get_proto_ipv6(ctx, l3_offset);
+ break;
+ case ETH_P_ARP:
+ cpu_idx = 0; /* ARP packet handled on separate CPU */
+ break;
+ default:
+ cpu_idx = 0;
+ }
+
+ /* Choose CPU based on L4 protocol */
+ switch (ip_proto) {
+ case IPPROTO_ICMP:
+ case IPPROTO_ICMPV6:
+ cpu_idx = 2;
+ break;
+ case IPPROTO_TCP:
+ cpu_idx = 0;
+ break;
+ case IPPROTO_UDP:
+ cpu_idx = 1;
+ /* DDoS filter UDP port 9 (pktgen) */
+ dest_port = get_dest_port_ipv4_udp(ctx, l3_offset);
+ if (dest_port == 9) {
+ if (rec)
+ rec->dropped++;
+ return XDP_DROP;
+ }
+ break;
+ default:
+ cpu_idx = 0;
+ }
+
+ cpu_lookup = bpf_map_lookup_elem(&cpus_available, &cpu_idx);
+ if (!cpu_lookup)
+ return XDP_ABORTED;
+ cpu_dest = *cpu_lookup;
+
+ if (cpu_dest >= MAX_CPUS) {
+ rec->issue++;
+ return XDP_ABORTED;
+ }
+
+ return bpf_redirect_map(&cpu_map, cpu_dest, 0);
+}
+
+
+char _license[] SEC("license") = "GPL";
+
+/*** Trace point code ***/
+
+/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format
+ * Code in: kernel/include/trace/events/xdp.h
+ */
+struct xdp_redirect_ctx {
+ u64 __pad; // First 8 bytes are not accessible by bpf code
+ int prog_id; // offset:8; size:4; signed:1;
+ u32 act; // offset:12 size:4; signed:0;
+ int ifindex; // offset:16 size:4; signed:1;
+ int err; // offset:20 size:4; signed:1;
+ int to_ifindex; // offset:24 size:4; signed:1;
+ u32 map_id; // offset:28 size:4; signed:0;
+ int map_index; // offset:32 size:4; signed:1;
+}; // offset:36
+
+enum {
+ XDP_REDIRECT_SUCCESS = 0,
+ XDP_REDIRECT_ERROR = 1
+};
+
+static __always_inline
+int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx)
+{
+ u32 key = XDP_REDIRECT_ERROR;
+ struct datarec *rec;
+ int err = ctx->err;
+
+ if (!err)
+ key = XDP_REDIRECT_SUCCESS;
+
+ rec = bpf_map_lookup_elem(&redirect_err_cnt, &key);
+ if (!rec)
+ return 0;
+ rec->dropped += 1;
+
+ return 0; /* Indicate event was filtered (no further processing)*/
+ /*
+ * Returning 1 here would allow e.g. a perf-record tracepoint
+ * to see and record these events, but it doesn't work well
+ * in-practice as stopping perf-record also unload this
+ * bpf_prog. Plus, there is additional overhead of doing so.
+ */
+}
+
+SEC("tracepoint/xdp/xdp_redirect_err")
+int trace_xdp_redirect_err(struct xdp_redirect_ctx *ctx)
+{
+ return xdp_redirect_collect_stat(ctx);
+}
+
+SEC("tracepoint/xdp/xdp_redirect_map_err")
+int trace_xdp_redirect_map_err(struct xdp_redirect_ctx *ctx)
+{
+ return xdp_redirect_collect_stat(ctx);
+}
+
+/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format
+ * Code in: kernel/include/trace/events/xdp.h
+ */
+struct xdp_exception_ctx {
+ u64 __pad; // First 8 bytes are not accessible by bpf code
+ int prog_id; // offset:8; size:4; signed:1;
+ u32 act; // offset:12; size:4; signed:0;
+ int ifindex; // offset:16; size:4; signed:1;
+};
+
+SEC("tracepoint/xdp/xdp_exception")
+int trace_xdp_exception(struct xdp_exception_ctx *ctx)
+{
+ struct datarec *rec;
+ u32 key = 0;
+
+ rec = bpf_map_lookup_elem(&exception_cnt, &key);
+ if (!rec)
+ return 1;
+ rec->dropped += 1;
+
+ return 0;
+}
+
+/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format
+ * Code in: kernel/include/trace/events/xdp.h
+ */
+struct cpumap_enqueue_ctx {
+ u64 __pad; // First 8 bytes are not accessible by bpf code
+ int map_id; // offset:8; size:4; signed:1;
+ u32 act; // offset:12; size:4; signed:0;
+ int cpu; // offset:16; size:4; signed:1;
+ unsigned int drops; // offset:20; size:4; signed:0;
+ unsigned int processed; // offset:24; size:4; signed:0;
+ int to_cpu; // offset:28; size:4; signed:1;
+};
+
+SEC("tracepoint/xdp/xdp_cpumap_enqueue")
+int trace_xdp_cpumap_enqueue(struct cpumap_enqueue_ctx *ctx)
+{
+ u32 to_cpu = ctx->to_cpu;
+ struct datarec *rec;
+
+ if (to_cpu >= MAX_CPUS)
+ return 1;
+
+ rec = bpf_map_lookup_elem(&cpumap_enqueue_cnt, &to_cpu);
+ if (!rec)
+ return 0;
+ rec->processed += ctx->processed;
+ rec->dropped += ctx->drops;
+
+ /* Record bulk events, then userspace can calc average bulk size */
+ if (ctx->processed > 0)
+ rec->issue += 1;
+
+ /* Inception: It's possible to detect overload situations, via
+ * this tracepoint. This can be used for creating a feedback
+ * loop to XDP, which can take appropriate actions to mitigate
+ * this overload situation.
+ */
+ return 0;
+}
+
+/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_kthread/format
+ * Code in: kernel/include/trace/events/xdp.h
+ */
+struct cpumap_kthread_ctx {
+ u64 __pad; // First 8 bytes are not accessible by bpf code
+ int map_id; // offset:8; size:4; signed:1;
+ u32 act; // offset:12; size:4; signed:0;
+ int cpu; // offset:16; size:4; signed:1;
+ unsigned int drops; // offset:20; size:4; signed:0;
+ unsigned int processed; // offset:24; size:4; signed:0;
+ int sched; // offset:28; size:4; signed:1;
+};
+
+SEC("tracepoint/xdp/xdp_cpumap_kthread")
+int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
+{
+ struct datarec *rec;
+ u32 key = 0;
+
+ rec = bpf_map_lookup_elem(&cpumap_kthread_cnt, &key);
+ if (!rec)
+ return 0;
+ rec->processed += ctx->processed;
+ rec->dropped += ctx->drops;
+
+ /* Count times kthread yielded CPU via schedule call */
+ if (ctx->sched)
+ rec->issue++;
+
+ return 0;
+}
diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c
new file mode 100644
index 000000000000..23744a8aaf21
--- /dev/null
+++ b/samples/bpf/xdp_redirect_cpu_user.c
@@ -0,0 +1,697 @@
+/* GPLv2 Copyright(c) 2017 Jesper Dangaard Brouer, Red Hat, Inc.
+ */
+static const char *__doc__ =
+ " XDP redirect with a CPU-map type \"BPF_MAP_TYPE_CPUMAP\"";
+
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <locale.h>
+#include <sys/resource.h>
+#include <getopt.h>
+#include <net/if.h>
+#include <time.h>
+
+#include <arpa/inet.h>
+#include <linux/if_link.h>
+
+#define MAX_CPUS 12 /* WARNING - sync with _kern.c */
+
+/* How many xdp_progs are defined in _kern.c */
+#define MAX_PROG 5
+
+/* Wanted to get rid of bpf_load.h and fake-"libbpf.h" (and instead
+ * use bpf/libbpf.h), but cannot as (currently) needed for XDP
+ * attaching to a device via bpf_set_link_xdp_fd()
+ */
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#include "bpf_util.h"
+
+static int ifindex = -1;
+static char ifname_buf[IF_NAMESIZE];
+static char *ifname;
+
+static __u32 xdp_flags;
+
+/* Exit return codes */
+#define EXIT_OK 0
+#define EXIT_FAIL 1
+#define EXIT_FAIL_OPTION 2
+#define EXIT_FAIL_XDP 3
+#define EXIT_FAIL_BPF 4
+#define EXIT_FAIL_MEM 5
+
+static const struct option long_options[] = {
+ {"help", no_argument, NULL, 'h' },
+ {"dev", required_argument, NULL, 'd' },
+ {"skb-mode", no_argument, NULL, 'S' },
+ {"debug", no_argument, NULL, 'D' },
+ {"sec", required_argument, NULL, 's' },
+ {"prognum", required_argument, NULL, 'p' },
+ {"qsize", required_argument, NULL, 'q' },
+ {"cpu", required_argument, NULL, 'c' },
+ {"stress-mode", no_argument, NULL, 'x' },
+ {"no-separators", no_argument, NULL, 'z' },
+ {0, 0, NULL, 0 }
+};
+
+static void int_exit(int sig)
+{
+ fprintf(stderr,
+ "Interrupted: Removing XDP program on ifindex:%d device:%s\n",
+ ifindex, ifname);
+ if (ifindex > -1)
+ bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
+ exit(EXIT_OK);
+}
+
+static void usage(char *argv[])
+{
+ int i;
+
+ printf("\nDOCUMENTATION:\n%s\n", __doc__);
+ printf("\n");
+ printf(" Usage: %s (options-see-below)\n", argv[0]);
+ printf(" Listing options:\n");
+ for (i = 0; long_options[i].name != 0; i++) {
+ printf(" --%-12s", long_options[i].name);
+ if (long_options[i].flag != NULL)
+ printf(" flag (internal value:%d)",
+ *long_options[i].flag);
+ else
+ printf(" short-option: -%c",
+ long_options[i].val);
+ printf("\n");
+ }
+ printf("\n");
+}
+
+/* gettime returns the current time of day in nanoseconds.
+ * Cost: clock_gettime (ns) => 26ns (CLOCK_MONOTONIC)
+ * clock_gettime (ns) => 9ns (CLOCK_MONOTONIC_COARSE)
+ */
+#define NANOSEC_PER_SEC 1000000000 /* 10^9 */
+static __u64 gettime(void)
+{
+ struct timespec t;
+ int res;
+
+ res = clock_gettime(CLOCK_MONOTONIC, &t);
+ if (res < 0) {
+ fprintf(stderr, "Error with gettimeofday! (%i)\n", res);
+ exit(EXIT_FAIL);
+ }
+ return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec;
+}
+
+/* Common stats data record shared with _kern.c */
+struct datarec {
+ __u64 processed;
+ __u64 dropped;
+ __u64 issue;
+};
+struct record {
+ __u64 timestamp;
+ struct datarec total;
+ struct datarec *cpu;
+};
+struct stats_record {
+ struct record rx_cnt;
+ struct record redir_err;
+ struct record kthread;
+ struct record exception;
+ struct record enq[MAX_CPUS];
+};
+
+static bool map_collect_percpu(int fd, __u32 key, struct record *rec)
+{
+ /* For percpu maps, userspace gets a value per possible CPU */
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ struct datarec values[nr_cpus];
+ __u64 sum_processed = 0;
+ __u64 sum_dropped = 0;
+ __u64 sum_issue = 0;
+ int i;
+
+ if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
+ fprintf(stderr,
+ "ERR: bpf_map_lookup_elem failed key:0x%X\n", key);
+ return false;
+ }
+ /* Get time as close as possible to reading map contents */
+ rec->timestamp = gettime();
+
+ /* Record and sum values from each CPU */
+ for (i = 0; i < nr_cpus; i++) {
+ rec->cpu[i].processed = values[i].processed;
+ sum_processed += values[i].processed;
+ rec->cpu[i].dropped = values[i].dropped;
+ sum_dropped += values[i].dropped;
+ rec->cpu[i].issue = values[i].issue;
+ sum_issue += values[i].issue;
+ }
+ rec->total.processed = sum_processed;
+ rec->total.dropped = sum_dropped;
+ rec->total.issue = sum_issue;
+ return true;
+}
+
+static struct datarec *alloc_record_per_cpu(void)
+{
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ struct datarec *array;
+ size_t size;
+
+ size = sizeof(struct datarec) * nr_cpus;
+ array = malloc(size);
+ memset(array, 0, size);
+ if (!array) {
+ fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus);
+ exit(EXIT_FAIL_MEM);
+ }
+ return array;
+}
+
+static struct stats_record *alloc_stats_record(void)
+{
+ struct stats_record *rec;
+ int i;
+
+ rec = malloc(sizeof(*rec));
+ memset(rec, 0, sizeof(*rec));
+ if (!rec) {
+ fprintf(stderr, "Mem alloc error\n");
+ exit(EXIT_FAIL_MEM);
+ }
+ rec->rx_cnt.cpu = alloc_record_per_cpu();
+ rec->redir_err.cpu = alloc_record_per_cpu();
+ rec->kthread.cpu = alloc_record_per_cpu();
+ rec->exception.cpu = alloc_record_per_cpu();
+ for (i = 0; i < MAX_CPUS; i++)
+ rec->enq[i].cpu = alloc_record_per_cpu();
+
+ return rec;
+}
+
+static void free_stats_record(struct stats_record *r)
+{
+ int i;
+
+ for (i = 0; i < MAX_CPUS; i++)
+ free(r->enq[i].cpu);
+ free(r->exception.cpu);
+ free(r->kthread.cpu);
+ free(r->redir_err.cpu);
+ free(r->rx_cnt.cpu);
+ free(r);
+}
+
+static double calc_period(struct record *r, struct record *p)
+{
+ double period_ = 0;
+ __u64 period = 0;
+
+ period = r->timestamp - p->timestamp;
+ if (period > 0)
+ period_ = ((double) period / NANOSEC_PER_SEC);
+
+ return period_;
+}
+
+static __u64 calc_pps(struct datarec *r, struct datarec *p, double period_)
+{
+ __u64 packets = 0;
+ __u64 pps = 0;
+
+ if (period_ > 0) {
+ packets = r->processed - p->processed;
+ pps = packets / period_;
+ }
+ return pps;
+}
+
+static __u64 calc_drop_pps(struct datarec *r, struct datarec *p, double period_)
+{
+ __u64 packets = 0;
+ __u64 pps = 0;
+
+ if (period_ > 0) {
+ packets = r->dropped - p->dropped;
+ pps = packets / period_;
+ }
+ return pps;
+}
+
+static __u64 calc_errs_pps(struct datarec *r,
+ struct datarec *p, double period_)
+{
+ __u64 packets = 0;
+ __u64 pps = 0;
+
+ if (period_ > 0) {
+ packets = r->issue - p->issue;
+ pps = packets / period_;
+ }
+ return pps;
+}
+
+static void stats_print(struct stats_record *stats_rec,
+ struct stats_record *stats_prev,
+ int prog_num)
+{
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ double pps = 0, drop = 0, err = 0;
+ struct record *rec, *prev;
+ int to_cpu;
+ double t;
+ int i;
+
+ /* Header */
+ printf("Running XDP/eBPF prog_num:%d\n", prog_num);
+ printf("%-15s %-7s %-14s %-11s %-9s\n",
+ "XDP-cpumap", "CPU:to", "pps", "drop-pps", "extra-info");
+
+ /* XDP rx_cnt */
+ {
+ char *fmt_rx = "%-15s %-7d %'-14.0f %'-11.0f %'-10.0f %s\n";
+ char *fm2_rx = "%-15s %-7s %'-14.0f %'-11.0f\n";
+ char *errstr = "";
+
+ rec = &stats_rec->rx_cnt;
+ prev = &stats_prev->rx_cnt;
+ t = calc_period(rec, prev);
+ for (i = 0; i < nr_cpus; i++) {
+ struct datarec *r = &rec->cpu[i];
+ struct datarec *p = &prev->cpu[i];
+
+ pps = calc_pps(r, p, t);
+ drop = calc_drop_pps(r, p, t);
+ err = calc_errs_pps(r, p, t);
+ if (err > 0)
+ errstr = "cpu-dest/err";
+ if (pps > 0)
+ printf(fmt_rx, "XDP-RX",
+ i, pps, drop, err, errstr);
+ }
+ pps = calc_pps(&rec->total, &prev->total, t);
+ drop = calc_drop_pps(&rec->total, &prev->total, t);
+ err = calc_errs_pps(&rec->total, &prev->total, t);
+ printf(fm2_rx, "XDP-RX", "total", pps, drop);
+ }
+
+ /* cpumap enqueue stats */
+ for (to_cpu = 0; to_cpu < MAX_CPUS; to_cpu++) {
+ char *fmt = "%-15s %3d:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n";
+ char *fm2 = "%-15s %3s:%-3d %'-14.0f %'-11.0f %'-10.2f %s\n";
+ char *errstr = "";
+
+ rec = &stats_rec->enq[to_cpu];
+ prev = &stats_prev->enq[to_cpu];
+ t = calc_period(rec, prev);
+ for (i = 0; i < nr_cpus; i++) {
+ struct datarec *r = &rec->cpu[i];
+ struct datarec *p = &prev->cpu[i];
+
+ pps = calc_pps(r, p, t);
+ drop = calc_drop_pps(r, p, t);
+ err = calc_errs_pps(r, p, t);
+ if (err > 0) {
+ errstr = "bulk-average";
+ err = pps / err; /* calc average bulk size */
+ }
+ if (pps > 0)
+ printf(fmt, "cpumap-enqueue",
+ i, to_cpu, pps, drop, err, errstr);
+ }
+ pps = calc_pps(&rec->total, &prev->total, t);
+ if (pps > 0) {
+ drop = calc_drop_pps(&rec->total, &prev->total, t);
+ err = calc_errs_pps(&rec->total, &prev->total, t);
+ if (err > 0) {
+ errstr = "bulk-average";
+ err = pps / err; /* calc average bulk size */
+ }
+ printf(fm2, "cpumap-enqueue",
+ "sum", to_cpu, pps, drop, err, errstr);
+ }
+ }
+
+ /* cpumap kthread stats */
+ {
+ char *fmt_k = "%-15s %-7d %'-14.0f %'-11.0f %'-10.0f %s\n";
+ char *fm2_k = "%-15s %-7s %'-14.0f %'-11.0f %'-10.0f %s\n";
+ char *e_str = "";
+
+ rec = &stats_rec->kthread;
+ prev = &stats_prev->kthread;
+ t = calc_period(rec, prev);
+ for (i = 0; i < nr_cpus; i++) {
+ struct datarec *r = &rec->cpu[i];
+ struct datarec *p = &prev->cpu[i];
+
+ pps = calc_pps(r, p, t);
+ drop = calc_drop_pps(r, p, t);
+ err = calc_errs_pps(r, p, t);
+ if (err > 0)
+ e_str = "sched";
+ if (pps > 0)
+ printf(fmt_k, "cpumap_kthread",
+ i, pps, drop, err, e_str);
+ }
+ pps = calc_pps(&rec->total, &prev->total, t);
+ drop = calc_drop_pps(&rec->total, &prev->total, t);
+ err = calc_errs_pps(&rec->total, &prev->total, t);
+ if (err > 0)
+ e_str = "sched-sum";
+ printf(fm2_k, "cpumap_kthread", "total", pps, drop, err, e_str);
+ }
+
+ /* XDP redirect err tracepoints (very unlikely) */
+ {
+ char *fmt_err = "%-15s %-7d %'-14.0f %'-11.0f\n";
+ char *fm2_err = "%-15s %-7s %'-14.0f %'-11.0f\n";
+
+ rec = &stats_rec->redir_err;
+ prev = &stats_prev->redir_err;
+ t = calc_period(rec, prev);
+ for (i = 0; i < nr_cpus; i++) {
+ struct datarec *r = &rec->cpu[i];
+ struct datarec *p = &prev->cpu[i];
+
+ pps = calc_pps(r, p, t);
+ drop = calc_drop_pps(r, p, t);
+ if (pps > 0)
+ printf(fmt_err, "redirect_err", i, pps, drop);
+ }
+ pps = calc_pps(&rec->total, &prev->total, t);
+ drop = calc_drop_pps(&rec->total, &prev->total, t);
+ printf(fm2_err, "redirect_err", "total", pps, drop);
+ }
+
+ /* XDP general exception tracepoints */
+ {
+ char *fmt_err = "%-15s %-7d %'-14.0f %'-11.0f\n";
+ char *fm2_err = "%-15s %-7s %'-14.0f %'-11.0f\n";
+
+ rec = &stats_rec->exception;
+ prev = &stats_prev->exception;
+ t = calc_period(rec, prev);
+ for (i = 0; i < nr_cpus; i++) {
+ struct datarec *r = &rec->cpu[i];
+ struct datarec *p = &prev->cpu[i];
+
+ pps = calc_pps(r, p, t);
+ drop = calc_drop_pps(r, p, t);
+ if (pps > 0)
+ printf(fmt_err, "xdp_exception", i, pps, drop);
+ }
+ pps = calc_pps(&rec->total, &prev->total, t);
+ drop = calc_drop_pps(&rec->total, &prev->total, t);
+ printf(fm2_err, "xdp_exception", "total", pps, drop);
+ }
+
+ printf("\n");
+ fflush(stdout);
+}
+
+static void stats_collect(struct stats_record *rec)
+{
+ int fd, i;
+
+ fd = map_fd[1]; /* map: rx_cnt */
+ map_collect_percpu(fd, 0, &rec->rx_cnt);
+
+ fd = map_fd[2]; /* map: redirect_err_cnt */
+ map_collect_percpu(fd, 1, &rec->redir_err);
+
+ fd = map_fd[3]; /* map: cpumap_enqueue_cnt */
+ for (i = 0; i < MAX_CPUS; i++)
+ map_collect_percpu(fd, i, &rec->enq[i]);
+
+ fd = map_fd[4]; /* map: cpumap_kthread_cnt */
+ map_collect_percpu(fd, 0, &rec->kthread);
+
+ fd = map_fd[8]; /* map: exception_cnt */
+ map_collect_percpu(fd, 0, &rec->exception);
+}
+
+
+/* Pointer swap trick */
+static inline void swap(struct stats_record **a, struct stats_record **b)
+{
+ struct stats_record *tmp;
+
+ tmp = *a;
+ *a = *b;
+ *b = tmp;
+}
+
+static int create_cpu_entry(__u32 cpu, __u32 queue_size,
+ __u32 avail_idx, bool new)
+{
+ __u32 curr_cpus_count = 0;
+ __u32 key = 0;
+ int ret;
+
+ /* Add a CPU entry to cpumap, as this allocate a cpu entry in
+ * the kernel for the cpu.
+ */
+ ret = bpf_map_update_elem(map_fd[0], &cpu, &queue_size, 0);
+ if (ret) {
+ fprintf(stderr, "Create CPU entry failed (err:%d)\n", ret);
+ exit(EXIT_FAIL_BPF);
+ }
+
+ /* Inform bpf_prog's that a new CPU is available to select
+ * from via some control maps.
+ */
+ /* map_fd[5] = cpus_available */
+ ret = bpf_map_update_elem(map_fd[5], &avail_idx, &cpu, 0);
+ if (ret) {
+ fprintf(stderr, "Add to avail CPUs failed\n");
+ exit(EXIT_FAIL_BPF);
+ }
+
+ /* When not replacing/updating existing entry, bump the count */
+ /* map_fd[6] = cpus_count */
+ ret = bpf_map_lookup_elem(map_fd[6], &key, &curr_cpus_count);
+ if (ret) {
+ fprintf(stderr, "Failed reading curr cpus_count\n");
+ exit(EXIT_FAIL_BPF);
+ }
+ if (new) {
+ curr_cpus_count++;
+ ret = bpf_map_update_elem(map_fd[6], &key, &curr_cpus_count, 0);
+ if (ret) {
+ fprintf(stderr, "Failed write curr cpus_count\n");
+ exit(EXIT_FAIL_BPF);
+ }
+ }
+ /* map_fd[7] = cpus_iterator */
+ printf("%s CPU:%u as idx:%u queue_size:%d (total cpus_count:%u)\n",
+ new ? "Add-new":"Replace", cpu, avail_idx,
+ queue_size, curr_cpus_count);
+
+ return 0;
+}
+
+/* CPUs are zero-indexed. Thus, add a special sentinel default value
+ * in map cpus_available to mark CPU index'es not configured
+ */
+static void mark_cpus_unavailable(void)
+{
+ __u32 invalid_cpu = MAX_CPUS;
+ int ret, i;
+
+ for (i = 0; i < MAX_CPUS; i++) {
+ /* map_fd[5] = cpus_available */
+ ret = bpf_map_update_elem(map_fd[5], &i, &invalid_cpu, 0);
+ if (ret) {
+ fprintf(stderr, "Failed marking CPU unavailable\n");
+ exit(EXIT_FAIL_BPF);
+ }
+ }
+}
+
+/* Stress cpumap management code by concurrently changing underlying cpumap */
+static void stress_cpumap(void)
+{
+ /* Changing qsize will cause kernel to free and alloc a new
+ * bpf_cpu_map_entry, with an associated/complicated tear-down
+ * procedure.
+ */
+ create_cpu_entry(1, 1024, 0, false);
+ create_cpu_entry(1, 128, 0, false);
+ create_cpu_entry(1, 16000, 0, false);
+}
+
+static void stats_poll(int interval, bool use_separators, int prog_num,
+ bool stress_mode)
+{
+ struct stats_record *record, *prev;
+
+ record = alloc_stats_record();
+ prev = alloc_stats_record();
+ stats_collect(record);
+
+ /* Trick to pretty printf with thousands separators use %' */
+ if (use_separators)
+ setlocale(LC_NUMERIC, "en_US");
+
+ while (1) {
+ swap(&prev, &record);
+ stats_collect(record);
+ stats_print(record, prev, prog_num);
+ sleep(interval);
+ if (stress_mode)
+ stress_cpumap();
+ }
+
+ free_stats_record(record);
+ free_stats_record(prev);
+}
+
+int main(int argc, char **argv)
+{
+ struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY};
+ bool use_separators = true;
+ bool stress_mode = false;
+ char filename[256];
+ bool debug = false;
+ int added_cpus = 0;
+ int longindex = 0;
+ int interval = 2;
+ int prog_num = 0;
+ int add_cpu = -1;
+ __u32 qsize;
+ int opt;
+
+ /* Notice: choosing he queue size is very important with the
+ * ixgbe driver, because it's driver page recycling trick is
+ * dependend on pages being returned quickly. The number of
+ * out-standing packets in the system must be less-than 2x
+ * RX-ring size.
+ */
+ qsize = 128+64;
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ perror("setrlimit(RLIMIT_MEMLOCK)");
+ return 1;
+ }
+
+ if (load_bpf_file(filename)) {
+ fprintf(stderr, "ERR in load_bpf_file(): %s", bpf_log_buf);
+ return EXIT_FAIL;
+ }
+
+ if (!prog_fd[0]) {
+ fprintf(stderr, "ERR: load_bpf_file: %s\n", strerror(errno));
+ return EXIT_FAIL;
+ }
+
+ mark_cpus_unavailable();
+
+ /* Parse commands line args */
+ while ((opt = getopt_long(argc, argv, "hSd:",
+ long_options, &longindex)) != -1) {
+ switch (opt) {
+ case 'd':
+ if (strlen(optarg) >= IF_NAMESIZE) {
+ fprintf(stderr, "ERR: --dev name too long\n");
+ goto error;
+ }
+ ifname = (char *)&ifname_buf;
+ strncpy(ifname, optarg, IF_NAMESIZE);
+ ifindex = if_nametoindex(ifname);
+ if (ifindex == 0) {
+ fprintf(stderr,
+ "ERR: --dev name unknown err(%d):%s\n",
+ errno, strerror(errno));
+ goto error;
+ }
+ break;
+ case 's':
+ interval = atoi(optarg);
+ break;
+ case 'S':
+ xdp_flags |= XDP_FLAGS_SKB_MODE;
+ break;
+ case 'D':
+ debug = true;
+ break;
+ case 'x':
+ stress_mode = true;
+ break;
+ case 'z':
+ use_separators = false;
+ break;
+ case 'p':
+ /* Selecting eBPF prog to load */
+ prog_num = atoi(optarg);
+ if (prog_num < 0 || prog_num >= MAX_PROG) {
+ fprintf(stderr,
+ "--prognum too large err(%d):%s\n",
+ errno, strerror(errno));
+ goto error;
+ }
+ break;
+ case 'c':
+ /* Add multiple CPUs */
+ add_cpu = strtoul(optarg, NULL, 0);
+ if (add_cpu >= MAX_CPUS) {
+ fprintf(stderr,
+ "--cpu nr too large for cpumap err(%d):%s\n",
+ errno, strerror(errno));
+ goto error;
+ }
+ create_cpu_entry(add_cpu, qsize, added_cpus, true);
+ added_cpus++;
+ break;
+ case 'q':
+ qsize = atoi(optarg);
+ break;
+ case 'h':
+ error:
+ default:
+ usage(argv);
+ return EXIT_FAIL_OPTION;
+ }
+ }
+ /* Required option */
+ if (ifindex == -1) {
+ fprintf(stderr, "ERR: required option --dev missing\n");
+ usage(argv);
+ return EXIT_FAIL_OPTION;
+ }
+ /* Required option */
+ if (add_cpu == -1) {
+ fprintf(stderr, "ERR: required option --cpu missing\n");
+ fprintf(stderr, " Specify multiple --cpu option to add more\n");
+ usage(argv);
+ return EXIT_FAIL_OPTION;
+ }
+
+ /* Remove XDP program when program is interrupted */
+ signal(SIGINT, int_exit);
+
+ if (bpf_set_link_xdp_fd(ifindex, prog_fd[prog_num], xdp_flags) < 0) {
+ fprintf(stderr, "link set xdp fd failed\n");
+ return EXIT_FAIL_XDP;
+ }
+
+ if (debug) {
+ printf("Debug-mode reading trace pipe (fix #define DEBUG)\n");
+ read_trace_pipe();
+ }
+
+ stats_poll(interval, use_separators, prog_num, stress_mode);
+ return EXIT_OK;
+}
diff --git a/samples/bpf/xdp_redirect_map_user.c b/samples/bpf/xdp_redirect_map_user.c
index d4d86a273fba..7eae07d7293e 100644
--- a/samples/bpf/xdp_redirect_map_user.c
+++ b/samples/bpf/xdp_redirect_map_user.c
@@ -20,6 +20,7 @@
#include <string.h>
#include <unistd.h>
#include <libgen.h>
+#include <sys/resource.h>
#include "bpf_load.h"
#include "bpf_util.h"
@@ -33,9 +34,9 @@ static __u32 xdp_flags;
static void int_exit(int sig)
{
- set_link_xdp_fd(ifindex_in, -1, xdp_flags);
+ bpf_set_link_xdp_fd(ifindex_in, -1, xdp_flags);
if (ifindex_out_xdp_dummy_attached)
- set_link_xdp_fd(ifindex_out, -1, xdp_flags);
+ bpf_set_link_xdp_fd(ifindex_out, -1, xdp_flags);
exit(0);
}
@@ -74,6 +75,7 @@ static void usage(const char *prog)
int main(int argc, char **argv)
{
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
const char *optstr = "SN";
char filename[256];
int ret, opt, key = 0;
@@ -97,6 +99,11 @@ int main(int argc, char **argv)
return 1;
}
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ perror("setrlimit(RLIMIT_MEMLOCK)");
+ return 1;
+ }
+
ifindex_in = strtoul(argv[optind], NULL, 0);
ifindex_out = strtoul(argv[optind + 1], NULL, 0);
printf("input: %d output: %d\n", ifindex_in, ifindex_out);
@@ -113,13 +120,13 @@ int main(int argc, char **argv)
return 1;
}
- if (set_link_xdp_fd(ifindex_in, prog_fd[0], xdp_flags) < 0) {
+ if (bpf_set_link_xdp_fd(ifindex_in, prog_fd[0], xdp_flags) < 0) {
printf("ERROR: link set xdp fd failed on %d\n", ifindex_in);
return 1;
}
/* Loading dummy XDP prog on out-device */
- if (set_link_xdp_fd(ifindex_out, prog_fd[1],
+ if (bpf_set_link_xdp_fd(ifindex_out, prog_fd[1],
(xdp_flags | XDP_FLAGS_UPDATE_IF_NOEXIST)) < 0) {
printf("WARN: link set xdp fd failed on %d\n", ifindex_out);
ifindex_out_xdp_dummy_attached = false;
diff --git a/samples/bpf/xdp_redirect_user.c b/samples/bpf/xdp_redirect_user.c
index 4475d837bf2c..d54e91eb6cbf 100644
--- a/samples/bpf/xdp_redirect_user.c
+++ b/samples/bpf/xdp_redirect_user.c
@@ -33,9 +33,9 @@ static __u32 xdp_flags;
static void int_exit(int sig)
{
- set_link_xdp_fd(ifindex_in, -1, xdp_flags);
+ bpf_set_link_xdp_fd(ifindex_in, -1, xdp_flags);
if (ifindex_out_xdp_dummy_attached)
- set_link_xdp_fd(ifindex_out, -1, xdp_flags);
+ bpf_set_link_xdp_fd(ifindex_out, -1, xdp_flags);
exit(0);
}
@@ -114,13 +114,13 @@ int main(int argc, char **argv)
return 1;
}
- if (set_link_xdp_fd(ifindex_in, prog_fd[0], xdp_flags) < 0) {
+ if (bpf_set_link_xdp_fd(ifindex_in, prog_fd[0], xdp_flags) < 0) {
printf("ERROR: link set xdp fd failed on %d\n", ifindex_in);
return 1;
}
/* Loading dummy XDP prog on out-device */
- if (set_link_xdp_fd(ifindex_out, prog_fd[1],
+ if (bpf_set_link_xdp_fd(ifindex_out, prog_fd[1],
(xdp_flags | XDP_FLAGS_UPDATE_IF_NOEXIST)) < 0) {
printf("WARN: link set xdp fd failed on %d\n", ifindex_out);
ifindex_out_xdp_dummy_attached = false;
diff --git a/samples/bpf/xdp_router_ipv4_kern.c b/samples/bpf/xdp_router_ipv4_kern.c
new file mode 100644
index 000000000000..993f56bc7b9a
--- /dev/null
+++ b/samples/bpf/xdp_router_ipv4_kern.c
@@ -0,0 +1,186 @@
+/* Copyright (C) 2017 Cavium, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ */
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include "bpf_helpers.h"
+#include <linux/slab.h>
+#include <net/ip_fib.h>
+
+struct trie_value {
+ __u8 prefix[4];
+ __be64 value;
+ int ifindex;
+ int metric;
+ __be32 gw;
+};
+
+/* Key for lpm_trie*/
+union key_4 {
+ u32 b32[2];
+ u8 b8[8];
+};
+
+struct arp_entry {
+ __be64 mac;
+ __be32 dst;
+};
+
+struct direct_map {
+ struct arp_entry arp;
+ int ifindex;
+ __be64 mac;
+};
+
+/* Map for trie implementation*/
+struct bpf_map_def SEC("maps") lpm_map = {
+ .type = BPF_MAP_TYPE_LPM_TRIE,
+ .key_size = 8,
+ .value_size = sizeof(struct trie_value),
+ .max_entries = 50,
+ .map_flags = BPF_F_NO_PREALLOC,
+};
+
+/* Map for counter*/
+struct bpf_map_def SEC("maps") rxcnt = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(u64),
+ .max_entries = 256,
+};
+
+/* Map for ARP table*/
+struct bpf_map_def SEC("maps") arp_table = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(__be32),
+ .value_size = sizeof(__be64),
+ .max_entries = 50,
+};
+
+/* Map to keep the exact match entries in the route table*/
+struct bpf_map_def SEC("maps") exact_match = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(__be32),
+ .value_size = sizeof(struct direct_map),
+ .max_entries = 50,
+};
+
+struct bpf_map_def SEC("maps") tx_port = {
+ .type = BPF_MAP_TYPE_DEVMAP,
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .max_entries = 100,
+};
+
+/* Function to set source and destination mac of the packet */
+static inline void set_src_dst_mac(void *data, void *src, void *dst)
+{
+ unsigned short *source = src;
+ unsigned short *dest = dst;
+ unsigned short *p = data;
+
+ __builtin_memcpy(p, dest, 6);
+ __builtin_memcpy(p + 3, source, 6);
+}
+
+/* Parse IPV4 packet to get SRC, DST IP and protocol */
+static inline int parse_ipv4(void *data, u64 nh_off, void *data_end,
+ __be32 *src, __be32 *dest)
+{
+ struct iphdr *iph = data + nh_off;
+
+ if (iph + 1 > data_end)
+ return 0;
+ *src = iph->saddr;
+ *dest = iph->daddr;
+ return iph->protocol;
+}
+
+SEC("xdp_router_ipv4")
+int xdp_router_ipv4_prog(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ __be64 *dest_mac = NULL, *src_mac = NULL;
+ void *data = (void *)(long)ctx->data;
+ struct trie_value *prefix_value;
+ int rc = XDP_DROP, forward_to;
+ struct ethhdr *eth = data;
+ union key_4 key4;
+ long *value;
+ u16 h_proto;
+ u32 ipproto;
+ u64 nh_off;
+
+ nh_off = sizeof(*eth);
+ if (data + nh_off > data_end)
+ return rc;
+
+ h_proto = eth->h_proto;
+
+ if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) {
+ struct vlan_hdr *vhdr;
+
+ vhdr = data + nh_off;
+ nh_off += sizeof(struct vlan_hdr);
+ if (data + nh_off > data_end)
+ return rc;
+ h_proto = vhdr->h_vlan_encapsulated_proto;
+ }
+ if (h_proto == htons(ETH_P_ARP)) {
+ return XDP_PASS;
+ } else if (h_proto == htons(ETH_P_IP)) {
+ struct direct_map *direct_entry;
+ __be32 src_ip = 0, dest_ip = 0;
+
+ ipproto = parse_ipv4(data, nh_off, data_end, &src_ip, &dest_ip);
+ direct_entry = bpf_map_lookup_elem(&exact_match, &dest_ip);
+ /* Check for exact match, this would give a faster lookup*/
+ if (direct_entry && direct_entry->mac && direct_entry->arp.mac) {
+ src_mac = &direct_entry->mac;
+ dest_mac = &direct_entry->arp.mac;
+ forward_to = direct_entry->ifindex;
+ } else {
+ /* Look up in the trie for lpm*/
+ key4.b32[0] = 32;
+ key4.b8[4] = dest_ip & 0xff;
+ key4.b8[5] = (dest_ip >> 8) & 0xff;
+ key4.b8[6] = (dest_ip >> 16) & 0xff;
+ key4.b8[7] = (dest_ip >> 24) & 0xff;
+ prefix_value = bpf_map_lookup_elem(&lpm_map, &key4);
+ if (!prefix_value)
+ return XDP_DROP;
+ src_mac = &prefix_value->value;
+ if (!src_mac)
+ return XDP_DROP;
+ dest_mac = bpf_map_lookup_elem(&arp_table, &dest_ip);
+ if (!dest_mac) {
+ if (!prefix_value->gw)
+ return XDP_DROP;
+ dest_ip = prefix_value->gw;
+ dest_mac = bpf_map_lookup_elem(&arp_table, &dest_ip);
+ }
+ forward_to = prefix_value->ifindex;
+ }
+ } else {
+ ipproto = 0;
+ }
+ if (src_mac && dest_mac) {
+ set_src_dst_mac(data, src_mac, dest_mac);
+ value = bpf_map_lookup_elem(&rxcnt, &ipproto);
+ if (value)
+ *value += 1;
+ return bpf_redirect_map(&tx_port, forward_to, 0);
+ }
+ return rc;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp_router_ipv4_user.c b/samples/bpf/xdp_router_ipv4_user.c
new file mode 100644
index 000000000000..6296741c1fbd
--- /dev/null
+++ b/samples/bpf/xdp_router_ipv4_user.c
@@ -0,0 +1,660 @@
+/* Copyright (C) 2017 Cavium, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ */
+#include <linux/bpf.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h>
+#include "bpf_load.h"
+#include "libbpf.h"
+#include <arpa/inet.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <net/if.h>
+#include <netdb.h>
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
+#include "bpf_util.h"
+
+int sock, sock_arp, flags = 0;
+static int total_ifindex;
+int *ifindex_list;
+char buf[8192];
+
+static int get_route_table(int rtm_family);
+static void int_exit(int sig)
+{
+ int i = 0;
+
+ for (i = 0; i < total_ifindex; i++)
+ bpf_set_link_xdp_fd(ifindex_list[i], -1, flags);
+ exit(0);
+}
+
+static void close_and_exit(int sig)
+{
+ int i = 0;
+
+ close(sock);
+ close(sock_arp);
+
+ for (i = 0; i < total_ifindex; i++)
+ bpf_set_link_xdp_fd(ifindex_list[i], -1, flags);
+ exit(0);
+}
+
+/* Get the mac address of the interface given interface name */
+static __be64 getmac(char *iface)
+{
+ struct ifreq ifr;
+ __be64 mac = 0;
+ int fd, i;
+
+ fd = socket(AF_INET, SOCK_DGRAM, 0);
+ ifr.ifr_addr.sa_family = AF_INET;
+ strncpy(ifr.ifr_name, iface, IFNAMSIZ - 1);
+ if (ioctl(fd, SIOCGIFHWADDR, &ifr) < 0) {
+ printf("ioctl failed leaving....\n");
+ return -1;
+ }
+ for (i = 0; i < 6 ; i++)
+ *((__u8 *)&mac + i) = (__u8)ifr.ifr_hwaddr.sa_data[i];
+ close(fd);
+ return mac;
+}
+
+static int recv_msg(struct sockaddr_nl sock_addr, int sock)
+{
+ struct nlmsghdr *nh;
+ int len, nll = 0;
+ char *buf_ptr;
+
+ buf_ptr = buf;
+ while (1) {
+ len = recv(sock, buf_ptr, sizeof(buf) - nll, 0);
+ if (len < 0)
+ return len;
+
+ nh = (struct nlmsghdr *)buf_ptr;
+
+ if (nh->nlmsg_type == NLMSG_DONE)
+ break;
+ buf_ptr += len;
+ nll += len;
+ if ((sock_addr.nl_groups & RTMGRP_NEIGH) == RTMGRP_NEIGH)
+ break;
+
+ if ((sock_addr.nl_groups & RTMGRP_IPV4_ROUTE) == RTMGRP_IPV4_ROUTE)
+ break;
+ }
+ return nll;
+}
+
+/* Function to parse the route entry returned by netlink
+ * Updates the route entry related map entries
+ */
+static void read_route(struct nlmsghdr *nh, int nll)
+{
+ char dsts[24], gws[24], ifs[16], dsts_len[24], metrics[24];
+ struct bpf_lpm_trie_key *prefix_key;
+ struct rtattr *rt_attr;
+ struct rtmsg *rt_msg;
+ int rtm_family;
+ int rtl;
+ int i;
+ struct route_table {
+ int dst_len, iface, metric;
+ char *iface_name;
+ __be32 dst, gw;
+ __be64 mac;
+ } route;
+ struct arp_table {
+ __be64 mac;
+ __be32 dst;
+ };
+
+ struct direct_map {
+ struct arp_table arp;
+ int ifindex;
+ __be64 mac;
+ } direct_entry;
+
+ if (nh->nlmsg_type == RTM_DELROUTE)
+ printf("DELETING Route entry\n");
+ else if (nh->nlmsg_type == RTM_GETROUTE)
+ printf("READING Route entry\n");
+ else if (nh->nlmsg_type == RTM_NEWROUTE)
+ printf("NEW Route entry\n");
+ else
+ printf("%d\n", nh->nlmsg_type);
+
+ memset(&route, 0, sizeof(route));
+ printf("Destination\t\tGateway\t\tGenmask\t\tMetric\t\tIface\n");
+ for (; NLMSG_OK(nh, nll); nh = NLMSG_NEXT(nh, nll)) {
+ rt_msg = (struct rtmsg *)NLMSG_DATA(nh);
+ rtm_family = rt_msg->rtm_family;
+ if (rtm_family == AF_INET)
+ if (rt_msg->rtm_table != RT_TABLE_MAIN)
+ continue;
+ rt_attr = (struct rtattr *)RTM_RTA(rt_msg);
+ rtl = RTM_PAYLOAD(nh);
+
+ for (; RTA_OK(rt_attr, rtl); rt_attr = RTA_NEXT(rt_attr, rtl)) {
+ switch (rt_attr->rta_type) {
+ case NDA_DST:
+ sprintf(dsts, "%u",
+ (*((__be32 *)RTA_DATA(rt_attr))));
+ break;
+ case RTA_GATEWAY:
+ sprintf(gws, "%u",
+ *((__be32 *)RTA_DATA(rt_attr)));
+ break;
+ case RTA_OIF:
+ sprintf(ifs, "%u",
+ *((int *)RTA_DATA(rt_attr)));
+ break;
+ case RTA_METRICS:
+ sprintf(metrics, "%u",
+ *((int *)RTA_DATA(rt_attr)));
+ default:
+ break;
+ }
+ }
+ sprintf(dsts_len, "%d", rt_msg->rtm_dst_len);
+ route.dst = atoi(dsts);
+ route.dst_len = atoi(dsts_len);
+ route.gw = atoi(gws);
+ route.iface = atoi(ifs);
+ route.metric = atoi(metrics);
+ route.iface_name = alloca(sizeof(char *) * IFNAMSIZ);
+ route.iface_name = if_indextoname(route.iface, route.iface_name);
+ route.mac = getmac(route.iface_name);
+ if (route.mac == -1) {
+ int i = 0;
+
+ for (i = 0; i < total_ifindex; i++)
+ bpf_set_link_xdp_fd(ifindex_list[i], -1, flags);
+ exit(0);
+ }
+ assert(bpf_map_update_elem(map_fd[4], &route.iface, &route.iface, 0) == 0);
+ if (rtm_family == AF_INET) {
+ struct trie_value {
+ __u8 prefix[4];
+ __be64 value;
+ int ifindex;
+ int metric;
+ __be32 gw;
+ } *prefix_value;
+
+ prefix_key = alloca(sizeof(*prefix_key) + 3);
+ prefix_value = alloca(sizeof(*prefix_value));
+
+ prefix_key->prefixlen = 32;
+ prefix_key->prefixlen = route.dst_len;
+ direct_entry.mac = route.mac & 0xffffffffffff;
+ direct_entry.ifindex = route.iface;
+ direct_entry.arp.mac = 0;
+ direct_entry.arp.dst = 0;
+ if (route.dst_len == 32) {
+ if (nh->nlmsg_type == RTM_DELROUTE) {
+ assert(bpf_map_delete_elem(map_fd[3], &route.dst) == 0);
+ } else {
+ if (bpf_map_lookup_elem(map_fd[2], &route.dst, &direct_entry.arp.mac) == 0)
+ direct_entry.arp.dst = route.dst;
+ assert(bpf_map_update_elem(map_fd[3], &route.dst, &direct_entry, 0) == 0);
+ }
+ }
+ for (i = 0; i < 4; i++)
+ prefix_key->data[i] = (route.dst >> i * 8) & 0xff;
+
+ printf("%3d.%d.%d.%d\t\t%3x\t\t%d\t\t%d\t\t%s\n",
+ (int)prefix_key->data[0],
+ (int)prefix_key->data[1],
+ (int)prefix_key->data[2],
+ (int)prefix_key->data[3],
+ route.gw, route.dst_len,
+ route.metric,
+ route.iface_name);
+ if (bpf_map_lookup_elem(map_fd[0], prefix_key,
+ prefix_value) < 0) {
+ for (i = 0; i < 4; i++)
+ prefix_value->prefix[i] = prefix_key->data[i];
+ prefix_value->value = route.mac & 0xffffffffffff;
+ prefix_value->ifindex = route.iface;
+ prefix_value->gw = route.gw;
+ prefix_value->metric = route.metric;
+
+ assert(bpf_map_update_elem(map_fd[0],
+ prefix_key,
+ prefix_value, 0
+ ) == 0);
+ } else {
+ if (nh->nlmsg_type == RTM_DELROUTE) {
+ printf("deleting entry\n");
+ printf("prefix key=%d.%d.%d.%d/%d",
+ prefix_key->data[0],
+ prefix_key->data[1],
+ prefix_key->data[2],
+ prefix_key->data[3],
+ prefix_key->prefixlen);
+ assert(bpf_map_delete_elem(map_fd[0],
+ prefix_key
+ ) == 0);
+ /* Rereading the route table to check if
+ * there is an entry with the same
+ * prefix but a different metric as the
+ * deleted enty.
+ */
+ get_route_table(AF_INET);
+ } else if (prefix_key->data[0] ==
+ prefix_value->prefix[0] &&
+ prefix_key->data[1] ==
+ prefix_value->prefix[1] &&
+ prefix_key->data[2] ==
+ prefix_value->prefix[2] &&
+ prefix_key->data[3] ==
+ prefix_value->prefix[3] &&
+ route.metric >= prefix_value->metric) {
+ continue;
+ } else {
+ for (i = 0; i < 4; i++)
+ prefix_value->prefix[i] =
+ prefix_key->data[i];
+ prefix_value->value =
+ route.mac & 0xffffffffffff;
+ prefix_value->ifindex = route.iface;
+ prefix_value->gw = route.gw;
+ prefix_value->metric = route.metric;
+ assert(bpf_map_update_elem(
+ map_fd[0],
+ prefix_key,
+ prefix_value,
+ 0) == 0);
+ }
+ }
+ }
+ memset(&route, 0, sizeof(route));
+ memset(dsts, 0, sizeof(dsts));
+ memset(dsts_len, 0, sizeof(dsts_len));
+ memset(gws, 0, sizeof(gws));
+ memset(ifs, 0, sizeof(ifs));
+ memset(&route, 0, sizeof(route));
+ }
+}
+
+/* Function to read the existing route table when the process is launched*/
+static int get_route_table(int rtm_family)
+{
+ struct sockaddr_nl sa;
+ struct nlmsghdr *nh;
+ int sock, seq = 0;
+ struct msghdr msg;
+ struct iovec iov;
+ int ret = 0;
+ int nll;
+
+ struct {
+ struct nlmsghdr nl;
+ struct rtmsg rt;
+ char buf[8192];
+ } req;
+
+ sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+ if (sock < 0) {
+ printf("open netlink socket: %s\n", strerror(errno));
+ return -1;
+ }
+ memset(&sa, 0, sizeof(sa));
+ sa.nl_family = AF_NETLINK;
+ if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
+ printf("bind to netlink: %s\n", strerror(errno));
+ ret = -1;
+ goto cleanup;
+ }
+ memset(&req, 0, sizeof(req));
+ req.nl.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
+ req.nl.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+ req.nl.nlmsg_type = RTM_GETROUTE;
+
+ req.rt.rtm_family = rtm_family;
+ req.rt.rtm_table = RT_TABLE_MAIN;
+ req.nl.nlmsg_pid = 0;
+ req.nl.nlmsg_seq = ++seq;
+ memset(&msg, 0, sizeof(msg));
+ iov.iov_base = (void *)&req.nl;
+ iov.iov_len = req.nl.nlmsg_len;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ ret = sendmsg(sock, &msg, 0);
+ if (ret < 0) {
+ printf("send to netlink: %s\n", strerror(errno));
+ ret = -1;
+ goto cleanup;
+ }
+ memset(buf, 0, sizeof(buf));
+ nll = recv_msg(sa, sock);
+ if (nll < 0) {
+ printf("recv from netlink: %s\n", strerror(nll));
+ ret = -1;
+ goto cleanup;
+ }
+ nh = (struct nlmsghdr *)buf;
+ read_route(nh, nll);
+cleanup:
+ close(sock);
+ return ret;
+}
+
+/* Function to parse the arp entry returned by netlink
+ * Updates the arp entry related map entries
+ */
+static void read_arp(struct nlmsghdr *nh, int nll)
+{
+ struct rtattr *rt_attr;
+ char dsts[24], mac[24];
+ struct ndmsg *rt_msg;
+ int rtl, ndm_family;
+
+ struct arp_table {
+ __be64 mac;
+ __be32 dst;
+ } arp_entry;
+ struct direct_map {
+ struct arp_table arp;
+ int ifindex;
+ __be64 mac;
+ } direct_entry;
+
+ if (nh->nlmsg_type == RTM_GETNEIGH)
+ printf("READING arp entry\n");
+ printf("Address\tHwAddress\n");
+ for (; NLMSG_OK(nh, nll); nh = NLMSG_NEXT(nh, nll)) {
+ rt_msg = (struct ndmsg *)NLMSG_DATA(nh);
+ rt_attr = (struct rtattr *)RTM_RTA(rt_msg);
+ ndm_family = rt_msg->ndm_family;
+ rtl = RTM_PAYLOAD(nh);
+ for (; RTA_OK(rt_attr, rtl); rt_attr = RTA_NEXT(rt_attr, rtl)) {
+ switch (rt_attr->rta_type) {
+ case NDA_DST:
+ sprintf(dsts, "%u",
+ *((__be32 *)RTA_DATA(rt_attr)));
+ break;
+ case NDA_LLADDR:
+ sprintf(mac, "%lld",
+ *((__be64 *)RTA_DATA(rt_attr)));
+ break;
+ default:
+ break;
+ }
+ }
+ arp_entry.dst = atoi(dsts);
+ arp_entry.mac = atol(mac);
+ printf("%x\t\t%llx\n", arp_entry.dst, arp_entry.mac);
+ if (ndm_family == AF_INET) {
+ if (bpf_map_lookup_elem(map_fd[3], &arp_entry.dst,
+ &direct_entry) == 0) {
+ if (nh->nlmsg_type == RTM_DELNEIGH) {
+ direct_entry.arp.dst = 0;
+ direct_entry.arp.mac = 0;
+ } else if (nh->nlmsg_type == RTM_NEWNEIGH) {
+ direct_entry.arp.dst = arp_entry.dst;
+ direct_entry.arp.mac = arp_entry.mac;
+ }
+ assert(bpf_map_update_elem(map_fd[3],
+ &arp_entry.dst,
+ &direct_entry, 0
+ ) == 0);
+ memset(&direct_entry, 0, sizeof(direct_entry));
+ }
+ if (nh->nlmsg_type == RTM_DELNEIGH) {
+ assert(bpf_map_delete_elem(map_fd[2], &arp_entry.dst) == 0);
+ } else if (nh->nlmsg_type == RTM_NEWNEIGH) {
+ assert(bpf_map_update_elem(map_fd[2],
+ &arp_entry.dst,
+ &arp_entry.mac, 0
+ ) == 0);
+ }
+ }
+ memset(&arp_entry, 0, sizeof(arp_entry));
+ memset(dsts, 0, sizeof(dsts));
+ }
+}
+
+/* Function to read the existing arp table when the process is launched*/
+static int get_arp_table(int rtm_family)
+{
+ struct sockaddr_nl sa;
+ struct nlmsghdr *nh;
+ int sock, seq = 0;
+ struct msghdr msg;
+ struct iovec iov;
+ int ret = 0;
+ int nll;
+ struct {
+ struct nlmsghdr nl;
+ struct ndmsg rt;
+ char buf[8192];
+ } req;
+
+ sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+ if (sock < 0) {
+ printf("open netlink socket: %s\n", strerror(errno));
+ return -1;
+ }
+ memset(&sa, 0, sizeof(sa));
+ sa.nl_family = AF_NETLINK;
+ if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
+ printf("bind to netlink: %s\n", strerror(errno));
+ ret = -1;
+ goto cleanup;
+ }
+ memset(&req, 0, sizeof(req));
+ req.nl.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
+ req.nl.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+ req.nl.nlmsg_type = RTM_GETNEIGH;
+ req.rt.ndm_state = NUD_REACHABLE;
+ req.rt.ndm_family = rtm_family;
+ req.nl.nlmsg_pid = 0;
+ req.nl.nlmsg_seq = ++seq;
+ memset(&msg, 0, sizeof(msg));
+ iov.iov_base = (void *)&req.nl;
+ iov.iov_len = req.nl.nlmsg_len;
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ ret = sendmsg(sock, &msg, 0);
+ if (ret < 0) {
+ printf("send to netlink: %s\n", strerror(errno));
+ ret = -1;
+ goto cleanup;
+ }
+ memset(buf, 0, sizeof(buf));
+ nll = recv_msg(sa, sock);
+ if (nll < 0) {
+ printf("recv from netlink: %s\n", strerror(nll));
+ ret = -1;
+ goto cleanup;
+ }
+ nh = (struct nlmsghdr *)buf;
+ read_arp(nh, nll);
+cleanup:
+ close(sock);
+ return ret;
+}
+
+/* Function to keep track and update changes in route and arp table
+ * Give regular statistics of packets forwarded
+ */
+static int monitor_route(void)
+{
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ const unsigned int nr_keys = 256;
+ struct pollfd fds_route, fds_arp;
+ __u64 prev[nr_keys][nr_cpus];
+ struct sockaddr_nl la, lr;
+ __u64 values[nr_cpus];
+ struct nlmsghdr *nh;
+ int nll, ret = 0;
+ int interval = 5;
+ __u32 key;
+ int i;
+
+ sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+ if (sock < 0) {
+ printf("open netlink socket: %s\n", strerror(errno));
+ return -1;
+ }
+
+ fcntl(sock, F_SETFL, O_NONBLOCK);
+ memset(&lr, 0, sizeof(lr));
+ lr.nl_family = AF_NETLINK;
+ lr.nl_groups = RTMGRP_IPV6_ROUTE | RTMGRP_IPV4_ROUTE | RTMGRP_NOTIFY;
+ if (bind(sock, (struct sockaddr *)&lr, sizeof(lr)) < 0) {
+ printf("bind to netlink: %s\n", strerror(errno));
+ ret = -1;
+ goto cleanup;
+ }
+ fds_route.fd = sock;
+ fds_route.events = POLL_IN;
+
+ sock_arp = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+ if (sock_arp < 0) {
+ printf("open netlink socket: %s\n", strerror(errno));
+ return -1;
+ }
+
+ fcntl(sock_arp, F_SETFL, O_NONBLOCK);
+ memset(&la, 0, sizeof(la));
+ la.nl_family = AF_NETLINK;
+ la.nl_groups = RTMGRP_NEIGH | RTMGRP_NOTIFY;
+ if (bind(sock_arp, (struct sockaddr *)&la, sizeof(la)) < 0) {
+ printf("bind to netlink: %s\n", strerror(errno));
+ ret = -1;
+ goto cleanup;
+ }
+ fds_arp.fd = sock_arp;
+ fds_arp.events = POLL_IN;
+
+ memset(prev, 0, sizeof(prev));
+ do {
+ signal(SIGINT, close_and_exit);
+ signal(SIGTERM, close_and_exit);
+
+ sleep(interval);
+ for (key = 0; key < nr_keys; key++) {
+ __u64 sum = 0;
+
+ assert(bpf_map_lookup_elem(map_fd[1], &key, values) == 0);
+ for (i = 0; i < nr_cpus; i++)
+ sum += (values[i] - prev[key][i]);
+ if (sum)
+ printf("proto %u: %10llu pkt/s\n",
+ key, sum / interval);
+ memcpy(prev[key], values, sizeof(values));
+ }
+
+ memset(buf, 0, sizeof(buf));
+ if (poll(&fds_route, 1, 3) == POLL_IN) {
+ nll = recv_msg(lr, sock);
+ if (nll < 0) {
+ printf("recv from netlink: %s\n", strerror(nll));
+ ret = -1;
+ goto cleanup;
+ }
+
+ nh = (struct nlmsghdr *)buf;
+ printf("Routing table updated.\n");
+ read_route(nh, nll);
+ }
+ memset(buf, 0, sizeof(buf));
+ if (poll(&fds_arp, 1, 3) == POLL_IN) {
+ nll = recv_msg(la, sock_arp);
+ if (nll < 0) {
+ printf("recv from netlink: %s\n", strerror(nll));
+ ret = -1;
+ goto cleanup;
+ }
+
+ nh = (struct nlmsghdr *)buf;
+ read_arp(nh, nll);
+ }
+
+ } while (1);
+cleanup:
+ close(sock);
+ return ret;
+}
+
+int main(int ac, char **argv)
+{
+ char filename[256];
+ char **ifname_list;
+ int i = 1;
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ if (ac < 2) {
+ printf("usage: %s [-S] Interface name list\n", argv[0]);
+ return 1;
+ }
+ if (!strcmp(argv[1], "-S")) {
+ flags = XDP_FLAGS_SKB_MODE;
+ total_ifindex = ac - 2;
+ ifname_list = (argv + 2);
+ } else {
+ flags = 0;
+ total_ifindex = ac - 1;
+ ifname_list = (argv + 1);
+ }
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+ printf("\n**************loading bpf file*********************\n\n\n");
+ if (!prog_fd[0]) {
+ printf("load_bpf_file: %s\n", strerror(errno));
+ return 1;
+ }
+ ifindex_list = (int *)malloc(total_ifindex * sizeof(int *));
+ for (i = 0; i < total_ifindex; i++) {
+ ifindex_list[i] = if_nametoindex(ifname_list[i]);
+ if (!ifindex_list[i]) {
+ printf("Couldn't translate interface name: %s",
+ strerror(errno));
+ return 1;
+ }
+ }
+ for (i = 0; i < total_ifindex; i++) {
+ if (bpf_set_link_xdp_fd(ifindex_list[i], prog_fd[0], flags) < 0) {
+ printf("link set xdp fd failed\n");
+ int recovery_index = i;
+
+ for (i = 0; i < recovery_index; i++)
+ bpf_set_link_xdp_fd(ifindex_list[i], -1, flags);
+
+ return 1;
+ }
+ printf("Attached to %d\n", ifindex_list[i]);
+ }
+ signal(SIGINT, int_exit);
+ signal(SIGTERM, int_exit);
+
+ printf("*******************ROUTE TABLE*************************\n\n\n");
+ get_route_table(AF_INET);
+ printf("*******************ARP TABLE***************************\n\n\n");
+ get_arp_table(AF_INET);
+ if (monitor_route() < 0) {
+ printf("Error in receiving route update");
+ return 1;
+ }
+
+ return 0;
+}
diff --git a/samples/bpf/xdp_rxq_info_kern.c b/samples/bpf/xdp_rxq_info_kern.c
new file mode 100644
index 000000000000..3fd209291653
--- /dev/null
+++ b/samples/bpf/xdp_rxq_info_kern.c
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
+ *
+ * Example howto extract XDP RX-queue info
+ */
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+/* Config setup from with userspace
+ *
+ * User-side setup ifindex in config_map, to verify that
+ * ctx->ingress_ifindex is correct (against configured ifindex)
+ */
+struct config {
+ __u32 action;
+ int ifindex;
+};
+struct bpf_map_def SEC("maps") config_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(struct config),
+ .max_entries = 1,
+};
+
+/* Common stats data record (shared with userspace) */
+struct datarec {
+ __u64 processed;
+ __u64 issue;
+};
+
+struct bpf_map_def SEC("maps") stats_global_map = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(struct datarec),
+ .max_entries = 1,
+};
+
+#define MAX_RXQs 64
+
+/* Stats per rx_queue_index (per CPU) */
+struct bpf_map_def SEC("maps") rx_queue_index_map = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(struct datarec),
+ .max_entries = MAX_RXQs + 1,
+};
+
+SEC("xdp_prog0")
+int xdp_prognum0(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct datarec *rec, *rxq_rec;
+ int ingress_ifindex;
+ struct config *config;
+ u32 key = 0;
+
+ /* Global stats record */
+ rec = bpf_map_lookup_elem(&stats_global_map, &key);
+ if (!rec)
+ return XDP_ABORTED;
+ rec->processed++;
+
+ /* Accessing ctx->ingress_ifindex, cause BPF to rewrite BPF
+ * instructions inside kernel to access xdp_rxq->dev->ifindex
+ */
+ ingress_ifindex = ctx->ingress_ifindex;
+
+ config = bpf_map_lookup_elem(&config_map, &key);
+ if (!config)
+ return XDP_ABORTED;
+
+ /* Simple test: check ctx provided ifindex is as expected */
+ if (ingress_ifindex != config->ifindex) {
+ /* count this error case */
+ rec->issue++;
+ return XDP_ABORTED;
+ }
+
+ /* Update stats per rx_queue_index. Handle if rx_queue_index
+ * is larger than stats map can contain info for.
+ */
+ key = ctx->rx_queue_index;
+ if (key >= MAX_RXQs)
+ key = MAX_RXQs;
+ rxq_rec = bpf_map_lookup_elem(&rx_queue_index_map, &key);
+ if (!rxq_rec)
+ return XDP_ABORTED;
+ rxq_rec->processed++;
+ if (key == MAX_RXQs)
+ rxq_rec->issue++;
+
+ return config->action;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp_rxq_info_user.c b/samples/bpf/xdp_rxq_info_user.c
new file mode 100644
index 000000000000..478d95412de4
--- /dev/null
+++ b/samples/bpf/xdp_rxq_info_user.c
@@ -0,0 +1,531 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
+ */
+static const char *__doc__ = " XDP RX-queue info extract example\n\n"
+ "Monitor how many packets per sec (pps) are received\n"
+ "per NIC RX queue index and which CPU processed the packet\n"
+ ;
+
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <locale.h>
+#include <sys/resource.h>
+#include <getopt.h>
+#include <net/if.h>
+#include <time.h>
+
+#include <arpa/inet.h>
+#include <linux/if_link.h>
+
+#include "libbpf.h"
+#include "bpf_load.h"
+#include "bpf_util.h"
+
+static int ifindex = -1;
+static char ifname_buf[IF_NAMESIZE];
+static char *ifname;
+
+static __u32 xdp_flags;
+
+/* Exit return codes */
+#define EXIT_OK 0
+#define EXIT_FAIL 1
+#define EXIT_FAIL_OPTION 2
+#define EXIT_FAIL_XDP 3
+#define EXIT_FAIL_BPF 4
+#define EXIT_FAIL_MEM 5
+
+static const struct option long_options[] = {
+ {"help", no_argument, NULL, 'h' },
+ {"dev", required_argument, NULL, 'd' },
+ {"skb-mode", no_argument, NULL, 'S' },
+ {"sec", required_argument, NULL, 's' },
+ {"no-separators", no_argument, NULL, 'z' },
+ {"action", required_argument, NULL, 'a' },
+ {0, 0, NULL, 0 }
+};
+
+static void int_exit(int sig)
+{
+ fprintf(stderr,
+ "Interrupted: Removing XDP program on ifindex:%d device:%s\n",
+ ifindex, ifname);
+ if (ifindex > -1)
+ bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
+ exit(EXIT_OK);
+}
+
+struct config {
+ __u32 action;
+ int ifindex;
+};
+#define XDP_ACTION_MAX (XDP_TX + 1)
+#define XDP_ACTION_MAX_STRLEN 11
+static const char *xdp_action_names[XDP_ACTION_MAX] = {
+ [XDP_ABORTED] = "XDP_ABORTED",
+ [XDP_DROP] = "XDP_DROP",
+ [XDP_PASS] = "XDP_PASS",
+ [XDP_TX] = "XDP_TX",
+};
+
+static const char *action2str(int action)
+{
+ if (action < XDP_ACTION_MAX)
+ return xdp_action_names[action];
+ return NULL;
+}
+
+static int parse_xdp_action(char *action_str)
+{
+ size_t maxlen;
+ __u64 action = -1;
+ int i;
+
+ for (i = 0; i < XDP_ACTION_MAX; i++) {
+ maxlen = XDP_ACTION_MAX_STRLEN;
+ if (strncmp(xdp_action_names[i], action_str, maxlen) == 0) {
+ action = i;
+ break;
+ }
+ }
+ return action;
+}
+
+static void list_xdp_actions(void)
+{
+ int i;
+
+ printf("Available XDP --action <options>\n");
+ for (i = 0; i < XDP_ACTION_MAX; i++)
+ printf("\t%s\n", xdp_action_names[i]);
+ printf("\n");
+}
+
+static void usage(char *argv[])
+{
+ int i;
+
+ printf("\nDOCUMENTATION:\n%s\n", __doc__);
+ printf(" Usage: %s (options-see-below)\n", argv[0]);
+ printf(" Listing options:\n");
+ for (i = 0; long_options[i].name != 0; i++) {
+ printf(" --%-12s", long_options[i].name);
+ if (long_options[i].flag != NULL)
+ printf(" flag (internal value:%d)",
+ *long_options[i].flag);
+ else
+ printf(" short-option: -%c",
+ long_options[i].val);
+ printf("\n");
+ }
+ printf("\n");
+ list_xdp_actions();
+}
+
+#define NANOSEC_PER_SEC 1000000000 /* 10^9 */
+static __u64 gettime(void)
+{
+ struct timespec t;
+ int res;
+
+ res = clock_gettime(CLOCK_MONOTONIC, &t);
+ if (res < 0) {
+ fprintf(stderr, "Error with gettimeofday! (%i)\n", res);
+ exit(EXIT_FAIL);
+ }
+ return (__u64) t.tv_sec * NANOSEC_PER_SEC + t.tv_nsec;
+}
+
+/* Common stats data record shared with _kern.c */
+struct datarec {
+ __u64 processed;
+ __u64 issue;
+};
+struct record {
+ __u64 timestamp;
+ struct datarec total;
+ struct datarec *cpu;
+};
+struct stats_record {
+ struct record stats;
+ struct record *rxq;
+};
+
+static struct datarec *alloc_record_per_cpu(void)
+{
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ struct datarec *array;
+ size_t size;
+
+ size = sizeof(struct datarec) * nr_cpus;
+ array = malloc(size);
+ memset(array, 0, size);
+ if (!array) {
+ fprintf(stderr, "Mem alloc error (nr_cpus:%u)\n", nr_cpus);
+ exit(EXIT_FAIL_MEM);
+ }
+ return array;
+}
+
+static struct record *alloc_record_per_rxq(void)
+{
+ unsigned int nr_rxqs = map_data[2].def.max_entries;
+ struct record *array;
+ size_t size;
+
+ size = sizeof(struct record) * nr_rxqs;
+ array = malloc(size);
+ memset(array, 0, size);
+ if (!array) {
+ fprintf(stderr, "Mem alloc error (nr_rxqs:%u)\n", nr_rxqs);
+ exit(EXIT_FAIL_MEM);
+ }
+ return array;
+}
+
+static struct stats_record *alloc_stats_record(void)
+{
+ unsigned int nr_rxqs = map_data[2].def.max_entries;
+ struct stats_record *rec;
+ int i;
+
+ rec = malloc(sizeof(*rec));
+ memset(rec, 0, sizeof(*rec));
+ if (!rec) {
+ fprintf(stderr, "Mem alloc error\n");
+ exit(EXIT_FAIL_MEM);
+ }
+ rec->rxq = alloc_record_per_rxq();
+ for (i = 0; i < nr_rxqs; i++)
+ rec->rxq[i].cpu = alloc_record_per_cpu();
+
+ rec->stats.cpu = alloc_record_per_cpu();
+ return rec;
+}
+
+static void free_stats_record(struct stats_record *r)
+{
+ unsigned int nr_rxqs = map_data[2].def.max_entries;
+ int i;
+
+ for (i = 0; i < nr_rxqs; i++)
+ free(r->rxq[i].cpu);
+
+ free(r->rxq);
+ free(r->stats.cpu);
+ free(r);
+}
+
+static bool map_collect_percpu(int fd, __u32 key, struct record *rec)
+{
+ /* For percpu maps, userspace gets a value per possible CPU */
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ struct datarec values[nr_cpus];
+ __u64 sum_processed = 0;
+ __u64 sum_issue = 0;
+ int i;
+
+ if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
+ fprintf(stderr,
+ "ERR: bpf_map_lookup_elem failed key:0x%X\n", key);
+ return false;
+ }
+ /* Get time as close as possible to reading map contents */
+ rec->timestamp = gettime();
+
+ /* Record and sum values from each CPU */
+ for (i = 0; i < nr_cpus; i++) {
+ rec->cpu[i].processed = values[i].processed;
+ sum_processed += values[i].processed;
+ rec->cpu[i].issue = values[i].issue;
+ sum_issue += values[i].issue;
+ }
+ rec->total.processed = sum_processed;
+ rec->total.issue = sum_issue;
+ return true;
+}
+
+static void stats_collect(struct stats_record *rec)
+{
+ int fd, i, max_rxqs;
+
+ fd = map_data[1].fd; /* map: stats_global_map */
+ map_collect_percpu(fd, 0, &rec->stats);
+
+ fd = map_data[2].fd; /* map: rx_queue_index_map */
+ max_rxqs = map_data[2].def.max_entries;
+ for (i = 0; i < max_rxqs; i++)
+ map_collect_percpu(fd, i, &rec->rxq[i]);
+}
+
+static double calc_period(struct record *r, struct record *p)
+{
+ double period_ = 0;
+ __u64 period = 0;
+
+ period = r->timestamp - p->timestamp;
+ if (period > 0)
+ period_ = ((double) period / NANOSEC_PER_SEC);
+
+ return period_;
+}
+
+static __u64 calc_pps(struct datarec *r, struct datarec *p, double period_)
+{
+ __u64 packets = 0;
+ __u64 pps = 0;
+
+ if (period_ > 0) {
+ packets = r->processed - p->processed;
+ pps = packets / period_;
+ }
+ return pps;
+}
+
+static __u64 calc_errs_pps(struct datarec *r,
+ struct datarec *p, double period_)
+{
+ __u64 packets = 0;
+ __u64 pps = 0;
+
+ if (period_ > 0) {
+ packets = r->issue - p->issue;
+ pps = packets / period_;
+ }
+ return pps;
+}
+
+static void stats_print(struct stats_record *stats_rec,
+ struct stats_record *stats_prev,
+ int action)
+{
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ unsigned int nr_rxqs = map_data[2].def.max_entries;
+ double pps = 0, err = 0;
+ struct record *rec, *prev;
+ double t;
+ int rxq;
+ int i;
+
+ /* Header */
+ printf("\nRunning XDP on dev:%s (ifindex:%d) action:%s\n",
+ ifname, ifindex, action2str(action));
+
+ /* stats_global_map */
+ {
+ char *fmt_rx = "%-15s %-7d %'-11.0f %'-10.0f %s\n";
+ char *fm2_rx = "%-15s %-7s %'-11.0f\n";
+ char *errstr = "";
+
+ printf("%-15s %-7s %-11s %-11s\n",
+ "XDP stats", "CPU", "pps", "issue-pps");
+
+ rec = &stats_rec->stats;
+ prev = &stats_prev->stats;
+ t = calc_period(rec, prev);
+ for (i = 0; i < nr_cpus; i++) {
+ struct datarec *r = &rec->cpu[i];
+ struct datarec *p = &prev->cpu[i];
+
+ pps = calc_pps (r, p, t);
+ err = calc_errs_pps(r, p, t);
+ if (err > 0)
+ errstr = "invalid-ifindex";
+ if (pps > 0)
+ printf(fmt_rx, "XDP-RX CPU",
+ i, pps, err, errstr);
+ }
+ pps = calc_pps (&rec->total, &prev->total, t);
+ err = calc_errs_pps(&rec->total, &prev->total, t);
+ printf(fm2_rx, "XDP-RX CPU", "total", pps, err);
+ }
+
+ /* rx_queue_index_map */
+ printf("\n%-15s %-7s %-11s %-11s\n",
+ "RXQ stats", "RXQ:CPU", "pps", "issue-pps");
+
+ for (rxq = 0; rxq < nr_rxqs; rxq++) {
+ char *fmt_rx = "%-15s %3d:%-3d %'-11.0f %'-10.0f %s\n";
+ char *fm2_rx = "%-15s %3d:%-3s %'-11.0f\n";
+ char *errstr = "";
+ int rxq_ = rxq;
+
+ /* Last RXQ in map catch overflows */
+ if (rxq_ == nr_rxqs - 1)
+ rxq_ = -1;
+
+ rec = &stats_rec->rxq[rxq];
+ prev = &stats_prev->rxq[rxq];
+ t = calc_period(rec, prev);
+ for (i = 0; i < nr_cpus; i++) {
+ struct datarec *r = &rec->cpu[i];
+ struct datarec *p = &prev->cpu[i];
+
+ pps = calc_pps (r, p, t);
+ err = calc_errs_pps(r, p, t);
+ if (err > 0) {
+ if (rxq_ == -1)
+ errstr = "map-overflow-RXQ";
+ else
+ errstr = "err";
+ }
+ if (pps > 0)
+ printf(fmt_rx, "rx_queue_index",
+ rxq_, i, pps, err, errstr);
+ }
+ pps = calc_pps (&rec->total, &prev->total, t);
+ err = calc_errs_pps(&rec->total, &prev->total, t);
+ if (pps || err)
+ printf(fm2_rx, "rx_queue_index", rxq_, "sum", pps, err);
+ }
+}
+
+
+/* Pointer swap trick */
+static inline void swap(struct stats_record **a, struct stats_record **b)
+{
+ struct stats_record *tmp;
+
+ tmp = *a;
+ *a = *b;
+ *b = tmp;
+}
+
+static void stats_poll(int interval, int action)
+{
+ struct stats_record *record, *prev;
+
+ record = alloc_stats_record();
+ prev = alloc_stats_record();
+ stats_collect(record);
+
+ while (1) {
+ swap(&prev, &record);
+ stats_collect(record);
+ stats_print(record, prev, action);
+ sleep(interval);
+ }
+
+ free_stats_record(record);
+ free_stats_record(prev);
+}
+
+
+int main(int argc, char **argv)
+{
+ struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY};
+ bool use_separators = true;
+ struct config cfg = { 0 };
+ char filename[256];
+ int longindex = 0;
+ int interval = 2;
+ __u32 key = 0;
+ int opt, err;
+
+ char action_str_buf[XDP_ACTION_MAX_STRLEN + 1 /* for \0 */] = { 0 };
+ int action = XDP_PASS; /* Default action */
+ char *action_str = NULL;
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ perror("setrlimit(RLIMIT_MEMLOCK)");
+ return 1;
+ }
+
+ if (load_bpf_file(filename)) {
+ fprintf(stderr, "ERR in load_bpf_file(): %s", bpf_log_buf);
+ return EXIT_FAIL;
+ }
+
+ if (!prog_fd[0]) {
+ fprintf(stderr, "ERR: load_bpf_file: %s\n", strerror(errno));
+ return EXIT_FAIL;
+ }
+
+ /* Parse commands line args */
+ while ((opt = getopt_long(argc, argv, "hSd:",
+ long_options, &longindex)) != -1) {
+ switch (opt) {
+ case 'd':
+ if (strlen(optarg) >= IF_NAMESIZE) {
+ fprintf(stderr, "ERR: --dev name too long\n");
+ goto error;
+ }
+ ifname = (char *)&ifname_buf;
+ strncpy(ifname, optarg, IF_NAMESIZE);
+ ifindex = if_nametoindex(ifname);
+ if (ifindex == 0) {
+ fprintf(stderr,
+ "ERR: --dev name unknown err(%d):%s\n",
+ errno, strerror(errno));
+ goto error;
+ }
+ break;
+ case 's':
+ interval = atoi(optarg);
+ break;
+ case 'S':
+ xdp_flags |= XDP_FLAGS_SKB_MODE;
+ break;
+ case 'z':
+ use_separators = false;
+ break;
+ case 'a':
+ action_str = (char *)&action_str_buf;
+ strncpy(action_str, optarg, XDP_ACTION_MAX_STRLEN);
+ break;
+ case 'h':
+ error:
+ default:
+ usage(argv);
+ return EXIT_FAIL_OPTION;
+ }
+ }
+ /* Required option */
+ if (ifindex == -1) {
+ fprintf(stderr, "ERR: required option --dev missing\n");
+ usage(argv);
+ return EXIT_FAIL_OPTION;
+ }
+ cfg.ifindex = ifindex;
+
+ /* Parse action string */
+ if (action_str) {
+ action = parse_xdp_action(action_str);
+ if (action < 0) {
+ fprintf(stderr, "ERR: Invalid XDP --action: %s\n",
+ action_str);
+ list_xdp_actions();
+ return EXIT_FAIL_OPTION;
+ }
+ }
+ cfg.action = action;
+
+ /* Trick to pretty printf with thousands separators use %' */
+ if (use_separators)
+ setlocale(LC_NUMERIC, "en_US");
+
+ /* User-side setup ifindex in config_map */
+ err = bpf_map_update_elem(map_fd[0], &key, &cfg, 0);
+ if (err) {
+ fprintf(stderr, "Store config failed (err:%d)\n", err);
+ exit(EXIT_FAIL_BPF);
+ }
+
+ /* Remove XDP program when program is interrupted */
+ signal(SIGINT, int_exit);
+
+ if (bpf_set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) {
+ fprintf(stderr, "link set xdp fd failed\n");
+ return EXIT_FAIL_XDP;
+ }
+
+ stats_poll(interval, action);
+ return EXIT_OK;
+}
diff --git a/samples/bpf/xdp_tx_iptunnel_user.c b/samples/bpf/xdp_tx_iptunnel_user.c
index 715cd12eaca5..f0a787268a87 100644
--- a/samples/bpf/xdp_tx_iptunnel_user.c
+++ b/samples/bpf/xdp_tx_iptunnel_user.c
@@ -30,7 +30,7 @@ static __u32 xdp_flags = 0;
static void int_exit(int sig)
{
if (ifindex > -1)
- set_link_xdp_fd(ifindex, -1, xdp_flags);
+ bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
exit(0);
}
@@ -254,14 +254,14 @@ int main(int argc, char **argv)
}
}
- if (set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) {
+ if (bpf_set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) {
printf("link set xdp fd failed\n");
return 1;
}
poll_stats(kill_after_s);
- set_link_xdp_fd(ifindex, -1, xdp_flags);
+ bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
return 0;
}
diff --git a/samples/configfs/configfs_sample.c b/samples/configfs/configfs_sample.c
index 1ea33119e532..004a4e201476 100644
--- a/samples/configfs/configfs_sample.c
+++ b/samples/configfs/configfs_sample.c
@@ -115,7 +115,7 @@ static struct configfs_attribute *childless_attrs[] = {
NULL,
};
-static struct config_item_type childless_type = {
+static const struct config_item_type childless_type = {
.ct_attrs = childless_attrs,
.ct_owner = THIS_MODULE,
};
@@ -193,7 +193,7 @@ static struct configfs_item_operations simple_child_item_ops = {
.release = simple_child_release,
};
-static struct config_item_type simple_child_type = {
+static const struct config_item_type simple_child_type = {
.ct_item_ops = &simple_child_item_ops,
.ct_attrs = simple_child_attrs,
.ct_owner = THIS_MODULE,
@@ -261,7 +261,7 @@ static struct configfs_group_operations simple_children_group_ops = {
.make_item = simple_children_make_item,
};
-static struct config_item_type simple_children_type = {
+static const struct config_item_type simple_children_type = {
.ct_item_ops = &simple_children_item_ops,
.ct_group_ops = &simple_children_group_ops,
.ct_attrs = simple_children_attrs,
@@ -331,7 +331,7 @@ static struct configfs_group_operations group_children_group_ops = {
.make_group = group_children_make_group,
};
-static struct config_item_type group_children_type = {
+static const struct config_item_type group_children_type = {
.ct_group_ops = &group_children_group_ops,
.ct_attrs = group_children_attrs,
.ct_owner = THIS_MODULE,
diff --git a/samples/connector/cn_test.c b/samples/connector/cn_test.c
index d12cc944b696..95cd06f4ec1e 100644
--- a/samples/connector/cn_test.c
+++ b/samples/connector/cn_test.c
@@ -125,12 +125,12 @@ static int cn_test_want_notify(void)
#endif
static u32 cn_test_timer_counter;
-static void cn_test_timer_func(unsigned long __data)
+static void cn_test_timer_func(struct timer_list *unused)
{
struct cn_msg *m;
char data[32];
- pr_debug("%s: timer fired with data %lu\n", __func__, __data);
+ pr_debug("%s: timer fired\n", __func__);
m = kzalloc(sizeof(*m) + sizeof(data), GFP_ATOMIC);
if (m) {
@@ -168,7 +168,7 @@ static int cn_test_init(void)
goto err_out;
}
- setup_timer(&cn_test_timer, cn_test_timer_func, 0);
+ timer_setup(&cn_test_timer, cn_test_timer_func, 0);
mod_timer(&cn_test_timer, jiffies + msecs_to_jiffies(1000));
pr_info("initialized with id={%u.%u}\n",
diff --git a/samples/hidraw/Makefile b/samples/hidraw/Makefile
index f5c3012ffa79..dec1b22adf54 100644
--- a/samples/hidraw/Makefile
+++ b/samples/hidraw/Makefile
@@ -1,7 +1,4 @@
# SPDX-License-Identifier: GPL-2.0
-# kbuild trick to avoid linker error. Can be omitted if a module is built.
-obj- := dummy.o
-
# List of programs to build
hostprogs-y := hid-example
diff --git a/samples/kobject/kobject-example.c b/samples/kobject/kobject-example.c
index 2e0740f06cd7..9e383fdbaa00 100644
--- a/samples/kobject/kobject-example.c
+++ b/samples/kobject/kobject-example.c
@@ -1,11 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Sample kobject implementation
*
* Copyright (C) 2004-2007 Greg Kroah-Hartman <greg@kroah.com>
* Copyright (C) 2007 Novell Inc.
- *
- * Released under the GPL version 2 only.
- *
*/
#include <linux/kobject.h>
#include <linux/string.h>
diff --git a/samples/kobject/kset-example.c b/samples/kobject/kset-example.c
index a55bff52bde3..401328fd687d 100644
--- a/samples/kobject/kset-example.c
+++ b/samples/kobject/kset-example.c
@@ -1,11 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Sample kset and ktype implementation
*
* Copyright (C) 2004-2007 Greg Kroah-Hartman <greg@kroah.com>
* Copyright (C) 2007 Novell Inc.
- *
- * Released under the GPL version 2 only.
- *
*/
#include <linux/kobject.h>
#include <linux/string.h>
diff --git a/samples/kprobes/Makefile b/samples/kprobes/Makefile
index 68739bc4fc6a..880e54d2c082 100644
--- a/samples/kprobes/Makefile
+++ b/samples/kprobes/Makefile
@@ -1,5 +1,5 @@
# builds the kprobes example kernel modules;
# then to use one (as root): insmod <module_name.ko>
-obj-$(CONFIG_SAMPLE_KPROBES) += kprobe_example.o jprobe_example.o
+obj-$(CONFIG_SAMPLE_KPROBES) += kprobe_example.o
obj-$(CONFIG_SAMPLE_KRETPROBES) += kretprobe_example.o
diff --git a/samples/kprobes/jprobe_example.c b/samples/kprobes/jprobe_example.c
deleted file mode 100644
index e3c0a40909f7..000000000000
--- a/samples/kprobes/jprobe_example.c
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Here's a sample kernel module showing the use of jprobes to dump
- * the arguments of _do_fork().
- *
- * For more information on theory of operation of jprobes, see
- * Documentation/kprobes.txt
- *
- * Build and insert the kernel module as done in the kprobe example.
- * You will see the trace data in /var/log/messages and on the
- * console whenever _do_fork() is invoked to create a new process.
- * (Some messages may be suppressed if syslogd is configured to
- * eliminate duplicate messages.)
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/kprobes.h>
-
-/*
- * Jumper probe for _do_fork.
- * Mirror principle enables access to arguments of the probed routine
- * from the probe handler.
- */
-
-/* Proxy routine having the same arguments as actual _do_fork() routine */
-static long j_do_fork(unsigned long clone_flags, unsigned long stack_start,
- unsigned long stack_size, int __user *parent_tidptr,
- int __user *child_tidptr, unsigned long tls)
-{
- pr_info("jprobe: clone_flags = 0x%lx, stack_start = 0x%lx "
- "stack_size = 0x%lx\n", clone_flags, stack_start, stack_size);
-
- /* Always end with a call to jprobe_return(). */
- jprobe_return();
- return 0;
-}
-
-static struct jprobe my_jprobe = {
- .entry = j_do_fork,
- .kp = {
- .symbol_name = "_do_fork",
- },
-};
-
-static int __init jprobe_init(void)
-{
- int ret;
-
- ret = register_jprobe(&my_jprobe);
- if (ret < 0) {
- pr_err("register_jprobe failed, returned %d\n", ret);
- return -1;
- }
- pr_info("Planted jprobe at %p, handler addr %p\n",
- my_jprobe.kp.addr, my_jprobe.entry);
- return 0;
-}
-
-static void __exit jprobe_exit(void)
-{
- unregister_jprobe(&my_jprobe);
- pr_info("jprobe at %p unregistered\n", my_jprobe.kp.addr);
-}
-
-module_init(jprobe_init)
-module_exit(jprobe_exit)
-MODULE_LICENSE("GPL");
diff --git a/samples/kprobes/kprobe_example.c b/samples/kprobes/kprobe_example.c
index 88b3e2d227ae..67de3b774bc9 100644
--- a/samples/kprobes/kprobe_example.c
+++ b/samples/kprobes/kprobe_example.c
@@ -47,6 +47,10 @@ static int handler_pre(struct kprobe *p, struct pt_regs *regs)
" pstate = 0x%lx\n",
p->symbol_name, p->addr, (long)regs->pc, (long)regs->pstate);
#endif
+#ifdef CONFIG_S390
+ pr_info("<%s> pre_handler: p->addr, 0x%p, ip = 0x%lx, flags = 0x%lx\n",
+ p->symbol_name, p->addr, regs->psw.addr, regs->flags);
+#endif
/* A dump_stack() here will give a stack backtrace */
return 0;
@@ -76,6 +80,10 @@ static void handler_post(struct kprobe *p, struct pt_regs *regs,
pr_info("<%s> post_handler: p->addr = 0x%p, pstate = 0x%lx\n",
p->symbol_name, p->addr, (long)regs->pstate);
#endif
+#ifdef CONFIG_S390
+ pr_info("<%s> pre_handler: p->addr, 0x%p, flags = 0x%lx\n",
+ p->symbol_name, p->addr, regs->flags);
+#endif
}
/*
diff --git a/samples/livepatch/Makefile b/samples/livepatch/Makefile
index 10319d7ea0b1..2472ce39a18d 100644
--- a/samples/livepatch/Makefile
+++ b/samples/livepatch/Makefile
@@ -1 +1,7 @@
obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-sample.o
+obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-mod.o
+obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-fix1.o
+obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-fix2.o
+obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-callbacks-demo.o
+obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-callbacks-mod.o
+obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-callbacks-busymod.o
diff --git a/samples/livepatch/livepatch-callbacks-busymod.c b/samples/livepatch/livepatch-callbacks-busymod.c
new file mode 100644
index 000000000000..80d06e103f1b
--- /dev/null
+++ b/samples/livepatch/livepatch-callbacks-busymod.c
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * livepatch-callbacks-busymod.c - (un)patching callbacks demo support module
+ *
+ *
+ * Purpose
+ * -------
+ *
+ * Simple module to demonstrate livepatch (un)patching callbacks.
+ *
+ *
+ * Usage
+ * -----
+ *
+ * This module is not intended to be standalone. See the "Usage"
+ * section of livepatch-callbacks-mod.c.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/workqueue.h>
+#include <linux/delay.h>
+
+static int sleep_secs;
+module_param(sleep_secs, int, 0644);
+MODULE_PARM_DESC(sleep_secs, "sleep_secs (default=0)");
+
+static void busymod_work_func(struct work_struct *work);
+static DECLARE_DELAYED_WORK(work, busymod_work_func);
+
+static void busymod_work_func(struct work_struct *work)
+{
+ pr_info("%s, sleeping %d seconds ...\n", __func__, sleep_secs);
+ msleep(sleep_secs * 1000);
+ pr_info("%s exit\n", __func__);
+}
+
+static int livepatch_callbacks_mod_init(void)
+{
+ pr_info("%s\n", __func__);
+ schedule_delayed_work(&work,
+ msecs_to_jiffies(1000 * 0));
+ return 0;
+}
+
+static void livepatch_callbacks_mod_exit(void)
+{
+ cancel_delayed_work_sync(&work);
+ pr_info("%s\n", __func__);
+}
+
+module_init(livepatch_callbacks_mod_init);
+module_exit(livepatch_callbacks_mod_exit);
+MODULE_LICENSE("GPL");
diff --git a/samples/livepatch/livepatch-callbacks-demo.c b/samples/livepatch/livepatch-callbacks-demo.c
new file mode 100644
index 000000000000..72f9e6d1387b
--- /dev/null
+++ b/samples/livepatch/livepatch-callbacks-demo.c
@@ -0,0 +1,219 @@
+/*
+ * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * livepatch-callbacks-demo.c - (un)patching callbacks livepatch demo
+ *
+ *
+ * Purpose
+ * -------
+ *
+ * Demonstration of registering livepatch (un)patching callbacks.
+ *
+ *
+ * Usage
+ * -----
+ *
+ * Step 1 - load the simple module
+ *
+ * insmod samples/livepatch/livepatch-callbacks-mod.ko
+ *
+ *
+ * Step 2 - load the demonstration livepatch (with callbacks)
+ *
+ * insmod samples/livepatch/livepatch-callbacks-demo.ko
+ *
+ *
+ * Step 3 - cleanup
+ *
+ * echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled
+ * rmmod livepatch_callbacks_demo
+ * rmmod livepatch_callbacks_mod
+ *
+ * Watch dmesg output to see livepatch enablement, callback execution
+ * and patching operations for both vmlinux and module targets.
+ *
+ * NOTE: swap the insmod order of livepatch-callbacks-mod.ko and
+ * livepatch-callbacks-demo.ko to observe what happens when a
+ * target module is loaded after a livepatch with callbacks.
+ *
+ * NOTE: 'pre_patch_ret' is a module parameter that sets the pre-patch
+ * callback return status. Try setting up a non-zero status
+ * such as -19 (-ENODEV):
+ *
+ * # Load demo livepatch, vmlinux is patched
+ * insmod samples/livepatch/livepatch-callbacks-demo.ko
+ *
+ * # Setup next pre-patch callback to return -ENODEV
+ * echo -19 > /sys/module/livepatch_callbacks_demo/parameters/pre_patch_ret
+ *
+ * # Module loader refuses to load the target module
+ * insmod samples/livepatch/livepatch-callbacks-mod.ko
+ * insmod: ERROR: could not insert module samples/livepatch/livepatch-callbacks-mod.ko: No such device
+ *
+ * NOTE: There is a second target module,
+ * livepatch-callbacks-busymod.ko, available for experimenting
+ * with livepatch (un)patch callbacks. This module contains
+ * a 'sleep_secs' parameter that parks the module on one of the
+ * functions that the livepatch demo module wants to patch.
+ * Modifying this value and tweaking the order of module loads can
+ * effectively demonstrate stalled patch transitions:
+ *
+ * # Load a target module, let it park on 'busymod_work_func' for
+ * # thirty seconds
+ * insmod samples/livepatch/livepatch-callbacks-busymod.ko sleep_secs=30
+ *
+ * # Meanwhile load the livepatch
+ * insmod samples/livepatch/livepatch-callbacks-demo.ko
+ *
+ * # ... then load and unload another target module while the
+ * # transition is in progress
+ * insmod samples/livepatch/livepatch-callbacks-mod.ko
+ * rmmod samples/livepatch/livepatch-callbacks-mod.ko
+ *
+ * # Finally cleanup
+ * echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled
+ * rmmod samples/livepatch/livepatch-callbacks-demo.ko
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/livepatch.h>
+
+static int pre_patch_ret;
+module_param(pre_patch_ret, int, 0644);
+MODULE_PARM_DESC(pre_patch_ret, "pre_patch_ret (default=0)");
+
+static const char *const module_state[] = {
+ [MODULE_STATE_LIVE] = "[MODULE_STATE_LIVE] Normal state",
+ [MODULE_STATE_COMING] = "[MODULE_STATE_COMING] Full formed, running module_init",
+ [MODULE_STATE_GOING] = "[MODULE_STATE_GOING] Going away",
+ [MODULE_STATE_UNFORMED] = "[MODULE_STATE_UNFORMED] Still setting it up",
+};
+
+static void callback_info(const char *callback, struct klp_object *obj)
+{
+ if (obj->mod)
+ pr_info("%s: %s -> %s\n", callback, obj->mod->name,
+ module_state[obj->mod->state]);
+ else
+ pr_info("%s: vmlinux\n", callback);
+}
+
+/* Executed on object patching (ie, patch enablement) */
+static int pre_patch_callback(struct klp_object *obj)
+{
+ callback_info(__func__, obj);
+ return pre_patch_ret;
+}
+
+/* Executed on object unpatching (ie, patch disablement) */
+static void post_patch_callback(struct klp_object *obj)
+{
+ callback_info(__func__, obj);
+}
+
+/* Executed on object unpatching (ie, patch disablement) */
+static void pre_unpatch_callback(struct klp_object *obj)
+{
+ callback_info(__func__, obj);
+}
+
+/* Executed on object unpatching (ie, patch disablement) */
+static void post_unpatch_callback(struct klp_object *obj)
+{
+ callback_info(__func__, obj);
+}
+
+static void patched_work_func(struct work_struct *work)
+{
+ pr_info("%s\n", __func__);
+}
+
+static struct klp_func no_funcs[] = {
+ { }
+};
+
+static struct klp_func busymod_funcs[] = {
+ {
+ .old_name = "busymod_work_func",
+ .new_func = patched_work_func,
+ }, { }
+};
+
+static struct klp_object objs[] = {
+ {
+ .name = NULL, /* vmlinux */
+ .funcs = no_funcs,
+ .callbacks = {
+ .pre_patch = pre_patch_callback,
+ .post_patch = post_patch_callback,
+ .pre_unpatch = pre_unpatch_callback,
+ .post_unpatch = post_unpatch_callback,
+ },
+ }, {
+ .name = "livepatch_callbacks_mod",
+ .funcs = no_funcs,
+ .callbacks = {
+ .pre_patch = pre_patch_callback,
+ .post_patch = post_patch_callback,
+ .pre_unpatch = pre_unpatch_callback,
+ .post_unpatch = post_unpatch_callback,
+ },
+ }, {
+ .name = "livepatch_callbacks_busymod",
+ .funcs = busymod_funcs,
+ .callbacks = {
+ .pre_patch = pre_patch_callback,
+ .post_patch = post_patch_callback,
+ .pre_unpatch = pre_unpatch_callback,
+ .post_unpatch = post_unpatch_callback,
+ },
+ }, { }
+};
+
+static struct klp_patch patch = {
+ .mod = THIS_MODULE,
+ .objs = objs,
+};
+
+static int livepatch_callbacks_demo_init(void)
+{
+ int ret;
+
+ ret = klp_register_patch(&patch);
+ if (ret)
+ return ret;
+ ret = klp_enable_patch(&patch);
+ if (ret) {
+ WARN_ON(klp_unregister_patch(&patch));
+ return ret;
+ }
+ return 0;
+}
+
+static void livepatch_callbacks_demo_exit(void)
+{
+ WARN_ON(klp_unregister_patch(&patch));
+}
+
+module_init(livepatch_callbacks_demo_init);
+module_exit(livepatch_callbacks_demo_exit);
+MODULE_LICENSE("GPL");
+MODULE_INFO(livepatch, "Y");
diff --git a/samples/livepatch/livepatch-callbacks-mod.c b/samples/livepatch/livepatch-callbacks-mod.c
new file mode 100644
index 000000000000..e610ce29ba44
--- /dev/null
+++ b/samples/livepatch/livepatch-callbacks-mod.c
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * livepatch-callbacks-mod.c - (un)patching callbacks demo support module
+ *
+ *
+ * Purpose
+ * -------
+ *
+ * Simple module to demonstrate livepatch (un)patching callbacks.
+ *
+ *
+ * Usage
+ * -----
+ *
+ * This module is not intended to be standalone. See the "Usage"
+ * section of livepatch-callbacks-demo.c.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+static int livepatch_callbacks_mod_init(void)
+{
+ pr_info("%s\n", __func__);
+ return 0;
+}
+
+static void livepatch_callbacks_mod_exit(void)
+{
+ pr_info("%s\n", __func__);
+}
+
+module_init(livepatch_callbacks_mod_init);
+module_exit(livepatch_callbacks_mod_exit);
+MODULE_LICENSE("GPL");
diff --git a/samples/livepatch/livepatch-sample.c b/samples/livepatch/livepatch-sample.c
index 84795223f15f..2d554dd930e2 100644
--- a/samples/livepatch/livepatch-sample.c
+++ b/samples/livepatch/livepatch-sample.c
@@ -71,21 +71,6 @@ static int livepatch_init(void)
{
int ret;
- if (!klp_have_reliable_stack() && !patch.immediate) {
- /*
- * WARNING: Be very careful when using 'patch.immediate' in
- * your patches. It's ok to use it for simple patches like
- * this, but for more complex patches which change function
- * semantics, locking semantics, or data structures, it may not
- * be safe. Use of this option will also prevent removal of
- * the patch.
- *
- * See Documentation/livepatch/livepatch.txt for more details.
- */
- patch.immediate = true;
- pr_notice("The consistency model isn't supported for your architecture. Bypassing safety mechanisms and applying the patch immediately.\n");
- }
-
ret = klp_register_patch(&patch);
if (ret)
return ret;
diff --git a/samples/livepatch/livepatch-shadow-fix1.c b/samples/livepatch/livepatch-shadow-fix1.c
new file mode 100644
index 000000000000..830c55514f9f
--- /dev/null
+++ b/samples/livepatch/livepatch-shadow-fix1.c
@@ -0,0 +1,158 @@
+/*
+ * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * livepatch-shadow-fix1.c - Shadow variables, livepatch demo
+ *
+ * Purpose
+ * -------
+ *
+ * Fixes the memory leak introduced in livepatch-shadow-mod through the
+ * use of a shadow variable. This fix demonstrates the "extending" of
+ * short-lived data structures by patching its allocation and release
+ * functions.
+ *
+ *
+ * Usage
+ * -----
+ *
+ * This module is not intended to be standalone. See the "Usage"
+ * section of livepatch-shadow-mod.c.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/livepatch.h>
+#include <linux/slab.h>
+
+/* Shadow variable enums */
+#define SV_LEAK 1
+
+/* Allocate new dummies every second */
+#define ALLOC_PERIOD 1
+/* Check for expired dummies after a few new ones have been allocated */
+#define CLEANUP_PERIOD (3 * ALLOC_PERIOD)
+/* Dummies expire after a few cleanup instances */
+#define EXPIRE_PERIOD (4 * CLEANUP_PERIOD)
+
+struct dummy {
+ struct list_head list;
+ unsigned long jiffies_expire;
+};
+
+struct dummy *livepatch_fix1_dummy_alloc(void)
+{
+ struct dummy *d;
+ void *leak;
+
+ d = kzalloc(sizeof(*d), GFP_KERNEL);
+ if (!d)
+ return NULL;
+
+ d->jiffies_expire = jiffies +
+ msecs_to_jiffies(1000 * EXPIRE_PERIOD);
+
+ /*
+ * Patch: save the extra memory location into a SV_LEAK shadow
+ * variable. A patched dummy_free routine can later fetch this
+ * pointer to handle resource release.
+ */
+ leak = kzalloc(sizeof(int), GFP_KERNEL);
+ klp_shadow_alloc(d, SV_LEAK, &leak, sizeof(leak), GFP_KERNEL);
+
+ pr_info("%s: dummy @ %p, expires @ %lx\n",
+ __func__, d, d->jiffies_expire);
+
+ return d;
+}
+
+void livepatch_fix1_dummy_free(struct dummy *d)
+{
+ void **shadow_leak, *leak;
+
+ /*
+ * Patch: fetch the saved SV_LEAK shadow variable, detach and
+ * free it. Note: handle cases where this shadow variable does
+ * not exist (ie, dummy structures allocated before this livepatch
+ * was loaded.)
+ */
+ shadow_leak = klp_shadow_get(d, SV_LEAK);
+ if (shadow_leak) {
+ leak = *shadow_leak;
+ klp_shadow_free(d, SV_LEAK);
+ kfree(leak);
+ pr_info("%s: dummy @ %p, prevented leak @ %p\n",
+ __func__, d, leak);
+ } else {
+ pr_info("%s: dummy @ %p leaked!\n", __func__, d);
+ }
+
+ kfree(d);
+}
+
+static struct klp_func funcs[] = {
+ {
+ .old_name = "dummy_alloc",
+ .new_func = livepatch_fix1_dummy_alloc,
+ },
+ {
+ .old_name = "dummy_free",
+ .new_func = livepatch_fix1_dummy_free,
+ }, { }
+};
+
+static struct klp_object objs[] = {
+ {
+ .name = "livepatch_shadow_mod",
+ .funcs = funcs,
+ }, { }
+};
+
+static struct klp_patch patch = {
+ .mod = THIS_MODULE,
+ .objs = objs,
+};
+
+static int livepatch_shadow_fix1_init(void)
+{
+ int ret;
+
+ ret = klp_register_patch(&patch);
+ if (ret)
+ return ret;
+ ret = klp_enable_patch(&patch);
+ if (ret) {
+ WARN_ON(klp_unregister_patch(&patch));
+ return ret;
+ }
+ return 0;
+}
+
+static void livepatch_shadow_fix1_exit(void)
+{
+ /* Cleanup any existing SV_LEAK shadow variables */
+ klp_shadow_free_all(SV_LEAK);
+
+ WARN_ON(klp_unregister_patch(&patch));
+}
+
+module_init(livepatch_shadow_fix1_init);
+module_exit(livepatch_shadow_fix1_exit);
+MODULE_LICENSE("GPL");
+MODULE_INFO(livepatch, "Y");
diff --git a/samples/livepatch/livepatch-shadow-fix2.c b/samples/livepatch/livepatch-shadow-fix2.c
new file mode 100644
index 000000000000..ff9948f0ec00
--- /dev/null
+++ b/samples/livepatch/livepatch-shadow-fix2.c
@@ -0,0 +1,153 @@
+/*
+ * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * livepatch-shadow-fix2.c - Shadow variables, livepatch demo
+ *
+ * Purpose
+ * -------
+ *
+ * Adds functionality to livepatch-shadow-mod's in-flight data
+ * structures through a shadow variable. The livepatch patches a
+ * routine that periodically inspects data structures, incrementing a
+ * per-data-structure counter, creating the counter if needed.
+ *
+ *
+ * Usage
+ * -----
+ *
+ * This module is not intended to be standalone. See the "Usage"
+ * section of livepatch-shadow-mod.c.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/livepatch.h>
+#include <linux/slab.h>
+
+/* Shadow variable enums */
+#define SV_LEAK 1
+#define SV_COUNTER 2
+
+struct dummy {
+ struct list_head list;
+ unsigned long jiffies_expire;
+};
+
+bool livepatch_fix2_dummy_check(struct dummy *d, unsigned long jiffies)
+{
+ int *shadow_count;
+ int count;
+
+ /*
+ * Patch: handle in-flight dummy structures, if they do not
+ * already have a SV_COUNTER shadow variable, then attach a
+ * new one.
+ */
+ count = 0;
+ shadow_count = klp_shadow_get_or_alloc(d, SV_COUNTER,
+ &count, sizeof(count),
+ GFP_NOWAIT);
+ if (shadow_count)
+ *shadow_count += 1;
+
+ return time_after(jiffies, d->jiffies_expire);
+}
+
+void livepatch_fix2_dummy_free(struct dummy *d)
+{
+ void **shadow_leak, *leak;
+ int *shadow_count;
+
+ /* Patch: copy the memory leak patch from the fix1 module. */
+ shadow_leak = klp_shadow_get(d, SV_LEAK);
+ if (shadow_leak) {
+ leak = *shadow_leak;
+ klp_shadow_free(d, SV_LEAK);
+ kfree(leak);
+ pr_info("%s: dummy @ %p, prevented leak @ %p\n",
+ __func__, d, leak);
+ } else {
+ pr_info("%s: dummy @ %p leaked!\n", __func__, d);
+ }
+
+ /*
+ * Patch: fetch the SV_COUNTER shadow variable and display
+ * the final count. Detach the shadow variable.
+ */
+ shadow_count = klp_shadow_get(d, SV_COUNTER);
+ if (shadow_count) {
+ pr_info("%s: dummy @ %p, check counter = %d\n",
+ __func__, d, *shadow_count);
+ klp_shadow_free(d, SV_COUNTER);
+ }
+
+ kfree(d);
+}
+
+static struct klp_func funcs[] = {
+ {
+ .old_name = "dummy_check",
+ .new_func = livepatch_fix2_dummy_check,
+ },
+ {
+ .old_name = "dummy_free",
+ .new_func = livepatch_fix2_dummy_free,
+ }, { }
+};
+
+static struct klp_object objs[] = {
+ {
+ .name = "livepatch_shadow_mod",
+ .funcs = funcs,
+ }, { }
+};
+
+static struct klp_patch patch = {
+ .mod = THIS_MODULE,
+ .objs = objs,
+};
+
+static int livepatch_shadow_fix2_init(void)
+{
+ int ret;
+
+ ret = klp_register_patch(&patch);
+ if (ret)
+ return ret;
+ ret = klp_enable_patch(&patch);
+ if (ret) {
+ WARN_ON(klp_unregister_patch(&patch));
+ return ret;
+ }
+ return 0;
+}
+
+static void livepatch_shadow_fix2_exit(void)
+{
+ /* Cleanup any existing SV_COUNTER shadow variables */
+ klp_shadow_free_all(SV_COUNTER);
+
+ WARN_ON(klp_unregister_patch(&patch));
+}
+
+module_init(livepatch_shadow_fix2_init);
+module_exit(livepatch_shadow_fix2_exit);
+MODULE_LICENSE("GPL");
+MODULE_INFO(livepatch, "Y");
diff --git a/samples/livepatch/livepatch-shadow-mod.c b/samples/livepatch/livepatch-shadow-mod.c
new file mode 100644
index 000000000000..4c54b250332d
--- /dev/null
+++ b/samples/livepatch/livepatch-shadow-mod.c
@@ -0,0 +1,224 @@
+/*
+ * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * livepatch-shadow-mod.c - Shadow variables, buggy module demo
+ *
+ * Purpose
+ * -------
+ *
+ * As a demonstration of livepatch shadow variable API, this module
+ * introduces memory leak behavior that livepatch modules
+ * livepatch-shadow-fix1.ko and livepatch-shadow-fix2.ko correct and
+ * enhance.
+ *
+ * WARNING - even though the livepatch-shadow-fix modules patch the
+ * memory leak, please load these modules at your own risk -- some
+ * amount of memory may leaked before the bug is patched.
+ *
+ *
+ * Usage
+ * -----
+ *
+ * Step 1 - Load the buggy demonstration module:
+ *
+ * insmod samples/livepatch/livepatch-shadow-mod.ko
+ *
+ * Watch dmesg output for a few moments to see new dummy being allocated
+ * and a periodic cleanup check. (Note: a small amount of memory is
+ * being leaked.)
+ *
+ *
+ * Step 2 - Load livepatch fix1:
+ *
+ * insmod samples/livepatch/livepatch-shadow-fix1.ko
+ *
+ * Continue watching dmesg and note that now livepatch_fix1_dummy_free()
+ * and livepatch_fix1_dummy_alloc() are logging messages about leaked
+ * memory and eventually leaks prevented.
+ *
+ *
+ * Step 3 - Load livepatch fix2 (on top of fix1):
+ *
+ * insmod samples/livepatch/livepatch-shadow-fix2.ko
+ *
+ * This module extends functionality through shadow variables, as a new
+ * "check" counter is added to the dummy structure. Periodic dmesg
+ * messages will log these as dummies are cleaned up.
+ *
+ *
+ * Step 4 - Cleanup
+ *
+ * Unwind the demonstration by disabling the livepatch fix modules, then
+ * removing them and the demo module:
+ *
+ * echo 0 > /sys/kernel/livepatch/livepatch_shadow_fix2/enabled
+ * echo 0 > /sys/kernel/livepatch/livepatch_shadow_fix1/enabled
+ * rmmod livepatch-shadow-fix2
+ * rmmod livepatch-shadow-fix1
+ * rmmod livepatch-shadow-mod
+ */
+
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/workqueue.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Joe Lawrence <joe.lawrence@redhat.com>");
+MODULE_DESCRIPTION("Buggy module for shadow variable demo");
+
+/* Allocate new dummies every second */
+#define ALLOC_PERIOD 1
+/* Check for expired dummies after a few new ones have been allocated */
+#define CLEANUP_PERIOD (3 * ALLOC_PERIOD)
+/* Dummies expire after a few cleanup instances */
+#define EXPIRE_PERIOD (4 * CLEANUP_PERIOD)
+
+/*
+ * Keep a list of all the dummies so we can clean up any residual ones
+ * on module exit
+ */
+LIST_HEAD(dummy_list);
+DEFINE_MUTEX(dummy_list_mutex);
+
+struct dummy {
+ struct list_head list;
+ unsigned long jiffies_expire;
+};
+
+noinline struct dummy *dummy_alloc(void)
+{
+ struct dummy *d;
+ void *leak;
+
+ d = kzalloc(sizeof(*d), GFP_KERNEL);
+ if (!d)
+ return NULL;
+
+ d->jiffies_expire = jiffies +
+ msecs_to_jiffies(1000 * EXPIRE_PERIOD);
+
+ /* Oops, forgot to save leak! */
+ leak = kzalloc(sizeof(int), GFP_KERNEL);
+
+ pr_info("%s: dummy @ %p, expires @ %lx\n",
+ __func__, d, d->jiffies_expire);
+
+ return d;
+}
+
+noinline void dummy_free(struct dummy *d)
+{
+ pr_info("%s: dummy @ %p, expired = %lx\n",
+ __func__, d, d->jiffies_expire);
+
+ kfree(d);
+}
+
+noinline bool dummy_check(struct dummy *d, unsigned long jiffies)
+{
+ return time_after(jiffies, d->jiffies_expire);
+}
+
+/*
+ * alloc_work_func: allocates new dummy structures, allocates additional
+ * memory, aptly named "leak", but doesn't keep
+ * permanent record of it.
+ */
+
+static void alloc_work_func(struct work_struct *work);
+static DECLARE_DELAYED_WORK(alloc_dwork, alloc_work_func);
+
+static void alloc_work_func(struct work_struct *work)
+{
+ struct dummy *d;
+
+ d = dummy_alloc();
+ if (!d)
+ return;
+
+ mutex_lock(&dummy_list_mutex);
+ list_add(&d->list, &dummy_list);
+ mutex_unlock(&dummy_list_mutex);
+
+ schedule_delayed_work(&alloc_dwork,
+ msecs_to_jiffies(1000 * ALLOC_PERIOD));
+}
+
+/*
+ * cleanup_work_func: frees dummy structures. Without knownledge of
+ * "leak", it leaks the additional memory that
+ * alloc_work_func created.
+ */
+
+static void cleanup_work_func(struct work_struct *work);
+static DECLARE_DELAYED_WORK(cleanup_dwork, cleanup_work_func);
+
+static void cleanup_work_func(struct work_struct *work)
+{
+ struct dummy *d, *tmp;
+ unsigned long j;
+
+ j = jiffies;
+ pr_info("%s: jiffies = %lx\n", __func__, j);
+
+ mutex_lock(&dummy_list_mutex);
+ list_for_each_entry_safe(d, tmp, &dummy_list, list) {
+
+ /* Kick out and free any expired dummies */
+ if (dummy_check(d, j)) {
+ list_del(&d->list);
+ dummy_free(d);
+ }
+ }
+ mutex_unlock(&dummy_list_mutex);
+
+ schedule_delayed_work(&cleanup_dwork,
+ msecs_to_jiffies(1000 * CLEANUP_PERIOD));
+}
+
+static int livepatch_shadow_mod_init(void)
+{
+ schedule_delayed_work(&alloc_dwork,
+ msecs_to_jiffies(1000 * ALLOC_PERIOD));
+ schedule_delayed_work(&cleanup_dwork,
+ msecs_to_jiffies(1000 * CLEANUP_PERIOD));
+
+ return 0;
+}
+
+static void livepatch_shadow_mod_exit(void)
+{
+ struct dummy *d, *tmp;
+
+ /* Wait for any dummies at work */
+ cancel_delayed_work_sync(&alloc_dwork);
+ cancel_delayed_work_sync(&cleanup_dwork);
+
+ /* Cleanup residual dummies */
+ list_for_each_entry_safe(d, tmp, &dummy_list, list) {
+ list_del(&d->list);
+ dummy_free(d);
+ }
+}
+
+module_init(livepatch_shadow_mod_init);
+module_exit(livepatch_shadow_mod_exit);
diff --git a/samples/mic/mpssd/mpssd.c b/samples/mic/mpssd/mpssd.c
index 49db1def1721..f42ce551bb48 100644
--- a/samples/mic/mpssd/mpssd.c
+++ b/samples/mic/mpssd/mpssd.c
@@ -65,7 +65,7 @@ static struct mic_info mic_list;
/* to align the pointer to the (next) page boundary */
#define PAGE_ALIGN(addr) _ALIGN(addr, PAGE_SIZE)
-#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
+#define READ_ONCE(x) (*(volatile typeof(x) *)&(x))
#define GSO_ENABLED 1
#define MAX_GSO_SIZE (64 * 1024)
@@ -382,7 +382,7 @@ disp_iovec(struct mic_info *mic, struct mic_copy_desc *copy,
static inline __u16 read_avail_idx(struct mic_vring *vr)
{
- return ACCESS_ONCE(vr->info->avail_idx);
+ return READ_ONCE(vr->info->avail_idx);
}
static inline void txrx_prepare(int type, bool tx, struct mic_vring *vr,
@@ -523,7 +523,7 @@ spin_for_descriptors(struct mic_info *mic, struct mic_vring *vr)
{
__u16 avail_idx = read_avail_idx(vr);
- while (avail_idx == le16toh(ACCESS_ONCE(vr->vr.avail->idx))) {
+ while (avail_idx == le16toh(READ_ONCE(vr->vr.avail->idx))) {
#ifdef DEBUG
mpsslog("%s %s waiting for desc avail %d info_avail %d\n",
mic->name, __func__,
diff --git a/samples/pktgen/functions.sh b/samples/pktgen/functions.sh
index 205e4cde4601..f8bb3cd0f4ce 100644
--- a/samples/pktgen/functions.sh
+++ b/samples/pktgen/functions.sh
@@ -119,3 +119,46 @@ function root_check_run_with_sudo() {
err 4 "cannot perform sudo run of $0"
fi
}
+
+# Exact input device's NUMA node info
+function get_iface_node()
+{
+ local node=$(</sys/class/net/$1/device/numa_node)
+ if [[ $node == -1 ]]; then
+ echo 0
+ else
+ echo $node
+ fi
+}
+
+# Given an Dev/iface, get its queues' irq numbers
+function get_iface_irqs()
+{
+ local IFACE=$1
+ local queues="${IFACE}-.*TxRx"
+
+ irqs=$(grep "$queues" /proc/interrupts | cut -f1 -d:)
+ [ -z "$irqs" ] && irqs=$(grep $IFACE /proc/interrupts | cut -f1 -d:)
+ [ -z "$irqs" ] && irqs=$(for i in `ls -Ux /sys/class/net/$IFACE/device/msi_irqs` ;\
+ do grep "$i:.*TxRx" /proc/interrupts | grep -v fdir | cut -f 1 -d : ;\
+ done)
+ [ -z "$irqs" ] && err 3 "Could not find interrupts for $IFACE"
+
+ echo $irqs
+}
+
+# Given a NUMA node, return cpu ids belonging to it.
+function get_node_cpus()
+{
+ local node=$1
+ local node_cpu_list
+ local node_cpu_range_list=`cut -f1- -d, --output-delimiter=" " \
+ /sys/devices/system/node/node$node/cpulist`
+
+ for cpu_range in $node_cpu_range_list
+ do
+ node_cpu_list="$node_cpu_list "`seq -s " " ${cpu_range//-/ }`
+ done
+
+ echo $node_cpu_list
+}
diff --git a/samples/pktgen/pktgen.conf-1-1-ip6 b/samples/pktgen/pktgen.conf-1-1-ip6
deleted file mode 100755
index 62426afeef84..000000000000
--- a/samples/pktgen/pktgen.conf-1-1-ip6
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-
-#modprobe pktgen
-
-
-function pgset() {
- local result
-
- echo $1 > $PGDEV
-
- result=`cat $PGDEV | fgrep "Result: OK:"`
- if [ "$result" = "" ]; then
- cat $PGDEV | fgrep Result:
- fi
-}
-
-# Config Start Here -----------------------------------------------------------
-
-
-# thread config
-# Each CPU has its own thread. One CPU example. We add eth1.
-# IPv6. Note increase in minimal packet length
-
-PGDEV=/proc/net/pktgen/kpktgend_0
- echo "Removing all devices"
- pgset "rem_device_all"
- echo "Adding eth1"
- pgset "add_device eth1"
-
-
-# device config
-# delay 0
-
-CLONE_SKB="clone_skb 1000000"
-# NIC adds 4 bytes CRC
-PKT_SIZE="pkt_size 66"
-
-# COUNT 0 means forever
-#COUNT="count 0"
-COUNT="count 10000000"
-DELAY="delay 0"
-
-PGDEV=/proc/net/pktgen/eth1
- echo "Configuring $PGDEV"
- pgset "$COUNT"
- pgset "$CLONE_SKB"
- pgset "$PKT_SIZE"
- pgset "$DELAY"
- pgset "dst6 fec0::1"
- pgset "src6 fec0::2"
- pgset "dst_mac 00:04:23:08:91:dc"
-
-# Time to run
-PGDEV=/proc/net/pktgen/pgctrl
-
- echo "Running... ctrl^C to stop"
- trap true INT
- pgset "start"
- echo "Done"
- cat /proc/net/pktgen/eth1
diff --git a/samples/pktgen/pktgen.conf-1-1-ip6-rdos b/samples/pktgen/pktgen.conf-1-1-ip6-rdos
deleted file mode 100755
index 3ac3eb1f3504..000000000000
--- a/samples/pktgen/pktgen.conf-1-1-ip6-rdos
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-
-#modprobe pktgen
-
-
-function pgset() {
- local result
-
- echo $1 > $PGDEV
-
- result=`cat $PGDEV | fgrep "Result: OK:"`
- if [ "$result" = "" ]; then
- cat $PGDEV | fgrep Result:
- fi
-}
-
-# Config Start Here -----------------------------------------------------------
-
-
-# thread config
-# Each CPU has its own thread. One CPU example. We add eth1.
-# IPv6. Note increase in minimal packet length
-
-PGDEV=/proc/net/pktgen/kpktgend_0
- echo "Removing all devices"
- pgset "rem_device_all"
- echo "Adding eth1"
- pgset "add_device eth1"
-
-
-# device config
-# delay 0 means maximum speed.
-
-# We need to do alloc for every skb since we cannot clone here.
-CLONE_SKB="clone_skb 0"
-
-# NIC adds 4 bytes CRC
-PKT_SIZE="pkt_size 66"
-
-# COUNT 0 means forever
-#COUNT="count 0"
-COUNT="count 10000000"
-DELAY="delay 0"
-
-PGDEV=/proc/net/pktgen/eth1
- echo "Configuring $PGDEV"
- pgset "$COUNT"
- pgset "$CLONE_SKB"
- pgset "$PKT_SIZE"
- pgset "$DELAY"
- pgset "dst6_min fec0::1"
- pgset "dst6_max fec0::FFFF:FFFF"
-
- pgset "dst_mac 00:04:23:08:91:dc"
-
-# Time to run
-PGDEV=/proc/net/pktgen/pgctrl
-
- echo "Running... ctrl^C to stop"
- trap true INT
- pgset "start"
- echo "Done"
- cat /proc/net/pktgen/eth1
diff --git a/samples/pktgen/pktgen.conf-1-2 b/samples/pktgen/pktgen.conf-1-2
deleted file mode 100755
index a85552760762..000000000000
--- a/samples/pktgen/pktgen.conf-1-2
+++ /dev/null
@@ -1,70 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-
-#modprobe pktgen
-
-
-function pgset() {
- local result
-
- echo $1 > $PGDEV
-
- result=`cat $PGDEV | fgrep "Result: OK:"`
- if [ "$result" = "" ]; then
- cat $PGDEV | fgrep Result:
- fi
-}
-
-# Config Start Here -----------------------------------------------------------
-
-
-# thread config
-# One CPU means one thread. One CPU example. We add eth1, eth2 respectivly.
-
-PGDEV=/proc/net/pktgen/kpktgend_0
- echo "Removing all devices"
- pgset "rem_device_all"
- echo "Adding eth1"
- pgset "add_device eth1"
- echo "Adding eth2"
- pgset "add_device eth2"
-
-
-# device config
-# delay 0 means maximum speed.
-
-CLONE_SKB="clone_skb 1000000"
-# NIC adds 4 bytes CRC
-PKT_SIZE="pkt_size 60"
-
-# COUNT 0 means forever
-#COUNT="count 0"
-COUNT="count 10000000"
-DELAY="delay 0"
-
-PGDEV=/proc/net/pktgen/eth1
- echo "Configuring $PGDEV"
- pgset "$COUNT"
- pgset "$CLONE_SKB"
- pgset "$PKT_SIZE"
- pgset "$DELAY"
- pgset "dst 10.10.11.2"
- pgset "dst_mac 00:04:23:08:91:dc"
-
-PGDEV=/proc/net/pktgen/eth2
- echo "Configuring $PGDEV"
- pgset "$COUNT"
- pgset "$CLONE_SKB"
- pgset "$PKT_SIZE"
- pgset "$DELAY"
- pgset "dst 192.168.2.2"
- pgset "dst_mac 00:04:23:08:91:de"
-
-# Time to run
-PGDEV=/proc/net/pktgen/pgctrl
-
- echo "Running... ctrl^C to stop"
- trap true INT
- pgset "start"
- echo "Done"
- cat /proc/net/pktgen/eth1 /proc/net/pktgen/eth2
diff --git a/samples/pktgen/pktgen_sample03_burst_single_flow.sh b/samples/pktgen/pktgen_sample03_burst_single_flow.sh
index 4c2e4217638a..8fdd36722d9e 100755
--- a/samples/pktgen/pktgen_sample03_burst_single_flow.sh
+++ b/samples/pktgen/pktgen_sample03_burst_single_flow.sh
@@ -31,7 +31,7 @@ if [ -z "$DEST_IP" ]; then
fi
[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff"
[ -z "$BURST" ] && BURST=32
-[ -z "$CLONE_SKB" ] && CLONE_SKB="100000"
+[ -z "$CLONE_SKB" ] && CLONE_SKB="0" # No need for clones when bursting
[ -z "$COUNT" ] && COUNT="0" # Zero means indefinitely
# Base Config
diff --git a/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh b/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh
new file mode 100755
index 000000000000..353adc17205e
--- /dev/null
+++ b/samples/pktgen/pktgen_sample06_numa_awared_queue_irq_affinity.sh
@@ -0,0 +1,97 @@
+#!/bin/bash
+#
+# Multiqueue: Using pktgen threads for sending on multiple CPUs
+# * adding devices to kernel threads which are in the same NUMA node
+# * bound devices queue's irq affinity to the threads, 1:1 mapping
+# * notice the naming scheme for keeping device names unique
+# * nameing scheme: dev@thread_number
+# * flow variation via random UDP source port
+#
+basedir=`dirname $0`
+source ${basedir}/functions.sh
+root_check_run_with_sudo "$@"
+#
+# Required param: -i dev in $DEV
+source ${basedir}/parameters.sh
+
+# Base Config
+DELAY="0" # Zero means max speed
+[ -z "$COUNT" ] && COUNT="20000000" # Zero means indefinitely
+[ -z "$CLONE_SKB" ] && CLONE_SKB="0"
+
+# Flow variation random source port between min and max
+UDP_MIN=9
+UDP_MAX=109
+
+node=`get_iface_node $DEV`
+irq_array=(`get_iface_irqs $DEV`)
+cpu_array=(`get_node_cpus $node`)
+
+[ $THREADS -gt ${#irq_array[*]} -o $THREADS -gt ${#cpu_array[*]} ] && \
+ err 1 "Thread number $THREADS exceeds: min (${#irq_array[*]},${#cpu_array[*]})"
+
+# (example of setting default params in your script)
+if [ -z "$DEST_IP" ]; then
+ [ -z "$IP6" ] && DEST_IP="198.18.0.42" || DEST_IP="FD00::1"
+fi
+[ -z "$DST_MAC" ] && DST_MAC="90:e2:ba:ff:ff:ff"
+
+# General cleanup everything since last run
+pg_ctrl "reset"
+
+# Threads are specified with parameter -t value in $THREADS
+for ((i = 0; i < $THREADS; i++)); do
+ # The device name is extended with @name, using thread number to
+ # make then unique, but any name will do.
+ # Set the queue's irq affinity to this $thread (processor)
+ # if '-f' is designated, offset cpu id
+ thread=${cpu_array[$((i+F_THREAD))]}
+ dev=${DEV}@${thread}
+ echo $thread > /proc/irq/${irq_array[$i]}/smp_affinity_list
+ info "irq ${irq_array[$i]} is set affinity to `cat /proc/irq/${irq_array[$i]}/smp_affinity_list`"
+
+ # Add remove all other devices and add_device $dev to thread
+ pg_thread $thread "rem_device_all"
+ pg_thread $thread "add_device" $dev
+
+ # select queue and bind the queue and $dev in 1:1 relationship
+ queue_num=$i
+ info "queue number is $queue_num"
+ pg_set $dev "queue_map_min $queue_num"
+ pg_set $dev "queue_map_max $queue_num"
+
+ # Notice config queue to map to cpu (mirrors smp_processor_id())
+ # It is beneficial to map IRQ /proc/irq/*/smp_affinity 1:1 to CPU number
+ pg_set $dev "flag QUEUE_MAP_CPU"
+
+ # Base config of dev
+ pg_set $dev "count $COUNT"
+ pg_set $dev "clone_skb $CLONE_SKB"
+ pg_set $dev "pkt_size $PKT_SIZE"
+ pg_set $dev "delay $DELAY"
+
+ # Flag example disabling timestamping
+ pg_set $dev "flag NO_TIMESTAMP"
+
+ # Destination
+ pg_set $dev "dst_mac $DST_MAC"
+ pg_set $dev "dst$IP6 $DEST_IP"
+
+ # Setup random UDP port src range
+ pg_set $dev "flag UDPSRC_RND"
+ pg_set $dev "udp_src_min $UDP_MIN"
+ pg_set $dev "udp_src_max $UDP_MAX"
+done
+
+# start_run
+echo "Running... ctrl^C to stop" >&2
+pg_ctrl "start"
+echo "Done" >&2
+
+# Print results
+for ((i = 0; i < $THREADS; i++)); do
+ thread=${cpu_array[$((i+F_THREAD))]}
+ dev=${DEV}@${thread}
+ echo "Device: $dev"
+ cat /proc/net/pktgen/$dev | grep -A2 "Result:"
+done
diff --git a/samples/seccomp/Makefile b/samples/seccomp/Makefile
index 19a870eed82b..ba942e3ead89 100644
--- a/samples/seccomp/Makefile
+++ b/samples/seccomp/Makefile
@@ -1,7 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
-# kbuild trick to avoid linker error. Can be omitted if a module is built.
-obj- := dummy.o
-
+ifndef CROSS_COMPILE
hostprogs-$(CONFIG_SAMPLE_SECCOMP) := bpf-fancy dropper bpf-direct
HOSTCFLAGS_bpf-fancy.o += -I$(objtree)/usr/include
@@ -19,7 +17,6 @@ HOSTCFLAGS_bpf-direct.o += -idirafter $(objtree)/include
bpf-direct-objs := bpf-direct.o
# Try to match the kernel target.
-ifndef CROSS_COMPILE
ifndef CONFIG_64BIT
# s390 has -m31 flag to build 31 bit binaries
@@ -38,12 +35,4 @@ HOSTLOADLIBES_bpf-fancy += $(MFLAG)
HOSTLOADLIBES_dropper += $(MFLAG)
endif
always := $(hostprogs-m)
-else
-# MIPS system calls are defined based on the -mabi that is passed
-# to the toolchain which may or may not be a valid option
-# for the host toolchain. So disable tests if target architecture
-# is MIPS but the host isn't.
-ifndef CONFIG_MIPS
-always := $(hostprogs-m)
-endif
endif
diff --git a/samples/sockmap/Makefile b/samples/sockmap/Makefile
index 9291ab8e0f8c..73f1da4d116c 100644
--- a/samples/sockmap/Makefile
+++ b/samples/sockmap/Makefile
@@ -1,6 +1,3 @@
-# kbuild trick to avoid linker error. Can be omitted if a module is built.
-obj- := dummy.o
-
# List of programs to build
hostprogs-y := sockmap
diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c
index 7cc9d228216f..7c25c0c112bc 100644
--- a/samples/sockmap/sockmap_user.c
+++ b/samples/sockmap/sockmap_user.c
@@ -23,8 +23,11 @@
#include <stdbool.h>
#include <signal.h>
#include <fcntl.h>
+#include <sys/wait.h>
+#include <time.h>
#include <sys/time.h>
+#include <sys/resource.h>
#include <sys/types.h>
#include <linux/netlink.h>
@@ -35,6 +38,8 @@
#include <assert.h>
#include <libgen.h>
+#include <getopt.h>
+
#include "../bpf/bpf_load.h"
#include "../bpf/bpf_util.h"
#include "../bpf/libbpf.h"
@@ -46,15 +51,42 @@ void running_handler(int a);
#define S1_PORT 10000
#define S2_PORT 10001
-static int sockmap_test_sockets(int rate, int dot)
+/* global sockets */
+int s1, s2, c1, c2, p1, p2;
+
+static const struct option long_options[] = {
+ {"help", no_argument, NULL, 'h' },
+ {"cgroup", required_argument, NULL, 'c' },
+ {"rate", required_argument, NULL, 'r' },
+ {"verbose", no_argument, NULL, 'v' },
+ {"iov_count", required_argument, NULL, 'i' },
+ {"length", required_argument, NULL, 'l' },
+ {"test", required_argument, NULL, 't' },
+ {0, 0, NULL, 0 }
+};
+
+static void usage(char *argv[])
+{
+ int i;
+
+ printf(" Usage: %s --cgroup <cgroup_path>\n", argv[0]);
+ printf(" options:\n");
+ for (i = 0; long_options[i].name != 0; i++) {
+ printf(" --%-12s", long_options[i].name);
+ if (long_options[i].flag != NULL)
+ printf(" flag (internal value:%d)\n",
+ *long_options[i].flag);
+ else
+ printf(" -%c\n", long_options[i].val);
+ }
+ printf("\n");
+}
+
+static int sockmap_init_sockets(void)
{
- int i, sc, err, max_fd, one = 1;
- int s1, s2, c1, c2, p1, p2;
+ int i, err, one = 1;
struct sockaddr_in addr;
- struct timeval timeout;
- char buf[1024] = {0};
int *fds[4] = {&s1, &s2, &c1, &c2};
- fd_set w;
s1 = s2 = p1 = p2 = c1 = c2 = 0;
@@ -63,8 +95,7 @@ static int sockmap_test_sockets(int rate, int dot)
*fds[i] = socket(AF_INET, SOCK_STREAM, 0);
if (*fds[i] < 0) {
perror("socket s1 failed()");
- err = *fds[i];
- goto out;
+ return errno;
}
}
@@ -74,16 +105,16 @@ static int sockmap_test_sockets(int rate, int dot)
(char *)&one, sizeof(one));
if (err) {
perror("setsockopt failed()");
- goto out;
+ return errno;
}
}
/* Non-blocking sockets */
- for (i = 0; i < 4; i++) {
+ for (i = 0; i < 2; i++) {
err = ioctl(*fds[i], FIONBIO, (char *)&one);
if (err < 0) {
perror("ioctl s1 failed()");
- goto out;
+ return errno;
}
}
@@ -96,14 +127,14 @@ static int sockmap_test_sockets(int rate, int dot)
err = bind(s1, (struct sockaddr *)&addr, sizeof(addr));
if (err < 0) {
perror("bind s1 failed()\n");
- goto out;
+ return errno;
}
addr.sin_port = htons(S2_PORT);
err = bind(s2, (struct sockaddr *)&addr, sizeof(addr));
if (err < 0) {
perror("bind s2 failed()\n");
- goto out;
+ return errno;
}
/* Listen server sockets */
@@ -111,14 +142,14 @@ static int sockmap_test_sockets(int rate, int dot)
err = listen(s1, 32);
if (err < 0) {
perror("listen s1 failed()\n");
- goto out;
+ return errno;
}
addr.sin_port = htons(S2_PORT);
err = listen(s2, 32);
if (err < 0) {
perror("listen s1 failed()\n");
- goto out;
+ return errno;
}
/* Initiate Connect */
@@ -126,46 +157,232 @@ static int sockmap_test_sockets(int rate, int dot)
err = connect(c1, (struct sockaddr *)&addr, sizeof(addr));
if (err < 0 && errno != EINPROGRESS) {
perror("connect c1 failed()\n");
- goto out;
+ return errno;
}
addr.sin_port = htons(S2_PORT);
err = connect(c2, (struct sockaddr *)&addr, sizeof(addr));
if (err < 0 && errno != EINPROGRESS) {
perror("connect c2 failed()\n");
- goto out;
+ return errno;
+ } else if (err < 0) {
+ err = 0;
}
/* Accept Connecrtions */
p1 = accept(s1, NULL, NULL);
if (p1 < 0) {
perror("accept s1 failed()\n");
- goto out;
+ return errno;
}
p2 = accept(s2, NULL, NULL);
if (p2 < 0) {
perror("accept s1 failed()\n");
- goto out;
+ return errno;
}
- max_fd = p2;
- timeout.tv_sec = 10;
- timeout.tv_usec = 0;
-
printf("connected sockets: c1 <-> p1, c2 <-> p2\n");
printf("cgroups binding: c1(%i) <-> s1(%i) - - - c2(%i) <-> s2(%i)\n",
c1, s1, c2, s2);
+ return 0;
+}
+
+struct msg_stats {
+ size_t bytes_sent;
+ size_t bytes_recvd;
+ struct timespec start;
+ struct timespec end;
+};
+
+static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
+ struct msg_stats *s, bool tx)
+{
+ struct msghdr msg = {0};
+ int err, i, flags = MSG_NOSIGNAL;
+ struct iovec *iov;
+
+ iov = calloc(iov_count, sizeof(struct iovec));
+ if (!iov)
+ return errno;
+
+ for (i = 0; i < iov_count; i++) {
+ char *d = calloc(iov_length, sizeof(char));
+
+ if (!d) {
+ fprintf(stderr, "iov_count %i/%i OOM\n", i, iov_count);
+ goto out_errno;
+ }
+ iov[i].iov_base = d;
+ iov[i].iov_len = iov_length;
+ }
+
+ msg.msg_iov = iov;
+ msg.msg_iovlen = iov_count;
+
+ if (tx) {
+ clock_gettime(CLOCK_MONOTONIC, &s->start);
+ for (i = 0; i < cnt; i++) {
+ int sent = sendmsg(fd, &msg, flags);
+
+ if (sent < 0) {
+ perror("send loop error:");
+ goto out_errno;
+ }
+ s->bytes_sent += sent;
+ }
+ clock_gettime(CLOCK_MONOTONIC, &s->end);
+ } else {
+ int slct, recv, max_fd = fd;
+ struct timeval timeout;
+ float total_bytes;
+ fd_set w;
+
+ total_bytes = (float)iov_count * (float)iov_length * (float)cnt;
+ err = clock_gettime(CLOCK_MONOTONIC, &s->start);
+ if (err < 0)
+ perror("recv start time: ");
+ while (s->bytes_recvd < total_bytes) {
+ timeout.tv_sec = 1;
+ timeout.tv_usec = 0;
+
+ /* FD sets */
+ FD_ZERO(&w);
+ FD_SET(fd, &w);
+
+ slct = select(max_fd + 1, &w, NULL, NULL, &timeout);
+ if (slct == -1) {
+ perror("select()");
+ clock_gettime(CLOCK_MONOTONIC, &s->end);
+ goto out_errno;
+ } else if (!slct) {
+ fprintf(stderr, "unexpected timeout\n");
+ errno = -EIO;
+ clock_gettime(CLOCK_MONOTONIC, &s->end);
+ goto out_errno;
+ }
+
+ recv = recvmsg(fd, &msg, flags);
+ if (recv < 0) {
+ if (errno != EWOULDBLOCK) {
+ clock_gettime(CLOCK_MONOTONIC, &s->end);
+ perror("recv failed()\n");
+ goto out_errno;
+ }
+ }
+
+ s->bytes_recvd += recv;
+ }
+ clock_gettime(CLOCK_MONOTONIC, &s->end);
+ }
+
+ for (i = 0; i < iov_count; i++)
+ free(iov[i].iov_base);
+ free(iov);
+ return 0;
+out_errno:
+ for (i = 0; i < iov_count; i++)
+ free(iov[i].iov_base);
+ free(iov);
+ return errno;
+}
+
+static float giga = 1000000000;
+
+static inline float sentBps(struct msg_stats s)
+{
+ return s.bytes_sent / (s.end.tv_sec - s.start.tv_sec);
+}
+
+static inline float recvdBps(struct msg_stats s)
+{
+ return s.bytes_recvd / (s.end.tv_sec - s.start.tv_sec);
+}
+
+static int sendmsg_test(int iov_count, int iov_buf, int cnt,
+ int verbose, bool base)
+{
+ float sent_Bps = 0, recvd_Bps = 0;
+ int rx_fd, txpid, rxpid, err = 0;
+ struct msg_stats s = {0};
+ int status;
+
+ errno = 0;
+
+ if (base)
+ rx_fd = p1;
+ else
+ rx_fd = p2;
+
+ rxpid = fork();
+ if (rxpid == 0) {
+ err = msg_loop(rx_fd, iov_count, iov_buf, cnt, &s, false);
+ if (err)
+ fprintf(stderr,
+ "msg_loop_rx: iov_count %i iov_buf %i cnt %i err %i\n",
+ iov_count, iov_buf, cnt, err);
+ shutdown(p2, SHUT_RDWR);
+ shutdown(p1, SHUT_RDWR);
+ if (s.end.tv_sec - s.start.tv_sec) {
+ sent_Bps = sentBps(s);
+ recvd_Bps = recvdBps(s);
+ }
+ fprintf(stdout,
+ "rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB %fB/s %fGB/s\n",
+ s.bytes_sent, sent_Bps, sent_Bps/giga,
+ s.bytes_recvd, recvd_Bps, recvd_Bps/giga);
+ exit(1);
+ } else if (rxpid == -1) {
+ perror("msg_loop_rx: ");
+ return errno;
+ }
+
+ txpid = fork();
+ if (txpid == 0) {
+ err = msg_loop(c1, iov_count, iov_buf, cnt, &s, true);
+ if (err)
+ fprintf(stderr,
+ "msg_loop_tx: iov_count %i iov_buf %i cnt %i err %i\n",
+ iov_count, iov_buf, cnt, err);
+ shutdown(c1, SHUT_RDWR);
+ if (s.end.tv_sec - s.start.tv_sec) {
+ sent_Bps = sentBps(s);
+ recvd_Bps = recvdBps(s);
+ }
+ fprintf(stdout,
+ "tx_sendmsg: TX: %zuB %fB/s %f GB/s RX: %zuB %fB/s %fGB/s\n",
+ s.bytes_sent, sent_Bps, sent_Bps/giga,
+ s.bytes_recvd, recvd_Bps, recvd_Bps/giga);
+ exit(1);
+ } else if (txpid == -1) {
+ perror("msg_loop_tx: ");
+ return errno;
+ }
+
+ assert(waitpid(rxpid, &status, 0) == rxpid);
+ assert(waitpid(txpid, &status, 0) == txpid);
+ return err;
+}
+
+static int forever_ping_pong(int rate, int verbose)
+{
+ struct timeval timeout;
+ char buf[1024] = {0};
+ int sc;
+
+ timeout.tv_sec = 10;
+ timeout.tv_usec = 0;
/* Ping/Pong data from client to server */
sc = send(c1, buf, sizeof(buf), 0);
if (sc < 0) {
perror("send failed()\n");
- goto out;
+ return sc;
}
do {
- int s, rc, i;
+ int s, rc, i, max_fd = p2;
+ fd_set w;
/* FD sets */
FD_ZERO(&w);
@@ -193,7 +410,7 @@ static int sockmap_test_sockets(int rate, int dot)
if (rc < 0) {
if (errno != EWOULDBLOCK) {
perror("recv failed()\n");
- break;
+ return rc;
}
}
@@ -205,35 +422,92 @@ static int sockmap_test_sockets(int rate, int dot)
sc = send(i, buf, rc, 0);
if (sc < 0) {
perror("send failed()\n");
- break;
+ return sc;
}
}
- sleep(rate);
- if (dot) {
+
+ if (rate)
+ sleep(rate);
+
+ if (verbose) {
printf(".");
fflush(stdout);
}
} while (running);
-out:
- close(s1);
- close(s2);
- close(p1);
- close(p2);
- close(c1);
- close(c2);
- return err;
+ return 0;
}
+enum {
+ PING_PONG,
+ SENDMSG,
+ BASE,
+};
+
int main(int argc, char **argv)
{
- int rate = 1, dot = 1;
+ int iov_count = 1, length = 1024, rate = 1, verbose = 0;
+ struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY};
+ int opt, longindex, err, cg_fd = 0;
+ int test = PING_PONG;
char filename[256];
- int err, cg_fd;
- char *cg_path;
- cg_path = argv[argc - 1];
+ while ((opt = getopt_long(argc, argv, "hvc:r:i:l:t:",
+ long_options, &longindex)) != -1) {
+ switch (opt) {
+ /* Cgroup configuration */
+ case 'c':
+ cg_fd = open(optarg, O_DIRECTORY, O_RDONLY);
+ if (cg_fd < 0) {
+ fprintf(stderr,
+ "ERROR: (%i) open cg path failed: %s\n",
+ cg_fd, optarg);
+ return cg_fd;
+ }
+ break;
+ case 'r':
+ rate = atoi(optarg);
+ break;
+ case 'v':
+ verbose = 1;
+ break;
+ case 'i':
+ iov_count = atoi(optarg);
+ break;
+ case 'l':
+ length = atoi(optarg);
+ break;
+ case 't':
+ if (strcmp(optarg, "ping") == 0) {
+ test = PING_PONG;
+ } else if (strcmp(optarg, "sendmsg") == 0) {
+ test = SENDMSG;
+ } else if (strcmp(optarg, "base") == 0) {
+ test = BASE;
+ } else {
+ usage(argv);
+ return -1;
+ }
+ break;
+ case 'h':
+ default:
+ usage(argv);
+ return -1;
+ }
+ }
+
+ if (!cg_fd) {
+ fprintf(stderr, "%s requires cgroup option: --cgroup <path>\n",
+ argv[0]);
+ return -1;
+ }
+
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ perror("setrlimit(RLIMIT_MEMLOCK)");
+ return 1;
+ }
+
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
running = 1;
@@ -241,20 +515,16 @@ int main(int argc, char **argv)
/* catch SIGINT */
signal(SIGINT, running_handler);
+ /* If base test skip BPF setup */
+ if (test == BASE)
+ goto run;
+
if (load_bpf_file(filename)) {
fprintf(stderr, "load_bpf_file: (%s) %s\n",
filename, strerror(errno));
return 1;
}
- /* Cgroup configuration */
- cg_fd = open(cg_path, O_DIRECTORY, O_RDONLY);
- if (cg_fd < 0) {
- fprintf(stderr, "ERROR: (%i) open cg path failed: %s\n",
- cg_fd, cg_path);
- return cg_fd;
- }
-
/* Attach programs to sockmap */
err = bpf_prog_attach(prog_fd[0], map_fd[0],
BPF_SK_SKB_STREAM_PARSER, 0);
@@ -280,12 +550,30 @@ int main(int argc, char **argv)
return err;
}
- err = sockmap_test_sockets(rate, dot);
+run:
+ err = sockmap_init_sockets();
if (err) {
fprintf(stderr, "ERROR: test socket failed: %d\n", err);
- return err;
+ goto out;
}
- return 0;
+
+ if (test == PING_PONG)
+ err = forever_ping_pong(rate, verbose);
+ else if (test == SENDMSG)
+ err = sendmsg_test(iov_count, length, rate, verbose, false);
+ else if (test == BASE)
+ err = sendmsg_test(iov_count, length, rate, verbose, true);
+ else
+ fprintf(stderr, "unknown test\n");
+out:
+ close(s1);
+ close(s2);
+ close(p1);
+ close(p2);
+ close(c1);
+ close(c2);
+ close(cg_fd);
+ return err;
}
void running_handler(int a)
diff --git a/samples/statx/Makefile b/samples/statx/Makefile
index 1f80a3d8cf45..59df7c25a9d1 100644
--- a/samples/statx/Makefile
+++ b/samples/statx/Makefile
@@ -1,6 +1,3 @@
-# kbuild trick to avoid linker error. Can be omitted if a module is built.
-obj- := dummy.o
-
# List of programs to build
hostprogs-$(CONFIG_SAMPLE_STATX) := test-statx
diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h
index 5bcd91455ec8..80b4a70315b6 100644
--- a/samples/trace_events/trace-events-sample.h
+++ b/samples/trace_events/trace-events-sample.h
@@ -96,7 +96,7 @@
* __entry->bar.x = y;
* __array: There are three fields (type, name, size). The type is the
- * type of elements in teh array, the name is the name of the array.
+ * type of elements in the array, the name is the name of the array.
* size is the number of items in the array (not the total size).
*
* __array( char, foo, 10) is the same as saying: char foo[10];
@@ -113,7 +113,7 @@
* type is the type of the element, name is the name of the array.
* The size is different than __array. It is not a static number,
* but the algorithm to figure out the length of the array for the
- * specific instance of tracepoint. Again, size is the numebr of
+ * specific instance of tracepoint. Again, size is the number of
* items in the array, not the total length in bytes.
*
* __dynamic_array( int, foo, bar) is similar to: int foo[bar];
@@ -126,9 +126,9 @@
* Notice, that "__entry" is not needed here.
*
* __string: This is a special kind of __dynamic_array. It expects to
- * have a nul terminated character array passed to it (it allows
+ * have a null terminated character array passed to it (it allows
* for NULL too, which would be converted into "(null)"). __string
- * takes two paramenter (name, src), where name is the name of
+ * takes two parameter (name, src), where name is the name of
* the string saved, and src is the string to copy into the
* ring buffer.
*
@@ -445,7 +445,7 @@ DECLARE_EVENT_CLASS(foo_template,
/*
* Here's a better way for the previous samples (except, the first
- * exmaple had more fields and could not be used here).
+ * example had more fields and could not be used here).
*/
DEFINE_EVENT(foo_template, foo_with_template_simple,
TP_PROTO(const char *foo, int bar),
diff --git a/samples/uhid/Makefile b/samples/uhid/Makefile
index c95a696560a7..8d7fd6190ac4 100644
--- a/samples/uhid/Makefile
+++ b/samples/uhid/Makefile
@@ -1,6 +1,3 @@
-# kbuild trick to avoid linker error. Can be omitted if a module is built.
-obj- := dummy.o
-
# List of programs to build
hostprogs-y := uhid-example
diff --git a/samples/v4l/v4l2-pci-skeleton.c b/samples/v4l/v4l2-pci-skeleton.c
index 483e9bca9444..f520e3aef9c6 100644
--- a/samples/v4l/v4l2-pci-skeleton.c
+++ b/samples/v4l/v4l2-pci-skeleton.c
@@ -772,8 +772,10 @@ static int skeleton_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
/* Allocate a new instance */
skel = devm_kzalloc(&pdev->dev, sizeof(struct skeleton), GFP_KERNEL);
- if (!skel)
- return -ENOMEM;
+ if (!skel) {
+ ret = -ENOMEM;
+ goto disable_pci;
+ }
/* Allocate the interrupt */
ret = devm_request_irq(&pdev->dev, pdev->irq,
diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c
index ca495686b9c3..09f255bdf3ac 100644
--- a/samples/vfio-mdev/mtty.c
+++ b/samples/vfio-mdev/mtty.c
@@ -1413,7 +1413,7 @@ struct attribute_group *mdev_type_groups[] = {
NULL,
};
-struct mdev_parent_ops mdev_fops = {
+static const struct mdev_parent_ops mdev_fops = {
.owner = THIS_MODULE,
.dev_attr_groups = mtty_dev_groups,
.mdev_attr_groups = mdev_dev_groups,