diff options
Diffstat (limited to 'samples/bpf')
31 files changed, 1529 insertions, 797 deletions
diff --git a/samples/bpf/.gitignore b/samples/bpf/.gitignore index 8ae4940025f8..dbb817dbacfc 100644 --- a/samples/bpf/.gitignore +++ b/samples/bpf/.gitignore @@ -1,7 +1,6 @@ cpustat fds_example lathist -load_sock_ops lwt_len_hist map_perf_test offwaketime diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index a0ef7eddd0b3..65e667bdf979 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -40,7 +40,6 @@ hostprogs-y += lwt_len_hist hostprogs-y += xdp_tx_iptunnel hostprogs-y += test_map_in_map hostprogs-y += per_socket_stats_example -hostprogs-y += load_sock_ops hostprogs-y += xdp_redirect hostprogs-y += xdp_redirect_map hostprogs-y += xdp_redirect_cpu @@ -53,6 +52,7 @@ hostprogs-y += xdpsock hostprogs-y += xdp_fwd hostprogs-y += task_fd_query hostprogs-y += xdp_sample_pkts +hostprogs-y += hbm # Libbpf dependencies LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a @@ -60,9 +60,9 @@ LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a CGROUP_HELPERS := ../../tools/testing/selftests/bpf/cgroup_helpers.o TRACE_HELPERS := ../../tools/testing/selftests/bpf/trace_helpers.o -fds_example-objs := bpf_load.o fds_example.o -sockex1-objs := bpf_load.o sockex1_user.o -sockex2-objs := bpf_load.o sockex2_user.o +fds_example-objs := fds_example.o +sockex1-objs := sockex1_user.o +sockex2-objs := sockex2_user.o sockex3-objs := bpf_load.o sockex3_user.o tracex1-objs := bpf_load.o tracex1_user.o tracex2-objs := bpf_load.o tracex2_user.o @@ -71,7 +71,6 @@ tracex4-objs := bpf_load.o tracex4_user.o tracex5-objs := bpf_load.o tracex5_user.o tracex6-objs := bpf_load.o tracex6_user.o tracex7-objs := bpf_load.o tracex7_user.o -load_sock_ops-objs := bpf_load.o load_sock_ops.o test_probe_write_user-objs := bpf_load.o test_probe_write_user_user.o trace_output-objs := bpf_load.o trace_output_user.o $(TRACE_HELPERS) lathist-objs := bpf_load.o lathist_user.o @@ -109,6 +108,7 @@ xdpsock-objs := xdpsock_user.o xdp_fwd-objs := xdp_fwd_user.o task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS) xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS) +hbm-objs := bpf_load.o hbm.o $(CGROUP_HELPERS) # Tell kbuild to always build the programs always := $(hostprogs-y) @@ -163,10 +163,10 @@ always += xdp2skb_meta_kern.o always += syscall_tp_kern.o always += cpustat_kern.o always += xdp_adjust_tail_kern.o -always += xdpsock_kern.o always += xdp_fwd_kern.o always += task_fd_query_kern.o always += xdp_sample_pkts_kern.o +always += hbm_out_kern.o KBUILD_HOSTCFLAGS += -I$(objtree)/usr/include KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/ @@ -266,6 +266,8 @@ $(BPF_SAMPLES_PATH)/*.c: verify_target_bpf $(LIBBPF) $(src)/*.c: verify_target_bpf $(LIBBPF) $(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h +$(obj)/hbm_out_kern.o: $(src)/hbm.h $(src)/hbm_kern.h +$(obj)/hbm.o: $(src)/hbm.h # asm/sysreg.h - inline assembly used by it is incompatible with llvm. # But, there is no easy way to fix it, so just exclude it since it is diff --git a/samples/bpf/do_hbm_test.sh b/samples/bpf/do_hbm_test.sh new file mode 100755 index 000000000000..56c8b4115c95 --- /dev/null +++ b/samples/bpf/do_hbm_test.sh @@ -0,0 +1,436 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (c) 2019 Facebook +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of version 2 of the GNU General Public +# License as published by the Free Software Foundation. + +Usage() { + echo "Script for testing HBM (Host Bandwidth Manager) framework." + echo "It creates a cgroup to use for testing and load a BPF program to limit" + echo "egress or ingress bandwidht. It then uses iperf3 or netperf to create" + echo "loads. The output is the goodput in Mbps (unless -D was used)." + echo "" + echo "USAGE: $name [out] [-b=<prog>|--bpf=<prog>] [-c=<cc>|--cc=<cc>] [-D]" + echo " [-d=<delay>|--delay=<delay>] [--debug] [-E]" + echo " [-f=<#flows>|--flows=<#flows>] [-h] [-i=<id>|--id=<id >]" + echo " [-l] [-N] [-p=<port>|--port=<port>] [-P]" + echo " [-q=<qdisc>] [-R] [-s=<server>|--server=<server]" + echo " [-S|--stats] -t=<time>|--time=<time>] [-w] [cubic|dctcp]" + echo " Where:" + echo " out egress (default)" + echo " -b or --bpf BPF program filename to load and attach." + echo " Default is hbm_out_kern.o for egress," + echo " -c or -cc TCP congestion control (cubic or dctcp)" + echo " --debug print BPF trace buffer" + echo " -d or --delay add a delay in ms using netem" + echo " -D In addition to the goodput in Mbps, it also outputs" + echo " other detailed information. This information is" + echo " test dependent (i.e. iperf3 or netperf)." + echo " -E enable ECN (not required for dctcp)" + echo " -f or --flows number of concurrent flows (default=1)" + echo " -i or --id cgroup id (an integer, default is 1)" + echo " -N use netperf instead of iperf3" + echo " -l do not limit flows using loopback" + echo " -h Help" + echo " -p or --port iperf3 port (default is 5201)" + echo " -P use an iperf3 instance for each flow" + echo " -q use the specified qdisc" + echo " -r or --rate rate in Mbps (default 1s 1Gbps)" + echo " -R Use TCP_RR for netperf. 1st flow has req" + echo " size of 10KB, rest of 1MB. Reply in all" + echo " cases is 1 byte." + echo " More detailed output for each flow can be found" + echo " in the files netperf.<cg>.<flow>, where <cg> is the" + echo " cgroup id as specified with the -i flag, and <flow>" + echo " is the flow id starting at 1 and increasing by 1 for" + echo " flow (as specified by -f)." + echo " -s or --server hostname of netperf server. Used to create netperf" + echo " test traffic between to hosts (default is within host)" + echo " netserver must be running on the host." + echo " -S or --stats whether to update hbm stats (default is yes)." + echo " -t or --time duration of iperf3 in seconds (default=5)" + echo " -w Work conserving flag. cgroup can increase its" + echo " bandwidth beyond the rate limit specified" + echo " while there is available bandwidth. Current" + echo " implementation assumes there is only one NIC" + echo " (eth0), but can be extended to support multiple" + echo " NICs." + echo " cubic or dctcp specify which TCP CC to use" + echo " " + exit +} + +#set -x + +debug_flag=0 +args="$@" +name="$0" +netem=0 +cc=x +dir="-o" +dir_name="out" +dur=5 +flows=1 +id=1 +prog="" +port=5201 +rate=1000 +multi_iperf=0 +flow_cnt=1 +use_netperf=0 +rr=0 +ecn=0 +details=0 +server="" +qdisc="" +flags="" +do_stats=0 + +function start_hbm () { + rm -f hbm.out + echo "./hbm $dir -n $id -r $rate -t $dur $flags $dbg $prog" > hbm.out + echo " " >> hbm.out + ./hbm $dir -n $id -r $rate -t $dur $flags $dbg $prog >> hbm.out 2>&1 & + echo $! +} + +processArgs () { + for i in $args ; do + case $i in + # Support for upcomming ingress rate limiting + #in) # support for upcoming ingress rate limiting + # dir="-i" + # dir_name="in" + # ;; + out) + dir="-o" + dir_name="out" + ;; + -b=*|--bpf=*) + prog="${i#*=}" + ;; + -c=*|--cc=*) + cc="${i#*=}" + ;; + --debug) + flags="$flags -d" + debug_flag=1 + ;; + -d=*|--delay=*) + netem="${i#*=}" + ;; + -D) + details=1 + ;; + -E) + ecn=1 + ;; + # Support for upcomming fq Early Departure Time egress rate limiting + #--edt) + # prog="hbm_out_edt_kern.o" + # qdisc="fq" + # ;; + -f=*|--flows=*) + flows="${i#*=}" + ;; + -i=*|--id=*) + id="${i#*=}" + ;; + -l) + flags="$flags -l" + ;; + -N) + use_netperf=1 + ;; + -p=*|--port=*) + port="${i#*=}" + ;; + -P) + multi_iperf=1 + ;; + -q=*) + qdisc="${i#*=}" + ;; + -r=*|--rate=*) + rate="${i#*=}" + ;; + -R) + rr=1 + ;; + -s=*|--server=*) + server="${i#*=}" + ;; + -S|--stats) + flags="$flags -s" + do_stats=1 + ;; + -t=*|--time=*) + dur="${i#*=}" + ;; + -w) + flags="$flags -w" + ;; + cubic) + cc=cubic + ;; + dctcp) + cc=dctcp + ;; + *) + echo "Unknown arg:$i" + Usage + ;; + esac + done +} + +processArgs + +if [ $debug_flag -eq 1 ] ; then + rm -f hbm_out.log +fi + +hbm_pid=$(start_hbm) +usleep 100000 + +host=`hostname` +cg_base_dir=/sys/fs/cgroup +cg_dir="$cg_base_dir/cgroup-test-work-dir/hbm$id" + +echo $$ >> $cg_dir/cgroup.procs + +ulimit -l unlimited + +rm -f ss.out +rm -f hbm.[0-9]*.$dir_name +if [ $ecn -ne 0 ] ; then + sysctl -w -q -n net.ipv4.tcp_ecn=1 +fi + +if [ $use_netperf -eq 0 ] ; then + cur_cc=`sysctl -n net.ipv4.tcp_congestion_control` + if [ "$cc" != "x" ] ; then + sysctl -w -q -n net.ipv4.tcp_congestion_control=$cc + fi +fi + +if [ "$netem" -ne "0" ] ; then + if [ "$qdisc" != "" ] ; then + echo "WARNING: Ignoring -q options because -d option used" + fi + tc qdisc del dev lo root > /dev/null 2>&1 + tc qdisc add dev lo root netem delay $netem\ms > /dev/null 2>&1 +elif [ "$qdisc" != "" ] ; then + tc qdisc del dev lo root > /dev/null 2>&1 + tc qdisc add dev lo root $qdisc > /dev/null 2>&1 +fi + +n=0 +m=$[$dur * 5] +hn="::1" +if [ $use_netperf -ne 0 ] ; then + if [ "$server" != "" ] ; then + hn=$server + fi +fi + +( ping6 -i 0.2 -c $m $hn > ping.out 2>&1 ) & + +if [ $use_netperf -ne 0 ] ; then + begNetserverPid=`ps ax | grep netserver | grep --invert-match "grep" | \ + awk '{ print $1 }'` + if [ "$begNetserverPid" == "" ] ; then + if [ "$server" == "" ] ; then + ( ./netserver > /dev/null 2>&1) & + usleep 100000 + fi + fi + flow_cnt=1 + if [ "$server" == "" ] ; then + np_server=$host + else + np_server=$server + fi + if [ "$cc" == "x" ] ; then + np_cc="" + else + np_cc="-K $cc,$cc" + fi + replySize=1 + while [ $flow_cnt -le $flows ] ; do + if [ $rr -ne 0 ] ; then + reqSize=1M + if [ $flow_cnt -eq 1 ] ; then + reqSize=10K + fi + if [ "$dir" == "-i" ] ; then + replySize=$reqSize + reqSize=1 + fi + ( ./netperf -H $np_server -l $dur -f m -j -t TCP_RR -- -r $reqSize,$replySize $np_cc -k P50_lATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,REMOTE_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,LOCAL_RECV_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) & + else + if [ "$dir" == "-i" ] ; then + ( ./netperf -H $np_server -l $dur -f m -j -t TCP_RR -- -r 1,10M $np_cc -k P50_LATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,REMOTE_TRANSPORT_RETRANS,REMOTE_SEND_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) & + else + ( ./netperf -H $np_server -l $dur -f m -j -t TCP_STREAM -- $np_cc -k P50_lATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) & + fi + fi + flow_cnt=$[flow_cnt+1] + done + +# sleep for duration of test (plus some buffer) + n=$[dur+2] + sleep $n + +# force graceful termination of netperf + pids=`pgrep netperf` + for p in $pids ; do + kill -SIGALRM $p + done + + flow_cnt=1 + rate=0 + if [ $details -ne 0 ] ; then + echo "" + echo "Details for HBM in cgroup $id" + if [ $do_stats -eq 1 ] ; then + if [ -e hbm.$id.$dir_name ] ; then + cat hbm.$id.$dir_name + fi + fi + fi + while [ $flow_cnt -le $flows ] ; do + if [ "$dir" == "-i" ] ; then + r=`cat netperf.$id.$flow_cnt | grep -o "REMOTE_SEND_THROUGHPUT=[0-9]*" | grep -o "[0-9]*"` + else + r=`cat netperf.$id.$flow_cnt | grep -o "LOCAL_SEND_THROUGHPUT=[0-9]*" | grep -o "[0-9]*"` + fi + echo "rate for flow $flow_cnt: $r" + rate=$[rate+r] + if [ $details -ne 0 ] ; then + echo "-----" + echo "Details for cgroup $id, flow $flow_cnt" + cat netperf.$id.$flow_cnt + fi + flow_cnt=$[flow_cnt+1] + done + if [ $details -ne 0 ] ; then + echo "" + delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"` + echo "PING AVG DELAY:$delay" + echo "AGGREGATE_GOODPUT:$rate" + else + echo $rate + fi +elif [ $multi_iperf -eq 0 ] ; then + (iperf3 -s -p $port -1 > /dev/null 2>&1) & + usleep 100000 + iperf3 -c $host -p $port -i 0 -P $flows -f m -t $dur > iperf.$id + rates=`grep receiver iperf.$id | grep -o "[0-9.]* Mbits" | grep -o "^[0-9]*"` + rate=`echo $rates | grep -o "[0-9]*$"` + + if [ $details -ne 0 ] ; then + echo "" + echo "Details for HBM in cgroup $id" + if [ $do_stats -eq 1 ] ; then + if [ -e hbm.$id.$dir_name ] ; then + cat hbm.$id.$dir_name + fi + fi + delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"` + echo "PING AVG DELAY:$delay" + echo "AGGREGATE_GOODPUT:$rate" + else + echo $rate + fi +else + flow_cnt=1 + while [ $flow_cnt -le $flows ] ; do + (iperf3 -s -p $port -1 > /dev/null 2>&1) & + ( iperf3 -c $host -p $port -i 0 -P 1 -f m -t $dur | grep receiver | grep -o "[0-9.]* Mbits" | grep -o "^[0-9]*" | grep -o "[0-9]*$" > iperf3.$id.$flow_cnt ) & + port=$[port+1] + flow_cnt=$[flow_cnt+1] + done + n=$[dur+1] + sleep $n + flow_cnt=1 + rate=0 + if [ $details -ne 0 ] ; then + echo "" + echo "Details for HBM in cgroup $id" + if [ $do_stats -eq 1 ] ; then + if [ -e hbm.$id.$dir_name ] ; then + cat hbm.$id.$dir_name + fi + fi + fi + + while [ $flow_cnt -le $flows ] ; do + r=`cat iperf3.$id.$flow_cnt` +# echo "rate for flow $flow_cnt: $r" + if [ $details -ne 0 ] ; then + echo "Rate for cgroup $id, flow $flow_cnt LOCAL_SEND_THROUGHPUT=$r" + fi + rate=$[rate+r] + flow_cnt=$[flow_cnt+1] + done + if [ $details -ne 0 ] ; then + delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"` + echo "PING AVG DELAY:$delay" + echo "AGGREGATE_GOODPUT:$rate" + else + echo $rate + fi +fi + +if [ $use_netperf -eq 0 ] ; then + sysctl -w -q -n net.ipv4.tcp_congestion_control=$cur_cc +fi +if [ $ecn -ne 0 ] ; then + sysctl -w -q -n net.ipv4.tcp_ecn=0 +fi +if [ "$netem" -ne "0" ] ; then + tc qdisc del dev lo root > /dev/null 2>&1 +fi + +sleep 2 + +hbmPid=`ps ax | grep "hbm " | grep --invert-match "grep" | awk '{ print $1 }'` +if [ "$hbmPid" == "$hbm_pid" ] ; then + kill $hbm_pid +fi + +sleep 1 + +# Detach any BPF programs that may have lingered +ttx=`bpftool cgroup tree | grep hbm` +v=2 +for x in $ttx ; do + if [ "${x:0:36}" == "/sys/fs/cgroup/cgroup-test-work-dir/" ] ; then + cg=$x ; v=0 + else + if [ $v -eq 0 ] ; then + id=$x ; v=1 + else + if [ $v -eq 1 ] ; then + type=$x ; bpftool cgroup detach $cg $type id $id + v=0 + fi + fi + fi +done + +if [ $use_netperf -ne 0 ] ; then + if [ "$server" == "" ] ; then + if [ "$begNetserverPid" == "" ] ; then + netserverPid=`ps ax | grep netserver | grep --invert-match "grep" | awk '{ print $1 }'` + if [ "$netserverPid" != "" ] ; then + kill $netserverPid + fi + fi + fi +fi +exit diff --git a/samples/bpf/fds_example.c b/samples/bpf/fds_example.c index 9854854f05d1..e51eb060244e 100644 --- a/samples/bpf/fds_example.c +++ b/samples/bpf/fds_example.c @@ -14,8 +14,8 @@ #include <bpf/bpf.h> +#include "bpf/libbpf.h" #include "bpf_insn.h" -#include "bpf_load.h" #include "sock_example.h" #define BPF_F_PIN (1 << 0) @@ -57,10 +57,14 @@ static int bpf_prog_create(const char *object) BPF_EXIT_INSN(), }; size_t insns_cnt = sizeof(insns) / sizeof(struct bpf_insn); + char bpf_log_buf[BPF_LOG_BUF_SIZE]; + struct bpf_object *obj; + int prog_fd; if (object) { - assert(!load_bpf_file((char *)object)); - return prog_fd[0]; + assert(!bpf_prog_load(object, BPF_PROG_TYPE_UNSPEC, + &obj, &prog_fd)); + return prog_fd; } else { return bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER, insns, insns_cnt, "GPL", 0, diff --git a/samples/bpf/hbm.c b/samples/bpf/hbm.c new file mode 100644 index 000000000000..8408ccb7409f --- /dev/null +++ b/samples/bpf/hbm.c @@ -0,0 +1,441 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Example program for Host Bandwidth Managment + * + * This program loads a cgroup skb BPF program to enforce cgroup output + * (egress) or input (ingress) bandwidth limits. + * + * USAGE: hbm [-d] [-l] [-n <id>] [-r <rate>] [-s] [-t <secs>] [-w] [-h] [prog] + * Where: + * -d Print BPF trace debug buffer + * -l Also limit flows doing loopback + * -n <#> To create cgroup \"/hbm#\" and attach prog + * Default is /hbm1 + * -r <rate> Rate limit in Mbps + * -s Get HBM stats (marked, dropped, etc.) + * -t <time> Exit after specified seconds (deault is 0) + * -w Work conserving flag. cgroup can increase its bandwidth + * beyond the rate limit specified while there is available + * bandwidth. Current implementation assumes there is only + * NIC (eth0), but can be extended to support multiple NICs. + * Currrently only supported for egress. + * -h Print this info + * prog BPF program file name. Name defaults to hbm_out_kern.o + */ + +#define _GNU_SOURCE + +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> +#include <sys/resource.h> +#include <sys/time.h> +#include <unistd.h> +#include <errno.h> +#include <fcntl.h> +#include <linux/unistd.h> + +#include <linux/bpf.h> +#include <bpf/bpf.h> + +#include "bpf_load.h" +#include "bpf_rlimit.h" +#include "cgroup_helpers.h" +#include "hbm.h" +#include "bpf_util.h" +#include "bpf/bpf.h" +#include "bpf/libbpf.h" + +bool outFlag = true; +int minRate = 1000; /* cgroup rate limit in Mbps */ +int rate = 1000; /* can grow if rate conserving is enabled */ +int dur = 1; +bool stats_flag; +bool loopback_flag; +bool debugFlag; +bool work_conserving_flag; + +static void Usage(void); +static void read_trace_pipe2(void); +static void do_error(char *msg, bool errno_flag); + +#define DEBUGFS "/sys/kernel/debug/tracing/" + +struct bpf_object *obj; +int bpfprog_fd; +int cgroup_storage_fd; + +static void read_trace_pipe2(void) +{ + int trace_fd; + FILE *outf; + char *outFname = "hbm_out.log"; + + trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0); + if (trace_fd < 0) { + printf("Error opening trace_pipe\n"); + return; + } + +// Future support of ingress +// if (!outFlag) +// outFname = "hbm_in.log"; + outf = fopen(outFname, "w"); + + if (outf == NULL) + printf("Error creating %s\n", outFname); + + while (1) { + static char buf[4097]; + ssize_t sz; + + sz = read(trace_fd, buf, sizeof(buf) - 1); + if (sz > 0) { + buf[sz] = 0; + puts(buf); + if (outf != NULL) { + fprintf(outf, "%s\n", buf); + fflush(outf); + } + } + } +} + +static void do_error(char *msg, bool errno_flag) +{ + if (errno_flag) + printf("ERROR: %s, errno: %d\n", msg, errno); + else + printf("ERROR: %s\n", msg); + exit(1); +} + +static int prog_load(char *prog) +{ + struct bpf_prog_load_attr prog_load_attr = { + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .file = prog, + .expected_attach_type = BPF_CGROUP_INET_EGRESS, + }; + int map_fd; + struct bpf_map *map; + + int ret = 0; + + if (access(prog, O_RDONLY) < 0) { + printf("Error accessing file %s: %s\n", prog, strerror(errno)); + return 1; + } + if (bpf_prog_load_xattr(&prog_load_attr, &obj, &bpfprog_fd)) + ret = 1; + if (!ret) { + map = bpf_object__find_map_by_name(obj, "queue_stats"); + map_fd = bpf_map__fd(map); + if (map_fd < 0) { + printf("Map not found: %s\n", strerror(map_fd)); + ret = 1; + } + } + + if (ret) { + printf("ERROR: load_bpf_file failed for: %s\n", prog); + printf(" Output from verifier:\n%s\n------\n", bpf_log_buf); + ret = -1; + } else { + ret = map_fd; + } + + return ret; +} + +static int run_bpf_prog(char *prog, int cg_id) +{ + int map_fd; + int rc = 0; + int key = 0; + int cg1 = 0; + int type = BPF_CGROUP_INET_EGRESS; + char cg_dir[100]; + struct hbm_queue_stats qstats = {0}; + + sprintf(cg_dir, "/hbm%d", cg_id); + map_fd = prog_load(prog); + if (map_fd == -1) + return 1; + + if (setup_cgroup_environment()) { + printf("ERROR: setting cgroup environment\n"); + goto err; + } + cg1 = create_and_get_cgroup(cg_dir); + if (!cg1) { + printf("ERROR: create_and_get_cgroup\n"); + goto err; + } + if (join_cgroup(cg_dir)) { + printf("ERROR: join_cgroup\n"); + goto err; + } + + qstats.rate = rate; + qstats.stats = stats_flag ? 1 : 0; + qstats.loopback = loopback_flag ? 1 : 0; + if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) { + printf("ERROR: Could not update map element\n"); + goto err; + } + + if (!outFlag) + type = BPF_CGROUP_INET_INGRESS; + if (bpf_prog_attach(bpfprog_fd, cg1, type, 0)) { + printf("ERROR: bpf_prog_attach fails!\n"); + log_err("Attaching prog"); + goto err; + } + + if (work_conserving_flag) { + struct timeval t0, t_last, t_new; + FILE *fin; + unsigned long long last_eth_tx_bytes, new_eth_tx_bytes; + signed long long last_cg_tx_bytes, new_cg_tx_bytes; + signed long long delta_time, delta_bytes, delta_rate; + int delta_ms; +#define DELTA_RATE_CHECK 10000 /* in us */ +#define RATE_THRESHOLD 9500000000 /* 9.5 Gbps */ + + bpf_map_lookup_elem(map_fd, &key, &qstats); + if (gettimeofday(&t0, NULL) < 0) + do_error("gettimeofday failed", true); + t_last = t0; + fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", "r"); + if (fscanf(fin, "%llu", &last_eth_tx_bytes) != 1) + do_error("fscanf fails", false); + fclose(fin); + last_cg_tx_bytes = qstats.bytes_total; + while (true) { + usleep(DELTA_RATE_CHECK); + if (gettimeofday(&t_new, NULL) < 0) + do_error("gettimeofday failed", true); + delta_ms = (t_new.tv_sec - t0.tv_sec) * 1000 + + (t_new.tv_usec - t0.tv_usec)/1000; + if (delta_ms > dur * 1000) + break; + delta_time = (t_new.tv_sec - t_last.tv_sec) * 1000000 + + (t_new.tv_usec - t_last.tv_usec); + if (delta_time == 0) + continue; + t_last = t_new; + fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", + "r"); + if (fscanf(fin, "%llu", &new_eth_tx_bytes) != 1) + do_error("fscanf fails", false); + fclose(fin); + printf(" new_eth_tx_bytes:%llu\n", + new_eth_tx_bytes); + bpf_map_lookup_elem(map_fd, &key, &qstats); + new_cg_tx_bytes = qstats.bytes_total; + delta_bytes = new_eth_tx_bytes - last_eth_tx_bytes; + last_eth_tx_bytes = new_eth_tx_bytes; + delta_rate = (delta_bytes * 8000000) / delta_time; + printf("%5d - eth_rate:%.1fGbps cg_rate:%.3fGbps", + delta_ms, delta_rate/1000000000.0, + rate/1000.0); + if (delta_rate < RATE_THRESHOLD) { + /* can increase cgroup rate limit, but first + * check if we are using the current limit. + * Currently increasing by 6.25%, unknown + * if that is the optimal rate. + */ + int rate_diff100; + + delta_bytes = new_cg_tx_bytes - + last_cg_tx_bytes; + last_cg_tx_bytes = new_cg_tx_bytes; + delta_rate = (delta_bytes * 8000000) / + delta_time; + printf(" rate:%.3fGbps", + delta_rate/1000000000.0); + rate_diff100 = (((long long)rate)*1000000 - + delta_rate) * 100 / + (((long long) rate) * 1000000); + printf(" rdiff:%d", rate_diff100); + if (rate_diff100 <= 3) { + rate += (rate >> 4); + if (rate > RATE_THRESHOLD / 1000000) + rate = RATE_THRESHOLD / 1000000; + qstats.rate = rate; + printf(" INC\n"); + } else { + printf("\n"); + } + } else { + /* Need to decrease cgroup rate limit. + * Currently decreasing by 12.5%, unknown + * if that is optimal + */ + printf(" DEC\n"); + rate -= (rate >> 3); + if (rate < minRate) + rate = minRate; + qstats.rate = rate; + } + if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) + do_error("update map element fails", false); + } + } else { + sleep(dur); + } + // Get stats! + if (stats_flag && bpf_map_lookup_elem(map_fd, &key, &qstats)) { + char fname[100]; + FILE *fout; + + if (!outFlag) + sprintf(fname, "hbm.%d.in", cg_id); + else + sprintf(fname, "hbm.%d.out", cg_id); + fout = fopen(fname, "w"); + fprintf(fout, "id:%d\n", cg_id); + fprintf(fout, "ERROR: Could not lookup queue_stats\n"); + } else if (stats_flag && qstats.lastPacketTime > + qstats.firstPacketTime) { + long long delta_us = (qstats.lastPacketTime - + qstats.firstPacketTime)/1000; + unsigned int rate_mbps = ((qstats.bytes_total - + qstats.bytes_dropped) * 8 / + delta_us); + double percent_pkts, percent_bytes; + char fname[100]; + FILE *fout; + +// Future support of ingress +// if (!outFlag) +// sprintf(fname, "hbm.%d.in", cg_id); +// else + sprintf(fname, "hbm.%d.out", cg_id); + fout = fopen(fname, "w"); + fprintf(fout, "id:%d\n", cg_id); + fprintf(fout, "rate_mbps:%d\n", rate_mbps); + fprintf(fout, "duration:%.1f secs\n", + (qstats.lastPacketTime - qstats.firstPacketTime) / + 1000000000.0); + fprintf(fout, "packets:%d\n", (int)qstats.pkts_total); + fprintf(fout, "bytes_MB:%d\n", (int)(qstats.bytes_total / + 1000000)); + fprintf(fout, "pkts_dropped:%d\n", (int)qstats.pkts_dropped); + fprintf(fout, "bytes_dropped_MB:%d\n", + (int)(qstats.bytes_dropped / + 1000000)); + // Marked Pkts and Bytes + percent_pkts = (qstats.pkts_marked * 100.0) / + (qstats.pkts_total + 1); + percent_bytes = (qstats.bytes_marked * 100.0) / + (qstats.bytes_total + 1); + fprintf(fout, "pkts_marked_percent:%6.2f\n", percent_pkts); + fprintf(fout, "bytes_marked_percent:%6.2f\n", percent_bytes); + + // Dropped Pkts and Bytes + percent_pkts = (qstats.pkts_dropped * 100.0) / + (qstats.pkts_total + 1); + percent_bytes = (qstats.bytes_dropped * 100.0) / + (qstats.bytes_total + 1); + fprintf(fout, "pkts_dropped_percent:%6.2f\n", percent_pkts); + fprintf(fout, "bytes_dropped_percent:%6.2f\n", percent_bytes); + fclose(fout); + } + + if (debugFlag) + read_trace_pipe2(); + return rc; +err: + rc = 1; + + if (cg1) + close(cg1); + cleanup_cgroup_environment(); + + return rc; +} + +static void Usage(void) +{ + printf("This program loads a cgroup skb BPF program to enforce\n" + "cgroup output (egress) bandwidth limits.\n\n" + "USAGE: hbm [-o] [-d] [-l] [-n <id>] [-r <rate>] [-s]\n" + " [-t <secs>] [-w] [-h] [prog]\n" + " Where:\n" + " -o indicates egress direction (default)\n" + " -d print BPF trace debug buffer\n" + " -l also limit flows using loopback\n" + " -n <#> to create cgroup \"/hbm#\" and attach prog\n" + " Default is /hbm1\n" + " -r <rate> Rate in Mbps\n" + " -s Update HBM stats\n" + " -t <time> Exit after specified seconds (deault is 0)\n" + " -w Work conserving flag. cgroup can increase\n" + " bandwidth beyond the rate limit specified\n" + " while there is available bandwidth. Current\n" + " implementation assumes there is only eth0\n" + " but can be extended to support multiple NICs\n" + " -h print this info\n" + " prog BPF program file name. Name defaults to\n" + " hbm_out_kern.o\n"); +} + +int main(int argc, char **argv) +{ + char *prog = "hbm_out_kern.o"; + int k; + int cg_id = 1; + char *optstring = "iodln:r:st:wh"; + + while ((k = getopt(argc, argv, optstring)) != -1) { + switch (k) { + case'o': + break; + case 'd': + debugFlag = true; + break; + case 'l': + loopback_flag = true; + break; + case 'n': + cg_id = atoi(optarg); + break; + case 'r': + minRate = atoi(optarg) * 1.024; + rate = minRate; + break; + case 's': + stats_flag = true; + break; + case 't': + dur = atoi(optarg); + break; + case 'w': + work_conserving_flag = true; + break; + case '?': + if (optopt == 'n' || optopt == 'r' || optopt == 't') + fprintf(stderr, + "Option -%c requires an argument.\n\n", + optopt); + case 'h': + // fallthrough + default: + Usage(); + return 0; + } + } + + if (optind < argc) + prog = argv[optind]; + printf("HBM prog: %s\n", prog != NULL ? prog : "NULL"); + + return run_bpf_prog(prog, cg_id); +} diff --git a/samples/bpf/hbm.h b/samples/bpf/hbm.h new file mode 100644 index 000000000000..518e8147d084 --- /dev/null +++ b/samples/bpf/hbm.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (c) 2019 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Include file for Host Bandwidth Management (HBM) programs + */ +struct hbm_vqueue { + struct bpf_spin_lock lock; + /* 4 byte hole */ + unsigned long long lasttime; /* In ns */ + int credit; /* In bytes */ + unsigned int rate; /* In bytes per NS << 20 */ +}; + +struct hbm_queue_stats { + unsigned long rate; /* in Mbps*/ + unsigned long stats:1, /* get HBM stats (marked, dropped,..) */ + loopback:1; /* also limit flows using loopback */ + unsigned long long pkts_marked; + unsigned long long bytes_marked; + unsigned long long pkts_dropped; + unsigned long long bytes_dropped; + unsigned long long pkts_total; + unsigned long long bytes_total; + unsigned long long firstPacketTime; + unsigned long long lastPacketTime; +}; diff --git a/samples/bpf/hbm_kern.h b/samples/bpf/hbm_kern.h new file mode 100644 index 000000000000..c5635d924193 --- /dev/null +++ b/samples/bpf/hbm_kern.h @@ -0,0 +1,137 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (c) 2019 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Include file for sample Host Bandwidth Manager (HBM) BPF programs + */ +#define KBUILD_MODNAME "foo" +#include <stddef.h> +#include <stdbool.h> +#include <uapi/linux/bpf.h> +#include <uapi/linux/if_ether.h> +#include <uapi/linux/if_packet.h> +#include <uapi/linux/ip.h> +#include <uapi/linux/ipv6.h> +#include <uapi/linux/in.h> +#include <uapi/linux/tcp.h> +#include <uapi/linux/filter.h> +#include <uapi/linux/pkt_cls.h> +#include <net/ipv6.h> +#include <net/inet_ecn.h> +#include "bpf_endian.h" +#include "bpf_helpers.h" +#include "hbm.h" + +#define DROP_PKT 0 +#define ALLOW_PKT 1 +#define TCP_ECN_OK 1 + +#define HBM_DEBUG 0 // Set to 1 to enable debugging +#if HBM_DEBUG +#define bpf_printk(fmt, ...) \ +({ \ + char ____fmt[] = fmt; \ + bpf_trace_printk(____fmt, sizeof(____fmt), \ + ##__VA_ARGS__); \ +}) +#else +#define bpf_printk(fmt, ...) +#endif + +#define INITIAL_CREDIT_PACKETS 100 +#define MAX_BYTES_PER_PACKET 1500 +#define MARK_THRESH (40 * MAX_BYTES_PER_PACKET) +#define DROP_THRESH (80 * 5 * MAX_BYTES_PER_PACKET) +#define LARGE_PKT_DROP_THRESH (DROP_THRESH - (15 * MAX_BYTES_PER_PACKET)) +#define MARK_REGION_SIZE (LARGE_PKT_DROP_THRESH - MARK_THRESH) +#define LARGE_PKT_THRESH 120 +#define MAX_CREDIT (100 * MAX_BYTES_PER_PACKET) +#define INIT_CREDIT (INITIAL_CREDIT_PACKETS * MAX_BYTES_PER_PACKET) + +// rate in bytes per ns << 20 +#define CREDIT_PER_NS(delta, rate) ((((u64)(delta)) * (rate)) >> 20) + +struct bpf_map_def SEC("maps") queue_state = { + .type = BPF_MAP_TYPE_CGROUP_STORAGE, + .key_size = sizeof(struct bpf_cgroup_storage_key), + .value_size = sizeof(struct hbm_vqueue), +}; +BPF_ANNOTATE_KV_PAIR(queue_state, struct bpf_cgroup_storage_key, + struct hbm_vqueue); + +struct bpf_map_def SEC("maps") queue_stats = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct hbm_queue_stats), + .max_entries = 1, +}; +BPF_ANNOTATE_KV_PAIR(queue_stats, int, struct hbm_queue_stats); + +struct hbm_pkt_info { + bool is_ip; + bool is_tcp; + short ecn; +}; + +static __always_inline void hbm_get_pkt_info(struct __sk_buff *skb, + struct hbm_pkt_info *pkti) +{ + struct iphdr iph; + struct ipv6hdr *ip6h; + + bpf_skb_load_bytes(skb, 0, &iph, 12); + if (iph.version == 6) { + ip6h = (struct ipv6hdr *)&iph; + pkti->is_ip = true; + pkti->is_tcp = (ip6h->nexthdr == 6); + pkti->ecn = (ip6h->flow_lbl[0] >> 4) & INET_ECN_MASK; + } else if (iph.version == 4) { + pkti->is_ip = true; + pkti->is_tcp = (iph.protocol == 6); + pkti->ecn = iph.tos & INET_ECN_MASK; + } else { + pkti->is_ip = false; + pkti->is_tcp = false; + pkti->ecn = 0; + } +} + +static __always_inline void hbm_init_vqueue(struct hbm_vqueue *qdp, int rate) +{ + bpf_printk("Initializing queue_state, rate:%d\n", rate * 128); + qdp->lasttime = bpf_ktime_get_ns(); + qdp->credit = INIT_CREDIT; + qdp->rate = rate * 128; +} + +static __always_inline void hbm_update_stats(struct hbm_queue_stats *qsp, + int len, + unsigned long long curtime, + bool congestion_flag, + bool drop_flag) +{ + if (qsp != NULL) { + // Following is needed for work conserving + __sync_add_and_fetch(&(qsp->bytes_total), len); + if (qsp->stats) { + // Optionally update statistics + if (qsp->firstPacketTime == 0) + qsp->firstPacketTime = curtime; + qsp->lastPacketTime = curtime; + __sync_add_and_fetch(&(qsp->pkts_total), 1); + if (congestion_flag || drop_flag) { + __sync_add_and_fetch(&(qsp->pkts_marked), 1); + __sync_add_and_fetch(&(qsp->bytes_marked), len); + } + if (drop_flag) { + __sync_add_and_fetch(&(qsp->pkts_dropped), 1); + __sync_add_and_fetch(&(qsp->bytes_dropped), + len); + } + } + } +} diff --git a/samples/bpf/hbm_out_kern.c b/samples/bpf/hbm_out_kern.c new file mode 100644 index 000000000000..f806863d0b79 --- /dev/null +++ b/samples/bpf/hbm_out_kern.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Sample Host Bandwidth Manager (HBM) BPF program. + * + * A cgroup skb BPF egress program to limit cgroup output bandwidth. + * It uses a modified virtual token bucket queue to limit average + * egress bandwidth. The implementation uses credits instead of tokens. + * Negative credits imply that queueing would have happened (this is + * a virtual queue, so no queueing is done by it. However, queueing may + * occur at the actual qdisc (which is not used for rate limiting). + * + * This implementation uses 3 thresholds, one to start marking packets and + * the other two to drop packets: + * CREDIT + * - <--------------------------|------------------------> + + * | | | 0 + * | Large pkt | + * | drop thresh | + * Small pkt drop Mark threshold + * thresh + * + * The effect of marking depends on the type of packet: + * a) If the packet is ECN enabled and it is a TCP packet, then the packet + * is ECN marked. + * b) If the packet is a TCP packet, then we probabilistically call tcp_cwr + * to reduce the congestion window. The current implementation uses a linear + * distribution (0% probability at marking threshold, 100% probability + * at drop threshold). + * c) If the packet is not a TCP packet, then it is dropped. + * + * If the credit is below the drop threshold, the packet is dropped. If it + * is a TCP packet, then it also calls tcp_cwr since packets dropped by + * by a cgroup skb BPF program do not automatically trigger a call to + * tcp_cwr in the current kernel code. + * + * This BPF program actually uses 2 drop thresholds, one threshold + * for larger packets (>= 120 bytes) and another for smaller packets. This + * protects smaller packets such as SYNs, ACKs, etc. + * + * The default bandwidth limit is set at 1Gbps but this can be changed by + * a user program through a shared BPF map. In addition, by default this BPF + * program does not limit connections using loopback. This behavior can be + * overwritten by the user program. There is also an option to calculate + * some statistics, such as percent of packets marked or dropped, which + * the user program can access. + * + * A latter patch provides such a program (hbm.c) + */ + +#include "hbm_kern.h" + +SEC("cgroup_skb/egress") +int _hbm_out_cg(struct __sk_buff *skb) +{ + struct hbm_pkt_info pkti; + int len = skb->len; + unsigned int queue_index = 0; + unsigned long long curtime; + int credit; + signed long long delta = 0, zero = 0; + int max_credit = MAX_CREDIT; + bool congestion_flag = false; + bool drop_flag = false; + bool cwr_flag = false; + struct hbm_vqueue *qdp; + struct hbm_queue_stats *qsp = NULL; + int rv = ALLOW_PKT; + + qsp = bpf_map_lookup_elem(&queue_stats, &queue_index); + if (qsp != NULL && !qsp->loopback && (skb->ifindex == 1)) + return ALLOW_PKT; + + hbm_get_pkt_info(skb, &pkti); + + // We may want to account for the length of headers in len + // calculation, like ETH header + overhead, specially if it + // is a gso packet. But I am not doing it right now. + + qdp = bpf_get_local_storage(&queue_state, 0); + if (!qdp) + return ALLOW_PKT; + else if (qdp->lasttime == 0) + hbm_init_vqueue(qdp, 1024); + + curtime = bpf_ktime_get_ns(); + + // Begin critical section + bpf_spin_lock(&qdp->lock); + credit = qdp->credit; + delta = curtime - qdp->lasttime; + /* delta < 0 implies that another process with a curtime greater + * than ours beat us to the critical section and already added + * the new credit, so we should not add it ourselves + */ + if (delta > 0) { + qdp->lasttime = curtime; + credit += CREDIT_PER_NS(delta, qdp->rate); + if (credit > MAX_CREDIT) + credit = MAX_CREDIT; + } + credit -= len; + qdp->credit = credit; + bpf_spin_unlock(&qdp->lock); + // End critical section + + // Check if we should update rate + if (qsp != NULL && (qsp->rate * 128) != qdp->rate) { + qdp->rate = qsp->rate * 128; + bpf_printk("Updating rate: %d (1sec:%llu bits)\n", + (int)qdp->rate, + CREDIT_PER_NS(1000000000, qdp->rate) * 8); + } + + // Set flags (drop, congestion, cwr) + // Dropping => we are congested, so ignore congestion flag + if (credit < -DROP_THRESH || + (len > LARGE_PKT_THRESH && + credit < -LARGE_PKT_DROP_THRESH)) { + // Very congested, set drop flag + drop_flag = true; + } else if (credit < 0) { + // Congested, set congestion flag + if (pkti.ecn) { + if (credit < -MARK_THRESH) + congestion_flag = true; + else + congestion_flag = false; + } else { + congestion_flag = true; + } + } + + if (congestion_flag) { + if (!bpf_skb_ecn_set_ce(skb)) { + if (len > LARGE_PKT_THRESH) { + // Problem if too many small packets? + drop_flag = true; + } + } + } + + if (drop_flag) + rv = DROP_PKT; + + hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag); + + if (rv == DROP_PKT) + __sync_add_and_fetch(&(qdp->credit), len); + + return rv; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/load_sock_ops.c b/samples/bpf/load_sock_ops.c deleted file mode 100644 index 8ecb41ea0c03..000000000000 --- a/samples/bpf/load_sock_ops.c +++ /dev/null @@ -1,97 +0,0 @@ -/* Copyright (c) 2017 Facebook - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - */ -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <linux/bpf.h> -#include <bpf/bpf.h> -#include "bpf_load.h" -#include <unistd.h> -#include <errno.h> -#include <fcntl.h> -#include <linux/unistd.h> - -static void usage(char *pname) -{ - printf("USAGE:\n %s [-l] <cg-path> <prog filename>\n", pname); - printf("\tLoad and attach a sock_ops program to the specified " - "cgroup\n"); - printf("\tIf \"-l\" is used, the program will continue to run\n"); - printf("\tprinting the BPF log buffer\n"); - printf("\tIf the specified filename does not end in \".o\", it\n"); - printf("\tappends \"_kern.o\" to the name\n"); - printf("\n"); - printf(" %s -r <cg-path>\n", pname); - printf("\tDetaches the currently attached sock_ops program\n"); - printf("\tfrom the specified cgroup\n"); - printf("\n"); - exit(1); -} - -int main(int argc, char **argv) -{ - int logFlag = 0; - int error = 0; - char *cg_path; - char fn[500]; - char *prog; - int cg_fd; - - if (argc < 3) - usage(argv[0]); - - if (!strcmp(argv[1], "-r")) { - cg_path = argv[2]; - cg_fd = open(cg_path, O_DIRECTORY, O_RDONLY); - error = bpf_prog_detach(cg_fd, BPF_CGROUP_SOCK_OPS); - if (error) { - printf("ERROR: bpf_prog_detach: %d (%s)\n", - error, strerror(errno)); - return 2; - } - return 0; - } else if (!strcmp(argv[1], "-h")) { - usage(argv[0]); - } else if (!strcmp(argv[1], "-l")) { - logFlag = 1; - if (argc < 4) - usage(argv[0]); - } - - prog = argv[argc - 1]; - cg_path = argv[argc - 2]; - if (strlen(prog) > 480) { - fprintf(stderr, "ERROR: program name too long (> 480 chars)\n"); - return 3; - } - cg_fd = open(cg_path, O_DIRECTORY, O_RDONLY); - - if (!strcmp(prog + strlen(prog)-2, ".o")) - strcpy(fn, prog); - else - sprintf(fn, "%s_kern.o", prog); - if (logFlag) - printf("loading bpf file:%s\n", fn); - if (load_bpf_file(fn)) { - printf("ERROR: load_bpf_file failed for: %s\n", fn); - printf("%s", bpf_log_buf); - return 4; - } - if (logFlag) - printf("TCP BPF Loaded %s\n", fn); - - error = bpf_prog_attach(prog_fd[0], cg_fd, BPF_CGROUP_SOCK_OPS, 0); - if (error) { - printf("ERROR: bpf_prog_attach: %d (%s)\n", - error, strerror(errno)); - return 5; - } else if (logFlag) { - read_trace_pipe(); - } - - return error; -} diff --git a/samples/bpf/sock_example.c b/samples/bpf/sock_example.c index 60ec467c78ab..00aae1d33fca 100644 --- a/samples/bpf/sock_example.c +++ b/samples/bpf/sock_example.c @@ -99,7 +99,7 @@ int main(void) { FILE *f; - f = popen("ping -c5 localhost", "r"); + f = popen("ping -4 -c5 localhost", "r"); (void)f; return test_sock(); diff --git a/samples/bpf/sockex1_user.c b/samples/bpf/sockex1_user.c index 93ec01c56104..7f90796ae15a 100644 --- a/samples/bpf/sockex1_user.c +++ b/samples/bpf/sockex1_user.c @@ -3,30 +3,33 @@ #include <assert.h> #include <linux/bpf.h> #include <bpf/bpf.h> -#include "bpf_load.h" +#include "bpf/libbpf.h" #include "sock_example.h" #include <unistd.h> #include <arpa/inet.h> int main(int ac, char **argv) { + struct bpf_object *obj; + int map_fd, prog_fd; char filename[256]; - FILE *f; int i, sock; + FILE *f; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); + if (bpf_prog_load(filename, BPF_PROG_TYPE_SOCKET_FILTER, + &obj, &prog_fd)) return 1; - } + + map_fd = bpf_object__find_map_fd_by_name(obj, "my_map"); sock = open_raw_sock("lo"); - assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, prog_fd, - sizeof(prog_fd[0])) == 0); + assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, + sizeof(prog_fd)) == 0); - f = popen("ping -c5 localhost", "r"); + f = popen("ping -4 -c5 localhost", "r"); (void) f; for (i = 0; i < 5; i++) { @@ -34,13 +37,13 @@ int main(int ac, char **argv) int key; key = IPPROTO_TCP; - assert(bpf_map_lookup_elem(map_fd[0], &key, &tcp_cnt) == 0); + assert(bpf_map_lookup_elem(map_fd, &key, &tcp_cnt) == 0); key = IPPROTO_UDP; - assert(bpf_map_lookup_elem(map_fd[0], &key, &udp_cnt) == 0); + assert(bpf_map_lookup_elem(map_fd, &key, &udp_cnt) == 0); key = IPPROTO_ICMP; - assert(bpf_map_lookup_elem(map_fd[0], &key, &icmp_cnt) == 0); + assert(bpf_map_lookup_elem(map_fd, &key, &icmp_cnt) == 0); printf("TCP %lld UDP %lld ICMP %lld bytes\n", tcp_cnt, udp_cnt, icmp_cnt); diff --git a/samples/bpf/sockex2_user.c b/samples/bpf/sockex2_user.c index 1d5c6e9a6d27..bc257333ad92 100644 --- a/samples/bpf/sockex2_user.c +++ b/samples/bpf/sockex2_user.c @@ -3,7 +3,7 @@ #include <assert.h> #include <linux/bpf.h> #include <bpf/bpf.h> -#include "bpf_load.h" +#include "bpf/libbpf.h" #include "sock_example.h" #include <unistd.h> #include <arpa/inet.h> @@ -17,32 +17,35 @@ struct pair { int main(int ac, char **argv) { struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + struct bpf_object *obj; + int map_fd, prog_fd; char filename[256]; - FILE *f; int i, sock; + FILE *f; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); setrlimit(RLIMIT_MEMLOCK, &r); - if (load_bpf_file(filename)) { - printf("%s", bpf_log_buf); + if (bpf_prog_load(filename, BPF_PROG_TYPE_SOCKET_FILTER, + &obj, &prog_fd)) return 1; - } + + map_fd = bpf_object__find_map_fd_by_name(obj, "hash_map"); sock = open_raw_sock("lo"); - assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, prog_fd, - sizeof(prog_fd[0])) == 0); + assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, + sizeof(prog_fd)) == 0); - f = popen("ping -c5 localhost", "r"); + f = popen("ping -4 -c5 localhost", "r"); (void) f; for (i = 0; i < 5; i++) { int key = 0, next_key; struct pair value; - while (bpf_map_get_next_key(map_fd[0], &key, &next_key) == 0) { - bpf_map_lookup_elem(map_fd[0], &next_key, &value); + while (bpf_map_get_next_key(map_fd, &key, &next_key) == 0) { + bpf_map_lookup_elem(map_fd, &next_key, &value); printf("ip %s bytes %lld packets %lld\n", inet_ntoa((struct in_addr){htonl(next_key)}), value.bytes, value.packets); diff --git a/samples/bpf/sockex3_user.c b/samples/bpf/sockex3_user.c index 9d02e0404719..bbb1cd0666a9 100644 --- a/samples/bpf/sockex3_user.c +++ b/samples/bpf/sockex3_user.c @@ -58,7 +58,7 @@ int main(int argc, char **argv) sizeof(__u32)) == 0); if (argc > 1) - f = popen("ping -c5 localhost", "r"); + f = popen("ping -4 -c5 localhost", "r"); else f = popen("netperf -l 4 localhost", "r"); (void) f; diff --git a/samples/bpf/task_fd_query_kern.c b/samples/bpf/task_fd_query_kern.c index f4b0a9ea674d..fb56fc2a3e5d 100644 --- a/samples/bpf/task_fd_query_kern.c +++ b/samples/bpf/task_fd_query_kern.c @@ -4,7 +4,7 @@ #include <uapi/linux/bpf.h> #include "bpf_helpers.h" -SEC("kprobe/blk_start_request") +SEC("kprobe/blk_mq_start_request") int bpf_prog1(struct pt_regs *ctx) { return 0; diff --git a/samples/bpf/task_fd_query_user.c b/samples/bpf/task_fd_query_user.c index 8381d792f138..aff2b4ae914e 100644 --- a/samples/bpf/task_fd_query_user.c +++ b/samples/bpf/task_fd_query_user.c @@ -311,7 +311,7 @@ int main(int argc, char **argv) } /* test two functions in the corresponding *_kern.c file */ - CHECK_AND_RET(test_debug_fs_kprobe(0, "blk_start_request", + CHECK_AND_RET(test_debug_fs_kprobe(0, "blk_mq_start_request", BPF_FD_TYPE_KPROBE)); CHECK_AND_RET(test_debug_fs_kprobe(1, "blk_account_io_completion", BPF_FD_TYPE_KRETPROBE)); diff --git a/samples/bpf/tcp_basertt_kern.c b/samples/bpf/tcp_basertt_kern.c index 4bf4fc597db9..6ef1625e8b2c 100644 --- a/samples/bpf/tcp_basertt_kern.c +++ b/samples/bpf/tcp_basertt_kern.c @@ -7,7 +7,7 @@ * BPF program to set base_rtt to 80us when host is running TCP-NV and * both hosts are in the same datacenter (as determined by IPv6 prefix). * - * Use load_sock_ops to load this BPF program. + * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. */ #include <uapi/linux/bpf.h> diff --git a/samples/bpf/tcp_bpf.readme b/samples/bpf/tcp_bpf.readme index 831fb601e3c9..fee746621aec 100644 --- a/samples/bpf/tcp_bpf.readme +++ b/samples/bpf/tcp_bpf.readme @@ -8,14 +8,16 @@ a cgroupv2 and attach a bash shell to the group. bash echo $$ >> /tmp/cgroupv2/foo/cgroup.procs -Anything that runs under this shell belongs to the foo cgroupv2 To load +Anything that runs under this shell belongs to the foo cgroupv2. To load (attach) one of the tcp_*_kern.o programs: - ./load_sock_ops -l /tmp/cgroupv2/foo tcp_basertt_kern.o + bpftool prog load tcp_basertt_kern.o /sys/fs/bpf/tcp_prog + bpftool cgroup attach /tmp/cgroupv2/foo sock_ops pinned /sys/fs/bpf/tcp_prog + bpftool prog tracelog -If the "-l" flag is used, the load_sock_ops program will continue to run -printing the BPF log buffer. The tcp_*_kern.o programs use special print -functions to print logging information (if enabled by the ifdef). +"bpftool prog tracelog" will continue to run printing the BPF log buffer. +The tcp_*_kern.o programs use special print functions to print logging +information (if enabled by the ifdef). If using netperf/netserver to create traffic, you need to run them under the cgroupv2 to which the BPF programs are attached (i.e. under bash shell @@ -23,4 +25,4 @@ attached to the cgroupv2). To remove (unattach) a socket_ops BPF program from a cgroupv2: - ./load_sock_ops -r /tmp/cgroupv2/foo + bpftool cgroup attach /tmp/cgroupv2/foo sock_ops pinned /sys/fs/bpf/tcp_prog diff --git a/samples/bpf/tcp_bufs_kern.c b/samples/bpf/tcp_bufs_kern.c index 0566b7fa38a1..e03e204739fa 100644 --- a/samples/bpf/tcp_bufs_kern.c +++ b/samples/bpf/tcp_bufs_kern.c @@ -9,7 +9,7 @@ * doing appropriate checks that indicate the hosts are far enough * away (i.e. large RTT). * - * Use load_sock_ops to load this BPF program. + * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. */ #include <uapi/linux/bpf.h> diff --git a/samples/bpf/tcp_clamp_kern.c b/samples/bpf/tcp_clamp_kern.c index f4225c9d2c0c..a0dc2d254aca 100644 --- a/samples/bpf/tcp_clamp_kern.c +++ b/samples/bpf/tcp_clamp_kern.c @@ -9,7 +9,7 @@ * the same datacenter. For his example, we assume they are within the same * datacenter when the first 5.5 bytes of their IPv6 addresses are the same. * - * Use load_sock_ops to load this BPF program. + * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. */ #include <uapi/linux/bpf.h> diff --git a/samples/bpf/tcp_cong_kern.c b/samples/bpf/tcp_cong_kern.c index ad0f1ba8206a..4fd3ca979a06 100644 --- a/samples/bpf/tcp_cong_kern.c +++ b/samples/bpf/tcp_cong_kern.c @@ -7,7 +7,7 @@ * BPF program to set congestion control to dctcp when both hosts are * in the same datacenter (as deteremined by IPv6 prefix). * - * Use load_sock_ops to load this BPF program. + * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. */ #include <uapi/linux/bpf.h> diff --git a/samples/bpf/tcp_iw_kern.c b/samples/bpf/tcp_iw_kern.c index 4ca5ecc9f580..9b139ec69560 100644 --- a/samples/bpf/tcp_iw_kern.c +++ b/samples/bpf/tcp_iw_kern.c @@ -9,7 +9,7 @@ * would usually be done after doing appropriate checks that indicate * the hosts are far enough away (i.e. large RTT). * - * Use load_sock_ops to load this BPF program. + * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. */ #include <uapi/linux/bpf.h> diff --git a/samples/bpf/tcp_rwnd_kern.c b/samples/bpf/tcp_rwnd_kern.c index 09ff65b40b31..cc71ee96e044 100644 --- a/samples/bpf/tcp_rwnd_kern.c +++ b/samples/bpf/tcp_rwnd_kern.c @@ -8,7 +8,7 @@ * and the first 5.5 bytes of the IPv6 addresses are not the same (in this * example that means both hosts are not the same datacenter). * - * Use load_sock_ops to load this BPF program. + * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. */ #include <uapi/linux/bpf.h> diff --git a/samples/bpf/tcp_synrto_kern.c b/samples/bpf/tcp_synrto_kern.c index 232bb242823e..ca87ed34f896 100644 --- a/samples/bpf/tcp_synrto_kern.c +++ b/samples/bpf/tcp_synrto_kern.c @@ -8,7 +8,7 @@ * and the first 5.5 bytes of the IPv6 addresses are the same (in this example * that means both hosts are in the same datacenter). * - * Use load_sock_ops to load this BPF program. + * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. */ #include <uapi/linux/bpf.h> diff --git a/samples/bpf/tcp_tos_reflect_kern.c b/samples/bpf/tcp_tos_reflect_kern.c index d51dab19eca6..de788be6f862 100644 --- a/samples/bpf/tcp_tos_reflect_kern.c +++ b/samples/bpf/tcp_tos_reflect_kern.c @@ -4,7 +4,7 @@ * * BPF program to automatically reflect TOS option from received syn packet * - * Use load_sock_ops to load this BPF program. + * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program. */ #include <uapi/linux/bpf.h> diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c index 1a81e6a5c2ea..c9544a4ce61a 100644 --- a/samples/bpf/tracex2_user.c +++ b/samples/bpf/tracex2_user.c @@ -131,7 +131,7 @@ int main(int ac, char **argv) signal(SIGTERM, int_exit); /* start 'ping' in the background to have some kfree_skb events */ - f = popen("ping -c5 localhost", "r"); + f = popen("ping -4 -c5 localhost", "r"); (void) f; /* start 'dd' in the background to have plenty of 'write' syscalls */ diff --git a/samples/bpf/tracex3_kern.c b/samples/bpf/tracex3_kern.c index 9974c3d7c18b..ea1d4c19c132 100644 --- a/samples/bpf/tracex3_kern.c +++ b/samples/bpf/tracex3_kern.c @@ -20,7 +20,7 @@ struct bpf_map_def SEC("maps") my_map = { /* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe * example will no longer be meaningful */ -SEC("kprobe/blk_start_request") +SEC("kprobe/blk_mq_start_request") int bpf_prog1(struct pt_regs *ctx) { long rq = PT_REGS_PARM1(ctx); diff --git a/samples/bpf/xdp_redirect_map_user.c b/samples/bpf/xdp_redirect_map_user.c index 327226be5a06..1dbe7fd3a1a8 100644 --- a/samples/bpf/xdp_redirect_map_user.c +++ b/samples/bpf/xdp_redirect_map_user.c @@ -57,7 +57,7 @@ static void int_exit(int sig) printf("bpf_get_link_xdp_id failed\n"); exit(1); } - if (prog_id == curr_prog_id) + if (dummy_prog_id == curr_prog_id) bpf_set_link_xdp_fd(ifindex_out, -1, xdp_flags); else if (!curr_prog_id) printf("couldn't find a prog id on iface OUT\n"); diff --git a/samples/bpf/xdp_redirect_user.c b/samples/bpf/xdp_redirect_user.c index a5d8ad3129ed..e9054c0269ff 100644 --- a/samples/bpf/xdp_redirect_user.c +++ b/samples/bpf/xdp_redirect_user.c @@ -57,7 +57,7 @@ static void int_exit(int sig) printf("bpf_get_link_xdp_id failed\n"); exit(1); } - if (prog_id == curr_prog_id) + if (dummy_prog_id == curr_prog_id) bpf_set_link_xdp_fd(ifindex_out, -1, xdp_flags); else if (!curr_prog_id) printf("couldn't find a prog id on iface OUT\n"); diff --git a/samples/bpf/xdpsock.h b/samples/bpf/xdpsock.h deleted file mode 100644 index 533ab81adfa1..000000000000 --- a/samples/bpf/xdpsock.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef XDPSOCK_H_ -#define XDPSOCK_H_ - -/* Power-of-2 number of sockets */ -#define MAX_SOCKS 4 - -/* Round-robin receive */ -#define RR_LB 0 - -#endif /* XDPSOCK_H_ */ diff --git a/samples/bpf/xdpsock_kern.c b/samples/bpf/xdpsock_kern.c deleted file mode 100644 index b8ccd0802b3f..000000000000 --- a/samples/bpf/xdpsock_kern.c +++ /dev/null @@ -1,56 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#define KBUILD_MODNAME "foo" -#include <uapi/linux/bpf.h> -#include "bpf_helpers.h" - -#include "xdpsock.h" - -struct bpf_map_def SEC("maps") qidconf_map = { - .type = BPF_MAP_TYPE_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(int), - .max_entries = 1, -}; - -struct bpf_map_def SEC("maps") xsks_map = { - .type = BPF_MAP_TYPE_XSKMAP, - .key_size = sizeof(int), - .value_size = sizeof(int), - .max_entries = MAX_SOCKS, -}; - -struct bpf_map_def SEC("maps") rr_map = { - .type = BPF_MAP_TYPE_PERCPU_ARRAY, - .key_size = sizeof(int), - .value_size = sizeof(unsigned int), - .max_entries = 1, -}; - -SEC("xdp_sock") -int xdp_sock_prog(struct xdp_md *ctx) -{ - int *qidconf, key = 0, idx; - unsigned int *rr; - - qidconf = bpf_map_lookup_elem(&qidconf_map, &key); - if (!qidconf) - return XDP_ABORTED; - - if (*qidconf != ctx->rx_queue_index) - return XDP_PASS; - -#if RR_LB /* NB! RR_LB is configured in xdpsock.h */ - rr = bpf_map_lookup_elem(&rr_map, &key); - if (!rr) - return XDP_ABORTED; - - *rr = (*rr + 1) & (MAX_SOCKS - 1); - idx = *rr; -#else - idx = 0; -#endif - - return bpf_redirect_map(&xsks_map, idx, 0); -} - -char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index f73055e0191f..d08ee1ab7bb4 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -1,37 +1,36 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2017 - 2018 Intel Corporation. */ -#include <assert.h> +#include <asm/barrier.h> #include <errno.h> #include <getopt.h> #include <libgen.h> #include <linux/bpf.h> +#include <linux/compiler.h> #include <linux/if_link.h> #include <linux/if_xdp.h> #include <linux/if_ether.h> +#include <locale.h> +#include <net/ethernet.h> #include <net/if.h> +#include <poll.h> +#include <pthread.h> #include <signal.h> #include <stdbool.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <net/ethernet.h> +#include <sys/mman.h> #include <sys/resource.h> #include <sys/socket.h> -#include <sys/mman.h> +#include <sys/types.h> #include <time.h> #include <unistd.h> -#include <pthread.h> -#include <locale.h> -#include <sys/types.h> -#include <poll.h> #include "bpf/libbpf.h" -#include "bpf_util.h" +#include "bpf/xsk.h" #include <bpf/bpf.h> -#include "xdpsock.h" - #ifndef SOL_XDP #define SOL_XDP 283 #endif @@ -44,17 +43,11 @@ #define PF_XDP AF_XDP #endif -#define NUM_FRAMES 131072 -#define FRAME_HEADROOM 0 -#define FRAME_SHIFT 11 -#define FRAME_SIZE 2048 -#define NUM_DESCS 1024 -#define BATCH_SIZE 16 - -#define FQ_NUM_DESCS 1024 -#define CQ_NUM_DESCS 1024 +#define NUM_FRAMES (4 * 1024) +#define BATCH_SIZE 64 #define DEBUG_HEXDUMP 0 +#define MAX_SOCKS 8 typedef __u64 u64; typedef __u32 u32; @@ -73,54 +66,31 @@ static const char *opt_if = ""; static int opt_ifindex; static int opt_queue; static int opt_poll; -static int opt_shared_packet_buffer; static int opt_interval = 1; static u32 opt_xdp_bind_flags; static __u32 prog_id; -struct xdp_umem_uqueue { - u32 cached_prod; - u32 cached_cons; - u32 mask; - u32 size; - u32 *producer; - u32 *consumer; - u64 *ring; - void *map; +struct xsk_umem_info { + struct xsk_ring_prod fq; + struct xsk_ring_cons cq; + struct xsk_umem *umem; + void *buffer; }; -struct xdp_umem { - char *frames; - struct xdp_umem_uqueue fq; - struct xdp_umem_uqueue cq; - int fd; -}; - -struct xdp_uqueue { - u32 cached_prod; - u32 cached_cons; - u32 mask; - u32 size; - u32 *producer; - u32 *consumer; - struct xdp_desc *ring; - void *map; -}; - -struct xdpsock { - struct xdp_uqueue rx; - struct xdp_uqueue tx; - int sfd; - struct xdp_umem *umem; - u32 outstanding_tx; +struct xsk_socket_info { + struct xsk_ring_cons rx; + struct xsk_ring_prod tx; + struct xsk_umem_info *umem; + struct xsk_socket *xsk; unsigned long rx_npkts; unsigned long tx_npkts; unsigned long prev_rx_npkts; unsigned long prev_tx_npkts; + u32 outstanding_tx; }; static int num_socks; -struct xdpsock *xsks[MAX_SOCKS]; +struct xsk_socket_info *xsks[MAX_SOCKS]; static unsigned long get_nsecs(void) { @@ -130,225 +100,124 @@ static unsigned long get_nsecs(void) return ts.tv_sec * 1000000000UL + ts.tv_nsec; } -static void dump_stats(void); - -#define lassert(expr) \ - do { \ - if (!(expr)) { \ - fprintf(stderr, "%s:%s:%i: Assertion failed: " \ - #expr ": errno: %d/\"%s\"\n", \ - __FILE__, __func__, __LINE__, \ - errno, strerror(errno)); \ - dump_stats(); \ - exit(EXIT_FAILURE); \ - } \ - } while (0) - -#define barrier() __asm__ __volatile__("": : :"memory") -#ifdef __aarch64__ -#define u_smp_rmb() __asm__ __volatile__("dmb ishld": : :"memory") -#define u_smp_wmb() __asm__ __volatile__("dmb ishst": : :"memory") -#else -#define u_smp_rmb() barrier() -#define u_smp_wmb() barrier() -#endif -#define likely(x) __builtin_expect(!!(x), 1) -#define unlikely(x) __builtin_expect(!!(x), 0) - -static const char pkt_data[] = - "\x3c\xfd\xfe\x9e\x7f\x71\xec\xb1\xd7\x98\x3a\xc0\x08\x00\x45\x00" - "\x00\x2e\x00\x00\x00\x00\x40\x11\x88\x97\x05\x08\x07\x08\xc8\x14" - "\x1e\x04\x10\x92\x10\x92\x00\x1a\x6d\xa3\x34\x33\x1f\x69\x40\x6b" - "\x54\x59\xb6\x14\x2d\x11\x44\xbf\xaf\xd9\xbe\xaa"; - -static inline u32 umem_nb_free(struct xdp_umem_uqueue *q, u32 nb) -{ - u32 free_entries = q->cached_cons - q->cached_prod; - - if (free_entries >= nb) - return free_entries; - - /* Refresh the local tail pointer */ - q->cached_cons = *q->consumer + q->size; - - return q->cached_cons - q->cached_prod; -} - -static inline u32 xq_nb_free(struct xdp_uqueue *q, u32 ndescs) +static void print_benchmark(bool running) { - u32 free_entries = q->cached_cons - q->cached_prod; + const char *bench_str = "INVALID"; - if (free_entries >= ndescs) - return free_entries; + if (opt_bench == BENCH_RXDROP) + bench_str = "rxdrop"; + else if (opt_bench == BENCH_TXONLY) + bench_str = "txonly"; + else if (opt_bench == BENCH_L2FWD) + bench_str = "l2fwd"; - /* Refresh the local tail pointer */ - q->cached_cons = *q->consumer + q->size; - return q->cached_cons - q->cached_prod; -} + printf("%s:%d %s ", opt_if, opt_queue, bench_str); + if (opt_xdp_flags & XDP_FLAGS_SKB_MODE) + printf("xdp-skb "); + else if (opt_xdp_flags & XDP_FLAGS_DRV_MODE) + printf("xdp-drv "); + else + printf(" "); -static inline u32 umem_nb_avail(struct xdp_umem_uqueue *q, u32 nb) -{ - u32 entries = q->cached_prod - q->cached_cons; + if (opt_poll) + printf("poll() "); - if (entries == 0) { - q->cached_prod = *q->producer; - entries = q->cached_prod - q->cached_cons; + if (running) { + printf("running..."); + fflush(stdout); } - - return (entries > nb) ? nb : entries; } -static inline u32 xq_nb_avail(struct xdp_uqueue *q, u32 ndescs) +static void dump_stats(void) { - u32 entries = q->cached_prod - q->cached_cons; + unsigned long now = get_nsecs(); + long dt = now - prev_time; + int i; - if (entries == 0) { - q->cached_prod = *q->producer; - entries = q->cached_prod - q->cached_cons; - } + prev_time = now; - return (entries > ndescs) ? ndescs : entries; -} + for (i = 0; i < num_socks && xsks[i]; i++) { + char *fmt = "%-15s %'-11.0f %'-11lu\n"; + double rx_pps, tx_pps; -static inline int umem_fill_to_kernel_ex(struct xdp_umem_uqueue *fq, - struct xdp_desc *d, - size_t nb) -{ - u32 i; + rx_pps = (xsks[i]->rx_npkts - xsks[i]->prev_rx_npkts) * + 1000000000. / dt; + tx_pps = (xsks[i]->tx_npkts - xsks[i]->prev_tx_npkts) * + 1000000000. / dt; - if (umem_nb_free(fq, nb) < nb) - return -ENOSPC; + printf("\n sock%d@", i); + print_benchmark(false); + printf("\n"); - for (i = 0; i < nb; i++) { - u32 idx = fq->cached_prod++ & fq->mask; + printf("%-15s %-11s %-11s %-11.2f\n", "", "pps", "pkts", + dt / 1000000000.); + printf(fmt, "rx", rx_pps, xsks[i]->rx_npkts); + printf(fmt, "tx", tx_pps, xsks[i]->tx_npkts); - fq->ring[idx] = d[i].addr; + xsks[i]->prev_rx_npkts = xsks[i]->rx_npkts; + xsks[i]->prev_tx_npkts = xsks[i]->tx_npkts; } - - u_smp_wmb(); - - *fq->producer = fq->cached_prod; - - return 0; } -static inline int umem_fill_to_kernel(struct xdp_umem_uqueue *fq, u64 *d, - size_t nb) +static void *poller(void *arg) { - u32 i; - - if (umem_nb_free(fq, nb) < nb) - return -ENOSPC; - - for (i = 0; i < nb; i++) { - u32 idx = fq->cached_prod++ & fq->mask; - - fq->ring[idx] = d[i]; + (void)arg; + for (;;) { + sleep(opt_interval); + dump_stats(); } - u_smp_wmb(); - - *fq->producer = fq->cached_prod; - - return 0; + return NULL; } -static inline size_t umem_complete_from_kernel(struct xdp_umem_uqueue *cq, - u64 *d, size_t nb) +static void remove_xdp_program(void) { - u32 idx, i, entries = umem_nb_avail(cq, nb); - - u_smp_rmb(); - - for (i = 0; i < entries; i++) { - idx = cq->cached_cons++ & cq->mask; - d[i] = cq->ring[idx]; - } - - if (entries > 0) { - u_smp_wmb(); + __u32 curr_prog_id = 0; - *cq->consumer = cq->cached_cons; + if (bpf_get_link_xdp_id(opt_ifindex, &curr_prog_id, opt_xdp_flags)) { + printf("bpf_get_link_xdp_id failed\n"); + exit(EXIT_FAILURE); } - - return entries; -} - -static inline void *xq_get_data(struct xdpsock *xsk, u64 addr) -{ - return &xsk->umem->frames[addr]; + if (prog_id == curr_prog_id) + bpf_set_link_xdp_fd(opt_ifindex, -1, opt_xdp_flags); + else if (!curr_prog_id) + printf("couldn't find a prog id on a given interface\n"); + else + printf("program on interface changed, not removing\n"); } -static inline int xq_enq(struct xdp_uqueue *uq, - const struct xdp_desc *descs, - unsigned int ndescs) +static void int_exit(int sig) { - struct xdp_desc *r = uq->ring; - unsigned int i; + struct xsk_umem *umem = xsks[0]->umem->umem; - if (xq_nb_free(uq, ndescs) < ndescs) - return -ENOSPC; - - for (i = 0; i < ndescs; i++) { - u32 idx = uq->cached_prod++ & uq->mask; - - r[idx].addr = descs[i].addr; - r[idx].len = descs[i].len; - } + (void)sig; - u_smp_wmb(); + dump_stats(); + xsk_socket__delete(xsks[0]->xsk); + (void)xsk_umem__delete(umem); + remove_xdp_program(); - *uq->producer = uq->cached_prod; - return 0; + exit(EXIT_SUCCESS); } -static inline int xq_enq_tx_only(struct xdp_uqueue *uq, - unsigned int id, unsigned int ndescs) +static void __exit_with_error(int error, const char *file, const char *func, + int line) { - struct xdp_desc *r = uq->ring; - unsigned int i; - - if (xq_nb_free(uq, ndescs) < ndescs) - return -ENOSPC; - - for (i = 0; i < ndescs; i++) { - u32 idx = uq->cached_prod++ & uq->mask; - - r[idx].addr = (id + i) << FRAME_SHIFT; - r[idx].len = sizeof(pkt_data) - 1; - } - - u_smp_wmb(); - - *uq->producer = uq->cached_prod; - return 0; + fprintf(stderr, "%s:%s:%i: errno: %d/\"%s\"\n", file, func, + line, error, strerror(error)); + dump_stats(); + remove_xdp_program(); + exit(EXIT_FAILURE); } -static inline int xq_deq(struct xdp_uqueue *uq, - struct xdp_desc *descs, - int ndescs) -{ - struct xdp_desc *r = uq->ring; - unsigned int idx; - int i, entries; - - entries = xq_nb_avail(uq, ndescs); - - u_smp_rmb(); - - for (i = 0; i < entries; i++) { - idx = uq->cached_cons++ & uq->mask; - descs[i] = r[idx]; - } - - if (entries > 0) { - u_smp_wmb(); +#define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, \ + __LINE__) - *uq->consumer = uq->cached_cons; - } - - return entries; -} +static const char pkt_data[] = + "\x3c\xfd\xfe\x9e\x7f\x71\xec\xb1\xd7\x98\x3a\xc0\x08\x00\x45\x00" + "\x00\x2e\x00\x00\x00\x00\x40\x11\x88\x97\x05\x08\x07\x08\xc8\x14" + "\x1e\x04\x10\x92\x10\x92\x00\x1a\x6d\xa3\x34\x33\x1f\x69\x40\x6b" + "\x54\x59\xb6\x14\x2d\x11\x44\xbf\xaf\xd9\xbe\xaa"; static void swap_mac_addresses(void *data) { @@ -397,258 +266,74 @@ static void hex_dump(void *pkt, size_t length, u64 addr) printf("\n"); } -static size_t gen_eth_frame(char *frame) +static size_t gen_eth_frame(struct xsk_umem_info *umem, u64 addr) { - memcpy(frame, pkt_data, sizeof(pkt_data) - 1); + memcpy(xsk_umem__get_data(umem->buffer, addr), pkt_data, + sizeof(pkt_data) - 1); return sizeof(pkt_data) - 1; } -static struct xdp_umem *xdp_umem_configure(int sfd) +static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size) { - int fq_size = FQ_NUM_DESCS, cq_size = CQ_NUM_DESCS; - struct xdp_mmap_offsets off; - struct xdp_umem_reg mr; - struct xdp_umem *umem; - socklen_t optlen; - void *bufs; + struct xsk_umem_info *umem; + int ret; umem = calloc(1, sizeof(*umem)); - lassert(umem); - - lassert(posix_memalign(&bufs, getpagesize(), /* PAGE_SIZE aligned */ - NUM_FRAMES * FRAME_SIZE) == 0); - - mr.addr = (__u64)bufs; - mr.len = NUM_FRAMES * FRAME_SIZE; - mr.chunk_size = FRAME_SIZE; - mr.headroom = FRAME_HEADROOM; - - lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr)) == 0); - lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_FILL_RING, &fq_size, - sizeof(int)) == 0); - lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &cq_size, - sizeof(int)) == 0); - - optlen = sizeof(off); - lassert(getsockopt(sfd, SOL_XDP, XDP_MMAP_OFFSETS, &off, - &optlen) == 0); - - umem->fq.map = mmap(0, off.fr.desc + - FQ_NUM_DESCS * sizeof(u64), - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_POPULATE, sfd, - XDP_UMEM_PGOFF_FILL_RING); - lassert(umem->fq.map != MAP_FAILED); - - umem->fq.mask = FQ_NUM_DESCS - 1; - umem->fq.size = FQ_NUM_DESCS; - umem->fq.producer = umem->fq.map + off.fr.producer; - umem->fq.consumer = umem->fq.map + off.fr.consumer; - umem->fq.ring = umem->fq.map + off.fr.desc; - umem->fq.cached_cons = FQ_NUM_DESCS; - - umem->cq.map = mmap(0, off.cr.desc + - CQ_NUM_DESCS * sizeof(u64), - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_POPULATE, sfd, - XDP_UMEM_PGOFF_COMPLETION_RING); - lassert(umem->cq.map != MAP_FAILED); - - umem->cq.mask = CQ_NUM_DESCS - 1; - umem->cq.size = CQ_NUM_DESCS; - umem->cq.producer = umem->cq.map + off.cr.producer; - umem->cq.consumer = umem->cq.map + off.cr.consumer; - umem->cq.ring = umem->cq.map + off.cr.desc; - - umem->frames = bufs; - umem->fd = sfd; + if (!umem) + exit_with_error(errno); - if (opt_bench == BENCH_TXONLY) { - int i; - - for (i = 0; i < NUM_FRAMES * FRAME_SIZE; i += FRAME_SIZE) - (void)gen_eth_frame(&umem->frames[i]); - } + ret = xsk_umem__create(&umem->umem, buffer, size, &umem->fq, &umem->cq, + NULL); + if (ret) + exit_with_error(-ret); + umem->buffer = buffer; return umem; } -static struct xdpsock *xsk_configure(struct xdp_umem *umem) +static struct xsk_socket_info *xsk_configure_socket(struct xsk_umem_info *umem) { - struct sockaddr_xdp sxdp = {}; - struct xdp_mmap_offsets off; - int sfd, ndescs = NUM_DESCS; - struct xdpsock *xsk; - bool shared = true; - socklen_t optlen; - u64 i; - - sfd = socket(PF_XDP, SOCK_RAW, 0); - lassert(sfd >= 0); + struct xsk_socket_config cfg; + struct xsk_socket_info *xsk; + int ret; + u32 idx; + int i; xsk = calloc(1, sizeof(*xsk)); - lassert(xsk); - - xsk->sfd = sfd; - xsk->outstanding_tx = 0; - - if (!umem) { - shared = false; - xsk->umem = xdp_umem_configure(sfd); - } else { - xsk->umem = umem; - } - - lassert(setsockopt(sfd, SOL_XDP, XDP_RX_RING, - &ndescs, sizeof(int)) == 0); - lassert(setsockopt(sfd, SOL_XDP, XDP_TX_RING, - &ndescs, sizeof(int)) == 0); - optlen = sizeof(off); - lassert(getsockopt(sfd, SOL_XDP, XDP_MMAP_OFFSETS, &off, - &optlen) == 0); - - /* Rx */ - xsk->rx.map = mmap(NULL, - off.rx.desc + - NUM_DESCS * sizeof(struct xdp_desc), - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_POPULATE, sfd, - XDP_PGOFF_RX_RING); - lassert(xsk->rx.map != MAP_FAILED); - - if (!shared) { - for (i = 0; i < NUM_DESCS * FRAME_SIZE; i += FRAME_SIZE) - lassert(umem_fill_to_kernel(&xsk->umem->fq, &i, 1) - == 0); - } - - /* Tx */ - xsk->tx.map = mmap(NULL, - off.tx.desc + - NUM_DESCS * sizeof(struct xdp_desc), - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_POPULATE, sfd, - XDP_PGOFF_TX_RING); - lassert(xsk->tx.map != MAP_FAILED); - - xsk->rx.mask = NUM_DESCS - 1; - xsk->rx.size = NUM_DESCS; - xsk->rx.producer = xsk->rx.map + off.rx.producer; - xsk->rx.consumer = xsk->rx.map + off.rx.consumer; - xsk->rx.ring = xsk->rx.map + off.rx.desc; - - xsk->tx.mask = NUM_DESCS - 1; - xsk->tx.size = NUM_DESCS; - xsk->tx.producer = xsk->tx.map + off.tx.producer; - xsk->tx.consumer = xsk->tx.map + off.tx.consumer; - xsk->tx.ring = xsk->tx.map + off.tx.desc; - xsk->tx.cached_cons = NUM_DESCS; - - sxdp.sxdp_family = PF_XDP; - sxdp.sxdp_ifindex = opt_ifindex; - sxdp.sxdp_queue_id = opt_queue; - - if (shared) { - sxdp.sxdp_flags = XDP_SHARED_UMEM; - sxdp.sxdp_shared_umem_fd = umem->fd; - } else { - sxdp.sxdp_flags = opt_xdp_bind_flags; - } - - lassert(bind(sfd, (struct sockaddr *)&sxdp, sizeof(sxdp)) == 0); + if (!xsk) + exit_with_error(errno); + + xsk->umem = umem; + cfg.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS; + cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS; + cfg.libbpf_flags = 0; + cfg.xdp_flags = opt_xdp_flags; + cfg.bind_flags = opt_xdp_bind_flags; + ret = xsk_socket__create(&xsk->xsk, opt_if, opt_queue, umem->umem, + &xsk->rx, &xsk->tx, &cfg); + if (ret) + exit_with_error(-ret); + + ret = bpf_get_link_xdp_id(opt_ifindex, &prog_id, opt_xdp_flags); + if (ret) + exit_with_error(-ret); + + ret = xsk_ring_prod__reserve(&xsk->umem->fq, + XSK_RING_PROD__DEFAULT_NUM_DESCS, + &idx); + if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS) + exit_with_error(-ret); + for (i = 0; + i < XSK_RING_PROD__DEFAULT_NUM_DESCS * + XSK_UMEM__DEFAULT_FRAME_SIZE; + i += XSK_UMEM__DEFAULT_FRAME_SIZE) + *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx++) = i; + xsk_ring_prod__submit(&xsk->umem->fq, + XSK_RING_PROD__DEFAULT_NUM_DESCS); return xsk; } -static void print_benchmark(bool running) -{ - const char *bench_str = "INVALID"; - - if (opt_bench == BENCH_RXDROP) - bench_str = "rxdrop"; - else if (opt_bench == BENCH_TXONLY) - bench_str = "txonly"; - else if (opt_bench == BENCH_L2FWD) - bench_str = "l2fwd"; - - printf("%s:%d %s ", opt_if, opt_queue, bench_str); - if (opt_xdp_flags & XDP_FLAGS_SKB_MODE) - printf("xdp-skb "); - else if (opt_xdp_flags & XDP_FLAGS_DRV_MODE) - printf("xdp-drv "); - else - printf(" "); - - if (opt_poll) - printf("poll() "); - - if (running) { - printf("running..."); - fflush(stdout); - } -} - -static void dump_stats(void) -{ - unsigned long now = get_nsecs(); - long dt = now - prev_time; - int i; - - prev_time = now; - - for (i = 0; i < num_socks && xsks[i]; i++) { - char *fmt = "%-15s %'-11.0f %'-11lu\n"; - double rx_pps, tx_pps; - - rx_pps = (xsks[i]->rx_npkts - xsks[i]->prev_rx_npkts) * - 1000000000. / dt; - tx_pps = (xsks[i]->tx_npkts - xsks[i]->prev_tx_npkts) * - 1000000000. / dt; - - printf("\n sock%d@", i); - print_benchmark(false); - printf("\n"); - - printf("%-15s %-11s %-11s %-11.2f\n", "", "pps", "pkts", - dt / 1000000000.); - printf(fmt, "rx", rx_pps, xsks[i]->rx_npkts); - printf(fmt, "tx", tx_pps, xsks[i]->tx_npkts); - - xsks[i]->prev_rx_npkts = xsks[i]->rx_npkts; - xsks[i]->prev_tx_npkts = xsks[i]->tx_npkts; - } -} - -static void *poller(void *arg) -{ - (void)arg; - for (;;) { - sleep(opt_interval); - dump_stats(); - } - - return NULL; -} - -static void int_exit(int sig) -{ - __u32 curr_prog_id = 0; - - (void)sig; - dump_stats(); - if (bpf_get_link_xdp_id(opt_ifindex, &curr_prog_id, opt_xdp_flags)) { - printf("bpf_get_link_xdp_id failed\n"); - exit(EXIT_FAILURE); - } - if (prog_id == curr_prog_id) - bpf_set_link_xdp_fd(opt_ifindex, -1, opt_xdp_flags); - else if (!curr_prog_id) - printf("couldn't find a prog id on a given interface\n"); - else - printf("program on interface changed, not removing\n"); - exit(EXIT_SUCCESS); -} - static struct option long_options[] = { {"rxdrop", no_argument, 0, 'r'}, {"txonly", no_argument, 0, 't'}, @@ -656,7 +341,6 @@ static struct option long_options[] = { {"interface", required_argument, 0, 'i'}, {"queue", required_argument, 0, 'q'}, {"poll", no_argument, 0, 'p'}, - {"shared-buffer", no_argument, 0, 's'}, {"xdp-skb", no_argument, 0, 'S'}, {"xdp-native", no_argument, 0, 'N'}, {"interval", required_argument, 0, 'n'}, @@ -676,7 +360,6 @@ static void usage(const char *prog) " -i, --interface=n Run on interface n\n" " -q, --queue=n Use queue n (default 0)\n" " -p, --poll Use poll syscall\n" - " -s, --shared-buffer Use shared packet buffer\n" " -S, --xdp-skb=n Use XDP skb-mod\n" " -N, --xdp-native=n Enfore XDP native mode\n" " -n, --interval=n Specify statistics update interval (default 1 sec).\n" @@ -715,9 +398,6 @@ static void parse_command_line(int argc, char **argv) case 'q': opt_queue = atoi(optarg); break; - case 's': - opt_shared_packet_buffer = 1; - break; case 'p': opt_poll = 1; break; @@ -751,75 +431,104 @@ static void parse_command_line(int argc, char **argv) opt_if); usage(basename(argv[0])); } + } -static void kick_tx(int fd) +static void kick_tx(struct xsk_socket_info *xsk) { int ret; - ret = sendto(fd, NULL, 0, MSG_DONTWAIT, NULL, 0); + ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); if (ret >= 0 || errno == ENOBUFS || errno == EAGAIN || errno == EBUSY) return; - lassert(0); + exit_with_error(errno); } -static inline void complete_tx_l2fwd(struct xdpsock *xsk) +static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk) { - u64 descs[BATCH_SIZE]; + u32 idx_cq = 0, idx_fq = 0; unsigned int rcvd; size_t ndescs; if (!xsk->outstanding_tx) return; - kick_tx(xsk->sfd); + kick_tx(xsk); ndescs = (xsk->outstanding_tx > BATCH_SIZE) ? BATCH_SIZE : - xsk->outstanding_tx; + xsk->outstanding_tx; /* re-add completed Tx buffers */ - rcvd = umem_complete_from_kernel(&xsk->umem->cq, descs, ndescs); + rcvd = xsk_ring_cons__peek(&xsk->umem->cq, ndescs, &idx_cq); if (rcvd > 0) { - umem_fill_to_kernel(&xsk->umem->fq, descs, rcvd); + unsigned int i; + int ret; + + ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); + while (ret != rcvd) { + if (ret < 0) + exit_with_error(-ret); + ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, + &idx_fq); + } + for (i = 0; i < rcvd; i++) + *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = + *xsk_ring_cons__comp_addr(&xsk->umem->cq, + idx_cq++); + + xsk_ring_prod__submit(&xsk->umem->fq, rcvd); + xsk_ring_cons__release(&xsk->umem->cq, rcvd); xsk->outstanding_tx -= rcvd; xsk->tx_npkts += rcvd; } } -static inline void complete_tx_only(struct xdpsock *xsk) +static inline void complete_tx_only(struct xsk_socket_info *xsk) { - u64 descs[BATCH_SIZE]; unsigned int rcvd; + u32 idx; if (!xsk->outstanding_tx) return; - kick_tx(xsk->sfd); + kick_tx(xsk); - rcvd = umem_complete_from_kernel(&xsk->umem->cq, descs, BATCH_SIZE); + rcvd = xsk_ring_cons__peek(&xsk->umem->cq, BATCH_SIZE, &idx); if (rcvd > 0) { + xsk_ring_cons__release(&xsk->umem->cq, rcvd); xsk->outstanding_tx -= rcvd; xsk->tx_npkts += rcvd; } } -static void rx_drop(struct xdpsock *xsk) +static void rx_drop(struct xsk_socket_info *xsk) { - struct xdp_desc descs[BATCH_SIZE]; unsigned int rcvd, i; + u32 idx_rx = 0, idx_fq = 0; + int ret; - rcvd = xq_deq(&xsk->rx, descs, BATCH_SIZE); + rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx); if (!rcvd) return; + ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); + while (ret != rcvd) { + if (ret < 0) + exit_with_error(-ret); + ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq); + } + for (i = 0; i < rcvd; i++) { - char *pkt = xq_get_data(xsk, descs[i].addr); + u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr; + u32 len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len; + char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr); - hex_dump(pkt, descs[i].len, descs[i].addr); + hex_dump(pkt, len, addr); + *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = addr; } + xsk_ring_prod__submit(&xsk->umem->fq, rcvd); + xsk_ring_cons__release(&xsk->rx, rcvd); xsk->rx_npkts += rcvd; - - umem_fill_to_kernel_ex(&xsk->umem->fq, descs, rcvd); } static void rx_drop_all(void) @@ -830,7 +539,7 @@ static void rx_drop_all(void) memset(fds, 0, sizeof(fds)); for (i = 0; i < num_socks; i++) { - fds[i].fd = xsks[i]->sfd; + fds[i].fd = xsk_socket__fd(xsks[i]->xsk); fds[i].events = POLLIN; timeout = 1000; /* 1sn */ } @@ -847,14 +556,14 @@ static void rx_drop_all(void) } } -static void tx_only(struct xdpsock *xsk) +static void tx_only(struct xsk_socket_info *xsk) { int timeout, ret, nfds = 1; struct pollfd fds[nfds + 1]; - unsigned int idx = 0; + u32 idx, frame_nb = 0; memset(fds, 0, sizeof(fds)); - fds[0].fd = xsk->sfd; + fds[0].fd = xsk_socket__fd(xsk->xsk); fds[0].events = POLLOUT; timeout = 1000; /* 1sn */ @@ -864,50 +573,73 @@ static void tx_only(struct xdpsock *xsk) if (ret <= 0) continue; - if (fds[0].fd != xsk->sfd || - !(fds[0].revents & POLLOUT)) + if (!(fds[0].revents & POLLOUT)) continue; } - if (xq_nb_free(&xsk->tx, BATCH_SIZE) >= BATCH_SIZE) { - lassert(xq_enq_tx_only(&xsk->tx, idx, BATCH_SIZE) == 0); + if (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) == + BATCH_SIZE) { + unsigned int i; + for (i = 0; i < BATCH_SIZE; i++) { + xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->addr + = (frame_nb + i) << + XSK_UMEM__DEFAULT_FRAME_SHIFT; + xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->len = + sizeof(pkt_data) - 1; + } + + xsk_ring_prod__submit(&xsk->tx, BATCH_SIZE); xsk->outstanding_tx += BATCH_SIZE; - idx += BATCH_SIZE; - idx %= NUM_FRAMES; + frame_nb += BATCH_SIZE; + frame_nb %= NUM_FRAMES; } complete_tx_only(xsk); } } -static void l2fwd(struct xdpsock *xsk) +static void l2fwd(struct xsk_socket_info *xsk) { for (;;) { - struct xdp_desc descs[BATCH_SIZE]; unsigned int rcvd, i; + u32 idx_rx = 0, idx_tx = 0; int ret; for (;;) { complete_tx_l2fwd(xsk); - rcvd = xq_deq(&xsk->rx, descs, BATCH_SIZE); + rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, + &idx_rx); if (rcvd > 0) break; } + ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx); + while (ret != rcvd) { + if (ret < 0) + exit_with_error(-ret); + ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx); + } + for (i = 0; i < rcvd; i++) { - char *pkt = xq_get_data(xsk, descs[i].addr); + u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, + idx_rx)->addr; + u32 len = xsk_ring_cons__rx_desc(&xsk->rx, + idx_rx++)->len; + char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr); swap_mac_addresses(pkt); - hex_dump(pkt, descs[i].len, descs[i].addr); + hex_dump(pkt, len, addr); + xsk_ring_prod__tx_desc(&xsk->tx, idx_tx)->addr = addr; + xsk_ring_prod__tx_desc(&xsk->tx, idx_tx++)->len = len; } - xsk->rx_npkts += rcvd; + xsk_ring_prod__submit(&xsk->tx, rcvd); + xsk_ring_cons__release(&xsk->rx, rcvd); - ret = xq_enq(&xsk->tx, descs, rcvd); - lassert(ret == 0); + xsk->rx_npkts += rcvd; xsk->outstanding_tx += rcvd; } } @@ -915,17 +647,10 @@ static void l2fwd(struct xdpsock *xsk) int main(int argc, char **argv) { struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; - struct bpf_prog_load_attr prog_load_attr = { - .prog_type = BPF_PROG_TYPE_XDP, - }; - int prog_fd, qidconf_map, xsks_map; - struct bpf_prog_info info = {}; - __u32 info_len = sizeof(info); - struct bpf_object *obj; - char xdp_filename[256]; - struct bpf_map *map; - int i, ret, key = 0; + struct xsk_umem_info *umem; pthread_t pt; + void *bufs; + int ret; parse_command_line(argc, argv); @@ -935,67 +660,22 @@ int main(int argc, char **argv) exit(EXIT_FAILURE); } - snprintf(xdp_filename, sizeof(xdp_filename), "%s_kern.o", argv[0]); - prog_load_attr.file = xdp_filename; - - if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd)) - exit(EXIT_FAILURE); - if (prog_fd < 0) { - fprintf(stderr, "ERROR: no program found: %s\n", - strerror(prog_fd)); - exit(EXIT_FAILURE); - } - - map = bpf_object__find_map_by_name(obj, "qidconf_map"); - qidconf_map = bpf_map__fd(map); - if (qidconf_map < 0) { - fprintf(stderr, "ERROR: no qidconf map found: %s\n", - strerror(qidconf_map)); - exit(EXIT_FAILURE); - } - - map = bpf_object__find_map_by_name(obj, "xsks_map"); - xsks_map = bpf_map__fd(map); - if (xsks_map < 0) { - fprintf(stderr, "ERROR: no xsks map found: %s\n", - strerror(xsks_map)); - exit(EXIT_FAILURE); - } - - if (bpf_set_link_xdp_fd(opt_ifindex, prog_fd, opt_xdp_flags) < 0) { - fprintf(stderr, "ERROR: link set xdp fd failed\n"); - exit(EXIT_FAILURE); - } - - ret = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len); - if (ret) { - printf("can't get prog info - %s\n", strerror(errno)); - return 1; - } - prog_id = info.id; + ret = posix_memalign(&bufs, getpagesize(), /* PAGE_SIZE aligned */ + NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE); + if (ret) + exit_with_error(ret); - ret = bpf_map_update_elem(qidconf_map, &key, &opt_queue, 0); - if (ret) { - fprintf(stderr, "ERROR: bpf_map_update_elem qidconf\n"); - exit(EXIT_FAILURE); - } + /* Create sockets... */ + umem = xsk_configure_umem(bufs, + NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE); + xsks[num_socks++] = xsk_configure_socket(umem); - /* Create sockets... */ - xsks[num_socks++] = xsk_configure(NULL); - -#if RR_LB - for (i = 0; i < MAX_SOCKS - 1; i++) - xsks[num_socks++] = xsk_configure(xsks[0]->umem); -#endif + if (opt_bench == BENCH_TXONLY) { + int i; - /* ...and insert them into the map. */ - for (i = 0; i < num_socks; i++) { - key = i; - ret = bpf_map_update_elem(xsks_map, &key, &xsks[i]->sfd, 0); - if (ret) { - fprintf(stderr, "ERROR: bpf_map_update_elem %d\n", i); - exit(EXIT_FAILURE); - } + for (i = 0; i < NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE; + i += XSK_UMEM__DEFAULT_FRAME_SIZE) + (void)gen_eth_frame(umem, i); } signal(SIGINT, int_exit); @@ -1005,7 +685,8 @@ int main(int argc, char **argv) setlocale(LC_ALL, ""); ret = pthread_create(&pt, NULL, poller, NULL); - lassert(ret == 0); + if (ret) + exit_with_error(ret); prev_time = get_nsecs(); |