aboutsummaryrefslogtreecommitdiffstats
path: root/samples/bpf
diff options
context:
space:
mode:
Diffstat (limited to 'samples/bpf')
-rw-r--r--samples/bpf/.gitignore1
-rw-r--r--samples/bpf/Makefile14
-rwxr-xr-xsamples/bpf/do_hbm_test.sh436
-rw-r--r--samples/bpf/fds_example.c10
-rw-r--r--samples/bpf/hbm.c441
-rw-r--r--samples/bpf/hbm.h31
-rw-r--r--samples/bpf/hbm_kern.h137
-rw-r--r--samples/bpf/hbm_out_kern.c157
-rw-r--r--samples/bpf/load_sock_ops.c97
-rw-r--r--samples/bpf/sock_example.c2
-rw-r--r--samples/bpf/sockex1_user.c25
-rw-r--r--samples/bpf/sockex2_user.c23
-rw-r--r--samples/bpf/sockex3_user.c2
-rw-r--r--samples/bpf/task_fd_query_kern.c2
-rw-r--r--samples/bpf/task_fd_query_user.c2
-rw-r--r--samples/bpf/tcp_basertt_kern.c2
-rw-r--r--samples/bpf/tcp_bpf.readme14
-rw-r--r--samples/bpf/tcp_bufs_kern.c2
-rw-r--r--samples/bpf/tcp_clamp_kern.c2
-rw-r--r--samples/bpf/tcp_cong_kern.c2
-rw-r--r--samples/bpf/tcp_iw_kern.c2
-rw-r--r--samples/bpf/tcp_rwnd_kern.c2
-rw-r--r--samples/bpf/tcp_synrto_kern.c2
-rw-r--r--samples/bpf/tcp_tos_reflect_kern.c2
-rw-r--r--samples/bpf/tracex2_user.c2
-rw-r--r--samples/bpf/tracex3_kern.c2
-rw-r--r--samples/bpf/xdp_redirect_map_user.c2
-rw-r--r--samples/bpf/xdp_redirect_user.c2
-rw-r--r--samples/bpf/xdpsock.h11
-rw-r--r--samples/bpf/xdpsock_kern.c56
-rw-r--r--samples/bpf/xdpsock_user.c841
31 files changed, 1529 insertions, 797 deletions
diff --git a/samples/bpf/.gitignore b/samples/bpf/.gitignore
index 8ae4940025f8..dbb817dbacfc 100644
--- a/samples/bpf/.gitignore
+++ b/samples/bpf/.gitignore
@@ -1,7 +1,6 @@
cpustat
fds_example
lathist
-load_sock_ops
lwt_len_hist
map_perf_test
offwaketime
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index a0ef7eddd0b3..65e667bdf979 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -40,7 +40,6 @@ hostprogs-y += lwt_len_hist
hostprogs-y += xdp_tx_iptunnel
hostprogs-y += test_map_in_map
hostprogs-y += per_socket_stats_example
-hostprogs-y += load_sock_ops
hostprogs-y += xdp_redirect
hostprogs-y += xdp_redirect_map
hostprogs-y += xdp_redirect_cpu
@@ -53,6 +52,7 @@ hostprogs-y += xdpsock
hostprogs-y += xdp_fwd
hostprogs-y += task_fd_query
hostprogs-y += xdp_sample_pkts
+hostprogs-y += hbm
# Libbpf dependencies
LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
@@ -60,9 +60,9 @@ LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
CGROUP_HELPERS := ../../tools/testing/selftests/bpf/cgroup_helpers.o
TRACE_HELPERS := ../../tools/testing/selftests/bpf/trace_helpers.o
-fds_example-objs := bpf_load.o fds_example.o
-sockex1-objs := bpf_load.o sockex1_user.o
-sockex2-objs := bpf_load.o sockex2_user.o
+fds_example-objs := fds_example.o
+sockex1-objs := sockex1_user.o
+sockex2-objs := sockex2_user.o
sockex3-objs := bpf_load.o sockex3_user.o
tracex1-objs := bpf_load.o tracex1_user.o
tracex2-objs := bpf_load.o tracex2_user.o
@@ -71,7 +71,6 @@ tracex4-objs := bpf_load.o tracex4_user.o
tracex5-objs := bpf_load.o tracex5_user.o
tracex6-objs := bpf_load.o tracex6_user.o
tracex7-objs := bpf_load.o tracex7_user.o
-load_sock_ops-objs := bpf_load.o load_sock_ops.o
test_probe_write_user-objs := bpf_load.o test_probe_write_user_user.o
trace_output-objs := bpf_load.o trace_output_user.o $(TRACE_HELPERS)
lathist-objs := bpf_load.o lathist_user.o
@@ -109,6 +108,7 @@ xdpsock-objs := xdpsock_user.o
xdp_fwd-objs := xdp_fwd_user.o
task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS)
xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS)
+hbm-objs := bpf_load.o hbm.o $(CGROUP_HELPERS)
# Tell kbuild to always build the programs
always := $(hostprogs-y)
@@ -163,10 +163,10 @@ always += xdp2skb_meta_kern.o
always += syscall_tp_kern.o
always += cpustat_kern.o
always += xdp_adjust_tail_kern.o
-always += xdpsock_kern.o
always += xdp_fwd_kern.o
always += task_fd_query_kern.o
always += xdp_sample_pkts_kern.o
+always += hbm_out_kern.o
KBUILD_HOSTCFLAGS += -I$(objtree)/usr/include
KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -266,6 +266,8 @@ $(BPF_SAMPLES_PATH)/*.c: verify_target_bpf $(LIBBPF)
$(src)/*.c: verify_target_bpf $(LIBBPF)
$(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h
+$(obj)/hbm_out_kern.o: $(src)/hbm.h $(src)/hbm_kern.h
+$(obj)/hbm.o: $(src)/hbm.h
# asm/sysreg.h - inline assembly used by it is incompatible with llvm.
# But, there is no easy way to fix it, so just exclude it since it is
diff --git a/samples/bpf/do_hbm_test.sh b/samples/bpf/do_hbm_test.sh
new file mode 100755
index 000000000000..56c8b4115c95
--- /dev/null
+++ b/samples/bpf/do_hbm_test.sh
@@ -0,0 +1,436 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (c) 2019 Facebook
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of version 2 of the GNU General Public
+# License as published by the Free Software Foundation.
+
+Usage() {
+ echo "Script for testing HBM (Host Bandwidth Manager) framework."
+ echo "It creates a cgroup to use for testing and load a BPF program to limit"
+ echo "egress or ingress bandwidht. It then uses iperf3 or netperf to create"
+ echo "loads. The output is the goodput in Mbps (unless -D was used)."
+ echo ""
+ echo "USAGE: $name [out] [-b=<prog>|--bpf=<prog>] [-c=<cc>|--cc=<cc>] [-D]"
+ echo " [-d=<delay>|--delay=<delay>] [--debug] [-E]"
+ echo " [-f=<#flows>|--flows=<#flows>] [-h] [-i=<id>|--id=<id >]"
+ echo " [-l] [-N] [-p=<port>|--port=<port>] [-P]"
+ echo " [-q=<qdisc>] [-R] [-s=<server>|--server=<server]"
+ echo " [-S|--stats] -t=<time>|--time=<time>] [-w] [cubic|dctcp]"
+ echo " Where:"
+ echo " out egress (default)"
+ echo " -b or --bpf BPF program filename to load and attach."
+ echo " Default is hbm_out_kern.o for egress,"
+ echo " -c or -cc TCP congestion control (cubic or dctcp)"
+ echo " --debug print BPF trace buffer"
+ echo " -d or --delay add a delay in ms using netem"
+ echo " -D In addition to the goodput in Mbps, it also outputs"
+ echo " other detailed information. This information is"
+ echo " test dependent (i.e. iperf3 or netperf)."
+ echo " -E enable ECN (not required for dctcp)"
+ echo " -f or --flows number of concurrent flows (default=1)"
+ echo " -i or --id cgroup id (an integer, default is 1)"
+ echo " -N use netperf instead of iperf3"
+ echo " -l do not limit flows using loopback"
+ echo " -h Help"
+ echo " -p or --port iperf3 port (default is 5201)"
+ echo " -P use an iperf3 instance for each flow"
+ echo " -q use the specified qdisc"
+ echo " -r or --rate rate in Mbps (default 1s 1Gbps)"
+ echo " -R Use TCP_RR for netperf. 1st flow has req"
+ echo " size of 10KB, rest of 1MB. Reply in all"
+ echo " cases is 1 byte."
+ echo " More detailed output for each flow can be found"
+ echo " in the files netperf.<cg>.<flow>, where <cg> is the"
+ echo " cgroup id as specified with the -i flag, and <flow>"
+ echo " is the flow id starting at 1 and increasing by 1 for"
+ echo " flow (as specified by -f)."
+ echo " -s or --server hostname of netperf server. Used to create netperf"
+ echo " test traffic between to hosts (default is within host)"
+ echo " netserver must be running on the host."
+ echo " -S or --stats whether to update hbm stats (default is yes)."
+ echo " -t or --time duration of iperf3 in seconds (default=5)"
+ echo " -w Work conserving flag. cgroup can increase its"
+ echo " bandwidth beyond the rate limit specified"
+ echo " while there is available bandwidth. Current"
+ echo " implementation assumes there is only one NIC"
+ echo " (eth0), but can be extended to support multiple"
+ echo " NICs."
+ echo " cubic or dctcp specify which TCP CC to use"
+ echo " "
+ exit
+}
+
+#set -x
+
+debug_flag=0 # set to 1 by --debug
+args="$@" # command-line arguments, iterated by processArgs
+name="$0" # script name shown in the usage text
+netem=0 # netem delay in ms for lo (-d); 0 leaves netem off
+cc=x # TCP congestion control; "x" means do not change it
+dir="-o" # direction flag passed to hbm
+dir_name="out" # suffix of the hbm.<id>.<dir_name> stats file
+dur=5 # test duration in seconds (-t)
+flows=1 # number of concurrent flows (-f)
+id=1 # cgroup id (-i); the cgroup directory is hbm$id
+prog="" # BPF object file passed to hbm (-b); empty passes none
+port=5201 # first iperf3 port (-p)
+rate=1000 # rate limit in Mbps passed to hbm (-r); later reused to hold the measured rate
+multi_iperf=0 # set to 1 by -P: one iperf3 instance per flow
+flow_cnt=1 # loop counter over flows
+use_netperf=0 # set to 1 by -N
+rr=0 # set to 1 by -R: netperf TCP_RR test
+ecn=0 # set to 1 by -E: net.ipv4.tcp_ecn is set to 1 for the test
+details=0 # set to 1 by -D: print detailed output
+server="" # netperf server host (-s); empty means local host
+qdisc="" # qdisc installed on lo (-q)
+flags="" # extra flags accumulated for hbm (-d, -l, -s, -w)
+do_stats=0 # set to 1 by -S: show the hbm stats file with -D
+
+function start_hbm () {
+ rm -f hbm.out
+ echo "./hbm $dir -n $id -r $rate -t $dur $flags $dbg $prog" > hbm.out
+ echo " " >> hbm.out
+ ./hbm $dir -n $id -r $rate -t $dur $flags $dbg $prog >> hbm.out 2>&1 &
+ echo $!
+}
+
+processArgs () {
+ for i in $args ; do
+ case $i in
+ # Support for upcomming ingress rate limiting
+ #in) # support for upcoming ingress rate limiting
+ # dir="-i"
+ # dir_name="in"
+ # ;;
+ out)
+ dir="-o"
+ dir_name="out"
+ ;;
+ -b=*|--bpf=*)
+ prog="${i#*=}"
+ ;;
+ -c=*|--cc=*)
+ cc="${i#*=}"
+ ;;
+ --debug)
+ flags="$flags -d"
+ debug_flag=1
+ ;;
+ -d=*|--delay=*)
+ netem="${i#*=}"
+ ;;
+ -D)
+ details=1
+ ;;
+ -E)
+ ecn=1
+ ;;
+ # Support for upcomming fq Early Departure Time egress rate limiting
+ #--edt)
+ # prog="hbm_out_edt_kern.o"
+ # qdisc="fq"
+ # ;;
+ -f=*|--flows=*)
+ flows="${i#*=}"
+ ;;
+ -i=*|--id=*)
+ id="${i#*=}"
+ ;;
+ -l)
+ flags="$flags -l"
+ ;;
+ -N)
+ use_netperf=1
+ ;;
+ -p=*|--port=*)
+ port="${i#*=}"
+ ;;
+ -P)
+ multi_iperf=1
+ ;;
+ -q=*)
+ qdisc="${i#*=}"
+ ;;
+ -r=*|--rate=*)
+ rate="${i#*=}"
+ ;;
+ -R)
+ rr=1
+ ;;
+ -s=*|--server=*)
+ server="${i#*=}"
+ ;;
+ -S|--stats)
+ flags="$flags -s"
+ do_stats=1
+ ;;
+ -t=*|--time=*)
+ dur="${i#*=}"
+ ;;
+ -w)
+ flags="$flags -w"
+ ;;
+ cubic)
+ cc=cubic
+ ;;
+ dctcp)
+ cc=dctcp
+ ;;
+ *)
+ echo "Unknown arg:$i"
+ Usage
+ ;;
+ esac
+ done
+}
+
+# Test setup: parse arguments, start hbm, move this shell into the test
+# cgroup, apply the requested sysctl/qdisc settings and start a background
+# ping used for the delay measurement.
+processArgs
+
+if [ $debug_flag -eq 1 ] ; then
+	rm -f hbm_out.log
+fi
+
+hbm_pid=$(start_hbm)
+# Give the background hbm a moment to start. "sleep 0.1" replaces the
+# "usleep" command, which is not available on many distributions.
+sleep 0.1
+
+host=`hostname`
+cg_base_dir=/sys/fs/cgroup
+cg_dir="$cg_base_dir/cgroup-test-work-dir/hbm$id"
+
+# Move this shell (and therefore the traffic tools it starts) into the cgroup.
+echo $$ >> $cg_dir/cgroup.procs
+
+ulimit -l unlimited
+
+rm -f ss.out
+rm -f hbm.[0-9]*.$dir_name
+if [ $ecn -ne 0 ] ; then
+	sysctl -w -q -n net.ipv4.tcp_ecn=1
+fi
+
+if [ $use_netperf -eq 0 ] ; then
+	# Remember the current congestion control so it can be restored later.
+	cur_cc=`sysctl -n net.ipv4.tcp_congestion_control`
+	if [ "$cc" != "x" ] ; then
+		sysctl -w -q -n net.ipv4.tcp_congestion_control=$cc
+	fi
+fi
+
+if [ "$netem" -ne "0" ] ; then
+	if [ "$qdisc" != "" ] ; then
+		echo "WARNING: Ignoring -q options because -d option used"
+	fi
+	tc qdisc del dev lo root > /dev/null 2>&1
+	tc qdisc add dev lo root netem delay $netem\ms > /dev/null 2>&1
+elif [ "$qdisc" != "" ] ; then
+	tc qdisc del dev lo root > /dev/null 2>&1
+	tc qdisc add dev lo root $qdisc > /dev/null 2>&1
+fi
+
+n=0
+# One ping every 0.2s, so dur * 5 pings span the test duration.
+m=$((dur * 5))
+hn="::1"
+if [ $use_netperf -ne 0 ] ; then
+	if [ "$server" != "" ] ; then
+		hn=$server
+	fi
+fi
+
+( ping6 -i 0.2 -c $m $hn > ping.out 2>&1 ) &
+
+if [ $use_netperf -ne 0 ] ; then
+ begNetserverPid=`ps ax | grep netserver | grep --invert-match "grep" | \
+ awk '{ print $1 }'`
+ if [ "$begNetserverPid" == "" ] ; then
+ if [ "$server" == "" ] ; then
+ ( ./netserver > /dev/null 2>&1) &
+ usleep 100000
+ fi
+ fi
+ flow_cnt=1
+ if [ "$server" == "" ] ; then
+ np_server=$host
+ else
+ np_server=$server
+ fi
+ if [ "$cc" == "x" ] ; then
+ np_cc=""
+ else
+ np_cc="-K $cc,$cc"
+ fi
+ replySize=1
+ while [ $flow_cnt -le $flows ] ; do
+ if [ $rr -ne 0 ] ; then
+ reqSize=1M
+ if [ $flow_cnt -eq 1 ] ; then
+ reqSize=10K
+ fi
+ if [ "$dir" == "-i" ] ; then
+ replySize=$reqSize
+ reqSize=1
+ fi
+ ( ./netperf -H $np_server -l $dur -f m -j -t TCP_RR -- -r $reqSize,$replySize $np_cc -k P50_lATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,REMOTE_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,LOCAL_RECV_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) &
+ else
+ if [ "$dir" == "-i" ] ; then
+ ( ./netperf -H $np_server -l $dur -f m -j -t TCP_RR -- -r 1,10M $np_cc -k P50_LATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,REMOTE_TRANSPORT_RETRANS,REMOTE_SEND_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) &
+ else
+ ( ./netperf -H $np_server -l $dur -f m -j -t TCP_STREAM -- $np_cc -k P50_lATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) &
+ fi
+ fi
+ flow_cnt=$[flow_cnt+1]
+ done
+
+# sleep for duration of test (plus some buffer)
+ n=$[dur+2]
+ sleep $n
+
+# force graceful termination of netperf
+ pids=`pgrep netperf`
+ for p in $pids ; do
+ kill -SIGALRM $p
+ done
+
+ flow_cnt=1
+ rate=0
+ if [ $details -ne 0 ] ; then
+ echo ""
+ echo "Details for HBM in cgroup $id"
+ if [ $do_stats -eq 1 ] ; then
+ if [ -e hbm.$id.$dir_name ] ; then
+ cat hbm.$id.$dir_name
+ fi
+ fi
+ fi
+ while [ $flow_cnt -le $flows ] ; do
+ if [ "$dir" == "-i" ] ; then
+ r=`cat netperf.$id.$flow_cnt | grep -o "REMOTE_SEND_THROUGHPUT=[0-9]*" | grep -o "[0-9]*"`
+ else
+ r=`cat netperf.$id.$flow_cnt | grep -o "LOCAL_SEND_THROUGHPUT=[0-9]*" | grep -o "[0-9]*"`
+ fi
+ echo "rate for flow $flow_cnt: $r"
+ rate=$[rate+r]
+ if [ $details -ne 0 ] ; then
+ echo "-----"
+ echo "Details for cgroup $id, flow $flow_cnt"
+ cat netperf.$id.$flow_cnt
+ fi
+ flow_cnt=$[flow_cnt+1]
+ done
+ if [ $details -ne 0 ] ; then
+ echo ""
+ delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"`
+ echo "PING AVG DELAY:$delay"
+ echo "AGGREGATE_GOODPUT:$rate"
+ else
+ echo $rate
+ fi
+elif [ $multi_iperf -eq 0 ] ; then
+ (iperf3 -s -p $port -1 > /dev/null 2>&1) &
+ usleep 100000
+ iperf3 -c $host -p $port -i 0 -P $flows -f m -t $dur > iperf.$id
+ rates=`grep receiver iperf.$id | grep -o "[0-9.]* Mbits" | grep -o "^[0-9]*"`
+ rate=`echo $rates | grep -o "[0-9]*$"`
+
+ if [ $details -ne 0 ] ; then
+ echo ""
+ echo "Details for HBM in cgroup $id"
+ if [ $do_stats -eq 1 ] ; then
+ if [ -e hbm.$id.$dir_name ] ; then
+ cat hbm.$id.$dir_name
+ fi
+ fi
+ delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"`
+ echo "PING AVG DELAY:$delay"
+ echo "AGGREGATE_GOODPUT:$rate"
+ else
+ echo $rate
+ fi
+else
+ flow_cnt=1
+ while [ $flow_cnt -le $flows ] ; do
+ (iperf3 -s -p $port -1 > /dev/null 2>&1) &
+ ( iperf3 -c $host -p $port -i 0 -P 1 -f m -t $dur | grep receiver | grep -o "[0-9.]* Mbits" | grep -o "^[0-9]*" | grep -o "[0-9]*$" > iperf3.$id.$flow_cnt ) &
+ port=$[port+1]
+ flow_cnt=$[flow_cnt+1]
+ done
+ n=$[dur+1]
+ sleep $n
+ flow_cnt=1
+ rate=0
+ if [ $details -ne 0 ] ; then
+ echo ""
+ echo "Details for HBM in cgroup $id"
+ if [ $do_stats -eq 1 ] ; then
+ if [ -e hbm.$id.$dir_name ] ; then
+ cat hbm.$id.$dir_name
+ fi
+ fi
+ fi
+
+ while [ $flow_cnt -le $flows ] ; do
+ r=`cat iperf3.$id.$flow_cnt`
+# echo "rate for flow $flow_cnt: $r"
+ if [ $details -ne 0 ] ; then
+ echo "Rate for cgroup $id, flow $flow_cnt LOCAL_SEND_THROUGHPUT=$r"
+ fi
+ rate=$[rate+r]
+ flow_cnt=$[flow_cnt+1]
+ done
+ if [ $details -ne 0 ] ; then
+ delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"`
+ echo "PING AVG DELAY:$delay"
+ echo "AGGREGATE_GOODPUT:$rate"
+ else
+ echo $rate
+ fi
+fi
+
+# Cleanup: restore sysctls, remove the qdisc installed on lo, stop hbm,
+# detach leftover BPF programs and stop the netserver this script started.
+if [ $use_netperf -eq 0 ] ; then
+	sysctl -w -q -n net.ipv4.tcp_congestion_control=$cur_cc
+fi
+# NOTE(review): this sets tcp_ecn to 0 rather than restoring the value that
+# was in effect before the test.
+if [ $ecn -ne 0 ] ; then
+	sysctl -w -q -n net.ipv4.tcp_ecn=0
+fi
+# The setup installs a root qdisc on lo for -d (netem) and for -q; remove it
+# in both cases.
+if [ "$netem" -ne "0" ] || [ "$qdisc" != "" ] ; then
+	tc qdisc del dev lo root > /dev/null 2>&1
+fi
+
+sleep 2
+
+# Kill the hbm instance started by this script if it is still running.
+# Checking the command name of that exact pid also works when several hbm
+# instances are running at the same time.
+if [ -n "$hbm_pid" ] && [ "$(ps -p "$hbm_pid" -o comm= 2>/dev/null)" == "hbm" ] ; then
+	kill $hbm_pid
+fi
+
+sleep 1
+
+# Detach any BPF programs that may have lingered
+# The bpftool output is scanned word by word: a cgroup path (v=0), followed
+# by a program id (v=1) and its attach type, after which it is detached.
+ttx=`bpftool cgroup tree | grep hbm`
+v=2
+for x in $ttx ; do
+	if [ "${x:0:36}" == "/sys/fs/cgroup/cgroup-test-work-dir/" ] ; then
+		cg=$x ; v=0
+	else
+		if [ $v -eq 0 ] ; then
+			id=$x ; v=1
+		else
+			if [ $v -eq 1 ] ; then
+				type=$x ; bpftool cgroup detach $cg $type id $id
+				v=0
+			fi
+		fi
+	fi
+done
+
+# Only stop netserver when this script started it (none was running before).
+if [ $use_netperf -ne 0 ] ; then
+	if [ "$server" == "" ] ; then
+		if [ "$begNetserverPid" == "" ] ; then
+			netserverPid=`ps ax | grep netserver | grep --invert-match "grep" | awk '{ print $1 }'`
+			if [ "$netserverPid" != "" ] ; then
+				kill $netserverPid
+			fi
+		fi
+	fi
+fi
+exit
diff --git a/samples/bpf/fds_example.c b/samples/bpf/fds_example.c
index 9854854f05d1..e51eb060244e 100644
--- a/samples/bpf/fds_example.c
+++ b/samples/bpf/fds_example.c
@@ -14,8 +14,8 @@
#include <bpf/bpf.h>
+#include "bpf/libbpf.h"
#include "bpf_insn.h"
-#include "bpf_load.h"
#include "sock_example.h"
#define BPF_F_PIN (1 << 0)
@@ -57,10 +57,14 @@ static int bpf_prog_create(const char *object)
BPF_EXIT_INSN(),
};
size_t insns_cnt = sizeof(insns) / sizeof(struct bpf_insn);
+ char bpf_log_buf[BPF_LOG_BUF_SIZE];
+ struct bpf_object *obj;
+ int prog_fd;
if (object) {
- assert(!load_bpf_file((char *)object));
- return prog_fd[0];
+ assert(!bpf_prog_load(object, BPF_PROG_TYPE_UNSPEC,
+ &obj, &prog_fd));
+ return prog_fd;
} else {
return bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER,
insns, insns_cnt, "GPL", 0,
diff --git a/samples/bpf/hbm.c b/samples/bpf/hbm.c
new file mode 100644
index 000000000000..8408ccb7409f
--- /dev/null
+++ b/samples/bpf/hbm.c
@@ -0,0 +1,441 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Example program for Host Bandwidth Management
+ *
+ * This program loads a cgroup skb BPF program to enforce cgroup output
+ * (egress) or input (ingress) bandwidth limits.
+ *
+ * USAGE: hbm [-d] [-l] [-n <id>] [-r <rate>] [-s] [-t <secs>] [-w] [-h] [prog]
+ * Where:
+ * -d Print BPF trace debug buffer
+ * -l Also limit flows doing loopback
+ * -n <#> To create cgroup \"/hbm#\" and attach prog
+ * Default is /hbm1
+ * -r <rate> Rate limit in Mbps
+ * -s Get HBM stats (marked, dropped, etc.)
+ * -t <time> Exit after specified seconds (default is 1)
+ * -w Work conserving flag. cgroup can increase its bandwidth
+ * beyond the rate limit specified while there is available
+ * bandwidth. Current implementation assumes there is only
+ * one NIC (eth0), but can be extended to support multiple NICs.
+ * Currently only supported for egress.
+ * -h Print this info
+ * prog BPF program file name. Name defaults to hbm_out_kern.o
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/unistd.h>
+
+#include <linux/bpf.h>
+#include <bpf/bpf.h>
+
+#include "bpf_load.h"
+#include "bpf_rlimit.h"
+#include "cgroup_helpers.h"
+#include "hbm.h"
+#include "bpf_util.h"
+#include "bpf/bpf.h"
+#include "bpf/libbpf.h"
+
+bool outFlag = true;
+int minRate = 1000; /* cgroup rate limit in Mbps */
+int rate = 1000; /* can grow if rate conserving is enabled */
+int dur = 1;
+bool stats_flag;
+bool loopback_flag;
+bool debugFlag;
+bool work_conserving_flag;
+
+static void Usage(void);
+static void read_trace_pipe2(void);
+static void do_error(char *msg, bool errno_flag);
+
+#define DEBUGFS "/sys/kernel/debug/tracing/"
+
+struct bpf_object *obj;
+int bpfprog_fd;
+int cgroup_storage_fd;
+
+/*
+ * Copy the kernel trace pipe to stdout and to the file hbm_out.log.
+ *
+ * Opens DEBUGFS "trace_pipe" and reads it in a loop; every chunk read is
+ * printed with puts() and, if the log file could be created, also written
+ * to it and flushed. Failing to create the log file is not fatal.
+ *
+ * The loop only ends when read() fails with an error other than EINTR, in
+ * which case both files are closed before returning. Returns immediately
+ * if the trace pipe cannot be opened.
+ */
+static void read_trace_pipe2(void)
+{
+	char *outFname = "hbm_out.log";
+	int trace_fd;
+	FILE *outf;
+
+	trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0);
+	if (trace_fd < 0) {
+		printf("Error opening trace_pipe\n");
+		return;
+	}
+
+// Future support of ingress
+//	if (!outFlag)
+//		outFname = "hbm_in.log";
+	outf = fopen(outFname, "w");
+
+	if (outf == NULL)
+		printf("Error creating %s\n", outFname);
+
+	while (1) {
+		static char buf[4097];
+		ssize_t sz;
+
+		/* Leave room for the terminating NUL added below. */
+		sz = read(trace_fd, buf, sizeof(buf) - 1);
+		if (sz < 0) {
+			if (errno == EINTR)
+				continue;
+			/* A persistent error would otherwise spin forever. */
+			printf("Error reading trace_pipe, errno: %d\n", errno);
+			break;
+		}
+		if (sz > 0) {
+			buf[sz] = 0;
+			puts(buf);
+			if (outf != NULL) {
+				fprintf(outf, "%s\n", buf);
+				fflush(outf);
+			}
+		}
+	}
+
+	if (outf != NULL)
+		fclose(outf);
+	close(trace_fd);
+}
+
+/*
+ * Report a fatal error and terminate the process with exit status 1.
+ *
+ * @msg:        text printed after the "ERROR: " prefix
+ * @errno_flag: when true, the current value of errno is appended
+ *
+ * The message is written to stdout. This function does not return.
+ */
+static void do_error(char *msg, bool errno_flag)
+{
+	if (!errno_flag) {
+		printf("ERROR: %s\n", msg);
+		exit(1);
+	}
+
+	printf("ERROR: %s, errno: %d\n", msg, errno);
+	exit(1);
+}
+
+/*
+ * Load the BPF object file @prog as a cgroup skb program with egress as
+ * the expected attach type.
+ *
+ * On success the loaded object and program fd are stored in the file-scope
+ * variables obj and bpfprog_fd, and the fd of the map named "queue_stats"
+ * is returned.
+ *
+ * Returns -1 on every failure (unreadable file, load failure, map missing).
+ * run_bpf_prog() only treats -1 as an error, so no other value may be used
+ * to signal one.
+ */
+static int prog_load(char *prog)
+{
+	struct bpf_prog_load_attr prog_load_attr = {
+		.prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+		.file = prog,
+		.expected_attach_type = BPF_CGROUP_INET_EGRESS,
+	};
+	struct bpf_map *map;
+	int map_fd;
+
+	/* R_OK is the access() mode for "readable"; O_RDONLY is an open() flag. */
+	if (access(prog, R_OK) < 0) {
+		printf("Error accessing file %s: %s\n", prog, strerror(errno));
+		return -1;
+	}
+	if (bpf_prog_load_xattr(&prog_load_attr, &obj, &bpfprog_fd))
+		goto err;
+
+	map = bpf_object__find_map_by_name(obj, "queue_stats");
+	if (map == NULL) {
+		printf("Map not found: queue_stats\n");
+		goto err;
+	}
+	map_fd = bpf_map__fd(map);
+	if (map_fd < 0) {
+		printf("Map not found: queue_stats (fd: %d)\n", map_fd);
+		goto err;
+	}
+
+	return map_fd;
+
+err:
+	printf("ERROR: load_bpf_file failed for: %s\n", prog);
+	/* NOTE(review): bpf_log_buf comes from bpf_load.h; confirm that
+	 * bpf_prog_load_xattr() actually fills it.
+	 */
+	printf(" Output from verifier:\n%s\n------\n", bpf_log_buf);
+	return -1;
+}
+
+/*
+ * Read the cumulative transmit byte counter of eth0 from sysfs.
+ * Terminates the program through do_error() if it cannot be read.
+ */
+static unsigned long long read_eth0_tx_bytes(void)
+{
+	unsigned long long tx_bytes;
+	FILE *fin;
+
+	fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", "r");
+	if (fin == NULL)
+		do_error("could not open eth0 tx_bytes", true);
+	if (fscanf(fin, "%llu", &tx_bytes) != 1)
+		do_error("fscanf fails", false);
+	fclose(fin);
+
+	return tx_bytes;
+}
+
+/*
+ * Write the statistics in *qsp for cgroup id cg_id to "hbm.<cg_id>.out".
+ * Prints an error and writes nothing if the file cannot be created.
+ */
+static void write_queue_stats(int cg_id, struct hbm_queue_stats *qsp)
+{
+	long long delta_us = (qsp->lastPacketTime -
+			      qsp->firstPacketTime) / 1000;
+	unsigned int rate_mbps = 0;
+	double percent_pkts, percent_bytes;
+	char fname[100];
+	FILE *fout;
+
+	/* An interval shorter than 1 us would otherwise divide by zero. */
+	if (delta_us > 0)
+		rate_mbps = (qsp->bytes_total - qsp->bytes_dropped) * 8 /
+			    delta_us;
+
+// Future support of ingress
+//	if (!outFlag)
+//		sprintf(fname, "hbm.%d.in", cg_id);
+//	else
+	sprintf(fname, "hbm.%d.out", cg_id);
+	fout = fopen(fname, "w");
+	if (fout == NULL) {
+		printf("ERROR: Could not create %s\n", fname);
+		return;
+	}
+	fprintf(fout, "id:%d\n", cg_id);
+	fprintf(fout, "rate_mbps:%u\n", rate_mbps);
+	fprintf(fout, "duration:%.1f secs\n",
+		(qsp->lastPacketTime - qsp->firstPacketTime) /
+		1000000000.0);
+	fprintf(fout, "packets:%d\n", (int)qsp->pkts_total);
+	fprintf(fout, "bytes_MB:%d\n", (int)(qsp->bytes_total /
+					     1000000));
+	fprintf(fout, "pkts_dropped:%d\n", (int)qsp->pkts_dropped);
+	fprintf(fout, "bytes_dropped_MB:%d\n",
+		(int)(qsp->bytes_dropped /
+		      1000000));
+	// Marked Pkts and Bytes (the +1 avoids dividing by zero)
+	percent_pkts = (qsp->pkts_marked * 100.0) /
+		(qsp->pkts_total + 1);
+	percent_bytes = (qsp->bytes_marked * 100.0) /
+		(qsp->bytes_total + 1);
+	fprintf(fout, "pkts_marked_percent:%6.2f\n", percent_pkts);
+	fprintf(fout, "bytes_marked_percent:%6.2f\n", percent_bytes);
+
+	// Dropped Pkts and Bytes
+	percent_pkts = (qsp->pkts_dropped * 100.0) /
+		(qsp->pkts_total + 1);
+	percent_bytes = (qsp->bytes_dropped * 100.0) /
+		(qsp->bytes_total + 1);
+	fprintf(fout, "pkts_dropped_percent:%6.2f\n", percent_pkts);
+	fprintf(fout, "bytes_dropped_percent:%6.2f\n", percent_bytes);
+	fclose(fout);
+}
+
+/*
+ * Load @prog, attach it to the cgroup "/hbm<cg_id>" and run for dur seconds.
+ *
+ * Steps:
+ *  - load the program and obtain the queue_stats map fd via prog_load()
+ *  - set up the cgroup environment, create the cgroup and join it
+ *  - store rate and the stats/loopback flags in the queue_stats map
+ *  - attach the program (egress, or ingress when outFlag is false)
+ *  - with work_conserving_flag set, poll every DELTA_RATE_CHECK us and
+ *    raise or lower the cgroup rate depending on eth0's transmit rate;
+ *    otherwise just sleep for dur seconds
+ *  - with stats_flag set, write the collected statistics to a file
+ *  - with debugFlag set, dump the trace pipe via read_trace_pipe2()
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+static int run_bpf_prog(char *prog, int cg_id)
+{
+	int map_fd;
+	int rc = 0;
+	int key = 0;
+	int cg1 = 0;
+	int type = BPF_CGROUP_INET_EGRESS;
+	char cg_dir[100];
+	struct hbm_queue_stats qstats = {0};
+
+	snprintf(cg_dir, sizeof(cg_dir), "/hbm%d", cg_id);
+	map_fd = prog_load(prog);
+	if (map_fd < 0)
+		return 1;
+
+	if (setup_cgroup_environment()) {
+		printf("ERROR: setting cgroup environment\n");
+		goto err;
+	}
+	cg1 = create_and_get_cgroup(cg_dir);
+	if (!cg1) {
+		printf("ERROR: create_and_get_cgroup\n");
+		goto err;
+	}
+	if (join_cgroup(cg_dir)) {
+		printf("ERROR: join_cgroup\n");
+		goto err;
+	}
+
+	qstats.rate = rate;
+	qstats.stats = stats_flag ? 1 : 0;
+	qstats.loopback = loopback_flag ? 1 : 0;
+	if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) {
+		printf("ERROR: Could not update map element\n");
+		goto err;
+	}
+
+	if (!outFlag)
+		type = BPF_CGROUP_INET_INGRESS;
+	if (bpf_prog_attach(bpfprog_fd, cg1, type, 0)) {
+		printf("ERROR: bpf_prog_attach fails!\n");
+		log_err("Attaching prog");
+		goto err;
+	}
+
+	if (work_conserving_flag) {
+		struct timeval t0, t_last, t_new;
+		unsigned long long last_eth_tx_bytes, new_eth_tx_bytes;
+		signed long long last_cg_tx_bytes, new_cg_tx_bytes;
+		signed long long delta_time, delta_bytes, delta_rate;
+		int delta_ms;
+#define DELTA_RATE_CHECK 10000		/* in us */
+#define RATE_THRESHOLD 9500000000	/* 9.5 Gbps */
+
+		bpf_map_lookup_elem(map_fd, &key, &qstats);
+		if (gettimeofday(&t0, NULL) < 0)
+			do_error("gettimeofday failed", true);
+		t_last = t0;
+		last_eth_tx_bytes = read_eth0_tx_bytes();
+		last_cg_tx_bytes = qstats.bytes_total;
+		while (true) {
+			usleep(DELTA_RATE_CHECK);
+			if (gettimeofday(&t_new, NULL) < 0)
+				do_error("gettimeofday failed", true);
+			/* Time since start, used to end after dur seconds. */
+			delta_ms = (t_new.tv_sec - t0.tv_sec) * 1000 +
+				(t_new.tv_usec - t0.tv_usec)/1000;
+			if (delta_ms > dur * 1000)
+				break;
+			/* Time since the previous sample, in us. */
+			delta_time = (t_new.tv_sec - t_last.tv_sec) * 1000000 +
+				(t_new.tv_usec - t_last.tv_usec);
+			if (delta_time == 0)
+				continue;
+			t_last = t_new;
+			new_eth_tx_bytes = read_eth0_tx_bytes();
+			printf(" new_eth_tx_bytes:%llu\n",
+			       new_eth_tx_bytes);
+			bpf_map_lookup_elem(map_fd, &key, &qstats);
+			new_cg_tx_bytes = qstats.bytes_total;
+			delta_bytes = new_eth_tx_bytes - last_eth_tx_bytes;
+			last_eth_tx_bytes = new_eth_tx_bytes;
+			/* bytes per us -> bits per second */
+			delta_rate = (delta_bytes * 8000000) / delta_time;
+			printf("%5d - eth_rate:%.1fGbps cg_rate:%.3fGbps",
+			       delta_ms, delta_rate/1000000000.0,
+			       rate/1000.0);
+			if (delta_rate < RATE_THRESHOLD) {
+				/* can increase cgroup rate limit, but first
+				 * check if we are using the current limit.
+				 * Currently increasing by 6.25%, unknown
+				 * if that is the optimal rate.
+				 */
+				int rate_diff100;
+
+				delta_bytes = new_cg_tx_bytes -
+					last_cg_tx_bytes;
+				last_cg_tx_bytes = new_cg_tx_bytes;
+				delta_rate = (delta_bytes * 8000000) /
+					delta_time;
+				printf(" rate:%.3fGbps",
+				       delta_rate/1000000000.0);
+				/* Unused share of the current limit, in %. */
+				rate_diff100 = (((long long)rate)*1000000 -
+						delta_rate) * 100 /
+					(((long long) rate) * 1000000);
+				printf(" rdiff:%d", rate_diff100);
+				if (rate_diff100 <= 3) {
+					rate += (rate >> 4);
+					if (rate > RATE_THRESHOLD / 1000000)
+						rate = RATE_THRESHOLD / 1000000;
+					qstats.rate = rate;
+					printf(" INC\n");
+				} else {
+					printf("\n");
+				}
+			} else {
+				/* Need to decrease cgroup rate limit.
+				 * Currently decreasing by 12.5%, unknown
+				 * if that is optimal
+				 */
+				printf(" DEC\n");
+				rate -= (rate >> 3);
+				if (rate < minRate)
+					rate = minRate;
+				qstats.rate = rate;
+			}
+			if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY))
+				do_error("update map element fails", false);
+		}
+	} else {
+		sleep(dur);
+	}
+	// Get stats!
+	if (stats_flag && bpf_map_lookup_elem(map_fd, &key, &qstats)) {
+		char fname[100];
+		FILE *fout;
+
+		if (!outFlag)
+			sprintf(fname, "hbm.%d.in", cg_id);
+		else
+			sprintf(fname, "hbm.%d.out", cg_id);
+		fout = fopen(fname, "w");
+		if (fout == NULL) {
+			printf("ERROR: Could not create %s\n", fname);
+		} else {
+			fprintf(fout, "id:%d\n", cg_id);
+			fprintf(fout, "ERROR: Could not lookup queue_stats\n");
+			fclose(fout);
+		}
+	} else if (stats_flag && qstats.lastPacketTime >
+		   qstats.firstPacketTime) {
+		write_queue_stats(cg_id, &qstats);
+	}
+
+	if (debugFlag)
+		read_trace_pipe2();
+	return rc;
+err:
+	rc = 1;
+
+	if (cg1)
+		close(cg1);
+	cleanup_cgroup_environment();
+
+	return rc;
+}
+
+/*
+ * Print the command-line help for hbm to stdout.
+ * The documented -t default matches the initial value of dur (1 second).
+ */
+static void Usage(void)
+{
+	printf("This program loads a cgroup skb BPF program to enforce\n"
+	       "cgroup output (egress) bandwidth limits.\n\n"
+	       "USAGE: hbm [-o] [-d] [-l] [-n <id>] [-r <rate>] [-s]\n"
+	       " [-t <secs>] [-w] [-h] [prog]\n"
+	       " Where:\n"
+	       " -o indicates egress direction (default)\n"
+	       " -d print BPF trace debug buffer\n"
+	       " -l also limit flows using loopback\n"
+	       " -n <#> to create cgroup \"/hbm#\" and attach prog\n"
+	       " Default is /hbm1\n"
+	       " -r <rate> Rate in Mbps\n"
+	       " -s Update HBM stats\n"
+	       " -t <time> Exit after specified seconds (default is 1)\n"
+	       " -w Work conserving flag. cgroup can increase\n"
+	       " bandwidth beyond the rate limit specified\n"
+	       " while there is available bandwidth. Current\n"
+	       " implementation assumes there is only eth0\n"
+	       " but can be extended to support multiple NICs\n"
+	       " -h print this info\n"
+	       " prog BPF program file name. Name defaults to\n"
+	       " hbm_out_kern.o\n");
+}
+
+/*
+ * Parse the command line into the file-scope settings (debugFlag,
+ * loopback_flag, minRate/rate, stats_flag, dur, work_conserving_flag) and
+ * run the BPF program through run_bpf_prog().
+ *
+ * An optional non-option argument selects the BPF object file; it defaults
+ * to "hbm_out_kern.o".
+ *
+ * Returns run_bpf_prog()'s result, 0 after printing the usage for -h, and
+ * 1 for an invalid option, a missing option argument or a non-positive
+ * -r value.
+ */
+int main(int argc, char **argv)
+{
+	char *prog = "hbm_out_kern.o";
+	int k;
+	int cg_id = 1;
+	char *optstring = "iodln:r:st:wh";
+
+	while ((k = getopt(argc, argv, optstring)) != -1) {
+		switch (k) {
+		case 'o':
+			break;
+		case 'd':
+			debugFlag = true;
+			break;
+		case 'l':
+			loopback_flag = true;
+			break;
+		case 'n':
+			cg_id = atoi(optarg);
+			break;
+		case 'r':
+			/* NOTE(review): the 1.024 factor presumably makes up
+			 * for the BPF side scaling the rate by 128 rather than
+			 * ~131 (1 Mbps in bytes per ns << 20) - confirm.
+			 */
+			minRate = atoi(optarg) * 1.024;
+			/* The work-conserving loop divides by the rate. */
+			if (minRate <= 0) {
+				fprintf(stderr,
+					"Option -r requires a positive rate.\n\n");
+				Usage();
+				return 1;
+			}
+			rate = minRate;
+			break;
+		case 's':
+			stats_flag = true;
+			break;
+		case 't':
+			dur = atoi(optarg);
+			break;
+		case 'w':
+			work_conserving_flag = true;
+			break;
+		case 'h':
+			Usage();
+			return 0;
+		case '?':
+			if (optopt == 'n' || optopt == 'r' || optopt == 't')
+				fprintf(stderr,
+					"Option -%c requires an argument.\n\n",
+					optopt);
+			/* fallthrough */
+		default:
+			/* Also reached for -i, which is accepted by
+			 * optstring but has no handler yet.
+			 */
+			Usage();
+			return 1;
+		}
+	}
+
+	if (optind < argc)
+		prog = argv[optind];
+	printf("HBM prog: %s\n", prog != NULL ? prog : "NULL");
+
+	return run_bpf_prog(prog, cg_id);
+}
diff --git a/samples/bpf/hbm.h b/samples/bpf/hbm.h
new file mode 100644
index 000000000000..518e8147d084
--- /dev/null
+++ b/samples/bpf/hbm.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (c) 2019 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Include file for Host Bandwidth Management (HBM) programs
+ */
+struct hbm_vqueue {
+ struct bpf_spin_lock lock; /* NOTE(review): presumably guards the fields below - confirm in the BPF program */
+ /* 4 byte hole */
+ unsigned long long lasttime; /* In ns; hbm_init_vqueue() sets it from bpf_ktime_get_ns() */
+ int credit; /* In bytes; hbm_init_vqueue() starts it at INIT_CREDIT */
+ unsigned int rate; /* In bytes per NS << 20; hbm_init_vqueue() stores its rate argument * 128 */
+};
+
+struct hbm_queue_stats {
+ unsigned long rate; /* in Mbps; written by hbm.c through the queue_stats map */
+ unsigned long stats:1, /* get HBM stats (marked, dropped,..) */
+ loopback:1; /* also limit flows using loopback */
+ unsigned long long pkts_marked; /* packets seen with the congestion or drop flag set */
+ unsigned long long bytes_marked; /* bytes of the packets counted in pkts_marked */
+ unsigned long long pkts_dropped; /* packets seen with the drop flag set */
+ unsigned long long bytes_dropped; /* bytes of the packets counted in pkts_dropped */
+ unsigned long long pkts_total; /* all packets seen while stats is set */
+ unsigned long long bytes_total; /* bytes of all packets; updated even when stats is 0 */
+ unsigned long long firstPacketTime; /* time of the first packet seen while stats is set */
+ unsigned long long lastPacketTime; /* time of the most recent packet seen while stats is set */
+};
diff --git a/samples/bpf/hbm_kern.h b/samples/bpf/hbm_kern.h
new file mode 100644
index 000000000000..c5635d924193
--- /dev/null
+++ b/samples/bpf/hbm_kern.h
@@ -0,0 +1,137 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (c) 2019 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Include file for sample Host Bandwidth Manager (HBM) BPF programs
+ */
+#define KBUILD_MODNAME "foo"
+#include <stddef.h>
+#include <stdbool.h>
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <uapi/linux/ipv6.h>
+#include <uapi/linux/in.h>
+#include <uapi/linux/tcp.h>
+#include <uapi/linux/filter.h>
+#include <uapi/linux/pkt_cls.h>
+#include <net/ipv6.h>
+#include <net/inet_ecn.h>
+#include "bpf_endian.h"
+#include "bpf_helpers.h"
+#include "hbm.h"
+
+#define DROP_PKT 0
+#define ALLOW_PKT 1
+#define TCP_ECN_OK 1
+
+#define HBM_DEBUG 0 // Set to 1 to enable debugging
+#if HBM_DEBUG
+#define bpf_printk(fmt, ...) \
+({ \
+ char ____fmt[] = fmt; \
+ bpf_trace_printk(____fmt, sizeof(____fmt), \
+ ##__VA_ARGS__); \
+})
+#else
+#define bpf_printk(fmt, ...)
+#endif
+
+#define INITIAL_CREDIT_PACKETS 100
+#define MAX_BYTES_PER_PACKET 1500
+#define MARK_THRESH (40 * MAX_BYTES_PER_PACKET)
+#define DROP_THRESH (80 * 5 * MAX_BYTES_PER_PACKET)
+#define LARGE_PKT_DROP_THRESH (DROP_THRESH - (15 * MAX_BYTES_PER_PACKET))
+#define MARK_REGION_SIZE (LARGE_PKT_DROP_THRESH - MARK_THRESH)
+#define LARGE_PKT_THRESH 120
+#define MAX_CREDIT (100 * MAX_BYTES_PER_PACKET)
+#define INIT_CREDIT (INITIAL_CREDIT_PACKETS * MAX_BYTES_PER_PACKET)
+
+// rate in bytes per ns << 20
+#define CREDIT_PER_NS(delta, rate) ((((u64)(delta)) * (rate)) >> 20)
+
+struct bpf_map_def SEC("maps") queue_state = {
+ .type = BPF_MAP_TYPE_CGROUP_STORAGE,
+ .key_size = sizeof(struct bpf_cgroup_storage_key),
+ .value_size = sizeof(struct hbm_vqueue),
+};
+BPF_ANNOTATE_KV_PAIR(queue_state, struct bpf_cgroup_storage_key,
+ struct hbm_vqueue);
+
+struct bpf_map_def SEC("maps") queue_stats = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(struct hbm_queue_stats),
+ .max_entries = 1,
+};
+BPF_ANNOTATE_KV_PAIR(queue_stats, int, struct hbm_queue_stats);
+
+struct hbm_pkt_info {
+ bool is_ip; /* true when the header's version field is 4 or 6 */
+ bool is_tcp; /* true when the IPv4 protocol / IPv6 nexthdr field is 6 */
+ short ecn; /* header ECN bits, masked with INET_ECN_MASK; 0 for non-IP */
+};
+
+/*
+ * Classify the packet in @skb and store the result in @pkti.
+ *
+ * The first 12 bytes of the packet are loaded as an IP header and the
+ * version field selects how they are interpreted:
+ *  - version 6: is_ip is set, is_tcp reflects nexthdr == 6 and ecn is
+ *    taken from bits 4-5 of flow_lbl[0]
+ *  - version 4: is_ip is set, is_tcp reflects protocol == 6 and ecn is
+ *    taken from the low bits of tos
+ *  - anything else: is_ip and is_tcp are false and ecn is 0
+ */
+static __always_inline void hbm_get_pkt_info(struct __sk_buff *skb,
+					     struct hbm_pkt_info *pkti)
+{
+	struct ipv6hdr *ip6h;
+	struct iphdr iph;
+
+	bpf_skb_load_bytes(skb, 0, &iph, 12);
+
+	switch (iph.version) {
+	case 6:
+		/* Reinterpret the same bytes as an IPv6 header. */
+		ip6h = (struct ipv6hdr *)&iph;
+		pkti->is_ip = true;
+		pkti->is_tcp = (ip6h->nexthdr == 6);
+		pkti->ecn = (ip6h->flow_lbl[0] >> 4) & INET_ECN_MASK;
+		break;
+	case 4:
+		pkti->is_ip = true;
+		pkti->is_tcp = (iph.protocol == 6);
+		pkti->ecn = iph.tos & INET_ECN_MASK;
+		break;
+	default:
+		pkti->is_ip = false;
+		pkti->is_tcp = false;
+		pkti->ecn = 0;
+		break;
+	}
+}
+
+/*
+ * Initialise the virtual queue state @qdp for the rate limit @rate.
+ *
+ * The timestamp is set to the current bpf_ktime_get_ns() value, the credit
+ * starts at INIT_CREDIT and the stored rate is @rate multiplied by 128.
+ */
+static __always_inline void hbm_init_vqueue(struct hbm_vqueue *qdp, int rate)
+{
+	int scaled_rate = rate * 128;
+
+	bpf_printk("Initializing queue_state, rate:%d\n", scaled_rate);
+	qdp->lasttime = bpf_ktime_get_ns();
+	qdp->credit = INIT_CREDIT;
+	qdp->rate = scaled_rate;
+}
+
+/*
+ * Account one packet of @len bytes in the statistics @qsp.
+ *
+ * Does nothing when @qsp is NULL. bytes_total is always updated (it is
+ * needed for work conserving); everything else is only updated when
+ * qsp->stats is set:
+ *  - firstPacketTime is set to @curtime while it is still 0, and
+ *    lastPacketTime is set to @curtime on every call
+ *  - pkts_total is incremented
+ *  - the marked counters grow when @congestion_flag or @drop_flag is set
+ *  - the dropped counters grow when @drop_flag is set
+ */
+static __always_inline void hbm_update_stats(struct hbm_queue_stats *qsp,
+					     int len,
+					     unsigned long long curtime,
+					     bool congestion_flag,
+					     bool drop_flag)
+{
+	if (qsp == NULL)
+		return;
+
+	// Following is needed for work conserving
+	__sync_add_and_fetch(&(qsp->bytes_total), len);
+
+	// The remaining statistics are optional
+	if (!qsp->stats)
+		return;
+
+	if (qsp->firstPacketTime == 0)
+		qsp->firstPacketTime = curtime;
+	qsp->lastPacketTime = curtime;
+	__sync_add_and_fetch(&(qsp->pkts_total), 1);
+
+	if (congestion_flag || drop_flag) {
+		__sync_add_and_fetch(&(qsp->pkts_marked), 1);
+		__sync_add_and_fetch(&(qsp->bytes_marked), len);
+	}
+
+	if (drop_flag) {
+		__sync_add_and_fetch(&(qsp->pkts_dropped), 1);
+		__sync_add_and_fetch(&(qsp->bytes_dropped),
+				     len);
+	}
+}
diff --git a/samples/bpf/hbm_out_kern.c b/samples/bpf/hbm_out_kern.c
new file mode 100644
index 000000000000..f806863d0b79
--- /dev/null
+++ b/samples/bpf/hbm_out_kern.c
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Sample Host Bandwidth Manager (HBM) BPF program.
+ *
+ * A cgroup skb BPF egress program to limit cgroup output bandwidth.
+ * It uses a modified virtual token bucket queue to limit average
+ * egress bandwidth. The implementation uses credits instead of tokens.
+ * Negative credits imply that queueing would have happened (this is
+ * a virtual queue, so no queueing is done by it). However, queueing may
+ * occur at the actual qdisc (which is not used for rate limiting).
+ *
+ * This implementation uses 3 thresholds, one to start marking packets and
+ * the other two to drop packets:
+ * CREDIT
+ * - <--------------------------|------------------------> +
+ * | | | 0
+ * | Large pkt |
+ * | drop thresh |
+ * Small pkt drop Mark threshold
+ * thresh
+ *
+ * The effect of marking depends on the type of packet:
+ * a) If the packet is ECN enabled and it is a TCP packet, then the packet
+ * is ECN marked.
+ * b) If the packet is a TCP packet, then we probabilistically call tcp_cwr
+ * to reduce the congestion window. The current implementation uses a linear
+ * distribution (0% probability at marking threshold, 100% probability
+ * at drop threshold).
+ * c) If the packet is not a TCP packet, then it is dropped.
+ *
+ * If the credit is below the drop threshold, the packet is dropped. If it
+ * is a TCP packet, then it also calls tcp_cwr since packets dropped by
+ * a cgroup skb BPF program do not automatically trigger a call to
+ * tcp_cwr in the current kernel code.
+ *
+ * This BPF program actually uses 2 drop thresholds, one threshold
+ * for larger packets (>= 120 bytes) and another for smaller packets. This
+ * protects smaller packets such as SYNs, ACKs, etc.
+ *
+ * The default bandwidth limit is set at 1Gbps but this can be changed by
+ * a user program through a shared BPF map. In addition, by default this BPF
+ * program does not limit connections using loopback. This behavior can be
+ * overridden by the user program. There is also an option to calculate
+ * some statistics, such as percent of packets marked or dropped, which
+ * the user program can access.
+ *
+ * A later patch provides such a program (hbm.c)
+ */
+
+#include "hbm_kern.h"
+
+SEC("cgroup_skb/egress")
+int _hbm_out_cg(struct __sk_buff *skb)
+{
+ struct hbm_pkt_info pkti;
+ int len = skb->len;
+ unsigned int queue_index = 0;
+ unsigned long long curtime;
+ int credit;
+ signed long long delta = 0, zero = 0;
+ int max_credit = MAX_CREDIT;
+ bool congestion_flag = false;
+ bool drop_flag = false;
+ bool cwr_flag = false;
+ struct hbm_vqueue *qdp;
+ struct hbm_queue_stats *qsp = NULL;
+ int rv = ALLOW_PKT;
+
+ qsp = bpf_map_lookup_elem(&queue_stats, &queue_index);
+ if (qsp != NULL && !qsp->loopback && (skb->ifindex == 1))
+ return ALLOW_PKT;
+
+ hbm_get_pkt_info(skb, &pkti);
+
+ // We may want to account for the length of headers in len
+ // calculation, like ETH header + overhead, especially if it
+ // is a gso packet. But I am not doing it right now.
+
+ qdp = bpf_get_local_storage(&queue_state, 0);
+ if (!qdp)
+ return ALLOW_PKT;
+ else if (qdp->lasttime == 0)
+ hbm_init_vqueue(qdp, 1024);
+
+ curtime = bpf_ktime_get_ns();
+
+ // Begin critical section
+ bpf_spin_lock(&qdp->lock);
+ credit = qdp->credit;
+ delta = curtime - qdp->lasttime;
+ /* delta < 0 implies that another process with a curtime greater
+ * than ours beat us to the critical section and already added
+ * the new credit, so we should not add it ourselves
+ */
+ if (delta > 0) {
+ qdp->lasttime = curtime;
+ credit += CREDIT_PER_NS(delta, qdp->rate);
+ if (credit > MAX_CREDIT)
+ credit = MAX_CREDIT;
+ }
+ credit -= len;
+ qdp->credit = credit;
+ bpf_spin_unlock(&qdp->lock);
+ // End critical section
+
+ // Check if we should update rate
+ if (qsp != NULL && (qsp->rate * 128) != qdp->rate) {
+ qdp->rate = qsp->rate * 128;
+ bpf_printk("Updating rate: %d (1sec:%llu bits)\n",
+ (int)qdp->rate,
+ CREDIT_PER_NS(1000000000, qdp->rate) * 8);
+ }
+
+ // Set flags (drop, congestion, cwr)
+ // Dropping => we are congested, so ignore congestion flag
+ if (credit < -DROP_THRESH ||
+ (len > LARGE_PKT_THRESH &&
+ credit < -LARGE_PKT_DROP_THRESH)) {
+ // Very congested, set drop flag
+ drop_flag = true;
+ } else if (credit < 0) {
+ // Congested, set congestion flag
+ if (pkti.ecn) {
+ if (credit < -MARK_THRESH)
+ congestion_flag = true;
+ else
+ congestion_flag = false;
+ } else {
+ congestion_flag = true;
+ }
+ }
+
+ if (congestion_flag) {
+ if (!bpf_skb_ecn_set_ce(skb)) {
+ if (len > LARGE_PKT_THRESH) {
+ // Problem if too many small packets?
+ drop_flag = true;
+ }
+ }
+ }
+
+ if (drop_flag)
+ rv = DROP_PKT;
+
+ hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag);
+
+ if (rv == DROP_PKT)
+ __sync_add_and_fetch(&(qdp->credit), len);
+
+ return rv;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/load_sock_ops.c b/samples/bpf/load_sock_ops.c
deleted file mode 100644
index 8ecb41ea0c03..000000000000
--- a/samples/bpf/load_sock_ops.c
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (c) 2017 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- */
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <linux/bpf.h>
-#include <bpf/bpf.h>
-#include "bpf_load.h"
-#include <unistd.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <linux/unistd.h>
-
-static void usage(char *pname)
-{
- printf("USAGE:\n %s [-l] <cg-path> <prog filename>\n", pname);
- printf("\tLoad and attach a sock_ops program to the specified "
- "cgroup\n");
- printf("\tIf \"-l\" is used, the program will continue to run\n");
- printf("\tprinting the BPF log buffer\n");
- printf("\tIf the specified filename does not end in \".o\", it\n");
- printf("\tappends \"_kern.o\" to the name\n");
- printf("\n");
- printf(" %s -r <cg-path>\n", pname);
- printf("\tDetaches the currently attached sock_ops program\n");
- printf("\tfrom the specified cgroup\n");
- printf("\n");
- exit(1);
-}
-
-int main(int argc, char **argv)
-{
- int logFlag = 0;
- int error = 0;
- char *cg_path;
- char fn[500];
- char *prog;
- int cg_fd;
-
- if (argc < 3)
- usage(argv[0]);
-
- if (!strcmp(argv[1], "-r")) {
- cg_path = argv[2];
- cg_fd = open(cg_path, O_DIRECTORY, O_RDONLY);
- error = bpf_prog_detach(cg_fd, BPF_CGROUP_SOCK_OPS);
- if (error) {
- printf("ERROR: bpf_prog_detach: %d (%s)\n",
- error, strerror(errno));
- return 2;
- }
- return 0;
- } else if (!strcmp(argv[1], "-h")) {
- usage(argv[0]);
- } else if (!strcmp(argv[1], "-l")) {
- logFlag = 1;
- if (argc < 4)
- usage(argv[0]);
- }
-
- prog = argv[argc - 1];
- cg_path = argv[argc - 2];
- if (strlen(prog) > 480) {
- fprintf(stderr, "ERROR: program name too long (> 480 chars)\n");
- return 3;
- }
- cg_fd = open(cg_path, O_DIRECTORY, O_RDONLY);
-
- if (!strcmp(prog + strlen(prog)-2, ".o"))
- strcpy(fn, prog);
- else
- sprintf(fn, "%s_kern.o", prog);
- if (logFlag)
- printf("loading bpf file:%s\n", fn);
- if (load_bpf_file(fn)) {
- printf("ERROR: load_bpf_file failed for: %s\n", fn);
- printf("%s", bpf_log_buf);
- return 4;
- }
- if (logFlag)
- printf("TCP BPF Loaded %s\n", fn);
-
- error = bpf_prog_attach(prog_fd[0], cg_fd, BPF_CGROUP_SOCK_OPS, 0);
- if (error) {
- printf("ERROR: bpf_prog_attach: %d (%s)\n",
- error, strerror(errno));
- return 5;
- } else if (logFlag) {
- read_trace_pipe();
- }
-
- return error;
-}
diff --git a/samples/bpf/sock_example.c b/samples/bpf/sock_example.c
index 60ec467c78ab..00aae1d33fca 100644
--- a/samples/bpf/sock_example.c
+++ b/samples/bpf/sock_example.c
@@ -99,7 +99,7 @@ int main(void)
{
FILE *f;
- f = popen("ping -c5 localhost", "r");
+ f = popen("ping -4 -c5 localhost", "r");
(void)f;
return test_sock();
diff --git a/samples/bpf/sockex1_user.c b/samples/bpf/sockex1_user.c
index 93ec01c56104..7f90796ae15a 100644
--- a/samples/bpf/sockex1_user.c
+++ b/samples/bpf/sockex1_user.c
@@ -3,30 +3,33 @@
#include <assert.h>
#include <linux/bpf.h>
#include <bpf/bpf.h>
-#include "bpf_load.h"
+#include "bpf/libbpf.h"
#include "sock_example.h"
#include <unistd.h>
#include <arpa/inet.h>
int main(int ac, char **argv)
{
+ struct bpf_object *obj;
+ int map_fd, prog_fd;
char filename[256];
- FILE *f;
int i, sock;
+ FILE *f;
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
- if (load_bpf_file(filename)) {
- printf("%s", bpf_log_buf);
+ if (bpf_prog_load(filename, BPF_PROG_TYPE_SOCKET_FILTER,
+ &obj, &prog_fd))
return 1;
- }
+
+ map_fd = bpf_object__find_map_fd_by_name(obj, "my_map");
sock = open_raw_sock("lo");
- assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, prog_fd,
- sizeof(prog_fd[0])) == 0);
+ assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd,
+ sizeof(prog_fd)) == 0);
- f = popen("ping -c5 localhost", "r");
+ f = popen("ping -4 -c5 localhost", "r");
(void) f;
for (i = 0; i < 5; i++) {
@@ -34,13 +37,13 @@ int main(int ac, char **argv)
int key;
key = IPPROTO_TCP;
- assert(bpf_map_lookup_elem(map_fd[0], &key, &tcp_cnt) == 0);
+ assert(bpf_map_lookup_elem(map_fd, &key, &tcp_cnt) == 0);
key = IPPROTO_UDP;
- assert(bpf_map_lookup_elem(map_fd[0], &key, &udp_cnt) == 0);
+ assert(bpf_map_lookup_elem(map_fd, &key, &udp_cnt) == 0);
key = IPPROTO_ICMP;
- assert(bpf_map_lookup_elem(map_fd[0], &key, &icmp_cnt) == 0);
+ assert(bpf_map_lookup_elem(map_fd, &key, &icmp_cnt) == 0);
printf("TCP %lld UDP %lld ICMP %lld bytes\n",
tcp_cnt, udp_cnt, icmp_cnt);
diff --git a/samples/bpf/sockex2_user.c b/samples/bpf/sockex2_user.c
index 1d5c6e9a6d27..bc257333ad92 100644
--- a/samples/bpf/sockex2_user.c
+++ b/samples/bpf/sockex2_user.c
@@ -3,7 +3,7 @@
#include <assert.h>
#include <linux/bpf.h>
#include <bpf/bpf.h>
-#include "bpf_load.h"
+#include "bpf/libbpf.h"
#include "sock_example.h"
#include <unistd.h>
#include <arpa/inet.h>
@@ -17,32 +17,35 @@ struct pair {
int main(int ac, char **argv)
{
struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ struct bpf_object *obj;
+ int map_fd, prog_fd;
char filename[256];
- FILE *f;
int i, sock;
+ FILE *f;
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
setrlimit(RLIMIT_MEMLOCK, &r);
- if (load_bpf_file(filename)) {
- printf("%s", bpf_log_buf);
+ if (bpf_prog_load(filename, BPF_PROG_TYPE_SOCKET_FILTER,
+ &obj, &prog_fd))
return 1;
- }
+
+ map_fd = bpf_object__find_map_fd_by_name(obj, "hash_map");
sock = open_raw_sock("lo");
- assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, prog_fd,
- sizeof(prog_fd[0])) == 0);
+ assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd,
+ sizeof(prog_fd)) == 0);
- f = popen("ping -c5 localhost", "r");
+ f = popen("ping -4 -c5 localhost", "r");
(void) f;
for (i = 0; i < 5; i++) {
int key = 0, next_key;
struct pair value;
- while (bpf_map_get_next_key(map_fd[0], &key, &next_key) == 0) {
- bpf_map_lookup_elem(map_fd[0], &next_key, &value);
+ while (bpf_map_get_next_key(map_fd, &key, &next_key) == 0) {
+ bpf_map_lookup_elem(map_fd, &next_key, &value);
printf("ip %s bytes %lld packets %lld\n",
inet_ntoa((struct in_addr){htonl(next_key)}),
value.bytes, value.packets);
diff --git a/samples/bpf/sockex3_user.c b/samples/bpf/sockex3_user.c
index 9d02e0404719..bbb1cd0666a9 100644
--- a/samples/bpf/sockex3_user.c
+++ b/samples/bpf/sockex3_user.c
@@ -58,7 +58,7 @@ int main(int argc, char **argv)
sizeof(__u32)) == 0);
if (argc > 1)
- f = popen("ping -c5 localhost", "r");
+ f = popen("ping -4 -c5 localhost", "r");
else
f = popen("netperf -l 4 localhost", "r");
(void) f;
diff --git a/samples/bpf/task_fd_query_kern.c b/samples/bpf/task_fd_query_kern.c
index f4b0a9ea674d..fb56fc2a3e5d 100644
--- a/samples/bpf/task_fd_query_kern.c
+++ b/samples/bpf/task_fd_query_kern.c
@@ -4,7 +4,7 @@
#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"
-SEC("kprobe/blk_start_request")
+SEC("kprobe/blk_mq_start_request")
int bpf_prog1(struct pt_regs *ctx)
{
return 0;
diff --git a/samples/bpf/task_fd_query_user.c b/samples/bpf/task_fd_query_user.c
index 8381d792f138..aff2b4ae914e 100644
--- a/samples/bpf/task_fd_query_user.c
+++ b/samples/bpf/task_fd_query_user.c
@@ -311,7 +311,7 @@ int main(int argc, char **argv)
}
/* test two functions in the corresponding *_kern.c file */
- CHECK_AND_RET(test_debug_fs_kprobe(0, "blk_start_request",
+ CHECK_AND_RET(test_debug_fs_kprobe(0, "blk_mq_start_request",
BPF_FD_TYPE_KPROBE));
CHECK_AND_RET(test_debug_fs_kprobe(1, "blk_account_io_completion",
BPF_FD_TYPE_KRETPROBE));
diff --git a/samples/bpf/tcp_basertt_kern.c b/samples/bpf/tcp_basertt_kern.c
index 4bf4fc597db9..6ef1625e8b2c 100644
--- a/samples/bpf/tcp_basertt_kern.c
+++ b/samples/bpf/tcp_basertt_kern.c
@@ -7,7 +7,7 @@
* BPF program to set base_rtt to 80us when host is running TCP-NV and
* both hosts are in the same datacenter (as determined by IPv6 prefix).
*
- * Use load_sock_ops to load this BPF program.
+ * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program.
*/
#include <uapi/linux/bpf.h>
diff --git a/samples/bpf/tcp_bpf.readme b/samples/bpf/tcp_bpf.readme
index 831fb601e3c9..fee746621aec 100644
--- a/samples/bpf/tcp_bpf.readme
+++ b/samples/bpf/tcp_bpf.readme
@@ -8,14 +8,16 @@ a cgroupv2 and attach a bash shell to the group.
bash
echo $$ >> /tmp/cgroupv2/foo/cgroup.procs
-Anything that runs under this shell belongs to the foo cgroupv2 To load
+Anything that runs under this shell belongs to the foo cgroupv2. To load
(attach) one of the tcp_*_kern.o programs:
- ./load_sock_ops -l /tmp/cgroupv2/foo tcp_basertt_kern.o
+ bpftool prog load tcp_basertt_kern.o /sys/fs/bpf/tcp_prog
+ bpftool cgroup attach /tmp/cgroupv2/foo sock_ops pinned /sys/fs/bpf/tcp_prog
+ bpftool prog tracelog
-If the "-l" flag is used, the load_sock_ops program will continue to run
-printing the BPF log buffer. The tcp_*_kern.o programs use special print
-functions to print logging information (if enabled by the ifdef).
+"bpftool prog tracelog" will continue to run printing the BPF log buffer.
+The tcp_*_kern.o programs use special print functions to print logging
+information (if enabled by the ifdef).
If using netperf/netserver to create traffic, you need to run them under the
cgroupv2 to which the BPF programs are attached (i.e. under bash shell
@@ -23,4 +25,4 @@ attached to the cgroupv2).
To remove (unattach) a socket_ops BPF program from a cgroupv2:
- ./load_sock_ops -r /tmp/cgroupv2/foo
+ bpftool cgroup detach /tmp/cgroupv2/foo sock_ops pinned /sys/fs/bpf/tcp_prog
diff --git a/samples/bpf/tcp_bufs_kern.c b/samples/bpf/tcp_bufs_kern.c
index 0566b7fa38a1..e03e204739fa 100644
--- a/samples/bpf/tcp_bufs_kern.c
+++ b/samples/bpf/tcp_bufs_kern.c
@@ -9,7 +9,7 @@
* doing appropriate checks that indicate the hosts are far enough
* away (i.e. large RTT).
*
- * Use load_sock_ops to load this BPF program.
+ * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program.
*/
#include <uapi/linux/bpf.h>
diff --git a/samples/bpf/tcp_clamp_kern.c b/samples/bpf/tcp_clamp_kern.c
index f4225c9d2c0c..a0dc2d254aca 100644
--- a/samples/bpf/tcp_clamp_kern.c
+++ b/samples/bpf/tcp_clamp_kern.c
@@ -9,7 +9,7 @@
* the same datacenter. For his example, we assume they are within the same
* datacenter when the first 5.5 bytes of their IPv6 addresses are the same.
*
- * Use load_sock_ops to load this BPF program.
+ * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program.
*/
#include <uapi/linux/bpf.h>
diff --git a/samples/bpf/tcp_cong_kern.c b/samples/bpf/tcp_cong_kern.c
index ad0f1ba8206a..4fd3ca979a06 100644
--- a/samples/bpf/tcp_cong_kern.c
+++ b/samples/bpf/tcp_cong_kern.c
@@ -7,7 +7,7 @@
* BPF program to set congestion control to dctcp when both hosts are
* in the same datacenter (as deteremined by IPv6 prefix).
*
- * Use load_sock_ops to load this BPF program.
+ * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program.
*/
#include <uapi/linux/bpf.h>
diff --git a/samples/bpf/tcp_iw_kern.c b/samples/bpf/tcp_iw_kern.c
index 4ca5ecc9f580..9b139ec69560 100644
--- a/samples/bpf/tcp_iw_kern.c
+++ b/samples/bpf/tcp_iw_kern.c
@@ -9,7 +9,7 @@
* would usually be done after doing appropriate checks that indicate
* the hosts are far enough away (i.e. large RTT).
*
- * Use load_sock_ops to load this BPF program.
+ * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program.
*/
#include <uapi/linux/bpf.h>
diff --git a/samples/bpf/tcp_rwnd_kern.c b/samples/bpf/tcp_rwnd_kern.c
index 09ff65b40b31..cc71ee96e044 100644
--- a/samples/bpf/tcp_rwnd_kern.c
+++ b/samples/bpf/tcp_rwnd_kern.c
@@ -8,7 +8,7 @@
* and the first 5.5 bytes of the IPv6 addresses are not the same (in this
* example that means both hosts are not the same datacenter).
*
- * Use load_sock_ops to load this BPF program.
+ * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program.
*/
#include <uapi/linux/bpf.h>
diff --git a/samples/bpf/tcp_synrto_kern.c b/samples/bpf/tcp_synrto_kern.c
index 232bb242823e..ca87ed34f896 100644
--- a/samples/bpf/tcp_synrto_kern.c
+++ b/samples/bpf/tcp_synrto_kern.c
@@ -8,7 +8,7 @@
* and the first 5.5 bytes of the IPv6 addresses are the same (in this example
* that means both hosts are in the same datacenter).
*
- * Use load_sock_ops to load this BPF program.
+ * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program.
*/
#include <uapi/linux/bpf.h>
diff --git a/samples/bpf/tcp_tos_reflect_kern.c b/samples/bpf/tcp_tos_reflect_kern.c
index d51dab19eca6..de788be6f862 100644
--- a/samples/bpf/tcp_tos_reflect_kern.c
+++ b/samples/bpf/tcp_tos_reflect_kern.c
@@ -4,7 +4,7 @@
*
* BPF program to automatically reflect TOS option from received syn packet
*
- * Use load_sock_ops to load this BPF program.
+ * Use "bpftool cgroup attach $cg sock_ops $prog" to load this BPF program.
*/
#include <uapi/linux/bpf.h>
diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c
index 1a81e6a5c2ea..c9544a4ce61a 100644
--- a/samples/bpf/tracex2_user.c
+++ b/samples/bpf/tracex2_user.c
@@ -131,7 +131,7 @@ int main(int ac, char **argv)
signal(SIGTERM, int_exit);
/* start 'ping' in the background to have some kfree_skb events */
- f = popen("ping -c5 localhost", "r");
+ f = popen("ping -4 -c5 localhost", "r");
(void) f;
/* start 'dd' in the background to have plenty of 'write' syscalls */
diff --git a/samples/bpf/tracex3_kern.c b/samples/bpf/tracex3_kern.c
index 9974c3d7c18b..ea1d4c19c132 100644
--- a/samples/bpf/tracex3_kern.c
+++ b/samples/bpf/tracex3_kern.c
@@ -20,7 +20,7 @@ struct bpf_map_def SEC("maps") my_map = {
/* kprobe is NOT a stable ABI. If kernel internals change this bpf+kprobe
* example will no longer be meaningful
*/
-SEC("kprobe/blk_start_request")
+SEC("kprobe/blk_mq_start_request")
int bpf_prog1(struct pt_regs *ctx)
{
long rq = PT_REGS_PARM1(ctx);
diff --git a/samples/bpf/xdp_redirect_map_user.c b/samples/bpf/xdp_redirect_map_user.c
index 327226be5a06..1dbe7fd3a1a8 100644
--- a/samples/bpf/xdp_redirect_map_user.c
+++ b/samples/bpf/xdp_redirect_map_user.c
@@ -57,7 +57,7 @@ static void int_exit(int sig)
printf("bpf_get_link_xdp_id failed\n");
exit(1);
}
- if (prog_id == curr_prog_id)
+ if (dummy_prog_id == curr_prog_id)
bpf_set_link_xdp_fd(ifindex_out, -1, xdp_flags);
else if (!curr_prog_id)
printf("couldn't find a prog id on iface OUT\n");
diff --git a/samples/bpf/xdp_redirect_user.c b/samples/bpf/xdp_redirect_user.c
index a5d8ad3129ed..e9054c0269ff 100644
--- a/samples/bpf/xdp_redirect_user.c
+++ b/samples/bpf/xdp_redirect_user.c
@@ -57,7 +57,7 @@ static void int_exit(int sig)
printf("bpf_get_link_xdp_id failed\n");
exit(1);
}
- if (prog_id == curr_prog_id)
+ if (dummy_prog_id == curr_prog_id)
bpf_set_link_xdp_fd(ifindex_out, -1, xdp_flags);
else if (!curr_prog_id)
printf("couldn't find a prog id on iface OUT\n");
diff --git a/samples/bpf/xdpsock.h b/samples/bpf/xdpsock.h
deleted file mode 100644
index 533ab81adfa1..000000000000
--- a/samples/bpf/xdpsock.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef XDPSOCK_H_
-#define XDPSOCK_H_
-
-/* Power-of-2 number of sockets */
-#define MAX_SOCKS 4
-
-/* Round-robin receive */
-#define RR_LB 0
-
-#endif /* XDPSOCK_H_ */
diff --git a/samples/bpf/xdpsock_kern.c b/samples/bpf/xdpsock_kern.c
deleted file mode 100644
index b8ccd0802b3f..000000000000
--- a/samples/bpf/xdpsock_kern.c
+++ /dev/null
@@ -1,56 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#define KBUILD_MODNAME "foo"
-#include <uapi/linux/bpf.h>
-#include "bpf_helpers.h"
-
-#include "xdpsock.h"
-
-struct bpf_map_def SEC("maps") qidconf_map = {
- .type = BPF_MAP_TYPE_ARRAY,
- .key_size = sizeof(int),
- .value_size = sizeof(int),
- .max_entries = 1,
-};
-
-struct bpf_map_def SEC("maps") xsks_map = {
- .type = BPF_MAP_TYPE_XSKMAP,
- .key_size = sizeof(int),
- .value_size = sizeof(int),
- .max_entries = MAX_SOCKS,
-};
-
-struct bpf_map_def SEC("maps") rr_map = {
- .type = BPF_MAP_TYPE_PERCPU_ARRAY,
- .key_size = sizeof(int),
- .value_size = sizeof(unsigned int),
- .max_entries = 1,
-};
-
-SEC("xdp_sock")
-int xdp_sock_prog(struct xdp_md *ctx)
-{
- int *qidconf, key = 0, idx;
- unsigned int *rr;
-
- qidconf = bpf_map_lookup_elem(&qidconf_map, &key);
- if (!qidconf)
- return XDP_ABORTED;
-
- if (*qidconf != ctx->rx_queue_index)
- return XDP_PASS;
-
-#if RR_LB /* NB! RR_LB is configured in xdpsock.h */
- rr = bpf_map_lookup_elem(&rr_map, &key);
- if (!rr)
- return XDP_ABORTED;
-
- *rr = (*rr + 1) & (MAX_SOCKS - 1);
- idx = *rr;
-#else
- idx = 0;
-#endif
-
- return bpf_redirect_map(&xsks_map, idx, 0);
-}
-
-char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c
index f73055e0191f..d08ee1ab7bb4 100644
--- a/samples/bpf/xdpsock_user.c
+++ b/samples/bpf/xdpsock_user.c
@@ -1,37 +1,36 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2017 - 2018 Intel Corporation. */
-#include <assert.h>
+#include <asm/barrier.h>
#include <errno.h>
#include <getopt.h>
#include <libgen.h>
#include <linux/bpf.h>
+#include <linux/compiler.h>
#include <linux/if_link.h>
#include <linux/if_xdp.h>
#include <linux/if_ether.h>
+#include <locale.h>
+#include <net/ethernet.h>
#include <net/if.h>
+#include <poll.h>
+#include <pthread.h>
#include <signal.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
-#include <net/ethernet.h>
+#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/socket.h>
-#include <sys/mman.h>
+#include <sys/types.h>
#include <time.h>
#include <unistd.h>
-#include <pthread.h>
-#include <locale.h>
-#include <sys/types.h>
-#include <poll.h>
#include "bpf/libbpf.h"
-#include "bpf_util.h"
+#include "bpf/xsk.h"
#include <bpf/bpf.h>
-#include "xdpsock.h"
-
#ifndef SOL_XDP
#define SOL_XDP 283
#endif
@@ -44,17 +43,11 @@
#define PF_XDP AF_XDP
#endif
-#define NUM_FRAMES 131072
-#define FRAME_HEADROOM 0
-#define FRAME_SHIFT 11
-#define FRAME_SIZE 2048
-#define NUM_DESCS 1024
-#define BATCH_SIZE 16
-
-#define FQ_NUM_DESCS 1024
-#define CQ_NUM_DESCS 1024
+#define NUM_FRAMES (4 * 1024)
+#define BATCH_SIZE 64
#define DEBUG_HEXDUMP 0
+#define MAX_SOCKS 8
typedef __u64 u64;
typedef __u32 u32;
@@ -73,54 +66,31 @@ static const char *opt_if = "";
static int opt_ifindex;
static int opt_queue;
static int opt_poll;
-static int opt_shared_packet_buffer;
static int opt_interval = 1;
static u32 opt_xdp_bind_flags;
static __u32 prog_id;
-struct xdp_umem_uqueue {
- u32 cached_prod;
- u32 cached_cons;
- u32 mask;
- u32 size;
- u32 *producer;
- u32 *consumer;
- u64 *ring;
- void *map;
+struct xsk_umem_info {
+ struct xsk_ring_prod fq;
+ struct xsk_ring_cons cq;
+ struct xsk_umem *umem;
+ void *buffer;
};
-struct xdp_umem {
- char *frames;
- struct xdp_umem_uqueue fq;
- struct xdp_umem_uqueue cq;
- int fd;
-};
-
-struct xdp_uqueue {
- u32 cached_prod;
- u32 cached_cons;
- u32 mask;
- u32 size;
- u32 *producer;
- u32 *consumer;
- struct xdp_desc *ring;
- void *map;
-};
-
-struct xdpsock {
- struct xdp_uqueue rx;
- struct xdp_uqueue tx;
- int sfd;
- struct xdp_umem *umem;
- u32 outstanding_tx;
+struct xsk_socket_info {
+ struct xsk_ring_cons rx;
+ struct xsk_ring_prod tx;
+ struct xsk_umem_info *umem;
+ struct xsk_socket *xsk;
unsigned long rx_npkts;
unsigned long tx_npkts;
unsigned long prev_rx_npkts;
unsigned long prev_tx_npkts;
+ u32 outstanding_tx;
};
static int num_socks;
-struct xdpsock *xsks[MAX_SOCKS];
+struct xsk_socket_info *xsks[MAX_SOCKS];
static unsigned long get_nsecs(void)
{
@@ -130,225 +100,124 @@ static unsigned long get_nsecs(void)
return ts.tv_sec * 1000000000UL + ts.tv_nsec;
}
-static void dump_stats(void);
-
-#define lassert(expr) \
- do { \
- if (!(expr)) { \
- fprintf(stderr, "%s:%s:%i: Assertion failed: " \
- #expr ": errno: %d/\"%s\"\n", \
- __FILE__, __func__, __LINE__, \
- errno, strerror(errno)); \
- dump_stats(); \
- exit(EXIT_FAILURE); \
- } \
- } while (0)
-
-#define barrier() __asm__ __volatile__("": : :"memory")
-#ifdef __aarch64__
-#define u_smp_rmb() __asm__ __volatile__("dmb ishld": : :"memory")
-#define u_smp_wmb() __asm__ __volatile__("dmb ishst": : :"memory")
-#else
-#define u_smp_rmb() barrier()
-#define u_smp_wmb() barrier()
-#endif
-#define likely(x) __builtin_expect(!!(x), 1)
-#define unlikely(x) __builtin_expect(!!(x), 0)
-
-static const char pkt_data[] =
- "\x3c\xfd\xfe\x9e\x7f\x71\xec\xb1\xd7\x98\x3a\xc0\x08\x00\x45\x00"
- "\x00\x2e\x00\x00\x00\x00\x40\x11\x88\x97\x05\x08\x07\x08\xc8\x14"
- "\x1e\x04\x10\x92\x10\x92\x00\x1a\x6d\xa3\x34\x33\x1f\x69\x40\x6b"
- "\x54\x59\xb6\x14\x2d\x11\x44\xbf\xaf\xd9\xbe\xaa";
-
-static inline u32 umem_nb_free(struct xdp_umem_uqueue *q, u32 nb)
-{
- u32 free_entries = q->cached_cons - q->cached_prod;
-
- if (free_entries >= nb)
- return free_entries;
-
- /* Refresh the local tail pointer */
- q->cached_cons = *q->consumer + q->size;
-
- return q->cached_cons - q->cached_prod;
-}
-
-static inline u32 xq_nb_free(struct xdp_uqueue *q, u32 ndescs)
+static void print_benchmark(bool running)
{
- u32 free_entries = q->cached_cons - q->cached_prod;
+ const char *bench_str = "INVALID";
- if (free_entries >= ndescs)
- return free_entries;
+ if (opt_bench == BENCH_RXDROP)
+ bench_str = "rxdrop";
+ else if (opt_bench == BENCH_TXONLY)
+ bench_str = "txonly";
+ else if (opt_bench == BENCH_L2FWD)
+ bench_str = "l2fwd";
- /* Refresh the local tail pointer */
- q->cached_cons = *q->consumer + q->size;
- return q->cached_cons - q->cached_prod;
-}
+ printf("%s:%d %s ", opt_if, opt_queue, bench_str);
+ if (opt_xdp_flags & XDP_FLAGS_SKB_MODE)
+ printf("xdp-skb ");
+ else if (opt_xdp_flags & XDP_FLAGS_DRV_MODE)
+ printf("xdp-drv ");
+ else
+ printf(" ");
-static inline u32 umem_nb_avail(struct xdp_umem_uqueue *q, u32 nb)
-{
- u32 entries = q->cached_prod - q->cached_cons;
+ if (opt_poll)
+ printf("poll() ");
- if (entries == 0) {
- q->cached_prod = *q->producer;
- entries = q->cached_prod - q->cached_cons;
+ if (running) {
+ printf("running...");
+ fflush(stdout);
}
-
- return (entries > nb) ? nb : entries;
}
-static inline u32 xq_nb_avail(struct xdp_uqueue *q, u32 ndescs)
+static void dump_stats(void)
{
- u32 entries = q->cached_prod - q->cached_cons;
+ unsigned long now = get_nsecs();
+ long dt = now - prev_time;
+ int i;
- if (entries == 0) {
- q->cached_prod = *q->producer;
- entries = q->cached_prod - q->cached_cons;
- }
+ prev_time = now;
- return (entries > ndescs) ? ndescs : entries;
-}
+ for (i = 0; i < num_socks && xsks[i]; i++) {
+ char *fmt = "%-15s %'-11.0f %'-11lu\n";
+ double rx_pps, tx_pps;
-static inline int umem_fill_to_kernel_ex(struct xdp_umem_uqueue *fq,
- struct xdp_desc *d,
- size_t nb)
-{
- u32 i;
+ rx_pps = (xsks[i]->rx_npkts - xsks[i]->prev_rx_npkts) *
+ 1000000000. / dt;
+ tx_pps = (xsks[i]->tx_npkts - xsks[i]->prev_tx_npkts) *
+ 1000000000. / dt;
- if (umem_nb_free(fq, nb) < nb)
- return -ENOSPC;
+ printf("\n sock%d@", i);
+ print_benchmark(false);
+ printf("\n");
- for (i = 0; i < nb; i++) {
- u32 idx = fq->cached_prod++ & fq->mask;
+ printf("%-15s %-11s %-11s %-11.2f\n", "", "pps", "pkts",
+ dt / 1000000000.);
+ printf(fmt, "rx", rx_pps, xsks[i]->rx_npkts);
+ printf(fmt, "tx", tx_pps, xsks[i]->tx_npkts);
- fq->ring[idx] = d[i].addr;
+ xsks[i]->prev_rx_npkts = xsks[i]->rx_npkts;
+ xsks[i]->prev_tx_npkts = xsks[i]->tx_npkts;
}
-
- u_smp_wmb();
-
- *fq->producer = fq->cached_prod;
-
- return 0;
}
-static inline int umem_fill_to_kernel(struct xdp_umem_uqueue *fq, u64 *d,
- size_t nb)
+static void *poller(void *arg)
{
- u32 i;
-
- if (umem_nb_free(fq, nb) < nb)
- return -ENOSPC;
-
- for (i = 0; i < nb; i++) {
- u32 idx = fq->cached_prod++ & fq->mask;
-
- fq->ring[idx] = d[i];
+ (void)arg;
+ for (;;) {
+ sleep(opt_interval);
+ dump_stats();
}
- u_smp_wmb();
-
- *fq->producer = fq->cached_prod;
-
- return 0;
+ return NULL;
}
-static inline size_t umem_complete_from_kernel(struct xdp_umem_uqueue *cq,
- u64 *d, size_t nb)
+static void remove_xdp_program(void)
{
- u32 idx, i, entries = umem_nb_avail(cq, nb);
-
- u_smp_rmb();
-
- for (i = 0; i < entries; i++) {
- idx = cq->cached_cons++ & cq->mask;
- d[i] = cq->ring[idx];
- }
-
- if (entries > 0) {
- u_smp_wmb();
+ __u32 curr_prog_id = 0;
- *cq->consumer = cq->cached_cons;
+ if (bpf_get_link_xdp_id(opt_ifindex, &curr_prog_id, opt_xdp_flags)) {
+ printf("bpf_get_link_xdp_id failed\n");
+ exit(EXIT_FAILURE);
}
-
- return entries;
-}
-
-static inline void *xq_get_data(struct xdpsock *xsk, u64 addr)
-{
- return &xsk->umem->frames[addr];
+ if (prog_id == curr_prog_id)
+ bpf_set_link_xdp_fd(opt_ifindex, -1, opt_xdp_flags);
+ else if (!curr_prog_id)
+ printf("couldn't find a prog id on a given interface\n");
+ else
+ printf("program on interface changed, not removing\n");
}
-static inline int xq_enq(struct xdp_uqueue *uq,
- const struct xdp_desc *descs,
- unsigned int ndescs)
+static void int_exit(int sig)
{
- struct xdp_desc *r = uq->ring;
- unsigned int i;
+ struct xsk_umem *umem = xsks[0]->umem->umem;
- if (xq_nb_free(uq, ndescs) < ndescs)
- return -ENOSPC;
-
- for (i = 0; i < ndescs; i++) {
- u32 idx = uq->cached_prod++ & uq->mask;
-
- r[idx].addr = descs[i].addr;
- r[idx].len = descs[i].len;
- }
+ (void)sig;
- u_smp_wmb();
+ dump_stats();
+ xsk_socket__delete(xsks[0]->xsk);
+ (void)xsk_umem__delete(umem);
+ remove_xdp_program();
- *uq->producer = uq->cached_prod;
- return 0;
+ exit(EXIT_SUCCESS);
}
-static inline int xq_enq_tx_only(struct xdp_uqueue *uq,
- unsigned int id, unsigned int ndescs)
+static void __exit_with_error(int error, const char *file, const char *func,
+ int line)
{
- struct xdp_desc *r = uq->ring;
- unsigned int i;
-
- if (xq_nb_free(uq, ndescs) < ndescs)
- return -ENOSPC;
-
- for (i = 0; i < ndescs; i++) {
- u32 idx = uq->cached_prod++ & uq->mask;
-
- r[idx].addr = (id + i) << FRAME_SHIFT;
- r[idx].len = sizeof(pkt_data) - 1;
- }
-
- u_smp_wmb();
-
- *uq->producer = uq->cached_prod;
- return 0;
+ fprintf(stderr, "%s:%s:%i: errno: %d/\"%s\"\n", file, func,
+ line, error, strerror(error));
+ dump_stats();
+ remove_xdp_program();
+ exit(EXIT_FAILURE);
}
-static inline int xq_deq(struct xdp_uqueue *uq,
- struct xdp_desc *descs,
- int ndescs)
-{
- struct xdp_desc *r = uq->ring;
- unsigned int idx;
- int i, entries;
-
- entries = xq_nb_avail(uq, ndescs);
-
- u_smp_rmb();
-
- for (i = 0; i < entries; i++) {
- idx = uq->cached_cons++ & uq->mask;
- descs[i] = r[idx];
- }
-
- if (entries > 0) {
- u_smp_wmb();
+#define exit_with_error(error) __exit_with_error(error, __FILE__, __func__, \
+ __LINE__)
- *uq->consumer = uq->cached_cons;
- }
-
- return entries;
-}
+static const char pkt_data[] =
+ "\x3c\xfd\xfe\x9e\x7f\x71\xec\xb1\xd7\x98\x3a\xc0\x08\x00\x45\x00"
+ "\x00\x2e\x00\x00\x00\x00\x40\x11\x88\x97\x05\x08\x07\x08\xc8\x14"
+ "\x1e\x04\x10\x92\x10\x92\x00\x1a\x6d\xa3\x34\x33\x1f\x69\x40\x6b"
+ "\x54\x59\xb6\x14\x2d\x11\x44\xbf\xaf\xd9\xbe\xaa";
static void swap_mac_addresses(void *data)
{
@@ -397,258 +266,74 @@ static void hex_dump(void *pkt, size_t length, u64 addr)
printf("\n");
}
-static size_t gen_eth_frame(char *frame)
+static size_t gen_eth_frame(struct xsk_umem_info *umem, u64 addr)
{
- memcpy(frame, pkt_data, sizeof(pkt_data) - 1);
+ memcpy(xsk_umem__get_data(umem->buffer, addr), pkt_data,
+ sizeof(pkt_data) - 1);
return sizeof(pkt_data) - 1;
}
-static struct xdp_umem *xdp_umem_configure(int sfd)
+static struct xsk_umem_info *xsk_configure_umem(void *buffer, u64 size)
{
- int fq_size = FQ_NUM_DESCS, cq_size = CQ_NUM_DESCS;
- struct xdp_mmap_offsets off;
- struct xdp_umem_reg mr;
- struct xdp_umem *umem;
- socklen_t optlen;
- void *bufs;
+ struct xsk_umem_info *umem;
+ int ret;
umem = calloc(1, sizeof(*umem));
- lassert(umem);
-
- lassert(posix_memalign(&bufs, getpagesize(), /* PAGE_SIZE aligned */
- NUM_FRAMES * FRAME_SIZE) == 0);
-
- mr.addr = (__u64)bufs;
- mr.len = NUM_FRAMES * FRAME_SIZE;
- mr.chunk_size = FRAME_SIZE;
- mr.headroom = FRAME_HEADROOM;
-
- lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr)) == 0);
- lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_FILL_RING, &fq_size,
- sizeof(int)) == 0);
- lassert(setsockopt(sfd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &cq_size,
- sizeof(int)) == 0);
-
- optlen = sizeof(off);
- lassert(getsockopt(sfd, SOL_XDP, XDP_MMAP_OFFSETS, &off,
- &optlen) == 0);
-
- umem->fq.map = mmap(0, off.fr.desc +
- FQ_NUM_DESCS * sizeof(u64),
- PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_POPULATE, sfd,
- XDP_UMEM_PGOFF_FILL_RING);
- lassert(umem->fq.map != MAP_FAILED);
-
- umem->fq.mask = FQ_NUM_DESCS - 1;
- umem->fq.size = FQ_NUM_DESCS;
- umem->fq.producer = umem->fq.map + off.fr.producer;
- umem->fq.consumer = umem->fq.map + off.fr.consumer;
- umem->fq.ring = umem->fq.map + off.fr.desc;
- umem->fq.cached_cons = FQ_NUM_DESCS;
-
- umem->cq.map = mmap(0, off.cr.desc +
- CQ_NUM_DESCS * sizeof(u64),
- PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_POPULATE, sfd,
- XDP_UMEM_PGOFF_COMPLETION_RING);
- lassert(umem->cq.map != MAP_FAILED);
-
- umem->cq.mask = CQ_NUM_DESCS - 1;
- umem->cq.size = CQ_NUM_DESCS;
- umem->cq.producer = umem->cq.map + off.cr.producer;
- umem->cq.consumer = umem->cq.map + off.cr.consumer;
- umem->cq.ring = umem->cq.map + off.cr.desc;
-
- umem->frames = bufs;
- umem->fd = sfd;
+ if (!umem)
+ exit_with_error(errno);
- if (opt_bench == BENCH_TXONLY) {
- int i;
-
- for (i = 0; i < NUM_FRAMES * FRAME_SIZE; i += FRAME_SIZE)
- (void)gen_eth_frame(&umem->frames[i]);
- }
+ ret = xsk_umem__create(&umem->umem, buffer, size, &umem->fq, &umem->cq,
+ NULL);
+ if (ret)
+ exit_with_error(-ret);
+ umem->buffer = buffer;
return umem;
}
-static struct xdpsock *xsk_configure(struct xdp_umem *umem)
+static struct xsk_socket_info *xsk_configure_socket(struct xsk_umem_info *umem)
{
- struct sockaddr_xdp sxdp = {};
- struct xdp_mmap_offsets off;
- int sfd, ndescs = NUM_DESCS;
- struct xdpsock *xsk;
- bool shared = true;
- socklen_t optlen;
- u64 i;
-
- sfd = socket(PF_XDP, SOCK_RAW, 0);
- lassert(sfd >= 0);
+ struct xsk_socket_config cfg;
+ struct xsk_socket_info *xsk;
+ int ret;
+ u32 idx;
+ int i;
xsk = calloc(1, sizeof(*xsk));
- lassert(xsk);
-
- xsk->sfd = sfd;
- xsk->outstanding_tx = 0;
-
- if (!umem) {
- shared = false;
- xsk->umem = xdp_umem_configure(sfd);
- } else {
- xsk->umem = umem;
- }
-
- lassert(setsockopt(sfd, SOL_XDP, XDP_RX_RING,
- &ndescs, sizeof(int)) == 0);
- lassert(setsockopt(sfd, SOL_XDP, XDP_TX_RING,
- &ndescs, sizeof(int)) == 0);
- optlen = sizeof(off);
- lassert(getsockopt(sfd, SOL_XDP, XDP_MMAP_OFFSETS, &off,
- &optlen) == 0);
-
- /* Rx */
- xsk->rx.map = mmap(NULL,
- off.rx.desc +
- NUM_DESCS * sizeof(struct xdp_desc),
- PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_POPULATE, sfd,
- XDP_PGOFF_RX_RING);
- lassert(xsk->rx.map != MAP_FAILED);
-
- if (!shared) {
- for (i = 0; i < NUM_DESCS * FRAME_SIZE; i += FRAME_SIZE)
- lassert(umem_fill_to_kernel(&xsk->umem->fq, &i, 1)
- == 0);
- }
-
- /* Tx */
- xsk->tx.map = mmap(NULL,
- off.tx.desc +
- NUM_DESCS * sizeof(struct xdp_desc),
- PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_POPULATE, sfd,
- XDP_PGOFF_TX_RING);
- lassert(xsk->tx.map != MAP_FAILED);
-
- xsk->rx.mask = NUM_DESCS - 1;
- xsk->rx.size = NUM_DESCS;
- xsk->rx.producer = xsk->rx.map + off.rx.producer;
- xsk->rx.consumer = xsk->rx.map + off.rx.consumer;
- xsk->rx.ring = xsk->rx.map + off.rx.desc;
-
- xsk->tx.mask = NUM_DESCS - 1;
- xsk->tx.size = NUM_DESCS;
- xsk->tx.producer = xsk->tx.map + off.tx.producer;
- xsk->tx.consumer = xsk->tx.map + off.tx.consumer;
- xsk->tx.ring = xsk->tx.map + off.tx.desc;
- xsk->tx.cached_cons = NUM_DESCS;
-
- sxdp.sxdp_family = PF_XDP;
- sxdp.sxdp_ifindex = opt_ifindex;
- sxdp.sxdp_queue_id = opt_queue;
-
- if (shared) {
- sxdp.sxdp_flags = XDP_SHARED_UMEM;
- sxdp.sxdp_shared_umem_fd = umem->fd;
- } else {
- sxdp.sxdp_flags = opt_xdp_bind_flags;
- }
-
- lassert(bind(sfd, (struct sockaddr *)&sxdp, sizeof(sxdp)) == 0);
+ if (!xsk)
+ exit_with_error(errno);
+
+ xsk->umem = umem;
+ cfg.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
+ cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
+ cfg.libbpf_flags = 0;
+ cfg.xdp_flags = opt_xdp_flags;
+ cfg.bind_flags = opt_xdp_bind_flags;
+ ret = xsk_socket__create(&xsk->xsk, opt_if, opt_queue, umem->umem,
+ &xsk->rx, &xsk->tx, &cfg);
+ if (ret)
+ exit_with_error(-ret);
+
+ ret = bpf_get_link_xdp_id(opt_ifindex, &prog_id, opt_xdp_flags);
+ if (ret)
+ exit_with_error(-ret);
+
+ ret = xsk_ring_prod__reserve(&xsk->umem->fq,
+ XSK_RING_PROD__DEFAULT_NUM_DESCS,
+ &idx);
+ if (ret != XSK_RING_PROD__DEFAULT_NUM_DESCS)
+ exit_with_error(-ret);
+ for (i = 0;
+ i < XSK_RING_PROD__DEFAULT_NUM_DESCS *
+ XSK_UMEM__DEFAULT_FRAME_SIZE;
+ i += XSK_UMEM__DEFAULT_FRAME_SIZE)
+ *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx++) = i;
+ xsk_ring_prod__submit(&xsk->umem->fq,
+ XSK_RING_PROD__DEFAULT_NUM_DESCS);
return xsk;
}
-static void print_benchmark(bool running)
-{
- const char *bench_str = "INVALID";
-
- if (opt_bench == BENCH_RXDROP)
- bench_str = "rxdrop";
- else if (opt_bench == BENCH_TXONLY)
- bench_str = "txonly";
- else if (opt_bench == BENCH_L2FWD)
- bench_str = "l2fwd";
-
- printf("%s:%d %s ", opt_if, opt_queue, bench_str);
- if (opt_xdp_flags & XDP_FLAGS_SKB_MODE)
- printf("xdp-skb ");
- else if (opt_xdp_flags & XDP_FLAGS_DRV_MODE)
- printf("xdp-drv ");
- else
- printf(" ");
-
- if (opt_poll)
- printf("poll() ");
-
- if (running) {
- printf("running...");
- fflush(stdout);
- }
-}
-
-static void dump_stats(void)
-{
- unsigned long now = get_nsecs();
- long dt = now - prev_time;
- int i;
-
- prev_time = now;
-
- for (i = 0; i < num_socks && xsks[i]; i++) {
- char *fmt = "%-15s %'-11.0f %'-11lu\n";
- double rx_pps, tx_pps;
-
- rx_pps = (xsks[i]->rx_npkts - xsks[i]->prev_rx_npkts) *
- 1000000000. / dt;
- tx_pps = (xsks[i]->tx_npkts - xsks[i]->prev_tx_npkts) *
- 1000000000. / dt;
-
- printf("\n sock%d@", i);
- print_benchmark(false);
- printf("\n");
-
- printf("%-15s %-11s %-11s %-11.2f\n", "", "pps", "pkts",
- dt / 1000000000.);
- printf(fmt, "rx", rx_pps, xsks[i]->rx_npkts);
- printf(fmt, "tx", tx_pps, xsks[i]->tx_npkts);
-
- xsks[i]->prev_rx_npkts = xsks[i]->rx_npkts;
- xsks[i]->prev_tx_npkts = xsks[i]->tx_npkts;
- }
-}
-
-static void *poller(void *arg)
-{
- (void)arg;
- for (;;) {
- sleep(opt_interval);
- dump_stats();
- }
-
- return NULL;
-}
-
-static void int_exit(int sig)
-{
- __u32 curr_prog_id = 0;
-
- (void)sig;
- dump_stats();
- if (bpf_get_link_xdp_id(opt_ifindex, &curr_prog_id, opt_xdp_flags)) {
- printf("bpf_get_link_xdp_id failed\n");
- exit(EXIT_FAILURE);
- }
- if (prog_id == curr_prog_id)
- bpf_set_link_xdp_fd(opt_ifindex, -1, opt_xdp_flags);
- else if (!curr_prog_id)
- printf("couldn't find a prog id on a given interface\n");
- else
- printf("program on interface changed, not removing\n");
- exit(EXIT_SUCCESS);
-}
-
static struct option long_options[] = {
{"rxdrop", no_argument, 0, 'r'},
{"txonly", no_argument, 0, 't'},
@@ -656,7 +341,6 @@ static struct option long_options[] = {
{"interface", required_argument, 0, 'i'},
{"queue", required_argument, 0, 'q'},
{"poll", no_argument, 0, 'p'},
- {"shared-buffer", no_argument, 0, 's'},
{"xdp-skb", no_argument, 0, 'S'},
{"xdp-native", no_argument, 0, 'N'},
{"interval", required_argument, 0, 'n'},
@@ -676,7 +360,6 @@ static void usage(const char *prog)
" -i, --interface=n Run on interface n\n"
" -q, --queue=n Use queue n (default 0)\n"
" -p, --poll Use poll syscall\n"
- " -s, --shared-buffer Use shared packet buffer\n"
" -S, --xdp-skb=n Use XDP skb-mod\n"
" -N, --xdp-native=n Enfore XDP native mode\n"
" -n, --interval=n Specify statistics update interval (default 1 sec).\n"
@@ -715,9 +398,6 @@ static void parse_command_line(int argc, char **argv)
case 'q':
opt_queue = atoi(optarg);
break;
- case 's':
- opt_shared_packet_buffer = 1;
- break;
case 'p':
opt_poll = 1;
break;
@@ -751,75 +431,104 @@ static void parse_command_line(int argc, char **argv)
opt_if);
usage(basename(argv[0]));
}
+
}
-static void kick_tx(int fd)
+static void kick_tx(struct xsk_socket_info *xsk)
{
int ret;
- ret = sendto(fd, NULL, 0, MSG_DONTWAIT, NULL, 0);
+ ret = sendto(xsk_socket__fd(xsk->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0);
if (ret >= 0 || errno == ENOBUFS || errno == EAGAIN || errno == EBUSY)
return;
- lassert(0);
+ exit_with_error(errno);
}
-static inline void complete_tx_l2fwd(struct xdpsock *xsk)
+static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk)
{
- u64 descs[BATCH_SIZE];
+ u32 idx_cq = 0, idx_fq = 0;
unsigned int rcvd;
size_t ndescs;
if (!xsk->outstanding_tx)
return;
- kick_tx(xsk->sfd);
+ kick_tx(xsk);
ndescs = (xsk->outstanding_tx > BATCH_SIZE) ? BATCH_SIZE :
- xsk->outstanding_tx;
+ xsk->outstanding_tx;
/* re-add completed Tx buffers */
- rcvd = umem_complete_from_kernel(&xsk->umem->cq, descs, ndescs);
+ rcvd = xsk_ring_cons__peek(&xsk->umem->cq, ndescs, &idx_cq);
if (rcvd > 0) {
- umem_fill_to_kernel(&xsk->umem->fq, descs, rcvd);
+ unsigned int i;
+ int ret;
+
+ ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
+ while (ret != rcvd) {
+ if (ret < 0)
+ exit_with_error(-ret);
+ ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd,
+ &idx_fq);
+ }
+ for (i = 0; i < rcvd; i++)
+ *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) =
+ *xsk_ring_cons__comp_addr(&xsk->umem->cq,
+ idx_cq++);
+
+ xsk_ring_prod__submit(&xsk->umem->fq, rcvd);
+ xsk_ring_cons__release(&xsk->umem->cq, rcvd);
xsk->outstanding_tx -= rcvd;
xsk->tx_npkts += rcvd;
}
}
-static inline void complete_tx_only(struct xdpsock *xsk)
+static inline void complete_tx_only(struct xsk_socket_info *xsk)
{
- u64 descs[BATCH_SIZE];
unsigned int rcvd;
+ u32 idx;
if (!xsk->outstanding_tx)
return;
- kick_tx(xsk->sfd);
+ kick_tx(xsk);
- rcvd = umem_complete_from_kernel(&xsk->umem->cq, descs, BATCH_SIZE);
+ rcvd = xsk_ring_cons__peek(&xsk->umem->cq, BATCH_SIZE, &idx);
if (rcvd > 0) {
+ xsk_ring_cons__release(&xsk->umem->cq, rcvd);
xsk->outstanding_tx -= rcvd;
xsk->tx_npkts += rcvd;
}
}
-static void rx_drop(struct xdpsock *xsk)
+static void rx_drop(struct xsk_socket_info *xsk)
{
- struct xdp_desc descs[BATCH_SIZE];
unsigned int rcvd, i;
+ u32 idx_rx = 0, idx_fq = 0;
+ int ret;
- rcvd = xq_deq(&xsk->rx, descs, BATCH_SIZE);
+ rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx);
if (!rcvd)
return;
+ ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
+ while (ret != rcvd) {
+ if (ret < 0)
+ exit_with_error(-ret);
+ ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
+ }
+
for (i = 0; i < rcvd; i++) {
- char *pkt = xq_get_data(xsk, descs[i].addr);
+ u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr;
+ u32 len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len;
+ char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr);
- hex_dump(pkt, descs[i].len, descs[i].addr);
+ hex_dump(pkt, len, addr);
+ *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx_fq++) = addr;
}
+ xsk_ring_prod__submit(&xsk->umem->fq, rcvd);
+ xsk_ring_cons__release(&xsk->rx, rcvd);
xsk->rx_npkts += rcvd;
-
- umem_fill_to_kernel_ex(&xsk->umem->fq, descs, rcvd);
}
static void rx_drop_all(void)
@@ -830,7 +539,7 @@ static void rx_drop_all(void)
memset(fds, 0, sizeof(fds));
for (i = 0; i < num_socks; i++) {
- fds[i].fd = xsks[i]->sfd;
+ fds[i].fd = xsk_socket__fd(xsks[i]->xsk);
fds[i].events = POLLIN;
timeout = 1000; /* 1sn */
}
@@ -847,14 +556,14 @@ static void rx_drop_all(void)
}
}
-static void tx_only(struct xdpsock *xsk)
+static void tx_only(struct xsk_socket_info *xsk)
{
int timeout, ret, nfds = 1;
struct pollfd fds[nfds + 1];
- unsigned int idx = 0;
+ u32 idx, frame_nb = 0;
memset(fds, 0, sizeof(fds));
- fds[0].fd = xsk->sfd;
+ fds[0].fd = xsk_socket__fd(xsk->xsk);
fds[0].events = POLLOUT;
timeout = 1000; /* 1sn */
@@ -864,50 +573,73 @@ static void tx_only(struct xdpsock *xsk)
if (ret <= 0)
continue;
- if (fds[0].fd != xsk->sfd ||
- !(fds[0].revents & POLLOUT))
+ if (!(fds[0].revents & POLLOUT))
continue;
}
- if (xq_nb_free(&xsk->tx, BATCH_SIZE) >= BATCH_SIZE) {
- lassert(xq_enq_tx_only(&xsk->tx, idx, BATCH_SIZE) == 0);
+ if (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) ==
+ BATCH_SIZE) {
+ unsigned int i;
+ for (i = 0; i < BATCH_SIZE; i++) {
+ xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->addr
+ = (frame_nb + i) <<
+ XSK_UMEM__DEFAULT_FRAME_SHIFT;
+ xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->len =
+ sizeof(pkt_data) - 1;
+ }
+
+ xsk_ring_prod__submit(&xsk->tx, BATCH_SIZE);
xsk->outstanding_tx += BATCH_SIZE;
- idx += BATCH_SIZE;
- idx %= NUM_FRAMES;
+ frame_nb += BATCH_SIZE;
+ frame_nb %= NUM_FRAMES;
}
complete_tx_only(xsk);
}
}
-static void l2fwd(struct xdpsock *xsk)
+static void l2fwd(struct xsk_socket_info *xsk)
{
for (;;) {
- struct xdp_desc descs[BATCH_SIZE];
unsigned int rcvd, i;
+ u32 idx_rx = 0, idx_tx = 0;
int ret;
for (;;) {
complete_tx_l2fwd(xsk);
- rcvd = xq_deq(&xsk->rx, descs, BATCH_SIZE);
+ rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE,
+ &idx_rx);
if (rcvd > 0)
break;
}
+ ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx);
+ while (ret != rcvd) {
+ if (ret < 0)
+ exit_with_error(-ret);
+ ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx);
+ }
+
for (i = 0; i < rcvd; i++) {
- char *pkt = xq_get_data(xsk, descs[i].addr);
+ u64 addr = xsk_ring_cons__rx_desc(&xsk->rx,
+ idx_rx)->addr;
+ u32 len = xsk_ring_cons__rx_desc(&xsk->rx,
+ idx_rx++)->len;
+ char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr);
swap_mac_addresses(pkt);
- hex_dump(pkt, descs[i].len, descs[i].addr);
+ hex_dump(pkt, len, addr);
+ xsk_ring_prod__tx_desc(&xsk->tx, idx_tx)->addr = addr;
+ xsk_ring_prod__tx_desc(&xsk->tx, idx_tx++)->len = len;
}
- xsk->rx_npkts += rcvd;
+ xsk_ring_prod__submit(&xsk->tx, rcvd);
+ xsk_ring_cons__release(&xsk->rx, rcvd);
- ret = xq_enq(&xsk->tx, descs, rcvd);
- lassert(ret == 0);
+ xsk->rx_npkts += rcvd;
xsk->outstanding_tx += rcvd;
}
}
@@ -915,17 +647,10 @@ static void l2fwd(struct xdpsock *xsk)
int main(int argc, char **argv)
{
struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
- struct bpf_prog_load_attr prog_load_attr = {
- .prog_type = BPF_PROG_TYPE_XDP,
- };
- int prog_fd, qidconf_map, xsks_map;
- struct bpf_prog_info info = {};
- __u32 info_len = sizeof(info);
- struct bpf_object *obj;
- char xdp_filename[256];
- struct bpf_map *map;
- int i, ret, key = 0;
+ struct xsk_umem_info *umem;
pthread_t pt;
+ void *bufs;
+ int ret;
parse_command_line(argc, argv);
@@ -935,67 +660,22 @@ int main(int argc, char **argv)
exit(EXIT_FAILURE);
}
- snprintf(xdp_filename, sizeof(xdp_filename), "%s_kern.o", argv[0]);
- prog_load_attr.file = xdp_filename;
-
- if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
- exit(EXIT_FAILURE);
- if (prog_fd < 0) {
- fprintf(stderr, "ERROR: no program found: %s\n",
- strerror(prog_fd));
- exit(EXIT_FAILURE);
- }
-
- map = bpf_object__find_map_by_name(obj, "qidconf_map");
- qidconf_map = bpf_map__fd(map);
- if (qidconf_map < 0) {
- fprintf(stderr, "ERROR: no qidconf map found: %s\n",
- strerror(qidconf_map));
- exit(EXIT_FAILURE);
- }
-
- map = bpf_object__find_map_by_name(obj, "xsks_map");
- xsks_map = bpf_map__fd(map);
- if (xsks_map < 0) {
- fprintf(stderr, "ERROR: no xsks map found: %s\n",
- strerror(xsks_map));
- exit(EXIT_FAILURE);
- }
-
- if (bpf_set_link_xdp_fd(opt_ifindex, prog_fd, opt_xdp_flags) < 0) {
- fprintf(stderr, "ERROR: link set xdp fd failed\n");
- exit(EXIT_FAILURE);
- }
-
- ret = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
- if (ret) {
- printf("can't get prog info - %s\n", strerror(errno));
- return 1;
- }
- prog_id = info.id;
+ ret = posix_memalign(&bufs, getpagesize(), /* PAGE_SIZE aligned */
+ NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE);
+ if (ret)
+ exit_with_error(ret);
- ret = bpf_map_update_elem(qidconf_map, &key, &opt_queue, 0);
- if (ret) {
- fprintf(stderr, "ERROR: bpf_map_update_elem qidconf\n");
- exit(EXIT_FAILURE);
- }
+ /* Create sockets... */
+ umem = xsk_configure_umem(bufs,
+ NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE);
+ xsks[num_socks++] = xsk_configure_socket(umem);
- /* Create sockets... */
- xsks[num_socks++] = xsk_configure(NULL);
-
-#if RR_LB
- for (i = 0; i < MAX_SOCKS - 1; i++)
- xsks[num_socks++] = xsk_configure(xsks[0]->umem);
-#endif
+ if (opt_bench == BENCH_TXONLY) {
+ int i;
- /* ...and insert them into the map. */
- for (i = 0; i < num_socks; i++) {
- key = i;
- ret = bpf_map_update_elem(xsks_map, &key, &xsks[i]->sfd, 0);
- if (ret) {
- fprintf(stderr, "ERROR: bpf_map_update_elem %d\n", i);
- exit(EXIT_FAILURE);
- }
+ for (i = 0; i < NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE;
+ i += XSK_UMEM__DEFAULT_FRAME_SIZE)
+ (void)gen_eth_frame(umem, i);
}
signal(SIGINT, int_exit);
@@ -1005,7 +685,8 @@ int main(int argc, char **argv)
setlocale(LC_ALL, "");
ret = pthread_create(&pt, NULL, poller, NULL);
- lassert(ret == 0);
+ if (ret)
+ exit_with_error(ret);
prev_time = get_nsecs();