diff options
-rw-r--r-- | include/uapi/linux/bpf.h | 10 | ||||
-rw-r--r-- | net/core/filter.c | 28 | ||||
-rw-r--r-- | samples/bpf/Makefile | 5 | ||||
-rwxr-xr-x | samples/bpf/do_hbm_test.sh | 436 | ||||
-rw-r--r-- | samples/bpf/hbm.c | 441 | ||||
-rw-r--r-- | samples/bpf/hbm.h | 31 | ||||
-rw-r--r-- | samples/bpf/hbm_kern.h | 137 | ||||
-rw-r--r-- | samples/bpf/hbm_out_kern.c | 157 | ||||
-rw-r--r-- | tools/include/uapi/linux/bpf.h | 10 | ||||
-rw-r--r-- | tools/testing/selftests/bpf/bpf_helpers.h | 2 |
10 files changed, 1255 insertions, 2 deletions
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2e308e90ffea..3c38ac9a92a7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2359,6 +2359,13 @@ union bpf_attr { * Return * A **struct bpf_tcp_sock** pointer on success, or NULL in * case of failure. + * + * int bpf_skb_ecn_set_ce(struct sk_buff *skb) + * Description + * Sets ECN of IP header to ce (congestion encountered) if + * current value is ect (ECN capable). Works with IPv6 and IPv4. + * Return + * 1 if set, 0 if not set. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2457,7 +2464,8 @@ union bpf_attr { FN(spin_lock), \ FN(spin_unlock), \ FN(sk_fullsock), \ - FN(tcp_sock), + FN(tcp_sock), \ + FN(skb_ecn_set_ce), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index 85749f6ec789..558ca72f2254 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5426,6 +5426,32 @@ static const struct bpf_func_proto bpf_tcp_sock_proto = { .arg1_type = ARG_PTR_TO_SOCK_COMMON, }; +BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb) +{ + unsigned int iphdr_len; + + if (skb->protocol == cpu_to_be16(ETH_P_IP)) + iphdr_len = sizeof(struct iphdr); + else if (skb->protocol == cpu_to_be16(ETH_P_IPV6)) + iphdr_len = sizeof(struct ipv6hdr); + else + return 0; + + if (skb_headlen(skb) < iphdr_len) + return 0; + + if (skb_cloned(skb) && !skb_clone_writable(skb, iphdr_len)) + return 0; + + return INET_ECN_set_ce(skb); +} + +static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = { + .func = bpf_skb_ecn_set_ce, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -5585,6 +5611,8 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #ifdef CONFIG_INET case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; + case BPF_FUNC_skb_ecn_set_ce: + return 
&bpf_skb_ecn_set_ce_proto; #endif default: return sk_filter_func_proto(func_id, prog); diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 0c62ac39c697..65e667bdf979 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -52,6 +52,7 @@ hostprogs-y += xdpsock hostprogs-y += xdp_fwd hostprogs-y += task_fd_query hostprogs-y += xdp_sample_pkts +hostprogs-y += hbm # Libbpf dependencies LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a @@ -107,6 +108,7 @@ xdpsock-objs := xdpsock_user.o xdp_fwd-objs := xdp_fwd_user.o task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS) xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS) +hbm-objs := bpf_load.o hbm.o $(CGROUP_HELPERS) # Tell kbuild to always build the programs always := $(hostprogs-y) @@ -164,6 +166,7 @@ always += xdp_adjust_tail_kern.o always += xdp_fwd_kern.o always += task_fd_query_kern.o always += xdp_sample_pkts_kern.o +always += hbm_out_kern.o KBUILD_HOSTCFLAGS += -I$(objtree)/usr/include KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/ @@ -263,6 +266,8 @@ $(BPF_SAMPLES_PATH)/*.c: verify_target_bpf $(LIBBPF) $(src)/*.c: verify_target_bpf $(LIBBPF) $(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h +$(obj)/hbm_out_kern.o: $(src)/hbm.h $(src)/hbm_kern.h +$(obj)/hbm.o: $(src)/hbm.h # asm/sysreg.h - inline assembly used by it is incompatible with llvm. # But, there is no easy way to fix it, so just exclude it since it is diff --git a/samples/bpf/do_hbm_test.sh b/samples/bpf/do_hbm_test.sh new file mode 100755 index 000000000000..56c8b4115c95 --- /dev/null +++ b/samples/bpf/do_hbm_test.sh @@ -0,0 +1,436 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (c) 2019 Facebook +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of version 2 of the GNU General Public +# License as published by the Free Software Foundation. + +Usage() { + echo "Script for testing HBM (Host Bandwidth Manager) framework." 
+ echo "It creates a cgroup to use for testing and load a BPF program to limit" + echo "egress or ingress bandwidht. It then uses iperf3 or netperf to create" + echo "loads. The output is the goodput in Mbps (unless -D was used)." + echo "" + echo "USAGE: $name [out] [-b=<prog>|--bpf=<prog>] [-c=<cc>|--cc=<cc>] [-D]" + echo " [-d=<delay>|--delay=<delay>] [--debug] [-E]" + echo " [-f=<#flows>|--flows=<#flows>] [-h] [-i=<id>|--id=<id >]" + echo " [-l] [-N] [-p=<port>|--port=<port>] [-P]" + echo " [-q=<qdisc>] [-R] [-s=<server>|--server=<server]" + echo " [-S|--stats] -t=<time>|--time=<time>] [-w] [cubic|dctcp]" + echo " Where:" + echo " out egress (default)" + echo " -b or --bpf BPF program filename to load and attach." + echo " Default is hbm_out_kern.o for egress," + echo " -c or -cc TCP congestion control (cubic or dctcp)" + echo " --debug print BPF trace buffer" + echo " -d or --delay add a delay in ms using netem" + echo " -D In addition to the goodput in Mbps, it also outputs" + echo " other detailed information. This information is" + echo " test dependent (i.e. iperf3 or netperf)." + echo " -E enable ECN (not required for dctcp)" + echo " -f or --flows number of concurrent flows (default=1)" + echo " -i or --id cgroup id (an integer, default is 1)" + echo " -N use netperf instead of iperf3" + echo " -l do not limit flows using loopback" + echo " -h Help" + echo " -p or --port iperf3 port (default is 5201)" + echo " -P use an iperf3 instance for each flow" + echo " -q use the specified qdisc" + echo " -r or --rate rate in Mbps (default 1s 1Gbps)" + echo " -R Use TCP_RR for netperf. 1st flow has req" + echo " size of 10KB, rest of 1MB. Reply in all" + echo " cases is 1 byte." + echo " More detailed output for each flow can be found" + echo " in the files netperf.<cg>.<flow>, where <cg> is the" + echo " cgroup id as specified with the -i flag, and <flow>" + echo " is the flow id starting at 1 and increasing by 1 for" + echo " flow (as specified by -f)." 
+ echo " -s or --server hostname of netperf server. Used to create netperf" + echo " test traffic between to hosts (default is within host)" + echo " netserver must be running on the host." + echo " -S or --stats whether to update hbm stats (default is yes)." + echo " -t or --time duration of iperf3 in seconds (default=5)" + echo " -w Work conserving flag. cgroup can increase its" + echo " bandwidth beyond the rate limit specified" + echo " while there is available bandwidth. Current" + echo " implementation assumes there is only one NIC" + echo " (eth0), but can be extended to support multiple" + echo " NICs." + echo " cubic or dctcp specify which TCP CC to use" + echo " " + exit +} + +#set -x + +debug_flag=0 +args="$@" +name="$0" +netem=0 +cc=x +dir="-o" +dir_name="out" +dur=5 +flows=1 +id=1 +prog="" +port=5201 +rate=1000 +multi_iperf=0 +flow_cnt=1 +use_netperf=0 +rr=0 +ecn=0 +details=0 +server="" +qdisc="" +flags="" +do_stats=0 + +function start_hbm () { + rm -f hbm.out + echo "./hbm $dir -n $id -r $rate -t $dur $flags $dbg $prog" > hbm.out + echo " " >> hbm.out + ./hbm $dir -n $id -r $rate -t $dur $flags $dbg $prog >> hbm.out 2>&1 & + echo $! 
+} + +processArgs () { + for i in $args ; do + case $i in + # Support for upcomming ingress rate limiting + #in) # support for upcoming ingress rate limiting + # dir="-i" + # dir_name="in" + # ;; + out) + dir="-o" + dir_name="out" + ;; + -b=*|--bpf=*) + prog="${i#*=}" + ;; + -c=*|--cc=*) + cc="${i#*=}" + ;; + --debug) + flags="$flags -d" + debug_flag=1 + ;; + -d=*|--delay=*) + netem="${i#*=}" + ;; + -D) + details=1 + ;; + -E) + ecn=1 + ;; + # Support for upcomming fq Early Departure Time egress rate limiting + #--edt) + # prog="hbm_out_edt_kern.o" + # qdisc="fq" + # ;; + -f=*|--flows=*) + flows="${i#*=}" + ;; + -i=*|--id=*) + id="${i#*=}" + ;; + -l) + flags="$flags -l" + ;; + -N) + use_netperf=1 + ;; + -p=*|--port=*) + port="${i#*=}" + ;; + -P) + multi_iperf=1 + ;; + -q=*) + qdisc="${i#*=}" + ;; + -r=*|--rate=*) + rate="${i#*=}" + ;; + -R) + rr=1 + ;; + -s=*|--server=*) + server="${i#*=}" + ;; + -S|--stats) + flags="$flags -s" + do_stats=1 + ;; + -t=*|--time=*) + dur="${i#*=}" + ;; + -w) + flags="$flags -w" + ;; + cubic) + cc=cubic + ;; + dctcp) + cc=dctcp + ;; + *) + echo "Unknown arg:$i" + Usage + ;; + esac + done +} + +processArgs + +if [ $debug_flag -eq 1 ] ; then + rm -f hbm_out.log +fi + +hbm_pid=$(start_hbm) +usleep 100000 + +host=`hostname` +cg_base_dir=/sys/fs/cgroup +cg_dir="$cg_base_dir/cgroup-test-work-dir/hbm$id" + +echo $$ >> $cg_dir/cgroup.procs + +ulimit -l unlimited + +rm -f ss.out +rm -f hbm.[0-9]*.$dir_name +if [ $ecn -ne 0 ] ; then + sysctl -w -q -n net.ipv4.tcp_ecn=1 +fi + +if [ $use_netperf -eq 0 ] ; then + cur_cc=`sysctl -n net.ipv4.tcp_congestion_control` + if [ "$cc" != "x" ] ; then + sysctl -w -q -n net.ipv4.tcp_congestion_control=$cc + fi +fi + +if [ "$netem" -ne "0" ] ; then + if [ "$qdisc" != "" ] ; then + echo "WARNING: Ignoring -q options because -d option used" + fi + tc qdisc del dev lo root > /dev/null 2>&1 + tc qdisc add dev lo root netem delay $netem\ms > /dev/null 2>&1 +elif [ "$qdisc" != "" ] ; then + tc qdisc del dev lo root > 
/dev/null 2>&1 + tc qdisc add dev lo root $qdisc > /dev/null 2>&1 +fi + +n=0 +m=$[$dur * 5] +hn="::1" +if [ $use_netperf -ne 0 ] ; then + if [ "$server" != "" ] ; then + hn=$server + fi +fi + +( ping6 -i 0.2 -c $m $hn > ping.out 2>&1 ) & + +if [ $use_netperf -ne 0 ] ; then + begNetserverPid=`ps ax | grep netserver | grep --invert-match "grep" | \ + awk '{ print $1 }'` + if [ "$begNetserverPid" == "" ] ; then + if [ "$server" == "" ] ; then + ( ./netserver > /dev/null 2>&1) & + usleep 100000 + fi + fi + flow_cnt=1 + if [ "$server" == "" ] ; then + np_server=$host + else + np_server=$server + fi + if [ "$cc" == "x" ] ; then + np_cc="" + else + np_cc="-K $cc,$cc" + fi + replySize=1 + while [ $flow_cnt -le $flows ] ; do + if [ $rr -ne 0 ] ; then + reqSize=1M + if [ $flow_cnt -eq 1 ] ; then + reqSize=10K + fi + if [ "$dir" == "-i" ] ; then + replySize=$reqSize + reqSize=1 + fi + ( ./netperf -H $np_server -l $dur -f m -j -t TCP_RR -- -r $reqSize,$replySize $np_cc -k P50_lATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,REMOTE_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,LOCAL_RECV_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) & + else + if [ "$dir" == "-i" ] ; then + ( ./netperf -H $np_server -l $dur -f m -j -t TCP_RR -- -r 1,10M $np_cc -k P50_LATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,REMOTE_TRANSPORT_RETRANS,REMOTE_SEND_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) & + else + ( ./netperf -H $np_server -l $dur -f m -j -t TCP_STREAM -- $np_cc -k P50_lATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) & + fi + fi + flow_cnt=$[flow_cnt+1] + done + +# sleep for duration of test (plus some buffer) + n=$[dur+2] + sleep $n + +# force graceful termination of netperf + pids=`pgrep netperf` + for p in $pids ; do + kill -SIGALRM $p + done + + flow_cnt=1 + rate=0 + if [ $details -ne 0 ] ; then + echo "" + echo "Details for HBM in cgroup $id" + if [ $do_stats -eq 1 ] ; 
then + if [ -e hbm.$id.$dir_name ] ; then + cat hbm.$id.$dir_name + fi + fi + fi + while [ $flow_cnt -le $flows ] ; do + if [ "$dir" == "-i" ] ; then + r=`cat netperf.$id.$flow_cnt | grep -o "REMOTE_SEND_THROUGHPUT=[0-9]*" | grep -o "[0-9]*"` + else + r=`cat netperf.$id.$flow_cnt | grep -o "LOCAL_SEND_THROUGHPUT=[0-9]*" | grep -o "[0-9]*"` + fi + echo "rate for flow $flow_cnt: $r" + rate=$[rate+r] + if [ $details -ne 0 ] ; then + echo "-----" + echo "Details for cgroup $id, flow $flow_cnt" + cat netperf.$id.$flow_cnt + fi + flow_cnt=$[flow_cnt+1] + done + if [ $details -ne 0 ] ; then + echo "" + delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"` + echo "PING AVG DELAY:$delay" + echo "AGGREGATE_GOODPUT:$rate" + else + echo $rate + fi +elif [ $multi_iperf -eq 0 ] ; then + (iperf3 -s -p $port -1 > /dev/null 2>&1) & + usleep 100000 + iperf3 -c $host -p $port -i 0 -P $flows -f m -t $dur > iperf.$id + rates=`grep receiver iperf.$id | grep -o "[0-9.]* Mbits" | grep -o "^[0-9]*"` + rate=`echo $rates | grep -o "[0-9]*$"` + + if [ $details -ne 0 ] ; then + echo "" + echo "Details for HBM in cgroup $id" + if [ $do_stats -eq 1 ] ; then + if [ -e hbm.$id.$dir_name ] ; then + cat hbm.$id.$dir_name + fi + fi + delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"` + echo "PING AVG DELAY:$delay" + echo "AGGREGATE_GOODPUT:$rate" + else + echo $rate + fi +else + flow_cnt=1 + while [ $flow_cnt -le $flows ] ; do + (iperf3 -s -p $port -1 > /dev/null 2>&1) & + ( iperf3 -c $host -p $port -i 0 -P 1 -f m -t $dur | grep receiver | grep -o "[0-9.]* Mbits" | grep -o "^[0-9]*" | grep -o "[0-9]*$" > iperf3.$id.$flow_cnt ) & + port=$[port+1] + flow_cnt=$[flow_cnt+1] + done + n=$[dur+1] + sleep $n + flow_cnt=1 + rate=0 + if [ $details -ne 0 ] ; then + echo "" + echo "Details for HBM in cgroup $id" + if [ $do_stats -eq 1 ] ; then + if [ -e hbm.$id.$dir_name ] ; then + cat hbm.$id.$dir_name + fi + fi + fi + + while [ $flow_cnt -le $flows ] ; do + 
r=`cat iperf3.$id.$flow_cnt` +# echo "rate for flow $flow_cnt: $r" + if [ $details -ne 0 ] ; then + echo "Rate for cgroup $id, flow $flow_cnt LOCAL_SEND_THROUGHPUT=$r" + fi + rate=$[rate+r] + flow_cnt=$[flow_cnt+1] + done + if [ $details -ne 0 ] ; then + delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"` + echo "PING AVG DELAY:$delay" + echo "AGGREGATE_GOODPUT:$rate" + else + echo $rate + fi +fi + +if [ $use_netperf -eq 0 ] ; then + sysctl -w -q -n net.ipv4.tcp_congestion_control=$cur_cc +fi +if [ $ecn -ne 0 ] ; then + sysctl -w -q -n net.ipv4.tcp_ecn=0 +fi +if [ "$netem" -ne "0" ] ; then + tc qdisc del dev lo root > /dev/null 2>&1 +fi + +sleep 2 + +hbmPid=`ps ax | grep "hbm " | grep --invert-match "grep" | awk '{ print $1 }'` +if [ "$hbmPid" == "$hbm_pid" ] ; then + kill $hbm_pid +fi + +sleep 1 + +# Detach any BPF programs that may have lingered +ttx=`bpftool cgroup tree | grep hbm` +v=2 +for x in $ttx ; do + if [ "${x:0:36}" == "/sys/fs/cgroup/cgroup-test-work-dir/" ] ; then + cg=$x ; v=0 + else + if [ $v -eq 0 ] ; then + id=$x ; v=1 + else + if [ $v -eq 1 ] ; then + type=$x ; bpftool cgroup detach $cg $type id $id + v=0 + fi + fi + fi +done + +if [ $use_netperf -ne 0 ] ; then + if [ "$server" == "" ] ; then + if [ "$begNetserverPid" == "" ] ; then + netserverPid=`ps ax | grep netserver | grep --invert-match "grep" | awk '{ print $1 }'` + if [ "$netserverPid" != "" ] ; then + kill $netserverPid + fi + fi + fi +fi +exit diff --git a/samples/bpf/hbm.c b/samples/bpf/hbm.c new file mode 100644 index 000000000000..8408ccb7409f --- /dev/null +++ b/samples/bpf/hbm.c @@ -0,0 +1,441 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Example program for Host Bandwidth Management + * + * This program loads a cgroup skb BPF program to enforce cgroup output + * (egress) or input (ingress) bandwidth limits. + * + * USAGE: hbm [-d] [-l] [-n <id>] [-r <rate>] [-s] [-t <secs>] [-w] [-h] [prog] + * Where: + * -d Print BPF trace debug buffer + * -l Also limit flows doing loopback + * -n <#> To create cgroup "/hbm#" and attach prog + * Default is /hbm1 + * -r <rate> Rate limit in Mbps + * -s Get HBM stats (marked, dropped, etc.) + * -t <time> Exit after specified seconds (default is 1) + * -w Work conserving flag. cgroup can increase its bandwidth + * beyond the rate limit specified while there is available + * bandwidth. Current implementation assumes there is only + * one NIC (eth0), but can be extended to support multiple NICs. + * Currently only supported for egress. + * -h Print this info + * prog BPF program file name. Name defaults to hbm_out_kern.o + */ + +#define _GNU_SOURCE + +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> +#include <sys/resource.h> +#include <sys/time.h> +#include <unistd.h> +#include <errno.h> +#include <fcntl.h> +#include <linux/unistd.h> + +#include <linux/bpf.h> +#include <bpf/bpf.h> + +#include "bpf_load.h" +#include "bpf_rlimit.h" +#include "cgroup_helpers.h" +#include "hbm.h" +#include "bpf_util.h" +#include "bpf/bpf.h" +#include "bpf/libbpf.h" + +bool outFlag = true; +int minRate = 1000; /* cgroup rate limit in Mbps */ +int rate = 1000; /* can grow if work conserving is enabled */ +int dur = 1; +bool stats_flag; +bool loopback_flag; +bool debugFlag; +bool work_conserving_flag; + +static void Usage(void); +static void read_trace_pipe2(void); +static void do_error(char *msg, bool errno_flag); + +#define DEBUGFS "/sys/kernel/debug/tracing/" + +struct bpf_object *obj; +int bpfprog_fd; +int cgroup_storage_fd; + +static void read_trace_pipe2(void) +{ + int trace_fd; + FILE *outf; + char *outFname = "hbm_out.log"; + + trace_fd = open(DEBUGFS "trace_pipe", 
O_RDONLY, 0); + if (trace_fd < 0) { + printf("Error opening trace_pipe\n"); + return; + } + +// Future support of ingress +// if (!outFlag) +// outFname = "hbm_in.log"; + outf = fopen(outFname, "w"); + + if (outf == NULL) + printf("Error creating %s\n", outFname); + + while (1) { + static char buf[4097]; + ssize_t sz; + + sz = read(trace_fd, buf, sizeof(buf) - 1); + if (sz > 0) { + buf[sz] = 0; + puts(buf); + if (outf != NULL) { + fprintf(outf, "%s\n", buf); + fflush(outf); + } + } + } +} + +static void do_error(char *msg, bool errno_flag) +{ + if (errno_flag) + printf("ERROR: %s, errno: %d\n", msg, errno); + else + printf("ERROR: %s\n", msg); + exit(1); +} + +static int prog_load(char *prog) +{ + struct bpf_prog_load_attr prog_load_attr = { + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .file = prog, + .expected_attach_type = BPF_CGROUP_INET_EGRESS, + }; + int map_fd; + struct bpf_map *map; + + int ret = 0; + + if (access(prog, O_RDONLY) < 0) { + printf("Error accessing file %s: %s\n", prog, strerror(errno)); + return 1; + } + if (bpf_prog_load_xattr(&prog_load_attr, &obj, &bpfprog_fd)) + ret = 1; + if (!ret) { + map = bpf_object__find_map_by_name(obj, "queue_stats"); + map_fd = bpf_map__fd(map); + if (map_fd < 0) { + printf("Map not found: %s\n", strerror(map_fd)); + ret = 1; + } + } + + if (ret) { + printf("ERROR: load_bpf_file failed for: %s\n", prog); + printf(" Output from verifier:\n%s\n------\n", bpf_log_buf); + ret = -1; + } else { + ret = map_fd; + } + + return ret; +} + +static int run_bpf_prog(char *prog, int cg_id) +{ + int map_fd; + int rc = 0; + int key = 0; + int cg1 = 0; + int type = BPF_CGROUP_INET_EGRESS; + char cg_dir[100]; + struct hbm_queue_stats qstats = {0}; + + sprintf(cg_dir, "/hbm%d", cg_id); + map_fd = prog_load(prog); + if (map_fd == -1) + return 1; + + if (setup_cgroup_environment()) { + printf("ERROR: setting cgroup environment\n"); + goto err; + } + cg1 = create_and_get_cgroup(cg_dir); + if (!cg1) { + printf("ERROR: 
create_and_get_cgroup\n"); + goto err; + } + if (join_cgroup(cg_dir)) { + printf("ERROR: join_cgroup\n"); + goto err; + } + + qstats.rate = rate; + qstats.stats = stats_flag ? 1 : 0; + qstats.loopback = loopback_flag ? 1 : 0; + if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) { + printf("ERROR: Could not update map element\n"); + goto err; + } + + if (!outFlag) + type = BPF_CGROUP_INET_INGRESS; + if (bpf_prog_attach(bpfprog_fd, cg1, type, 0)) { + printf("ERROR: bpf_prog_attach fails!\n"); + log_err("Attaching prog"); + goto err; + } + + if (work_conserving_flag) { + struct timeval t0, t_last, t_new; + FILE *fin; + unsigned long long last_eth_tx_bytes, new_eth_tx_bytes; + signed long long last_cg_tx_bytes, new_cg_tx_bytes; + signed long long delta_time, delta_bytes, delta_rate; + int delta_ms; +#define DELTA_RATE_CHECK 10000 /* in us */ +#define RATE_THRESHOLD 9500000000 /* 9.5 Gbps */ + + bpf_map_lookup_elem(map_fd, &key, &qstats); + if (gettimeofday(&t0, NULL) < 0) + do_error("gettimeofday failed", true); + t_last = t0; + fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", "r"); + if (fscanf(fin, "%llu", &last_eth_tx_bytes) != 1) + do_error("fscanf fails", false); + fclose(fin); + last_cg_tx_bytes = qstats.bytes_total; + while (true) { + usleep(DELTA_RATE_CHECK); + if (gettimeofday(&t_new, NULL) < 0) + do_error("gettimeofday failed", true); + delta_ms = (t_new.tv_sec - t0.tv_sec) * 1000 + + (t_new.tv_usec - t0.tv_usec)/1000; + if (delta_ms > dur * 1000) + break; + delta_time = (t_new.tv_sec - t_last.tv_sec) * 1000000 + + (t_new.tv_usec - t_last.tv_usec); + if (delta_time == 0) + continue; + t_last = t_new; + fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", + "r"); + if (fscanf(fin, "%llu", &new_eth_tx_bytes) != 1) + do_error("fscanf fails", false); + fclose(fin); + printf(" new_eth_tx_bytes:%llu\n", + new_eth_tx_bytes); + bpf_map_lookup_elem(map_fd, &key, &qstats); + new_cg_tx_bytes = qstats.bytes_total; + delta_bytes = new_eth_tx_bytes - 
last_eth_tx_bytes; + last_eth_tx_bytes = new_eth_tx_bytes; + delta_rate = (delta_bytes * 8000000) / delta_time; + printf("%5d - eth_rate:%.1fGbps cg_rate:%.3fGbps", + delta_ms, delta_rate/1000000000.0, + rate/1000.0); + if (delta_rate < RATE_THRESHOLD) { + /* can increase cgroup rate limit, but first + * check if we are using the current limit. + * Currently increasing by 6.25%, unknown + * if that is the optimal rate. + */ + int rate_diff100; + + delta_bytes = new_cg_tx_bytes - + last_cg_tx_bytes; + last_cg_tx_bytes = new_cg_tx_bytes; + delta_rate = (delta_bytes * 8000000) / + delta_time; + printf(" rate:%.3fGbps", + delta_rate/1000000000.0); + rate_diff100 = (((long long)rate)*1000000 - + delta_rate) * 100 / + (((long long) rate) * 1000000); + printf(" rdiff:%d", rate_diff100); + if (rate_diff100 <= 3) { + rate += (rate >> 4); + if (rate > RATE_THRESHOLD / 1000000) + rate = RATE_THRESHOLD / 1000000; + qstats.rate = rate; + printf(" INC\n"); + } else { + printf("\n"); + } + } else { + /* Need to decrease cgroup rate limit. + * Currently decreasing by 12.5%, unknown + * if that is optimal + */ + printf(" DEC\n"); + rate -= (rate >> 3); + if (rate < minRate) + rate = minRate; + qstats.rate = rate; + } + if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) + do_error("update map element fails", false); + } + } else { + sleep(dur); + } + // Get stats! 
+ if (stats_flag && bpf_map_lookup_elem(map_fd, &key, &qstats)) { + char fname[100]; + FILE *fout; + + if (!outFlag) + sprintf(fname, "hbm.%d.in", cg_id); + else + sprintf(fname, "hbm.%d.out", cg_id); + fout = fopen(fname, "w"); + fprintf(fout, "id:%d\n", cg_id); + fprintf(fout, "ERROR: Could not lookup queue_stats\n"); + } else if (stats_flag && qstats.lastPacketTime > + qstats.firstPacketTime) { + long long delta_us = (qstats.lastPacketTime - + qstats.firstPacketTime)/1000; + unsigned int rate_mbps = ((qstats.bytes_total - + qstats.bytes_dropped) * 8 / + delta_us); + double percent_pkts, percent_bytes; + char fname[100]; + FILE *fout; + +// Future support of ingress +// if (!outFlag) +// sprintf(fname, "hbm.%d.in", cg_id); +// else + sprintf(fname, "hbm.%d.out", cg_id); + fout = fopen(fname, "w"); + fprintf(fout, "id:%d\n", cg_id); + fprintf(fout, "rate_mbps:%d\n", rate_mbps); + fprintf(fout, "duration:%.1f secs\n", + (qstats.lastPacketTime - qstats.firstPacketTime) / + 1000000000.0); + fprintf(fout, "packets:%d\n", (int)qstats.pkts_total); + fprintf(fout, "bytes_MB:%d\n", (int)(qstats.bytes_total / + 1000000)); + fprintf(fout, "pkts_dropped:%d\n", (int)qstats.pkts_dropped); + fprintf(fout, "bytes_dropped_MB:%d\n", + (int)(qstats.bytes_dropped / + 1000000)); + // Marked Pkts and Bytes + percent_pkts = (qstats.pkts_marked * 100.0) / + (qstats.pkts_total + 1); + percent_bytes = (qstats.bytes_marked * 100.0) / + (qstats.bytes_total + 1); + fprintf(fout, "pkts_marked_percent:%6.2f\n", percent_pkts); + fprintf(fout, "bytes_marked_percent:%6.2f\n", percent_bytes); + + // Dropped Pkts and Bytes + percent_pkts = (qstats.pkts_dropped * 100.0) / + (qstats.pkts_total + 1); + percent_bytes = (qstats.bytes_dropped * 100.0) / + (qstats.bytes_total + 1); + fprintf(fout, "pkts_dropped_percent:%6.2f\n", percent_pkts); + fprintf(fout, "bytes_dropped_percent:%6.2f\n", percent_bytes); + fclose(fout); + } + + if (debugFlag) + read_trace_pipe2(); + return rc; +err: + rc = 1; + + if 
(cg1) + close(cg1); + cleanup_cgroup_environment(); + + return rc; +} + +static void Usage(void) +{ + printf("This program loads a cgroup skb BPF program to enforce\n" + "cgroup output (egress) bandwidth limits.\n\n" + "USAGE: hbm [-o] [-d] [-l] [-n <id>] [-r <rate>] [-s]\n" + " [-t <secs>] [-w] [-h] [prog]\n" + " Where:\n" + " -o indicates egress direction (default)\n" + " -d print BPF trace debug buffer\n" + " -l also limit flows using loopback\n" + " -n <#> to create cgroup \"/hbm#\" and attach prog\n" + " Default is /hbm1\n" + " -r <rate> Rate in Mbps\n" + " -s Update HBM stats\n" + " -t <time> Exit after specified seconds (deault is 0)\n" + " -w Work conserving flag. cgroup can increase\n" + " bandwidth beyond the rate limit specified\n" + " while there is available bandwidth. Current\n" + " implementation assumes there is only eth0\n" + " but can be extended to support multiple NICs\n" + " -h print this info\n" + " prog BPF program file name. Name defaults to\n" + " hbm_out_kern.o\n"); +} + +int main(int argc, char **argv) +{ + char *prog = "hbm_out_kern.o"; + int k; + int cg_id = 1; + char *optstring = "iodln:r:st:wh"; + + while ((k = getopt(argc, argv, optstring)) != -1) { + switch (k) { + case'o': + break; + case 'd': + debugFlag = true; + break; + case 'l': + loopback_flag = true; + break; + case 'n': + cg_id = atoi(optarg); + break; + case 'r': + minRate = atoi(optarg) * 1.024; + rate = minRate; + break; + case 's': + stats_flag = true; + break; + case 't': + dur = atoi(optarg); + break; + case 'w': + work_conserving_flag = true; + break; + case '?': + if (optopt == 'n' || optopt == 'r' || optopt == 't') + fprintf(stderr, + "Option -%c requires an argument.\n\n", + optopt); + case 'h': + // fallthrough + default: + Usage(); + return 0; + } + } + + if (optind < argc) + prog = argv[optind]; + printf("HBM prog: %s\n", prog != NULL ? 
prog : "NULL"); + + return run_bpf_prog(prog, cg_id); +} diff --git a/samples/bpf/hbm.h b/samples/bpf/hbm.h new file mode 100644 index 000000000000..518e8147d084 --- /dev/null +++ b/samples/bpf/hbm.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (c) 2019 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Include file for Host Bandwidth Management (HBM) programs + */ +struct hbm_vqueue { + struct bpf_spin_lock lock; + /* 4 byte hole */ + unsigned long long lasttime; /* In ns */ + int credit; /* In bytes */ + unsigned int rate; /* In bytes per NS << 20 */ +}; + +struct hbm_queue_stats { + unsigned long rate; /* in Mbps*/ + unsigned long stats:1, /* get HBM stats (marked, dropped,..) */ + loopback:1; /* also limit flows using loopback */ + unsigned long long pkts_marked; + unsigned long long bytes_marked; + unsigned long long pkts_dropped; + unsigned long long bytes_dropped; + unsigned long long pkts_total; + unsigned long long bytes_total; + unsigned long long firstPacketTime; + unsigned long long lastPacketTime; +}; diff --git a/samples/bpf/hbm_kern.h b/samples/bpf/hbm_kern.h new file mode 100644 index 000000000000..c5635d924193 --- /dev/null +++ b/samples/bpf/hbm_kern.h @@ -0,0 +1,137 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (c) 2019 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Include file for sample Host Bandwidth Manager (HBM) BPF programs + */ +#define KBUILD_MODNAME "foo" +#include <stddef.h> +#include <stdbool.h> +#include <uapi/linux/bpf.h> +#include <uapi/linux/if_ether.h> +#include <uapi/linux/if_packet.h> +#include <uapi/linux/ip.h> +#include <uapi/linux/ipv6.h> +#include <uapi/linux/in.h> +#include <uapi/linux/tcp.h> +#include <uapi/linux/filter.h> +#include <uapi/linux/pkt_cls.h> +#include <net/ipv6.h> +#include <net/inet_ecn.h> +#include "bpf_endian.h" +#include "bpf_helpers.h" +#include "hbm.h" + +#define DROP_PKT 0 +#define ALLOW_PKT 1 +#define TCP_ECN_OK 1 + +#define HBM_DEBUG 0 // Set to 1 to enable debugging +#if HBM_DEBUG +#define bpf_printk(fmt, ...) \ +({ \ + char ____fmt[] = fmt; \ + bpf_trace_printk(____fmt, sizeof(____fmt), \ + ##__VA_ARGS__); \ +}) +#else +#define bpf_printk(fmt, ...) +#endif + +#define INITIAL_CREDIT_PACKETS 100 +#define MAX_BYTES_PER_PACKET 1500 +#define MARK_THRESH (40 * MAX_BYTES_PER_PACKET) +#define DROP_THRESH (80 * 5 * MAX_BYTES_PER_PACKET) +#define LARGE_PKT_DROP_THRESH (DROP_THRESH - (15 * MAX_BYTES_PER_PACKET)) +#define MARK_REGION_SIZE (LARGE_PKT_DROP_THRESH - MARK_THRESH) +#define LARGE_PKT_THRESH 120 +#define MAX_CREDIT (100 * MAX_BYTES_PER_PACKET) +#define INIT_CREDIT (INITIAL_CREDIT_PACKETS * MAX_BYTES_PER_PACKET) + +// rate in bytes per ns << 20 +#define CREDIT_PER_NS(delta, rate) ((((u64)(delta)) * (rate)) >> 20) + +struct bpf_map_def SEC("maps") queue_state = { + .type = BPF_MAP_TYPE_CGROUP_STORAGE, + .key_size = sizeof(struct bpf_cgroup_storage_key), + .value_size = sizeof(struct hbm_vqueue), +}; +BPF_ANNOTATE_KV_PAIR(queue_state, struct bpf_cgroup_storage_key, + struct hbm_vqueue); + +struct bpf_map_def SEC("maps") queue_stats = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct hbm_queue_stats), + .max_entries = 1, +}; +BPF_ANNOTATE_KV_PAIR(queue_stats, int, struct hbm_queue_stats); + +struct hbm_pkt_info { + bool is_ip; + bool 
is_tcp; + short ecn; +}; + +static __always_inline void hbm_get_pkt_info(struct __sk_buff *skb, + struct hbm_pkt_info *pkti) +{ + struct iphdr iph; + struct ipv6hdr *ip6h; + + bpf_skb_load_bytes(skb, 0, &iph, 12); + if (iph.version == 6) { + ip6h = (struct ipv6hdr *)&iph; + pkti->is_ip = true; + pkti->is_tcp = (ip6h->nexthdr == 6); + pkti->ecn = (ip6h->flow_lbl[0] >> 4) & INET_ECN_MASK; + } else if (iph.version == 4) { + pkti->is_ip = true; + pkti->is_tcp = (iph.protocol == 6); + pkti->ecn = iph.tos & INET_ECN_MASK; + } else { + pkti->is_ip = false; + pkti->is_tcp = false; + pkti->ecn = 0; + } +} + +static __always_inline void hbm_init_vqueue(struct hbm_vqueue *qdp, int rate) +{ + bpf_printk("Initializing queue_state, rate:%d\n", rate * 128); + qdp->lasttime = bpf_ktime_get_ns(); + qdp->credit = INIT_CREDIT; + qdp->rate = rate * 128; +} + +static __always_inline void hbm_update_stats(struct hbm_queue_stats *qsp, + int len, + unsigned long long curtime, + bool congestion_flag, + bool drop_flag) +{ + if (qsp != NULL) { + // Following is needed for work conserving + __sync_add_and_fetch(&(qsp->bytes_total), len); + if (qsp->stats) { + // Optionally update statistics + if (qsp->firstPacketTime == 0) + qsp->firstPacketTime = curtime; + qsp->lastPacketTime = curtime; + __sync_add_and_fetch(&(qsp->pkts_total), 1); + if (congestion_flag || drop_flag) { + __sync_add_and_fetch(&(qsp->pkts_marked), 1); + __sync_add_and_fetch(&(qsp->bytes_marked), len); + } + if (drop_flag) { + __sync_add_and_fetch(&(qsp->pkts_dropped), 1); + __sync_add_and_fetch(&(qsp->bytes_dropped), + len); + } + } + } +} diff --git a/samples/bpf/hbm_out_kern.c b/samples/bpf/hbm_out_kern.c new file mode 100644 index 000000000000..f806863d0b79 --- /dev/null +++ b/samples/bpf/hbm_out_kern.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General 
Public + * License as published by the Free Software Foundation. + * + * Sample Host Bandwidth Manager (HBM) BPF program. + * + * A cgroup skb BPF egress program to limit cgroup output bandwidth. + * It uses a modified virtual token bucket queue to limit average + * egress bandwidth. The implementation uses credits instead of tokens. + * Negative credits imply that queueing would have happened (this is + * a virtual queue, so no queueing is done by it; however, queueing may + * occur at the actual qdisc, which is not used for rate limiting). + * + * This implementation uses 3 thresholds, one to start marking packets and + * the other two to drop packets: + * CREDIT + * - <--------------------------|------------------------> + + * | | | 0 + * | Large pkt | + * | drop thresh | + * Small pkt drop Mark threshold + * thresh + * + * The effect of marking depends on the type of packet: + * a) If the packet is ECN enabled and it is a TCP packet, then the packet + * is ECN marked. + * b) If the packet is a TCP packet, then we probabilistically call tcp_cwr + * to reduce the congestion window. The current implementation uses a linear + * distribution (0% probability at marking threshold, 100% probability + * at drop threshold). + * c) If the packet is not a TCP packet, then it is dropped. + * + * If the credit is below the drop threshold, the packet is dropped. If it + * is a TCP packet, then it also calls tcp_cwr since packets dropped + * by a cgroup skb BPF program do not automatically trigger a call to + * tcp_cwr in the current kernel code. + * + * This BPF program actually uses 2 drop thresholds, one threshold + * for larger packets (> 120 bytes) and another for smaller packets. This + * protects smaller packets such as SYNs, ACKs, etc. + * + * The default bandwidth limit is set at 1Gbps but this can be changed by + * a user program through a shared BPF map. In addition, by default this BPF + * program does not limit connections using loopback.
This behavior can be + * overwritten by the user program. There is also an option to calculate + * some statistics, such as percent of packets marked or dropped, which + * the user program can access. + * + * A latter patch provides such a program (hbm.c) + */ + +#include "hbm_kern.h" + +SEC("cgroup_skb/egress") +int _hbm_out_cg(struct __sk_buff *skb) +{ + struct hbm_pkt_info pkti; + int len = skb->len; + unsigned int queue_index = 0; + unsigned long long curtime; + int credit; + signed long long delta = 0, zero = 0; + int max_credit = MAX_CREDIT; + bool congestion_flag = false; + bool drop_flag = false; + bool cwr_flag = false; + struct hbm_vqueue *qdp; + struct hbm_queue_stats *qsp = NULL; + int rv = ALLOW_PKT; + + qsp = bpf_map_lookup_elem(&queue_stats, &queue_index); + if (qsp != NULL && !qsp->loopback && (skb->ifindex == 1)) + return ALLOW_PKT; + + hbm_get_pkt_info(skb, &pkti); + + // We may want to account for the length of headers in len + // calculation, like ETH header + overhead, specially if it + // is a gso packet. But I am not doing it right now. 
+ + qdp = bpf_get_local_storage(&queue_state, 0); + if (!qdp) + return ALLOW_PKT; + else if (qdp->lasttime == 0) + hbm_init_vqueue(qdp, 1024); + + curtime = bpf_ktime_get_ns(); + + // Begin critical section + bpf_spin_lock(&qdp->lock); + credit = qdp->credit; + delta = curtime - qdp->lasttime; + /* delta < 0 implies that another process with a curtime greater + * than ours beat us to the critical section and already added + * the new credit, so we should not add it ourselves + */ + if (delta > 0) { + qdp->lasttime = curtime; + credit += CREDIT_PER_NS(delta, qdp->rate); + if (credit > MAX_CREDIT) + credit = MAX_CREDIT; + } + credit -= len; + qdp->credit = credit; + bpf_spin_unlock(&qdp->lock); + // End critical section + + // Check if we should update rate + if (qsp != NULL && (qsp->rate * 128) != qdp->rate) { + qdp->rate = qsp->rate * 128; + bpf_printk("Updating rate: %d (1sec:%llu bits)\n", + (int)qdp->rate, + CREDIT_PER_NS(1000000000, qdp->rate) * 8); + } + + // Set flags (drop, congestion, cwr) + // Dropping => we are congested, so ignore congestion flag + if (credit < -DROP_THRESH || + (len > LARGE_PKT_THRESH && + credit < -LARGE_PKT_DROP_THRESH)) { + // Very congested, set drop flag + drop_flag = true; + } else if (credit < 0) { + // Congested, set congestion flag + if (pkti.ecn) { + if (credit < -MARK_THRESH) + congestion_flag = true; + else + congestion_flag = false; + } else { + congestion_flag = true; + } + } + + if (congestion_flag) { + if (!bpf_skb_ecn_set_ce(skb)) { + if (len > LARGE_PKT_THRESH) { + // Problem if too many small packets? 
+ drop_flag = true; + } + } + } + + if (drop_flag) + rv = DROP_PKT; + + hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag); + + if (rv == DROP_PKT) + __sync_add_and_fetch(&(qdp->credit), len); + + return rv; +} +char _license[] SEC("license") = "GPL"; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2e308e90ffea..3c38ac9a92a7 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2359,6 +2359,13 @@ union bpf_attr { * Return * A **struct bpf_tcp_sock** pointer on success, or NULL in * case of failure. + * + * int bpf_skb_ecn_set_ce(struct sk_buf *skb) + * Description + * Sets ECN of IP header to ce (congestion encountered) if + * current value is ect (ECN capable). Works with IPv6 and IPv4. + * Return + * 1 if set, 0 if not set. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2457,7 +2464,8 @@ union bpf_attr { FN(spin_lock), \ FN(spin_unlock), \ FN(sk_fullsock), \ - FN(tcp_sock), + FN(tcp_sock), \ + FN(skb_ecn_set_ce), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 026bea831e03..c9433a496d54 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -180,6 +180,8 @@ static struct bpf_sock *(*bpf_sk_fullsock)(struct bpf_sock *sk) = (void *) BPF_FUNC_sk_fullsock; static struct bpf_tcp_sock *(*bpf_tcp_sock)(struct bpf_sock *sk) = (void *) BPF_FUNC_tcp_sock; +static int (*bpf_skb_ecn_set_ce)(void *ctx) = + (void *) BPF_FUNC_skb_ecn_set_ce; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions |