From f7c917ba11a67632a8452ea99fe132f626a7a2cc Mon Sep 17 00:00:00 2001 From: brakmo Date: Fri, 1 Mar 2019 12:38:46 -0800 Subject: bpf: add bpf helper bpf_skb_ecn_set_ce This patch adds a new bpf helper BPF_FUNC_skb_ecn_set_ce "int bpf_skb_ecn_set_ce(struct sk_buff *skb)". It is added to BPF_PROG_TYPE_CGROUP_SKB typed bpf_prog which currently can be attached to the ingress and egress path. The helper is needed because his type of bpf_prog cannot modify the skb directly. This helper is used to set the ECN field of ECN capable IP packets to ce (congestion encountered) in the IPv6 or IPv4 header of the skb. It can be used by a bpf_prog to manage egress or ingress network bandwdith limit per cgroupv2 by inducing an ECN response in the TCP sender. This works best when using DCTCP. Signed-off-by: Lawrence Brakmo Signed-off-by: Martin KaFai Lau Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 10 +++++++++- net/core/filter.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2e308e90ffea..3c38ac9a92a7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2359,6 +2359,13 @@ union bpf_attr { * Return * A **struct bpf_tcp_sock** pointer on success, or NULL in * case of failure. + * + * int bpf_skb_ecn_set_ce(struct sk_buf *skb) + * Description + * Sets ECN of IP header to ce (congestion encountered) if + * current value is ect (ECN capable). Works with IPv6 and IPv4. + * Return + * 1 if set, 0 if not set. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2457,7 +2464,8 @@ union bpf_attr { FN(spin_lock), \ FN(spin_unlock), \ FN(sk_fullsock), \ - FN(tcp_sock), + FN(tcp_sock), \ + FN(skb_ecn_set_ce), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index 85749f6ec789..558ca72f2254 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5426,6 +5426,32 @@ static const struct bpf_func_proto bpf_tcp_sock_proto = { .arg1_type = ARG_PTR_TO_SOCK_COMMON, }; +BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb) +{ + unsigned int iphdr_len; + + if (skb->protocol == cpu_to_be16(ETH_P_IP)) + iphdr_len = sizeof(struct iphdr); + else if (skb->protocol == cpu_to_be16(ETH_P_IPV6)) + iphdr_len = sizeof(struct ipv6hdr); + else + return 0; + + if (skb_headlen(skb) < iphdr_len) + return 0; + + if (skb_cloned(skb) && !skb_clone_writable(skb, iphdr_len)) + return 0; + + return INET_ECN_set_ce(skb); +} + +static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = { + .func = bpf_skb_ecn_set_ce, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -5585,6 +5611,8 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #ifdef CONFIG_INET case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; + case BPF_FUNC_skb_ecn_set_ce: + return &bpf_skb_ecn_set_ce_proto; #endif default: return sk_filter_func_proto(func_id, prog); -- cgit v1.2.3-59-g8ed1b From 5cce85c640ccc9d9aab8b05c77d7d076a44d4db2 Mon Sep 17 00:00:00 2001 From: brakmo Date: Fri, 1 Mar 2019 12:38:47 -0800 Subject: bpf: sync bpf.h to tools and update bpf_helpers.h This patch syncs the uapi bpf.h to tools/ and also updates bpf_herlpers.h in tools/ Signed-off-by: Lawrence Brakmo Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/bpf.h | 10 +++++++++- tools/testing/selftests/bpf/bpf_helpers.h | 2 ++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2e308e90ffea..3c38ac9a92a7 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2359,6 +2359,13 @@ union bpf_attr { * Return * A **struct bpf_tcp_sock** pointer on success, or NULL in * case of failure. + * + * int bpf_skb_ecn_set_ce(struct sk_buf *skb) + * Description + * Sets ECN of IP header to ce (congestion encountered) if + * current value is ect (ECN capable). Works with IPv6 and IPv4. + * Return + * 1 if set, 0 if not set. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2457,7 +2464,8 @@ union bpf_attr { FN(spin_lock), \ FN(spin_unlock), \ FN(sk_fullsock), \ - FN(tcp_sock), + FN(tcp_sock), \ + FN(skb_ecn_set_ce), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 026bea831e03..c9433a496d54 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -180,6 +180,8 @@ static struct bpf_sock *(*bpf_sk_fullsock)(struct bpf_sock *sk) = (void *) BPF_FUNC_sk_fullsock; static struct bpf_tcp_sock *(*bpf_tcp_sock)(struct bpf_sock *sk) = (void *) BPF_FUNC_tcp_sock; +static int (*bpf_skb_ecn_set_ce)(void *ctx) = + (void *) BPF_FUNC_skb_ecn_set_ce; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions -- cgit v1.2.3-59-g8ed1b From 187d0738ff351f725a58be3d606d3a7fc8db8aed Mon Sep 17 00:00:00 2001 From: brakmo Date: Fri, 1 Mar 2019 12:38:48 -0800 Subject: bpf: Sample HBM BPF program to limit egress bw A cgroup skb BPF program to limit cgroup output bandwidth. It uses a modified virtual token bucket queue to limit average egress bandwidth. The implementation uses credits instead of tokens. Negative credits imply that queueing would have happened (this is a virtual queue, so no queueing is done by it. However, queueing may occur at the actual qdisc (which is not used for rate limiting). This implementation uses 3 thresholds, one to start marking packets and the other two to drop packets: CREDIT - <--------------------------|------------------------> + | | | 0 | Large pkt | | drop thresh | Small pkt drop Mark threshold thresh The effect of marking depends on the type of packet: a) If the packet is ECN enabled, then the packet is ECN ce marked. The current mark threshold is tuned for DCTCP. c) Else, it is dropped if it is a large packet. If the credit is below the drop threshold, the packet is dropped. Note that dropping a packet through the BPF program does not trigger CWR (Congestion Window Reduction) in TCP packets. A future patch will add support for triggering CWR. This BPF program actually uses 2 drop thresholds, one threshold for larger packets (>= 120 bytes) and another for smaller packets. This protects smaller packets such as SYNs, ACKs, etc. The default bandwidth limit is set at 1Gbps but this can be changed by a user program through a shared BPF map. In addition, by default this BPF program does not limit connections using loopback. This behavior can be overwritten by the user program. There is also an option to calculate some statistics, such as percent of packets marked or dropped, which the user program can access. A latter patch provides such a program (hbm.c) Signed-off-by: Lawrence Brakmo Signed-off-by: Alexei Starovoitov --- samples/bpf/Makefile | 2 + samples/bpf/hbm.h | 31 +++++++++ samples/bpf/hbm_kern.h | 137 +++++++++++++++++++++++++++++++++++++++ samples/bpf/hbm_out_kern.c | 157 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 327 insertions(+) create mode 100644 samples/bpf/hbm.h create mode 100644 samples/bpf/hbm_kern.h create mode 100644 samples/bpf/hbm_out_kern.c diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 0c62ac39c697..e1bdc96486f6 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -164,6 +164,7 @@ always += xdp_adjust_tail_kern.o always += xdp_fwd_kern.o always += task_fd_query_kern.o always += xdp_sample_pkts_kern.o +always += hbm_out_kern.o KBUILD_HOSTCFLAGS += -I$(objtree)/usr/include KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/ @@ -263,6 +264,7 @@ $(BPF_SAMPLES_PATH)/*.c: verify_target_bpf $(LIBBPF) $(src)/*.c: verify_target_bpf $(LIBBPF) $(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h +$(obj)/hbm_out_kern.o: $(src)/hbm.h $(src)/hbm_kern.h # asm/sysreg.h - inline assembly used by it is incompatible with llvm. # But, there is no easy way to fix it, so just exclude it since it is diff --git a/samples/bpf/hbm.h b/samples/bpf/hbm.h new file mode 100644 index 000000000000..518e8147d084 --- /dev/null +++ b/samples/bpf/hbm.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (c) 2019 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Include file for Host Bandwidth Management (HBM) programs + */ +struct hbm_vqueue { + struct bpf_spin_lock lock; + /* 4 byte hole */ + unsigned long long lasttime; /* In ns */ + int credit; /* In bytes */ + unsigned int rate; /* In bytes per NS << 20 */ +}; + +struct hbm_queue_stats { + unsigned long rate; /* in Mbps*/ + unsigned long stats:1, /* get HBM stats (marked, dropped,..) */ + loopback:1; /* also limit flows using loopback */ + unsigned long long pkts_marked; + unsigned long long bytes_marked; + unsigned long long pkts_dropped; + unsigned long long bytes_dropped; + unsigned long long pkts_total; + unsigned long long bytes_total; + unsigned long long firstPacketTime; + unsigned long long lastPacketTime; +}; diff --git a/samples/bpf/hbm_kern.h b/samples/bpf/hbm_kern.h new file mode 100644 index 000000000000..c5635d924193 --- /dev/null +++ b/samples/bpf/hbm_kern.h @@ -0,0 +1,137 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (c) 2019 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Include file for sample Host Bandwidth Manager (HBM) BPF programs + */ +#define KBUILD_MODNAME "foo" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "bpf_endian.h" +#include "bpf_helpers.h" +#include "hbm.h" + +#define DROP_PKT 0 +#define ALLOW_PKT 1 +#define TCP_ECN_OK 1 + +#define HBM_DEBUG 0 // Set to 1 to enable debugging +#if HBM_DEBUG +#define bpf_printk(fmt, ...) \ +({ \ + char ____fmt[] = fmt; \ + bpf_trace_printk(____fmt, sizeof(____fmt), \ + ##__VA_ARGS__); \ +}) +#else +#define bpf_printk(fmt, ...) +#endif + +#define INITIAL_CREDIT_PACKETS 100 +#define MAX_BYTES_PER_PACKET 1500 +#define MARK_THRESH (40 * MAX_BYTES_PER_PACKET) +#define DROP_THRESH (80 * 5 * MAX_BYTES_PER_PACKET) +#define LARGE_PKT_DROP_THRESH (DROP_THRESH - (15 * MAX_BYTES_PER_PACKET)) +#define MARK_REGION_SIZE (LARGE_PKT_DROP_THRESH - MARK_THRESH) +#define LARGE_PKT_THRESH 120 +#define MAX_CREDIT (100 * MAX_BYTES_PER_PACKET) +#define INIT_CREDIT (INITIAL_CREDIT_PACKETS * MAX_BYTES_PER_PACKET) + +// rate in bytes per ns << 20 +#define CREDIT_PER_NS(delta, rate) ((((u64)(delta)) * (rate)) >> 20) + +struct bpf_map_def SEC("maps") queue_state = { + .type = BPF_MAP_TYPE_CGROUP_STORAGE, + .key_size = sizeof(struct bpf_cgroup_storage_key), + .value_size = sizeof(struct hbm_vqueue), +}; +BPF_ANNOTATE_KV_PAIR(queue_state, struct bpf_cgroup_storage_key, + struct hbm_vqueue); + +struct bpf_map_def SEC("maps") queue_stats = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(struct hbm_queue_stats), + .max_entries = 1, +}; +BPF_ANNOTATE_KV_PAIR(queue_stats, int, struct hbm_queue_stats); + +struct hbm_pkt_info { + bool is_ip; + bool is_tcp; + short ecn; +}; + +static __always_inline void hbm_get_pkt_info(struct __sk_buff *skb, + struct hbm_pkt_info *pkti) +{ + struct iphdr iph; + struct ipv6hdr *ip6h; + + bpf_skb_load_bytes(skb, 0, &iph, 12); + if (iph.version == 6) { + ip6h = (struct ipv6hdr *)&iph; + pkti->is_ip = true; + pkti->is_tcp = (ip6h->nexthdr == 6); + pkti->ecn = (ip6h->flow_lbl[0] >> 4) & INET_ECN_MASK; + } else if (iph.version == 4) { + pkti->is_ip = true; + pkti->is_tcp = (iph.protocol == 6); + pkti->ecn = iph.tos & INET_ECN_MASK; + } else { + pkti->is_ip = false; + pkti->is_tcp = false; + pkti->ecn = 0; + } +} + +static __always_inline void hbm_init_vqueue(struct hbm_vqueue *qdp, int rate) +{ + bpf_printk("Initializing queue_state, rate:%d\n", rate * 128); + qdp->lasttime = bpf_ktime_get_ns(); + qdp->credit = INIT_CREDIT; + qdp->rate = rate * 128; +} + +static __always_inline void hbm_update_stats(struct hbm_queue_stats *qsp, + int len, + unsigned long long curtime, + bool congestion_flag, + bool drop_flag) +{ + if (qsp != NULL) { + // Following is needed for work conserving + __sync_add_and_fetch(&(qsp->bytes_total), len); + if (qsp->stats) { + // Optionally update statistics + if (qsp->firstPacketTime == 0) + qsp->firstPacketTime = curtime; + qsp->lastPacketTime = curtime; + __sync_add_and_fetch(&(qsp->pkts_total), 1); + if (congestion_flag || drop_flag) { + __sync_add_and_fetch(&(qsp->pkts_marked), 1); + __sync_add_and_fetch(&(qsp->bytes_marked), len); + } + if (drop_flag) { + __sync_add_and_fetch(&(qsp->pkts_dropped), 1); + __sync_add_and_fetch(&(qsp->bytes_dropped), + len); + } + } + } +} diff --git a/samples/bpf/hbm_out_kern.c b/samples/bpf/hbm_out_kern.c new file mode 100644 index 000000000000..f806863d0b79 --- /dev/null +++ b/samples/bpf/hbm_out_kern.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Sample Host Bandwidth Manager (HBM) BPF program. + * + * A cgroup skb BPF egress program to limit cgroup output bandwidth. + * It uses a modified virtual token bucket queue to limit average + * egress bandwidth. The implementation uses credits instead of tokens. + * Negative credits imply that queueing would have happened (this is + * a virtual queue, so no queueing is done by it. However, queueing may + * occur at the actual qdisc (which is not used for rate limiting). + * + * This implementation uses 3 thresholds, one to start marking packets and + * the other two to drop packets: + * CREDIT + * - <--------------------------|------------------------> + + * | | | 0 + * | Large pkt | + * | drop thresh | + * Small pkt drop Mark threshold + * thresh + * + * The effect of marking depends on the type of packet: + * a) If the packet is ECN enabled and it is a TCP packet, then the packet + * is ECN marked. + * b) If the packet is a TCP packet, then we probabilistically call tcp_cwr + * to reduce the congestion window. The current implementation uses a linear + * distribution (0% probability at marking threshold, 100% probability + * at drop threshold). + * c) If the packet is not a TCP packet, then it is dropped. + * + * If the credit is below the drop threshold, the packet is dropped. If it + * is a TCP packet, then it also calls tcp_cwr since packets dropped by + * by a cgroup skb BPF program do not automatically trigger a call to + * tcp_cwr in the current kernel code. + * + * This BPF program actually uses 2 drop thresholds, one threshold + * for larger packets (>= 120 bytes) and another for smaller packets. This + * protects smaller packets such as SYNs, ACKs, etc. + * + * The default bandwidth limit is set at 1Gbps but this can be changed by + * a user program through a shared BPF map. In addition, by default this BPF + * program does not limit connections using loopback. This behavior can be + * overwritten by the user program. There is also an option to calculate + * some statistics, such as percent of packets marked or dropped, which + * the user program can access. + * + * A latter patch provides such a program (hbm.c) + */ + +#include "hbm_kern.h" + +SEC("cgroup_skb/egress") +int _hbm_out_cg(struct __sk_buff *skb) +{ + struct hbm_pkt_info pkti; + int len = skb->len; + unsigned int queue_index = 0; + unsigned long long curtime; + int credit; + signed long long delta = 0, zero = 0; + int max_credit = MAX_CREDIT; + bool congestion_flag = false; + bool drop_flag = false; + bool cwr_flag = false; + struct hbm_vqueue *qdp; + struct hbm_queue_stats *qsp = NULL; + int rv = ALLOW_PKT; + + qsp = bpf_map_lookup_elem(&queue_stats, &queue_index); + if (qsp != NULL && !qsp->loopback && (skb->ifindex == 1)) + return ALLOW_PKT; + + hbm_get_pkt_info(skb, &pkti); + + // We may want to account for the length of headers in len + // calculation, like ETH header + overhead, specially if it + // is a gso packet. But I am not doing it right now. + + qdp = bpf_get_local_storage(&queue_state, 0); + if (!qdp) + return ALLOW_PKT; + else if (qdp->lasttime == 0) + hbm_init_vqueue(qdp, 1024); + + curtime = bpf_ktime_get_ns(); + + // Begin critical section + bpf_spin_lock(&qdp->lock); + credit = qdp->credit; + delta = curtime - qdp->lasttime; + /* delta < 0 implies that another process with a curtime greater + * than ours beat us to the critical section and already added + * the new credit, so we should not add it ourselves + */ + if (delta > 0) { + qdp->lasttime = curtime; + credit += CREDIT_PER_NS(delta, qdp->rate); + if (credit > MAX_CREDIT) + credit = MAX_CREDIT; + } + credit -= len; + qdp->credit = credit; + bpf_spin_unlock(&qdp->lock); + // End critical section + + // Check if we should update rate + if (qsp != NULL && (qsp->rate * 128) != qdp->rate) { + qdp->rate = qsp->rate * 128; + bpf_printk("Updating rate: %d (1sec:%llu bits)\n", + (int)qdp->rate, + CREDIT_PER_NS(1000000000, qdp->rate) * 8); + } + + // Set flags (drop, congestion, cwr) + // Dropping => we are congested, so ignore congestion flag + if (credit < -DROP_THRESH || + (len > LARGE_PKT_THRESH && + credit < -LARGE_PKT_DROP_THRESH)) { + // Very congested, set drop flag + drop_flag = true; + } else if (credit < 0) { + // Congested, set congestion flag + if (pkti.ecn) { + if (credit < -MARK_THRESH) + congestion_flag = true; + else + congestion_flag = false; + } else { + congestion_flag = true; + } + } + + if (congestion_flag) { + if (!bpf_skb_ecn_set_ce(skb)) { + if (len > LARGE_PKT_THRESH) { + // Problem if too many small packets? + drop_flag = true; + } + } + } + + if (drop_flag) + rv = DROP_PKT; + + hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag); + + if (rv == DROP_PKT) + __sync_add_and_fetch(&(qdp->credit), len); + + return rv; +} +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3-59-g8ed1b From a1270fe95b74eb3195b107c494ed1f11b932a278 Mon Sep 17 00:00:00 2001 From: brakmo Date: Fri, 1 Mar 2019 12:38:49 -0800 Subject: bpf: User program for testing HBM The program nrm creates a cgroup and attaches a BPF program to the cgroup for testing HBM (Host Bandwidth Manager) for egress traffic. One still needs to create network traffic. This can be done through netesto, netperf or iperf3. A follow-up patch contains a script to create traffic. USAGE: hbm [-d] [-l] [-n ] [-r ] [-s] [-t ] [-w] [-h] [prog] Where: -d Print BPF trace debug buffer -l Also limit flows doing loopback -n <#> To create cgroup "/hbm#" and attach prog. Default is /nrm1 This is convenient when testing HBM in more than 1 cgroup -r Rate limit in Mbps -s Get HBM stats (marked, dropped, etc.) -t