From f7c917ba11a67632a8452ea99fe132f626a7a2cc Mon Sep 17 00:00:00 2001
From: brakmo <brakmo@fb.com>
Date: Fri, 1 Mar 2019 12:38:46 -0800
Subject: bpf: add bpf helper bpf_skb_ecn_set_ce

This patch adds a new bpf helper BPF_FUNC_skb_ecn_set_ce
"int bpf_skb_ecn_set_ce(struct sk_buff *skb)". It is added to
BPF_PROG_TYPE_CGROUP_SKB typed bpf_prog which currently can
be attached to the ingress and egress path. The helper is needed
because his type of bpf_prog cannot modify the skb directly.

This helper is used to set the ECN field of ECN capable IP packets to ce
(congestion encountered) in the IPv6 or IPv4 header of the skb. It can be
used by a bpf_prog to manage egress or ingress network bandwdith limit
per cgroupv2 by inducing an ECN response in the TCP sender.
This works best when using DCTCP.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h | 10 +++++++++-
 net/core/filter.c        | 28 ++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2e308e90ffea..3c38ac9a92a7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2359,6 +2359,13 @@ union bpf_attr {
  *	Return
  *		A **struct bpf_tcp_sock** pointer on success, or NULL in
  *		case of failure.
+ *
+ * int bpf_skb_ecn_set_ce(struct sk_buf *skb)
+ *     Description
+ *             Sets ECN of IP header to ce (congestion encountered) if
+ *             current value is ect (ECN capable). Works with IPv6 and IPv4.
+ *     Return
+ *             1 if set, 0 if not set.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2457,7 +2464,8 @@ union bpf_attr {
 	FN(spin_lock),			\
 	FN(spin_unlock),		\
 	FN(sk_fullsock),		\
-	FN(tcp_sock),
+	FN(tcp_sock),			\
+	FN(skb_ecn_set_ce),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/net/core/filter.c b/net/core/filter.c
index 85749f6ec789..558ca72f2254 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5426,6 +5426,32 @@ static const struct bpf_func_proto bpf_tcp_sock_proto = {
 	.arg1_type	= ARG_PTR_TO_SOCK_COMMON,
 };
 
+BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb)
+{
+	unsigned int iphdr_len;
+
+	if (skb->protocol == cpu_to_be16(ETH_P_IP))
+		iphdr_len = sizeof(struct iphdr);
+	else if (skb->protocol == cpu_to_be16(ETH_P_IPV6))
+		iphdr_len = sizeof(struct ipv6hdr);
+	else
+		return 0;
+
+	if (skb_headlen(skb) < iphdr_len)
+		return 0;
+
+	if (skb_cloned(skb) && !skb_clone_writable(skb, iphdr_len))
+		return 0;
+
+	return INET_ECN_set_ce(skb);
+}
+
+static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = {
+	.func           = bpf_skb_ecn_set_ce,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type      = ARG_PTR_TO_CTX,
+};
 #endif /* CONFIG_INET */
 
 bool bpf_helper_changes_pkt_data(void *func)
@@ -5585,6 +5611,8 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 #ifdef CONFIG_INET
 	case BPF_FUNC_tcp_sock:
 		return &bpf_tcp_sock_proto;
+	case BPF_FUNC_skb_ecn_set_ce:
+		return &bpf_skb_ecn_set_ce_proto;
 #endif
 	default:
 		return sk_filter_func_proto(func_id, prog);
-- 
cgit v1.2.3-59-g8ed1b


From 5cce85c640ccc9d9aab8b05c77d7d076a44d4db2 Mon Sep 17 00:00:00 2001
From: brakmo <brakmo@fb.com>
Date: Fri, 1 Mar 2019 12:38:47 -0800
Subject: bpf: sync bpf.h to tools and update bpf_helpers.h

This patch syncs the uapi bpf.h to tools/ and also updates
bpf_herlpers.h in tools/

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Acked-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 tools/include/uapi/linux/bpf.h            | 10 +++++++++-
 tools/testing/selftests/bpf/bpf_helpers.h |  2 ++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 2e308e90ffea..3c38ac9a92a7 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2359,6 +2359,13 @@ union bpf_attr {
  *	Return
  *		A **struct bpf_tcp_sock** pointer on success, or NULL in
  *		case of failure.
+ *
+ * int bpf_skb_ecn_set_ce(struct sk_buf *skb)
+ *     Description
+ *             Sets ECN of IP header to ce (congestion encountered) if
+ *             current value is ect (ECN capable). Works with IPv6 and IPv4.
+ *     Return
+ *             1 if set, 0 if not set.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2457,7 +2464,8 @@ union bpf_attr {
 	FN(spin_lock),			\
 	FN(spin_unlock),		\
 	FN(sk_fullsock),		\
-	FN(tcp_sock),
+	FN(tcp_sock),			\
+	FN(skb_ecn_set_ce),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 026bea831e03..c9433a496d54 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -180,6 +180,8 @@ static struct bpf_sock *(*bpf_sk_fullsock)(struct bpf_sock *sk) =
 	(void *) BPF_FUNC_sk_fullsock;
 static struct bpf_tcp_sock *(*bpf_tcp_sock)(struct bpf_sock *sk) =
 	(void *) BPF_FUNC_tcp_sock;
+static int (*bpf_skb_ecn_set_ce)(void *ctx) =
+	(void *) BPF_FUNC_skb_ecn_set_ce;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
-- 
cgit v1.2.3-59-g8ed1b


From 187d0738ff351f725a58be3d606d3a7fc8db8aed Mon Sep 17 00:00:00 2001
From: brakmo <brakmo@fb.com>
Date: Fri, 1 Mar 2019 12:38:48 -0800
Subject: bpf: Sample HBM BPF program to limit egress bw

A cgroup skb BPF program to limit cgroup output bandwidth.
It uses a modified virtual token bucket queue to limit average
egress bandwidth. The implementation uses credits instead of tokens.
Negative credits imply that queueing would have happened (this is
a virtual queue, so no queueing is done by it. However, queueing may
occur at the actual qdisc (which is not used for rate limiting).

This implementation uses 3 thresholds, one to start marking packets and
the other two to drop packets:
                                 CREDIT
       - <--------------------------|------------------------> +
             |    |          |      0
             |  Large pkt    |
             |  drop thresh  |
  Small pkt drop             Mark threshold
      thresh

The effect of marking depends on the type of packet:
a) If the packet is ECN enabled, then the packet is ECN ce marked.
   The current mark threshold is tuned for DCTCP.
c) Else, it is dropped if it is a large packet.

If the credit is below the drop threshold, the packet is dropped.
Note that dropping a packet through the BPF program does not trigger CWR
(Congestion Window Reduction) in TCP packets. A future patch will add
support for triggering CWR.

This BPF program actually uses 2 drop thresholds, one threshold
for larger packets (>= 120 bytes) and another for smaller packets. This
protects smaller packets such as SYNs, ACKs, etc.

The default bandwidth limit is set at 1Gbps but this can be changed by
a user program through a shared BPF map. In addition, by default this BPF
program does not limit connections using loopback. This behavior can be
overwritten by the user program. There is also an option to calculate
some statistics, such as percent of packets marked or dropped, which
the user program can access.

A latter patch provides such a program (hbm.c)

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 samples/bpf/Makefile       |   2 +
 samples/bpf/hbm.h          |  31 +++++++++
 samples/bpf/hbm_kern.h     | 137 +++++++++++++++++++++++++++++++++++++++
 samples/bpf/hbm_out_kern.c | 157 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 327 insertions(+)
 create mode 100644 samples/bpf/hbm.h
 create mode 100644 samples/bpf/hbm_kern.h
 create mode 100644 samples/bpf/hbm_out_kern.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 0c62ac39c697..e1bdc96486f6 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -164,6 +164,7 @@ always += xdp_adjust_tail_kern.o
 always += xdp_fwd_kern.o
 always += task_fd_query_kern.o
 always += xdp_sample_pkts_kern.o
+always += hbm_out_kern.o
 
 KBUILD_HOSTCFLAGS += -I$(objtree)/usr/include
 KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -263,6 +264,7 @@ $(BPF_SAMPLES_PATH)/*.c: verify_target_bpf $(LIBBPF)
 $(src)/*.c: verify_target_bpf $(LIBBPF)
 
 $(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h
+$(obj)/hbm_out_kern.o: $(src)/hbm.h $(src)/hbm_kern.h
 
 # asm/sysreg.h - inline assembly used by it is incompatible with llvm.
 # But, there is no easy way to fix it, so just exclude it since it is
diff --git a/samples/bpf/hbm.h b/samples/bpf/hbm.h
new file mode 100644
index 000000000000..518e8147d084
--- /dev/null
+++ b/samples/bpf/hbm.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (c) 2019 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Include file for Host Bandwidth Management (HBM) programs
+ */
+struct hbm_vqueue {
+	struct bpf_spin_lock lock;
+	/* 4 byte hole */
+	unsigned long long lasttime;	/* In ns */
+	int credit;			/* In bytes */
+	unsigned int rate;		/* In bytes per NS << 20 */
+};
+
+struct hbm_queue_stats {
+	unsigned long rate;		/* in Mbps*/
+	unsigned long stats:1,		/* get HBM stats (marked, dropped,..) */
+		loopback:1;		/* also limit flows using loopback */
+	unsigned long long pkts_marked;
+	unsigned long long bytes_marked;
+	unsigned long long pkts_dropped;
+	unsigned long long bytes_dropped;
+	unsigned long long pkts_total;
+	unsigned long long bytes_total;
+	unsigned long long firstPacketTime;
+	unsigned long long lastPacketTime;
+};
diff --git a/samples/bpf/hbm_kern.h b/samples/bpf/hbm_kern.h
new file mode 100644
index 000000000000..c5635d924193
--- /dev/null
+++ b/samples/bpf/hbm_kern.h
@@ -0,0 +1,137 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (c) 2019 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Include file for sample Host Bandwidth Manager (HBM) BPF programs
+ */
+#define KBUILD_MODNAME "foo"
+#include <stddef.h>
+#include <stdbool.h>
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include <uapi/linux/ipv6.h>
+#include <uapi/linux/in.h>
+#include <uapi/linux/tcp.h>
+#include <uapi/linux/filter.h>
+#include <uapi/linux/pkt_cls.h>
+#include <net/ipv6.h>
+#include <net/inet_ecn.h>
+#include "bpf_endian.h"
+#include "bpf_helpers.h"
+#include "hbm.h"
+
+#define DROP_PKT	0
+#define ALLOW_PKT	1
+#define TCP_ECN_OK	1
+
+#define HBM_DEBUG 0  // Set to 1 to enable debugging
+#if HBM_DEBUG
+#define bpf_printk(fmt, ...)					\
+({								\
+	char ____fmt[] = fmt;					\
+	bpf_trace_printk(____fmt, sizeof(____fmt),		\
+			 ##__VA_ARGS__);			\
+})
+#else
+#define bpf_printk(fmt, ...)
+#endif
+
+#define INITIAL_CREDIT_PACKETS	100
+#define MAX_BYTES_PER_PACKET	1500
+#define MARK_THRESH		(40 * MAX_BYTES_PER_PACKET)
+#define DROP_THRESH		(80 * 5 * MAX_BYTES_PER_PACKET)
+#define LARGE_PKT_DROP_THRESH	(DROP_THRESH - (15 * MAX_BYTES_PER_PACKET))
+#define MARK_REGION_SIZE	(LARGE_PKT_DROP_THRESH - MARK_THRESH)
+#define LARGE_PKT_THRESH	120
+#define MAX_CREDIT		(100 * MAX_BYTES_PER_PACKET)
+#define INIT_CREDIT		(INITIAL_CREDIT_PACKETS * MAX_BYTES_PER_PACKET)
+
+// rate in bytes per ns << 20
+#define CREDIT_PER_NS(delta, rate) ((((u64)(delta)) * (rate)) >> 20)
+
+struct bpf_map_def SEC("maps") queue_state = {
+	.type = BPF_MAP_TYPE_CGROUP_STORAGE,
+	.key_size = sizeof(struct bpf_cgroup_storage_key),
+	.value_size = sizeof(struct hbm_vqueue),
+};
+BPF_ANNOTATE_KV_PAIR(queue_state, struct bpf_cgroup_storage_key,
+		     struct hbm_vqueue);
+
+struct bpf_map_def SEC("maps") queue_stats = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(u32),
+	.value_size = sizeof(struct hbm_queue_stats),
+	.max_entries = 1,
+};
+BPF_ANNOTATE_KV_PAIR(queue_stats, int, struct hbm_queue_stats);
+
+struct hbm_pkt_info {
+	bool	is_ip;
+	bool	is_tcp;
+	short	ecn;
+};
+
+static __always_inline void hbm_get_pkt_info(struct __sk_buff *skb,
+					     struct hbm_pkt_info *pkti)
+{
+	struct iphdr iph;
+	struct ipv6hdr *ip6h;
+
+	bpf_skb_load_bytes(skb, 0, &iph, 12);
+	if (iph.version == 6) {
+		ip6h = (struct ipv6hdr *)&iph;
+		pkti->is_ip = true;
+		pkti->is_tcp = (ip6h->nexthdr == 6);
+		pkti->ecn = (ip6h->flow_lbl[0] >> 4) & INET_ECN_MASK;
+	} else if (iph.version == 4) {
+		pkti->is_ip = true;
+		pkti->is_tcp = (iph.protocol == 6);
+		pkti->ecn = iph.tos & INET_ECN_MASK;
+	} else {
+		pkti->is_ip = false;
+		pkti->is_tcp = false;
+		pkti->ecn = 0;
+	}
+}
+
+static __always_inline void hbm_init_vqueue(struct hbm_vqueue *qdp, int rate)
+{
+		bpf_printk("Initializing queue_state, rate:%d\n", rate * 128);
+		qdp->lasttime = bpf_ktime_get_ns();
+		qdp->credit = INIT_CREDIT;
+		qdp->rate = rate * 128;
+}
+
+static __always_inline void hbm_update_stats(struct hbm_queue_stats *qsp,
+					     int len,
+					     unsigned long long curtime,
+					     bool congestion_flag,
+					     bool drop_flag)
+{
+	if (qsp != NULL) {
+		// Following is needed for work conserving
+		__sync_add_and_fetch(&(qsp->bytes_total), len);
+		if (qsp->stats) {
+			// Optionally update statistics
+			if (qsp->firstPacketTime == 0)
+				qsp->firstPacketTime = curtime;
+			qsp->lastPacketTime = curtime;
+			__sync_add_and_fetch(&(qsp->pkts_total), 1);
+			if (congestion_flag || drop_flag) {
+				__sync_add_and_fetch(&(qsp->pkts_marked), 1);
+				__sync_add_and_fetch(&(qsp->bytes_marked), len);
+			}
+			if (drop_flag) {
+				__sync_add_and_fetch(&(qsp->pkts_dropped), 1);
+				__sync_add_and_fetch(&(qsp->bytes_dropped),
+						     len);
+			}
+		}
+	}
+}
diff --git a/samples/bpf/hbm_out_kern.c b/samples/bpf/hbm_out_kern.c
new file mode 100644
index 000000000000..f806863d0b79
--- /dev/null
+++ b/samples/bpf/hbm_out_kern.c
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Sample Host Bandwidth Manager (HBM) BPF program.
+ *
+ * A cgroup skb BPF egress program to limit cgroup output bandwidth.
+ * It uses a modified virtual token bucket queue to limit average
+ * egress bandwidth. The implementation uses credits instead of tokens.
+ * Negative credits imply that queueing would have happened (this is
+ * a virtual queue, so no queueing is done by it. However, queueing may
+ * occur at the actual qdisc (which is not used for rate limiting).
+ *
+ * This implementation uses 3 thresholds, one to start marking packets and
+ * the other two to drop packets:
+ *                                  CREDIT
+ *        - <--------------------------|------------------------> +
+ *              |    |          |      0
+ *              |  Large pkt    |
+ *              |  drop thresh  |
+ *   Small pkt drop             Mark threshold
+ *       thresh
+ *
+ * The effect of marking depends on the type of packet:
+ * a) If the packet is ECN enabled and it is a TCP packet, then the packet
+ *    is ECN marked.
+ * b) If the packet is a TCP packet, then we probabilistically call tcp_cwr
+ *    to reduce the congestion window. The current implementation uses a linear
+ *    distribution (0% probability at marking threshold, 100% probability
+ *    at drop threshold).
+ * c) If the packet is not a TCP packet, then it is dropped.
+ *
+ * If the credit is below the drop threshold, the packet is dropped. If it
+ * is a TCP packet, then it also calls tcp_cwr since packets dropped by
+ * by a cgroup skb BPF program do not automatically trigger a call to
+ * tcp_cwr in the current kernel code.
+ *
+ * This BPF program actually uses 2 drop thresholds, one threshold
+ * for larger packets (>= 120 bytes) and another for smaller packets. This
+ * protects smaller packets such as SYNs, ACKs, etc.
+ *
+ * The default bandwidth limit is set at 1Gbps but this can be changed by
+ * a user program through a shared BPF map. In addition, by default this BPF
+ * program does not limit connections using loopback. This behavior can be
+ * overwritten by the user program. There is also an option to calculate
+ * some statistics, such as percent of packets marked or dropped, which
+ * the user program can access.
+ *
+ * A latter patch provides such a program (hbm.c)
+ */
+
+#include "hbm_kern.h"
+
+SEC("cgroup_skb/egress")
+int _hbm_out_cg(struct __sk_buff *skb)
+{
+	struct hbm_pkt_info pkti;
+	int len = skb->len;
+	unsigned int queue_index = 0;
+	unsigned long long curtime;
+	int credit;
+	signed long long delta = 0, zero = 0;
+	int max_credit = MAX_CREDIT;
+	bool congestion_flag = false;
+	bool drop_flag = false;
+	bool cwr_flag = false;
+	struct hbm_vqueue *qdp;
+	struct hbm_queue_stats *qsp = NULL;
+	int rv = ALLOW_PKT;
+
+	qsp = bpf_map_lookup_elem(&queue_stats, &queue_index);
+	if (qsp != NULL && !qsp->loopback && (skb->ifindex == 1))
+		return ALLOW_PKT;
+
+	hbm_get_pkt_info(skb, &pkti);
+
+	// We may want to account for the length of headers in len
+	// calculation, like ETH header + overhead, specially if it
+	// is a gso packet. But I am not doing it right now.
+
+	qdp = bpf_get_local_storage(&queue_state, 0);
+	if (!qdp)
+		return ALLOW_PKT;
+	else if (qdp->lasttime == 0)
+		hbm_init_vqueue(qdp, 1024);
+
+	curtime = bpf_ktime_get_ns();
+
+	// Begin critical section
+	bpf_spin_lock(&qdp->lock);
+	credit = qdp->credit;
+	delta = curtime - qdp->lasttime;
+	/* delta < 0 implies that another process with a curtime greater
+	 * than ours beat us to the critical section and already added
+	 * the new credit, so we should not add it ourselves
+	 */
+	if (delta > 0) {
+		qdp->lasttime = curtime;
+		credit += CREDIT_PER_NS(delta, qdp->rate);
+		if (credit > MAX_CREDIT)
+			credit = MAX_CREDIT;
+	}
+	credit -= len;
+	qdp->credit = credit;
+	bpf_spin_unlock(&qdp->lock);
+	// End critical section
+
+	// Check if we should update rate
+	if (qsp != NULL && (qsp->rate * 128) != qdp->rate) {
+		qdp->rate = qsp->rate * 128;
+		bpf_printk("Updating rate: %d (1sec:%llu bits)\n",
+			   (int)qdp->rate,
+			   CREDIT_PER_NS(1000000000, qdp->rate) * 8);
+	}
+
+	// Set flags (drop, congestion, cwr)
+	// Dropping => we are congested, so ignore congestion flag
+	if (credit < -DROP_THRESH ||
+	    (len > LARGE_PKT_THRESH &&
+	     credit < -LARGE_PKT_DROP_THRESH)) {
+		// Very congested, set drop flag
+		drop_flag = true;
+	} else if (credit < 0) {
+		// Congested, set congestion flag
+		if (pkti.ecn) {
+			if (credit < -MARK_THRESH)
+				congestion_flag = true;
+			else
+				congestion_flag = false;
+		} else {
+			congestion_flag = true;
+		}
+	}
+
+	if (congestion_flag) {
+		if (!bpf_skb_ecn_set_ce(skb)) {
+			if (len > LARGE_PKT_THRESH) {
+				// Problem if too many small packets?
+				drop_flag = true;
+			}
+		}
+	}
+
+	if (drop_flag)
+		rv = DROP_PKT;
+
+	hbm_update_stats(qsp, len, curtime, congestion_flag, drop_flag);
+
+	if (rv == DROP_PKT)
+		__sync_add_and_fetch(&(qdp->credit), len);
+
+	return rv;
+}
+char _license[] SEC("license") = "GPL";
-- 
cgit v1.2.3-59-g8ed1b


From a1270fe95b74eb3195b107c494ed1f11b932a278 Mon Sep 17 00:00:00 2001
From: brakmo <brakmo@fb.com>
Date: Fri, 1 Mar 2019 12:38:49 -0800
Subject: bpf: User program for testing HBM

The program nrm creates a cgroup and attaches a BPF program to the
cgroup for testing HBM (Host Bandwidth Manager) for egress traffic.
One still needs to create network traffic. This can be done through
netesto, netperf or iperf3.
A follow-up patch contains a script to create traffic.

USAGE: hbm [-d] [-l] [-n <id>] [-r <rate>] [-s] [-t <secs>]
           [-w] [-h] [prog]
  Where:
   -d        Print BPF trace debug buffer
   -l        Also limit flows doing loopback
   -n <#>    To create cgroup "/hbm#" and attach prog. Default is /nrm1
             This is convenient when testing HBM in more than 1 cgroup
   -r <rate> Rate limit in Mbps
   -s        Get HBM stats (marked, dropped, etc.)
   -t <time> Exit after specified seconds (deault is 0)
   -w        Work conserving flag. cgroup can increase its bandwidth
             beyond the rate limit specified while there is available
             bandwidth. Current implementation assumes there is only
             NIC (eth0), but can be extended to support multiple NICs.
             Currrently only supported for egress. Note, this is just
	     a proof of concept.
   -h        Print this info
   prog      BPF program file name. Name defaults to hbm_out_kern.o

More information about HBM can be found in the paper "BPF Host Resource
Management" presented at the 2018 Linux Plumbers Conference, Networking Track
(http://vger.kernel.org/lpc_net2018_talks/LPC%20BPF%20Network%20Resource%20Paper.pdf)

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 samples/bpf/Makefile |   3 +
 samples/bpf/hbm.c    | 441 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 444 insertions(+)
 create mode 100644 samples/bpf/hbm.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index e1bdc96486f6..65e667bdf979 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -52,6 +52,7 @@ hostprogs-y += xdpsock
 hostprogs-y += xdp_fwd
 hostprogs-y += task_fd_query
 hostprogs-y += xdp_sample_pkts
+hostprogs-y += hbm
 
 # Libbpf dependencies
 LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
@@ -107,6 +108,7 @@ xdpsock-objs := xdpsock_user.o
 xdp_fwd-objs := xdp_fwd_user.o
 task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS)
 xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS)
+hbm-objs := bpf_load.o hbm.o $(CGROUP_HELPERS)
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -265,6 +267,7 @@ $(src)/*.c: verify_target_bpf $(LIBBPF)
 
 $(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h
 $(obj)/hbm_out_kern.o: $(src)/hbm.h $(src)/hbm_kern.h
+$(obj)/hbm.o: $(src)/hbm.h
 
 # asm/sysreg.h - inline assembly used by it is incompatible with llvm.
 # But, there is no easy way to fix it, so just exclude it since it is
diff --git a/samples/bpf/hbm.c b/samples/bpf/hbm.c
new file mode 100644
index 000000000000..8408ccb7409f
--- /dev/null
+++ b/samples/bpf/hbm.c
@@ -0,0 +1,441 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Example program for Host Bandwidth Managment
+ *
+ * This program loads a cgroup skb BPF program to enforce cgroup output
+ * (egress) or input (ingress) bandwidth limits.
+ *
+ * USAGE: hbm [-d] [-l] [-n <id>] [-r <rate>] [-s] [-t <secs>] [-w] [-h] [prog]
+ *   Where:
+ *    -d	Print BPF trace debug buffer
+ *    -l	Also limit flows doing loopback
+ *    -n <#>	To create cgroup \"/hbm#\" and attach prog
+ *		Default is /hbm1
+ *    -r <rate>	Rate limit in Mbps
+ *    -s	Get HBM stats (marked, dropped, etc.)
+ *    -t <time>	Exit after specified seconds (deault is 0)
+ *    -w	Work conserving flag. cgroup can increase its bandwidth
+ *		beyond the rate limit specified while there is available
+ *		bandwidth. Current implementation assumes there is only
+ *		NIC (eth0), but can be extended to support multiple NICs.
+ *		Currrently only supported for egress.
+ *    -h	Print this info
+ *    prog	BPF program file name. Name defaults to hbm_out_kern.o
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/unistd.h>
+
+#include <linux/bpf.h>
+#include <bpf/bpf.h>
+
+#include "bpf_load.h"
+#include "bpf_rlimit.h"
+#include "cgroup_helpers.h"
+#include "hbm.h"
+#include "bpf_util.h"
+#include "bpf/bpf.h"
+#include "bpf/libbpf.h"
+
+bool outFlag = true;
+int minRate = 1000;		/* cgroup rate limit in Mbps */
+int rate = 1000;		/* can grow if rate conserving is enabled */
+int dur = 1;
+bool stats_flag;
+bool loopback_flag;
+bool debugFlag;
+bool work_conserving_flag;
+
+static void Usage(void);
+static void read_trace_pipe2(void);
+static void do_error(char *msg, bool errno_flag);
+
+#define DEBUGFS "/sys/kernel/debug/tracing/"
+
+struct bpf_object *obj;
+int bpfprog_fd;
+int cgroup_storage_fd;
+
+static void read_trace_pipe2(void)
+{
+	int trace_fd;
+	FILE *outf;
+	char *outFname = "hbm_out.log";
+
+	trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0);
+	if (trace_fd < 0) {
+		printf("Error opening trace_pipe\n");
+		return;
+	}
+
+//	Future support of ingress
+//	if (!outFlag)
+//		outFname = "hbm_in.log";
+	outf = fopen(outFname, "w");
+
+	if (outf == NULL)
+		printf("Error creating %s\n", outFname);
+
+	while (1) {
+		static char buf[4097];
+		ssize_t sz;
+
+		sz = read(trace_fd, buf, sizeof(buf) - 1);
+		if (sz > 0) {
+			buf[sz] = 0;
+			puts(buf);
+			if (outf != NULL) {
+				fprintf(outf, "%s\n", buf);
+				fflush(outf);
+			}
+		}
+	}
+}
+
+static void do_error(char *msg, bool errno_flag)
+{
+	if (errno_flag)
+		printf("ERROR: %s, errno: %d\n", msg, errno);
+	else
+		printf("ERROR: %s\n", msg);
+	exit(1);
+}
+
+static int prog_load(char *prog)
+{
+	struct bpf_prog_load_attr prog_load_attr = {
+		.prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+		.file = prog,
+		.expected_attach_type = BPF_CGROUP_INET_EGRESS,
+	};
+	int map_fd;
+	struct bpf_map *map;
+
+	int ret = 0;
+
+	if (access(prog, O_RDONLY) < 0) {
+		printf("Error accessing file %s: %s\n", prog, strerror(errno));
+		return 1;
+	}
+	if (bpf_prog_load_xattr(&prog_load_attr, &obj, &bpfprog_fd))
+		ret = 1;
+	if (!ret) {
+		map = bpf_object__find_map_by_name(obj, "queue_stats");
+		map_fd = bpf_map__fd(map);
+		if (map_fd < 0) {
+			printf("Map not found: %s\n", strerror(map_fd));
+			ret = 1;
+		}
+	}
+
+	if (ret) {
+		printf("ERROR: load_bpf_file failed for: %s\n", prog);
+		printf("  Output from verifier:\n%s\n------\n", bpf_log_buf);
+		ret = -1;
+	} else {
+		ret = map_fd;
+	}
+
+	return ret;
+}
+
+static int run_bpf_prog(char *prog, int cg_id)
+{
+	int map_fd;
+	int rc = 0;
+	int key = 0;
+	int cg1 = 0;
+	int type = BPF_CGROUP_INET_EGRESS;
+	char cg_dir[100];
+	struct hbm_queue_stats qstats = {0};
+
+	sprintf(cg_dir, "/hbm%d", cg_id);
+	map_fd = prog_load(prog);
+	if (map_fd  == -1)
+		return 1;
+
+	if (setup_cgroup_environment()) {
+		printf("ERROR: setting cgroup environment\n");
+		goto err;
+	}
+	cg1 = create_and_get_cgroup(cg_dir);
+	if (!cg1) {
+		printf("ERROR: create_and_get_cgroup\n");
+		goto err;
+	}
+	if (join_cgroup(cg_dir)) {
+		printf("ERROR: join_cgroup\n");
+		goto err;
+	}
+
+	qstats.rate = rate;
+	qstats.stats = stats_flag ? 1 : 0;
+	qstats.loopback = loopback_flag ? 1 : 0;
+	if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY)) {
+		printf("ERROR: Could not update map element\n");
+		goto err;
+	}
+
+	if (!outFlag)
+		type = BPF_CGROUP_INET_INGRESS;
+	if (bpf_prog_attach(bpfprog_fd, cg1, type, 0)) {
+		printf("ERROR: bpf_prog_attach fails!\n");
+		log_err("Attaching prog");
+		goto err;
+	}
+
+	if (work_conserving_flag) {
+		struct timeval t0, t_last, t_new;
+		FILE *fin;
+		unsigned long long last_eth_tx_bytes, new_eth_tx_bytes;
+		signed long long last_cg_tx_bytes, new_cg_tx_bytes;
+		signed long long delta_time, delta_bytes, delta_rate;
+		int delta_ms;
+#define DELTA_RATE_CHECK 10000		/* in us */
+#define RATE_THRESHOLD 9500000000	/* 9.5 Gbps */
+
+		bpf_map_lookup_elem(map_fd, &key, &qstats);
+		if (gettimeofday(&t0, NULL) < 0)
+			do_error("gettimeofday failed", true);
+		t_last = t0;
+		fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", "r");
+		if (fscanf(fin, "%llu", &last_eth_tx_bytes) != 1)
+			do_error("fscanf fails", false);
+		fclose(fin);
+		last_cg_tx_bytes = qstats.bytes_total;
+		while (true) {
+			usleep(DELTA_RATE_CHECK);
+			if (gettimeofday(&t_new, NULL) < 0)
+				do_error("gettimeofday failed", true);
+			delta_ms = (t_new.tv_sec - t0.tv_sec) * 1000 +
+				(t_new.tv_usec - t0.tv_usec)/1000;
+			if (delta_ms > dur * 1000)
+				break;
+			delta_time = (t_new.tv_sec - t_last.tv_sec) * 1000000 +
+				(t_new.tv_usec - t_last.tv_usec);
+			if (delta_time == 0)
+				continue;
+			t_last = t_new;
+			fin = fopen("/sys/class/net/eth0/statistics/tx_bytes",
+				    "r");
+			if (fscanf(fin, "%llu", &new_eth_tx_bytes) != 1)
+				do_error("fscanf fails", false);
+			fclose(fin);
+			printf("  new_eth_tx_bytes:%llu\n",
+			       new_eth_tx_bytes);
+			bpf_map_lookup_elem(map_fd, &key, &qstats);
+			new_cg_tx_bytes = qstats.bytes_total;
+			delta_bytes = new_eth_tx_bytes - last_eth_tx_bytes;
+			last_eth_tx_bytes = new_eth_tx_bytes;
+			delta_rate = (delta_bytes * 8000000) / delta_time;
+			printf("%5d - eth_rate:%.1fGbps cg_rate:%.3fGbps",
+			       delta_ms, delta_rate/1000000000.0,
+			       rate/1000.0);
+			if (delta_rate < RATE_THRESHOLD) {
+				/* can increase cgroup rate limit, but first
+				 * check if we are using the current limit.
+				 * Currently increasing by 6.25%, unknown
+				 * if that is the optimal rate.
+				 */
+				int rate_diff100;
+
+				delta_bytes = new_cg_tx_bytes -
+					last_cg_tx_bytes;
+				last_cg_tx_bytes = new_cg_tx_bytes;
+				delta_rate = (delta_bytes * 8000000) /
+					delta_time;
+				printf(" rate:%.3fGbps",
+				       delta_rate/1000000000.0);
+				rate_diff100 = (((long long)rate)*1000000 -
+						     delta_rate) * 100 /
+					(((long long) rate) * 1000000);
+				printf("  rdiff:%d", rate_diff100);
+				if (rate_diff100  <= 3) {
+					rate += (rate >> 4);
+					if (rate > RATE_THRESHOLD / 1000000)
+						rate = RATE_THRESHOLD / 1000000;
+					qstats.rate = rate;
+					printf(" INC\n");
+				} else {
+					printf("\n");
+				}
+			} else {
+				/* Need to decrease cgroup rate limit.
+				 * Currently decreasing by 12.5%, unknown
+				 * if that is optimal
+				 */
+				printf(" DEC\n");
+				rate -= (rate >> 3);
+				if (rate < minRate)
+					rate = minRate;
+				qstats.rate = rate;
+			}
+			if (bpf_map_update_elem(map_fd, &key, &qstats, BPF_ANY))
+				do_error("update map element fails", false);
+		}
+	} else {
+		sleep(dur);
+	}
+	// Get stats!
+	if (stats_flag && bpf_map_lookup_elem(map_fd, &key, &qstats)) {
+		char fname[100];
+		FILE *fout;
+
+		if (!outFlag)
+			sprintf(fname, "hbm.%d.in", cg_id);
+		else
+			sprintf(fname, "hbm.%d.out", cg_id);
+		fout = fopen(fname, "w");
+		fprintf(fout, "id:%d\n", cg_id);
+		fprintf(fout, "ERROR: Could not lookup queue_stats\n");
+	} else if (stats_flag && qstats.lastPacketTime >
+		   qstats.firstPacketTime) {
+		long long delta_us = (qstats.lastPacketTime -
+				      qstats.firstPacketTime)/1000;
+		unsigned int rate_mbps = ((qstats.bytes_total -
+					   qstats.bytes_dropped) * 8 /
+					  delta_us);
+		double percent_pkts, percent_bytes;
+		char fname[100];
+		FILE *fout;
+
+// Future support of ingress
+//		if (!outFlag)
+//			sprintf(fname, "hbm.%d.in", cg_id);
+//		else
+		sprintf(fname, "hbm.%d.out", cg_id);
+		fout = fopen(fname, "w");
+		fprintf(fout, "id:%d\n", cg_id);
+		fprintf(fout, "rate_mbps:%d\n", rate_mbps);
+		fprintf(fout, "duration:%.1f secs\n",
+			(qstats.lastPacketTime - qstats.firstPacketTime) /
+			1000000000.0);
+		fprintf(fout, "packets:%d\n", (int)qstats.pkts_total);
+		fprintf(fout, "bytes_MB:%d\n", (int)(qstats.bytes_total /
+						     1000000));
+		fprintf(fout, "pkts_dropped:%d\n", (int)qstats.pkts_dropped);
+		fprintf(fout, "bytes_dropped_MB:%d\n",
+			(int)(qstats.bytes_dropped /
+						       1000000));
+		// Marked Pkts and Bytes
+		percent_pkts = (qstats.pkts_marked * 100.0) /
+			(qstats.pkts_total + 1);
+		percent_bytes = (qstats.bytes_marked * 100.0) /
+			(qstats.bytes_total + 1);
+		fprintf(fout, "pkts_marked_percent:%6.2f\n", percent_pkts);
+		fprintf(fout, "bytes_marked_percent:%6.2f\n", percent_bytes);
+
+		// Dropped Pkts and Bytes
+		percent_pkts = (qstats.pkts_dropped * 100.0) /
+			(qstats.pkts_total + 1);
+		percent_bytes = (qstats.bytes_dropped * 100.0) /
+			(qstats.bytes_total + 1);
+		fprintf(fout, "pkts_dropped_percent:%6.2f\n", percent_pkts);
+		fprintf(fout, "bytes_dropped_percent:%6.2f\n", percent_bytes);
+		fclose(fout);
+	}
+
+	if (debugFlag)
+		read_trace_pipe2();
+	return rc;
+err:
+	rc = 1;
+
+	if (cg1)
+		close(cg1);
+	cleanup_cgroup_environment();
+
+	return rc;
+}
+
+static void Usage(void)
+{
+	printf("This program loads a cgroup skb BPF program to enforce\n"
+	       "cgroup output (egress) bandwidth limits.\n\n"
+	       "USAGE: hbm [-o] [-d]  [-l] [-n <id>] [-r <rate>] [-s]\n"
+	       "           [-t <secs>] [-w] [-h] [prog]\n"
+	       "  Where:\n"
+	       "    -o         indicates egress direction (default)\n"
+	       "    -d         print BPF trace debug buffer\n"
+	       "    -l         also limit flows using loopback\n"
+	       "    -n <#>     to create cgroup \"/hbm#\" and attach prog\n"
+	       "               Default is /hbm1\n"
+	       "    -r <rate>  Rate in Mbps\n"
+	       "    -s         Update HBM stats\n"
+	       "    -t <time>  Exit after specified seconds (deault is 0)\n"
+	       "    -w	       Work conserving flag. cgroup can increase\n"
+	       "               bandwidth beyond the rate limit specified\n"
+	       "               while there is available bandwidth. Current\n"
+	       "               implementation assumes there is only eth0\n"
+	       "               but can be extended to support multiple NICs\n"
+	       "    -h         print this info\n"
+	       "    prog       BPF program file name. Name defaults to\n"
+	       "                 hbm_out_kern.o\n");
+}
+
+int main(int argc, char **argv)
+{
+	char *prog = "hbm_out_kern.o";
+	int  k;
+	int cg_id = 1;
+	char *optstring = "iodln:r:st:wh";
+
+	while ((k = getopt(argc, argv, optstring)) != -1) {
+		switch (k) {
+		case'o':
+			break;
+		case 'd':
+			debugFlag = true;
+			break;
+		case 'l':
+			loopback_flag = true;
+			break;
+		case 'n':
+			cg_id = atoi(optarg);
+			break;
+		case 'r':
+			minRate = atoi(optarg) * 1.024;
+			rate = minRate;
+			break;
+		case 's':
+			stats_flag = true;
+			break;
+		case 't':
+			dur = atoi(optarg);
+			break;
+		case 'w':
+			work_conserving_flag = true;
+			break;
+		case '?':
+			if (optopt == 'n' || optopt == 'r' || optopt == 't')
+				fprintf(stderr,
+					"Option -%c requires an argument.\n\n",
+					optopt);
+		case 'h':
+			// fallthrough
+		default:
+			Usage();
+			return 0;
+		}
+	}
+
+	if (optind < argc)
+		prog = argv[optind];
+	printf("HBM prog: %s\n", prog != NULL ? prog : "NULL");
+
+	return run_bpf_prog(prog, cg_id);
+}
-- 
cgit v1.2.3-59-g8ed1b


From 4ffd44cfd147d157406a26c995cd0c373bffd7a0 Mon Sep 17 00:00:00 2001
From: brakmo <brakmo@fb.com>
Date: Fri, 1 Mar 2019 12:38:50 -0800
Subject: bpf: HBM test script

Script for testing HBM (Host Bandwidth Manager) framework.
It creates a cgroup to use for testing and load a BPF program to limit
egress bandwidht. It then uses iperf3 or netperf to create
loads. The output is the goodput in Mbps (unless -D is used).

It can work on a single host using loopback or among two hosts (with netperf).
When using loopback, it is recommended to also introduce a delay of at least
1ms (-d=1), otherwise the assigned bandwidth is likely to be underutilized.

USAGE: $name [out] [-b=<prog>|--bpf=<prog>] [-c=<cc>|--cc=<cc>] [-D]
             [-d=<delay>|--delay=<delay>] [--debug] [-E]
             [-f=<#flows>|--flows=<#flows>] [-h] [-i=<id>|--id=<id >] [-l]
	     [-N] [-p=<port>|--port=<port>] [-P] [-q=<qdisc>]
             [-R] [-s=<server>|--server=<server] [--stats]
	     [-t=<time>|--time=<time>] [-w] [cubic|dctcp]
  Where:
    out               Egress (default egress)
    -b or --bpf       BPF program filename to load and attach.
                      Default is nrm_out_kern.o for egress,
    -c or -cc         TCP congestion control (cubic or dctcp)
    -d or --delay     Add a delay in ms using netem
    -D                In addition to the goodput in Mbps, it also outputs
                      other detailed information. This information is
                      test dependent (i.e. iperf3 or netperf).
    --debug           Print BPF trace buffer
    -E                Enable ECN (not required for dctcp)
    -f or --flows     Number of concurrent flows (default=1)
    -i or --id        cgroup id (an integer, default is 1)
    -l                Do not limit flows using loopback
    -N                Use netperf instead of iperf3
    -h                Help
    -p or --port      iperf3 port (default is 5201)
    -P                Use an iperf3 instance for each flow
    -q                Use the specified qdisc.
    -r or --rate      Rate in Mbps (default 1s 1Gbps)
    -R                Use TCP_RR for netperf. 1st flow has req
                      size of 10KB, rest of 1MB. Reply in all
                      cases is 1 byte.
                      More detailed output for each flow can be found
                      in the files netperf.<cg>.<flow>, where <cg> is the
                      cgroup id as specified with the -i flag, and <flow>
                      is the flow id starting at 1 and increasing by 1 for
                      flow (as specified by -f).
    -s or --server    hostname of netperf server. Used to create netperf
                      test traffic between to hosts (default is within host)
                      netserver must be running on the host.
    --stats           Get HBM stats (marked, dropped, etc.)
    -t or --time      duration of iperf3 in seconds (default=5)
    -w                Work conserving flag. cgroup can increase its
                      bandwidth beyond the rate limit specified
                      while there is available bandwidth. Current
                      implementation assumes there is only one NIC
                      (eth0), but can be extended to support multiple
                      NICs. This is just a proof of concept.
    cubic or dctcp    specify TCP CC to use

Examples:
 ./do_hbm_test.sh -l -d=1 -D --stats
     Runs a 5 second test, using a single iperf3 flow and with the default
     rate limit of 1Gbps and a delay of 1ms (using netem) using the default
     TCP congestion control on the loopback device (hence we use "-l" to
     enforce bandwidth limit on loopback device). Since no direction is
     specified, it defaults to egress. Since no TCP CC algorithm is
     specified it uses the system default (Cubic for this test).
     With no -D flag, only the value of the AGGREGATE OUTPUT would show.
     id refers to the cgroup id and is useful when running multi cgroup
     tests (supported by a future patch).
     This patchset does not support calling TCP's congesion window
     reduction, even when packets are dropped by the BPF program, resulting
     in a large number of packets dropped. It is recommended that the  current
     HBM implemenation only be used with ECN enabled flows. A future patch
     will add support for reducing TCP's cwnd and will increase the
     performance of non-ECN enabled flows.
   Output:
     Details for HBM in cgroup 1
     id:1
     rate_mbps:493
     duration:4.8 secs
     packets:11355
     bytes_MB:590
     pkts_dropped:4497
     bytes_dropped_MB:292
     pkts_marked_percent: 39.60
     bytes_marked_percent: 49.49
     pkts_dropped_percent: 39.60
     bytes_dropped_percent: 49.49
     PING AVG DELAY:2.075
     AGGREGATE_GOODPUT:505

./do_nrm_test.sh -l -d=1 -D --stats dctcp
     Same as above but using dctcp. Note that fewer bytes are dropped
     (0.01% vs. 49%).
   Output:
     Details for HBM in cgroup 1
     id:1
     rate_mbps:945
     duration:4.9 secs
     packets:16859
     bytes_MB:578
     pkts_dropped:1
     bytes_dropped_MB:0
     pkts_marked_percent: 28.74
     bytes_marked_percent: 45.15
     pkts_dropped_percent:  0.01
     bytes_dropped_percent:  0.01
     PING AVG DELAY:2.083
     AGGREGATE_GOODPUT:965

./do_nrm_test.sh -d=1 -D --stats
     As first example, but without limiting loopback device (i.e. no
     "-l" flag). Since there is no bandwidth limiting, no details for
     HBM are printed out.
   Output:
     Details for HBM in cgroup 1
     PING AVG DELAY:2.019
     AGGREGATE_GOODPUT:42655

./do_hbm.sh -l -d=1 -D --stats -f=2
     Uses iper3 and does 2 flows
./do_hbm.sh -l -d=1 -D --stats -f=4 -P
     Uses iperf3 and does 4 flows, each flow as a separate process.
./do_hbm.sh -l -d=1 -D --stats -f=4 -N
     Uses netperf, 4 flows
./do_hbm.sh -f=1 -r=2000 -t=5 -N -D --stats dctcp -s=<server-name>
     Uses netperf between two hosts. The remote host name is specified
     with -s= and you need to start the program netserver manually on
     the remote host. It will use 1 flow, a rate limit of 2Gbps and dctcp.
./do_hbm.sh -f=1 -r=2000 -t=5 -N -D --stats -w dctcp \
     -s=<server-name>
     As previous, but allows use of extra bandwidth. For this test the
     rate is 8Gbps vs. 1Gbps of the previous test.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 samples/bpf/do_hbm_test.sh | 436 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 436 insertions(+)
 create mode 100755 samples/bpf/do_hbm_test.sh

diff --git a/samples/bpf/do_hbm_test.sh b/samples/bpf/do_hbm_test.sh
new file mode 100755
index 000000000000..56c8b4115c95
--- /dev/null
+++ b/samples/bpf/do_hbm_test.sh
@@ -0,0 +1,436 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (c) 2019 Facebook
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of version 2 of the GNU General Public
+# License as published by the Free Software Foundation.
+
+Usage() {
+  echo "Script for testing HBM (Host Bandwidth Manager) framework."
+  echo "It creates a cgroup to use for testing and load a BPF program to limit"
+  echo "egress or ingress bandwidht. It then uses iperf3 or netperf to create"
+  echo "loads. The output is the goodput in Mbps (unless -D was used)."
+  echo ""
+  echo "USAGE: $name [out] [-b=<prog>|--bpf=<prog>] [-c=<cc>|--cc=<cc>] [-D]"
+  echo "             [-d=<delay>|--delay=<delay>] [--debug] [-E]"
+  echo "             [-f=<#flows>|--flows=<#flows>] [-h] [-i=<id>|--id=<id >]"
+  echo "             [-l] [-N] [-p=<port>|--port=<port>] [-P]"
+  echo "             [-q=<qdisc>] [-R] [-s=<server>|--server=<server]"
+  echo "             [-S|--stats] -t=<time>|--time=<time>] [-w] [cubic|dctcp]"
+  echo "  Where:"
+  echo "    out               egress (default)"
+  echo "    -b or --bpf       BPF program filename to load and attach."
+  echo "                      Default is hbm_out_kern.o for egress,"
+  echo "    -c or -cc         TCP congestion control (cubic or dctcp)"
+  echo "    --debug           print BPF trace buffer"
+  echo "    -d or --delay     add a delay in ms using netem"
+  echo "    -D                In addition to the goodput in Mbps, it also outputs"
+  echo "                      other detailed information. This information is"
+  echo "                      test dependent (i.e. iperf3 or netperf)."
+  echo "    -E                enable ECN (not required for dctcp)"
+  echo "    -f or --flows     number of concurrent flows (default=1)"
+  echo "    -i or --id        cgroup id (an integer, default is 1)"
+  echo "    -N                use netperf instead of iperf3"
+  echo "    -l                do not limit flows using loopback"
+  echo "    -h                Help"
+  echo "    -p or --port      iperf3 port (default is 5201)"
+  echo "    -P                use an iperf3 instance for each flow"
+  echo "    -q                use the specified qdisc"
+  echo "    -r or --rate      rate in Mbps (default 1s 1Gbps)"
+  echo "    -R                Use TCP_RR for netperf. 1st flow has req"
+  echo "                      size of 10KB, rest of 1MB. Reply in all"
+  echo "                      cases is 1 byte."
+  echo "                      More detailed output for each flow can be found"
+  echo "                      in the files netperf.<cg>.<flow>, where <cg> is the"
+  echo "                      cgroup id as specified with the -i flag, and <flow>"
+  echo "                      is the flow id starting at 1 and increasing by 1 for"
+  echo "                      flow (as specified by -f)."
+  echo "    -s or --server    hostname of netperf server. Used to create netperf"
+  echo "                      test traffic between to hosts (default is within host)"
+  echo "                      netserver must be running on the host."
+  echo "    -S or --stats     whether to update hbm stats (default is yes)."
+  echo "    -t or --time      duration of iperf3 in seconds (default=5)"
+  echo "    -w                Work conserving flag. cgroup can increase its"
+  echo "                      bandwidth beyond the rate limit specified"
+  echo "                      while there is available bandwidth. Current"
+  echo "                      implementation assumes there is only one NIC"
+  echo "                      (eth0), but can be extended to support multiple"
+  echo "                       NICs."
+  echo "    cubic or dctcp    specify which TCP CC to use"
+  echo " "
+  exit
+}
+
+#set -x
+
+debug_flag=0
+args="$@"
+name="$0"
+netem=0
+cc=x
+dir="-o"
+dir_name="out"
+dur=5
+flows=1
+id=1
+prog=""
+port=5201
+rate=1000
+multi_iperf=0
+flow_cnt=1
+use_netperf=0
+rr=0
+ecn=0
+details=0
+server=""
+qdisc=""
+flags=""
+do_stats=0
+
+function start_hbm () {
+  rm -f hbm.out
+  echo "./hbm $dir -n $id -r $rate -t $dur $flags $dbg $prog" > hbm.out
+  echo " " >> hbm.out
+  ./hbm $dir -n $id -r $rate -t $dur $flags $dbg $prog >> hbm.out 2>&1  &
+  echo $!
+}
+
+processArgs () {
+  for i in $args ; do
+    case $i in
+    # Support for upcomming ingress rate limiting
+    #in)         # support for upcoming ingress rate limiting
+    #  dir="-i"
+    #  dir_name="in"
+    #  ;;
+    out)
+      dir="-o"
+      dir_name="out"
+      ;;
+    -b=*|--bpf=*)
+      prog="${i#*=}"
+      ;;
+    -c=*|--cc=*)
+      cc="${i#*=}"
+      ;;
+    --debug)
+      flags="$flags -d"
+      debug_flag=1
+      ;;
+    -d=*|--delay=*)
+      netem="${i#*=}"
+      ;;
+    -D)
+      details=1
+      ;;
+    -E)
+     ecn=1
+     ;;
+    # Support for upcomming fq Early Departure Time egress rate limiting
+    #--edt)
+    # prog="hbm_out_edt_kern.o"
+    # qdisc="fq"
+    # ;;
+    -f=*|--flows=*)
+      flows="${i#*=}"
+      ;;
+    -i=*|--id=*)
+      id="${i#*=}"
+      ;;
+    -l)
+      flags="$flags -l"
+      ;;
+    -N)
+      use_netperf=1
+      ;;
+    -p=*|--port=*)
+      port="${i#*=}"
+      ;;
+    -P)
+      multi_iperf=1
+      ;;
+    -q=*)
+      qdisc="${i#*=}"
+      ;;
+    -r=*|--rate=*)
+      rate="${i#*=}"
+      ;;
+    -R)
+      rr=1
+      ;;
+    -s=*|--server=*)
+      server="${i#*=}"
+      ;;
+    -S|--stats)
+      flags="$flags -s"
+      do_stats=1
+      ;;
+    -t=*|--time=*)
+      dur="${i#*=}"
+      ;;
+    -w)
+      flags="$flags -w"
+      ;;
+    cubic)
+      cc=cubic
+      ;;
+    dctcp)
+      cc=dctcp
+      ;;
+    *)
+      echo "Unknown arg:$i"
+      Usage
+      ;;
+    esac
+  done
+}
+
+processArgs
+
+if [ $debug_flag -eq 1 ] ; then
+  rm -f hbm_out.log
+fi
+
+hbm_pid=$(start_hbm)
+usleep 100000
+
+host=`hostname`
+cg_base_dir=/sys/fs/cgroup
+cg_dir="$cg_base_dir/cgroup-test-work-dir/hbm$id"
+
+echo $$ >> $cg_dir/cgroup.procs
+
+ulimit -l unlimited
+
+rm -f ss.out
+rm -f hbm.[0-9]*.$dir_name
+if [ $ecn -ne 0 ] ; then
+  sysctl -w -q -n net.ipv4.tcp_ecn=1
+fi
+
+if [ $use_netperf -eq 0 ] ; then
+  cur_cc=`sysctl -n net.ipv4.tcp_congestion_control`
+  if [ "$cc" != "x" ] ; then
+    sysctl -w -q -n net.ipv4.tcp_congestion_control=$cc
+  fi
+fi
+
+if [ "$netem" -ne "0" ] ; then
+  if [ "$qdisc" != "" ] ; then
+    echo "WARNING: Ignoring -q options because -d option used"
+  fi
+  tc qdisc del dev lo root > /dev/null 2>&1
+  tc qdisc add dev lo root netem delay $netem\ms > /dev/null 2>&1
+elif [ "$qdisc" != "" ] ; then
+  tc qdisc del dev lo root > /dev/null 2>&1
+  tc qdisc add dev lo root $qdisc > /dev/null 2>&1
+fi
+
+n=0
+m=$[$dur * 5]
+hn="::1"
+if [ $use_netperf -ne 0 ] ; then
+  if [ "$server" != "" ] ; then
+    hn=$server
+  fi
+fi
+
+( ping6 -i 0.2 -c $m $hn > ping.out 2>&1 ) &
+
+if [ $use_netperf -ne 0 ] ; then
+  begNetserverPid=`ps ax | grep netserver | grep --invert-match "grep" | \
+                   awk '{ print $1 }'`
+  if [ "$begNetserverPid" == "" ] ; then
+    if [ "$server" == "" ] ; then
+      ( ./netserver > /dev/null 2>&1) &
+      usleep 100000
+    fi
+  fi
+  flow_cnt=1
+  if [ "$server" == "" ] ; then
+    np_server=$host
+  else
+    np_server=$server
+  fi
+  if [ "$cc" == "x" ] ; then
+    np_cc=""
+  else
+    np_cc="-K $cc,$cc"
+  fi
+  replySize=1
+  while [ $flow_cnt -le $flows ] ; do
+    if [ $rr -ne 0 ] ; then
+      reqSize=1M
+      if [ $flow_cnt -eq 1 ] ; then
+        reqSize=10K
+      fi
+      if [ "$dir" == "-i" ] ; then
+        replySize=$reqSize
+        reqSize=1
+      fi
+      ( ./netperf -H $np_server -l $dur -f m -j -t TCP_RR  -- -r $reqSize,$replySize $np_cc -k P50_lATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,REMOTE_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,LOCAL_RECV_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) &
+    else
+      if [ "$dir" == "-i" ] ; then
+        ( ./netperf -H $np_server -l $dur -f m -j -t TCP_RR -- -r 1,10M $np_cc -k P50_LATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,REMOTE_TRANSPORT_RETRANS,REMOTE_SEND_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) &
+      else
+        ( ./netperf -H $np_server -l $dur -f m -j -t TCP_STREAM -- $np_cc -k P50_lATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE > netperf.$id.$flow_cnt ) &
+      fi
+    fi
+    flow_cnt=$[flow_cnt+1]
+  done
+
+# sleep for duration of test (plus some buffer)
+  n=$[dur+2]
+  sleep $n
+
+# force graceful termination of netperf
+  pids=`pgrep netperf`
+  for p in $pids ; do
+    kill -SIGALRM $p
+  done
+
+  flow_cnt=1
+  rate=0
+  if [ $details -ne 0 ] ; then
+    echo ""
+    echo "Details for HBM in cgroup $id"
+    if [ $do_stats -eq 1 ] ; then
+      if [ -e hbm.$id.$dir_name ] ; then
+        cat hbm.$id.$dir_name
+      fi
+    fi
+  fi
+  while [ $flow_cnt -le $flows ] ; do
+    if [ "$dir" == "-i" ] ; then
+      r=`cat netperf.$id.$flow_cnt | grep -o "REMOTE_SEND_THROUGHPUT=[0-9]*" | grep -o "[0-9]*"`
+    else
+      r=`cat netperf.$id.$flow_cnt | grep -o "LOCAL_SEND_THROUGHPUT=[0-9]*" | grep -o "[0-9]*"`
+    fi
+    echo "rate for flow $flow_cnt: $r"
+    rate=$[rate+r]
+    if [ $details -ne 0 ] ; then
+      echo "-----"
+      echo "Details for cgroup $id, flow $flow_cnt"
+      cat netperf.$id.$flow_cnt
+    fi
+    flow_cnt=$[flow_cnt+1]
+  done
+  if [ $details -ne 0 ] ; then
+    echo ""
+    delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"`
+    echo "PING AVG DELAY:$delay"
+    echo "AGGREGATE_GOODPUT:$rate"
+  else
+    echo $rate
+  fi
+elif [ $multi_iperf -eq 0 ] ; then
+  (iperf3 -s -p $port -1 > /dev/null 2>&1) &
+  usleep 100000
+  iperf3 -c $host -p $port -i 0 -P $flows -f m -t $dur > iperf.$id
+  rates=`grep receiver iperf.$id | grep -o "[0-9.]* Mbits" | grep -o "^[0-9]*"`
+  rate=`echo $rates | grep -o "[0-9]*$"`
+
+  if [ $details -ne 0 ] ; then
+    echo ""
+    echo "Details for HBM in cgroup $id"
+    if [ $do_stats -eq 1 ] ; then
+      if [ -e hbm.$id.$dir_name ] ; then
+        cat hbm.$id.$dir_name
+      fi
+    fi
+    delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"`
+    echo "PING AVG DELAY:$delay"
+    echo "AGGREGATE_GOODPUT:$rate"
+  else
+    echo $rate
+  fi
+else
+  flow_cnt=1
+  while [ $flow_cnt -le $flows ] ; do
+    (iperf3 -s -p $port -1 > /dev/null 2>&1) &
+    ( iperf3 -c $host -p $port -i 0 -P 1 -f m -t $dur | grep receiver | grep -o "[0-9.]* Mbits" | grep -o "^[0-9]*" | grep -o "[0-9]*$" > iperf3.$id.$flow_cnt ) &
+    port=$[port+1]
+    flow_cnt=$[flow_cnt+1]
+  done
+  n=$[dur+1]
+  sleep $n
+  flow_cnt=1
+  rate=0
+  if [ $details -ne 0 ] ; then
+    echo ""
+    echo "Details for HBM in cgroup $id"
+    if [ $do_stats -eq 1 ] ; then
+      if [ -e hbm.$id.$dir_name ] ; then
+        cat hbm.$id.$dir_name
+      fi
+    fi
+  fi
+
+  while [ $flow_cnt -le $flows ] ; do
+    r=`cat iperf3.$id.$flow_cnt`
+#    echo "rate for flow $flow_cnt: $r"
+  if [ $details -ne 0 ] ; then
+    echo "Rate for cgroup $id, flow $flow_cnt LOCAL_SEND_THROUGHPUT=$r"
+  fi
+    rate=$[rate+r]
+    flow_cnt=$[flow_cnt+1]
+  done
+  if [ $details -ne 0 ] ; then
+    delay=`grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" | grep -o "[0-9.]*$"`
+    echo "PING AVG DELAY:$delay"
+    echo "AGGREGATE_GOODPUT:$rate"
+  else
+    echo $rate
+  fi
+fi
+
+if [ $use_netperf -eq 0 ] ; then
+  sysctl -w -q -n net.ipv4.tcp_congestion_control=$cur_cc
+fi
+if [ $ecn -ne 0 ] ; then
+  sysctl -w -q -n net.ipv4.tcp_ecn=0
+fi
+if [ "$netem" -ne "0" ] ; then
+  tc qdisc del dev lo root > /dev/null 2>&1
+fi
+
+sleep 2
+
+hbmPid=`ps ax | grep "hbm " | grep --invert-match "grep" | awk '{ print $1 }'`
+if [ "$hbmPid" == "$hbm_pid" ] ; then
+  kill $hbm_pid
+fi
+
+sleep 1
+
+# Detach any BPF programs that may have lingered
+ttx=`bpftool cgroup tree | grep hbm`
+v=2
+for x in $ttx ; do
+    if [ "${x:0:36}" == "/sys/fs/cgroup/cgroup-test-work-dir/" ] ; then
+	cg=$x ; v=0
+    else
+	if [ $v -eq 0 ] ; then
+	    id=$x ; v=1
+	else
+	    if [ $v -eq 1 ] ; then
+		type=$x ; bpftool cgroup detach $cg $type id $id
+		v=0
+	    fi
+	fi
+    fi
+done
+
+if [ $use_netperf -ne 0 ] ; then
+  if [ "$server" == "" ] ; then
+    if [ "$begNetserverPid" == "" ] ; then
+      netserverPid=`ps ax | grep netserver | grep --invert-match "grep" | awk '{ print $1 }'`
+      if [ "$netserverPid" != "" ] ; then
+        kill $netserverPid
+      fi
+    fi
+  fi
+fi
+exit
-- 
cgit v1.2.3-59-g8ed1b